From d0e4af8a88dc7a0377677000d0c92104ff215347 Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Tue, 15 Apr 2025 11:19:19 -0400 Subject: [PATCH 001/710] Silence -Wcast-function-type warnings on idiomatic Windows code (#135660) On Windows, GetProcAddress() is the API used to dynamically load function pointers (similar to dlsym on Linux). This API returns a function pointer (a typedef named FARPROC), which means that casting from the call to the eventual correct type is technically a function type mismatch on the cast. However, because this is idiomatic code on Windows, we should accept it unless -Wcast-function-type-strict is passed. This was brought up in post-commit review feedback on https://github.com/llvm/llvm-project/pull/86131 --- clang/docs/ReleaseNotes.rst | 10 ++++++ clang/lib/Sema/SemaCast.cpp | 23 ++++++++++++ clang/test/Sema/warn-cast-function-type-win.c | 36 +++++++++++++++++++ 3 files changed, 69 insertions(+) create mode 100644 clang/test/Sema/warn-cast-function-type-win.c diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 6d1daaa84caaa..2d2606085998c 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -358,6 +358,16 @@ Improvements to Clang's diagnostics - Now correctly diagnose a tentative definition of an array with static storage duration in pedantic mode in C. (#GH50661) +- No longer diagnosing idiomatic function pointer casts on Windows under + ``-Wcast-function-type-mismatch`` (which is enabled by ``-Wextra``). Clang + would previously warn on this construct, but will no longer do so on Windows: + + .. code-block:: c + + typedef void (WINAPI *PGNSI)(LPSYSTEM_INFO); + HMODULE Lib = LoadLibrary("kernel32"); + PGNSI FnPtr = (PGNSI)GetProcAddress(Lib, "GetNativeSystemInfo"); + - An error is now emitted when a ``musttail`` call is made to a function marked with the ``not_tail_called`` attribute. (#GH133509). 
diff --git a/clang/lib/Sema/SemaCast.cpp b/clang/lib/Sema/SemaCast.cpp index 2824dfce1572c..1591075ff05d8 100644 --- a/clang/lib/Sema/SemaCast.cpp +++ b/clang/lib/Sema/SemaCast.cpp @@ -1153,10 +1153,33 @@ static unsigned int checkCastFunctionType(Sema &Self, const ExprResult &SrcExpr, return false; }; + auto IsFarProc = [](const FunctionType *T) { + // The definition of FARPROC depends on the platform in terms of its return + // type, which could be int, or long long, etc. We'll look for a source + // signature for: (*)() and call that "close enough" to + // FARPROC to be sufficient to silence the diagnostic. This is similar to + // how we allow casts between function pointers and void * for supporting + // dlsym. + // Note: we could check for __stdcall on the function pointer as well, but + // that seems like splitting hairs. + if (!T->getReturnType()->isIntegerType()) + return false; + if (const auto *PT = T->getAs()) + return !PT->isVariadic() && PT->getNumParams() == 0; + return true; + }; + // Skip if either function type is void(*)(void) if (IsVoidVoid(SrcFTy) || IsVoidVoid(DstFTy)) return 0; + // On Windows, GetProcAddress() returns a FARPROC, which is a typedef for a + // function pointer type (with no prototype, in C). We don't want to diagnose + // this case so we don't diagnose idiomatic code on Windows. + if (Self.getASTContext().getTargetInfo().getTriple().isOSWindows() && + IsFarProc(SrcFTy)) + return 0; + // Check return type. 
if (!argTypeIsABIEquivalent(SrcFTy->getReturnType(), DstFTy->getReturnType(), Self.Context)) diff --git a/clang/test/Sema/warn-cast-function-type-win.c b/clang/test/Sema/warn-cast-function-type-win.c new file mode 100644 index 0000000000000..4e7ba33b258d8 --- /dev/null +++ b/clang/test/Sema/warn-cast-function-type-win.c @@ -0,0 +1,36 @@ +// RUN: %clang_cc1 %s -triple x86_64-windows -fsyntax-only -Wcast-function-type -Wno-cast-function-type-strict -verify=windows +// RUN: %clang_cc1 %s -triple x86_64-windows -fsyntax-only -Wcast-function-type -Wno-cast-function-type-strict -x c++ -verify=windows +// RUN: %clang_cc1 %s -triple x86_64-pc-linux -fsyntax-only -Wcast-function-type -Wno-cast-function-type-strict -verify=linux +// RUN: %clang_cc1 %s -triple x86_64-pc-linux -fsyntax-only -Wcast-function-type -Wno-cast-function-type-strict -x c++ -verify=linux,linux-cpp +// RUN: %clang_cc1 %s -triple x86_64-windows -fsyntax-only -Wcast-function-type -Wcast-function-type-strict -x c++ -verify=strict +// windows-no-diagnostics + +// On Windows targets, this is expected to compile fine, and on non-Windows +// targets, this should diagnose the mismatch. This is to allow for idiomatic +// use of GetProcAddress, similar to what we do for dlsym. On non-Windows +// targets, this should be diagnosed. +typedef int (*FARPROC1)(); +typedef unsigned long long (*FARPROC2)(); + +FARPROC1 GetProcAddress1(void); +FARPROC2 GetProcAddress2(void); + +typedef int (*test1_type)(int); +typedef float(*test2_type)(); + +void test(void) { + // This does not diagnose on Linux in C mode because FARPROC1 has a matching + // return type to test1_type, but FARPROC1 has no prototype and so checking + // is disabled for further compatibility issues. In C++ mode, all functions + // have a prototype and so the check happens. 
+ test1_type t1 = (test1_type)GetProcAddress1(); + // linux-cpp-warning@-1 {{cast from 'FARPROC1' (aka 'int (*)()') to 'test1_type' (aka 'int (*)(int)') converts to incompatible function type}} + // strict-warning@-2 {{cast from 'FARPROC1' (aka 'int (*)()') to 'test1_type' (aka 'int (*)(int)') converts to incompatible function type}} + + // This case is diagnosed in both C and C++ modes on Linux because the return + // type of FARPROC2 does not match the return type of test2_type. + test2_type t2 = (test2_type)GetProcAddress2(); + // linux-warning@-1 {{cast from 'FARPROC2' (aka 'unsigned long long (*)()') to 'test2_type' (aka 'float (*)()') converts to incompatible function type}} + // strict-warning@-2 {{cast from 'FARPROC2' (aka 'unsigned long long (*)()') to 'test2_type' (aka 'float (*)()') converts to incompatible function type}} +} + From 88b6229dc3f65876b3f627616a024cd5dbcadcb0 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 15 Apr 2025 08:41:39 -0700 Subject: [PATCH 002/710] [TableGen] Remove unhelpful error messages from PseudoLoweringEmitter. (#135747) All of the notes using the location of ResultInst will just print the location inside of the PseudoInstExpansion class. There was one note using the location of DI->getDef(), but knowing where one of the two mismatched types is defined isn't helpful. The operand types need to be the same, so the mismatch message we already printed should be enough. --- llvm/utils/TableGen/PseudoLoweringEmitter.cpp | 67 +++++++------------ 1 file changed, 25 insertions(+), 42 deletions(-) diff --git a/llvm/utils/TableGen/PseudoLoweringEmitter.cpp b/llvm/utils/TableGen/PseudoLoweringEmitter.cpp index 56b9e499cb4aa..7f67c13c0bbbd 100644 --- a/llvm/utils/TableGen/PseudoLoweringEmitter.cpp +++ b/llvm/utils/TableGen/PseudoLoweringEmitter.cpp @@ -90,14 +90,12 @@ unsigned PseudoLoweringEmitter::addDagOperandMapping( // problem. // FIXME: We probably shouldn't ever get a non-zero BaseIdx here. 
assert(BaseIdx == 0 && "Named subargument in pseudo expansion?!"); - if (DI->getDef() != Insn.Operands[BaseIdx + i].Rec) { - PrintError(Rec, "In pseudo instruction '" + Rec->getName() + - "', operand type '" + DI->getDef()->getName() + - "' does not match expansion operand type '" + - Insn.Operands[BaseIdx + i].Rec->getName() + "'"); - PrintFatalNote(DI->getDef(), - "Value was assigned at the following location:"); - } + if (DI->getDef() != Insn.Operands[BaseIdx + i].Rec) + PrintFatalError(Rec, "In pseudo instruction '" + Rec->getName() + + "', operand type '" + DI->getDef()->getName() + + "' does not match expansion operand type '" + + Insn.Operands[BaseIdx + i].Rec->getName() + + "'"); // Source operand maps to destination operand. The Data element // will be filled in later, just set the Kind for now. Do it // for each corresponding MachineInstr operand, not just the first. @@ -138,38 +136,26 @@ void PseudoLoweringEmitter::evaluateExpansion(const Record *Rec) { LLVM_DEBUG(dbgs() << " Result: " << *Dag << "\n"); const DefInit *OpDef = dyn_cast(Dag->getOperator()); - if (!OpDef) { - PrintError(Rec, "In pseudo instruction '" + Rec->getName() + - "', result operator is not a record"); - PrintFatalNote(Rec->getValue("ResultInst"), - "Result was assigned at the following location:"); - } + if (!OpDef) + PrintFatalError(Rec, "In pseudo instruction '" + Rec->getName() + + "', result operator is not a record"); const Record *Operator = OpDef->getDef(); - if (!Operator->isSubClassOf("Instruction")) { - PrintError(Rec, "In pseudo instruction '" + Rec->getName() + - "', result operator '" + Operator->getName() + - "' is not an instruction"); - PrintFatalNote(Rec->getValue("ResultInst"), - "Result was assigned at the following location:"); - } + if (!Operator->isSubClassOf("Instruction")) + PrintFatalError(Rec, "In pseudo instruction '" + Rec->getName() + + "', result operator '" + Operator->getName() + + "' is not an instruction"); CodeGenInstruction Insn(Operator); - if 
(Insn.isCodeGenOnly || Insn.isPseudo) { - PrintError(Rec, "In pseudo instruction '" + Rec->getName() + - "', result operator '" + Operator->getName() + - "' cannot be a pseudo instruction"); - PrintFatalNote(Rec->getValue("ResultInst"), - "Result was assigned at the following location:"); - } + if (Insn.isCodeGenOnly || Insn.isPseudo) + PrintFatalError(Rec, "In pseudo instruction '" + Rec->getName() + + "', result operator '" + Operator->getName() + + "' cannot be a pseudo instruction"); - if (Insn.Operands.size() != Dag->getNumArgs()) { - PrintError(Rec, "In pseudo instruction '" + Rec->getName() + - "', result operator '" + Operator->getName() + - "' has the wrong number of operands"); - PrintFatalNote(Rec->getValue("ResultInst"), - "Result was assigned at the following location:"); - } + if (Insn.Operands.size() != Dag->getNumArgs()) + PrintFatalError(Rec, "In pseudo instruction '" + Rec->getName() + + "', result operator '" + Operator->getName() + + "' has the wrong number of operands"); unsigned NumMIOperands = 0; for (const auto &Op : Insn.Operands) @@ -202,13 +188,10 @@ void PseudoLoweringEmitter::evaluateExpansion(const Record *Rec) { continue; StringMap::iterator SourceOp = SourceOperands.find(Dag->getArgNameStr(i)); - if (SourceOp == SourceOperands.end()) { - PrintError(Rec, "In pseudo instruction '" + Rec->getName() + - "', output operand '" + Dag->getArgNameStr(i) + - "' has no matching source operand"); - PrintFatalNote(Rec->getValue("ResultInst"), - "Value was assigned at the following location:"); - } + if (SourceOp == SourceOperands.end()) + PrintFatalError(Rec, "In pseudo instruction '" + Rec->getName() + + "', output operand '" + Dag->getArgNameStr(i) + + "' has no matching source operand"); // Map the source operand to the destination operand index for each // MachineInstr operand. 
for (unsigned I = 0, E = Insn.Operands[i].MINumOperands; I != E; ++I) From 7b5a459611212b650e863c0ad6a9fa49c07e29df Mon Sep 17 00:00:00 2001 From: Paul Bowen-Huggett Date: Tue, 15 Apr 2025 17:52:00 +0200 Subject: [PATCH 003/710] [RISCV] Just reporting an error shouldn't generate a crash diagnostic (#134040) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wanting to examine some of generated code, I tried MCA with the command: ~~~bash llvm-mca -mtriple=riscv32-unknown-unknown -mcpu=rocket -iterations=300 core_list_join.s ~~~ I was greeted with the following error message: ~~~ LLVM ERROR: RV32 target requires an RV32 CPU PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace. Stack dump: … ~~~ On beginning to investigate the “bug”, I discovered that the code was simply attempting to report a user error. It used report_fatal_error() to do so but with the “bool GenCrashDiag” argument enabled (the default). This tiny change adds a wrapper function which calls report_fatal_error() as before but with GenCrashDiag disabled. --- .../RISCV/MCTargetDesc/RISCVBaseInfo.cpp | 18 +++++++++++++----- llvm/test/MC/RISCV/target-abi-invalid.s | 6 +++--- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp index 3b3460c308d7e..6d2659aa1236e 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp @@ -51,6 +51,14 @@ namespace RISCV { #include "RISCVGenSearchableTables.inc" } // namespace RISCV +// Report an error but don't ask the user to report a bug. 
+[[noreturn]] static void reportError(const char *Reason) { + report_fatal_error(Reason, /*gen_crash_diag=*/false); +} +[[noreturn]] static void reportError(Error Err) { + report_fatal_error(std::move(Err), /*gen_crash_diag=*/false); +} + namespace RISCVABI { ABI computeTargetABI(const Triple &TT, const FeatureBitset &FeatureBits, StringRef ABIName) { @@ -87,7 +95,7 @@ ABI computeTargetABI(const Triple &TT, const FeatureBitset &FeatureBits, if ((TargetABI == RISCVABI::ABI::ABI_ILP32E || (TargetABI == ABI_Unknown && IsRVE && !IsRV64)) && FeatureBits[RISCV::FeatureStdExtD]) - report_fatal_error("ILP32E cannot be used with the D ISA extension"); + reportError("ILP32E cannot be used with the D ISA extension"); if (TargetABI != ABI_Unknown) return TargetABI; @@ -95,7 +103,7 @@ ABI computeTargetABI(const Triple &TT, const FeatureBitset &FeatureBits, // If no explicit ABI is given, try to compute the default ABI. auto ISAInfo = RISCVFeatures::parseFeatureBits(IsRV64, FeatureBits); if (!ISAInfo) - report_fatal_error(ISAInfo.takeError()); + reportError(ISAInfo.takeError()); return getTargetABI((*ISAInfo)->computeDefaultABI()); } @@ -127,12 +135,12 @@ namespace RISCVFeatures { void validate(const Triple &TT, const FeatureBitset &FeatureBits) { if (TT.isArch64Bit() && !FeatureBits[RISCV::Feature64Bit]) - report_fatal_error("RV64 target requires an RV64 CPU"); + reportError("RV64 target requires an RV64 CPU"); if (!TT.isArch64Bit() && !FeatureBits[RISCV::Feature32Bit]) - report_fatal_error("RV32 target requires an RV32 CPU"); + reportError("RV32 target requires an RV32 CPU"); if (FeatureBits[RISCV::Feature32Bit] && FeatureBits[RISCV::Feature64Bit]) - report_fatal_error("RV32 and RV64 can't be combined"); + reportError("RV32 and RV64 can't be combined"); } llvm::Expected> diff --git a/llvm/test/MC/RISCV/target-abi-invalid.s b/llvm/test/MC/RISCV/target-abi-invalid.s index f78b1481b1e48..253af3f1a395a 100644 --- a/llvm/test/MC/RISCV/target-abi-invalid.s +++ 
b/llvm/test/MC/RISCV/target-abi-invalid.s @@ -30,7 +30,7 @@ # RUN: | FileCheck -check-prefix=RV32E-LP64 %s # RUN: llvm-mc -triple=riscv32 -mattr=+e,+f -target-abi lp64f < %s 2>&1 \ # RUN: | FileCheck -check-prefix=RV32EF-LP64F %s -# RUN: not --crash llvm-mc -triple=riscv32 -mattr=+e,+d -target-abi lp64f < %s 2>&1 \ +# RUN: not llvm-mc -triple=riscv32 -mattr=+e,+d -target-abi lp64f < %s 2>&1 \ # RUN: | FileCheck -check-prefix=RV32EFD-LP64D %s # RUN: llvm-mc -triple=riscv32 -mattr=+e -target-abi lp64e %s 2>&1 \ # RUN: | FileCheck -check-prefix=RV32E-LP64E %s @@ -70,9 +70,9 @@ # RUN: | FileCheck -check-prefix=RV32EF-ILP32F %s # RUN: llvm-mc -triple=riscv32 -mattr=+e,+f -target-abi ilp32f < %s 2>&1 \ # RUN: | FileCheck -check-prefix=RV32EF-ILP32F %s -# RUN: not --crash llvm-mc -triple=riscv32 -mattr=+e,+d -target-abi ilp32f < %s 2>&1 \ +# RUN: not llvm-mc -triple=riscv32 -mattr=+e,+d -target-abi ilp32f < %s 2>&1 \ # RUN: | FileCheck -check-prefix=RV32EFD-ILP32F %s -# RUN: not --crash llvm-mc -triple=riscv32 -mattr=+e,+d -target-abi ilp32d < %s 2>&1 \ +# RUN: not llvm-mc -triple=riscv32 -mattr=+e,+d -target-abi ilp32d < %s 2>&1 \ # RUN: | FileCheck -check-prefix=RV32EFD-ILP32D %s # RV32E-ILP32: Only the ilp32e ABI is supported for RV32E (ignoring target-abi) From a399c6926a8701083c767cbb041e22ff92e9d717 Mon Sep 17 00:00:00 2001 From: Matheus Izvekov Date: Tue, 15 Apr 2025 13:00:49 -0300 Subject: [PATCH 004/710] [CI] monolithic-linux improvements (#135499) Some improvements to monolithic-linux CI: 1) Add correct configuration and dependencies for LLDB testing which is actually relevant for clang changes. 2) Skip clang installation and separate configuration for runtimes. They will be built with the just built clang either way. This avoids building the runtimes twice when LLDB is also tested. 3) Make sure any generated clang reproducers end up as artifacts. 
4) Set up llvm-symbolizer environment variable so that its preferred over any symbolizer just built, as it can be much slower when built for debugging. 5) Add all projects as dependencies of `.ci`, to make sure everything is tested when it changes. --- .ci/compute_projects.py | 10 +++-- .ci/compute_projects_test.py | 13 +++++++ .ci/monolithic-linux.sh | 72 ++++++++++++++++-------------------- lldb/test/requirements.txt | 2 + 4 files changed, 53 insertions(+), 44 deletions(-) diff --git a/.ci/compute_projects.py b/.ci/compute_projects.py index ff43547c9bbe5..17a2136a270d5 100644 --- a/.ci/compute_projects.py +++ b/.ci/compute_projects.py @@ -52,6 +52,9 @@ "clang": {"clang-tools-extra", "compiler-rt", "cross-project-tests"}, "clang-tools-extra": {"libc"}, "mlir": {"flang"}, + # Test everything if ci scripts are changed. + # FIXME: Figure out what is missing and add here. + ".ci": {"llvm", "clang", "lld", "lldb"}, } DEPENDENT_RUNTIMES_TO_TEST = {"clang": {"libcxx", "libcxxabi", "libunwind"}} @@ -130,12 +133,11 @@ def _add_dependencies(projects: Set[str]) -> Set[str]: def _compute_projects_to_test(modified_projects: Set[str], platform: str) -> Set[str]: projects_to_test = set() for modified_project in modified_projects: - # Skip all projects where we cannot run tests. - if modified_project not in PROJECT_CHECK_TARGETS: - continue if modified_project in RUNTIMES: continue - projects_to_test.add(modified_project) + # Skip all projects where we cannot run tests. 
+ if modified_project in PROJECT_CHECK_TARGETS: + projects_to_test.add(modified_project) if modified_project not in DEPENDENTS_TO_TEST: continue for dependent_project in DEPENDENTS_TO_TEST[modified_project]: diff --git a/.ci/compute_projects_test.py b/.ci/compute_projects_test.py index e787fd8133c86..1ab1c82498932 100644 --- a/.ci/compute_projects_test.py +++ b/.ci/compute_projects_test.py @@ -188,6 +188,19 @@ def test_exclude_gn(self): self.assertEqual(env_variables["runtimes_to_build"], "") self.assertEqual(env_variables["runtimes_check_targets"], "") + def test_ci(self): + env_variables = compute_projects.get_env_variables( + [".ci/compute_projects.py"], "Linux" + ) + self.assertEqual(env_variables["projects_to_build"], + "clang;lld;llvm;lldb") + self.assertEqual(env_variables["project_check_targets"], "check-clang + check-lld check-llvm check-lldb") + self.assertEqual(env_variables["runtimes_to_build"], + "libcxx;libcxxabi;libunwind") + self.assertEqual(env_variables["runtimes_check_targets"], "check-cxx + check-cxxabi check-unwind") + if __name__ == "__main__": unittest.main() diff --git a/.ci/monolithic-linux.sh b/.ci/monolithic-linux.sh index 6461c9d40ad59..f81a14cca6cb3 100755 --- a/.ci/monolithic-linux.sh +++ b/.ci/monolithic-linux.sh @@ -18,7 +18,6 @@ set -o pipefail MONOREPO_ROOT="${MONOREPO_ROOT:="$(git rev-parse --show-toplevel)"}" BUILD_DIR="${BUILD_DIR:=${MONOREPO_ROOT}/build}" -INSTALL_DIR="${BUILD_DIR}/install" rm -rf "${BUILD_DIR}" ccache --zero-stats @@ -28,10 +27,14 @@ if [[ -n "${CLEAR_CACHE:-}" ]]; then ccache --clear fi +mkdir -p artifacts/reproducers + +# Make sure any clang reproducers will end up as artifacts. +export CLANG_CRASH_DIAGNOSTICS_DIR=`realpath artifacts/reproducers` + function at-exit { retcode=$? 
- mkdir -p artifacts ccache --print-stats > artifacts/ccache_stats.txt cp "${BUILD_DIR}"/.ninja_log artifacts/.ninja_log @@ -50,17 +53,28 @@ trap at-exit EXIT projects="${1}" targets="${2}" +runtimes="${3}" lit_args="-v --xunit-xml-output ${BUILD_DIR}/test-results.xml --use-unique-output-file-name --timeout=1200 --time-tests" echo "--- cmake" + export PIP_BREAK_SYSTEM_PACKAGES=1 pip install -q -r "${MONOREPO_ROOT}"/mlir/python/requirements.txt pip install -q -r "${MONOREPO_ROOT}"/lldb/test/requirements.txt pip install -q -r "${MONOREPO_ROOT}"/.ci/requirements.txt + +# Set the system llvm-symbolizer as preferred. +export LLVM_SYMBOLIZER_PATH=`which llvm-symbolizer` +[[ ! -f "${LLVM_SYMBOLIZER_PATH}" ]] && echo "llvm-symbolizer not found!" + +# Set up all runtimes either way. libcxx is a dependency of LLDB. +# If it ends up being unused, not much harm. cmake -S "${MONOREPO_ROOT}"/llvm -B "${BUILD_DIR}" \ -D LLVM_ENABLE_PROJECTS="${projects}" \ + -D LLVM_ENABLE_RUNTIMES="libcxx;libcxxabi;libunwind" \ -G Ninja \ + -D CMAKE_PREFIX_PATH="${HOME}/.local" \ -D CMAKE_BUILD_TYPE=Release \ -D LLVM_ENABLE_ASSERTIONS=ON \ -D LLVM_BUILD_EXAMPLES=ON \ @@ -69,69 +83,47 @@ cmake -S "${MONOREPO_ROOT}"/llvm -B "${BUILD_DIR}" \ -D LLVM_ENABLE_LLD=ON \ -D CMAKE_CXX_FLAGS=-gmlt \ -D LLVM_CCACHE_BUILD=ON \ + -D LIBCXX_CXX_ABI=libcxxabi \ -D MLIR_ENABLE_BINDINGS_PYTHON=ON \ - -D CMAKE_INSTALL_PREFIX="${INSTALL_DIR}" + -D LLDB_ENABLE_PYTHON=ON \ + -D LLDB_ENFORCE_STRICT_TEST_REQUIREMENTS=ON echo "--- ninja" + # Targets are not escaped as they are passed as separate arguments. ninja -C "${BUILD_DIR}" -k 0 ${targets} -runtimes="${3}" runtime_targets="${4}" -# Compiling runtimes with just-built Clang and running their tests -# as an additional testing for Clang. +# Run runtimes tests. 
+# We don't need to do a clean separate build of runtimes, because runtimes +# will be built against just built clang, and because LIBCXX_TEST_PARAMS +# and LIBCXXABI_TEST_PARAMS only affect lit configuration, which successfully +# propagates without a clean build. Other that those two variables, builds +# are supposed to be the same. if [[ "${runtimes}" != "" ]]; then if [[ "${runtime_targets}" == "" ]]; then echo "Runtimes to build are specified, but targets are not." exit 1 fi - echo "--- ninja install-clang" - - ninja -C ${BUILD_DIR} install-clang install-clang-resource-headers - - RUNTIMES_BUILD_DIR="${MONOREPO_ROOT}/build-runtimes" - INSTALL_DIR="${BUILD_DIR}/install" - mkdir -p ${RUNTIMES_BUILD_DIR} - echo "--- cmake runtimes C++26" - rm -rf "${RUNTIMES_BUILD_DIR}" - cmake -S "${MONOREPO_ROOT}/runtimes" -B "${RUNTIMES_BUILD_DIR}" -GNinja \ - -D CMAKE_C_COMPILER="${INSTALL_DIR}/bin/clang" \ - -D CMAKE_CXX_COMPILER="${INSTALL_DIR}/bin/clang++" \ - -D LLVM_ENABLE_RUNTIMES="${runtimes}" \ - -D LIBCXX_CXX_ABI=libcxxabi \ - -D CMAKE_BUILD_TYPE=RelWithDebInfo \ - -D CMAKE_INSTALL_PREFIX="${INSTALL_DIR}" \ + cmake -S "${MONOREPO_ROOT}"/llvm -B "${BUILD_DIR}" \ -D LIBCXX_TEST_PARAMS="std=c++26" \ - -D LIBCXXABI_TEST_PARAMS="std=c++26" \ - -D LLVM_LIT_ARGS="${lit_args}" + -D LIBCXXABI_TEST_PARAMS="std=c++26" echo "--- ninja runtimes C++26" - ninja -vC "${RUNTIMES_BUILD_DIR}" ${runtime_targets} + ninja -vC "${BUILD_DIR}" ${runtime_targets} echo "--- cmake runtimes clang modules" - # We don't need to do a clean build of runtimes, because LIBCXX_TEST_PARAMS - # and LIBCXXABI_TEST_PARAMS only affect lit configuration, which successfully - # propagates without a clean build. Other that those two variables, builds - # are supposed to be the same. 
- - cmake -S "${MONOREPO_ROOT}/runtimes" -B "${RUNTIMES_BUILD_DIR}" -GNinja \ - -D CMAKE_C_COMPILER="${INSTALL_DIR}/bin/clang" \ - -D CMAKE_CXX_COMPILER="${INSTALL_DIR}/bin/clang++" \ - -D LLVM_ENABLE_RUNTIMES="${runtimes}" \ - -D LIBCXX_CXX_ABI=libcxxabi \ - -D CMAKE_BUILD_TYPE=RelWithDebInfo \ - -D CMAKE_INSTALL_PREFIX="${INSTALL_DIR}" \ + cmake -S "${MONOREPO_ROOT}"/llvm -B "${BUILD_DIR}" \ -D LIBCXX_TEST_PARAMS="enable_modules=clang" \ - -D LIBCXXABI_TEST_PARAMS="enable_modules=clang" \ - -D LLVM_LIT_ARGS="${lit_args}" + -D LIBCXXABI_TEST_PARAMS="enable_modules=clang" echo "--- ninja runtimes clang modules" - ninja -vC "${RUNTIMES_BUILD_DIR}" ${runtime_targets} + ninja -vC "${BUILD_DIR}" ${runtime_targets} fi diff --git a/lldb/test/requirements.txt b/lldb/test/requirements.txt index 708f2327a6a04..c187180697d0b 100644 --- a/lldb/test/requirements.txt +++ b/lldb/test/requirements.txt @@ -5,3 +5,5 @@ psutil>=5.9.4 # See llvm.org/pr22274. pexpect>=4.9.0; sys_platform != 'win32' packaging +# Required for python tests +swig From 289baf1f42c8b5773271b611cd235d4ab94bb4e8 Mon Sep 17 00:00:00 2001 From: Nathan Ridge Date: Tue, 15 Apr 2025 12:04:35 -0400 Subject: [PATCH 005/710] [clang][AST] Handle implicit first argument in CallExpr::getBeginLoc() (#135757) Fixes https://github.com/llvm/llvm-project/issues/135522 --- clang/docs/ReleaseNotes.rst | 2 ++ clang/lib/AST/Expr.cpp | 7 +++++-- clang/test/SemaCXX/cxx2b-deducing-this.cpp | 7 +++++++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 2d2606085998c..c106148855436 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -411,6 +411,8 @@ Bug Fixes in This Version - ``#embed`` directive now diagnoses use of a non-character file (device file) such as ``/dev/urandom`` as an error. This restriction may be relaxed in the future. See (#GH126629). 
+- Fixed a clang 20 regression where diagnostics attached to some calls to member functions + using C++23 "deducing this" did not have a diagnostic location (#GH135522) Bug Fixes to Compiler Builtins ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp index 5fab2c73f214b..59c0e47c7c195 100644 --- a/clang/lib/AST/Expr.cpp +++ b/clang/lib/AST/Expr.cpp @@ -1652,8 +1652,11 @@ SourceLocation CallExpr::getBeginLoc() const { if (!isTypeDependent()) { if (const auto *Method = dyn_cast_if_present(getCalleeDecl()); - Method && Method->isExplicitObjectMemberFunction()) - return getArg(0)->getBeginLoc(); + Method && Method->isExplicitObjectMemberFunction()) { + if (auto FirstArgLoc = getArg(0)->getBeginLoc(); FirstArgLoc.isValid()) { + return FirstArgLoc; + } + } } SourceLocation begin = getCallee()->getBeginLoc(); diff --git a/clang/test/SemaCXX/cxx2b-deducing-this.cpp b/clang/test/SemaCXX/cxx2b-deducing-this.cpp index 6f17ce7275456..7e392213710a4 100644 --- a/clang/test/SemaCXX/cxx2b-deducing-this.cpp +++ b/clang/test/SemaCXX/cxx2b-deducing-this.cpp @@ -1134,3 +1134,10 @@ struct S { static_assert((S{} << 11) == a); // expected-error@-1 {{use of undeclared identifier 'a'}} } + +namespace GH135522 { +struct S { + auto f(this auto) -> S; + bool g() { return f(); } // expected-error {{no viable conversion from returned value of type 'S' to function return type 'bool'}} +}; +} From d41e517748e2dbb51e27bed217f3dd7a4c4fb86f Mon Sep 17 00:00:00 2001 From: Han-Kuan Chen Date: Wed, 16 Apr 2025 00:08:59 +0800 Subject: [PATCH 006/710] [SLP] Make getSameOpcode support interchangeable instructions. (#135797) We use the term "interchangeable instructions" to refer to different operators that have the same meaning (e.g., `add x, 0` is equivalent to `mul x, 1`). Non-constant values are not supported, as they may incur high costs with little benefit. 
--------- Co-authored-by: Alexey Bataev --- .../Transforms/Vectorize/SLPVectorizer.cpp | 462 ++++++++++++++++-- .../SLPVectorizer/AArch64/vec3-base.ll | 8 +- ...reversed-strided-node-with-external-ptr.ll | 7 +- .../SLPVectorizer/RISCV/vec3-base.ll | 8 +- .../X86/BinOpSameOpcodeHelper.ll | 36 ++ .../SLPVectorizer/X86/barriercall.ll | 4 +- .../X86/bottom-to-top-reorder.ll | 11 +- .../buildvector-postpone-for-dependency.ll | 8 +- .../SLPVectorizer/X86/bv-shuffle-mask.ll | 4 +- .../X86/extract-scalar-from-undef.ll | 28 +- .../SLPVectorizer/X86/extractcost.ll | 4 +- ...gathered-delayed-nodes-with-reused-user.ll | 34 +- .../X86/minbitwidth-drop-wrapping-flags.ll | 4 +- .../X86/multi-extracts-bv-combined.ll | 6 +- .../non-scheduled-inst-reused-as-last-inst.ll | 44 +- .../SLPVectorizer/X86/propagate_ir_flags.ll | 12 +- .../reduced-val-vectorized-in-transform.ll | 6 +- .../X86/reorder_diamond_match.ll | 4 +- .../X86/shuffle-mask-emission.ll | 8 +- .../Transforms/SLPVectorizer/X86/vec3-base.ll | 19 +- .../X86/vect_copyable_in_binops.ll | 8 +- .../alternate-opcode-sindle-bv.ll | 35 +- .../Transforms/SLPVectorizer/bbi-106161.ll | 19 + .../Transforms/SLPVectorizer/isOpcodeOrAlt.ll | 61 +++ .../resized-alt-shuffle-after-minbw.ll | 4 +- .../SLPVectorizer/shuffle-mask-resized.ll | 4 +- 26 files changed, 647 insertions(+), 201 deletions(-) create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/BinOpSameOpcodeHelper.ll create mode 100644 llvm/test/Transforms/SLPVectorizer/bbi-106161.ll create mode 100644 llvm/test/Transforms/SLPVectorizer/isOpcodeOrAlt.ll diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index cc775e4b260dc..253933a2438cd 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -599,6 +599,28 @@ static std::optional getElementIndex(const Value *Inst, return Index; } +/// \returns true if all of the values in \p VL use the same opcode. 
+/// For comparison instructions, also checks if predicates match. +/// PoisonValues are considered matching. +/// Interchangeable instructions are not considered. +static bool allSameOpcode(ArrayRef VL) { + auto *It = find_if(VL, IsaPred); + if (It == VL.end()) + return true; + Instruction *MainOp = cast(*It); + unsigned Opcode = MainOp->getOpcode(); + bool IsCmpOp = isa(MainOp); + CmpInst::Predicate BasePred = IsCmpOp ? cast(MainOp)->getPredicate() + : CmpInst::BAD_ICMP_PREDICATE; + return std::all_of(It, VL.end(), [&](Value *V) { + if (auto *CI = dyn_cast(V)) + return BasePred == CI->getPredicate(); + if (auto *I = dyn_cast(V)) + return I->getOpcode() == Opcode; + return isa(V); + }); +} + namespace { /// Specifies the way the mask should be analyzed for undefs/poisonous elements /// in the shuffle mask. @@ -814,6 +836,272 @@ static std::optional getExtractIndex(const Instruction *E) { } namespace { +/// \returns true if \p Opcode is allowed as part of the main/alternate +/// instruction for SLP vectorization. +/// +/// Example of unsupported opcode is SDIV that can potentially cause UB if the +/// "shuffled out" lane would result in division by zero. +bool isValidForAlternation(unsigned Opcode) { + return !Instruction::isIntDivRem(Opcode); +} + +/// Helper class that determines VL can use the same opcode. +/// Alternate instruction is supported. In addition, it supports interchangeable +/// instruction. An interchangeable instruction is an instruction that can be +/// converted to another instruction with same semantics. For example, x << 1 is +/// equal to x * 2. x * 1 is equal to x | 0. +class BinOpSameOpcodeHelper { + using MaskType = std::uint_fast16_t; + /// Sort SupportedOp because it is used by binary_search. 
+ constexpr static std::initializer_list SupportedOp = { + Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl, + Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor}; + enum : MaskType { + ShlBIT = 0b1, + AShrBIT = 0b10, + MulBIT = 0b100, + AddBIT = 0b1000, + SubBIT = 0b10000, + AndBIT = 0b100000, + OrBIT = 0b1000000, + XorBIT = 0b10000000, + MainOpBIT = 0b100000000, + LLVM_MARK_AS_BITMASK_ENUM(MainOpBIT) + }; + /// Return a non-nullptr if either operand of I is a ConstantInt. + /// The second return value represents the operand position. We check the + /// right-hand side first (1). If the right hand side is not a ConstantInt and + /// the instruction is neither Sub, Shl, nor AShr, we then check the left hand + /// side (0). + static std::pair + isBinOpWithConstantInt(const Instruction *I) { + unsigned Opcode = I->getOpcode(); + assert(binary_search(SupportedOp, Opcode) && "Unsupported opcode."); + (void)SupportedOp; + auto *BinOp = cast(I); + if (auto *CI = dyn_cast(BinOp->getOperand(1))) + return {CI, 1}; + if (Opcode == Instruction::Sub || Opcode == Instruction::Shl || + Opcode == Instruction::AShr) + return {nullptr, 0}; + if (auto *CI = dyn_cast(BinOp->getOperand(0))) + return {CI, 0}; + return {nullptr, 0}; + } + struct InterchangeableInfo { + const Instruction *I = nullptr; + /// The bit it sets represents whether MainOp can be converted to. + MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | + MulBIT | AShrBIT | ShlBIT; + /// We cannot create an interchangeable instruction that does not exist in + /// VL. For example, VL [x + 0, y * 1] can be converted to [x << 0, y << 0], + /// but << does not exist in VL. In the end, we convert VL to [x * 1, y * + /// 1]. SeenBefore is used to know what operations have been seen before. + MaskType SeenBefore = 0; + InterchangeableInfo(const Instruction *I) : I(I) {} + /// Return false allows BinOpSameOpcodeHelper to find an alternate + /// instruction. 
Directly setting the mask will destroy the mask state, + /// preventing us from determining which instruction it should convert to. + bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) { + if (Mask & InterchangeableMask) { + SeenBefore |= OpcodeInMaskForm; + Mask &= InterchangeableMask; + return true; + } + return false; + } + bool equal(unsigned Opcode) { + if (Opcode == I->getOpcode()) + return trySet(MainOpBIT, MainOpBIT); + return false; + } + unsigned getOpcode() const { + MaskType Candidate = Mask & SeenBefore; + if (Candidate & MainOpBIT) + return I->getOpcode(); + if (Candidate & ShlBIT) + return Instruction::Shl; + if (Candidate & AShrBIT) + return Instruction::AShr; + if (Candidate & MulBIT) + return Instruction::Mul; + if (Candidate & AddBIT) + return Instruction::Add; + if (Candidate & SubBIT) + return Instruction::Sub; + if (Candidate & AndBIT) + return Instruction::And; + if (Candidate & OrBIT) + return Instruction::Or; + if (Candidate & XorBIT) + return Instruction::Xor; + llvm_unreachable("Cannot find interchangeable instruction."); + } + SmallVector getOperand(const Instruction *To) const { + unsigned ToOpcode = To->getOpcode(); + unsigned FromOpcode = I->getOpcode(); + if (FromOpcode == ToOpcode) + return SmallVector(I->operands()); + assert(binary_search(SupportedOp, ToOpcode) && "Unsupported opcode."); + auto [CI, Pos] = isBinOpWithConstantInt(I); + const APInt &FromCIValue = CI->getValue(); + unsigned FromCIValueBitWidth = FromCIValue.getBitWidth(); + APInt ToCIValue; + switch (FromOpcode) { + case Instruction::Shl: + if (ToOpcode == Instruction::Mul) { + ToCIValue = APInt::getOneBitSet(FromCIValueBitWidth, + FromCIValue.getZExtValue()); + } else { + assert(FromCIValue.isZero() && "Cannot convert the instruction."); + ToCIValue = ToOpcode == Instruction::And + ? 
APInt::getAllOnes(FromCIValueBitWidth) + : APInt::getZero(FromCIValueBitWidth); + } + break; + case Instruction::Mul: + assert(FromCIValue.isPowerOf2() && "Cannot convert the instruction."); + if (ToOpcode == Instruction::Shl) { + ToCIValue = APInt(FromCIValueBitWidth, FromCIValue.logBase2()); + } else { + assert(FromCIValue.isOne() && "Cannot convert the instruction."); + ToCIValue = ToOpcode == Instruction::And + ? APInt::getAllOnes(FromCIValueBitWidth) + : APInt::getZero(FromCIValueBitWidth); + } + break; + case Instruction::Add: + case Instruction::Sub: + if (FromCIValue.isZero()) { + ToCIValue = APInt::getZero(FromCIValueBitWidth); + } else { + assert(is_contained({Instruction::Add, Instruction::Sub}, ToOpcode) && + "Cannot convert the instruction."); + ToCIValue = FromCIValue; + ToCIValue.negate(); + } + break; + case Instruction::And: + assert(FromCIValue.isAllOnes() && "Cannot convert the instruction."); + ToCIValue = ToOpcode == Instruction::Mul + ? APInt::getOneBitSet(FromCIValueBitWidth, 0) + : APInt::getZero(FromCIValueBitWidth); + break; + default: + assert(FromCIValue.isZero() && "Cannot convert the instruction."); + ToCIValue = APInt::getZero(FromCIValueBitWidth); + break; + } + Value *LHS = I->getOperand(1 - Pos); + Constant *RHS = + ConstantInt::get(I->getOperand(Pos)->getType(), ToCIValue); + if (Pos == 1) + return SmallVector({LHS, RHS}); + return SmallVector({RHS, LHS}); + } + }; + InterchangeableInfo MainOp; + InterchangeableInfo AltOp; + bool isValidForAlternation(const Instruction *I) const { + return ::isValidForAlternation(MainOp.I->getOpcode()) && + ::isValidForAlternation(I->getOpcode()); + } + bool initializeAltOp(const Instruction *I) { + if (AltOp.I) + return true; + if (!isValidForAlternation(I)) + return false; + AltOp.I = I; + return true; + } + +public: + BinOpSameOpcodeHelper(const Instruction *MainOp, + const Instruction *AltOp = nullptr) + : MainOp(MainOp), AltOp(AltOp) { + assert(is_sorted(SupportedOp) && "SupportedOp is not 
sorted."); + } + bool add(const Instruction *I) { + assert(isa(I) && + "BinOpSameOpcodeHelper only accepts BinaryOperator."); + unsigned Opcode = I->getOpcode(); + MaskType OpcodeInMaskForm; + // Prefer Shl, AShr, Mul, Add, Sub, And, Or and Xor over MainOp. + switch (Opcode) { + case Instruction::Shl: + OpcodeInMaskForm = ShlBIT; + break; + case Instruction::AShr: + OpcodeInMaskForm = AShrBIT; + break; + case Instruction::Mul: + OpcodeInMaskForm = MulBIT; + break; + case Instruction::Add: + OpcodeInMaskForm = AddBIT; + break; + case Instruction::Sub: + OpcodeInMaskForm = SubBIT; + break; + case Instruction::And: + OpcodeInMaskForm = AndBIT; + break; + case Instruction::Or: + OpcodeInMaskForm = OrBIT; + break; + case Instruction::Xor: + OpcodeInMaskForm = XorBIT; + break; + default: + return MainOp.equal(Opcode) || + (initializeAltOp(I) && AltOp.equal(Opcode)); + } + MaskType InterchangeableMask = OpcodeInMaskForm; + ConstantInt *CI = isBinOpWithConstantInt(I).first; + if (CI) { + constexpr MaskType CanBeAll = + XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT; + const APInt &CIValue = CI->getValue(); + switch (Opcode) { + case Instruction::Shl: + if (CIValue.ult(CIValue.getBitWidth())) + InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT; + break; + case Instruction::Mul: + if (CIValue.isOne()) { + InterchangeableMask = CanBeAll; + break; + } + if (CIValue.isPowerOf2()) + InterchangeableMask = MulBIT | ShlBIT; + break; + case Instruction::Add: + case Instruction::Sub: + InterchangeableMask = CIValue.isZero() ? 
CanBeAll : SubBIT | AddBIT; + break; + case Instruction::And: + if (CIValue.isAllOnes()) + InterchangeableMask = CanBeAll; + break; + default: + if (CIValue.isZero()) + InterchangeableMask = CanBeAll; + break; + } + } + return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) || + (initializeAltOp(I) && + AltOp.trySet(OpcodeInMaskForm, InterchangeableMask)); + } + unsigned getMainOpcode() const { return MainOp.getOpcode(); } + bool hasAltOp() const { return AltOp.I; } + unsigned getAltOpcode() const { + return hasAltOp() ? AltOp.getOpcode() : getMainOpcode(); + } + SmallVector getOperand(const Instruction *I) const { + return MainOp.getOperand(I); + } +}; /// Main data required for vectorization of instructions. class InstructionsState { @@ -861,9 +1149,27 @@ class InstructionsState { /// Some of the instructions in the list have alternate opcodes. bool isAltShuffle() const { return getMainOp() != getAltOp(); } - bool isOpcodeOrAlt(Instruction *I) const { - unsigned CheckedOpcode = I->getOpcode(); - return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode; + /// Checks if the instruction matches either the main or alternate opcode. + /// \returns + /// - MainOp if \param I matches MainOp's opcode directly or can be converted + /// to it + /// - AltOp if \param I matches AltOp's opcode directly or can be converted to + /// it + /// - nullptr if \param I cannot be matched or converted to either opcode + Instruction *getMatchingMainOpOrAltOp(Instruction *I) const { + assert(MainOp && "MainOp cannot be nullptr."); + if (I->getOpcode() == MainOp->getOpcode()) + return MainOp; + // Prefer AltOp instead of interchangeable instruction of MainOp. 
+ assert(AltOp && "AltOp cannot be nullptr."); + if (I->getOpcode() == AltOp->getOpcode()) + return AltOp; + if (!I->isBinaryOp()) + return nullptr; + BinOpSameOpcodeHelper Converter(MainOp); + if (Converter.add(I) && Converter.add(MainOp) && !Converter.hasAltOp()) + return MainOp; + return AltOp; } /// Checks if main/alt instructions are shift operations. @@ -913,23 +1219,41 @@ class InstructionsState { static InstructionsState invalid() { return {nullptr, nullptr}; } }; -} // end anonymous namespace - -/// \returns true if \p Opcode is allowed as part of the main/alternate -/// instruction for SLP vectorization. -/// -/// Example of unsupported opcode is SDIV that can potentially cause UB if the -/// "shuffled out" lane would result in division by zero. -static bool isValidForAlternation(unsigned Opcode) { - if (Instruction::isIntDivRem(Opcode)) - return false; - - return true; +std::pair> +convertTo(Instruction *I, const InstructionsState &S) { + Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I); + assert(SelectedOp && "Cannot convert the instruction."); + if (I->isBinaryOp()) { + BinOpSameOpcodeHelper Converter(I); + return std::make_pair(SelectedOp, Converter.getOperand(SelectedOp)); + } + return std::make_pair(SelectedOp, SmallVector(I->operands())); } +} // end anonymous namespace + static InstructionsState getSameOpcode(ArrayRef VL, const TargetLibraryInfo &TLI); +/// Find an instruction with a specific opcode in VL. +/// \param VL Array of values to search through. Must contain only Instructions +/// and PoisonValues. 
+/// \param Opcode The instruction opcode to search for +/// \returns +/// - The first instruction found with matching opcode +/// - nullptr if no matching instruction is found +Instruction *findInstructionWithOpcode(ArrayRef VL, unsigned Opcode) { + for (Value *V : VL) { + if (isa(V)) + continue; + assert(isa(V) && "Only accepts PoisonValue and Instruction."); + auto *Inst = cast(V); + if (Inst->getOpcode() == Opcode) + return Inst; + } + return nullptr; +} + /// Checks if the provided operands of 2 cmp instructions are compatible, i.e. /// compatible instructions or constants, or just some other regular values. static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, @@ -993,6 +1317,7 @@ static InstructionsState getSameOpcode(ArrayRef VL, unsigned Opcode = MainOp->getOpcode(); unsigned AltOpcode = Opcode; + BinOpSameOpcodeHelper BinOpHelper(MainOp); bool SwappedPredsCompatible = IsCmpOp && [&]() { SetVector UniquePreds, UniqueNonSwappedPreds; UniquePreds.insert(BasePred); @@ -1039,14 +1364,8 @@ static InstructionsState getSameOpcode(ArrayRef VL, return InstructionsState::invalid(); unsigned InstOpcode = I->getOpcode(); if (IsBinOp && isa(I)) { - if (InstOpcode == Opcode || InstOpcode == AltOpcode) + if (BinOpHelper.add(I)) continue; - if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) && - isValidForAlternation(Opcode)) { - AltOpcode = InstOpcode; - AltOp = I; - continue; - } } else if (IsCastOp && isa(I)) { Value *Op0 = MainOp->getOperand(0); Type *Ty0 = Op0->getType(); @@ -1147,7 +1466,22 @@ static InstructionsState getSameOpcode(ArrayRef VL, return InstructionsState::invalid(); } - return InstructionsState(MainOp, AltOp); + if (IsBinOp) { + MainOp = findInstructionWithOpcode(VL, BinOpHelper.getMainOpcode()); + assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper."); + AltOp = findInstructionWithOpcode(VL, BinOpHelper.getAltOpcode()); + assert(MainOp && "Cannot find AltOp with Opcode from BinOpHelper."); + } + 
assert((MainOp == AltOp || !allSameOpcode(VL)) && + "Incorrect implementation of allSameOpcode."); + InstructionsState S(MainOp, AltOp); + assert(all_of(VL, + [&](Value *V) { + return isa(V) || + S.getMatchingMainOpOrAltOp(cast(V)); + }) && + "Invalid InstructionsState."); + return S; } /// \returns true if all of the values in \p VL have the same type or false @@ -2560,11 +2894,11 @@ class BoUpSLP { // Since operand reordering is performed on groups of commutative // operations or alternating sequences (e.g., +, -), we can safely tell // the inverse operations by checking commutativity. - bool IsInverseOperation = !isCommutative(cast(V)); + auto [SelectedOp, Ops] = convertTo(cast(VL[Lane]), S); + bool IsInverseOperation = !isCommutative(SelectedOp); for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { bool APO = (OpIdx == 0) ? false : IsInverseOperation; - OpsVec[OpIdx][Lane] = {cast(V)->getOperand(OpIdx), APO, - false}; + OpsVec[OpIdx][Lane] = {Ops[OpIdx], APO, false}; } } } @@ -3542,14 +3876,16 @@ class BoUpSLP { /// Some of the instructions in the list have alternate opcodes. bool isAltShuffle() const { return S.isAltShuffle(); } - bool isOpcodeOrAlt(Instruction *I) const { return S.isOpcodeOrAlt(I); } + Instruction *getMatchingMainOpOrAltOp(Instruction *I) const { + return S.getMatchingMainOpOrAltOp(I); + } /// Chooses the correct key for scheduling data. If \p Op has the same (or /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is /// \p OpValue. Value *isOneOf(Value *Op) const { auto *I = dyn_cast(Op); - if (I && isOpcodeOrAlt(I)) + if (I && getMatchingMainOpOrAltOp(I)) return Op; return S.getMainOp(); } @@ -8428,11 +8764,15 @@ static std::pair generateKeySubkey( return std::make_pair(Key, SubKey); } +/// Checks if the specified instruction \p I is an main operation for the given +/// \p MainOp and \p AltOp instructions. 
+static bool isMainInstruction(Instruction *I, Instruction *MainOp, + Instruction *AltOp, const TargetLibraryInfo &TLI); + /// Checks if the specified instruction \p I is an alternate operation for /// the given \p MainOp and \p AltOp instructions. -static bool isAlternateInstruction(const Instruction *I, - const Instruction *MainOp, - const Instruction *AltOp, +static bool isAlternateInstruction(Instruction *I, Instruction *MainOp, + Instruction *AltOp, const TargetLibraryInfo &TLI); bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S, @@ -9245,7 +9585,8 @@ bool BoUpSLP::canBuildSplitNode(ArrayRef VL, continue; } if ((LocalState.getAltOpcode() != LocalState.getOpcode() && - I->getOpcode() == LocalState.getOpcode()) || + isMainInstruction(I, LocalState.getMainOp(), LocalState.getAltOp(), + *TLI)) || (LocalState.getAltOpcode() == LocalState.getOpcode() && !isAlternateInstruction(I, LocalState.getMainOp(), LocalState.getAltOp(), *TLI))) { @@ -10344,9 +10685,14 @@ void BoUpSLP::TreeEntry::buildAltOpShuffleMask( } } -static bool isAlternateInstruction(const Instruction *I, - const Instruction *MainOp, - const Instruction *AltOp, +static bool isMainInstruction(Instruction *I, Instruction *MainOp, + Instruction *AltOp, + const TargetLibraryInfo &TLI) { + return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == MainOp; +} + +static bool isAlternateInstruction(Instruction *I, Instruction *MainOp, + Instruction *AltOp, const TargetLibraryInfo &TLI) { if (auto *MainCI = dyn_cast(MainOp)) { auto *AltCI = cast(AltOp); @@ -10366,7 +10712,7 @@ static bool isAlternateInstruction(const Instruction *I, "their swap."); return MainP != P && MainP != SwappedP; } - return I->getOpcode() == AltOp->getOpcode(); + return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == AltOp; } TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef Ops) { @@ -11129,7 +11475,9 @@ void BoUpSLP::transformNodes() { // same opcode and same parent block or all 
constants. if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) || !(!E.hasState() || E.getOpcode() == Instruction::Load || - E.isAltShuffle() || !allSameBlock(VL)) || + // We use allSameOpcode instead of isAltShuffle because we don't + // want to use interchangeable instruction here. + !allSameOpcode(VL) || !allSameBlock(VL)) || allConstant(VL) || isSplat(VL)) continue; if (ForceLoadGather && E.hasState() && E.getOpcode() == Instruction::Load) @@ -11174,7 +11522,7 @@ void BoUpSLP::transformNodes() { if (IsSplat) continue; InstructionsState S = getSameOpcode(Slice, *TLI); - if (!S || S.isAltShuffle() || !allSameBlock(Slice) || + if (!S || !allSameOpcode(Slice) || !allSameBlock(Slice) || (S.getOpcode() == Instruction::Load && areKnownNonVectorizableLoads(Slice)) || (S.getOpcode() != Instruction::Load && @@ -12974,14 +13322,22 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, if (isa(UniqueValues[Idx])) return InstructionCost(TTI::TCC_Free); - auto *VI = cast(UniqueValues[Idx]); - unsigned OpIdx = isa(VI) ? 0 : 1; - TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0)); - TTI::OperandValueInfo Op2Info = - TTI::getOperandInfo(VI->getOperand(OpIdx)); - SmallVector Operands(VI->operand_values()); + // We cannot retrieve the operand from UniqueValues[Idx] because an + // interchangeable instruction may be used. The order and the actual + // operand might differ from what is retrieved from UniqueValues[Idx]. 
+ Value *Op1 = E->getOperand(0)[Idx]; + Value *Op2; + SmallVector Operands(1, Op1); + if (isa(UniqueValues[Idx])) { + Op2 = Op1; + } else { + Op2 = E->getOperand(1)[Idx]; + Operands.push_back(Op2); + } + TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(Op1); + TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(Op2); return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind, - Op1Info, Op2Info, Operands, VI); + Op1Info, Op2Info, Operands); }; auto GetVectorCost = [=](InstructionCost CommonCost) { if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) { @@ -13211,7 +13567,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, return InstructionCost(TTI::TCC_Free); auto *VI = cast(UniqueValues[Idx]); - assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode"); + assert(E->getMatchingMainOpOrAltOp(VI) && + "Unexpected main/alternate opcode"); (void)E; return TTI->getInstructionCost(VI, CostKind); }; @@ -13279,7 +13636,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, SmallVector Mask; E->buildAltOpShuffleMask( [&](Instruction *I) { - assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); + assert(E->getMatchingMainOpOrAltOp(I) && + "Unexpected main/alternate opcode"); return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(), *TLI); }, @@ -15441,7 +15799,8 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { !isa(V)) return true; auto *I = dyn_cast(V); - return !I || !E->isOpcodeOrAlt(I) || I->getParent() == BB || + return !I || !E->getMatchingMainOpOrAltOp(I) || + I->getParent() == BB || isVectorLikeInstWithConstOps(I); })) && "Expected gathered loads or GEPs or instructions from same basic " @@ -17585,7 +17944,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *V = Builder.CreateBinOp( static_cast(E->getOpcode()), LHS, RHS); - propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end()); + propagateIRFlags(V, E->Scalars, nullptr, It == MinBWs.end()); if (auto 
*I = dyn_cast(V)) { V = ::propagateMetadata(I, E->Scalars); // Drop nuw flags for abs(sub(commutative), true). @@ -18005,7 +18364,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { SmallVector Mask; E->buildAltOpShuffleMask( [E, this](Instruction *I) { - assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); + assert(E->getMatchingMainOpOrAltOp(I) && + "Unexpected main/alternate opcode"); return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(), *TLI); }, @@ -21796,7 +22156,7 @@ class HorizontalReduction { // Also check if the instruction was folded to constant/other value. auto *Inst = dyn_cast(RdxVal); if ((Inst && isVectorLikeInstWithConstOps(Inst) && - (!S || !S.isOpcodeOrAlt(Inst))) || + (!S || !S.getMatchingMainOpOrAltOp(Inst))) || (S && !Inst)) continue; Candidates.push_back(RdxVal); diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll index feb4ad865f314..d527d38adbee3 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll @@ -314,10 +314,10 @@ define void @store_try_reorder(ptr %dst) { ; ; POW2-ONLY-LABEL: @store_try_reorder( ; POW2-ONLY-NEXT: entry: -; POW2-ONLY-NEXT: [[ADD:%.*]] = add i32 0, 0 -; POW2-ONLY-NEXT: store i32 [[ADD]], ptr [[DST:%.*]], align 4 -; POW2-ONLY-NEXT: [[ARRAYIDX_I1887:%.*]] = getelementptr i32, ptr [[DST]], i64 1 -; POW2-ONLY-NEXT: store <2 x i32> zeroinitializer, ptr [[ARRAYIDX_I1887]], align 4 +; POW2-ONLY-NEXT: store <2 x i32> zeroinitializer, ptr [[ARRAYIDX_I1887:%.*]], align 4 +; POW2-ONLY-NEXT: [[ADD216:%.*]] = sub i32 0, 0 +; POW2-ONLY-NEXT: [[ARRAYIDX_I1891:%.*]] = getelementptr i32, ptr [[ARRAYIDX_I1887]], i64 2 +; POW2-ONLY-NEXT: store i32 [[ADD216]], ptr [[ARRAYIDX_I1891]], align 4 ; POW2-ONLY-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll 
b/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll index fd3d4ab80b29c..ff897180cc9b7 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll @@ -7,13 +7,12 @@ define void @test(ptr %a, i64 %0) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[A]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> , i64 [[TMP0]], i32 0 ; CHECK-NEXT: br label %[[BB:.*]] ; CHECK: [[BB]]: -; CHECK-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 0, i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = or disjoint <2 x i64> [[TMP3]], ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr double, <2 x ptr> [[TMP2]], <2 x i64> [[TMP5]] -; CHECK-NEXT: [[ARRAYIDX17_I28_1:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[ARRAYIDX17_I28_1:%.*]] = extractelement <2 x ptr> [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> [[TMP6]], i32 8, <2 x i1> splat (i1 true), <2 x double> poison) ; CHECK-NEXT: [[TMP8:%.*]] = load <2 x double>, ptr [[A]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = load <2 x double>, ptr [[A]], align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll index 7ab5e4d6cb787..481d586e6658a 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll @@ -324,10 +324,10 @@ define void @store_try_reorder(ptr %dst) { ; ; POW2-ONLY-LABEL: @store_try_reorder( ; POW2-ONLY-NEXT: entry: -; POW2-ONLY-NEXT: [[ADD:%.*]] = add i32 0, 0 -; 
POW2-ONLY-NEXT: store i32 [[ADD]], ptr [[DST:%.*]], align 4 -; POW2-ONLY-NEXT: [[ARRAYIDX_I1887:%.*]] = getelementptr i32, ptr [[DST]], i64 1 -; POW2-ONLY-NEXT: store <2 x i32> zeroinitializer, ptr [[ARRAYIDX_I1887]], align 4 +; POW2-ONLY-NEXT: store <2 x i32> zeroinitializer, ptr [[ARRAYIDX_I1887:%.*]], align 4 +; POW2-ONLY-NEXT: [[ADD216:%.*]] = sub i32 0, 0 +; POW2-ONLY-NEXT: [[ARRAYIDX_I1891:%.*]] = getelementptr i32, ptr [[ARRAYIDX_I1887]], i64 2 +; POW2-ONLY-NEXT: store i32 [[ADD216]], ptr [[ARRAYIDX_I1891]], align 4 ; POW2-ONLY-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/BinOpSameOpcodeHelper.ll b/llvm/test/Transforms/SLPVectorizer/X86/BinOpSameOpcodeHelper.ll new file mode 100644 index 0000000000000..6f27555aeb3f1 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/BinOpSameOpcodeHelper.ll @@ -0,0 +1,36 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -mtriple=x86_64-unknown-linux-gnu -passes=slp-vectorizer -S %s | FileCheck %s + +define void @test() { +; CHECK-LABEL: @test( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = lshr i64 0, 0 +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 0, 1 +; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 0 +; CHECK-NEXT: [[UMIN120:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP0]], i64 [[TMP2]]) +; CHECK-NEXT: [[TMP3:%.*]] = sub i64 0, 0 +; CHECK-NEXT: [[TMP4:%.*]] = lshr i64 [[TMP3]], 0 +; CHECK-NEXT: [[UMIN122:%.*]] = call i64 @llvm.umin.i64(i64 [[UMIN120]], i64 [[TMP4]]) +; CHECK-NEXT: [[TMP5:%.*]] = add i64 0, 1 +; CHECK-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 0 +; CHECK-NEXT: [[UMIN123:%.*]] = call i64 @llvm.umin.i64(i64 [[UMIN122]], i64 [[TMP6]]) +; CHECK-NEXT: [[UMIN124:%.*]] = call i64 @llvm.umin.i64(i64 [[UMIN123]], i64 0) +; CHECK-NEXT: ret void +; +entry: + %0 = mul i64 0, 0 + %1 = lshr i64 %0, 0 + %2 = sub i64 0, 1 + %3 = lshr i64 %2, 0 + %umin120 = call i64 @llvm.umin.i64(i64 %1, i64 %3) + %4 = sub i64 0, 0 + %5 = lshr i64 %4, 0 + %umin122 = call 
i64 @llvm.umin.i64(i64 %umin120, i64 %5) + %6 = add i64 0, 1 + %7 = lshr i64 %6, 0 + %umin123 = call i64 @llvm.umin.i64(i64 %umin122, i64 %7) + %umin124 = call i64 @llvm.umin.i64(i64 %umin123, i64 0) + ret void +} + +declare i64 @llvm.umin.i64(i64, i64) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll b/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll index f46a5d84a86cc..a39e602e2da71 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll @@ -10,9 +10,7 @@ define i32 @foo(ptr nocapture %A, i32 %n) { ; CHECK-NEXT: [[CALL:%.*]] = tail call i32 (...) @bar() ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i32 0 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = mul nsw <4 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP2:%.*]] = shl <4 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[SHUFFLE]], ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], splat (i32 9) ; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[A:%.*]], align 4 ; CHECK-NEXT: ret i32 undef diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll index 889f5a95c81d6..299677ca80b34 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll @@ -10,15 +10,10 @@ define void @test(ptr %0, ptr %1, ptr %2) { ; CHECK-NEXT: [[TMP11:%.*]] = sub <4 x i32> , [[TMP8]] ; CHECK-NEXT: [[TMP12:%.*]] = sub <4 x i32> [[TMP11]], [[TMP10]] ; CHECK-NEXT: [[TMP13:%.*]] = add <4 x i32> [[TMP12]], [[TMP6]] -; CHECK-NEXT: [[TMP14:%.*]] = add <4 x i32> [[TMP13]], -; CHECK-NEXT: [[TMP15:%.*]] = sub <4 x i32> [[TMP13]], -; CHECK-NEXT: [[TMP16:%.*]] = 
shufflevector <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = add <4 x i32> , [[TMP13]] ; CHECK-NEXT: [[TMP17:%.*]] = add <4 x i32> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = sub <4 x i32> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> [[TMP18]], <4 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = add <4 x i32> [[TMP19]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = sub <4 x i32> [[TMP19]], zeroinitializer -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i32> [[TMP20]], <4 x i32> [[TMP21]], <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = add <4 x i32> [[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: store <4 x i32> [[TMP22]], ptr [[TMP2:%.*]], align 4 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-postpone-for-dependency.ll b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-postpone-for-dependency.ll index 43c42c1ea2bfb..03a89e54e4212 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-postpone-for-dependency.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-postpone-for-dependency.ll @@ -8,15 +8,13 @@ define void @test() { ; CHECK: [[BB1:.*]]: ; CHECK-NEXT: br label %[[BB2:.*]] ; CHECK: [[BB2]]: -; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x i32> [ poison, %[[BB1]] ], [ [[TMP5:%.*]], %[[BB6]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x i32> [ poison, %[[BB1]] ], [ [[TMP4:%.*]], %[[BB6]] ] ; CHECK-NEXT: ret void ; CHECK: [[BB6]]: ; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ zeroinitializer, %[[BB]] ], [ [[TMP8:%.*]], %[[BB6]] ] ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> , <4 x i32> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = ashr <4 x i32> zeroinitializer, [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i32> zeroinitializer, 
[[TMP2]] -; CHECK-NEXT: [[TMP5]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> , <4 x i32> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP4]] = mul <4 x i32> [[TMP3]], zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> , <2 x i32> ; CHECK-NEXT: [[TMP8]] = mul <2 x i32> zeroinitializer, [[TMP7]] ; CHECK-NEXT: br i1 false, label %[[BB2]], label %[[BB6]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bv-shuffle-mask.ll b/llvm/test/Transforms/SLPVectorizer/X86/bv-shuffle-mask.ll index 766916fe71f35..c4ddc5d63cc04 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/bv-shuffle-mask.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/bv-shuffle-mask.ll @@ -7,14 +7,12 @@ define i16 @test(i16 %v1, i16 %v2) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> , i16 [[V2]], i32 3 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> , i16 [[V1]], i32 3 -; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i16> [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = and <4 x i16> [[TMP0]], [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i16> [[TMP5]], i16 [[V1]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i16> [[TMP6]], <2 x i16> poison, <4 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i16> [[TMP7]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = and <4 x i16> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = and <4 x i16> [[TMP3]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = and <4 x i16> [[TMP9]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <4 x i16> [[TMP10]], zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = or <4 x i1> [[TMP11]], zeroinitializer diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll index 1c62e57edfc46..514d5f974cb16 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll @@ -4,17 +4,13 @@ define i64 @foo(i32 %tmp7) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> , i32 [[TMP5:%.*]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = sub <4 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = sub i32 undef, 0 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> , i32 [[TMP24]], i32 4 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> [[TMP0]], i32 0, i32 5 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> , i32 [[TMP24]], i32 6 -; CHECK-NEXT: [[TMP12:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> [[TMP3]], i64 0) -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP12]], <8 x i32> [[TMP11]], <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <8 x i32> [[TMP1]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP1]], [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> [[TMP6]], <8 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> , i32 [[TMP8:%.*]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP0]], +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> , <8 x i32> [[TMP4]], <8 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x i32> [[TMP2]], i32 0, i32 5 +; CHECK-NEXT: [[TMP5:%.*]] = sub nsw <8 x i32> [[TMP13]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = add nsw <8 x i32> [[TMP13]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> [[TMP6]], <8 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = add <8 x i32> zeroinitializer, [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i32> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP9]]) @@ -29,7 +25,7 @@ bb: 
%tmp4 = xor i32 %tmp3, 0 %tmp6 = sub i32 0, 0 %tmp8 = sub i32 %tmp7, 0 - %tmp9 = sub nsw i32 0, undef + %tmp9 = sub nsw i32 0, poison %tmp10 = add nsw i32 0, %tmp6 %tmp11 = sub nsw i32 0, %tmp8 %tmp12 = add i32 0, %tmp10 @@ -44,10 +40,10 @@ bb: %tmp21 = add i32 %tmp20, %tmp17 %tmp22 = sub i32 0, 0 %tmp23 = add i32 0, 0 - %tmp24 = sub i32 undef, 0 - %tmp25 = add nsw i32 %tmp23, undef + %tmp24 = sub i32 poison, 0 + %tmp25 = add nsw i32 %tmp23, poison %tmp26 = add nsw i32 %tmp24, %tmp22 - %tmp27 = sub nsw i32 undef, %tmp24 + %tmp27 = sub nsw i32 poison, %tmp24 %tmp28 = add i32 0, %tmp25 %tmp29 = xor i32 %tmp28, 0 %tmp30 = add i32 0, %tmp26 @@ -58,7 +54,7 @@ bb: %tmp35 = add i32 %tmp34, %tmp29 %tmp36 = add i32 %tmp35, 0 %tmp37 = add i32 %tmp36, %tmp33 - %tmp38 = sub nsw i32 0, undef + %tmp38 = sub nsw i32 0, poison %tmp39 = add i32 0, %tmp38 %tmp40 = xor i32 %tmp39, 0 %tmp41 = add i32 0, %tmp37 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll index 02c3173adc654..c6f5308cf54aa 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll @@ -9,9 +9,7 @@ define i32 @foo(ptr nocapture %A, i32 %n, i32 %m) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i32 0 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = mul nsw <4 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP2:%.*]] = shl <4 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[SHUFFLE]], ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], splat (i32 9) ; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[A:%.*]], align 4 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0 diff --git 
a/llvm/test/Transforms/SLPVectorizer/X86/gathered-delayed-nodes-with-reused-user.ll b/llvm/test/Transforms/SLPVectorizer/X86/gathered-delayed-nodes-with-reused-user.ll index b39480b12496b..5a9ea0d292fa0 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/gathered-delayed-nodes-with-reused-user.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/gathered-delayed-nodes-with-reused-user.ll @@ -6,26 +6,26 @@ define i64 @foo() { ; CHECK-LABEL: define i64 @foo() { ; CHECK-NEXT: bb: +; CHECK-NEXT: [[ADD7:%.*]] = add i64 0, 0 ; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: bb1: -; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ [[ADD:%.*]], [[BB3]] ] -; CHECK-NEXT: [[PHI2:%.*]] = phi i64 [ [[TMP9:%.*]], [[BB3]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i64> [ [[TMP5:%.*]], [[BB3]] ] ; CHECK-NEXT: ret i64 0 ; CHECK: bb3: -; CHECK-NEXT: [[PHI5:%.*]] = phi i64 [ 0, [[BB:%.*]] ], [ 0, [[BB3]] ] -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i64> [ zeroinitializer, [[BB]] ], [ [[TMP7:%.*]], [[BB3]] ] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 -; CHECK-NEXT: [[ADD]] = add i64 [[TMP3]], [[TMP2]] -; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr i64, ptr addrspace(1) null, i64 0 -; CHECK-NEXT: [[TMP9]] = or i64 [[PHI5]], 0 -; CHECK-NEXT: [[ICMP:%.*]] = icmp ult i64 [[TMP9]], 0 -; CHECK-NEXT: [[TMP7]] = insertelement <2 x i64> , i64 [[ADD]], i32 0 +; CHECK-NEXT: [[PHI4:%.*]] = phi i64 [ 0, [[BB:%.*]] ], [ 0, [[BB3]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i64> [ zeroinitializer, [[BB]] ], [ [[TMP3:%.*]], [[BB3]] ] +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> , <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> , i64 [[PHI4]], i32 0 +; CHECK-NEXT: [[TMP3]] = add <2 x i64> [[TMP4]], [[TMP2]] +; CHECK-NEXT: [[TMP5]] = add <2 x i64> [[TMP0]], [[TMP2]] +; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr i64, ptr addrspace(1) null, i64 [[ADD7]] +; CHECK-NEXT: [[OR:%.*]] = 
extractelement <2 x i64> [[TMP5]], i32 1 +; CHECK-NEXT: [[ICMP:%.*]] = icmp ult i64 [[OR]], 0 ; CHECK-NEXT: br i1 false, label [[BB3]], label [[BB1:%.*]] ; ; FORCED-LABEL: define i64 @foo() { ; FORCED-NEXT: bb: -; FORCED-NEXT: [[TMP8:%.*]] = add i64 0, 0 +; FORCED-NEXT: [[ADD7:%.*]] = add i64 0, 0 ; FORCED-NEXT: br label [[BB3:%.*]] ; FORCED: bb1: ; FORCED-NEXT: [[TMP0:%.*]] = phi <2 x i64> [ [[TMP5:%.*]], [[BB3]] ] @@ -36,12 +36,10 @@ define i64 @foo() { ; FORCED-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> , <2 x i32> ; FORCED-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> , i64 [[PHI5]], i32 0 ; FORCED-NEXT: [[TMP7]] = add <2 x i64> [[TMP6]], [[TMP2]] -; FORCED-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]] -; FORCED-NEXT: [[TMP4:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]] -; FORCED-NEXT: [[TMP5]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i32> -; FORCED-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr i64, ptr addrspace(1) null, i64 [[TMP8]] -; FORCED-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 -; FORCED-NEXT: [[ICMP:%.*]] = icmp ult i64 [[TMP9]], 0 +; FORCED-NEXT: [[TMP5]] = add <2 x i64> [[TMP1]], [[TMP2]] +; FORCED-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr i64, ptr addrspace(1) null, i64 [[ADD7]] +; FORCED-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; FORCED-NEXT: [[ICMP:%.*]] = icmp ult i64 [[TMP8]], 0 ; FORCED-NEXT: br i1 false, label [[BB3]], label [[BB1:%.*]] ; bb: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-drop-wrapping-flags.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-drop-wrapping-flags.ll index 2a5bfa7390770..0198b1c5cb846 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-drop-wrapping-flags.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-drop-wrapping-flags.ll @@ -9,9 +9,7 @@ define i32 @test() { ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i8> poison, i8 [[A_PROMOTED]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = 
shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i8> [[TMP1]], zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i8> [[TMP1]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP3]], <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i16> [[TMP5]], ; CHECK-NEXT: [[TMP7:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP6]]) ; CHECK-NEXT: [[TMP8:%.*]] = zext i16 [[TMP7]] to i32 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-extracts-bv-combined.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-extracts-bv-combined.ll index e6a166c27ac49..230e165e43edc 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/multi-extracts-bv-combined.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-extracts-bv-combined.ll @@ -8,10 +8,8 @@ define i32 @foo() { ; CHECK-NEXT: [[D:%.*]] = load i32, ptr null, align 4 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> , i32 [[D]], i32 1 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = or <8 x i32> zeroinitializer, [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> zeroinitializer, [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <8 x i32> -; CHECK-NEXT: store <8 x i32> [[TMP4]], ptr getelementptr inbounds ([64 x i32], ptr null, i64 0, i64 15), align 4 +; CHECK-NEXT: [[TMP2:%.*]] = add <8 x i32> zeroinitializer, [[TMP1]] +; CHECK-NEXT: store <8 x i32> [[TMP2]], ptr getelementptr inbounds ([64 x i32], ptr null, i64 0, i64 15), align 4 ; CHECK-NEXT: ret i32 0 ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-scheduled-inst-reused-as-last-inst.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-scheduled-inst-reused-as-last-inst.ll index 
1163c8219dabe..034fe82862950 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/non-scheduled-inst-reused-as-last-inst.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/non-scheduled-inst-reused-as-last-inst.ll @@ -4,6 +4,24 @@ ; RUN: -slp-skip-early-profitability-check < %s | FileCheck %s --check-prefixes=FORCED define void @foo() { +; CHECK-LABEL: define void @foo() { +; CHECK-NEXT: bb: +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 0, i32 0 +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ zeroinitializer, [[BB:%.*]] ], [ [[TMP6:%.*]], [[BB4:%.*]] ] +; CHECK-NEXT: [[TMP2:%.*]] = shl <2 x i32> [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP1]], <2 x i32> +; CHECK-NEXT: [[TMP6]] = or <2 x i32> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0 +; CHECK-NEXT: [[CALL:%.*]] = call i64 null(i32 [[TMP7]]) +; CHECK-NEXT: br label [[BB4]] +; CHECK: bb4: +; CHECK-NEXT: br i1 false, label [[BB5:%.*]], label [[BB1]] +; CHECK: bb5: +; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x i32> [ [[TMP2]], [[BB4]] ] +; CHECK-NEXT: ret void +; ; FORCED-LABEL: define void @foo() { ; FORCED-NEXT: bb: ; FORCED-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 0, i32 0 @@ -11,9 +29,7 @@ define void @foo() { ; FORCED: bb1: ; FORCED-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ zeroinitializer, [[BB:%.*]] ], [ [[TMP6:%.*]], [[BB4:%.*]] ] ; FORCED-NEXT: [[TMP2:%.*]] = shl <2 x i32> [[TMP1]], [[TMP0]] -; FORCED-NEXT: [[TMP3:%.*]] = or <2 x i32> [[TMP1]], [[TMP0]] -; FORCED-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> -; FORCED-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP1]], <2 x i32> +; FORCED-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP1]], <2 x i32> ; FORCED-NEXT: [[TMP6]] = or <2 x i32> [[TMP5]], zeroinitializer ; FORCED-NEXT: [[TMP7:%.*]] = extractelement <2 x 
i32> [[TMP6]], i32 0 ; FORCED-NEXT: [[CALL:%.*]] = call i64 null(i32 [[TMP7]]) @@ -21,29 +37,9 @@ define void @foo() { ; FORCED: bb4: ; FORCED-NEXT: br i1 false, label [[BB5:%.*]], label [[BB1]] ; FORCED: bb5: -; FORCED-NEXT: [[TMP8:%.*]] = phi <2 x i32> [ [[TMP4]], [[BB4]] ] +; FORCED-NEXT: [[TMP8:%.*]] = phi <2 x i32> [ [[TMP2]], [[BB4]] ] ; FORCED-NEXT: ret void ; -; CHECK-LABEL: define void @foo() { -; CHECK-NEXT: bb: -; CHECK-NEXT: br label [[BB1:%.*]] -; CHECK: bb1: -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ zeroinitializer, [[BB:%.*]] ], [ [[TMP6:%.*]], [[BB4:%.*]] ] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 -; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP2]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[SHL]], i32 0 -; CHECK-NEXT: [[TMP6]] = or <2 x i32> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0 -; CHECK-NEXT: [[CALL:%.*]] = call i64 null(i32 [[TMP7]]) -; CHECK-NEXT: br label [[BB4]] -; CHECK: bb4: -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[TMP6]], i32 1 -; CHECK-NEXT: br i1 false, label [[BB5:%.*]], label [[BB1]] -; CHECK: bb5: -; CHECK-NEXT: [[PHI6:%.*]] = phi i32 [ [[SHL]], [[BB4]] ] -; CHECK-NEXT: [[PHI7:%.*]] = phi i32 [ [[TMP8]], [[BB4]] ] -; CHECK-NEXT: ret void -; bb: br label %bb1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/propagate_ir_flags.ll b/llvm/test/Transforms/SLPVectorizer/X86/propagate_ir_flags.ll index cb02f4d10923c..ad8e905a8ca02 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/propagate_ir_flags.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/propagate_ir_flags.ll @@ -330,9 +330,7 @@ define void @only_arcp(ptr %x) { define void @addsub_all_nsw(ptr %x) { ; CHECK-LABEL: @addsub_all_nsw( ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[X:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[TMP2]], splat (i32 1) -; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <4 x i32> [[TMP2]], splat (i32 1) -; 
CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[TMP2]], ; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[X]], align 4 ; CHECK-NEXT: ret void ; @@ -361,9 +359,7 @@ define void @addsub_all_nsw(ptr %x) { define void @addsub_some_nsw(ptr %x) { ; CHECK-LABEL: @addsub_some_nsw( ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[X:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[TMP2]], splat (i32 1) -; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], splat (i32 1) -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP2]], ; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[X]], align 4 ; CHECK-NEXT: ret void ; @@ -392,9 +388,7 @@ define void @addsub_some_nsw(ptr %x) { define void @addsub_no_nsw(ptr %x) { ; CHECK-LABEL: @addsub_no_nsw( ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[X:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP2]], splat (i32 1) -; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], splat (i32 1) -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP2]], ; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[X]], align 4 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll index 81f3bf99f3fd8..7fe6941d52da7 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll @@ -9,16 +9,16 @@ define i32 @test(i1 %cond) { ; CHECK: [[BB]]: ; CHECK-NEXT: [[P1:%.*]] = phi i32 [ [[OR92:%.*]], %[[BB]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ [[TMP8:%.*]], %[[BB]] ], [ zeroinitializer, %[[ENTRY]] ] -; 
CHECK-NEXT: [[TMP1:%.*]] = or i32 1, 0 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> , <4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[P1]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i32> zeroinitializer, [[TMP4]] ; CHECK-NEXT: [[OR92]] = or i32 1, 0 ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> , i32 [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> , i32 [[OR92]], i32 0 +; CHECK-NEXT: [[TMP8]] = xor <2 x i32> [[TMP9]], [[TMP7]] ; CHECK-NEXT: [[OP_RDX:%.*]] = xor i32 [[TMP6]], [[OR92]] -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX]], i32 0 -; CHECK-NEXT: [[TMP8]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP1]], i32 1 ; CHECK-NEXT: br i1 [[COND]], label %[[EXIT:.*]], label %[[BB]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret i32 [[OP_RDX]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll index cda88620ab88a..fff2b72df613e 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll @@ -14,10 +14,8 @@ define void @test() { ; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i16> [[TMP8]], [[TMP7]] ; CHECK-NEXT: [[TMP10:%.*]] = sub <4 x i16> [[TMP8]], [[TMP7]] ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = add <4 x i16> zeroinitializer, [[TMP11]] ; CHECK-NEXT: [[TMP13:%.*]] = sub <4 x i16> zeroinitializer, [[TMP11]] -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = sext <4 x i16> [[TMP14]] to <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = sext <4 x i16> [[TMP13]] to <4 x 
i32> ; CHECK-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP2]], align 16 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shuffle-mask-emission.ll b/llvm/test/Transforms/SLPVectorizer/X86/shuffle-mask-emission.ll index fcc295de62adf..a17ccb4b46ef9 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/shuffle-mask-emission.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/shuffle-mask-emission.ll @@ -6,11 +6,9 @@ define i1 @test() { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[H_PROMOTED118_I_FR:%.*]] = freeze i32 1 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> , i32 [[H_PROMOTED118_I_FR]], i32 2 -; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i32> zeroinitializer, [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> zeroinitializer, [[TMP0]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP3]], <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> zeroinitializer, [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP6:%.*]] = and <4 x i32> [[TMP5]], ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i32> [[TMP6]], ; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP7]]) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll index 6e2a43ac5f9f1..15dd6756cd7db 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll @@ -242,13 +242,18 @@ exit: } define void @store_try_reorder(ptr %dst) { -; CHECK-LABEL: @store_try_reorder( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[ADD:%.*]] = add i32 0, 0 -; CHECK-NEXT: store i32 [[ADD]], ptr [[DST:%.*]], align 4 -; CHECK-NEXT: [[ARRAYIDX_I1887:%.*]] = 
getelementptr i32, ptr [[DST]], i64 1 -; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr [[ARRAYIDX_I1887]], align 4 -; CHECK-NEXT: ret void +; NON-POW2-LABEL: @store_try_reorder( +; NON-POW2-NEXT: entry: +; NON-POW2-NEXT: store <3 x i32> zeroinitializer, ptr [[DST:%.*]], align 4 +; NON-POW2-NEXT: ret void +; +; POW2-ONLY-LABEL: @store_try_reorder( +; POW2-ONLY-NEXT: entry: +; POW2-ONLY-NEXT: store <2 x i32> zeroinitializer, ptr [[DST:%.*]], align 4 +; POW2-ONLY-NEXT: [[ADD216:%.*]] = sub i32 0, 0 +; POW2-ONLY-NEXT: [[ARRAYIDX_I1891:%.*]] = getelementptr i32, ptr [[DST]], i64 2 +; POW2-ONLY-NEXT: store i32 [[ADD216]], ptr [[ARRAYIDX_I1891]], align 4 +; POW2-ONLY-NEXT: ret void ; entry: %add = add i32 0, 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll index 869a9d1aee80e..4f3d551e21122 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll @@ -192,9 +192,7 @@ define void @addsub0(ptr noalias %dst, ptr noalias %src) { ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2 ; CHECK-NEXT: store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], -; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <2 x i32> [[TMP2]], -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP2]], ; CHECK-NEXT: store <2 x i32> [[TMP5]], ptr [[INCDEC_PTR3]], align 4 ; CHECK-NEXT: ret void ; @@ -225,9 +223,7 @@ define void @addsub1(ptr noalias %dst, ptr noalias %src) { ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 
x i32>, ptr [[SRC]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = add nsw <2 x i32> [[TMP0]], splat (i32 -1) -; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <2 x i32> [[TMP0]], splat (i32 -1) -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP0]], ; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll b/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll index c250029519590..9b6511d0d8284 100644 --- a/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll +++ b/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll @@ -1,18 +1,29 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %} -; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %} +; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s --check-prefix=X86 %} +; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefix=AARCH64 %} define <2 x i32> @test(i32 %arg) { -; CHECK-LABEL: define <2 x i32> @test( -; CHECK-SAME: i32 [[ARG:%.*]]) { -; CHECK-NEXT: bb: -; CHECK-NEXT: [[OR:%.*]] = or i32 [[ARG]], 0 -; CHECK-NEXT: [[MUL:%.*]] = mul i32 0, 1 -; CHECK-NEXT: [[MUL1:%.*]] = mul i32 [[OR]], [[MUL]] -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 0, [[MUL1]] -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[OR]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> 
[[TMP0]], i32 [[MUL]], i32 1 -; CHECK-NEXT: ret <2 x i32> [[TMP1]] +; X86-LABEL: define <2 x i32> @test( +; X86-SAME: i32 [[ARG:%.*]]) { +; X86-NEXT: bb: +; X86-NEXT: [[OR:%.*]] = or i32 [[ARG]], 0 +; X86-NEXT: [[MUL:%.*]] = mul i32 0, 1 +; X86-NEXT: [[MUL1:%.*]] = mul i32 [[OR]], [[MUL]] +; X86-NEXT: [[CMP:%.*]] = icmp ugt i32 0, [[MUL1]] +; X86-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[OR]], i32 0 +; X86-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[MUL]], i32 1 +; X86-NEXT: ret <2 x i32> [[TMP1]] +; +; AARCH64-LABEL: define <2 x i32> @test( +; AARCH64-SAME: i32 [[ARG:%.*]]) { +; AARCH64-NEXT: bb: +; AARCH64-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[ARG]], i32 0 +; AARCH64-NEXT: [[TMP1:%.*]] = mul <2 x i32> [[TMP0]], zeroinitializer +; AARCH64-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; AARCH64-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; AARCH64-NEXT: [[MUL1:%.*]] = mul i32 [[TMP2]], [[TMP3]] +; AARCH64-NEXT: [[CMP:%.*]] = icmp ugt i32 0, [[MUL1]] +; AARCH64-NEXT: ret <2 x i32> [[TMP1]] ; bb: %or = or i32 %arg, 0 diff --git a/llvm/test/Transforms/SLPVectorizer/bbi-106161.ll b/llvm/test/Transforms/SLPVectorizer/bbi-106161.ll new file mode 100644 index 0000000000000..29339c8a8aac8 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/bbi-106161.ll @@ -0,0 +1,19 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=slp-vectorizer -S %s | FileCheck %s + +define i128 @f_768_3162(i16 %0) { +; CHECK-LABEL: @f_768_3162( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SHL22:%.*]] = shl i16 0, -1 +; CHECK-NEXT: [[MUL23:%.*]] = mul i16 0, [[TMP0:%.*]] +; CHECK-NEXT: [[MUL24:%.*]] = mul i16 [[SHL22]], [[MUL23]] +; CHECK-NEXT: store i16 [[MUL24]], ptr null, align 1 +; CHECK-NEXT: ret i128 0 +; +entry: + %shl22 = shl i16 0, -1 + %mul23 = mul i16 0, %0 + %mul24 = mul i16 %shl22, %mul23 + store i16 %mul24, ptr null, align 1 + ret i128 0 +} diff --git 
a/llvm/test/Transforms/SLPVectorizer/isOpcodeOrAlt.ll b/llvm/test/Transforms/SLPVectorizer/isOpcodeOrAlt.ll new file mode 100644 index 0000000000000..414997fe8e6f7 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/isOpcodeOrAlt.ll @@ -0,0 +1,61 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=slp-vectorizer -S -slp-max-reg-size=1024 %s | FileCheck %s + +define void @test1(ptr %a, ptr %b) { +; CHECK-LABEL: @test1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 0 +; CHECK-NEXT: [[GEP4:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 0 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[GEP0]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = shl <4 x i32> [[TMP0]], +; CHECK-NEXT: store <4 x i32> [[TMP1]], ptr [[GEP4]], align 4 +; CHECK-NEXT: ret void +; +entry: + %gep0 = getelementptr inbounds i32, ptr %a, i64 0 + %gep1 = getelementptr inbounds i32, ptr %a, i64 1 + %gep2 = getelementptr inbounds i32, ptr %a, i64 2 + %gep3 = getelementptr inbounds i32, ptr %a, i64 3 + %0 = load i32, ptr %gep0, align 4 + %1 = load i32, ptr %gep1, align 4 + %2 = load i32, ptr %gep2, align 4 + %3 = load i32, ptr %gep3, align 4 + %op0 = shl i32 %0, 1 + %op1 = add i32 %1, zeroinitializer + %op2 = mul i32 %2, 2 + %op3 = shl i32 %3, zeroinitializer + %gep4 = getelementptr inbounds i32, ptr %b, i64 0 + %gep5 = getelementptr inbounds i32, ptr %b, i64 1 + %gep6 = getelementptr inbounds i32, ptr %b, i64 2 + %gep7 = getelementptr inbounds i32, ptr %b, i64 3 + store i32 %op0, ptr %gep4, align 4 + store i32 %op1, ptr %gep5, align 4 + store i32 %op2, ptr %gep6, align 4 + store i32 %op3, ptr %gep7, align 4 + ret void +} + +define void @test2(i64 %_xstride) { +; CHECK-LABEL: @test2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MUL3:%.*]] = mul i64 [[_XSTRIDE:%.*]], 1 +; CHECK-NEXT: [[MUL5:%.*]] = mul i64 0, 0 +; CHECK-NEXT: [[MUL9:%.*]] = sub i64 0, [[_XSTRIDE]] +; CHECK-NEXT: [[MUL12:%.*]] = shl i64 
[[_XSTRIDE]], 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr i8, ptr null, i64 [[MUL3]] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr i8, ptr null, i64 [[MUL5]] +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr i8, ptr null, i64 [[MUL9]] +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i8, ptr null, i64 [[MUL12]] +; CHECK-NEXT: ret void +; +entry: + %mul3 = mul i64 %_xstride, 1 + %mul5 = mul i64 0, 0 + %mul9 = sub i64 0, %_xstride + %mul12 = shl i64 %_xstride, 1 + %arrayidx = getelementptr i8, ptr null, i64 %mul3 + %arrayidx6 = getelementptr i8, ptr null, i64 %mul5 + %arrayidx10 = getelementptr i8, ptr null, i64 %mul9 + %arrayidx13 = getelementptr i8, ptr null, i64 %mul12 + ret void +} diff --git a/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll b/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll index 056b6222cae72..caca410f056c1 100644 --- a/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll +++ b/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll @@ -6,11 +6,9 @@ define void @func(i32 %0) { ; CHECK-SAME: i32 [[TMP0:%.*]]) { ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> , i32 [[TMP0]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = shl <4 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = or <4 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = shl i32 [[TMP0]], 0 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <32 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <32 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP6]] to i64 ; CHECK-NEXT: [[TMP10:%.*]] = or i64 [[TMP9]], 0 ; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP9]] to i32 diff --git a/llvm/test/Transforms/SLPVectorizer/shuffle-mask-resized.ll 
b/llvm/test/Transforms/SLPVectorizer/shuffle-mask-resized.ll index 732b50396a460..cf5927bf58327 100644 --- a/llvm/test/Transforms/SLPVectorizer/shuffle-mask-resized.ll +++ b/llvm/test/Transforms/SLPVectorizer/shuffle-mask-resized.ll @@ -12,9 +12,7 @@ define i32 @test() { ; CHECK-NEXT: br i1 false, label [[BB4:%.*]], label [[BB3]] ; CHECK: bb3: ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> , <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> zeroinitializer, [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = or <2 x i32> zeroinitializer, [[TMP2]] -; CHECK-NEXT: [[TMP5]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> +; CHECK-NEXT: [[TMP5]] = add <2 x i32> zeroinitializer, [[TMP2]] ; CHECK-NEXT: br label [[BB1]] ; CHECK: bb4: ; CHECK-NEXT: [[TMP6:%.*]] = phi <8 x i32> [ [[TMP1]], [[BB1]] ] From c8121b99a99fe1785add732aa062039b7c5fdd32 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 15 Apr 2025 09:10:04 -0700 Subject: [PATCH 007/710] [RISCV] Xqcilb: remove RISCVMCExpr::VK_QC_E_JUMP_PLT and drop `@plt` parsing Follow-up to the just landed #135044 . Remove `@plt` parsing (only needed by legacy `call foo@plt`). MCParser's `@` parsing is problematic. Supporting target variations like (`foo+2@plt foo@plt+2 (foo+2)@plt`) involves messy hacks. We should refrain from adding new `@` uses. Remove unneeded `RISCVMCExpr::VK_QC_E_JUMP_PLT` (should only be used when an instruction might have multiple reasonable relocations https://maskray.me/blog/2025-03-16-relocation-generation-in-assemblers). --- GCC's initial initial RISC-V port made a mistake by having both `call foo` (non-PIC) and `call foo@plt` (PIC), likely misled by x86/SystemZ. It was determined that the `@plt` was not needed. Since R_RISCV_CALL had questionable undefined weak semantics in GNU ld (which has been removed then), we kept R_RISCV_CALL_PLT and deprecated R_RISCV_CALL. 
For RISC-V instructions, we only keep `@` in call/jump for backward compatibility and discourage it for all other instructions. ( There is disagreement about whether `PLT` in `JUMP_PLT` is useful or misleading. MaskRay's opnion: For new branch relocations with procedure call semantics, use `_CALL` and avoid `_PLT` in the relocation name. `_PLT` should only be used in data directives (e.g. R_RISCV_PLT32) to indicate that the address of a function is not significant. ) Pull Request: https://github.com/llvm/llvm-project/pull/135507 --- .../Target/RISCV/AsmParser/RISCVAsmParser.cpp | 32 ++-------------- .../RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp | 34 +---------------- .../Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp | 5 +-- .../Target/RISCV/MCTargetDesc/RISCVMCExpr.h | 1 - llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td | 38 ++++--------------- llvm/test/MC/RISCV/xqcilb-invalid.s | 6 ++- llvm/test/MC/RISCV/xqcilb-relocations.s | 10 ----- 7 files changed, 18 insertions(+), 108 deletions(-) diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 2f8e002f291db..013944787ff2d 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -587,17 +587,6 @@ struct RISCVOperand final : public MCParsedAsmOperand { (VK == RISCVMCExpr::VK_CALL || VK == RISCVMCExpr::VK_CALL_PLT); } - bool isPseudoQCJumpSymbol() const { - int64_t Imm; - // Must be of 'immediate' type but not a constant. - if (!isImm() || evaluateConstantImm(getImm(), Imm)) - return false; - - RISCVMCExpr::Specifier VK = RISCVMCExpr::VK_None; - return RISCVAsmParser::classifySymbolRef(getImm(), VK) && - VK == RISCVMCExpr::VK_QC_E_JUMP_PLT; - } - bool isPseudoJumpSymbol() const { int64_t Imm; // Must be of 'immediate' type but not a constant. 
@@ -1598,11 +1587,11 @@ bool RISCVAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return generateImmOutOfRangeError(Operands, ErrorInfo, std::numeric_limits::min(), std::numeric_limits::max()); - case Match_InvalidSImm32Lsb0: + case Match_InvalidBareSImm32Lsb0: return generateImmOutOfRangeError( Operands, ErrorInfo, std::numeric_limits::min(), std::numeric_limits::max() - 1, - "operand must be a multiple of 2 bytes in the range "); + "operand must be a multiple of 2 bytes in the range"); case Match_InvalidRnumArg: { return generateImmOutOfRangeError(Operands, ErrorInfo, 0, 10); } @@ -2151,25 +2140,10 @@ ParseStatus RISCVAsmParser::parsePseudoQCJumpSymbol(OperandVector &Operands) { std::string Identifier(getTok().getIdentifier()); SMLoc E = getTok().getEndLoc(); - - if (getLexer().peekTok().is(AsmToken::At)) { - Lex(); - Lex(); - SMLoc PLTLoc = getLoc(); - StringRef PLT; - E = getTok().getEndLoc(); - if (getParser().parseIdentifier(PLT) || PLT != "plt") - return Error(PLTLoc, - "'@plt' is the only valid operand for this instruction"); - } else { - Lex(); - } - - RISCVMCExpr::Specifier Kind = RISCVMCExpr::VK_QC_E_JUMP_PLT; + Lex(); MCSymbol *Sym = getContext().getOrCreateSymbol(Identifier); Res = MCSymbolRefExpr::create(Sym, getContext()); - Res = RISCVMCExpr::create(Res, Kind, getContext()); Operands.push_back(RISCVOperand::createImm(Res, S, E, isRV64())); return ParseStatus::Success; } diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp index f324907d49fd9..6283f1d120aaa 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp @@ -56,10 +56,6 @@ class RISCVMCCodeEmitter : public MCCodeEmitter { SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const; - void expandQCJump(const MCInst &MI, SmallVectorImpl &CB, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - void 
expandTLSDESCCall(const MCInst &MI, SmallVectorImpl &CB, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const; @@ -173,26 +169,6 @@ void RISCVMCCodeEmitter::expandFunctionCall(const MCInst &MI, support::endian::write(CB, Binary, llvm::endianness::little); } -void RISCVMCCodeEmitter::expandQCJump(const MCInst &MI, - SmallVectorImpl &CB, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - MCOperand Func = MI.getOperand(0); - assert(Func.isExpr() && "Expected expression"); - - auto Opcode = - (MI.getOpcode() == RISCV::PseudoQC_E_J) ? RISCV::QC_E_J : RISCV::QC_E_JAL; - MCInst Jump = MCInstBuilder(Opcode).addExpr(Func.getExpr()); - - uint64_t Bits = getBinaryCodeForInstr(Jump, Fixups, STI) & 0xffff'ffff'ffffu; - SmallVector Encoding; - support::endian::write(Encoding, Bits, llvm::endianness::little); - assert(Encoding[6] == 0 && Encoding[7] == 0 && - "Unexpected encoding for 48-bit instruction"); - Encoding.truncate(6); - CB.append(Encoding); -} - void RISCVMCCodeEmitter::expandTLSDESCCall(const MCInst &MI, SmallVectorImpl &CB, SmallVectorImpl &Fixups, @@ -464,11 +440,6 @@ void RISCVMCCodeEmitter::encodeInstruction(const MCInst &MI, expandTLSDESCCall(MI, CB, Fixups, STI); MCNumEmitted += 1; return; - case RISCV::PseudoQC_E_J: - case RISCV::PseudoQC_E_JAL: - expandQCJump(MI, CB, Fixups, STI); - MCNumEmitted += 1; - return; } switch (Size) { @@ -685,9 +656,6 @@ uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo, case RISCVMCExpr::VK_QC_ABS20: FixupKind = RISCV::fixup_riscv_qc_abs20_u; break; - case RISCVMCExpr::VK_QC_E_JUMP_PLT: - FixupKind = RISCV::fixup_riscv_qc_e_jump_plt; - break; } } else if (Kind == MCExpr::SymbolRef || Kind == MCExpr::Binary) { // FIXME: Sub kind binary exprs have chance of underflow. 
@@ -705,6 +673,8 @@ uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo, FixupKind = RISCV::fixup_riscv_qc_e_branch; } else if (MIFrm == RISCVII::InstFormatQC_EAI) { FixupKind = RISCV::fixup_riscv_qc_e_32; + } else if (MIFrm == RISCVII::InstFormatQC_EJ) { + FixupKind = RISCV::fixup_riscv_qc_e_jump_plt; } } diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp index 99f72620f97ed..d6650e156c8b3 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp @@ -34,8 +34,7 @@ const RISCVMCExpr *RISCVMCExpr::create(const MCExpr *Expr, Specifier S, void RISCVMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const { Specifier S = getSpecifier(); - bool HasVariant = ((S != VK_None) && (S != VK_CALL) && (S != VK_CALL_PLT) && - (S != VK_QC_E_JUMP_PLT)); + bool HasVariant = ((S != VK_None) && (S != VK_CALL) && (S != VK_CALL_PLT)); if (HasVariant) OS << '%' << getSpecifierName(S) << '('; @@ -168,8 +167,6 @@ StringRef RISCVMCExpr::getSpecifierName(Specifier S) { return "pltpcrel"; case VK_QC_ABS20: return "qc.abs20"; - case VK_QC_E_JUMP_PLT: - return "qc_e_jump_plt"; } llvm_unreachable("Invalid ELF symbol kind"); } diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h index d60879d34dc17..e0aa7ff244521 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h @@ -44,7 +44,6 @@ class RISCVMCExpr : public MCTargetExpr { VK_TLSDESC_ADD_LO, VK_TLSDESC_CALL, VK_QC_ABS20, - VK_QC_E_JUMP_PLT }; private: diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index 4ac17c8283866..8eaa5e394a91c 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -139,32 +139,20 @@ def bare_simm32 : RISCVOp { } // A 32-bit signed 
immediate where the least significant bit is zero. -def simm32_lsb0 : Operand { - let ParserMatchClass = SImmAsmOperand<32, "Lsb0">; +def bare_simm32_lsb0 : Operand { + let ParserMatchClass = BareSImmNLsb0AsmOperand<32>; let PrintMethod = "printBranchOperand"; let EncoderMethod = "getImmOpValueAsr1"; let DecoderMethod = "decodeSImmOperandAndLsl1<32>"; let MCOperandPredicate = [{ int64_t Imm; - if (!MCOp.evaluateAsConstantImm(Imm)) - return false; - return isShiftedInt<31, 1>(Imm); + if (MCOp.evaluateAsConstantImm(Imm)) + return isShiftedInt<31, 1>(Imm); + return MCOp.isBareSymbolRef(); }]; let OperandType = "OPERAND_PCREL"; } -def PseudoQCJumpSymbol : AsmOperandClass { - let Name = "PseudoQCJumpSymbol"; - let RenderMethod = "addImmOperands"; - let DiagnosticType = "InvalidPseudoQCJumpSymbol"; - let DiagnosticString = "operand must be a valid jump target"; - let ParserMethod = "parsePseudoQCJumpSymbol"; -} - -def pseudo_qc_jump_symbol : Operand { - let ParserMatchClass = PseudoQCJumpSymbol; -} - //===----------------------------------------------------------------------===// // Instruction Formats //===----------------------------------------------------------------------===// @@ -313,7 +301,7 @@ def InsnQC_EJ : DirectiveInsnQC_EJ<(outs), uimm3:$func3, uimm2:$func2, uimm5:$func5, - simm32_lsb0:$imm31), + bare_simm32_lsb0:$imm31), "$opcode, $func3, $func2, $func5, $imm31">; def InsnQC_ES : DirectiveInsnQC_ES<(outs), (ins uimm7_opcode:$opcode, @@ -365,7 +353,7 @@ def : InstAlias<".insn_qc.ej $opcode, $func3, $func2, $func5, $imm31", uimm3:$func3, uimm2:$func2, uimm5:$func5, - simm32_lsb0:$imm31)>; + bare_simm32_lsb0:$imm31)>; def : InstAlias<".insn_qc.es $opcode, $func3, $func2, $rs2, ${imm26}(${rs1})", (InsnQC_ES uimm7_opcode:$opcode, uimm3:$func3, @@ -719,7 +707,7 @@ class QCIRVInstEI funct3, bits<2> funct2, string opcodestr> let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in class QCIRVInst48EJ func2, string opcodestr> - : RVInst48<(outs), (ins 
simm32_lsb0:$imm31), + : RVInst48<(outs), (ins bare_simm32_lsb0:$imm31), opcodestr, "$imm31", [], InstFormatQC_EJ> { bits<31> imm31; @@ -1231,16 +1219,6 @@ def PseudoQC_E_SH : PseudoStore<"qc.e.sh">; def PseudoQC_E_SW : PseudoStore<"qc.e.sw">; } // Predicates = [HasVendorXqcilo, IsRV32] -let isCall = 0, isBarrier = 1, isTerminator = 1, - isCodeGenOnly = 0, hasSideEffects = 0, mayStore = 0, mayLoad = 0 in -def PseudoQC_E_J : Pseudo<(outs), (ins pseudo_qc_jump_symbol:$func), [], - "qc.e.j", "$func">; - -let isCall = 1, Defs = [X1], isCodeGenOnly = 0, hasSideEffects = 0, - mayStore = 0, mayLoad = 0 in -def PseudoQC_E_JAL: Pseudo<(outs), (ins pseudo_qc_jump_symbol:$func), [], - "qc.e.jal", "$func">; - //===----------------------------------------------------------------------===// // Code Gen Patterns //===----------------------------------------------------------------------===// diff --git a/llvm/test/MC/RISCV/xqcilb-invalid.s b/llvm/test/MC/RISCV/xqcilb-invalid.s index 10d456c8ac0aa..1c584da890dd3 100644 --- a/llvm/test/MC/RISCV/xqcilb-invalid.s +++ b/llvm/test/MC/RISCV/xqcilb-invalid.s @@ -13,6 +13,8 @@ qc.e.j -2147483649 # CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcilb' (Qualcomm uC Long Branch Extension) qc.e.j -2147483648 +# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcilb' (Qualcomm uC Long Branch Extension) +qc.e.j foo # CHECK: :[[@LINE+1]]:1: error: too few operands for instruction qc.e.jal @@ -23,5 +25,5 @@ qc.e.jal 2147483649 # CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcilb' (Qualcomm uC Long Branch Extension) qc.e.jal 2147483640 -# CHECK: :[[@LINE+1]]:12: error: '@plt' is the only valid operand for this instruction -qc.e.j foo@rlt +# CHECK: :[[@LINE+1]]:11: error: unexpected token +qc.e.j foo@plt diff --git a/llvm/test/MC/RISCV/xqcilb-relocations.s b/llvm/test/MC/RISCV/xqcilb-relocations.s index a475cde3f6bfd..92e543d94f6b6 100644 --- 
a/llvm/test/MC/RISCV/xqcilb-relocations.s +++ b/llvm/test/MC/RISCV/xqcilb-relocations.s @@ -15,16 +15,6 @@ qc.e.j foo # INSTR: qc.e.j foo # FIXUP: fixup A - offset: 0, value: foo, kind: fixup_riscv_qc_e_jump_plt -qc.e.j foo@plt -# RELOC: R_RISCV_CUSTOM195 foo 0x0 -# INSTR: qc.e.j foo -# FIXUP: fixup A - offset: 0, value: foo, kind: fixup_riscv_qc_e_jump_plt - -qc.e.jal foo@plt -# RELOC: R_RISCV_CUSTOM195 foo 0x0 -# INSTR: qc.e.jal foo -# FIXUP: fixup A - offset: 0, value: foo, kind: fixup_riscv_qc_e_jump_plt - qc.e.jal foo # RELOC: R_RISCV_CUSTOM195 foo 0x0 # INSTR: qc.e.jal foo From 98534eee847dfa80ff88b213a628a6149f8754d9 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 15 Apr 2025 09:10:19 -0700 Subject: [PATCH 008/710] [mlir] Migrate away from PointerUnion::dyn_cast (NFC) (#135770) Note that PointerUnion::dyn_cast has been soft deprecated in PointerUnion.h: // FIXME: Replace the uses of is(), get() and dyn_cast() with // isa, cast and the llvm::dyn_cast Literal migration would result in dyn_cast_if_present (see the definition of PointerUnion::dyn_cast), but this patch uses dyn_cast because we have a call to dyn_cast earlier in the function, implying that attrOrProp is nonnull. 
--- mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp index 2431807ce463d..edcb2f507ae50 100644 --- a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp +++ b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp @@ -1828,7 +1828,7 @@ void OpEmitter::genPropertiesSupportForBytecode( name, tgfmt(namedProperty->prop.getWriteToMlirBytecodeCall(), &fctx)); continue; } - const auto *namedAttr = attrOrProp.dyn_cast(); + const auto *namedAttr = dyn_cast(attrOrProp); StringRef name = namedAttr->attrName; if (namedAttr->isRequired) { readPropertiesMethod << formatv(R"( From ece10a64cb180ba931b60cbd939d80412973eada Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Tue, 15 Apr 2025 12:21:04 -0400 Subject: [PATCH 009/710] Allow some attributes on declarations after definitions (#135791) The deprecated, maybe_unused, and nodiscard standard attributes may all be applied to a redeclaration after a definition has already appeared. We were previously dropping the attribute in that case, now we retain the attribute after the redeclaration. Note: someday we may want to tablegen this as part of information from Attr.td. We may also want to relax the restriction here so that the syntax used does not matter. This is an intentionally conservative fix. 
Fixes #135481 --- clang/docs/ReleaseNotes.rst | 5 +++ clang/lib/Sema/SemaDecl.cpp | 15 +++++++ .../Sema/attr-decl-after-definition-std.c | 41 +++++++++++++++++++ 3 files changed, 61 insertions(+) create mode 100644 clang/test/Sema/attr-decl-after-definition-std.c diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index c106148855436..e63de32a0b2aa 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -439,6 +439,11 @@ Bug Fixes to Attribute Support - No longer crashing on ``__attribute__((align_value(N)))`` during template instantiation when the function parameter type is not a pointer or reference. (#GH26612) +- Now allowing the ``[[deprecated]]``, ``[[maybe_unused]]``, and + ``[[nodiscard]]`` to be applied to a redeclaration after a definition in both + C and C++ mode for the standard spellings (other spellings, such as + ``__attribute__((unused))`` are still ignored after the definition, though + this behavior may be relaxed in the future). (#GH135481) Bug Fixes to C++ Support ^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index e9805c345b6af..240ce5391af81 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -2996,6 +2996,21 @@ static void checkNewAttributesAfterDef(Sema &S, Decl *New, const Decl *Old) { // msvc will allow a subsequent definition to add an uuid to a class ++I; continue; + } else if (isa( + NewAttribute) && + NewAttribute->isStandardAttributeSyntax()) { + // C++14 [dcl.attr.deprecated]p3: A name or entity declared without the + // deprecated attribute can later be re-declared with the attribute and + // vice-versa. + // C++17 [dcl.attr.unused]p4: A name or entity declared without the + // maybe_unused attribute can later be redeclared with the attribute and + // vice versa. 
+ // C++20 [dcl.attr.nodiscard]p2: A name or entity declared without the + // nodiscard attribute can later be redeclared with the attribute and + // vice-versa. + // C23 6.7.13.3p3, 6.7.13.4p3. and 6.7.13.5p5 give the same allowances. + ++I; + continue; } else if (const AlignedAttr *AA = dyn_cast(NewAttribute)) { if (AA->isAlignas()) { // C++11 [dcl.align]p6: diff --git a/clang/test/Sema/attr-decl-after-definition-std.c b/clang/test/Sema/attr-decl-after-definition-std.c new file mode 100644 index 0000000000000..bab52b4dd97ad --- /dev/null +++ b/clang/test/Sema/attr-decl-after-definition-std.c @@ -0,0 +1,41 @@ +// RUN: %clang_cc1 -fsyntax-only -Wignored-attributes -verify -std=c23 %s +// RUN: %clang_cc1 -fsyntax-only -Wignored-attributes -verify -x c++ %s +// RUN: %clang_cc1 -fsyntax-only -ast-dump %s | FileCheck %s + +inline int frob(int x) { return x; } + +[[deprecated]] int frob(int); // expected-note 2 {{'frob' has been explicitly marked deprecated here}} + +void use1() { + // Using this should give a deprecation warning, but not a nodiscard warning. + frob(0); // expected-warning {{'frob' is deprecated}} +} + +[[nodiscard]] int frob(int); + +void use2() { + // This should give both warnings. + frob(0); // expected-warning {{'frob' is deprecated}} \ + expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} +} + +[[maybe_unused]] int frob(int); + +// Currently, this is only allowed for the standard spelling of the attributes. 
+void blob() {} // expected-note {{previous definition is here}} +__attribute__((deprecated)) void blob(); // expected-warning {{attribute declaration must precede definition}} + +// CHECK: FunctionDecl {{.*}} frob + +// CHECK: FunctionDecl {{.*}} prev {{.*}} frob +// CHECK: DeprecatedAttr + +// CHECK: FunctionDecl {{.*}} prev {{.*}} frob +// CHECK: DeprecatedAttr +// CHECK: WarnUnusedResultAttr + +// CHECK: FunctionDecl {{.*}} prev {{.*}} frob +// CHECK: DeprecatedAttr +// CHECK: WarnUnusedResultAttr +// CHECK: UnusedAttr + From a9553990fb6de8b4d99d05c95fe949deef6357e1 Mon Sep 17 00:00:00 2001 From: alx32 <103613512+alx32@users.noreply.github.com> Date: Tue, 15 Apr 2025 09:31:03 -0700 Subject: [PATCH 010/710] [DWARFLinker] Update `stmt-seq-macho.test` to use `update_test_body.py` (#133363) In this change we update DWARFLinker test `llvm/test/tools/dsymutil/ARM/stmt-seq-macho.test` to be self-contained and easy to regenerate via `update_test_body.py`. As relating to [this comment](https://github.com/llvm/llvm-project/pull/132875/files#r2012471834) - this would be approach nr.2 Updating the test can be done via: ``` PATH=/path/to/llvm/bin:$PATH llvm/utils/update_test_body.py llvm/test/tools/dsymutil/ARM/stmt-seq-macho.test ``` --- .../tools/dsymutil/ARM/stmt-seq-macho.test | 1862 ++++++++++++++++- .../private/tmp/stmt_seq/stmt_seq_macho.exe | Bin 17360 -> 0 bytes .../private/tmp/stmt_seq/stmt_seq_macho.o | Bin 4704 -> 0 bytes 3 files changed, 1842 insertions(+), 20 deletions(-) delete mode 100755 llvm/test/tools/dsymutil/Inputs/private/tmp/stmt_seq/stmt_seq_macho.exe delete mode 100644 llvm/test/tools/dsymutil/Inputs/private/tmp/stmt_seq/stmt_seq_macho.o diff --git a/llvm/test/tools/dsymutil/ARM/stmt-seq-macho.test b/llvm/test/tools/dsymutil/ARM/stmt-seq-macho.test index 1dd1f61f1f7fb..1e08bc07e14cb 100644 --- a/llvm/test/tools/dsymutil/ARM/stmt-seq-macho.test +++ b/llvm/test/tools/dsymutil/ARM/stmt-seq-macho.test @@ -1,5 +1,10 @@ -RUN: dsymutil --flat 
-oso-prepend-path %p/../Inputs %p/../Inputs/private/tmp/stmt_seq/stmt_seq_macho.exe -o %t.stmt_seq_macho.dSYM -RUN: llvm-dwarfdump --debug-info --debug-line -v %t.stmt_seq_macho.dSYM | sort | FileCheck %s -check-prefix=CHECK_DSYM +## Test that verifies DW_AT_LLVM_stmt_sequence attributes are correctly patched in the dSYM + +# RUN: rm -rf %t && split-file %s %t && cd %t +# RUN: yaml2obj %t/stmt_seq_macho.exe.yaml -o %t/stmt_seq_macho.exe +# RUN: yaml2obj %t/stmt_seq_macho.o.yaml -o %t/stmt_seq_macho.o +# RUN: dsymutil --flat --verify-dwarf=none -oso-prepend-path %t %t/stmt_seq_macho.exe -o %t/stmt_seq_macho.dSYM +# RUN: llvm-dwarfdump --debug-info --debug-line -v %t/stmt_seq_macho.dSYM | sort | FileCheck %s -check-prefix=CHECK_DSYM # CHECK_DSYM: DW_AT_LLVM_stmt_sequence [DW_FORM_sec_offset] ([[OFFSET1:(0x[0-9a-f]+)]]) # CHECK_DSYM: DW_AT_LLVM_stmt_sequence [DW_FORM_sec_offset] ([[OFFSET2:(0x[0-9a-f]+)]]) @@ -11,27 +16,18 @@ RUN: llvm-dwarfdump --debug-info --debug-line -v %t.stmt_seq_macho.dSYM | sort | # CHECK_DSYM: [[OFFSET3]]: 00 DW_LNE_set_address # CHECK_DSYM: [[OFFSET4]]: 00 DW_LNE_set_address - -######## Generate stmt_seq_macho.exe & stmt_seq_macho.o via script: ########## -# ------------------------------------------------------------------------------ -#!/bin/bash -TOOLCHAIN=/path/to/llvm/bin - -# ------------------------------------------------------------------------------ -# Create the stmt_seq_macho.cpp source file -# ------------------------------------------------------------------------------ -cat > stmt_seq_macho.cpp << 'EOF' +#--- stmt_seq_macho.cpp #define ATTRIB extern "C" __attribute__((noinline)) ATTRIB int function3_copy1(int a) { int b = a + 3; return b + 1; } - + ATTRIB int function2_copy1(int a) { return a - 22; } - + ATTRIB int function3_copy2(int a) { int b = a + 3; return b + 1; @@ -45,11 +41,11 @@ ATTRIB int function2_copy2(int a) { struct logic_error { logic_error(const char* s) {} }; - + struct length_error : public logic_error { 
__attribute__((noinline)) explicit length_error(const char* s) : logic_error(s) {} }; - + int main() { int sum = 0; sum += function2_copy2(3); @@ -58,11 +54,12 @@ int main() { length_error e("test"); return sum; } -EOF -"$TOOLCHAIN/clang" \ - --target=arm64-apple-macos11 \ +#--- gen +# Compile to an object file +clang --target=arm64-apple-macos11 \ -c \ + -fdebug-compilation-dir=/private/tmp/stmt_seq \ -g \ -gdwarf-4 \ -fno-unwind-tables \ @@ -73,11 +70,1836 @@ EOF stmt_seq_macho.cpp \ -o stmt_seq_macho.o -"$TOOLCHAIN/ld64.lld" \ +# Link into an executable +ld64.lld \ -arch arm64 \ -platform_version macos 11.0.0 11.0.0 \ -o stmt_seq_macho.exe \ stmt_seq_macho.o \ -dead_strip \ --icf=all \ + -oso_prefix $(pwd)/ \ --keep-icf-stabs + +# Convert executable to YAML for the test +echo "#--- stmt_seq_macho.o.yaml" +obj2yaml stmt_seq_macho.o +echo "" +echo "#--- stmt_seq_macho.exe.yaml" +obj2yaml stmt_seq_macho.exe + +#--- stmt-seq-macho.yaml +#--- stmt_seq_macho.o.yaml +--- !mach-o +FileHeader: + magic: 0xFEEDFACF + cputype: 0x100000C + cpusubtype: 0x0 + filetype: 0x1 + ncmds: 5 + sizeofcmds: 1176 + flags: 0x2000 + reserved: 0x0 +LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 1032 + segname: '' + vmaddr: 0 + vmsize: 2793 + fileoff: 1208 + filesize: 2793 + maxprot: 7 + initprot: 7 + nsects: 12 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 0x0 + size: 128 + offset: 0x4B8 + align: 2 + reloff: 0xFA8 + nreloc: 7 + flags: 0x80000400 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 00100011C0035FD600580051C0035FD600100011C0035FD600580051C0035FD6FFC300D1F44F01A9FD7B02A9FD8300916000805200000094F30300AA20058052000000941400130B6001805200000094F30300AA0100009021000091E03F0091000000948002130BFD7B42A9F44F41A9FFC30091C0035FD600000014C0035FD6 + relocations: + - address: 0x78 + symbolnum: 4 + pcrel: true + length: 2 + extern: true + type: 2 + scattered: false + value: 0 + - address: 0x60 + symbolnum: 3 + pcrel: true + length: 2 + extern: true + type: 
2 + scattered: false + value: 0 + - address: 0x58 + symbolnum: 1 + pcrel: false + length: 2 + extern: true + type: 4 + scattered: false + value: 0 + - address: 0x54 + symbolnum: 1 + pcrel: true + length: 2 + extern: true + type: 3 + scattered: false + value: 0 + - address: 0x4C + symbolnum: 5 + pcrel: true + length: 2 + extern: true + type: 2 + scattered: false + value: 0 + - address: 0x40 + symbolnum: 8 + pcrel: true + length: 2 + extern: true + type: 2 + scattered: false + value: 0 + - address: 0x34 + symbolnum: 6 + pcrel: true + length: 2 + extern: true + type: 2 + scattered: false + value: 0 + - sectname: __cstring + segname: __TEXT + addr: 0x80 + size: 5 + offset: 0x538 + align: 0 + reloff: 0x0 + nreloc: 0 + flags: 0x2 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: '7465737400' + - sectname: __debug_loc + segname: __DWARF + addr: 0x85 + size: 412 + offset: 0x53D + align: 0 + reloff: 0x0 + nreloc: 0 + flags: 0x2000000 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 00000000000000000400000000000000010050040000000000000008000000000000000400A301509F0000000000000000000000000000000000000000000000000400000000000000030070039F0000000000000000000000000000000008000000000000000C000000000000000100500C0000000000000010000000000000000400A301509F0000000000000000000000000000000010000000000000001400000000000000010050140000000000000018000000000000000400A301509F0000000000000000000000000000000010000000000000001400000000000000030070039F0000000000000000000000000000000018000000000000001C000000000000000100501C0000000000000020000000000000000400A301509F000000000000000000000000000000001C0000000000000020000000000000000100500000000000000000000000000000000030000000000000003C00000000000000030011009F3C0000000000000048000000000000000100634800000000000000540000000000000001006400000000000000000000000000000000 + - sectname: __debug_abbrev + segname: __DWARF + addr: 0x221 + size: 359 + offset: 0x6D9 + align: 0 + reloff: 0x0 + nreloc: 0 + flags: 0x2000000 + reserved1: 
0x0 + reserved2: 0x0 + reserved3: 0x0 + - sectname: __debug_info + segname: __DWARF + addr: 0x388 + size: 686 + offset: 0x840 + align: 0 + reloff: 0xFE0 + nreloc: 14 + flags: 0x2000000 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + relocations: + - address: 0x26A + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x251 + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x216 + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x1B8 + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x1A5 + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x191 + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x17E + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x140 + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x104 + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0xC8 + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x9B + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x5F + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x37 + symbolnum: 2 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x22 + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - sectname: __debug_str + segname: __DWARF + addr: 0x636 + size: 239 + offset: 0xAEE + align: 0 + reloff: 0x0 + 
nreloc: 0 + flags: 0x2000000 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + - sectname: __apple_names + segname: __DWARF + addr: 0x725 + size: 260 + offset: 0xBDD + align: 0 + reloff: 0x0 + nreloc: 0 + flags: 0x2000000 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 485341480100000008000000080000000C000000000000000100000001000600000000000200000005000000FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF90D9F86F88CB36CF4908311CD1125E5389CB36CF4A08311C522B70536A7F9A7C8000000094000000A4000000B4000000C4000000D4000000E4000000F40000008A0000000200000015020000690200000000000055000000010000009A0000000000000045000000010000005E00000000000000A3000000010000001502000000000000750000000100000003010000000000006500000001000000C700000000000000BB00000001000000690200000000000085000000010000003F01000000000000 + - sectname: __apple_objc + segname: __DWARF + addr: 0x829 + size: 36 + offset: 0xCE1 + align: 0 + reloff: 0x0 + nreloc: 0 + flags: 0x2000000 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 485341480100000001000000000000000C000000000000000100000001000600FFFFFFFF + - sectname: __apple_namespac + segname: __DWARF + addr: 0x84D + size: 36 + offset: 0xD05 + align: 0 + reloff: 0x0 + nreloc: 0 + flags: 0x2000000 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 485341480100000001000000000000000C000000000000000100000001000600FFFFFFFF + - sectname: __apple_types + segname: __DWARF + addr: 0x871 + size: 195 + offset: 0xD29 + align: 0 + reloff: 0x0 + nreloc: 0 + flags: 0x2000000 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 48534148010000000500000005000000140000000000000003000000010006000300050004000B000000000002000000FFFFFFFF03000000040000007CA8F05D90D9F86F5B738CDC3080880B6320957C64000000770000008A0000009D000000B00000009700000001000000EA010000130000000000008A00000001000000C80100001300000000000031000000010000005700000024000000000000D300000001000000A1020000240000000000002C000000010000005000000024000000000000 + - sectname: __debug_frame + 
segname: __DWARF + addr: 0x938 + size: 208 + offset: 0xDF0 + align: 3 + reloff: 0x1050 + nreloc: 7 + flags: 0x2000000 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 14000000FFFFFFFF0400080001781E0C1F00000000000000140000000000000000000000000000000800000000000000140000000000000008000000000000000800000000000000140000000000000010000000000000000800000000000000140000000000000018000000000000000800000000000000240000000000000020000000000000005800000000000000500C1D109E019D02930394040000000014000000000000007800000000000000040000000000000014000000000000007C000000000000000400000000000000 + relocations: + - address: 0xC0 + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0xA8 + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x80 + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x68 + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x50 + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x38 + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x20 + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - sectname: __debug_line + segname: __DWARF + addr: 0xA08 + size: 225 + offset: 0xEC0 + align: 0 + reloff: 0x1088 + nreloc: 7 + flags: 0x2000000 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + relocations: + - address: 0xD1 + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0xBD + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x92 + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 
0x7E + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x66 + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x50 + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x3A + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - cmd: LC_BUILD_VERSION + cmdsize: 24 + platform: 1 + minos: 720896 + sdk: 0 + ntools: 0 + - cmd: LC_LINKER_OPTIMIZATION_HINT + cmdsize: 16 + dataoff: 4288 + datasize: 8 + - cmd: LC_SYMTAB + cmdsize: 24 + symoff: 4296 + nsyms: 10 + stroff: 4456 + strsize: 144 + - cmd: LC_DYSYMTAB + cmdsize: 80 + ilocalsym: 0 + nlocalsym: 3 + iextdefsym: 3 + nextdefsym: 7 + iundefsym: 10 + nundefsym: 0 + tocoff: 0 + ntoc: 0 + modtaboff: 0 + nmodtab: 0 + extrefsymoff: 0 + nextrefsyms: 0 + indirectsymoff: 0 + nindirectsyms: 0 + extreloff: 0 + nextrel: 0 + locreloff: 0 + nlocrel: 0 +LinkEditData: + NameList: + - n_strx: 138 + n_type: 0xE + n_sect: 1 + n_desc: 0 + n_value: 0 + - n_strx: 1 + n_type: 0xE + n_sect: 2 + n_desc: 0 + n_value: 128 + - n_strx: 132 + n_type: 0xE + n_sect: 2 + n_desc: 0 + n_value: 128 + - n_strx: 39 + n_type: 0xF + n_sect: 1 + n_desc: 192 + n_value: 120 + - n_strx: 14 + n_type: 0xF + n_sect: 1 + n_desc: 192 + n_value: 124 + - n_strx: 115 + n_type: 0xF + n_sect: 1 + n_desc: 0 + n_value: 8 + - n_strx: 81 + n_type: 0xF + n_sect: 1 + n_desc: 0 + n_value: 24 + - n_strx: 98 + n_type: 0xF + n_sect: 1 + n_desc: 0 + n_value: 0 + - n_strx: 64 + n_type: 0xF + n_sect: 1 + n_desc: 0 + n_value: 16 + - n_strx: 8 + n_type: 0xF + n_sect: 1 + n_desc: 0 + n_value: 32 + StringTable: + - '' + - l_.str + - _main + - __ZN12length_errorC2EPKc + - __ZN12length_errorC1EPKc + - _function3_copy2 + - _function2_copy2 + - _function3_copy1 + - _function2_copy1 + - ltmp1 + - ltmp0 +DWARF: + debug_str: + - '' + - stmt_seq_macho.cpp + - '/' + - 
'/private/tmp/stmt_seq' + - char + - __ARRAY_SIZE_TYPE__ + - function3_copy1 + - function2_copy1 + - function3_copy2 + - function2_copy2 + - main + - length_error + - logic_error + - _ZN12length_errorC1EPKc + - _ZN12length_errorC2EPKc + - int + - a + - b + - result + - e + - sum + - this + - s + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_producer + Form: DW_FORM_strp + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_LLVM_sysroot + Form: DW_FORM_strp + - Attribute: DW_AT_stmt_list + Form: DW_FORM_sec_offset + - Attribute: DW_AT_comp_dir + Form: DW_FORM_strp + - Attribute: DW_AT_APPLE_optimized + Form: DW_FORM_flag_present + - Attribute: DW_AT_low_pc + Form: DW_FORM_addr + - Attribute: DW_AT_high_pc + Form: DW_FORM_data4 + - Code: 0x2 + Tag: DW_TAG_variable + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Attribute: DW_AT_decl_file + Form: DW_FORM_data1 + - Attribute: DW_AT_decl_line + Form: DW_FORM_data1 + - Attribute: DW_AT_location + Form: DW_FORM_exprloc + - Code: 0x3 + Tag: DW_TAG_array_type + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Code: 0x4 + Tag: DW_TAG_subrange_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Attribute: DW_AT_count + Form: DW_FORM_data1 + - Code: 0x5 + Tag: DW_TAG_const_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Code: 0x6 + Tag: DW_TAG_base_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_encoding + Form: DW_FORM_data1 + - Attribute: DW_AT_byte_size + Form: DW_FORM_data1 + - Code: 0x7 + Tag: DW_TAG_base_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_byte_size 
+ Form: DW_FORM_data1 + - Attribute: DW_AT_encoding + Form: DW_FORM_data1 + - Code: 0x8 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_low_pc + Form: DW_FORM_addr + - Attribute: DW_AT_high_pc + Form: DW_FORM_data4 + - Attribute: DW_AT_APPLE_omit_frame_ptr + Form: DW_FORM_flag_present + - Attribute: DW_AT_LLVM_stmt_sequence + Form: DW_FORM_sec_offset + - Attribute: DW_AT_frame_base + Form: DW_FORM_exprloc + - Attribute: DW_AT_call_all_calls + Form: DW_FORM_flag_present + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_decl_file + Form: DW_FORM_data1 + - Attribute: DW_AT_decl_line + Form: DW_FORM_data1 + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Attribute: DW_AT_external + Form: DW_FORM_flag_present + - Attribute: DW_AT_APPLE_optimized + Form: DW_FORM_flag_present + - Code: 0x9 + Tag: DW_TAG_formal_parameter + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_location + Form: DW_FORM_sec_offset + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_decl_file + Form: DW_FORM_data1 + - Attribute: DW_AT_decl_line + Form: DW_FORM_data1 + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Code: 0xA + Tag: DW_TAG_variable + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_location + Form: DW_FORM_sec_offset + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_decl_file + Form: DW_FORM_data1 + - Attribute: DW_AT_decl_line + Form: DW_FORM_data1 + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Code: 0xB + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_low_pc + Form: DW_FORM_addr + - Attribute: DW_AT_high_pc + Form: DW_FORM_data4 + - Attribute: DW_AT_LLVM_stmt_sequence + Form: DW_FORM_sec_offset + - Attribute: DW_AT_frame_base + Form: DW_FORM_exprloc + - Attribute: DW_AT_call_all_calls + Form: DW_FORM_flag_present + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_decl_file + Form: DW_FORM_data1 + - 
Attribute: DW_AT_decl_line + Form: DW_FORM_data1 + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Attribute: DW_AT_external + Form: DW_FORM_flag_present + - Attribute: DW_AT_APPLE_optimized + Form: DW_FORM_flag_present + - Code: 0xC + Tag: DW_TAG_variable + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_location + Form: DW_FORM_exprloc + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_decl_file + Form: DW_FORM_data1 + - Attribute: DW_AT_decl_line + Form: DW_FORM_data1 + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Code: 0xD + Tag: DW_TAG_call_site + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_call_origin + Form: DW_FORM_ref4 + - Attribute: DW_AT_call_return_pc + Form: DW_FORM_addr + - Code: 0xE + Tag: DW_TAG_call_site_parameter + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_location + Form: DW_FORM_exprloc + - Attribute: DW_AT_call_value + Form: DW_FORM_exprloc + - Code: 0xF + Tag: DW_TAG_structure_type + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_calling_convention + Form: DW_FORM_data1 + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_byte_size + Form: DW_FORM_data1 + - Attribute: DW_AT_decl_file + Form: DW_FORM_data1 + - Attribute: DW_AT_decl_line + Form: DW_FORM_data1 + - Code: 0x10 + Tag: DW_TAG_inheritance + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Attribute: DW_AT_data_member_location + Form: DW_FORM_data1 + - Code: 0x11 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_decl_file + Form: DW_FORM_data1 + - Attribute: DW_AT_decl_line + Form: DW_FORM_data1 + - Attribute: DW_AT_declaration + Form: DW_FORM_flag_present + - Attribute: DW_AT_external + Form: DW_FORM_flag_present + - Attribute: DW_AT_APPLE_optimized + Form: DW_FORM_flag_present + - Attribute: DW_AT_explicit + Form: DW_FORM_flag_present + - Code: 0x12 + Tag: 
DW_TAG_formal_parameter + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Attribute: DW_AT_artificial + Form: DW_FORM_flag_present + - Code: 0x13 + Tag: DW_TAG_formal_parameter + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Code: 0x14 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_decl_file + Form: DW_FORM_data1 + - Attribute: DW_AT_decl_line + Form: DW_FORM_data1 + - Attribute: DW_AT_declaration + Form: DW_FORM_flag_present + - Attribute: DW_AT_external + Form: DW_FORM_flag_present + - Attribute: DW_AT_APPLE_optimized + Form: DW_FORM_flag_present + - Code: 0x15 + Tag: DW_TAG_pointer_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Code: 0x16 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_low_pc + Form: DW_FORM_addr + - Attribute: DW_AT_high_pc + Form: DW_FORM_data4 + - Attribute: DW_AT_APPLE_omit_frame_ptr + Form: DW_FORM_flag_present + - Attribute: DW_AT_LLVM_stmt_sequence + Form: DW_FORM_sec_offset + - Attribute: DW_AT_frame_base + Form: DW_FORM_exprloc + - Attribute: DW_AT_object_pointer + Form: DW_FORM_ref4 + - Attribute: DW_AT_call_all_calls + Form: DW_FORM_flag_present + - Attribute: DW_AT_linkage_name + Form: DW_FORM_strp + - Attribute: DW_AT_specification + Form: DW_FORM_ref4 + - Code: 0x17 + Tag: DW_TAG_formal_parameter + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_location + Form: DW_FORM_exprloc + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Attribute: DW_AT_artificial + Form: DW_FORM_flag_present + - Code: 0x18 + Tag: DW_TAG_formal_parameter + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_location + Form: DW_FORM_exprloc + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_decl_file + Form: 
DW_FORM_data1 + - Attribute: DW_AT_decl_line + Form: DW_FORM_data1 + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Code: 0x19 + Tag: DW_TAG_call_site + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_call_origin + Form: DW_FORM_ref4 + - Attribute: DW_AT_call_tail_call + Form: DW_FORM_flag_present + - Attribute: DW_AT_call_pc + Form: DW_FORM_addr + debug_info: + - Length: 0x2AA + Version: 4 + AbbrevTableID: 0 + AbbrOffset: 0x0 + AddrSize: 8 + Entries: + - AbbrCode: 0x1 + Values: + - Value: 0x0 + - Value: 0x21 + - Value: 0x1 + - Value: 0x14 + - Value: 0x0 + - Value: 0x16 + - Value: 0x1 + - Value: 0x0 + - Value: 0x80 + - AbbrCode: 0x2 + Values: + - Value: 0x3F + - Value: 0x1 + - Value: 0x23 + - Value: 0x9 + BlockData: [ 0x3, 0x80, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0 ] + - AbbrCode: 0x3 + Values: + - Value: 0x4B + - AbbrCode: 0x4 + Values: + - Value: 0x57 + - Value: 0x5 + - AbbrCode: 0x0 + - AbbrCode: 0x5 + Values: + - Value: 0x50 + - AbbrCode: 0x6 + Values: + - Value: 0x2C + - Value: 0x6 + - Value: 0x1 + - AbbrCode: 0x7 + Values: + - Value: 0x31 + - Value: 0x8 + - Value: 0x7 + - AbbrCode: 0x8 + Values: + - Value: 0x0 + - Value: 0x8 + - Value: 0x1 + - Value: 0x34 + - Value: 0x1 + BlockData: [ 0x6F ] + - Value: 0x1 + - Value: 0x45 + - Value: 0x1 + - Value: 0x3 + - Value: 0x2A1 + - Value: 0x1 + - Value: 0x1 + - AbbrCode: 0x9 + Values: + - Value: 0x0 + - Value: 0xD7 + - Value: 0x1 + - Value: 0x3 + - Value: 0x2A1 + - AbbrCode: 0xA + Values: + - Value: 0x39 + - Value: 0xD9 + - Value: 0x1 + - Value: 0x4 + - Value: 0x2A1 + - AbbrCode: 0x0 + - AbbrCode: 0x8 + Values: + - Value: 0x8 + - Value: 0x8 + - Value: 0x1 + - Value: 0x4A + - Value: 0x1 + BlockData: [ 0x6F ] + - Value: 0x1 + - Value: 0x55 + - Value: 0x1 + - Value: 0x8 + - Value: 0x2A1 + - Value: 0x1 + - Value: 0x1 + - AbbrCode: 0x9 + Values: + - Value: 0x5E + - Value: 0xD7 + - Value: 0x1 + - Value: 0x8 + - Value: 0x2A1 + - AbbrCode: 0x0 + - AbbrCode: 0x8 + Values: + - Value: 0x10 + - Value: 0x8 + - Value: 
0x1 + - Value: 0x60 + - Value: 0x1 + BlockData: [ 0x6F ] + - Value: 0x1 + - Value: 0x65 + - Value: 0x1 + - Value: 0xC + - Value: 0x2A1 + - Value: 0x1 + - Value: 0x1 + - AbbrCode: 0x9 + Values: + - Value: 0x97 + - Value: 0xD7 + - Value: 0x1 + - Value: 0xC + - Value: 0x2A1 + - AbbrCode: 0xA + Values: + - Value: 0xD0 + - Value: 0xD9 + - Value: 0x1 + - Value: 0xD + - Value: 0x2A1 + - AbbrCode: 0x0 + - AbbrCode: 0x8 + Values: + - Value: 0x18 + - Value: 0x8 + - Value: 0x1 + - Value: 0x78 + - Value: 0x1 + BlockData: [ 0x6F ] + - Value: 0x1 + - Value: 0x75 + - Value: 0x1 + - Value: 0x11 + - Value: 0x2A1 + - Value: 0x1 + - Value: 0x1 + - AbbrCode: 0x9 + Values: + - Value: 0xF5 + - Value: 0xD7 + - Value: 0x1 + - Value: 0x11 + - Value: 0x2A1 + - AbbrCode: 0xA + Values: + - Value: 0x12E + - Value: 0xDB + - Value: 0x1 + - Value: 0x12 + - Value: 0x2A1 + - AbbrCode: 0x0 + - AbbrCode: 0xB + Values: + - Value: 0x20 + - Value: 0x58 + - Value: 0x8F + - Value: 0x1 + BlockData: [ 0x6D ] + - Value: 0x1 + - Value: 0x85 + - Value: 0x1 + - Value: 0x1E + - Value: 0x2A1 + - Value: 0x1 + - Value: 0x1 + - AbbrCode: 0xC + Values: + - Value: 0x2 + BlockData: [ 0x8F, 0xF ] + - Value: 0xE2 + - Value: 0x1 + - Value: 0x23 + - Value: 0x1C8 + - AbbrCode: 0xA + Values: + - Value: 0x151 + - Value: 0xE4 + - Value: 0x1 + - Value: 0x1F + - Value: 0x2A1 + - AbbrCode: 0xD + Values: + - Value: 0x103 + - Value: 0x38 + - AbbrCode: 0xE + Values: + - Value: 0x1 + BlockData: [ 0x50 ] + - Value: 0x1 + BlockData: [ 0x33 ] + - AbbrCode: 0x0 + - AbbrCode: 0xD + Values: + - Value: 0xC7 + - Value: 0x44 + - AbbrCode: 0xE + Values: + - Value: 0x1 + BlockData: [ 0x50 ] + - Value: 0x2 + BlockData: [ 0x10, 0x29 ] + - AbbrCode: 0x0 + - AbbrCode: 0xD + Values: + - Value: 0x9A + - Value: 0x50 + - AbbrCode: 0xE + Values: + - Value: 0x1 + BlockData: [ 0x50 ] + - Value: 0x1 + BlockData: [ 0x3B ] + - AbbrCode: 0x0 + - AbbrCode: 0xD + Values: + - Value: 0x215 + - Value: 0x64 + - AbbrCode: 0xE + Values: + - Value: 0x1 + BlockData: [ 
0x50 ] + - Value: 0x2 + BlockData: [ 0x8F, 0xF ] + - AbbrCode: 0x0 + - AbbrCode: 0x0 + - AbbrCode: 0xF + Values: + - Value: 0x5 + - Value: 0x8A + - Value: 0x1 + - Value: 0x1 + - Value: 0x1A + - AbbrCode: 0x10 + Values: + - Value: 0x1EA + - Value: 0x0 + - AbbrCode: 0x11 + Values: + - Value: 0x8A + - Value: 0x1 + - Value: 0x1B + - Value: 0x1 + - Value: 0x1 + - Value: 0x1 + - Value: 0x1 + - AbbrCode: 0x12 + Values: + - Value: 0x210 + - Value: 0x1 + - AbbrCode: 0x13 + Values: + - Value: 0x20B + - AbbrCode: 0x0 + - AbbrCode: 0x0 + - AbbrCode: 0xF + Values: + - Value: 0x5 + - Value: 0x97 + - Value: 0x1 + - Value: 0x1 + - Value: 0x16 + - AbbrCode: 0x14 + Values: + - Value: 0x97 + - Value: 0x1 + - Value: 0x17 + - Value: 0x1 + - Value: 0x1 + - Value: 0x1 + - AbbrCode: 0x12 + Values: + - Value: 0x206 + - Value: 0x1 + - AbbrCode: 0x13 + Values: + - Value: 0x20B + - AbbrCode: 0x0 + - AbbrCode: 0x0 + - AbbrCode: 0x15 + Values: + - Value: 0x1EA + - AbbrCode: 0x15 + Values: + - Value: 0x4B + - AbbrCode: 0x15 + Values: + - Value: 0x1C8 + - AbbrCode: 0x16 + Values: + - Value: 0x78 + - Value: 0x4 + - Value: 0x1 + - Value: 0xB7 + - Value: 0x1 + BlockData: [ 0x6F ] + - Value: 0x234 + - Value: 0x1 + - Value: 0xA3 + - Value: 0x1D7 + - AbbrCode: 0x17 + Values: + - Value: 0x1 + BlockData: [ 0x50 ] + - Value: 0xE8 + - Value: 0x2A8 + - Value: 0x1 + - AbbrCode: 0x18 + Values: + - Value: 0x1 + BlockData: [ 0x51 ] + - Value: 0xED + - Value: 0x1 + - Value: 0x1B + - Value: 0x20B + - AbbrCode: 0x19 + Values: + - Value: 0x269 + - Value: 0x1 + - Value: 0x78 + - AbbrCode: 0xE + Values: + - Value: 0x1 + BlockData: [ 0x50 ] + - Value: 0x3 + BlockData: [ 0xA3, 0x1, 0x50 ] + - AbbrCode: 0xE + Values: + - Value: 0x1 + BlockData: [ 0x51 ] + - Value: 0x3 + BlockData: [ 0xA3, 0x1, 0x51 ] + - AbbrCode: 0x0 + - AbbrCode: 0x0 + - AbbrCode: 0x16 + Values: + - Value: 0x7C + - Value: 0x4 + - Value: 0x1 + - Value: 0xCB + - Value: 0x1 + BlockData: [ 0x6F ] + - Value: 0x288 + - Value: 0x1 + - Value: 0xBB + - Value: 
0x1D7 + - AbbrCode: 0x17 + Values: + - Value: 0x1 + BlockData: [ 0x50 ] + - Value: 0xE8 + - Value: 0x2A8 + - Value: 0x1 + - AbbrCode: 0x18 + Values: + - Value: 0x1 + BlockData: [ 0x51 ] + - Value: 0xED + - Value: 0x1 + - Value: 0x1B + - Value: 0x20B + - AbbrCode: 0x0 + - AbbrCode: 0x6 + Values: + - Value: 0xD3 + - Value: 0x5 + - Value: 0x4 + - AbbrCode: 0x15 + Values: + - Value: 0x1C8 + - AbbrCode: 0x0 + debug_line: + - Length: 221 + Version: 4 + PrologueLength: 42 + MinInstLength: 1 + MaxOpsPerInst: 1 + DefaultIsStmt: 1 + LineBase: 251 + LineRange: 14 + OpcodeBase: 13 + StandardOpcodeLengths: [ 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1 ] + Files: + - Name: stmt_seq_macho.cpp + DirIdx: 0 + ModTime: 0 + Length: 0 + Opcodes: + - Opcode: DW_LNS_set_column + Data: 14 + - Opcode: DW_LNS_set_prologue_end + Data: 0 + - Opcode: DW_LNS_extended_op + ExtLen: 9 + SubOpcode: DW_LNE_set_address + Data: 0 + - Opcode: 0x16 + Data: 0 + - Opcode: DW_LNS_set_column + Data: 5 + - Opcode: DW_LNS_negate_stmt + Data: 0 + - Opcode: 0x4A + Data: 0 + - Opcode: DW_LNS_extended_op + ExtLen: 1 + SubOpcode: DW_LNE_end_sequence + Data: 0 + - Opcode: DW_LNS_set_column + Data: 14 + - Opcode: DW_LNS_set_prologue_end + Data: 0 + - Opcode: DW_LNS_extended_op + ExtLen: 9 + SubOpcode: DW_LNE_set_address + Data: 8 + - Opcode: 0x1A + Data: 0 + - Opcode: DW_LNS_set_column + Data: 5 + - Opcode: DW_LNS_negate_stmt + Data: 0 + - Opcode: 0x4A + Data: 0 + - Opcode: DW_LNS_extended_op + ExtLen: 1 + SubOpcode: DW_LNE_end_sequence + Data: 0 + - Opcode: DW_LNS_set_column + Data: 14 + - Opcode: DW_LNS_set_prologue_end + Data: 0 + - Opcode: DW_LNS_extended_op + ExtLen: 9 + SubOpcode: DW_LNE_set_address + Data: 16 + - Opcode: DW_LNS_advance_line + SData: 13 + Data: 0 + - Opcode: DW_LNS_copy + Data: 0 + - Opcode: DW_LNS_set_column + Data: 5 + - Opcode: DW_LNS_negate_stmt + Data: 0 + - Opcode: 0x4A + Data: 0 + - Opcode: DW_LNS_extended_op + ExtLen: 1 + SubOpcode: DW_LNE_end_sequence + Data: 0 + - Opcode: DW_LNS_set_column + 
Data: 20 + - Opcode: DW_LNS_set_prologue_end + Data: 0 + - Opcode: DW_LNS_extended_op + ExtLen: 9 + SubOpcode: DW_LNE_set_address + Data: 24 + - Opcode: DW_LNS_advance_line + SData: 17 + Data: 0 + - Opcode: DW_LNS_copy + Data: 0 + - Opcode: DW_LNS_set_column + Data: 5 + - Opcode: 0x4B + Data: 0 + - Opcode: DW_LNS_extended_op + ExtLen: 1 + SubOpcode: DW_LNE_end_sequence + Data: 0 + - Opcode: DW_LNS_extended_op + ExtLen: 9 + SubOpcode: DW_LNE_set_address + Data: 32 + - Opcode: DW_LNS_advance_line + SData: 29 + Data: 0 + - Opcode: DW_LNS_copy + Data: 0 + - Opcode: DW_LNS_set_column + Data: 12 + - Opcode: DW_LNS_set_prologue_end + Data: 0 + - Opcode: 0xF4 + Data: 0 + - Opcode: 0xBB + Data: 0 + - Opcode: DW_LNS_set_column + Data: 9 + - Opcode: DW_LNS_negate_stmt + Data: 0 + - Opcode: 0x82 + Data: 0 + - Opcode: DW_LNS_set_column + Data: 12 + - Opcode: DW_LNS_negate_stmt + Data: 0 + - Opcode: 0x4B + Data: 0 + - Opcode: DW_LNS_set_column + Data: 18 + - Opcode: 0xBB + Data: 0 + - Opcode: DW_LNS_set_column + Data: 9 + - Opcode: 0xF1 + Data: 0 + - Opcode: DW_LNS_set_column + Data: 5 + - Opcode: DW_LNS_set_epilogue_begin + Data: 0 + - Opcode: 0x4C + Data: 0 + - Opcode: DW_LNS_extended_op + ExtLen: 1 + SubOpcode: DW_LNE_end_sequence + Data: 0 + - Opcode: DW_LNS_set_column + Data: 85 + - Opcode: DW_LNS_set_prologue_end + Data: 0 + - Opcode: DW_LNS_extended_op + ExtLen: 9 + SubOpcode: DW_LNE_set_address + Data: 120 + - Opcode: DW_LNS_advance_line + SData: 26 + Data: 0 + - Opcode: DW_LNS_copy + Data: 0 + - Opcode: DW_LNS_extended_op + ExtLen: 1 + SubOpcode: DW_LNE_end_sequence + Data: 0 + - Opcode: DW_LNS_set_column + Data: 86 + - Opcode: DW_LNS_set_prologue_end + Data: 0 + - Opcode: DW_LNS_extended_op + ExtLen: 9 + SubOpcode: DW_LNE_set_address + Data: 124 + - Opcode: DW_LNS_advance_line + SData: 26 + Data: 0 + - Opcode: DW_LNS_copy + Data: 0 + - Opcode: DW_LNS_advance_pc + Data: 4 + - Opcode: DW_LNS_extended_op + ExtLen: 1 + SubOpcode: DW_LNE_end_sequence + Data: 0 +... 
+ +#--- stmt_seq_macho.exe.yaml +--- !mach-o +FileHeader: + magic: 0xFEEDFACF + cputype: 0x100000C + cpusubtype: 0x0 + filetype: 0x2 + ncmds: 13 + sizeofcmds: 688 + flags: 0x200085 + reserved: 0x0 +LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 72 + segname: __PAGEZERO + vmaddr: 0 + vmsize: 4294967296 + fileoff: 0 + filesize: 0 + maxprot: 0 + initprot: 0 + nsects: 0 + flags: 0 + - cmd: LC_SEGMENT_64 + cmdsize: 232 + segname: __TEXT + vmaddr: 4294967296 + vmsize: 16384 + fileoff: 0 + filesize: 16384 + maxprot: 5 + initprot: 5 + nsects: 2 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 0x1000002F0 + size: 112 + offset: 0x2F0 + align: 2 + reloff: 0x0 + nreloc: 0 + flags: 0x80000400 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 00580051C0035FD600100011C0035FD6FFC300D1F44F01A9FD7B02A9FD83009160008052F7FFFF97F30300AA20058052F6FFFF971400130B60018052F1FFFF97F30300AA610100101F2003D5E03F0091060000948002130BFD7B42A9F44F41A9FFC30091C0035FD601000014C0035FD6 + - sectname: __cstring + segname: __TEXT + addr: 0x100000360 + size: 5 + offset: 0x360 + align: 0 + reloff: 0x0 + nreloc: 0 + flags: 0x2 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: '7465737400' + - cmd: LC_SEGMENT_64 + cmdsize: 72 + segname: __LINKEDIT + vmaddr: 4294983680 + vmsize: 960 + fileoff: 16384 + filesize: 960 + maxprot: 1 + initprot: 1 + nsects: 0 + flags: 0 + - cmd: LC_DYLD_INFO_ONLY + cmdsize: 48 + rebase_off: 0 + rebase_size: 0 + bind_off: 0 + bind_size: 0 + weak_bind_off: 0 + weak_bind_size: 0 + lazy_bind_off: 0 + lazy_bind_size: 0 + export_off: 16384 + export_size: 96 + - cmd: LC_SYMTAB + cmdsize: 24 + symoff: 16488 + nsyms: 22 + stroff: 16840 + strsize: 192 + - cmd: LC_DYSYMTAB + cmdsize: 80 + ilocalsym: 0 + nlocalsym: 17 + iextdefsym: 17 + nextdefsym: 5 + iundefsym: 22 + nundefsym: 0 + tocoff: 0 + ntoc: 0 + modtaboff: 0 + nmodtab: 0 + extrefsymoff: 0 + nextrefsyms: 0 + indirectsymoff: 0 + nindirectsyms: 0 + extreloff: 0 + nextrel: 0 + locreloff: 0 + 
nlocrel: 0 + - cmd: LC_LOAD_DYLINKER + cmdsize: 32 + name: 12 + Content: '/usr/lib/dyld' + ZeroPadBytes: 7 + - cmd: LC_UUID + cmdsize: 24 + uuid: 4C4C4480-5555-3144-A138-E5DA50CC68DB + - cmd: LC_BUILD_VERSION + cmdsize: 32 + platform: 1 + minos: 720896 + sdk: 720896 + ntools: 1 + Tools: + - tool: 4 + version: 1376256 + - cmd: LC_MAIN + cmdsize: 24 + entryoff: 768 + stacksize: 0 + - cmd: LC_FUNCTION_STARTS + cmdsize: 16 + dataoff: 16480 + datasize: 8 + - cmd: LC_DATA_IN_CODE + cmdsize: 16 + dataoff: 16488 + datasize: 0 + - cmd: LC_CODE_SIGNATURE + cmdsize: 16 + dataoff: 17040 + datasize: 304 +LinkEditData: + ExportTrie: + TerminalSize: 0 + NodeOffset: 0 + Name: '' + Flags: 0x0 + Address: 0x0 + Other: 0x0 + ImportName: '' + Children: + - TerminalSize: 0 + NodeOffset: 5 + Name: _ + Flags: 0x0 + Address: 0x0 + Other: 0x0 + ImportName: '' + Children: + - TerminalSize: 2 + NodeOffset: 43 + Name: _mh_execute_header + Flags: 0x0 + Address: 0x0 + Other: 0x0 + ImportName: '' + - TerminalSize: 3 + NodeOffset: 47 + Name: main + Flags: 0x0 + Address: 0x300 + Other: 0x0 + ImportName: '' + - TerminalSize: 0 + NodeOffset: 52 + Name: function + Flags: 0x0 + Address: 0x0 + Other: 0x0 + ImportName: '' + Children: + - TerminalSize: 0 + NodeOffset: 71 + Name: 2_copy + Flags: 0x0 + Address: 0x0 + Other: 0x0 + ImportName: '' + Children: + - TerminalSize: 3 + NodeOffset: 79 + Name: '1' + Flags: 0x0 + Address: 0x2F0 + Other: 0x0 + ImportName: '' + - TerminalSize: 3 + NodeOffset: 84 + Name: '2' + Flags: 0x0 + Address: 0x2F0 + Other: 0x0 + ImportName: '' + - TerminalSize: 3 + NodeOffset: 89 + Name: 3_copy2 + Flags: 0x0 + Address: 0x2F8 + Other: 0x0 + ImportName: '' + NameList: + - n_strx: 129 + n_type: 0x64 + n_sect: 0 + n_desc: 0 + n_value: 0 + - n_strx: 170 + n_type: 0x66 + n_sect: 0 + n_desc: 1 + n_value: 0 + - n_strx: 59 + n_type: 0x24 + n_sect: 1 + n_desc: 0 + n_value: 4294968152 + - n_strx: 1 + n_type: 0x24 + n_sect: 0 + n_desc: 0 + n_value: 4 + - n_strx: 84 + n_type: 0x24 + n_sect: 1 
+ n_desc: 0 + n_value: 4294968156 + - n_strx: 1 + n_type: 0x24 + n_sect: 0 + n_desc: 0 + n_value: 4 + - n_strx: 2 + n_type: 0x24 + n_sect: 1 + n_desc: 0 + n_value: 4294968064 + - n_strx: 1 + n_type: 0x24 + n_sect: 0 + n_desc: 0 + n_value: 88 + - n_strx: 8 + n_type: 0x24 + n_sect: 1 + n_desc: 0 + n_value: 4294968048 + - n_strx: 1 + n_type: 0x24 + n_sect: 0 + n_desc: 0 + n_value: 8 + - n_strx: 25 + n_type: 0x24 + n_sect: 1 + n_desc: 0 + n_value: 4294968056 + - n_strx: 1 + n_type: 0x24 + n_sect: 0 + n_desc: 0 + n_value: 8 + - n_strx: 42 + n_type: 0x24 + n_sect: 1 + n_desc: 0 + n_value: 4294968048 + - n_strx: 1 + n_type: 0x24 + n_sect: 0 + n_desc: 0 + n_value: 8 + - n_strx: 1 + n_type: 0x64 + n_sect: 1 + n_desc: 0 + n_value: 0 + - n_strx: 59 + n_type: 0x1E + n_sect: 1 + n_desc: 0 + n_value: 4294968152 + - n_strx: 84 + n_type: 0x1E + n_sect: 1 + n_desc: 0 + n_value: 4294968156 + - n_strx: 2 + n_type: 0xF + n_sect: 1 + n_desc: 0 + n_value: 4294968064 + - n_strx: 8 + n_type: 0xF + n_sect: 1 + n_desc: 0 + n_value: 4294968048 + - n_strx: 25 + n_type: 0xF + n_sect: 1 + n_desc: 0 + n_value: 4294968056 + - n_strx: 42 + n_type: 0xF + n_sect: 1 + n_desc: 0 + n_value: 4294968048 + - n_strx: 109 + n_type: 0xF + n_sect: 1 + n_desc: 16 + n_value: 4294967296 + StringTable: + - ' ' + - _main + - _function2_copy1 + - _function3_copy2 + - _function2_copy2 + - __ZN12length_errorC1EPKc + - __ZN12length_errorC2EPKc + - __mh_execute_header + - '/private/tmp/stmt_seq/stmt_seq_macho.cpp' + - stmt_seq_macho.o + - '' + - '' + - '' + - '' + - '' + FunctionStarts: [ 0x2F0, 0x2F8, 0x300, 0x358, 0x35C ] +... 
diff --git a/llvm/test/tools/dsymutil/Inputs/private/tmp/stmt_seq/stmt_seq_macho.exe b/llvm/test/tools/dsymutil/Inputs/private/tmp/stmt_seq/stmt_seq_macho.exe deleted file mode 100755 index 4dd4ee8deb0b4f5dfed1c148d9cc302e466a4059..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 17360 zcmeI4T}V_>5P)azY6)g#T3J$UA<_T3TLe+jYA$9b>4Olym=+C$noQ@f_0?I8 zd{hGo*~yZU8v{RiRg~>1cbAmOv$~8tkK}%|Nu*6RCv#i@5HHkMO!IJIO}K>W$V2tJ z02bi*pa|e8j+v;Hs3h5@(phNDzsPLdWJASw2J9`dsJ%IG$llP_+<-PoJ}T8wQBiVq z|9(fwm3uR{m2@pU??j&s)i@(nm8s9ua0W0RR^fSRA4onlM_GwF7q!j>kb&cB)24Hh zEJ3AsPccBDk(}I9ix>6|TE!FGluhHrT^JhmSbE1!D7|AH&|L@dsxK207e85{&z2sq z`i%2j$jPj;#H&7-{(1|{U1qaBez^m>X8>G?D><2CCyIMVJw?5kubc8>DY-OiYAoq( zow_WJFaajO1egF5U;<2l2`~XBzyz286JP>NfC(@GCcp%k025#WOn?b60Vco%m;e)C z0!)AjFaajO1egF5U;<2l2`~XBzyz286JP>NfC(@GCcp%k025#WOn?b60Vco%z#_1} z-YSBAp|+~N7EKj?wZ1`(!rEYcAO!ZtmXJ>ighQ~2TFk8w$Bt*kDSY8b8*P z;sB2ms;!W~Y{ug&^_eHqGcvr`rf?DP33`tZ-}CQ{-70O^1AWW zEAFu=`ezDg&ottaaf__nEExY2GGvTrR=z#Zy!`2*@!@wZ&vO@VY47mg?pp9|_4Vba zM=lLt&mTC|f72Vj|E6&;ZFp$tu;THiPn^7U@_6U7JLh(tU3c|=<8ua&%}Zo&>R|b#mo-pOQ+ln_*DaR_Uyo;!xgOH6QCw4D zYgGxyna!pb=bd~hqwPnZ9G&{~TFO_o_8qlTLOfK5hcKtRMg}jHPFJ!gx!%oqyL9%v z#(T%NR%(=nc)8+Si3#<1XDzqv$GU37@7m+ozo4r&#KR`4Xg%IOJ)fub{QlPBf@~P; z@KWV+KI;@yg=`f`^?0jn29Q_dO_;`T-5eX@mC`5F3#rGupx3us*MFvM9Y|{p@zngv zsZ2fIFY54~y+gd(bLEEXeOKeXp(k_UL-E}Ho2y_0QByy^?=`zX*EHS?UMgSt6>~ne z?4R!ET#;~F@XGpq-q3h^+Fj+n+u$V-?ghKAPak^NF)&KRo)GQ{NNzgw24M3M*C5Y9 zvLXp4ao%LM#MZWg^G>Kkgsxi7@5GFlRI<+g;0vKZT&zdU%lOeOtvM;)|Mm#VNMFrjmjM-HAXklE;otOGAdsU>COrpPRl^- zbdiW|)TnkfMz7C8q0V0u16q4fOG|{rtHv1DPAM`*H6IWuX4f!GR!Dv{7`Cn8ndM+d z&yL`Y(~*$u#I5ET6cgbCfd>K}ti+OXB8=_W1>X>Q)czP0A$+@q6?`}lfTxLSAn-6$ ze@qhM&!;0VF87S|J|D4y#F+>mi~wDTW?OiAj1R3nxXBE-)s0JAk=wgo+BPmvhrft^ z3{Jp*3KMM)%Y6Y(gipkg+lhG&LkY#CTlQc?%_4(&blM0GLJ3nr!`9^p(cP{B+cqy~ zHvD{~7<>#BJy;2*2^#aRsC2 zAO?AFvqghs9me2&5`!&lwuA2?2if!rgulrg0d=(u!X-;TVl0dQhADAaf#E zyTq!}4UQpI`wyt9R+%j^Oz<&IvQyOtq6m{_z%PMi;eC#GGurBTsr_;0-uouTmPrgr 
zn)fi+W_BQbfPoo&qLkaZl4#ngDJ(tEL@#IISDZd%w+l9L%g;Fe2E{lHAU#{d%rfS*#TA|!;GDKz}Rlz5EbZTn!m~(XEsWE5zSaQs9#N1*r zQ^Q>MI+;@Wx%ir$sI%R8qAs2gg;cI6^4a2iZNbS_Dy53Zm*#UBtvgRW9#3qHJ`x{G z9?poZ{)F=9iZziEX;H~m7xOid71hOps4e8GqAJFZj*ere)k)<+FrO;=q;P*t;(e)q zFg|EN=id2O>Fitk-k$KscV6#&=IGa;AA&yhiSp4Cr(a%PLG5|)55YeJ{~G*T@Sngp z!E9LKv$@$vq}RcGKFxbaEq%tP`BYZtS*8s5;m!5}V{E?z>~brB|Ayl1qFQ|7e1 zXb8Dqm=SL!8Qsm87v!}bcBQ*Y!S`6wQzyiw(k(iT(4Q*_Dx>1 zN~=v8e-|;#DyoK#X5SgR)pywLg!;a12afPzpJlmOq_thr z_c{8^0^##`iRcnju@`;EA<@NiDZ& zc|~oI@cl^3RV`0xIie;1FT|hkP2f>}TT4gFAxJ>_KBDcbT3*s}Mau;(*^0y&)-tN4 zKGMElY5N;mp3-toOH+S9+fANnZiPBRK>!<~8pOm_wBsvv_I Date: Tue, 15 Apr 2025 09:59:23 -0700 Subject: [PATCH 011/710] [clang][depscan] Centralize logic for populating StableDirs, NFC (#135704) Pass a reference to `StableDirs` when creating ModuleDepCollector. This avoids needing to create one from the same ScanInstance for each call to `handleTopLevelModule` & reduces the amount of potential downstream changes needed for handling StableDirs. 
--- .../DependencyScanning/ModuleDepCollector.h | 6 +++- .../DependencyScanningWorker.cpp | 29 ++++++++++--------- .../DependencyScanning/ModuleDepCollector.cpp | 20 +++++-------- 3 files changed, 28 insertions(+), 27 deletions(-) diff --git a/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h b/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h index ce5e67d2624d9..d2d0d56e5212c 100644 --- a/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h +++ b/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h @@ -282,7 +282,8 @@ class ModuleDepCollector final : public DependencyCollector { CompilerInstance &ScanInstance, DependencyConsumer &C, DependencyActionController &Controller, CompilerInvocation OriginalCI, - const PrebuiltModulesAttrsMap PrebuiltModulesASTMap); + const PrebuiltModulesAttrsMap PrebuiltModulesASTMap, + const ArrayRef StableDirs); void attachToPreprocessor(Preprocessor &PP) override; void attachToASTReader(ASTReader &R) override; @@ -305,6 +306,9 @@ class ModuleDepCollector final : public DependencyCollector { /// Mapping from prebuilt AST filepaths to their attributes referenced during /// dependency collecting. const PrebuiltModulesAttrsMap PrebuiltModulesASTMap; + /// Directory paths known to be stable through an active development and build + /// cycle. + const ArrayRef StableDirs; /// Path to the main source file. std::string MainFile; /// Hash identifying the compilation conditions of the current TU. 
diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp index 6595f8ff5dc55..bae436afe0897 100644 --- a/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp +++ b/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp @@ -100,7 +100,7 @@ class PrebuiltModuleListener : public ASTReaderListener { PrebuiltModulesAttrsMap &PrebuiltModulesASTMap, const HeaderSearchOptions &HSOpts, const LangOptions &LangOpts, DiagnosticsEngine &Diags, - const llvm::SmallVector &StableDirs) + const ArrayRef StableDirs) : PrebuiltModuleFiles(PrebuiltModuleFiles), NewModuleFiles(NewModuleFiles), PrebuiltModulesASTMap(PrebuiltModulesASTMap), ExistingHSOpts(HSOpts), @@ -199,7 +199,7 @@ class PrebuiltModuleListener : public ASTReaderListener { const LangOptions &ExistingLangOpts; DiagnosticsEngine &Diags; std::string CurrentFile; - const llvm::SmallVector &StableDirs; + const ArrayRef StableDirs; }; /// Visit the given prebuilt module and collect all of the modules it @@ -208,16 +208,8 @@ static bool visitPrebuiltModule(StringRef PrebuiltModuleFilename, CompilerInstance &CI, PrebuiltModuleFilesT &ModuleFiles, PrebuiltModulesAttrsMap &PrebuiltModulesASTMap, - DiagnosticsEngine &Diags) { - - // Gather the set of stable directories to use as transitive dependencies are - // discovered. - llvm::SmallVector StableDirs; - std::string SysrootToUse(CI.getHeaderSearchOpts().Sysroot); - if (!SysrootToUse.empty() && - (llvm::sys::path::root_directory(SysrootToUse) != SysrootToUse)) - StableDirs = {SysrootToUse, CI.getHeaderSearchOpts().ResourceDir}; - + DiagnosticsEngine &Diags, + const ArrayRef StableDirs) { // List of module files to be processed. 
llvm::SmallVector Worklist; @@ -448,6 +440,15 @@ class DependencyScanningAction : public tooling::ToolAction { auto *FileMgr = ScanInstance.createFileManager(FS); ScanInstance.createSourceManager(*FileMgr); + // Create a collection of stable directories derived from the ScanInstance + // for determining whether module dependencies would fully resolve from + // those directories. + llvm::SmallVector StableDirs; + const StringRef Sysroot = ScanInstance.getHeaderSearchOpts().Sysroot; + if (!Sysroot.empty() && + (llvm::sys::path::root_directory(Sysroot) != Sysroot)) + StableDirs = {Sysroot, ScanInstance.getHeaderSearchOpts().ResourceDir}; + // Store a mapping of prebuilt module files and their properties like header // search options. This will prevent the implicit build to create duplicate // modules and will force reuse of the existing prebuilt module files @@ -459,7 +460,7 @@ class DependencyScanningAction : public tooling::ToolAction { ScanInstance.getPreprocessorOpts().ImplicitPCHInclude, ScanInstance, ScanInstance.getHeaderSearchOpts().PrebuiltModuleFiles, - PrebuiltModulesASTMap, ScanInstance.getDiagnostics())) + PrebuiltModulesASTMap, ScanInstance.getDiagnostics(), StableDirs)) return false; // Create the dependency collector that will collect the produced @@ -489,7 +490,7 @@ class DependencyScanningAction : public tooling::ToolAction { case ScanningOutputFormat::Full: MDC = std::make_shared( Service, std::move(Opts), ScanInstance, Consumer, Controller, - OriginalInvocation, std::move(PrebuiltModulesASTMap)); + OriginalInvocation, std::move(PrebuiltModulesASTMap), StableDirs); ScanInstance.addDependencyCollector(MDC); break; } diff --git a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp index ebd392fbfa7d6..429bf823616da 100644 --- a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp +++ b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp @@ -763,14 +763,9 @@ 
ModuleDepCollectorPP::handleTopLevelModule(const Module *M) { MD.IsSystem = M->IsSystem; // Start off with the assumption that this module is shareable when there - // is a sysroot provided. As more dependencies are discovered, check if those - // come from the provided shared directories. - const llvm::SmallVector StableDirs = { - MDC.ScanInstance.getHeaderSearchOpts().Sysroot, - MDC.ScanInstance.getHeaderSearchOpts().ResourceDir}; - MD.IsInStableDirectories = - !StableDirs[0].empty() && - (llvm::sys::path::root_directory(StableDirs[0]) != StableDirs[0]); + // are stable directories. As more dependencies are discovered, check if those + // come from the provided directories. + MD.IsInStableDirectories = !MDC.StableDirs.empty(); // For modules which use export_as link name, the linked product that of the // corresponding export_as-named module. @@ -817,7 +812,7 @@ ModuleDepCollectorPP::handleTopLevelModule(const Module *M) { auto FullFilePath = ASTReader::ResolveImportedPath( PathBuf, IFI.UnresolvedImportedFilename, MF->BaseDirectory); MD.IsInStableDirectories = - isPathInStableDir(StableDirs, *FullFilePath); + isPathInStableDir(MDC.StableDirs, *FullFilePath); } if (!(IFI.TopLevel && IFI.ModuleMap)) return; @@ -864,7 +859,7 @@ ModuleDepCollectorPP::handleTopLevelModule(const Module *M) { // IsInStableDirectories. 
if (MD.IsInStableDirectories) MD.IsInStableDirectories = - areOptionsInStableDir(StableDirs, CI.getHeaderSearchOpts()); + areOptionsInStableDir(MDC.StableDirs, CI.getHeaderSearchOpts()); MDC.associateWithContextHash(CI, IgnoreCWD, MD); @@ -978,11 +973,12 @@ ModuleDepCollector::ModuleDepCollector( std::unique_ptr Opts, CompilerInstance &ScanInstance, DependencyConsumer &C, DependencyActionController &Controller, CompilerInvocation OriginalCI, - const PrebuiltModulesAttrsMap PrebuiltModulesASTMap) + const PrebuiltModulesAttrsMap PrebuiltModulesASTMap, + const ArrayRef StableDirs) : Service(Service), ScanInstance(ScanInstance), Consumer(C), Controller(Controller), PrebuiltModulesASTMap(std::move(PrebuiltModulesASTMap)), - Opts(std::move(Opts)), + StableDirs(StableDirs), Opts(std::move(Opts)), CommonInvocation( makeCommonInvocationForModuleBuild(std::move(OriginalCI))) {} From 3192ecfa89a48b5f56ff36956abe7e84327ced5d Mon Sep 17 00:00:00 2001 From: Akira Hatanaka Date: Tue, 15 Apr 2025 10:12:18 -0700 Subject: [PATCH 012/710] [CodeComplete] Don't drop ArrayToPointerDecay when doing member completion (#134951) Fixes https://github.com/llvm/llvm-project/issues/123146. rdar://138851576 --- clang/lib/Sema/SemaCodeComplete.cpp | 5 ++++- clang/test/CodeCompletion/member-access.c | 9 +++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/clang/lib/Sema/SemaCodeComplete.cpp b/clang/lib/Sema/SemaCodeComplete.cpp index 793ffb6a00b86..f6ec4cb0f069e 100644 --- a/clang/lib/Sema/SemaCodeComplete.cpp +++ b/clang/lib/Sema/SemaCodeComplete.cpp @@ -5726,7 +5726,10 @@ class ConceptInfo { QualType getApproximateType(const Expr *E, HeuristicResolver &Resolver) { if (E->getType().isNull()) return QualType(); - E = E->IgnoreParenImpCasts(); + // Don't drop implicit cast if it's an array decay. 
+ if (auto *ICE = dyn_cast(E); + !ICE || ICE->getCastKind() != CK_ArrayToPointerDecay) + E = E->IgnoreParenImpCasts(); QualType Unresolved = E->getType(); // Resolve DependentNameType if (const auto *DNT = Unresolved->getAs()) { diff --git a/clang/test/CodeCompletion/member-access.c b/clang/test/CodeCompletion/member-access.c index fc54993152815..f08d1957fb90a 100644 --- a/clang/test/CodeCompletion/member-access.c +++ b/clang/test/CodeCompletion/member-access.c @@ -36,3 +36,12 @@ void test4(struct Point *p) { } // RUN: %clang_cc1 -fsyntax-only -code-completion-with-fixits -code-completion-at=%s:%(line-3):13 %s -o - | FileCheck -check-prefix=CHECK-CC1 %s // RUN: %clang_cc1 -fsyntax-only -code-completion-with-fixits -code-completion-at=%s:%(line-3):23 %s -o - | FileCheck -check-prefix=CHECK-CC1 %s + +float test5(void) { + struct Point array[4]; + return array->x; +} +// RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:%(line-2):17 %s -o - | FileCheck -check-prefix=CHECK-CC4 %s +// CHECK-CC4: COMPLETION: x : [#float#]x +// CHECK-CC4: COMPLETION: y : [#float#]y +// CHECK-CC4: COMPLETION: z : [#float#]z From 1545f1139127b92be15fcd2964114028a2d07194 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Tue, 15 Apr 2025 10:39:08 -0700 Subject: [PATCH 013/710] [NFC][NVPTX] Use StringRef for Modifier arg in NVPTXInstPrinter (#135793) - Use StringRef type for Modifier instead of const char *. - Remove Modifier arg from functions that do not need them. 
--- .../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 28 +++++++----------- .../NVPTX/MCTargetDesc/NVPTXInstPrinter.h | 29 ++++++++----------- 2 files changed, 23 insertions(+), 34 deletions(-) diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp index e42e738b9973f..4e2e4c99df803 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp @@ -95,10 +95,9 @@ void NVPTXInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, } void NVPTXInstPrinter::printCvtMode(const MCInst *MI, int OpNum, raw_ostream &O, - const char *M) { + StringRef Modifier) { const MCOperand &MO = MI->getOperand(OpNum); int64_t Imm = MO.getImm(); - llvm::StringRef Modifier(M); if (Modifier == "ftz") { // FTZ flag @@ -155,10 +154,9 @@ void NVPTXInstPrinter::printCvtMode(const MCInst *MI, int OpNum, raw_ostream &O, } void NVPTXInstPrinter::printCmpMode(const MCInst *MI, int OpNum, raw_ostream &O, - const char *M) { + StringRef Modifier) { const MCOperand &MO = MI->getOperand(OpNum); int64_t Imm = MO.getImm(); - llvm::StringRef Modifier(M); if (Modifier == "ftz") { // FTZ flag @@ -229,8 +227,7 @@ void NVPTXInstPrinter::printCmpMode(const MCInst *MI, int OpNum, raw_ostream &O, } void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum, - raw_ostream &O, const char *M) { - llvm::StringRef Modifier(M); + raw_ostream &O, StringRef Modifier) { const MCOperand &MO = MI->getOperand(OpNum); int Imm = (int)MO.getImm(); if (Modifier == "sem") { @@ -329,10 +326,9 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum, } void NVPTXInstPrinter::printMmaCode(const MCInst *MI, int OpNum, raw_ostream &O, - const char *M) { + StringRef Modifier) { const MCOperand &MO = MI->getOperand(OpNum); int Imm = (int)MO.getImm(); - llvm::StringRef Modifier(M); if (Modifier.empty() || Modifier == "version") { O << Imm; // Just print out PTX version return; @@ 
-346,9 +342,8 @@ void NVPTXInstPrinter::printMmaCode(const MCInst *MI, int OpNum, raw_ostream &O, } void NVPTXInstPrinter::printMemOperand(const MCInst *MI, int OpNum, - raw_ostream &O, const char *M) { + raw_ostream &O, StringRef Modifier) { printOperand(MI, OpNum, O); - llvm::StringRef Modifier(M); if (Modifier == "add") { O << ", "; @@ -363,7 +358,7 @@ void NVPTXInstPrinter::printMemOperand(const MCInst *MI, int OpNum, } void NVPTXInstPrinter::printOffseti32imm(const MCInst *MI, int OpNum, - raw_ostream &O, const char *Modifier) { + raw_ostream &O) { auto &Op = MI->getOperand(OpNum); assert(Op.isImm() && "Invalid operand"); if (Op.getImm() != 0) { @@ -373,13 +368,13 @@ void NVPTXInstPrinter::printOffseti32imm(const MCInst *MI, int OpNum, } void NVPTXInstPrinter::printHexu32imm(const MCInst *MI, int OpNum, - raw_ostream &O, const char *Modifier) { + raw_ostream &O) { int64_t Imm = MI->getOperand(OpNum).getImm(); O << formatHex(Imm) << "U"; } void NVPTXInstPrinter::printProtoIdent(const MCInst *MI, int OpNum, - raw_ostream &O, const char *Modifier) { + raw_ostream &O) { const MCOperand &Op = MI->getOperand(OpNum); assert(Op.isExpr() && "Call prototype is not an MCExpr?"); const MCExpr *Expr = Op.getExpr(); @@ -388,7 +383,7 @@ void NVPTXInstPrinter::printProtoIdent(const MCInst *MI, int OpNum, } void NVPTXInstPrinter::printPrmtMode(const MCInst *MI, int OpNum, - raw_ostream &O, const char *Modifier) { + raw_ostream &O) { const MCOperand &MO = MI->getOperand(OpNum); int64_t Imm = MO.getImm(); @@ -419,10 +414,9 @@ void NVPTXInstPrinter::printPrmtMode(const MCInst *MI, int OpNum, } void NVPTXInstPrinter::printTmaReductionMode(const MCInst *MI, int OpNum, - raw_ostream &O, - const char *Modifier) { + raw_ostream &O) { const MCOperand &MO = MI->getOperand(OpNum); - using RedTy = llvm::nvvm::TMAReductionOp; + using RedTy = nvvm::TMAReductionOp; switch (static_cast(MO.getImm())) { case RedTy::ADD: diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h 
b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h index 2b19386ef17fe..a2dd772cd86d0 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h @@ -37,25 +37,20 @@ class NVPTXInstPrinter : public MCInstPrinter { void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printCvtMode(const MCInst *MI, int OpNum, raw_ostream &O, - const char *Modifier = nullptr); + StringRef Modifier = {}); void printCmpMode(const MCInst *MI, int OpNum, raw_ostream &O, - const char *Modifier = nullptr); - void printLdStCode(const MCInst *MI, int OpNum, - raw_ostream &O, const char *Modifier = nullptr); + StringRef Modifier = {}); + void printLdStCode(const MCInst *MI, int OpNum, raw_ostream &O, + StringRef Modifier = {}); void printMmaCode(const MCInst *MI, int OpNum, raw_ostream &O, - const char *Modifier = nullptr); - void printMemOperand(const MCInst *MI, int OpNum, - raw_ostream &O, const char *Modifier = nullptr); - void printOffseti32imm(const MCInst *MI, int OpNum, raw_ostream &O, - const char *Modifier = nullptr); - void printHexu32imm(const MCInst *MI, int OpNum, raw_ostream &O, - const char *Modifier = nullptr); - void printProtoIdent(const MCInst *MI, int OpNum, - raw_ostream &O, const char *Modifier = nullptr); - void printPrmtMode(const MCInst *MI, int OpNum, raw_ostream &O, - const char *Modifier = nullptr); - void printTmaReductionMode(const MCInst *MI, int OpNum, raw_ostream &O, - const char *Modifier = nullptr); + StringRef Modifier = {}); + void printMemOperand(const MCInst *MI, int OpNum, raw_ostream &O, + StringRef Modifier = {}); + void printOffseti32imm(const MCInst *MI, int OpNum, raw_ostream &O); + void printHexu32imm(const MCInst *MI, int OpNum, raw_ostream &O); + void printProtoIdent(const MCInst *MI, int OpNum, raw_ostream &O); + void printPrmtMode(const MCInst *MI, int OpNum, raw_ostream &O); + void printTmaReductionMode(const MCInst *MI, int OpNum, raw_ostream &O); }; 
} From 68383fc20880f2d6ec6618d8870cd89e727bdc19 Mon Sep 17 00:00:00 2001 From: Nicolas van Kempen Date: Tue, 15 Apr 2025 13:42:54 -0400 Subject: [PATCH 014/710] [NFC][clang-tidy] Remove {{^}} clauses in some tests (1/N) (#134737) `check_clang_tidy` now matches full lines only, so `{{^}}` clauses are no longer necessary. I am splitting those changes over multiple PRs to make review easier. Numbering them but the actual order doesn't matter. --- .../checkers/performance/for-range-copy.cpp | 16 +- .../performance/inefficient-algorithm.cpp | 34 +- .../performance/trivially-destructible.cpp | 6 +- .../performance/type-promotion-in-math-fn.cpp | 126 ++++---- .../readability/delete-null-pointer.cpp | 20 +- .../readability/else-after-return-no-warn.cpp | 6 +- ...g-hungarian-notation-lower-case-prefix.cpp | 302 +++++++++--------- .../identifier-naming-outofline.cpp | 6 +- .../readability/isolate-declaration-cxx17.cpp | 22 +- .../readability/isolate-declaration.c | 2 +- .../redundant-declaration-ignore-macros.cpp | 10 +- 11 files changed, 274 insertions(+), 276 deletions(-) diff --git a/clang-tools-extra/test/clang-tidy/checkers/performance/for-range-copy.cpp b/clang-tools-extra/test/clang-tidy/checkers/performance/for-range-copy.cpp index 00e135bd2c920..0b5ef50fdbd7f 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/performance/for-range-copy.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/performance/for-range-copy.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy --match-partial-fixes %s performance-for-range-copy %t -- -- -fno-delayed-template-parsing +// RUN: %check_clang_tidy %s performance-for-range-copy %t -- -- -fno-delayed-template-parsing namespace std { @@ -79,7 +79,7 @@ template void uninstantiated() { for (const S S1 : View>()) {} // CHECK-MESSAGES: [[@LINE-1]]:16: warning: the loop variable's type is not a reference type; this creates a copy in each iteration; consider making this a reference [performance-for-range-copy] - // CHECK-FIXES: {{^}} for 
(const S& S1 : View>()) {} + // CHECK-FIXES: for (const S& S1 : View>()) {} // Don't warn on dependent types. for (const T t1 : View>()) { @@ -90,15 +90,15 @@ template void instantiated() { for (const S S2 : View>()) {} // CHECK-MESSAGES: [[@LINE-1]]:16: warning: the loop variable's type is {{.*}} - // CHECK-FIXES: {{^}} for (const S& S2 : View>()) {} + // CHECK-FIXES: for (const S& S2 : View>()) {} for (const auto [X, Y] : View>()) {} // CHECK-MESSAGES: [[@LINE-1]]:19: warning: the loop variable's type is - // CHECK-FIXES: {{^}} for (const auto& [X, Y] : View>()) {} + // CHECK-FIXES: for (const auto& [X, Y] : View>()) {} for (const T T2 : View>()) {} // CHECK-MESSAGES: [[@LINE-1]]:16: warning: the loop variable's type is {{.*}} - // CHECK-FIXES: {{^}} for (const T& T2 : View>()) {} + // CHECK-FIXES: for (const T& T2 : View>()) {} } template @@ -311,10 +311,8 @@ View> createView(S) { return View>(); } void positiveValueIteratorUsedElseWhere() { for (const S SS : createView(*ValueReturningIterator())) { - // CHECK-MESSAGES: [[@LINE-1]]:16: warning: the loop variable's type is not - // a reference type; this creates a copy in each iteration; consider making - // this a reference [performance-for-range-copy] CHECK-FIXES: for (const S& - // SS : createView(*ValueReturningIterator())) { + // CHECK-MESSAGES: [[@LINE-1]]:16: warning: the loop variable's type is not a reference type; this creates a copy in each iteration; consider making this a reference [performance-for-range-copy] + // CHECK-FIXES: for (const S& SS : createView(*ValueReturningIterator())) { } } diff --git a/clang-tools-extra/test/clang-tidy/checkers/performance/inefficient-algorithm.cpp b/clang-tools-extra/test/clang-tidy/checkers/performance/inefficient-algorithm.cpp index 19a6701c5b6aa..dafff8c946bb0 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/performance/inefficient-algorithm.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/performance/inefficient-algorithm.cpp @@ -69,83 +69,83 @@ 
template void f(const T &t) { std::set s; find(s.begin(), s.end(), 46); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: this STL algorithm call should be - // CHECK-FIXES: {{^ }}s.find(46);{{$}} + // CHECK-FIXES: s.find(46); find(t.begin(), t.end(), 46); - // CHECK-FIXES: {{^ }}find(t.begin(), t.end(), 46);{{$}} + // CHECK-FIXES: find(t.begin(), t.end(), 46); } int main() { std::set s; auto it = std::find(s.begin(), s.end(), 43); // CHECK-MESSAGES: :[[@LINE-1]]:13: warning: this STL algorithm call should be replaced with a container method [performance-inefficient-algorithm] - // CHECK-FIXES: {{^ }}auto it = s.find(43);{{$}} + // CHECK-FIXES: auto it = s.find(43); auto c = count(s.begin(), s.end(), 43); // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: this STL algorithm call should be - // CHECK-FIXES: {{^ }}auto c = s.count(43);{{$}} + // CHECK-FIXES: auto c = s.count(43); #define SECOND(x, y, z) y SECOND(q,std::count(s.begin(), s.end(), 22),w); // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: this STL algorithm call should be - // CHECK-FIXES: {{^ }}SECOND(q,s.count(22),w);{{$}} + // CHECK-FIXES: SECOND(q,s.count(22),w); it = find_if(s.begin(), s.end(), [](int) { return false; }); std::multiset ms; find(ms.begin(), ms.end(), 46); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: this STL algorithm call should be - // CHECK-FIXES: {{^ }}ms.find(46);{{$}} + // CHECK-FIXES: ms.find(46); const std::multiset &msref = ms; find(msref.begin(), msref.end(), 46); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: this STL algorithm call should be - // CHECK-FIXES: {{^ }}msref.find(46);{{$}} + // CHECK-FIXES: msref.find(46); std::multiset *msptr = &ms; find(msptr->begin(), msptr->end(), 46); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: this STL algorithm call should be - // CHECK-FIXES: {{^ }}msptr->find(46);{{$}} + // CHECK-FIXES: msptr->find(46); it = std::find(s.begin(), s.end(), 43, std::greater()); // CHECK-MESSAGES: :[[@LINE-1]]:42: warning: different comparers used in the algorithm and the 
container [performance-inefficient-algorithm] FIND_IN_SET(s); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: this STL algorithm call should be - // CHECK-FIXES: {{^ }}FIND_IN_SET(s);{{$}} + // CHECK-FIXES: FIND_IN_SET(s); f(s); std::unordered_set us; lower_bound(us.begin(), us.end(), 10); - // CHECK-FIXES: {{^ }}lower_bound(us.begin(), us.end(), 10);{{$}} + // CHECK-FIXES: lower_bound(us.begin(), us.end(), 10); find(us.begin(), us.end(), 10); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: this STL algorithm call should be - // CHECK-FIXES: {{^ }}us.find(10);{{$}} + // CHECK-FIXES: us.find(10); std::unordered_multiset ums; find(ums.begin(), ums.end(), 10); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: this STL algorithm call should be - // CHECK-FIXES: {{^ }}ums.find(10);{{$}} + // CHECK-FIXES: ums.find(10); std::map intmap; find(intmap.begin(), intmap.end(), 46); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: this STL algorithm call should be - // CHECK-FIXES: {{^ }}find(intmap.begin(), intmap.end(), 46);{{$}} + // CHECK-FIXES: find(intmap.begin(), intmap.end(), 46); std::multimap intmmap; find(intmmap.begin(), intmmap.end(), 46); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: this STL algorithm call should be - // CHECK-FIXES: {{^ }}find(intmmap.begin(), intmmap.end(), 46);{{$}} + // CHECK-FIXES: find(intmmap.begin(), intmmap.end(), 46); std::unordered_map umap; find(umap.begin(), umap.end(), 46); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: this STL algorithm call should be - // CHECK-FIXES: {{^ }}find(umap.begin(), umap.end(), 46);{{$}} + // CHECK-FIXES: find(umap.begin(), umap.end(), 46); std::unordered_multimap ummap; find(ummap.begin(), ummap.end(), 46); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: this STL algorithm call should be - // CHECK-FIXES: {{^ }}find(ummap.begin(), ummap.end(), 46);{{$}} + // CHECK-FIXES: find(ummap.begin(), ummap.end(), 46); } struct Value { @@ -162,5 +162,5 @@ struct Ordering { void g(std::set container, int value) { 
lower_bound(container.begin(), container.end(), value, Ordering()); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: this STL algorithm call should be - // CHECK-FIXES: {{^ }}lower_bound(container.begin(), container.end(), value, Ordering());{{$}} + // CHECK-FIXES: lower_bound(container.begin(), container.end(), value, Ordering()); } diff --git a/clang-tools-extra/test/clang-tidy/checkers/performance/trivially-destructible.cpp b/clang-tools-extra/test/clang-tidy/checkers/performance/trivially-destructible.cpp index 2ff3eda559a52..40cf90d21467a 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/performance/trivially-destructible.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/performance/trivially-destructible.cpp @@ -21,7 +21,7 @@ struct NotTriviallyDestructible1 : TriviallyDestructible2 { NotTriviallyDestructible1::~NotTriviallyDestructible1() = default; // to-be-removed // CHECK-MESSAGES: :[[@LINE-1]]:28: note: destructor definition is here -// CHECK-FIXES: {{^}}// to-be-removed +// CHECK-FIXES: // to-be-removed // Don't emit for class template with type-dependent fields. template @@ -57,7 +57,7 @@ struct MaybeTriviallyDestructible1 { template MaybeTriviallyDestructible1::~MaybeTriviallyDestructible1() noexcept = default; // to-be-removed // CHECK-MESSAGES: :[[@LINE-1]]:35: note: destructor definition is here -// CHECK-FIXES: {{^}}// to-be-removed +// CHECK-FIXES: // to-be-removed // Emit for explicit specializations. 
template <> @@ -69,7 +69,7 @@ struct MaybeTriviallyDestructible1: TriviallyDestructible1 { MaybeTriviallyDestructible1::~MaybeTriviallyDestructible1() noexcept = default; // to-be-removed // CHECK-MESSAGES: :[[@LINE-1]]:38: note: destructor definition is here -// CHECK-FIXES: {{^}}// to-be-removed +// CHECK-FIXES: // to-be-removed struct NotTriviallyDestructible2 { virtual ~NotTriviallyDestructible2(); diff --git a/clang-tools-extra/test/clang-tidy/checkers/performance/type-promotion-in-math-fn.cpp b/clang-tools-extra/test/clang-tidy/checkers/performance/type-promotion-in-math-fn.cpp index b2da7cc393a29..5309a1667d79a 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/performance/type-promotion-in-math-fn.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/performance/type-promotion-in-math-fn.cpp @@ -67,169 +67,169 @@ void check_all_fns() { acos(a); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'acos' promotes float to double [performance-type-promotion-in-math-fn] - // CHECK-FIXES: {{^}} std::acos(a);{{$}} + // CHECK-FIXES: std::acos(a); acosh(a); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'acosh' - // CHECK-FIXES: {{^}} std::acosh(a);{{$}} + // CHECK-FIXES: std::acosh(a); asin(a); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'asin' - // CHECK-FIXES: {{^}} std::asin(a);{{$}} + // CHECK-FIXES: std::asin(a); asinh(a); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'asinh' - // CHECK-FIXES: {{^}} std::asinh(a);{{$}} + // CHECK-FIXES: std::asinh(a); atan2(a, b); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'atan2' - // CHECK-FIXES: {{^}} std::atan2(a, b);{{$}} + // CHECK-FIXES: std::atan2(a, b); atan(a); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'atan' - // CHECK-FIXES: {{^}} std::atan(a);{{$}} + // CHECK-FIXES: std::atan(a); atanh(a); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'atanh' - // CHECK-FIXES: {{^}} std::atanh(a);{{$}} + // CHECK-FIXES: std::atanh(a); cbrt(a); // CHECK-MESSAGES: :[[@LINE-1]]:3: 
warning: call to 'cbrt' - // CHECK-FIXES: {{^}} std::cbrt(a);{{$}} + // CHECK-FIXES: std::cbrt(a); ceil(a); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'ceil' - // CHECK-FIXES: {{^}} std::ceil(a);{{$}} + // CHECK-FIXES: std::ceil(a); copysign(a, b); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'copysign' - // CHECK-FIXES: {{^}} std::copysign(a, b);{{$}} + // CHECK-FIXES: std::copysign(a, b); cos(a); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'cos' - // CHECK-FIXES: {{^}} std::cos(a);{{$}} + // CHECK-FIXES: std::cos(a); cosh(a); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'cosh' - // CHECK-FIXES: {{^}} std::cosh(a);{{$}} + // CHECK-FIXES: std::cosh(a); erf(a); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'erf' - // CHECK-FIXES: {{^}} std::erf(a);{{$}} + // CHECK-FIXES: std::erf(a); erfc(a); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'erfc' - // CHECK-FIXES: {{^}} std::erfc(a);{{$}} + // CHECK-FIXES: std::erfc(a); exp2(a); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'exp2' - // CHECK-FIXES: {{^}} std::exp2(a);{{$}} + // CHECK-FIXES: std::exp2(a); exp(a); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'exp' - // CHECK-FIXES: {{^}} std::exp(a);{{$}} + // CHECK-FIXES: std::exp(a); expm1(a); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'expm1' - // CHECK-FIXES: {{^}} std::expm1(a);{{$}} + // CHECK-FIXES: std::expm1(a); fabs(a); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'fabs' - // CHECK-FIXES: {{^}} std::fabs(a);{{$}} + // CHECK-FIXES: std::fabs(a); fdim(a, b); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'fdim' - // CHECK-FIXES: {{^}} std::fdim(a, b);{{$}} + // CHECK-FIXES: std::fdim(a, b); floor(a); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'floor' - // CHECK-FIXES: {{^}} std::floor(a);{{$}} + // CHECK-FIXES: std::floor(a); fma(a, b, c); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'fma' - // CHECK-FIXES: {{^}} std::fma(a, b, c);{{$}} + // CHECK-FIXES: std::fma(a, b, 
c); fmax(a, b); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'fmax' - // CHECK-FIXES: {{^}} std::fmax(a, b);{{$}} + // CHECK-FIXES: std::fmax(a, b); fmin(a, b); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'fmin' - // CHECK-FIXES: {{^}} std::fmin(a, b);{{$}} + // CHECK-FIXES: std::fmin(a, b); fmod(a, b); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'fmod' - // CHECK-FIXES: {{^}} std::fmod(a, b);{{$}} + // CHECK-FIXES: std::fmod(a, b); frexp(a, int_ptr); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'frexp' - // CHECK-FIXES: {{^}} std::frexp(a, int_ptr);{{$}} + // CHECK-FIXES: std::frexp(a, int_ptr); hypot(a, b); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'hypot' - // CHECK-FIXES: {{^}} std::hypot(a, b);{{$}} + // CHECK-FIXES: std::hypot(a, b); ilogb(a); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'ilogb' - // CHECK-FIXES: {{^}} std::ilogb(a);{{$}} + // CHECK-FIXES: std::ilogb(a); ldexp(a, b); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'ldexp' - // CHECK-FIXES: {{^}} std::ldexp(a, b);{{$}} + // CHECK-FIXES: std::ldexp(a, b); lgamma(a); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'lgamma' - // CHECK-FIXES: {{^}} std::lgamma(a);{{$}} + // CHECK-FIXES: std::lgamma(a); llrint(a); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'llrint' - // CHECK-FIXES: {{^}} std::llrint(a);{{$}} + // CHECK-FIXES: std::llrint(a); llround(a); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'llround' - // CHECK-FIXES: {{^}} std::llround(a);{{$}} + // CHECK-FIXES: std::llround(a); log10(a); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'log10' - // CHECK-FIXES: {{^}} std::log10(a);{{$}} + // CHECK-FIXES: std::log10(a); log1p(a); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'log1p' - // CHECK-FIXES: {{^}} std::log1p(a);{{$}} + // CHECK-FIXES: std::log1p(a); log2(a); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'log2' - // CHECK-FIXES: {{^}} std::log2(a);{{$}} + // CHECK-FIXES: std::log2(a); log(a); // 
CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'log' - // CHECK-FIXES: {{^}} std::log(a);{{$}} + // CHECK-FIXES: std::log(a); logb(a); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'logb' - // CHECK-FIXES: {{^}} std::logb(a);{{$}} + // CHECK-FIXES: std::logb(a); lrint(a); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'lrint' - // CHECK-FIXES: {{^}} std::lrint(a);{{$}} + // CHECK-FIXES: std::lrint(a); lround(a); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'lround' - // CHECK-FIXES: {{^}} std::lround(a);{{$}} + // CHECK-FIXES: std::lround(a); nearbyint(a); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'nearbyint' - // CHECK-FIXES: {{^}} std::nearbyint(a);{{$}} + // CHECK-FIXES: std::nearbyint(a); nextafter(a, b); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'nextafter' - // CHECK-FIXES: {{^}} std::nextafter(a, b);{{$}} + // CHECK-FIXES: std::nextafter(a, b); nexttoward(a, b); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'nexttoward' - // CHECK-FIXES: {{^}} std::nexttoward(a, b);{{$}} + // CHECK-FIXES: std::nexttoward(a, b); pow(a, b); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'pow' - // CHECK-FIXES: {{^}} std::pow(a, b);{{$}} + // CHECK-FIXES: std::pow(a, b); remainder(a, b); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'remainder' - // CHECK-FIXES: {{^}} std::remainder(a, b);{{$}} + // CHECK-FIXES: std::remainder(a, b); remquo(a, b, int_ptr); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'remquo' - // CHECK-FIXES: {{^}} std::remquo(a, b, int_ptr);{{$}} + // CHECK-FIXES: std::remquo(a, b, int_ptr); rint(a); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'rint' - // CHECK-FIXES: {{^}} std::rint(a);{{$}} + // CHECK-FIXES: std::rint(a); round(a); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'round' - // CHECK-FIXES: {{^}} std::round(a);{{$}} + // CHECK-FIXES: std::round(a); scalbln(a, l); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'scalbln' - // CHECK-FIXES: {{^}} std::scalbln(a, 
l);{{$}} + // CHECK-FIXES: std::scalbln(a, l); scalbn(a, i); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'scalbn' - // CHECK-FIXES: {{^}} std::scalbn(a, i);{{$}} + // CHECK-FIXES: std::scalbn(a, i); sin(a); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'sin' - // CHECK-FIXES: {{^}} std::sin(a);{{$}} + // CHECK-FIXES: std::sin(a); sinh(a); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'sinh' - // CHECK-FIXES: {{^}} std::sinh(a);{{$}} + // CHECK-FIXES: std::sinh(a); sqrt(a); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'sqrt' - // CHECK-FIXES: {{^}} std::sqrt(a);{{$}} + // CHECK-FIXES: std::sqrt(a); tan(a); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'tan' - // CHECK-FIXES: {{^}} std::tan(a);{{$}} + // CHECK-FIXES: std::tan(a); tanh(a); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'tanh' - // CHECK-FIXES: {{^}} std::tanh(a);{{$}} + // CHECK-FIXES: std::tanh(a); tgamma(a); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'tgamma' - // CHECK-FIXES: {{^}} std::tgamma(a);{{$}} + // CHECK-FIXES: std::tgamma(a); trunc(a); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'trunc' - // CHECK-FIXES: {{^}} std::trunc(a);{{$}} + // CHECK-FIXES: std::trunc(a); } // nexttoward/nexttowardf are weird -- the second param is always long double. 
@@ -237,16 +237,16 @@ void check_all_fns() { void check_nexttoward() { nexttoward(0.f, 0); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'nexttoward' - // CHECK-FIXES: {{^}} std::nexttoward(0.f, 0);{{$}} + // CHECK-FIXES: std::nexttoward(0.f, 0); nexttoward(0.f, 0l); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'nexttoward' - // CHECK-FIXES: {{^}} std::nexttoward(0.f, 0l);{{$}} + // CHECK-FIXES: std::nexttoward(0.f, 0l); nexttoward(0.f, 0.f); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'nexttoward' - // CHECK-FIXES: {{^}} std::nexttoward(0.f, 0.f);{{$}} + // CHECK-FIXES: std::nexttoward(0.f, 0.f); nexttoward(0.f, 0.); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'nexttoward' - // CHECK-FIXES: {{^}} std::nexttoward(0.f, 0.);{{$}} + // CHECK-FIXES: std::nexttoward(0.f, 0.); // No warnings for these. nexttoward(0., 0); @@ -259,10 +259,10 @@ void check_nexttoward() { void check_scalbn() { scalbn(0.f, 0); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'scalbn' - // CHECK-FIXES: {{^}} std::scalbn(0.f, 0);{{$}} + // CHECK-FIXES: std::scalbn(0.f, 0); scalbn(0.f, static_cast(0)); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'scalbn' - // CHECK-FIXES: {{^}} std::scalbn(0.f, static_cast(0));{{$}} + // CHECK-FIXES: std::scalbn(0.f, static_cast(0)); // No warnings for these. scalbn(0., 0); @@ -275,10 +275,10 @@ void check_scalbn() { void check_scalbln() { scalbln(0.f, 0); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'scalbln' - // CHECK-FIXES: {{^}} std::scalbln(0.f, 0);{{$}} + // CHECK-FIXES: std::scalbln(0.f, 0); scalbln(0.f, 0l); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: call to 'scalbln' - // CHECK-FIXES: {{^}} std::scalbln(0.f, 0l);{{$}} + // CHECK-FIXES: std::scalbln(0.f, 0l); // No warnings for these. 
scalbln(0., 0); diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/delete-null-pointer.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/delete-null-pointer.cpp index 36e8f059c22b3..1a37c2bbf1133 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/delete-null-pointer.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/delete-null-pointer.cpp @@ -10,7 +10,7 @@ struct Templ { delete mem; // CHECK-MESSAGES: :[[@LINE-2]]:5: warning: 'if' statement is unnecessary; // CHECK-FIXES: // t1 - // CHECK-FIXES-NEXT: {{^ }}// t2 + // CHECK-FIXES-NEXT: // t2 // CHECK-FIXES-NEXT: delete mem; } T mem; @@ -24,7 +24,7 @@ struct TemplPtr { delete mem; // CHECK-MESSAGES: :[[@LINE-2]]:5: warning: 'if' statement is unnecessary; // CHECK-FIXES: // t3 - // CHECK-FIXES-NEXT: {{^ }}// t4 + // CHECK-FIXES-NEXT: // t4 // CHECK-FIXES-NEXT: delete mem; } T *mem; @@ -45,7 +45,7 @@ struct NeverInstantiated { delete mem; // CHECK-MESSAGES: :[[@LINE-2]]:5: warning: 'if' statement is unnecessary; // CHECK-FIXES: // t1 - // CHECK-FIXES-NEXT: {{^ }}// t2 + // CHECK-FIXES-NEXT: // t2 // CHECK-FIXES-NEXT: delete mem; } T mem; @@ -59,7 +59,7 @@ struct NeverInstantiatedPtr { delete mem; // CHECK-MESSAGES: :[[@LINE-2]]:5: warning: 'if' statement is unnecessary; // CHECK-FIXES: // t3 - // CHECK-FIXES-NEXT: {{^ }}// t4 + // CHECK-FIXES-NEXT: // t4 // CHECK-FIXES-NEXT: delete mem; } T *mem; @@ -72,7 +72,7 @@ void f() { // CHECK-MESSAGES: :[[@LINE-2]]:3: warning: 'if' statement is unnecessary; deleting null pointer has no effect [readability-delete-null-pointer] // CHECK-FIXES: int *ps = 0; - // CHECK-FIXES-NEXT: {{^ }}// #0 + // CHECK-FIXES-NEXT: // #0 // CHECK-FIXES-NEXT: delete ps; int *p = 0; @@ -83,10 +83,10 @@ void f() { } // #3 // CHECK-MESSAGES: :[[@LINE-3]]:3: warning: 'if' statement is unnecessary; deleting null pointer has no effect [readability-delete-null-pointer] - // CHECK-FIXES: {{^ }}// #1 - // CHECK-FIXES-NEXT: {{^ }}// #2 + // 
CHECK-FIXES: // #1 + // CHECK-FIXES-NEXT: // #2 // CHECK-FIXES-NEXT: delete p; - // CHECK-FIXES-NEXT: {{^ }}// #3 + // CHECK-FIXES-NEXT: // #3 int *p2 = new int[3]; // #4 @@ -95,7 +95,7 @@ void f() { // CHECK-MESSAGES: :[[@LINE-2]]:3: warning: 'if' statement is unnecessary; // CHECK-FIXES: // #4 - // CHECK-FIXES-NEXT: {{^ }}// #5 + // CHECK-FIXES-NEXT: // #5 // CHECK-FIXES-NEXT: delete[] p2; int *p3 = 0; @@ -136,7 +136,7 @@ void f() { if (mp) // #6 delete mp; // CHECK-MESSAGES: :[[@LINE-2]]:7: warning: 'if' statement is unnecessary; deleting null pointer has no effect [readability-delete-null-pointer] - // CHECK-FIXES: {{^ }}// #6 + // CHECK-FIXES: // #6 // CHECK-FIXES-NEXT: delete mp; } int *mp; diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/else-after-return-no-warn.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/else-after-return-no-warn.cpp index d17ed02148578..a67dc830b112e 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/else-after-return-no-warn.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/else-after-return-no-warn.cpp @@ -20,12 +20,12 @@ int lifeTimeExtensionTests(int a) { b++; } if (int b = a) { // comment-0 - // CHECK-FIXES: {{^}} int b = a; - // CHECK-FIXES-NEXT: {{^}}if (b) { // comment-0 + // CHECK-FIXES: int b = a; + // CHECK-FIXES-NEXT: if (b) { // comment-0 return a; } else { // comment-0 // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: do not use 'else' after 'return' - // CHECK-FIXES: {{^}} } // comment-0 + // CHECK-FIXES: } // comment-0 return b; } } diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/identifier-naming-hungarian-notation-lower-case-prefix.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/identifier-naming-hungarian-notation-lower-case-prefix.cpp index 65aee7e9f6ced..dd8a70299df42 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/identifier-naming-hungarian-notation-lower-case-prefix.cpp +++ 
b/clang-tools-extra/test/clang-tidy/checkers/readability/identifier-naming-hungarian-notation-lower-case-prefix.cpp @@ -11,164 +11,164 @@ class C_MyClass1 { public: static int ClassMemberCase; // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: invalid case style for class member 'ClassMemberCase' [readability-identifier-naming] - // CHECK-FIXES: {{^}} static int i_ClassMemberCase; + // CHECK-FIXES: static int i_ClassMemberCase; char const ConstantMemberCase = 0; // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: invalid case style for constant member 'ConstantMemberCase' [readability-identifier-naming] - // CHECK-FIXES: {{^}} char const c_ConstantMemberCase = 0; + // CHECK-FIXES: char const c_ConstantMemberCase = 0; void MyFunc1(const int ConstantParameterCase); // CHECK-MESSAGES: :[[@LINE-1]]:26: warning: invalid case style for constant parameter 'ConstantParameterCase' [readability-identifier-naming] - // CHECK-FIXES: {{^}} void MyFunc1(const int i_ConstantParameterCase); + // CHECK-FIXES: void MyFunc1(const int i_ConstantParameterCase); void MyFunc2(const int* ConstantPointerParameterCase); // CHECK-MESSAGES: :[[@LINE-1]]:27: warning: invalid case style for pointer parameter 'ConstantPointerParameterCase' [readability-identifier-naming] - // CHECK-FIXES: {{^}} void MyFunc2(const int* pi_ConstantPointerParameterCase); + // CHECK-FIXES: void MyFunc2(const int* pi_ConstantPointerParameterCase); static constexpr int ConstexprVariableCase = 123; // CHECK-MESSAGES: :[[@LINE-1]]:24: warning: invalid case style for constexpr variable 'ConstexprVariableCase' [readability-identifier-naming] - // CHECK-FIXES: {{^}} static constexpr int i_ConstexprVariableCase = 123; + // CHECK-FIXES: static constexpr int i_ConstexprVariableCase = 123; }; const int GlobalConstantCase = 0; // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: invalid case style for global constant 'GlobalConstantCase' [readability-identifier-naming] -// CHECK-FIXES: {{^}}const int i_GlobalConstantCase = 0; +// CHECK-FIXES: const 
int i_GlobalConstantCase = 0; const int* GlobalConstantPointerCase = nullptr; // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: invalid case style for global pointer 'GlobalConstantPointerCase' [readability-identifier-naming] -// CHECK-FIXES: {{^}}const int* pi_GlobalConstantPointerCase = nullptr; +// CHECK-FIXES: const int* pi_GlobalConstantPointerCase = nullptr; int* GlobalPointerCase = nullptr; // CHECK-MESSAGES: :[[@LINE-1]]:6: warning: invalid case style for global pointer 'GlobalPointerCase' [readability-identifier-naming] -// CHECK-FIXES: {{^}}int* pi_GlobalPointerCase = nullptr; +// CHECK-FIXES: int* pi_GlobalPointerCase = nullptr; int GlobalVariableCase = 0; // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: invalid case style for global variable 'GlobalVariableCase' [readability-identifier-naming] -// CHECK-FIXES: {{^}}int i_GlobalVariableCase = 0; +// CHECK-FIXES: int i_GlobalVariableCase = 0; void Func1(){ int const LocalConstantCase = 3; // CHECK-MESSAGES: :[[@LINE-1]]:13: warning: invalid case style for local constant 'LocalConstantCase' [readability-identifier-naming] - // CHECK-FIXES: {{^}} int const i_LocalConstantCase = 3; + // CHECK-FIXES: int const i_LocalConstantCase = 3; unsigned const ConstantCase = 1; // CHECK-MESSAGES: :[[@LINE-1]]:18: warning: invalid case style for local constant 'ConstantCase' [readability-identifier-naming] - // CHECK-FIXES: {{^}} unsigned const u_ConstantCase = 1; + // CHECK-FIXES: unsigned const u_ConstantCase = 1; int* const LocalConstantPointerCase = nullptr; // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: invalid case style for local constant pointer 'LocalConstantPointerCase' [readability-identifier-naming] - // CHECK-FIXES: {{^}} int* const pi_LocalConstantPointerCase = nullptr; + // CHECK-FIXES: int* const pi_LocalConstantPointerCase = nullptr; int *LocalPointerCase = nullptr; // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: invalid case style for local pointer 'LocalPointerCase' [readability-identifier-naming] - // CHECK-FIXES: 
{{^}} int *pi_LocalPointerCase = nullptr; + // CHECK-FIXES: int *pi_LocalPointerCase = nullptr; int LocalVariableCase = 0; // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: invalid case style for local variable 'LocalVariableCase' [readability-identifier-naming] - // CHECK-FIXES: {{^}} int i_LocalVariableCase = 0; + // CHECK-FIXES: int i_LocalVariableCase = 0; } class C_MyClass2 { char MemberCase; // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: invalid case style for private member 'MemberCase' [readability-identifier-naming] - // CHECK-FIXES: {{^}} char c_MemberCase; + // CHECK-FIXES: char c_MemberCase; void Func1(int ParameterCase); // CHECK-MESSAGES: :[[@LINE-1]]:18: warning: invalid case style for parameter 'ParameterCase' [readability-identifier-naming] - // CHECK-FIXES: {{^}} void Func1(int i_ParameterCase); + // CHECK-FIXES: void Func1(int i_ParameterCase); void Func2(const int ParameterCase); // CHECK-MESSAGES: :[[@LINE-1]]:24: warning: invalid case style for constant parameter 'ParameterCase' [readability-identifier-naming] - // CHECK-FIXES: {{^}} void Func2(const int i_ParameterCase); + // CHECK-FIXES: void Func2(const int i_ParameterCase); void Func3(const int *PointerParameterCase); // CHECK-MESSAGES: :[[@LINE-1]]:25: warning: invalid case style for pointer parameter 'PointerParameterCase' [readability-identifier-naming] - // CHECK-FIXES: {{^}} void Func3(const int *pi_PointerParameterCase); + // CHECK-FIXES: void Func3(const int *pi_PointerParameterCase); }; class C_MyClass3 { private: char PrivateMemberCase; // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: invalid case style for private member 'PrivateMemberCase' [readability-identifier-naming] - // CHECK-FIXES: {{^}} char c_PrivateMemberCase; + // CHECK-FIXES: char c_PrivateMemberCase; protected: char ProtectedMemberCase; // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: invalid case style for protected member 'ProtectedMemberCase' [readability-identifier-naming] - // CHECK-FIXES: {{^}} char c_ProtectedMemberCase; + // 
CHECK-FIXES: char c_ProtectedMemberCase; public: char PublicMemberCase; // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: invalid case style for public member 'PublicMemberCase' [readability-identifier-naming] - // CHECK-FIXES: {{^}} char c_PublicMemberCase; + // CHECK-FIXES: char c_PublicMemberCase; }; static const int StaticConstantCase = 3; // CHECK-MESSAGES: :[[@LINE-1]]:18: warning: invalid case style for global constant 'StaticConstantCase' [readability-identifier-naming] -// CHECK-FIXES: {{^}}static const int i_StaticConstantCase = 3; +// CHECK-FIXES: static const int i_StaticConstantCase = 3; static int StaticVariableCase = 3; // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: invalid case style for global variable 'StaticVariableCase' [readability-identifier-naming] -// CHECK-FIXES: {{^}}static int i_StaticVariableCase = 3; +// CHECK-FIXES: static int i_StaticVariableCase = 3; struct MyStruct { int StructCase; }; // CHECK-MESSAGES: :[[@LINE-1]]:23: warning: invalid case style for public member 'StructCase' [readability-identifier-naming] -// CHECK-FIXES: {{^}}struct MyStruct { int i_StructCase; }; +// CHECK-FIXES: struct MyStruct { int i_StructCase; }; struct shouldBeCamelCaseStruct { int i_Field; }; // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: invalid case style for struct 'shouldBeCamelCaseStruct' [readability-identifier-naming] -// CHECK-FIXES: {{^}}struct ShouldBeCamelCaseStruct { int i_Field; }; +// CHECK-FIXES: struct ShouldBeCamelCaseStruct { int i_Field; }; union MyUnion { int UnionCase; long l_UnionCase; }; // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: invalid case style for union 'MyUnion' [readability-identifier-naming] // CHECK-MESSAGES: :[[@LINE-2]]:21: warning: invalid case style for public member 'UnionCase' [readability-identifier-naming] -// CHECK-FIXES: {{^}}union myUnion { int i_UnionCase; long l_UnionCase; }; +// CHECK-FIXES: union myUnion { int i_UnionCase; long l_UnionCase; }; 
//===----------------------------------------------------------------------===// // C string //===----------------------------------------------------------------------===// const char *NamePtr = "Name"; // CHECK-MESSAGES: :[[@LINE-1]]:13: warning: invalid case style for global pointer 'NamePtr' [readability-identifier-naming] -// CHECK-FIXES: {{^}}const char *sz_NamePtr = "Name"; +// CHECK-FIXES: const char *sz_NamePtr = "Name"; const char NameArray[] = "Name"; // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: invalid case style for global constant 'NameArray' [readability-identifier-naming] -// CHECK-FIXES: {{^}}const char sz_NameArray[] = "Name"; +// CHECK-FIXES: const char sz_NameArray[] = "Name"; const char *NamePtrArray[] = {"AA", "BB"}; // CHECK-MESSAGES: :[[@LINE-1]]:13: warning: invalid case style for global variable 'NamePtrArray' [readability-identifier-naming] -// CHECK-FIXES: {{^}}const char *psz_NamePtrArray[] = {"AA", "BB"}; +// CHECK-FIXES: const char *psz_NamePtrArray[] = {"AA", "BB"}; const wchar_t *WideNamePtr = L"Name"; // CHECK-MESSAGES: :[[@LINE-1]]:16: warning: invalid case style for global pointer 'WideNamePtr' [readability-identifier-naming] -// CHECK-FIXES: {{^}}const wchar_t *wsz_WideNamePtr = L"Name"; +// CHECK-FIXES: const wchar_t *wsz_WideNamePtr = L"Name"; const wchar_t WideNameArray[] = L"Name"; // CHECK-MESSAGES: :[[@LINE-1]]:15: warning: invalid case style for global constant 'WideNameArray' [readability-identifier-naming] -// CHECK-FIXES: {{^}}const wchar_t wsz_WideNameArray[] = L"Name"; +// CHECK-FIXES: const wchar_t wsz_WideNameArray[] = L"Name"; const wchar_t *WideNamePtrArray[] = {L"AA", L"BB"}; // CHECK-MESSAGES: :[[@LINE-1]]:16: warning: invalid case style for global variable 'WideNamePtrArray' [readability-identifier-naming] -// CHECK-FIXES: {{^}}const wchar_t *pwsz_WideNamePtrArray[] = {L"AA", L"BB"}; +// CHECK-FIXES: const wchar_t *pwsz_WideNamePtrArray[] = {L"AA", L"BB"}; class C_MyClass4 { private: char *Name = "Text"; // 
CHECK-MESSAGES: :[[@LINE-1]]:9: warning: invalid case style for private member 'Name' [readability-identifier-naming] - // CHECK-FIXES: {{^}} char *sz_Name = "Text"; + // CHECK-FIXES: char *sz_Name = "Text"; const char *ConstName = "Text"; // CHECK-MESSAGES: :[[@LINE-1]]:15: warning: invalid case style for private member 'ConstName' [readability-identifier-naming] - // CHECK-FIXES: {{^}} const char *sz_ConstName = "Text"; + // CHECK-FIXES: const char *sz_ConstName = "Text"; public: const char* DuplicateString(const char* Input, size_t n_RequiredSize); // CHECK-MESSAGES: :[[@LINE-1]]:43: warning: invalid case style for pointer parameter 'Input' [readability-identifier-naming] - // CHECK-FIXES: {{^}} const char* DuplicateString(const char* sz_Input, size_t n_RequiredSize); + // CHECK-FIXES: const char* DuplicateString(const char* sz_Input, size_t n_RequiredSize); size_t UpdateText(const char* Buffer, size_t n_BufferSize); // CHECK-MESSAGES: :[[@LINE-1]]:33: warning: invalid case style for pointer parameter 'Buffer' [readability-identifier-naming] - // CHECK-FIXES: {{^}} size_t UpdateText(const char* sz_Buffer, size_t n_BufferSize); + // CHECK-FIXES: size_t UpdateText(const char* sz_Buffer, size_t n_BufferSize); }; @@ -177,123 +177,123 @@ class C_MyClass4 { //===----------------------------------------------------------------------===// DWORD MsDword = 0; // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: invalid case style for global variable 'MsDword' [readability-identifier-naming] -// CHECK-FIXES: {{^}}DWORD dw_MsDword = 0; +// CHECK-FIXES: DWORD dw_MsDword = 0; BYTE MsByte = 0; // CHECK-MESSAGES: :[[@LINE-1]]:6: warning: invalid case style for global variable 'MsByte' [readability-identifier-naming] -// CHECK-FIXES: {{^}}BYTE by_MsByte = 0; +// CHECK-FIXES: BYTE by_MsByte = 0; WORD MsWord = 0; // CHECK-MESSAGES: :[[@LINE-1]]:6: warning: invalid case style for global variable 'MsWord' [readability-identifier-naming] -// CHECK-FIXES: {{^}}WORD w_MsWord = 0; +// 
CHECK-FIXES: WORD w_MsWord = 0; BOOL MsBool = 0; // CHECK-MESSAGES: :[[@LINE-1]]:6: warning: invalid case style for global variable 'MsBool' [readability-identifier-naming] -// CHECK-FIXES: {{^}}BOOL b_MsBool = 0; +// CHECK-FIXES: BOOL b_MsBool = 0; BOOLEAN MsBoolean = 0; // CHECK-MESSAGES: :[[@LINE-1]]:9: warning: invalid case style for global variable 'MsBoolean' [readability-identifier-naming] -// CHECK-FIXES: {{^}}BOOLEAN b_MsBoolean = 0; +// CHECK-FIXES: BOOLEAN b_MsBoolean = 0; CHAR MsValueChar = 0; // CHECK-MESSAGES: :[[@LINE-1]]:6: warning: invalid case style for global variable 'MsValueChar' [readability-identifier-naming] -// CHECK-FIXES: {{^}}CHAR c_MsValueChar = 0; +// CHECK-FIXES: CHAR c_MsValueChar = 0; UCHAR MsValueUchar = 0; // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: invalid case style for global variable 'MsValueUchar' [readability-identifier-naming] -// CHECK-FIXES: {{^}}UCHAR uc_MsValueUchar = 0; +// CHECK-FIXES: UCHAR uc_MsValueUchar = 0; SHORT MsValueShort = 0; // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: invalid case style for global variable 'MsValueShort' [readability-identifier-naming] -// CHECK-FIXES: {{^}}SHORT s_MsValueShort = 0; +// CHECK-FIXES: SHORT s_MsValueShort = 0; USHORT MsValueUshort = 0; // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: invalid case style for global variable 'MsValueUshort' [readability-identifier-naming] -// CHECK-FIXES: {{^}}USHORT us_MsValueUshort = 0; +// CHECK-FIXES: USHORT us_MsValueUshort = 0; WORD MsValueWord = 0; // CHECK-MESSAGES: :[[@LINE-1]]:6: warning: invalid case style for global variable 'MsValueWord' [readability-identifier-naming] -// CHECK-FIXES: {{^}}WORD w_MsValueWord = 0; +// CHECK-FIXES: WORD w_MsValueWord = 0; DWORD MsValueDword = 0; // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: invalid case style for global variable 'MsValueDword' [readability-identifier-naming] -// CHECK-FIXES: {{^}}DWORD dw_MsValueDword = 0; +// CHECK-FIXES: DWORD dw_MsValueDword = 0; DWORD32 MsValueDword32 = 0; // 
CHECK-MESSAGES: :[[@LINE-1]]:9: warning: invalid case style for global variable 'MsValueDword32' [readability-identifier-naming] -// CHECK-FIXES: {{^}}DWORD32 dw32_MsValueDword32 = 0; +// CHECK-FIXES: DWORD32 dw32_MsValueDword32 = 0; DWORD64 MsValueDword64 = 0; // CHECK-MESSAGES: :[[@LINE-1]]:9: warning: invalid case style for global variable 'MsValueDword64' [readability-identifier-naming] -// CHECK-FIXES: {{^}}DWORD64 dw64_MsValueDword64 = 0; +// CHECK-FIXES: DWORD64 dw64_MsValueDword64 = 0; LONG MsValueLong = 0; // CHECK-MESSAGES: :[[@LINE-1]]:6: warning: invalid case style for global variable 'MsValueLong' [readability-identifier-naming] -// CHECK-FIXES: {{^}}LONG l_MsValueLong = 0; +// CHECK-FIXES: LONG l_MsValueLong = 0; ULONG MsValueUlong = 0; // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: invalid case style for global variable 'MsValueUlong' [readability-identifier-naming] -// CHECK-FIXES: {{^}}ULONG ul_MsValueUlong = 0; +// CHECK-FIXES: ULONG ul_MsValueUlong = 0; ULONG32 MsValueUlong32 = 0; // CHECK-MESSAGES: :[[@LINE-1]]:9: warning: invalid case style for global variable 'MsValueUlong32' [readability-identifier-naming] -// CHECK-FIXES: {{^}}ULONG32 ul32_MsValueUlong32 = 0; +// CHECK-FIXES: ULONG32 ul32_MsValueUlong32 = 0; ULONG64 MsValueUlong64 = 0; // CHECK-MESSAGES: :[[@LINE-1]]:9: warning: invalid case style for global variable 'MsValueUlong64' [readability-identifier-naming] -// CHECK-FIXES: {{^}}ULONG64 ul64_MsValueUlong64 = 0; +// CHECK-FIXES: ULONG64 ul64_MsValueUlong64 = 0; ULONGLONG MsValueUlongLong = 0; // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: invalid case style for global variable 'MsValueUlongLong' [readability-identifier-naming] -// CHECK-FIXES: {{^}}ULONGLONG ull_MsValueUlongLong = 0; +// CHECK-FIXES: ULONGLONG ull_MsValueUlongLong = 0; HANDLE MsValueHandle = 0; // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: invalid case style for global pointer 'MsValueHandle' [readability-identifier-naming] -// CHECK-FIXES: {{^}}HANDLE h_MsValueHandle = 0; 
+// CHECK-FIXES: HANDLE h_MsValueHandle = 0; INT MsValueInt = 0; // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: invalid case style for global variable 'MsValueInt' [readability-identifier-naming] -// CHECK-FIXES: {{^}}INT i_MsValueInt = 0; +// CHECK-FIXES: INT i_MsValueInt = 0; INT8 MsValueInt8 = 0; // CHECK-MESSAGES: :[[@LINE-1]]:6: warning: invalid case style for global variable 'MsValueInt8' [readability-identifier-naming] -// CHECK-FIXES: {{^}}INT8 i8_MsValueInt8 = 0; +// CHECK-FIXES: INT8 i8_MsValueInt8 = 0; INT16 MsValueInt16 = 0; // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: invalid case style for global variable 'MsValueInt16' [readability-identifier-naming] -// CHECK-FIXES: {{^}}INT16 i16_MsValueInt16 = 0; +// CHECK-FIXES: INT16 i16_MsValueInt16 = 0; INT32 MsValueInt32 = 0; // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: invalid case style for global variable 'MsValueInt32' [readability-identifier-naming] -// CHECK-FIXES: {{^}}INT32 i32_MsValueInt32 = 0; +// CHECK-FIXES: INT32 i32_MsValueInt32 = 0; INT64 MsValueINt64 = 0; // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: invalid case style for global variable 'MsValueINt64' [readability-identifier-naming] -// CHECK-FIXES: {{^}}INT64 i64_MsValueINt64 = 0; +// CHECK-FIXES: INT64 i64_MsValueINt64 = 0; UINT MsValueUint = 0; // CHECK-MESSAGES: :[[@LINE-1]]:6: warning: invalid case style for global variable 'MsValueUint' [readability-identifier-naming] -// CHECK-FIXES: {{^}}UINT ui_MsValueUint = 0; +// CHECK-FIXES: UINT ui_MsValueUint = 0; UINT8 MsValueUint8 = 0; // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: invalid case style for global variable 'MsValueUint8' [readability-identifier-naming] -// CHECK-FIXES: {{^}}UINT8 u8_MsValueUint8 = 0; +// CHECK-FIXES: UINT8 u8_MsValueUint8 = 0; UINT16 MsValueUint16 = 0; // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: invalid case style for global variable 'MsValueUint16' [readability-identifier-naming] -// CHECK-FIXES: {{^}}UINT16 u16_MsValueUint16 = 0; +// CHECK-FIXES: UINT16 
u16_MsValueUint16 = 0; UINT32 MsValueUint32 = 0; // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: invalid case style for global variable 'MsValueUint32' [readability-identifier-naming] -// CHECK-FIXES: {{^}}UINT32 u32_MsValueUint32 = 0; +// CHECK-FIXES: UINT32 u32_MsValueUint32 = 0; UINT64 MsValueUint64 = 0; // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: invalid case style for global variable 'MsValueUint64' [readability-identifier-naming] -// CHECK-FIXES: {{^}}UINT64 u64_MsValueUint64 = 0; +// CHECK-FIXES: UINT64 u64_MsValueUint64 = 0; PVOID MsValuePvoid = NULL; // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: invalid case style for global pointer 'MsValuePvoid' [readability-identifier-naming] -// CHECK-FIXES: {{^}}PVOID p_MsValuePvoid = NULL; +// CHECK-FIXES: PVOID p_MsValuePvoid = NULL; //===----------------------------------------------------------------------===// @@ -301,19 +301,19 @@ PVOID MsValuePvoid = NULL; //===----------------------------------------------------------------------===// unsigned GlobalUnsignedArray[] = {1, 2, 3}; // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: invalid case style for global variable 'GlobalUnsignedArray' [readability-identifier-naming] -// CHECK-FIXES: {{^}}unsigned a_GlobalUnsignedArray[] = {1, 2, 3}; +// CHECK-FIXES: unsigned a_GlobalUnsignedArray[] = {1, 2, 3}; int GlobalIntArray[] = {1, 2, 3}; // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: invalid case style for global variable 'GlobalIntArray' [readability-identifier-naming] -// CHECK-FIXES: {{^}}int a_GlobalIntArray[] = {1, 2, 3}; +// CHECK-FIXES: int a_GlobalIntArray[] = {1, 2, 3}; int DataInt[1] = {0}; // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: invalid case style for global variable 'DataInt' [readability-identifier-naming] -// CHECK-FIXES: {{^}}int a_DataInt[1] = {0}; +// CHECK-FIXES: int a_DataInt[1] = {0}; int DataArray[2] = {0}; // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: invalid case style for global variable 'DataArray' [readability-identifier-naming] -// CHECK-FIXES: {{^}}int 
a_DataArray[2] = {0}; +// CHECK-FIXES: int a_DataArray[2] = {0}; //===----------------------------------------------------------------------===// @@ -321,56 +321,56 @@ int DataArray[2] = {0}; //===----------------------------------------------------------------------===// int *DataIntPtr[1] = {0}; // CHECK-MESSAGES: :[[@LINE-1]]:6: warning: invalid case style for global variable 'DataIntPtr' [readability-identifier-naming] -// CHECK-FIXES: {{^}}int *pa_DataIntPtr[1] = {0}; +// CHECK-FIXES: int *pa_DataIntPtr[1] = {0}; void *BufferPtr1; // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: invalid case style for global pointer 'BufferPtr1' [readability-identifier-naming] -// CHECK-FIXES: {{^}}void *p_BufferPtr1; +// CHECK-FIXES: void *p_BufferPtr1; void **BufferPtr2; // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: invalid case style for global pointer 'BufferPtr2' [readability-identifier-naming] -// CHECK-FIXES: {{^}}void **pp_BufferPtr2; +// CHECK-FIXES: void **pp_BufferPtr2; void **pBufferPtr3; // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: invalid case style for global pointer 'pBufferPtr3' [readability-identifier-naming] -// CHECK-FIXES: {{^}}void **pp_BufferPtr3; +// CHECK-FIXES: void **pp_BufferPtr3; int *pBufferPtr4; // CHECK-MESSAGES: :[[@LINE-1]]:6: warning: invalid case style for global pointer 'pBufferPtr4' [readability-identifier-naming] -// CHECK-FIXES: {{^}}int *pi_BufferPtr4; +// CHECK-FIXES: int *pi_BufferPtr4; typedef void (*FUNC_PTR_HELLO)(); FUNC_PTR_HELLO Hello = NULL; // CHECK-MESSAGES: :[[@LINE-1]]:16: warning: invalid case style for global pointer 'Hello' [readability-identifier-naming] -// CHECK-FIXES: {{^}}FUNC_PTR_HELLO fn_Hello = NULL; +// CHECK-FIXES: FUNC_PTR_HELLO fn_Hello = NULL; void *ValueVoidPtr = NULL; // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: invalid case style for global pointer 'ValueVoidPtr' [readability-identifier-naming] -// CHECK-FIXES: {{^}}void *p_ValueVoidPtr = NULL; +// CHECK-FIXES: void *p_ValueVoidPtr = NULL; ptrdiff_t PtrDiff = 
NULL; // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: invalid case style for global variable 'PtrDiff' [readability-identifier-naming] -// CHECK-FIXES: {{^}}ptrdiff_t p_PtrDiff = NULL; +// CHECK-FIXES: ptrdiff_t p_PtrDiff = NULL; int8_t *ValueI8Ptr; // CHECK-MESSAGES: :[[@LINE-1]]:9: warning: invalid case style for global pointer 'ValueI8Ptr' [readability-identifier-naming] -// CHECK-FIXES: {{^}}int8_t *pi8_ValueI8Ptr; +// CHECK-FIXES: int8_t *pi8_ValueI8Ptr; uint8_t *ValueU8Ptr; // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: invalid case style for global pointer 'ValueU8Ptr' [readability-identifier-naming] -// CHECK-FIXES: {{^}}uint8_t *pu8_ValueU8Ptr; +// CHECK-FIXES: uint8_t *pu8_ValueU8Ptr; unsigned char *ValueUcPtr; // CHECK-MESSAGES: :[[@LINE-1]]:16: warning: invalid case style for global pointer 'ValueUcPtr' [readability-identifier-naming] -// CHECK-FIXES: {{^}}unsigned char *puc_ValueUcPtr; +// CHECK-FIXES: unsigned char *puc_ValueUcPtr; unsigned char **ValueUcPtr2; // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: invalid case style for global pointer 'ValueUcPtr2' [readability-identifier-naming] -// CHECK-FIXES: {{^}}unsigned char **ppuc_ValueUcPtr2; +// CHECK-FIXES: unsigned char **ppuc_ValueUcPtr2; void MyFunc2(void* Val){} // CHECK-MESSAGES: :[[@LINE-1]]:20: warning: invalid case style for pointer parameter 'Val' [readability-identifier-naming] -// CHECK-FIXES: {{^}}void MyFunc2(void* p_Val){} +// CHECK-FIXES: void MyFunc2(void* p_Val){} //===----------------------------------------------------------------------===// @@ -379,16 +379,16 @@ void MyFunc2(void* Val){} int i_ValueIndex = 1; int &RefValueIndex = i_ValueIndex; // CHECK-MESSAGES: :[[@LINE-1]]:6: warning: invalid case style for global variable 'RefValueIndex' [readability-identifier-naming] -// CHECK-FIXES: {{^}}int &i_RefValueIndex = i_ValueIndex; +// CHECK-FIXES: int &i_RefValueIndex = i_ValueIndex; const int &ConstRefValue = i_ValueIndex; // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: invalid case style 
for global variable 'ConstRefValue' [readability-identifier-naming] -// CHECK-FIXES: {{^}}const int &i_ConstRefValue = i_ValueIndex; +// CHECK-FIXES: const int &i_ConstRefValue = i_ValueIndex; long long ll_ValueLongLong = 2; long long &RefValueLongLong = ll_ValueLongLong; // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: invalid case style for global variable 'RefValueLongLong' [readability-identifier-naming] -// CHECK-FIXES: {{^}}long long &ll_RefValueLongLong = ll_ValueLongLong; +// CHECK-FIXES: long long &ll_RefValueLongLong = ll_ValueLongLong; //===----------------------------------------------------------------------===// @@ -396,151 +396,151 @@ long long &RefValueLongLong = ll_ValueLongLong; //===----------------------------------------------------------------------===// int8_t ValueI8; // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: invalid case style for global variable 'ValueI8' [readability-identifier-naming] -// CHECK-FIXES: {{^}}int8_t i8_ValueI8; +// CHECK-FIXES: int8_t i8_ValueI8; int16_t ValueI16 = 0; // CHECK-MESSAGES: :[[@LINE-1]]:9: warning: invalid case style for global variable 'ValueI16' [readability-identifier-naming] -// CHECK-FIXES: {{^}}int16_t i16_ValueI16 = 0; +// CHECK-FIXES: int16_t i16_ValueI16 = 0; int32_t ValueI32 = 0; // CHECK-MESSAGES: :[[@LINE-1]]:9: warning: invalid case style for global variable 'ValueI32' [readability-identifier-naming] -// CHECK-FIXES: {{^}}int32_t i32_ValueI32 = 0; +// CHECK-FIXES: int32_t i32_ValueI32 = 0; int64_t ValueI64 = 0; // CHECK-MESSAGES: :[[@LINE-1]]:9: warning: invalid case style for global variable 'ValueI64' [readability-identifier-naming] -// CHECK-FIXES: {{^}}int64_t i64_ValueI64 = 0; +// CHECK-FIXES: int64_t i64_ValueI64 = 0; uint8_t ValueU8 = 0; // CHECK-MESSAGES: :[[@LINE-1]]:9: warning: invalid case style for global variable 'ValueU8' [readability-identifier-naming] -// CHECK-FIXES: {{^}}uint8_t u8_ValueU8 = 0; +// CHECK-FIXES: uint8_t u8_ValueU8 = 0; uint16_t ValueU16 = 0; // CHECK-MESSAGES: 
:[[@LINE-1]]:10: warning: invalid case style for global variable 'ValueU16' [readability-identifier-naming] -// CHECK-FIXES: {{^}}uint16_t u16_ValueU16 = 0; +// CHECK-FIXES: uint16_t u16_ValueU16 = 0; uint32_t ValueU32 = 0; // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: invalid case style for global variable 'ValueU32' [readability-identifier-naming] -// CHECK-FIXES: {{^}}uint32_t u32_ValueU32 = 0; +// CHECK-FIXES: uint32_t u32_ValueU32 = 0; uint64_t ValueU64 = 0; // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: invalid case style for global variable 'ValueU64' [readability-identifier-naming] -// CHECK-FIXES: {{^}}uint64_t u64_ValueU64 = 0; +// CHECK-FIXES: uint64_t u64_ValueU64 = 0; float ValueFloat = 0; // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: invalid case style for global variable 'ValueFloat' [readability-identifier-naming] -// CHECK-FIXES: {{^}}float f_ValueFloat = 0; +// CHECK-FIXES: float f_ValueFloat = 0; double ValueDouble = 0; // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: invalid case style for global variable 'ValueDouble' [readability-identifier-naming] -// CHECK-FIXES: {{^}}double d_ValueDouble = 0; +// CHECK-FIXES: double d_ValueDouble = 0; char ValueChar = 'c'; // CHECK-MESSAGES: :[[@LINE-1]]:6: warning: invalid case style for global variable 'ValueChar' [readability-identifier-naming] -// CHECK-FIXES: {{^}}char c_ValueChar = 'c'; +// CHECK-FIXES: char c_ValueChar = 'c'; bool ValueBool = true; // CHECK-MESSAGES: :[[@LINE-1]]:6: warning: invalid case style for global variable 'ValueBool' [readability-identifier-naming] -// CHECK-FIXES: {{^}}bool b_ValueBool = true; +// CHECK-FIXES: bool b_ValueBool = true; int ValueInt = 0; // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: invalid case style for global variable 'ValueInt' [readability-identifier-naming] -// CHECK-FIXES: {{^}}int i_ValueInt = 0; +// CHECK-FIXES: int i_ValueInt = 0; size_t ValueSize = 0; // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: invalid case style for global variable 'ValueSize' 
[readability-identifier-naming] -// CHECK-FIXES: {{^}}size_t n_ValueSize = 0; +// CHECK-FIXES: size_t n_ValueSize = 0; wchar_t ValueWchar = 'w'; // CHECK-MESSAGES: :[[@LINE-1]]:9: warning: invalid case style for global variable 'ValueWchar' [readability-identifier-naming] -// CHECK-FIXES: {{^}}wchar_t wc_ValueWchar = 'w'; +// CHECK-FIXES: wchar_t wc_ValueWchar = 'w'; short ValueShort = 0; // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: invalid case style for global variable 'ValueShort' [readability-identifier-naming] -// CHECK-FIXES: {{^}}short s_ValueShort = 0; +// CHECK-FIXES: short s_ValueShort = 0; unsigned ValueUnsigned = 0; // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: invalid case style for global variable 'ValueUnsigned' [readability-identifier-naming] -// CHECK-FIXES: {{^}}unsigned u_ValueUnsigned = 0; +// CHECK-FIXES: unsigned u_ValueUnsigned = 0; signed ValueSigned = 0; // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: invalid case style for global variable 'ValueSigned' [readability-identifier-naming] -// CHECK-FIXES: {{^}}signed s_ValueSigned = 0; +// CHECK-FIXES: signed s_ValueSigned = 0; long ValueLong = 0; // CHECK-MESSAGES: :[[@LINE-1]]:6: warning: invalid case style for global variable 'ValueLong' [readability-identifier-naming] -// CHECK-FIXES: {{^}}long l_ValueLong = 0; +// CHECK-FIXES: long l_ValueLong = 0; long long ValueLongLong = 0; // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: invalid case style for global variable 'ValueLongLong' [readability-identifier-naming] -// CHECK-FIXES: {{^}}long long ll_ValueLongLong = 0; +// CHECK-FIXES: long long ll_ValueLongLong = 0; long long int ValueLongLongInt = 0; // CHECK-MESSAGES: :[[@LINE-1]]:15: warning: invalid case style for global variable 'ValueLongLongInt' [readability-identifier-naming] -// CHECK-FIXES: {{^}}long long int lli_ValueLongLongInt = 0; +// CHECK-FIXES: long long int lli_ValueLongLongInt = 0; long double ValueLongDouble = 0; // CHECK-MESSAGES: :[[@LINE-1]]:13: warning: invalid case style for 
global variable 'ValueLongDouble' [readability-identifier-naming] -// CHECK-FIXES: {{^}}long double ld_ValueLongDouble = 0; +// CHECK-FIXES: long double ld_ValueLongDouble = 0; signed int ValueSignedInt = 0; // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: invalid case style for global variable 'ValueSignedInt' [readability-identifier-naming] -// CHECK-FIXES: {{^}}signed int si_ValueSignedInt = 0; +// CHECK-FIXES: signed int si_ValueSignedInt = 0; signed short ValueSignedShort = 0; // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: invalid case style for global variable 'ValueSignedShort' [readability-identifier-naming] -// CHECK-FIXES: {{^}}signed short ss_ValueSignedShort = 0; +// CHECK-FIXES: signed short ss_ValueSignedShort = 0; signed short int ValueSignedShortInt = 0; // CHECK-MESSAGES: :[[@LINE-1]]:18: warning: invalid case style for global variable 'ValueSignedShortInt' [readability-identifier-naming] -// CHECK-FIXES: {{^}}signed short int ssi_ValueSignedShortInt = 0; +// CHECK-FIXES: signed short int ssi_ValueSignedShortInt = 0; signed long long ValueSignedLongLong = 0; // CHECK-MESSAGES: :[[@LINE-1]]:18: warning: invalid case style for global variable 'ValueSignedLongLong' [readability-identifier-naming] -// CHECK-FIXES: {{^}}signed long long sll_ValueSignedLongLong = 0; +// CHECK-FIXES: signed long long sll_ValueSignedLongLong = 0; signed long int ValueSignedLongInt = 0; // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: invalid case style for global variable 'ValueSignedLongInt' [readability-identifier-naming] -// CHECK-FIXES: {{^}}signed long int sli_ValueSignedLongInt = 0; +// CHECK-FIXES: signed long int sli_ValueSignedLongInt = 0; signed long ValueSignedLong = 0; // CHECK-MESSAGES: :[[@LINE-1]]:13: warning: invalid case style for global variable 'ValueSignedLong' [readability-identifier-naming] -// CHECK-FIXES: {{^}}signed long sl_ValueSignedLong = 0; +// CHECK-FIXES: signed long sl_ValueSignedLong = 0; unsigned long long int ValueUnsignedLongLongInt = 0; // 
CHECK-MESSAGES: :[[@LINE-1]]:24: warning: invalid case style for global variable 'ValueUnsignedLongLongInt' [readability-identifier-naming] -// CHECK-FIXES: {{^}}unsigned long long int ulli_ValueUnsignedLongLongInt = 0; +// CHECK-FIXES: unsigned long long int ulli_ValueUnsignedLongLongInt = 0; unsigned long long ValueUnsignedLongLong = 0; // CHECK-MESSAGES: :[[@LINE-1]]:20: warning: invalid case style for global variable 'ValueUnsignedLongLong' [readability-identifier-naming] -// CHECK-FIXES: {{^}}unsigned long long ull_ValueUnsignedLongLong = 0; +// CHECK-FIXES: unsigned long long ull_ValueUnsignedLongLong = 0; unsigned long int ValueUnsignedLongInt = 0; // CHECK-MESSAGES: :[[@LINE-1]]:19: warning: invalid case style for global variable 'ValueUnsignedLongInt' [readability-identifier-naming] -// CHECK-FIXES: {{^}}unsigned long int uli_ValueUnsignedLongInt = 0; +// CHECK-FIXES: unsigned long int uli_ValueUnsignedLongInt = 0; unsigned long ValueUnsignedLong = 0; // CHECK-MESSAGES: :[[@LINE-1]]:15: warning: invalid case style for global variable 'ValueUnsignedLong' [readability-identifier-naming] -// CHECK-FIXES: {{^}}unsigned long ul_ValueUnsignedLong = 0; +// CHECK-FIXES: unsigned long ul_ValueUnsignedLong = 0; unsigned short int ValueUnsignedShortInt = 0; // CHECK-MESSAGES: :[[@LINE-1]]:20: warning: invalid case style for global variable 'ValueUnsignedShortInt' [readability-identifier-naming] -// CHECK-FIXES: {{^}}unsigned short int usi_ValueUnsignedShortInt = 0; +// CHECK-FIXES: unsigned short int usi_ValueUnsignedShortInt = 0; unsigned short ValueUnsignedShort = 0; // CHECK-MESSAGES: :[[@LINE-1]]:16: warning: invalid case style for global variable 'ValueUnsignedShort' [readability-identifier-naming] -// CHECK-FIXES: {{^}}unsigned short us_ValueUnsignedShort = 0; +// CHECK-FIXES: unsigned short us_ValueUnsignedShort = 0; unsigned int ValueUnsignedInt = 0; // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: invalid case style for global variable 'ValueUnsignedInt' 
[readability-identifier-naming] -// CHECK-FIXES: {{^}}unsigned int ui_ValueUnsignedInt = 0; +// CHECK-FIXES: unsigned int ui_ValueUnsignedInt = 0; unsigned char ValueUnsignedChar = 0; // CHECK-MESSAGES: :[[@LINE-1]]:15: warning: invalid case style for global variable 'ValueUnsignedChar' [readability-identifier-naming] -// CHECK-FIXES: {{^}}unsigned char uc_ValueUnsignedChar = 0; +// CHECK-FIXES: unsigned char uc_ValueUnsignedChar = 0; long int ValueLongInt = 0; // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: invalid case style for global variable 'ValueLongInt' [readability-identifier-naming] -// CHECK-FIXES: {{^}}long int li_ValueLongInt = 0; +// CHECK-FIXES: long int li_ValueLongInt = 0; //===----------------------------------------------------------------------===// @@ -548,25 +548,25 @@ long int ValueLongInt = 0; //===----------------------------------------------------------------------===// volatile int VolatileInt = 0; // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: invalid case style for global variable 'VolatileInt' [readability-identifier-naming] -// CHECK-FIXES: {{^}}volatile int i_VolatileInt = 0; +// CHECK-FIXES: volatile int i_VolatileInt = 0; thread_local int ThreadLocalValueInt = 0; // CHECK-MESSAGES: :[[@LINE-1]]:18: warning: invalid case style for global variable 'ThreadLocalValueInt' [readability-identifier-naming] -// CHECK-FIXES: {{^}}thread_local int i_ThreadLocalValueInt = 0; +// CHECK-FIXES: thread_local int i_ThreadLocalValueInt = 0; extern int ExternValueInt; // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: invalid case style for global variable 'ExternValueInt' [readability-identifier-naming] -// CHECK-FIXES: {{^}}extern int i_ExternValueInt; +// CHECK-FIXES: extern int i_ExternValueInt; struct DataBuffer { mutable size_t Size; }; // CHECK-MESSAGES: :[[@LINE-2]]:20: warning: invalid case style for public member 'Size' [readability-identifier-naming] -// CHECK-FIXES: {{^}} mutable size_t n_Size; +// CHECK-FIXES: mutable size_t n_Size; static 
constexpr int const &ConstExprInt = 42; // CHECK-MESSAGES: :[[@LINE-1]]:29: warning: invalid case style for constexpr variable 'ConstExprInt' [readability-identifier-naming] -// CHECK-FIXES: {{^}}static constexpr int const &i_ConstExprInt = 42; +// CHECK-FIXES: static constexpr int const &i_ConstExprInt = 42; //===----------------------------------------------------------------------===// @@ -575,7 +575,7 @@ static constexpr int const &ConstExprInt = 42; typedef int INDEX; INDEX iIndex = 0; // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: invalid case style for global variable 'iIndex' [readability-identifier-naming] -// CHECK-FIXES: {{^}}INDEX Index = 0; +// CHECK-FIXES: INDEX Index = 0; //===----------------------------------------------------------------------===// @@ -583,96 +583,96 @@ INDEX iIndex = 0; //===----------------------------------------------------------------------===// class ClassCase { int Func(); }; // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: invalid case style for class 'ClassCase' [readability-identifier-naming] -// CHECK-FIXES: {{^}}class C_ClassCase { int Func(); }; +// CHECK-FIXES: class C_ClassCase { int Func(); }; class AbstractClassCase { virtual int Func() = 0; }; // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: invalid case style for abstract class 'AbstractClassCase' [readability-identifier-naming] -// CHECK-FIXES: {{^}}class I_AbstractClassCase { virtual int Func() = 0; }; +// CHECK-FIXES: class I_AbstractClassCase { virtual int Func() = 0; }; class AbstractClassCase1 { virtual int Func1() = 0; int Func2(); }; // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: invalid case style for abstract class 'AbstractClassCase1' [readability-identifier-naming] -// CHECK-FIXES: {{^}}class I_AbstractClassCase1 { virtual int Func1() = 0; int Func2(); }; +// CHECK-FIXES: class I_AbstractClassCase1 { virtual int Func1() = 0; int Func2(); }; class ClassConstantCase { public: static const int i_ConstantCase; }; // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: invalid case 
style for class 'ClassConstantCase' [readability-identifier-naming] -// CHECK-FIXES: {{^}}class C_ClassConstantCase { public: static const int i_ConstantCase; }; +// CHECK-FIXES: class C_ClassConstantCase { public: static const int i_ConstantCase; }; //===----------------------------------------------------------------------===// // Other Cases //===----------------------------------------------------------------------===// int lower_case = 0; // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: invalid case style for global variable 'lower_case' [readability-identifier-naming] -// CHECK-FIXES: {{^}}int i_LowerCase = 0; +// CHECK-FIXES: int i_LowerCase = 0; int lower_case1 = 0; // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: invalid case style for global variable 'lower_case1' [readability-identifier-naming] -// CHECK-FIXES: {{^}}int i_LowerCase1 = 0; +// CHECK-FIXES: int i_LowerCase1 = 0; int lower_case_2 = 0; // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: invalid case style for global variable 'lower_case_2' [readability-identifier-naming] -// CHECK-FIXES: {{^}}int i_LowerCase2 = 0; +// CHECK-FIXES: int i_LowerCase2 = 0; int UPPER_CASE = 0; // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: invalid case style for global variable 'UPPER_CASE' [readability-identifier-naming] -// CHECK-FIXES: {{^}}int i_UpperCase = 0; +// CHECK-FIXES: int i_UpperCase = 0; int UPPER_CASE_1 = 0; // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: invalid case style for global variable 'UPPER_CASE_1' [readability-identifier-naming] -// CHECK-FIXES: {{^}}int i_UpperCase1 = 0; +// CHECK-FIXES: int i_UpperCase1 = 0; int camelBack = 0; // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: invalid case style for global variable 'camelBack' [readability-identifier-naming] -// CHECK-FIXES: {{^}}int i_CamelBack = 0; +// CHECK-FIXES: int i_CamelBack = 0; int camelBack_1 = 0; // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: invalid case style for global variable 'camelBack_1' [readability-identifier-naming] -// CHECK-FIXES: {{^}}int 
i_CamelBack1 = 0; +// CHECK-FIXES: int i_CamelBack1 = 0; int camelBack2 = 0; // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: invalid case style for global variable 'camelBack2' [readability-identifier-naming] -// CHECK-FIXES: {{^}}int i_CamelBack2 = 0; +// CHECK-FIXES: int i_CamelBack2 = 0; int CamelCase = 0; // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: invalid case style for global variable 'CamelCase' [readability-identifier-naming] -// CHECK-FIXES: {{^}}int i_CamelCase = 0; +// CHECK-FIXES: int i_CamelCase = 0; int CamelCase_1 = 0; // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: invalid case style for global variable 'CamelCase_1' [readability-identifier-naming] -// CHECK-FIXES: {{^}}int i_CamelCase1 = 0; +// CHECK-FIXES: int i_CamelCase1 = 0; int CamelCase2 = 0; // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: invalid case style for global variable 'CamelCase2' [readability-identifier-naming] -// CHECK-FIXES: {{^}}int i_CamelCase2 = 0; +// CHECK-FIXES: int i_CamelCase2 = 0; int camel_Snake_Back = 0; // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: invalid case style for global variable 'camel_Snake_Back' [readability-identifier-naming] -// CHECK-FIXES: {{^}}int i_CamelSnakeBack = 0; +// CHECK-FIXES: int i_CamelSnakeBack = 0; int camel_Snake_Back_1 = 0; // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: invalid case style for global variable 'camel_Snake_Back_1' [readability-identifier-naming] -// CHECK-FIXES: {{^}}int i_CamelSnakeBack1 = 0; +// CHECK-FIXES: int i_CamelSnakeBack1 = 0; int Camel_Snake_Case = 0; // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: invalid case style for global variable 'Camel_Snake_Case' [readability-identifier-naming] -// CHECK-FIXES: {{^}}int i_CamelSnakeCase = 0; +// CHECK-FIXES: int i_CamelSnakeCase = 0; int Camel_Snake_Case_1 = 0; // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: invalid case style for global variable 'Camel_Snake_Case_1' [readability-identifier-naming] -// CHECK-FIXES: {{^}}int i_CamelSnakeCase1 = 0; +// CHECK-FIXES: int i_CamelSnakeCase1 = 0; 
//===----------------------------------------------------------------------===// // Enum //===----------------------------------------------------------------------===// enum REV_TYPE { RevValid }; // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: invalid case style for enum constant 'RevValid' [readability-identifier-naming] -// CHECK-FIXES: {{^}}enum REV_TYPE { rt_RevValid }; +// CHECK-FIXES: enum REV_TYPE { rt_RevValid }; enum EnumConstantCase { OneByte, TwoByte }; // CHECK-MESSAGES: :[[@LINE-1]]:25: warning: invalid case style for enum constant 'OneByte' [readability-identifier-naming] // CHECK-MESSAGES: :[[@LINE-2]]:34: warning: invalid case style for enum constant 'TwoByte' [readability-identifier-naming] -// CHECK-FIXES: {{^}}enum EnumConstantCase { ecc_OneByte, ecc_TwoByte }; +// CHECK-FIXES: enum EnumConstantCase { ecc_OneByte, ecc_TwoByte }; enum class ScopedEnumConstantCase { Case1 }; // CHECK-MESSAGES: :[[@LINE-1]]:37: warning: invalid case style for scoped enum constant 'Case1' [readability-identifier-naming] -// CHECK-FIXES: {{^}}enum class ScopedEnumConstantCase { secc_Case1 }; +// CHECK-FIXES: enum class ScopedEnumConstantCase { secc_Case1 }; // clang-format on diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/identifier-naming-outofline.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/identifier-naming-outofline.cpp index f807875e27698..2945047dee4ca 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/identifier-naming-outofline.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/identifier-naming-outofline.cpp @@ -11,11 +11,11 @@ class SomeClass { template int someMethod(); // CHECK-MESSAGES: :[[@LINE-1]]:9: warning: invalid case style for method 'someMethod' [readability-identifier-naming] -// CHECK-FIXES: {{^}} int SomeMethod(); +// CHECK-FIXES: int SomeMethod(); }; template int SomeClass::someMethod() { -// CHECK-FIXES: {{^}}int SomeClass::SomeMethod() { +// CHECK-FIXES: int 
SomeClass::SomeMethod() { return 5; } @@ -24,7 +24,7 @@ int SomeClass::someMethod() { void someFunc() { Inner::SomeClass S; S.someMethod(); -// CHECK-FIXES: {{^}} S.SomeMethod(); +// CHECK-FIXES: S.SomeMethod(); } } // namespace SomeNamespace diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/isolate-declaration-cxx17.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/isolate-declaration-cxx17.cpp index b50ad4ce25839..a811db1879ecb 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/isolate-declaration-cxx17.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/isolate-declaration-cxx17.cpp @@ -63,31 +63,31 @@ void modern() { auto autoInt1 = 3, autoInt2 = 4; // CHECK-MESSAGES: [[@LINE-1]]:3: warning: multiple declarations in a single statement reduces readability // CHECK-FIXES: auto autoInt1 = 3; - // CHECK-FIXES: {{^ }}auto autoInt2 = 4; + // CHECK-FIXES: auto autoInt2 = 4; decltype(int()) declnottouch = 4; decltype(int()) declint1 = 5, declint2 = 3; // CHECK-MESSAGES: [[@LINE-1]]:3: warning: multiple declarations in a single statement reduces readability // CHECK-FIXES: decltype(int()) declint1 = 5; - // CHECK-FIXES: {{^ }}decltype(int()) declint2 = 3; + // CHECK-FIXES: decltype(int()) declint2 = 3; std::vector vectorA = {1, 2}, vectorB = {1, 2, 3}, vectorC({1, 1, 1}); // CHECK-MESSAGES: [[@LINE-1]]:3: warning: multiple declarations in a single statement reduces readability // CHECK-FIXES: std::vector vectorA = {1, 2}; - // CHECK-FIXES: {{^ }}std::vector vectorB = {1, 2, 3}; - // CHECK-FIXES: {{^ }}std::vector vectorC({1, 1, 1}); + // CHECK-FIXES: std::vector vectorB = {1, 2, 3}; + // CHECK-FIXES: std::vector vectorC({1, 1, 1}); using uType = int; uType utype1, utype2; // CHECK-MESSAGES: [[@LINE-1]]:3: warning: multiple declarations in a single statement reduces readability // CHECK-FIXES: uType utype1; - // CHECK-FIXES: {{^ }}uType utype2; + // CHECK-FIXES: uType utype2; Types::MyType mytype1, mytype2, 
mytype3 = 3; // CHECK-MESSAGES: [[@LINE-1]]:3: warning: multiple declarations in a single statement reduces readability // CHECK-FIXES: Types::MyType mytype1; - // CHECK-FIXES: {{^ }}Types::MyType mytype2; - // CHECK-FIXES: {{^ }}Types::MyType mytype3 = 3; + // CHECK-FIXES: Types::MyType mytype2; + // CHECK-FIXES: Types::MyType mytype3 = 3; { using namespace std::string_literals; @@ -95,9 +95,9 @@ void modern() { std::vector s{"foo"s, "bar"s}, t{"foo"s}, u, a({"hey", "you"}), bb = {"h", "a"}; // CHECK-MESSAGES: [[@LINE-1]]:5: warning: multiple declarations in a single statement reduces readability // CHECK-FIXES: std::vector s{"foo"s, "bar"s}; - // CHECK-FIXES: {{^ }}std::vector t{"foo"s}; - // CHECK-FIXES: {{^ }}std::vector u; - // CHECK-FIXES: {{^ }}std::vector a({"hey", "you"}); - // CHECK-FIXES: {{^ }}std::vector bb = {"h", "a"}; + // CHECK-FIXES: std::vector t{"foo"s}; + // CHECK-FIXES: std::vector u; + // CHECK-FIXES: std::vector a({"hey", "you"}); + // CHECK-FIXES: std::vector bb = {"h", "a"}; } } diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/isolate-declaration.c b/clang-tools-extra/test/clang-tidy/checkers/readability/isolate-declaration.c index e1960a15b8d83..1354a29ffedaf 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/isolate-declaration.c +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/isolate-declaration.c @@ -6,7 +6,7 @@ void c_specific(void) { int j = sizeof(struct T { int i; }), k; // CHECK-MESSAGES: [[@LINE-1]]:3: warning: multiple declarations in a single statement reduces readability // CHECK-FIXES: int j = sizeof(struct T { int i; }); - // CHECK-FIXES: {{^ }}int k; + // CHECK-FIXES: int k; void g(struct U { int i; } s); // One decl void h(struct V { int i; } s), m(int i, ...); // Two decls diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-declaration-ignore-macros.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-declaration-ignore-macros.cpp 
index 1a124adc55f6a..1fe7d1d356db3 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-declaration-ignore-macros.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-declaration-ignore-macros.cpp @@ -6,16 +6,16 @@ extern int Xyz; extern int Xyz; // Xyz // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: redundant 'Xyz' declaration [readability-redundant-declaration] -// CHECK-FIXES: {{^}}// Xyz{{$}} +// CHECK-FIXES: // Xyz namespace macros { #define DECLARE(x) extern int x #define DEFINE(x) extern int x; int x = 42 DECLARE(test); DEFINE(test); -// CHECK-FIXES: {{^}}#define DECLARE(x) extern int x{{$}} -// CHECK-FIXES: {{^}}#define DEFINE(x) extern int x; int x = 42{{$}} -// CHECK-FIXES: {{^}}DECLARE(test);{{$}} -// CHECK-FIXES: {{^}}DEFINE(test);{{$}} +// CHECK-FIXES: #define DECLARE(x) extern int x +// CHECK-FIXES: #define DEFINE(x) extern int x; int x = 42 +// CHECK-FIXES: DECLARE(test); +// CHECK-FIXES: DEFINE(test); } // namespace macros From 7a41761407c485d18b7d48232b308556b3b43934 Mon Sep 17 00:00:00 2001 From: Wanyi Date: Tue, 15 Apr 2025 13:46:15 -0400 Subject: [PATCH 015/710] [lldb] Make SBProcess thread related actions listen to StopLocker (#134339) # Summary This PR updates `SBProcess::GetNumThreads()` and `SBProcess::GetThreadAtIndex()` to listen to the stop locker. `SBProcess::GetNumThreads()` will return 0 if the process is running. ## Problem Description Recently upon debugging a program with thousands of threads in VS Code, lldb-dap would hang at a `threads` request sent right after receiving the `configurationDone` response. Soon after it will end the debug session with the following error ``` Process exited with status = -1 (0xffffffff) lost connection ``` This is because LLDB is still in the middle of resuming all the threads. And requesting threads will end up interrupt the process on Linux. From the gdb-remote log it ended up getting `lldb::StateType::eStateInvalid` and just exit with status -1. 
I don't think it's reasonable to allow getting threads from a running process. There are a few approaches to fix this: 1) Send the stopped event to IDE after `configurationDone`. This aligns with the CLI behavior. 2) However, the above approach will break the existing user facing behavior. The alternative will be reject the `threads` request if the process is not stopped. 3) Improve the run lock. This is a synchronize issue where process was in the middle of resuming while lldb-dap attempts to interrupt it. **This PR implements the option 3** ## HOWEVER This fixed the "lost connection" issue below but new issue has surfaced. From testing, and also from checking the [VSCode source code](https://github.com/microsoft/vscode/blob/174af221c9ea2ccdb64abe4aab8e1a805e77beae/src/vs/workbench/contrib/debug/browser/debugSession.ts#L791), it expects having threadID to perform `pause`. So after attaching, without any threads reported to the client, the user will not be able to pause the attached process. `setBreakpoint` will still work and once we make a stop at the bp (or any stop that will report threads, client can perform pause again. ## NEXT 1) Made an attempt to return initial thread list so that VSCode can pause (second commit in the PR) 2) Investigate why threads will trigger unwinding the second frame of a thread, which leads to sending the interrupt 3) Decided if we want to support `stopOnEntry` for attaching, given i. This is not an official specification ii. If enable stopOnEntry, we need to fix attaching on Linux, to send only one stopped event. Currently, all threads upon attaching will have stop reason `SIGSTOP` and lldb-dap will send `stopped` event for each one of them. Every `stopped` will trigger the client request for threads. iii. Alternatively, we can support auto continue correspond to `(lldb) process attach --continue`. This require the ii above. 
### Additionally lldb-dap will not send a `continued` event after `configurationDone` because it checks `dap.focus_tid == LLDB_INVALID_THREAD_ID` (so that we don't send it for `launch` request). Notice `dap.focus_tid` will only get assigned when handling stop or stepping. According to DAP > Please note: a debug adapter is not expected to send this event in response to a request that implies that execution continues, e.g. launch or continue. It is only necessary to send a continued event if there was no previous request that implied this. So I guess we are not violating DAP if we don't send `continued` event. But I'd like to get some sense about this. ## Test Plan Used following program for testing: https://gist.github.com/kusmour/1729d2e07b7b1063897db77de194e47d **NOTE: Utilize stdin to get pid and attach AFTER hitting enter. Attach should happen when all the threads start running.** DAP messages before the change image DAP message after the change - report zero threads after attaching image --------- Co-authored-by: Jonas Devlieghere --- .../test/tools/lldb-dap/dap_server.py | 4 ++++ lldb/source/API/SBProcess.cpp | 20 ++++++++++--------- .../tools/lldb-dap/attach/TestDAP_attach.py | 2 +- lldb/tools/lldb-dap/DAP.h | 3 +++ .../ConfigurationDoneRequestHandler.cpp | 10 +++++++++- .../Handler/ThreadsRequestHandler.cpp | 17 +++++++++++----- lldb/tools/lldb-dap/JSONUtils.cpp | 13 ++++++++++++ lldb/tools/lldb-dap/JSONUtils.h | 2 ++ 8 files changed, 55 insertions(+), 16 deletions(-) diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py index 45403e9df8525..61d7fa94479b8 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py @@ -649,6 +649,10 @@ def request_configurationDone(self): response = self.send_recv(command_dict) if response: self.configuration_done_sent = True + # Client requests the 
baseline of currently existing threads after + # a successful launch or attach. + # Kick off the threads request that follows + self.request_threads() return response def _process_stopped(self): diff --git a/lldb/source/API/SBProcess.cpp b/lldb/source/API/SBProcess.cpp index 23ea449b30cca..ba77b2beed5ea 100644 --- a/lldb/source/API/SBProcess.cpp +++ b/lldb/source/API/SBProcess.cpp @@ -193,10 +193,11 @@ uint32_t SBProcess::GetNumThreads() { if (process_sp) { Process::StopLocker stop_locker; - const bool can_update = stop_locker.TryLock(&process_sp->GetRunLock()); - std::lock_guard guard( - process_sp->GetTarget().GetAPIMutex()); - num_threads = process_sp->GetThreadList().GetSize(can_update); + if (stop_locker.TryLock(&process_sp->GetRunLock())) { + std::lock_guard guard( + process_sp->GetTarget().GetAPIMutex()); + num_threads = process_sp->GetThreadList().GetSize(); + } } return num_threads; @@ -393,11 +394,12 @@ SBThread SBProcess::GetThreadAtIndex(size_t index) { ProcessSP process_sp(GetSP()); if (process_sp) { Process::StopLocker stop_locker; - const bool can_update = stop_locker.TryLock(&process_sp->GetRunLock()); - std::lock_guard guard( - process_sp->GetTarget().GetAPIMutex()); - thread_sp = process_sp->GetThreadList().GetThreadAtIndex(index, can_update); - sb_thread.SetThread(thread_sp); + if (stop_locker.TryLock(&process_sp->GetRunLock())) { + std::lock_guard guard( + process_sp->GetTarget().GetAPIMutex()); + thread_sp = process_sp->GetThreadList().GetThreadAtIndex(index, false); + sb_thread.SetThread(thread_sp); + } } return sb_thread; diff --git a/lldb/test/API/tools/lldb-dap/attach/TestDAP_attach.py b/lldb/test/API/tools/lldb-dap/attach/TestDAP_attach.py index 9df44cc454d5d..b9fbf2c8d14f9 100644 --- a/lldb/test/API/tools/lldb-dap/attach/TestDAP_attach.py +++ b/lldb/test/API/tools/lldb-dap/attach/TestDAP_attach.py @@ -1,5 +1,5 @@ """ -Test lldb-dap setBreakpoints request +Test lldb-dap attach request """ diff --git a/lldb/tools/lldb-dap/DAP.h 
b/lldb/tools/lldb-dap/DAP.h index 8d32a18fb711e..b79a0d9d0f25c 100644 --- a/lldb/tools/lldb-dap/DAP.h +++ b/lldb/tools/lldb-dap/DAP.h @@ -211,6 +211,9 @@ struct DAP { /// The set of features supported by the connected client. llvm::DenseSet clientFeatures; + /// The initial thread list upon attaching. + std::optional initial_thread_list; + /// Creates a new DAP sessions. /// /// \param[in] log diff --git a/lldb/tools/lldb-dap/Handler/ConfigurationDoneRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/ConfigurationDoneRequestHandler.cpp index cd120e1fdfaba..f39bbdefdbb95 100644 --- a/lldb/tools/lldb-dap/Handler/ConfigurationDoneRequestHandler.cpp +++ b/lldb/tools/lldb-dap/Handler/ConfigurationDoneRequestHandler.cpp @@ -44,6 +44,7 @@ namespace lldb_dap { // just an acknowledgement, so no body field is required." // }] // }, + void ConfigurationDoneRequestHandler::operator()( const llvm::json::Object &request) const { llvm::json::Object response; @@ -52,8 +53,15 @@ void ConfigurationDoneRequestHandler::operator()( dap.configuration_done_sent = true; if (dap.stop_at_entry) SendThreadStoppedEvent(dap); - else + else { + // Client requests the baseline of currently existing threads after + // a successful launch or attach by sending a 'threads' request + // right after receiving the configurationDone response. 
+ // Obtain the list of threads before we resume the process + dap.initial_thread_list = + GetThreads(dap.target.GetProcess(), dap.thread_format); dap.target.GetProcess().Continue(); + } } } // namespace lldb_dap diff --git a/lldb/tools/lldb-dap/Handler/ThreadsRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/ThreadsRequestHandler.cpp index 2b857f7f6a02b..16d797c2ab327 100644 --- a/lldb/tools/lldb-dap/Handler/ThreadsRequestHandler.cpp +++ b/lldb/tools/lldb-dap/Handler/ThreadsRequestHandler.cpp @@ -50,16 +50,23 @@ namespace lldb_dap { // } void ThreadsRequestHandler::operator()( const llvm::json::Object &request) const { - lldb::SBProcess process = dap.target.GetProcess(); llvm::json::Object response; FillResponse(request, response); - const uint32_t num_threads = process.GetNumThreads(); llvm::json::Array threads; - for (uint32_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) { - lldb::SBThread thread = process.GetThreadAtIndex(thread_idx); - threads.emplace_back(CreateThread(thread, dap.thread_format)); + // Client requests the baseline of currently existing threads after + // a successful launch or attach by sending a 'threads' request + // right after receiving the configurationDone response. + // If no thread has reported to the client, it prevents something + // like the pause request from working in the running state. 
+ // Return the cache of initial threads as the process might have resumed + if (dap.initial_thread_list) { + threads = dap.initial_thread_list.value(); + dap.initial_thread_list.reset(); + } else { + threads = GetThreads(dap.target.GetProcess(), dap.thread_format); } + if (threads.size() == 0) { response["success"] = llvm::json::Value(false); } diff --git a/lldb/tools/lldb-dap/JSONUtils.cpp b/lldb/tools/lldb-dap/JSONUtils.cpp index 7660403666150..33f10c93d2ada 100644 --- a/lldb/tools/lldb-dap/JSONUtils.cpp +++ b/lldb/tools/lldb-dap/JSONUtils.cpp @@ -870,6 +870,19 @@ llvm::json::Value CreateThread(lldb::SBThread &thread, lldb::SBFormat &format) { return llvm::json::Value(std::move(object)); } +llvm::json::Array GetThreads(lldb::SBProcess process, lldb::SBFormat &format) { + lldb::SBMutex lock = process.GetTarget().GetAPIMutex(); + std::lock_guard guard(lock); + + llvm::json::Array threads; + const uint32_t num_threads = process.GetNumThreads(); + for (uint32_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) { + lldb::SBThread thread = process.GetThreadAtIndex(thread_idx); + threads.emplace_back(CreateThread(thread, format)); + } + return threads; +} + // "StoppedEvent": { // "allOf": [ { "$ref": "#/definitions/Event" }, { // "type": "object", diff --git a/lldb/tools/lldb-dap/JSONUtils.h b/lldb/tools/lldb-dap/JSONUtils.h index da91797290ff0..b8c53353bf42d 100644 --- a/lldb/tools/lldb-dap/JSONUtils.h +++ b/lldb/tools/lldb-dap/JSONUtils.h @@ -414,6 +414,8 @@ llvm::json::Value CreateExtendedStackFrameLabel(lldb::SBThread &thread, /// definition outlined by Microsoft. llvm::json::Value CreateThread(lldb::SBThread &thread, lldb::SBFormat &format); +llvm::json::Array GetThreads(lldb::SBProcess process, lldb::SBFormat &format); + /// Create a "StoppedEvent" object for a LLDB thread object. 
/// /// This function will fill in the following keys in the returned From 9b13d345303d819bb83de7ebbeb826d704add0bc Mon Sep 17 00:00:00 2001 From: Alexey Samsonov Date: Tue, 15 Apr 2025 10:49:52 -0700 Subject: [PATCH 016/710] [libc][bazel] Remove a no-op libc_internal_target macro. (#135818) This macro is a no-op after 90c001ac9e1d92a1a95d191d1640ab5337a937e5: libc_function macro now produce a "regular" cc_library target, without modifying its name, and this target is intended to only be used in tests. Thus, libc_internal_target macro is no longer needed, and we can safely treat libc_function rules and libc_support_library rules identically for test purposes. `libc_function_deps` attribute of a `libc_test` macro can also be cleaned up, but I plan to do this in a subsequent change. --- .../llvm-project-overlay/libc/libc_build_rules.bzl | 9 +-------- .../libc/test/libc_test_rules.bzl | 12 +++++------- 2 files changed, 6 insertions(+), 15 deletions(-) diff --git a/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl b/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl index 86dfb53a86014..60add23a46c48 100644 --- a/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl +++ b/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl @@ -10,10 +10,6 @@ load(":libc_configure_options.bzl", "LIBC_CONFIGURE_OPTIONS") load(":libc_namespace.bzl", "LIBC_NAMESPACE") load(":platforms.bzl", "PLATFORM_CPU_X86_64") -# TODO: Remove this helper function once all donwstream users are migrated. -def libc_internal_target(name): - return name - def libc_common_copts(): root_label = Label(":libc") libc_include_path = paths.join(root_label.workspace_root, root_label.package) @@ -84,10 +80,7 @@ def libc_function(name, **kwargs): # Builds "internal" library with a function, exposed as a C++ function in # the "LIBC_NAMESPACE" namespace. This allows us to test the function in the # presence of another libc. 
- _libc_library( - name = libc_internal_target(name), - **kwargs - ) + _libc_library(name = name, **kwargs) LibcLibraryInfo = provider( "All source files and textual headers for building a particular library.", diff --git a/utils/bazel/llvm-project-overlay/libc/test/libc_test_rules.bzl b/utils/bazel/llvm-project-overlay/libc/test/libc_test_rules.bzl index 8c20a9172989c..7e798429ef19b 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/libc_test_rules.bzl +++ b/utils/bazel/llvm-project-overlay/libc/test/libc_test_rules.bzl @@ -12,26 +12,24 @@ They come in two flavors: When performing tests we make sure to always use the internal version. """ -load("//libc:libc_build_rules.bzl", "libc_common_copts", "libc_internal_target") +load("//libc:libc_build_rules.bzl", "libc_common_copts") load("//libc:libc_configure_options.bzl", "LIBC_CONFIGURE_OPTIONS") -def libc_test(name, srcs, libc_function_deps = [], copts = [], deps = [], local_defines = [], **kwargs): +def libc_test(name, libc_function_deps = [], copts = [], deps = [], local_defines = [], **kwargs): """Add target for a libc test. Args: name: Test target name - srcs: List of sources for the test. libc_function_deps: List of libc_function targets used by this test. copts: The list of options to add to the C++ compilation command. deps: The list of other libraries to be linked in to the test target. local_defines: The list of target local_defines if any. - **kwargs: Attributes relevant for a libc_test. For example, name, srcs. + **kwargs: Attributes relevant for a cc_test. 
""" native.cc_test( name = name, - srcs = srcs, local_defines = local_defines + LIBC_CONFIGURE_OPTIONS, - deps = [libc_internal_target(d) for d in libc_function_deps] + [ + deps = [ "//libc/test/UnitTest:LibcUnitTest", "//libc:__support_macros_config", "//libc:errno", @@ -39,7 +37,7 @@ def libc_test(name, srcs, libc_function_deps = [], copts = [], deps = [], local_ "//libc:func_free", "//libc:func_malloc", "//libc:func_realloc", - ] + deps, + ] + libc_function_deps + deps, copts = copts + libc_common_copts(), linkstatic = 1, **kwargs From 3b9103044361094a8fde16a877f2e8cb0f96ce24 Mon Sep 17 00:00:00 2001 From: Andrew Rogers Date: Tue, 15 Apr 2025 10:51:02 -0700 Subject: [PATCH 017/710] [llvm] add documentation for public interface annotations (LLVM_ABI, etc) ## Purpose Add documentation for the existing family of `LLVM_ABI` annotation macros defined in llvm/Support/Compiler.h. These annotations are used to describe LLVM's public interface. ## Background This documentation is in support of the annotation effort described [here](https://discourse.llvm.org/t/psa-annotating-llvm-public-interface/85307/). ## Validation Manually inspected rendered ReST document on GitHub. Co-authored-by: Saleem Abdulrasool --- llvm/docs/InterfaceExportAnnotations.rst | 376 +++++++++++++++++++++++ llvm/docs/Reference.rst | 1 + 2 files changed, 377 insertions(+) create mode 100644 llvm/docs/InterfaceExportAnnotations.rst diff --git a/llvm/docs/InterfaceExportAnnotations.rst b/llvm/docs/InterfaceExportAnnotations.rst new file mode 100644 index 0000000000000..eecf6ffe6eaca --- /dev/null +++ b/llvm/docs/InterfaceExportAnnotations.rst @@ -0,0 +1,376 @@ +LLVM Interface Export Annotations +================================= +Symbols that are part of LLVM's public interface must be explicitly annotated +to support shared library builds with hidden default symbol visibility. This +document provides background and guidelines for annotating the codebase. 
+ +LLVM Shared Library +------------------- +LLVM builds as a static library by default, but it can also be built as a shared +library with the following configuration: + +:: + + LLVM_BUILD_LLVM_DYLIB=On + LLVM_LINK_LLVM_DYLIB=On + +There are three shared library executable formats we're interested in: PE +Dynamic Link Library (.dll) on Windows, Mach-O Shared Object (.dylib) on Apple +systems, and ELF Shared Object (.so) on Linux, BSD and other Unix-like systems. + +ELF and Mach-O Shared Object files can be built with no additional setup or +configuration. This is because all global symbols in the library are exported by +default -- the same as when building a static library. However, when building a +DLL for Windows, the situation is more complex: + +- Symbols are not exported from a DLL by default. Symbols must be annotated with + ``__declspec(dllexport)`` when building the library to be externally visible. + +- Symbols imported from a Windows DLL should generally be annotated with + ``__declspec(dllimport)`` when compiling clients. + +- A single Windows DLL can export a maximum of 65,535 symbols. + +Because of the requirements for Windows DLLs, additional work must be done to +ensure the proper set of public symbols is exported and visible to clients. + +Annotation Macros +----------------- +The distinct DLL import and export annotations required for Windows DLLs +typically lead developers to define a preprocessor macro for annotating exported +symbols in header public files. The custom macro resolves to the _export_ +annotation when building the library and the _import_ annotation when building +the client. + +We have defined the `LLVM_ABI` macro in `llvm/Support/Compiler.h +`__ +for this purpose: + +.. 
code:: cpp + + #if defined(LLVM_EXPORTS) + #define LLVM_ABI __declspec(dllexport) + #else + #define LLVM_ABI __declspec(dllimport) + #endif + +Windows DLL symbol visibility requirements are approximated on ELF and Mach-O +shared library builds by setting default symbol visibility to hidden +(``-fvisibility-default=hidden``) when building with the following +configuration: + +:: + + LLVM_BUILD_LLVM_DYLIB_VIS=On + +For an ELF or Mach-O platform with this setting, the ``LLVM_ABI`` macro is +defined to override the default hidden symbol visibility: + +.. code:: cpp + + #define LLVM_ABI __attribute__((visibility("default"))) + +In addition to ``LLVM_ABI``, there are a few other macros for use in less +common cases described below. + +Export macros are used to annotate symbols only within their intended shared +library. This is necessary because of the way Windows handles import/export +annotations. + +For example, ``LLVM_ABI`` resolves to ``__declspec(dllexport)`` only when +building source that is part of the LLVM shared library (e.g. source under +``llvm-project/llvm``). If ``LLVM_ABI`` were incorrectly used to annotate a +symbol from a different LLVM project (such as Clang) it would always resolve to +``__declspec(dllimport)`` and the symbol would not be properly exported. + +Annotating Symbols +------------------ +Functions +~~~~~~~~~ +Exported function declarations in header files must be annotated with +``LLVM_ABI``. + +.. code:: cpp + + #include "llvm/Support/Compiler.h" + + LLVM_ABI void exported_function(int a, int b); + +Global Variables +~~~~~~~~~~~~~~~~ +Exported global variables must be annotated with ``LLVM_ABI`` at their +``extern`` declarations. + +.. 
code:: cpp
+
+  #include "llvm/Support/Compiler.h"
+
+  LLVM_ABI extern int exported_global_variable;
+
+Classes, Structs, and Unions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Classes, structs, and unions can be annotated with ``LLVM_ABI`` at their
+declaration, but this option is generally discouraged because it will
+export every class member, vtable, and type information. Instead, ``LLVM_ABI``
+should be applied to individual class members that require export.
+
+In the most common case, public and protected methods without a body in the
+class declaration must be annotated with ``LLVM_ABI``.
+
+.. code:: cpp
+
+  #include "llvm/Support/Compiler.h"
+
+  class ExampleClass {
+  public:
+    // Public methods defined externally must be annotated.
+    LLVM_ABI int sourceDefinedPublicMethod(int a, int b);
+
+    // Methods defined in the class definition do not need annotation.
+    int headerDefinedPublicMethod(int a, int b) {
+      return a + b;
+    }
+
+    // Constructors and destructors must be annotated if defined externally.
+    ExampleClass() {}
+    LLVM_ABI ~ExampleClass();
+
+    // Public static methods defined externally must be annotated.
+    LLVM_ABI static int sourceDefinedPublicStaticMethod(int a, int b);
+  };
+
+Additionally, public and protected static fields that are not initialized at
+declaration must be annotated with ``LLVM_ABI``.
+
+.. code:: cpp
+
+  #include "llvm/Support/Compiler.h"
+
+  class ExampleClass {
+  public:
+    // Public static fields defined externally must be annotated.
+    LLVM_ABI static int mutableStaticField;
+    LLVM_ABI static const int constStaticField;
+
+    // Static members initialized at declaration do not need to be annotated.
+    static const int initializedConstStaticField = 0;
+    static constexpr int initializedConstexprStaticField = 0;
+  };
+
+Private methods may also require ``LLVM_ABI`` annotation in certain cases. This
+situation occurs when a method defined in a header calls the private method. 
The
+private method call may be from within the class, a parent class, or a friend
+class.
+
+.. code:: cpp
+
+  #include "llvm/Support/Compiler.h"
+
+  class ExampleClass {
+  private:
+    // Private methods must be annotated if referenced by a public method defined in a
+    // header file.
+    LLVM_ABI int privateMethod(int a, int b);
+
+  public:
+    // Inlineable method defined in the class definition calls a private method
+    // defined externally. If the private method is not annotated for export, this
+    // method will fail to link.
+    int publicMethod(int a, int b) {
+      return privateMethod(a, b);
+    }
+  };
+
+There are less common cases where you may also need to annotate an inline
+function even though it is fully defined in a header. Annotating an inline
+function for export does not prevent it being inlined into client code. However,
+it does ensure there is a single, stable address for the function exported from
+the shared library.
+
+.. code:: cpp
+
+  #include "llvm/Support/Compiler.h"
+
+  // Annotate the function so it is exported from the library at a fixed
+  // address.
+  LLVM_ABI inline int inlineFunction(int a, int b) {
+    return a + b;
+  }
+
+Similarly, if a stable pointer-to-member function address is required for a
+method in a C++ class, it may be annotated for export.
+
+.. code:: cpp
+
+  #include "llvm/Support/Compiler.h"
+
+  class ExampleClass {
+  public:
+    // Annotate the method so it is exported from the library at a fixed
+    // address.
+    LLVM_ABI inline int inlineMethod(int a, int b) {
+      return a + b;
+    }
+  };
+
+.. note::
+
+  When an inline function is annotated for export, the header containing the
+  function definition **must** be included by at least one of the library's
+  source files or the function will never be compiled with the export
+  annotation.
+
+Friend Functions
+~~~~~~~~~~~~~~~~
+Friend functions declared in a class, struct or union must be annotated with
+``LLVM_ABI`` if the corresponding function declaration is also annotated. 
This
+requirement applies even when the class itself is annotated with ``LLVM_ABI``.
+
+.. code:: cpp
+
+  #include "llvm/Support/Compiler.h"
+
+  // An exported function that has friend access to ExampleClass internals.
+  LLVM_ABI int friend_function(ExampleClass &obj);
+
+  class ExampleClass {
+    // Friend declaration of a function must be annotated the same as the actual
+    // function declaration.
+    LLVM_ABI friend int friend_function(ExampleClass &obj);
+  };
+
+.. note::
+
+  Annotating the friend declaration avoids an “inconsistent dll linkage”
+  compiler error when building for Windows. This annotation is harmless but not
+  required when building ELF or Mach-O shared libraries.
+
+Virtual Table and Type Info
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Classes and structs with exported virtual methods, or child classes that export
+overridden virtual methods, must also export their vtable for ELF and Mach-O
+builds. This can be achieved by annotating the class rather than individual
+class members.
+
+.. code:: cpp
+
+  #include "llvm/Support/Compiler.h"
+
+  class ParentClass {
+  public:
+    virtual int virtualMethod(int a, int b);
+    virtual int anotherVirtualMethod(int a, int b);
+    virtual ~ParentClass();
+  };
+
+  // Annotating the class exports vtable and type information as well as all
+  // class members.
+  class LLVM_ABI ChildClass : public ParentClass {
+  public:
+    // Inline method override does not require the class be annotated.
+    int virtualMethod(int a, int b) override {
+      return ParentClass::virtualMethod(a, b);
+    }
+
+    // Overriding a virtual method from the parent requires the class be
+    // annotated. The parent class may require annotation as well.
+    int anotherVirtualMethod(int a, int b) override;
+    ~ChildClass();
+  };
+
+If annotating a type with ``LLVM_ABI`` causes compilation issues such as those
+described
+`here `__,
+the class may require modification. Often, explicitly deleting the copy
+constructor and copy assignment operator will resolve the issue.
+
+.. 
code:: cpp
+
+  #include "llvm/Support/Compiler.h"
+
+  #include
+
+  class LLVM_ABI ExportedClass {
+  public:
+    // Explicitly delete the copy constructor and assignment operator.
+    ExportedClass(ExportedClass const&) = delete;
+    ExportedClass& operator=(ExportedClass const&) = delete;
+  };
+
+Templates
+~~~~~~~~~
+Most template classes are entirely header-defined and do not need to be exported
+because they will be instantiated and compiled into the client as needed. Such
+template classes require no export annotations. However, there are some less
+common cases where annotations are required for templates.
+
+Specialized Template Functions
+++++++++++++++++++++++++++++++
+As with any other exported function, an exported specialization of a template
+function not defined in a header file must have its declaration annotated with
+``LLVM_ABI``.
+
+.. code:: cpp
+
+  #include "llvm/Support/Compiler.h"
+
+  template T templateMethod(T a, T b) {
+    return a + b;
+  }
+
+  // The explicitly specialized definition of templateMethod for int is located in
+  // a source file. This declaration must be annotated with LLVM_ABI to export it.
+  template <> LLVM_ABI int templateMethod(int a, int b);
+
+Similarly, an exported specialization of a method in a template class must have
+its declaration annotated with ``LLVM_ABI``.
+
+.. code:: cpp
+
+  #include "llvm/Support/Compiler.h"
+
+  template class TemplateClass {
+  public:
+    int method(int a, int b) {
+      return a + b;
+    }
+  };
+
+  // The explicitly specialized definition of method for int is defined in a
+  // source file. The declaration must be annotated with LLVM_ABI to export it.
+  template <> LLVM_ABI int TemplateClass::method(int a, int b);
+
+Explicitly Instantiated Template Classes
+++++++++++++++++++++++++++++++++++++++++
+Explicitly instantiated template classes must be annotated with
+template-specific annotations at both declaration and definition. 
+
+An extern template instantiation in a header file must be annotated with
+``LLVM_TEMPLATE_ABI``. This will typically be located in a header file.
+
+.. code:: cpp
+
+  #include "llvm/Support/Compiler.h"
+
+  template class TemplateClass {
+  public:
+    TemplateClass(T val) : val_(val) {}
+
+    T get() const { return val_; }
+
+  private:
+    const T val_;
+  };
+
+  // Explicitly instantiate and export TemplateClass for int type.
+  extern template class LLVM_TEMPLATE_ABI TemplateClass;
+
+The corresponding definition of the template instantiation must be annotated
+with ``LLVM_EXPORT_TEMPLATE``. This will typically be located in a source file.
+
+.. code:: cpp
+
+  #include "TemplateClass.h"
+
+  // Explicitly instantiate and export TemplateClass for int type.
+  template class LLVM_EXPORT_TEMPLATE TemplateClass;
diff --git a/llvm/docs/Reference.rst b/llvm/docs/Reference.rst
index 470b6bd024b89..e1f46b00f2b30 100644
--- a/llvm/docs/Reference.rst
+++ b/llvm/docs/Reference.rst
@@ -31,6 +31,7 @@ LLVM and API reference documentation.
HowToSetUpLLVMStyleRTTI HowToUseAttributes InAlloca + InterfaceExportAnnotations LangRef LibFuzzer MarkedUpDisassembly From 30d13e359190f7a0e2122292ec4a4fc1a6c71acc Mon Sep 17 00:00:00 2001 From: Amr Hesham Date: Tue, 15 Apr 2025 19:52:26 +0200 Subject: [PATCH 018/710] [CIR] Upstream ArraySubscriptExpr from function parameter with pointer base (#135493) This change adds an ArraySubscriptExpr from the function parameter with base type as Pointer Issue https://github.com/llvm/llvm-project/issues/130197 --- clang/lib/CIR/CodeGen/CIRGenExpr.cpp | 43 ++++++++-- clang/test/CIR/CodeGen/array.cpp | 122 ++++++++++++++++++++++++--- 2 files changed, 148 insertions(+), 17 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp index f0732a8ea60af..cffe5c5cd1ec3 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp @@ -552,7 +552,19 @@ CIRGenFunction::emitArraySubscriptExpr(const clang::ArraySubscriptExpr *e) { // in lexical order (this complexity is, sadly, required by C++17). assert((e->getIdx() == e->getLHS() || e->getIdx() == e->getRHS()) && "index was neither LHS nor RHS"); - const mlir::Value idx = emitScalarExpr(e->getIdx()); + + auto emitIdxAfterBase = [&]() -> mlir::Value { + const mlir::Value idx = emitScalarExpr(e->getIdx()); + + // Extend or truncate the index type to 32 or 64-bits. 
+ auto ptrTy = mlir::dyn_cast(idx.getType()); + if (ptrTy && mlir::isa(ptrTy.getPointee())) + cgm.errorNYI(e->getSourceRange(), + "emitArraySubscriptExpr: index type cast"); + return idx; + }; + + const mlir::Value idx = emitIdxAfterBase(); if (const Expr *array = getSimpleArrayDecayOperand(e->getBase())) { LValue arrayLV; if (const auto *ase = dyn_cast(array)) @@ -566,13 +578,34 @@ CIRGenFunction::emitArraySubscriptExpr(const clang::ArraySubscriptExpr *e) { arrayLV.getAddress(), e->getType(), idx, cgm.getLoc(e->getExprLoc()), /*shouldDecay=*/true); - return LValue::makeAddr(addr, e->getType(), LValueBaseInfo()); + const LValue lv = LValue::makeAddr(addr, e->getType(), LValueBaseInfo()); + + if (getLangOpts().ObjC && getLangOpts().getGC() != LangOptions::NonGC) { + cgm.errorNYI(e->getSourceRange(), "emitArraySubscriptExpr: ObjC with GC"); + } + + return lv; } // The base must be a pointer; emit it with an estimate of its alignment. - cgm.errorNYI(e->getSourceRange(), - "emitArraySubscriptExpr: The base must be a pointer"); - return {}; + assert(e->getBase()->getType()->isPointerType() && + "The base must be a pointer"); + + LValueBaseInfo eltBaseInfo; + const Address ptrAddr = emitPointerWithAlignment(e->getBase(), &eltBaseInfo); + // Propagate the alignment from the array itself to the result. 
+ const Address addxr = emitArraySubscriptPtr( + *this, cgm.getLoc(e->getBeginLoc()), cgm.getLoc(e->getEndLoc()), ptrAddr, + e->getType(), idx, cgm.getLoc(e->getExprLoc()), + /*shouldDecay=*/false); + + const LValue lv = LValue::makeAddr(addxr, e->getType(), eltBaseInfo); + + if (getLangOpts().ObjC && getLangOpts().getGC() != LangOptions::NonGC) { + cgm.errorNYI(e->getSourceRange(), "emitArraySubscriptExpr: ObjC with GC"); + } + + return lv; } LValue CIRGenFunction::emitBinaryOperatorLValue(const BinaryOperator *e) { diff --git a/clang/test/CIR/CodeGen/array.cpp b/clang/test/CIR/CodeGen/array.cpp index 5cda061cdbf12..08f6d730f161a 100644 --- a/clang/test/CIR/CodeGen/array.cpp +++ b/clang/test/CIR/CodeGen/array.cpp @@ -350,20 +350,118 @@ void func7() { // OGCG: %[[ARR:.*]] = alloca [1 x ptr], align 8 // OGCG: call void @llvm.memset.p0.i64(ptr align 8 %[[ARR]], i8 0, i64 8, i1 false) -void func8(int p[10]) {} -// CIR: cir.func @func8(%arg0: !cir.ptr -// CIR: cir.alloca !cir.ptr, !cir.ptr>, ["p", init] +void func8(int arr[10]) { + int e = arr[0]; + int e2 = arr[1]; +} -// LLVM: define void @func8(ptr {{%.*}}) -// LLVM-NEXT: alloca ptr, i64 1, align 8 +// CIR: cir.func @func8(%[[ARG:.*]]: !cir.ptr +// CIR: %[[ARR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arr", init] +// CIR: %[[INIT:.*]] = cir.alloca !s32i, !cir.ptr, ["e", init] +// CIR: %[[INIT_2:.*]] = cir.alloca !s32i, !cir.ptr, ["e2", init] +// CIR: cir.store %[[ARG]], %[[ARR]] : !cir.ptr, !cir.ptr> +// CIR: %[[IDX:.*]] = cir.const #cir.int<0> : !s32i +// CIR: %[[TMP_1:.*]] = cir.load %[[ARR]] : !cir.ptr>, !cir.ptr +// CIR: %[[ELE_0:.*]] = cir.ptr_stride(%[[TMP_1]] : !cir.ptr, %[[IDX]] : !s32i), !cir.ptr +// CIR: %[[TMP_2:.*]] = cir.load %[[ELE_0]] : !cir.ptr, !s32i +// CIR: cir.store %[[TMP_2]], %[[INIT]] : !s32i, !cir.ptr +// CIR: %[[IDX_1:.*]] = cir.const #cir.int<1> : !s32i +// CIR: %[[TMP_3:.*]] = cir.load %[[ARR]] : !cir.ptr>, !cir.ptr +// CIR: %[[ELE_1:.*]] = cir.ptr_stride(%[[TMP_3]] : !cir.ptr, %[[IDX_1]] : 
!s32i), !cir.ptr +// CIR: %[[TMP_4:.*]] = cir.load %[[ELE_1]] : !cir.ptr, !s32i +// CIR: cir.store %[[TMP_4]], %[[INIT_2]] : !s32i, !cir.ptr + +// LLVM: define void @func8(ptr %[[ARG:.*]]) +// LLVM: %[[ARR:.*]] = alloca ptr, i64 1, align 8 +// LLVM: %[[INIT:.*]] = alloca i32, i64 1, align 4 +// LLVM: %[[INIT_2:.*]] = alloca i32, i64 1, align 4 +// LLVM: store ptr %[[ARG]], ptr %[[ARR]], align 8 +// LLVM: %[[TMP_1:.*]] = load ptr, ptr %[[ARR]], align 8 +// LLVM: %[[ELE_0:.*]] = getelementptr i32, ptr %[[TMP_1]], i64 0 +// LLVM: %[[TMP_2:.*]] = load i32, ptr %[[ELE_0]], align 4 +// LLVM: store i32 %[[TMP_2]], ptr %[[INIT]], align 4 +// LLVM: %[[TMP_3:.*]] = load ptr, ptr %[[ARR]], align 8 +// LLVM: %[[ELE_1:.*]] = getelementptr i32, ptr %[[TMP_3]], i64 1 +// LLVM: %[[TMP_4:.*]] = load i32, ptr %[[ELE_1]], align 4 +// LLVM: store i32 %[[TMP_4]], ptr %[[INIT_2]], align 4 + +// OGCG: %[[ARR:.*]] = alloca ptr, align 8 +// OGCG: %[[INIT:.*]] = alloca i32, align 4 +// OGCG: %[[INIT_2:.*]] = alloca i32, align 4 +// OGCG: store ptr {{%.*}}, ptr %[[ARR]], align 8 +// OGCG: %[[TMP_1:.*]] = load ptr, ptr %[[ARR]], align 8 +// OGCG: %[[ELE_0:.*]] = getelementptr inbounds i32, ptr %[[TMP_1]], i64 0 +// OGCG: %[[TMP_2:.*]] = load i32, ptr %[[ELE_0]], align 4 +// OGCG: store i32 %[[TMP_2]], ptr %[[INIT]], align 4 +// OGCG: %[[TMP_3:.*]] = load ptr, ptr %[[ARR]], align 8 +// OGCG: %[[ELE_1:.*]] = getelementptr inbounds i32, ptr %[[TMP_3]], i64 1 +// OGCG: %[[TMP_2:.*]] = load i32, ptr %[[ELE_1]], align 4 +// OGCG: store i32 %[[TMP_2]], ptr %[[INIT_2]], align 4 -// OGCG: alloca ptr, align 8 +void func9(int arr[10][5]) { + int e = arr[1][2]; +} -void func9(int pp[10][5]) {} -// CIR: cir.func @func9(%arg0: !cir.ptr> -// CIR: cir.alloca !cir.ptr>, !cir.ptr>> +// CIR: cir.func @func9(%[[ARG:.*]]: !cir.ptr> +// CIR: %[[ARR:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, ["arr", init] +// CIR: %[[INIT:.*]] = cir.alloca !s32i, !cir.ptr, ["e", init] +// CIR: cir.store %[[ARG]], %[[ARR]] : 
!cir.ptr>, !cir.ptr>> +// CIR: %[[IDX:.*]] = cir.const #cir.int<2> : !s32i +// CIR: %[[IDX_1:.*]] = cir.const #cir.int<1> : !s32i +// CIR: %[[TMP_1:.*]] = cir.load %[[ARR]] : !cir.ptr>>, !cir.ptr> +// CIR: %[[ARR_1:.*]] = cir.ptr_stride(%[[TMP_1]] : !cir.ptr>, %[[IDX_1]] : !s32i), !cir.ptr> +// CIR: %[[ARR_1_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR_1]] : !cir.ptr>), !cir.ptr +// CIR: %[[ARR_1_2:.*]] = cir.ptr_stride(%[[ARR_1_PTR]] : !cir.ptr, %[[IDX]] : !s32i), !cir.ptr +// CIR: %[[TMP_2:.*]] = cir.load %[[ARR_1_2]] : !cir.ptr, !s32i +// CIR: cir.store %[[TMP_2]], %[[INIT]] : !s32i, !cir.ptr + +// LLVM: define void @func9(ptr %[[ARG:.*]]) +// LLVM: %[[ARR:.*]] = alloca ptr, i64 1, align 8 +// LLVM: %[[INIT:.*]] = alloca i32, i64 1, align 4 +// LLVM: store ptr %[[ARG]], ptr %[[ARR]], align 8 +// LLVM: %[[TMP_1:.*]] = load ptr, ptr %[[ARR]], align 8 +// LLVM: %[[ARR_1:.*]] = getelementptr [5 x i32], ptr %[[TMP_1]], i64 1 +// LLVM: %[[ARR_1_PTR:.*]] = getelementptr i32, ptr %[[ARR_1]], i32 0 +// LLVM: %[[ARR_1_2:.*]] = getelementptr i32, ptr %[[ARR_1_PTR]], i64 2 +// LLVM: %[[TMP_2:.*]] = load i32, ptr %[[ARR_1_2]], align 4 +// LLVM: store i32 %[[TMP_2]], ptr %[[INIT]], align 4 + +// OGCG: %[[ARR:.*]] = alloca ptr, align 8 +// OGCG: %[[INIT:.*]] = alloca i32, align 4 +// OGCG: store ptr {{%.*}}, ptr %[[ARR]], align 8 +// OGCG: %[[TMP_1:.*]] = load ptr, ptr %[[ARR]], align 8 +// OGCG: %[[ARR_1:.*]] = getelementptr inbounds [5 x i32], ptr %[[TMP_1]], i64 1 +// OGCG: %[[ARR_1_2:.*]] = getelementptr inbounds [5 x i32], ptr %[[ARR_1]], i64 0, i64 2 +// OGCG: %[[TMP_2:.*]] = load i32, ptr %[[ARR_1_2]], align 4 +// OGCG: store i32 %[[TMP_2]], ptr %[[INIT]], align 4 + +void func10(int *a) { + int e = a[5]; +} -// LLVM: define void @func9(ptr {{%.*}}) -// LLVM-NEXT: alloca ptr, i64 1, align 8 +// CIR: cir.func @func10(%[[ARG:.*]]: !cir.ptr +// CIR: %[[ARR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["a", init] +// CIR: %[[INIT:.*]] = cir.alloca !s32i, !cir.ptr, ["e", init] +// 
CIR: cir.store %[[ARG]], %[[ARR]] : !cir.ptr, !cir.ptr> +// CIR: %[[IDX:.*]] = cir.const #cir.int<5> : !s32i +// CIR: %[[TMP_1:.*]] = cir.load %[[ARR]] : !cir.ptr>, !cir.ptr +// CIR: %[[ELE:.*]] = cir.ptr_stride(%[[TMP_1]] : !cir.ptr, %[[IDX]] : !s32i), !cir.ptr +// CIR: %[[TMP_2:.*]] = cir.load %[[ELE]] : !cir.ptr, !s32i +// CIR: cir.store %[[TMP_2]], %[[INIT]] : !s32i, !cir.ptr + +// LLVM: define void @func10(ptr %[[ARG:.*]]) { +// LLVM: %[[ARR:.*]] = alloca ptr, i64 1, align 8 +// LLVM: %[[INIT:.*]] = alloca i32, i64 1, align 4 +// LLVM: store ptr %[[ARG]], ptr %[[ARR]], align 8 +// LLVM: %[[TMP_1:.*]] = load ptr, ptr %[[ARR]], align 8 +// LLVM: %[[ELE:.*]] = getelementptr i32, ptr %[[TMP_1]], i64 5 +// LLVM: %[[TMP_2:.*]] = load i32, ptr %[[ELE]], align 4 +// LLVM: store i32 %[[TMP_2]], ptr %[[INIT]], align 4 -// OGCG: alloca ptr, align 8 +// OGCG: %[[ARR:.*]] = alloca ptr, align 8 +// OGCG: %[[INIT:.*]] = alloca i32, align 4 +// OGCG: store ptr {{%.*}}, ptr %[[ARR]], align 8 +// OGCG: %[[TMP_1:.*]] = load ptr, ptr %[[ARR]], align 8 +// OGCG: %[[ELE:.*]] = getelementptr inbounds i32, ptr %[[TMP_1]], i64 5 +// OGCG: %[[TMP_2:.*]] = load i32, ptr %[[ELE]], align 4 +// OGCG: store i32 %[[TMP_2]], ptr %[[INIT]], align 4 From 3f58ff20fe540fbbc2e5bfea1606f8cdc00d4157 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Tue, 15 Apr 2025 11:08:48 -0700 Subject: [PATCH 019/710] AArch64: Remove the PAUTH_BLEND pseudo-instruction. It can be represented using a regular MOVK instruction which also has the advantage of sometimes being selectable without a preceding MOV. 
Reviewers: ahmedbougacha, asl, atrosinenko Reviewed By: atrosinenko Pull Request: https://github.com/llvm/llvm-project/pull/134765 --- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 5 +-- .../lib/Target/AArch64/AArch64PointerAuth.cpp | 37 ------------------- .../AArch64/ptrauth-pseudo-instructions.mir | 27 -------------- 3 files changed, 1 insertion(+), 68 deletions(-) delete mode 100644 llvm/test/CodeGen/AArch64/ptrauth-pseudo-instructions.mir diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index b90792d60d102..99f2b79d31bb7 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -1798,9 +1798,6 @@ def PAUTH_PROLOGUE : Pseudo<(outs), (ins), []>, Sched<[]> { def PAUTH_EPILOGUE : Pseudo<(outs), (ins), []>, Sched<[]>; } -def PAUTH_BLEND : Pseudo<(outs GPR64:$disc), - (ins GPR64:$addr_disc, i32imm:$int_disc), []>, Sched<[]>; - // These pointer authentication instructions require armv8.3a let Predicates = [HasPAuth] in { @@ -10130,7 +10127,7 @@ let Predicates = [HasMOPS, HasMTE], Defs = [NZCV], Size = 12, mayLoad = 0, maySt // v8.3 Pointer Authentication late patterns def : Pat<(int_ptrauth_blend GPR64:$Rd, imm64_0_65535:$imm), - (PAUTH_BLEND GPR64:$Rd, (trunc_imm imm64_0_65535:$imm))>; + (MOVKXi GPR64:$Rd, (trunc_imm imm64_0_65535:$imm), 48)>; def : Pat<(int_ptrauth_blend GPR64:$Rd, GPR64:$Rn), (BFMXri GPR64:$Rd, GPR64:$Rn, 16, 15)>; diff --git a/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp b/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp index c3bc70ad6f427..ba03f4e257b69 100644 --- a/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp +++ b/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp @@ -42,13 +42,6 @@ class AArch64PointerAuth : public MachineFunctionPass { void authenticateLR(MachineFunction &MF, MachineBasicBlock::iterator MBBI) const; - /// Stores blend(AddrDisc, IntDisc) to the Result register. 
- void emitBlend(MachineBasicBlock::iterator MBBI, Register Result, - Register AddrDisc, unsigned IntDisc) const; - - /// Expands PAUTH_BLEND pseudo instruction. - void expandPAuthBlend(MachineBasicBlock::iterator MBBI) const; - bool checkAuthenticatedLR(MachineBasicBlock::iterator TI) const; }; @@ -249,32 +242,6 @@ unsigned llvm::AArch64PAuth::getCheckerSizeInBytes(AuthCheckMethod Method) { llvm_unreachable("Unknown AuthCheckMethod enum"); } -void AArch64PointerAuth::emitBlend(MachineBasicBlock::iterator MBBI, - Register Result, Register AddrDisc, - unsigned IntDisc) const { - MachineBasicBlock &MBB = *MBBI->getParent(); - DebugLoc DL = MBBI->getDebugLoc(); - - if (Result != AddrDisc) - BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrs), Result) - .addReg(AArch64::XZR) - .addReg(AddrDisc) - .addImm(0); - - BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVKXi), Result) - .addReg(Result) - .addImm(IntDisc) - .addImm(48); -} - -void AArch64PointerAuth::expandPAuthBlend( - MachineBasicBlock::iterator MBBI) const { - Register ResultReg = MBBI->getOperand(0).getReg(); - Register AddrDisc = MBBI->getOperand(1).getReg(); - unsigned IntDisc = MBBI->getOperand(2).getImm(); - emitBlend(MBBI, ResultReg, AddrDisc, IntDisc); -} - bool AArch64PointerAuth::runOnMachineFunction(MachineFunction &MF) { Subtarget = &MF.getSubtarget(); TII = Subtarget->getInstrInfo(); @@ -290,7 +257,6 @@ bool AArch64PointerAuth::runOnMachineFunction(MachineFunction &MF) { break; case AArch64::PAUTH_PROLOGUE: case AArch64::PAUTH_EPILOGUE: - case AArch64::PAUTH_BLEND: PAuthPseudoInstrs.push_back(MI.getIterator()); break; } @@ -305,9 +271,6 @@ bool AArch64PointerAuth::runOnMachineFunction(MachineFunction &MF) { case AArch64::PAUTH_EPILOGUE: authenticateLR(MF, It); break; - case AArch64::PAUTH_BLEND: - expandPAuthBlend(It); - break; default: llvm_unreachable("Unhandled opcode"); } diff --git a/llvm/test/CodeGen/AArch64/ptrauth-pseudo-instructions.mir b/llvm/test/CodeGen/AArch64/ptrauth-pseudo-instructions.mir 
deleted file mode 100644 index d7fe1953deb47..0000000000000 --- a/llvm/test/CodeGen/AArch64/ptrauth-pseudo-instructions.mir +++ /dev/null @@ -1,27 +0,0 @@ -# RUN: llc -mtriple=aarch64--- -run-pass=aarch64-ptrauth -verify-machineinstrs %s -o - | FileCheck %s - -# Test the corner cases that cannot be reliably tested using LLVM IR as input. - ---- | - define i64 @blend_untied(i64 %unused, i64 %ptr_arg) { - ret i64 0 - } -... ---- -# Check that the input register is copied to the output one, if not tied. - -name: blend_untied -tracksRegLiveness: true -body: | - bb.0: - liveins: $lr, $x0, $x1 - $x0 = PAUTH_BLEND $x1, 42 - RET undef $lr - -# CHECK: liveins: $lr, $x0, $x1 -# CHECK-NEXT: {{^ +$}} -# CHECK-NEXT: $x0 = ORRXrs $xzr, $x1, 0 -# CHECK-NEXT: $x0 = MOVKXi $x0, 42, 48 -# CHECK-NEXT: RET undef $lr - -... From a5aa0c46c3274eaf25dde4d792a1abd6191cccf9 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Tue, 15 Apr 2025 11:12:05 -0700 Subject: [PATCH 020/710] Introduce -funique-source-file-names flag. The purpose of this flag is to allow the compiler to assume that each object file passed to the linker has been compiled using a unique source file name. This is useful for reducing link times when doing ThinLTO in combination with whole-program devirtualization or CFI, as it allows modules without exported symbols to be built with ThinLTO. 
Reviewers: vitalybuka, teresajohnson Reviewed By: teresajohnson Pull Request: https://github.com/llvm/llvm-project/pull/135728 --- clang/docs/ControlFlowIntegrity.rst | 5 +++ clang/docs/UsersManual.rst | 10 +++++ clang/include/clang/Basic/CodeGenOptions.def | 2 + clang/include/clang/Driver/Options.td | 7 ++++ clang/lib/CodeGen/CodeGenModule.cpp | 4 ++ clang/lib/Driver/ToolChains/Clang.cpp | 3 ++ clang/test/CodeGen/unique-source-file-names.c | 2 + clang/test/Driver/unique-source-file-names.c | 5 +++ llvm/lib/Transforms/Utils/ModuleUtils.cpp | 40 +++++++++---------- .../unique-source-file-names.ll | 22 ++++++++++ 10 files changed, 79 insertions(+), 21 deletions(-) create mode 100644 clang/test/CodeGen/unique-source-file-names.c create mode 100644 clang/test/Driver/unique-source-file-names.c create mode 100644 llvm/test/Transforms/ThinLTOBitcodeWriter/unique-source-file-names.ll diff --git a/clang/docs/ControlFlowIntegrity.rst b/clang/docs/ControlFlowIntegrity.rst index 2f2f8ccf4481b..baff9ab54ff26 100644 --- a/clang/docs/ControlFlowIntegrity.rst +++ b/clang/docs/ControlFlowIntegrity.rst @@ -42,6 +42,11 @@ default visibility setting is ``-fvisibility=default``, which would disable CFI checks for classes without visibility attributes. Most users will want to specify ``-fvisibility=hidden``, which enables CFI checks for such classes. +When using ``-fsanitize=cfi*`` with ``-flto=thin``, it is recommended +to reduce link times by passing `-funique-source-file-names +`_, provided +that your program is compatible with it. + Experimental support for :ref:`cross-DSO control flow integrity ` exists that does not require classes to have hidden LTO visibility. This cross-DSO support has unstable ABI at this time. diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst index 2a93c2552d7dc..d4656a7e63c99 100644 --- a/clang/docs/UsersManual.rst +++ b/clang/docs/UsersManual.rst @@ -2297,6 +2297,16 @@ are listed below. 
pure ThinLTO, as all split regular LTO modules are merged and LTO linked with regular LTO. +.. option:: -f[no-]unique-source-file-names + + When enabled, allows the compiler to assume that each object file + passed to the linker has been compiled using a unique source file + name. This is useful for reducing link times when doing ThinLTO + in combination with whole-program devirtualization or CFI. + + A misuse of this flag will generally result in a duplicate symbol + error at link time. + .. option:: -fforce-emit-vtables In order to improve devirtualization, forces emitting of vtables even in diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def index a436c0ec98d5b..c5990fb248689 100644 --- a/clang/include/clang/Basic/CodeGenOptions.def +++ b/clang/include/clang/Basic/CodeGenOptions.def @@ -278,6 +278,8 @@ CODEGENOPT(SanitizeCfiICallNormalizeIntegers, 1, 0) ///< Normalize integer types ///< CFI icall function signatures CODEGENOPT(SanitizeCfiCanonicalJumpTables, 1, 0) ///< Make jump table symbols canonical ///< instead of creating a local jump table. +CODEGENOPT(UniqueSourceFileNames, 1, 0) ///< Allow the compiler to assume that TUs + ///< have unique source file names at link time CODEGENOPT(SanitizeKcfiArity, 1, 0) ///< Embed arity in KCFI patchable function prefix CODEGENOPT(SanitizeCoverageType, 2, 0) ///< Type of sanitizer coverage ///< instrumentation. 
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index c9d2bc5e81976..e9acb20348654 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -4140,6 +4140,13 @@ def ftrigraphs : Flag<["-"], "ftrigraphs">, Group, def fno_trigraphs : Flag<["-"], "fno-trigraphs">, Group, HelpText<"Do not process trigraph sequences">, Visibility<[ClangOption, CC1Option]>; +defm unique_source_file_names: BoolOption<"f", "unique-source-file-names", + CodeGenOpts<"UniqueSourceFileNames">, DefaultFalse, + PosFlag, + NegFlag, + BothFlags<[], [ClangOption], " the compiler to assume that each translation unit has a unique " + "source file name at link time">>, + Group; def funsigned_bitfields : Flag<["-"], "funsigned-bitfields">, Group; def funsigned_char : Flag<["-"], "funsigned-char">, Group; def fno_unsigned_char : Flag<["-"], "fno-unsigned-char">; diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 4a48c2f35ff23..26e09fe239242 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -1144,6 +1144,10 @@ void CodeGenModule::Release() { 1); } + if (CodeGenOpts.UniqueSourceFileNames) { + getModule().addModuleFlag(llvm::Module::Max, "Unique Source File Names", 1); + } + if (LangOpts.Sanitize.has(SanitizerKind::KCFI)) { getModule().addModuleFlag(llvm::Module::Override, "kcfi", 1); // KCFI assumes patchable-function-prefix is the same for all indirectly diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 65910e7fdaaa6..8506a5c00e7bc 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -7744,6 +7744,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, Args.addOptInFlag(CmdArgs, options::OPT_fexperimental_late_parse_attributes, options::OPT_fno_experimental_late_parse_attributes); + Args.addOptInFlag(CmdArgs, 
options::OPT_funique_source_file_names, + options::OPT_fno_unique_source_file_names); + // Setup statistics file output. SmallString<128> StatsFile = getStatsFileName(Args, Output, Input, D); if (!StatsFile.empty()) { diff --git a/clang/test/CodeGen/unique-source-file-names.c b/clang/test/CodeGen/unique-source-file-names.c new file mode 100644 index 0000000000000..1d5a4a5e8e4c5 --- /dev/null +++ b/clang/test/CodeGen/unique-source-file-names.c @@ -0,0 +1,2 @@ +// RUN: %clang_cc1 -funique-source-file-names -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck %s +// CHECK: !{i32 7, !"Unique Source File Names", i32 1} diff --git a/clang/test/Driver/unique-source-file-names.c b/clang/test/Driver/unique-source-file-names.c new file mode 100644 index 0000000000000..8322f0e37b0c7 --- /dev/null +++ b/clang/test/Driver/unique-source-file-names.c @@ -0,0 +1,5 @@ +// RUN: %clang -funique-source-file-names -### %s 2> %t +// RUN: FileCheck < %t %s + +// CHECK: "-cc1" +// CHECK: "-funique-source-file-names" diff --git a/llvm/lib/Transforms/Utils/ModuleUtils.cpp b/llvm/lib/Transforms/Utils/ModuleUtils.cpp index 1c31e851ef4b2..10efdd61d4553 100644 --- a/llvm/lib/Transforms/Utils/ModuleUtils.cpp +++ b/llvm/lib/Transforms/Utils/ModuleUtils.cpp @@ -345,27 +345,25 @@ void llvm::filterDeadComdatFunctions( std::string llvm::getUniqueModuleId(Module *M) { MD5 Md5; - bool ExportsSymbols = false; - auto AddGlobal = [&](GlobalValue &GV) { - if (GV.isDeclaration() || GV.getName().starts_with("llvm.") || - !GV.hasExternalLinkage() || GV.hasComdat()) - return; - ExportsSymbols = true; - Md5.update(GV.getName()); - Md5.update(ArrayRef{0}); - }; - - for (auto &F : *M) - AddGlobal(F); - for (auto &GV : M->globals()) - AddGlobal(GV); - for (auto &GA : M->aliases()) - AddGlobal(GA); - for (auto &IF : M->ifuncs()) - AddGlobal(IF); - - if (!ExportsSymbols) - return ""; + + auto *UniqueSourceFileNames = mdconst::extract_or_null( + M->getModuleFlag("Unique Source File Names")); + if 
(UniqueSourceFileNames && UniqueSourceFileNames->getZExtValue()) { + Md5.update(M->getSourceFileName()); + } else { + bool ExportsSymbols = false; + for (auto &GV : M->global_values()) { + if (GV.isDeclaration() || GV.getName().starts_with("llvm.") || + !GV.hasExternalLinkage() || GV.hasComdat()) + continue; + ExportsSymbols = true; + Md5.update(GV.getName()); + Md5.update(ArrayRef{0}); + } + + if (!ExportsSymbols) + return ""; + } MD5::MD5Result R; Md5.final(R); diff --git a/llvm/test/Transforms/ThinLTOBitcodeWriter/unique-source-file-names.ll b/llvm/test/Transforms/ThinLTOBitcodeWriter/unique-source-file-names.ll new file mode 100644 index 0000000000000..0f3fd566f9b1c --- /dev/null +++ b/llvm/test/Transforms/ThinLTOBitcodeWriter/unique-source-file-names.ll @@ -0,0 +1,22 @@ +; RUN: opt -thinlto-bc -thin-link-bitcode-file=%t2 -thinlto-split-lto-unit -o %t %s +; RUN: llvm-modextract -b -n 1 -o %t1 %t +; RUN: llvm-dis -o - %t1 | FileCheck %s + +source_filename = "unique-source-file-names.c" + +@llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 65535, ptr @f, ptr null }] + +; CHECK: @g.45934e8a5251fb7adbecfff71a4e70ed = +@g = internal global i8 42, !type !0 + +declare void @sink(ptr) + +define internal void @f() { + call void @sink(ptr @g) + ret void +} + +!0 = !{i32 0, !"typeid"} + +!llvm.module.flags = !{!1} +!1 = !{i32 1, !"Unique Source File Names", i32 1} From d0c973a7a0149db3b71767d4c5a20a31e6a8ed5b Mon Sep 17 00:00:00 2001 From: Michael Spencer Date: Tue, 15 Apr 2025 11:19:07 -0700 Subject: [PATCH 021/710] [llvm][clang] Allocate a new stack instead of spawning a new thread to get more stack space (#133173) Clang spawns a new thread to avoid running out of stack space. This can make debugging and performance analysis more difficult as how the threads are connected is difficult to recover. This patch introduces `runOnNewStack` and applies it in Clang. 
On platforms that have good support for it this allocates a new stack and moves to it using assembly. Doing split stacks like this actually runs on most platforms, but many debuggers and unwinders reject the large or backwards stack offsets that occur. Apple platforms and tools are known to support this, so this only enables it there for now. --- clang/docs/ReleaseNotes.rst | 4 + clang/include/clang/Basic/Stack.h | 5 +- clang/lib/Basic/Stack.cpp | 40 ++---- clang/lib/Frontend/CompilerInstance.cpp | 2 +- .../llvm/Support/CrashRecoveryContext.h | 3 + llvm/include/llvm/Support/ProgramStack.h | 62 ++++++++++ llvm/lib/Support/CMakeLists.txt | 1 + llvm/lib/Support/CrashRecoveryContext.cpp | 11 ++ llvm/lib/Support/ProgramStack.cpp | 114 ++++++++++++++++++ llvm/unittests/Support/CMakeLists.txt | 1 + llvm/unittests/Support/ProgramStackTest.cpp | 35 ++++++ 11 files changed, 248 insertions(+), 30 deletions(-) create mode 100644 llvm/include/llvm/Support/ProgramStack.h create mode 100644 llvm/lib/Support/ProgramStack.cpp create mode 100644 llvm/unittests/Support/ProgramStackTest.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index e63de32a0b2aa..22543821ee4b3 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -195,6 +195,10 @@ Non-comprehensive list of changes in this release - Added `__builtin_elementwise_exp10`. - For AMDPGU targets, added `__builtin_v_cvt_off_f32_i4` that maps to the `v_cvt_off_f32_i4` instruction. - Added `__builtin_elementwise_minnum` and `__builtin_elementwise_maxnum`. +- Clang itself now uses split stacks instead of threads for allocating more + stack space when running on Apple AArch64 based platforms. This means that + stack traces of Clang from debuggers, crashes, and profilers may look + different than before. 
New Compiler Flags ------------------ diff --git a/clang/include/clang/Basic/Stack.h b/clang/include/clang/Basic/Stack.h index 30ebd94aedd1f..9674b9d9b62c3 100644 --- a/clang/include/clang/Basic/Stack.h +++ b/clang/include/clang/Basic/Stack.h @@ -27,7 +27,10 @@ namespace clang { /// Call this once on each thread, as soon after starting the thread as /// feasible, to note the approximate address of the bottom of the stack. - void noteBottomOfStack(); + /// + /// \param ForceSet set to true if you know the call is near the bottom of a + /// new stack. Used for split stacks. + void noteBottomOfStack(bool ForceSet = false); /// Determine whether the stack is nearly exhausted. bool isStackNearlyExhausted(); diff --git a/clang/lib/Basic/Stack.cpp b/clang/lib/Basic/Stack.cpp index aa15d8e66950f..8cbb84943f8d3 100644 --- a/clang/lib/Basic/Stack.cpp +++ b/clang/lib/Basic/Stack.cpp @@ -13,33 +13,13 @@ #include "clang/Basic/Stack.h" #include "llvm/Support/CrashRecoveryContext.h" +#include "llvm/Support/ProgramStack.h" -#ifdef _MSC_VER -#include // for _AddressOfReturnAddress -#endif +static LLVM_THREAD_LOCAL uintptr_t BottomOfStack = 0; -static LLVM_THREAD_LOCAL void *BottomOfStack = nullptr; - -static void *getStackPointer() { -#if __GNUC__ || __has_builtin(__builtin_frame_address) - return __builtin_frame_address(0); -#elif defined(_MSC_VER) - return _AddressOfReturnAddress(); -#else - char CharOnStack = 0; - // The volatile store here is intended to escape the local variable, to - // prevent the compiler from optimizing CharOnStack into anything other - // than a char on the stack. - // - // Tested on: MSVC 2015 - 2019, GCC 4.9 - 9, Clang 3.2 - 9, ICC 13 - 19. 
- char *volatile Ptr = &CharOnStack; - return Ptr; -#endif -} - -void clang::noteBottomOfStack() { - if (!BottomOfStack) - BottomOfStack = getStackPointer(); +void clang::noteBottomOfStack(bool ForceSet) { + if (!BottomOfStack || ForceSet) + BottomOfStack = llvm::getStackPointer(); } bool clang::isStackNearlyExhausted() { @@ -51,7 +31,8 @@ bool clang::isStackNearlyExhausted() { if (!BottomOfStack) return false; - intptr_t StackDiff = (intptr_t)getStackPointer() - (intptr_t)BottomOfStack; + intptr_t StackDiff = + (intptr_t)llvm::getStackPointer() - (intptr_t)BottomOfStack; size_t StackUsage = (size_t)std::abs(StackDiff); // If the stack pointer has a surprising value, we do not understand this @@ -66,9 +47,12 @@ bool clang::isStackNearlyExhausted() { void clang::runWithSufficientStackSpaceSlow(llvm::function_ref Diag, llvm::function_ref Fn) { llvm::CrashRecoveryContext CRC; - CRC.RunSafelyOnThread([&] { - noteBottomOfStack(); + // Preserve the BottomOfStack in case RunSafelyOnNewStack uses split stacks. + uintptr_t PrevBottom = BottomOfStack; + CRC.RunSafelyOnNewStack([&] { + noteBottomOfStack(true); Diag(); Fn(); }, DesiredStackSize); + BottomOfStack = PrevBottom; } diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp index 243e0a3c15b05..5fe80fc16482e 100644 --- a/clang/lib/Frontend/CompilerInstance.cpp +++ b/clang/lib/Frontend/CompilerInstance.cpp @@ -1265,7 +1265,7 @@ bool CompilerInstance::compileModule(SourceLocation ImportLoc, // Execute the action to actually build the module in-place. Use a separate // thread so that we get a stack large enough. 
- bool Crashed = !llvm::CrashRecoveryContext().RunSafelyOnThread( + bool Crashed = !llvm::CrashRecoveryContext().RunSafelyOnNewStack( [&]() { GenerateModuleFromModuleMapAction Action; Instance.ExecuteAction(Action); diff --git a/llvm/include/llvm/Support/CrashRecoveryContext.h b/llvm/include/llvm/Support/CrashRecoveryContext.h index 26ddf97b3ef02..31293d6715757 100644 --- a/llvm/include/llvm/Support/CrashRecoveryContext.h +++ b/llvm/include/llvm/Support/CrashRecoveryContext.h @@ -97,6 +97,9 @@ class CrashRecoveryContext { return RunSafelyOnThread([&]() { Fn(UserData); }, RequestedStackSize); } + bool RunSafelyOnNewStack(function_ref, + unsigned RequestedStackSize = 0); + /// Explicitly trigger a crash recovery in the current process, and /// return failure from RunSafely(). This function does not return. [[noreturn]] void HandleExit(int RetCode); diff --git a/llvm/include/llvm/Support/ProgramStack.h b/llvm/include/llvm/Support/ProgramStack.h new file mode 100644 index 0000000000000..3ce5de1c0e0d6 --- /dev/null +++ b/llvm/include/llvm/Support/ProgramStack.h @@ -0,0 +1,62 @@ +//===--- ProgramStack.h -----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_PROGRAMSTACK_H +#define LLVM_SUPPORT_PROGRAMSTACK_H + +#include "llvm/ADT/STLFunctionalExtras.h" + +// LLVM_HAS_SPLIT_STACKS is exposed in the header because CrashRecoveryContext +// needs to know if it's running on another thread or not. +// +// Currently only Apple AArch64 is known to support split stacks in the debugger +// and other tooling. 
+#if defined(__APPLE__) && defined(__aarch64__) && \ + LLVM_HAS_CPP_ATTRIBUTE(gnu::naked) && __has_extension(gnu_asm) +# define LLVM_HAS_SPLIT_STACKS +# define LLVM_HAS_SPLIT_STACKS_AARCH64 +#endif + +namespace llvm { + +/// \returns an address close to the current value of the stack pointer. +/// +/// The value is not guaranteed to point to anything specific. It can be used to +/// estimate how much stack space has been used since the previous call. +uintptr_t getStackPointer(); + +/// \returns the default stack size for this platform. +/// +/// Based on \p RLIMIT_STACK or the equivalent. +unsigned getDefaultStackSize(); + +/// Runs Fn on a new stack of at least the given size. +/// +/// \param StackSize requested stack size. A size of 0 uses the default stack +/// size of the platform. +/// +/// The preferred implementation is split stacks on platforms that have a good +/// debugging experience for them. On other platforms a new thread is used. +void runOnNewStack(unsigned StackSize, function_ref Fn); + +template +R runOnNewStack(unsigned StackSize, function_ref Fn, Ts &&...Args) { + std::optional Ret; + runOnNewStack(StackSize, [&]() { Ret = Fn(std::forward(Args)...); }); + return std::move(*Ret); +} + +template +void runOnNewStack(unsigned StackSize, function_ref Fn, + Ts &&...Args) { + runOnNewStack(StackSize, [&]() { Fn(std::forward(Args)...); }); +} + +} // namespace llvm + +#endif // LLVM_SUPPORT_PROGRAMSTACK_H diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt index 98ffd829d80b8..def37f3f278d0 100644 --- a/llvm/lib/Support/CMakeLists.txt +++ b/llvm/lib/Support/CMakeLists.txt @@ -295,6 +295,7 @@ add_llvm_component_library(LLVMSupport Path.cpp Process.cpp Program.cpp + ProgramStack.cpp RWMutex.cpp Signals.cpp Threading.cpp diff --git a/llvm/lib/Support/CrashRecoveryContext.cpp b/llvm/lib/Support/CrashRecoveryContext.cpp index f53aea177d612..88c38d7526e71 100644 --- a/llvm/lib/Support/CrashRecoveryContext.cpp +++ 
b/llvm/lib/Support/CrashRecoveryContext.cpp @@ -10,6 +10,7 @@ #include "llvm/Config/llvm-config.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/ExitCodes.h" +#include "llvm/Support/ProgramStack.h" #include "llvm/Support/Signals.h" #include "llvm/Support/thread.h" #include @@ -523,3 +524,13 @@ bool CrashRecoveryContext::RunSafelyOnThread(function_ref Fn, CRC->setSwitchedThread(); return Info.Result; } + +bool CrashRecoveryContext::RunSafelyOnNewStack(function_ref Fn, + unsigned RequestedStackSize) { +#ifdef LLVM_HAS_SPLIT_STACKS + return runOnNewStack(RequestedStackSize, + function_ref([&]() { return RunSafely(Fn); })); +#else + return RunSafelyOnThread(Fn, RequestedStackSize); +#endif +} diff --git a/llvm/lib/Support/ProgramStack.cpp b/llvm/lib/Support/ProgramStack.cpp new file mode 100644 index 0000000000000..9e5a546b34974 --- /dev/null +++ b/llvm/lib/Support/ProgramStack.cpp @@ -0,0 +1,114 @@ +//===--- RunOnNewStack.cpp - Crash Recovery -------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/ProgramStack.h" +#include "llvm/Config/config.h" +#include "llvm/Support/Compiler.h" + +#ifdef LLVM_ON_UNIX +# include // for getrlimit +#endif + +#ifdef _MSC_VER +# include // for _AddressOfReturnAddress +#endif + +#ifndef LLVM_HAS_SPLIT_STACKS +# include "llvm/Support/thread.h" +#endif + +using namespace llvm; + +uintptr_t llvm::getStackPointer() { +#if __GNUC__ || __has_builtin(__builtin_frame_address) + return (uintptr_t)__builtin_frame_address(0); +#elif defined(_MSC_VER) + return (uintptr_t)_AddressOfReturnAddress(); +#else + volatile char CharOnStack = 0; + // The volatile store here is intended to escape the local variable, to + // prevent the compiler from optimizing CharOnStack into anything other + // than a char on the stack. + // + // Tested on: MSVC 2015 - 2019, GCC 4.9 - 9, Clang 3.2 - 9, ICC 13 - 19. + char *volatile Ptr = &CharOnStack; + return (uintptr_t)Ptr; +#endif +} + +unsigned llvm::getDefaultStackSize() { +#ifdef LLVM_ON_UNIX + rlimit RL; + getrlimit(RLIMIT_STACK, &RL); + return RL.rlim_cur; +#else + // Clang recursively parses, instantiates templates, and evaluates constant + // expressions. We've found 8MiB to be a reasonable stack size given the way + // Clang works and the way C++ is commonly written. 
+ return 8 << 20; +#endif +} + +namespace { +#ifdef LLVM_HAS_SPLIT_STACKS_AARCH64 +[[gnu::naked]] void runOnNewStackImpl(void *Stack, void (*Fn)(void *), + void *Ctx) { + __asm__ volatile( + "mov x16, sp\n\t" + "sub x0, x0, #0x20\n\t" // subtract space from stack + "stp xzr, x16, [x0, #0x00]\n\t" // save old sp + "stp x29, x30, [x0, #0x10]\n\t" // save fp, lr + "mov sp, x0\n\t" // switch to new stack + "add x29, x0, #0x10\n\t" // switch to new frame + ".cfi_def_cfa w29, 16\n\t" + ".cfi_offset w30, -8\n\t" // lr + ".cfi_offset w29, -16\n\t" // fp + + "mov x0, x2\n\t" // Ctx is the only argument + "blr x1\n\t" // call Fn + + "ldp x29, x30, [sp, #0x10]\n\t" // restore fp, lr + "ldp xzr, x16, [sp, #0x00]\n\t" // load old sp + "mov sp, x16\n\t" + "ret" + ); +} +#endif + +#ifdef LLVM_HAS_SPLIT_STACKS +void callback(void *Ctx) { + (*reinterpret_cast *>(Ctx))(); +} +#endif +} // namespace + +#ifdef LLVM_HAS_SPLIT_STACKS +void llvm::runOnNewStack(unsigned StackSize, function_ref Fn) { + if (StackSize == 0) + StackSize = getDefaultStackSize(); + + // We use malloc here instead of mmap because: + // - it's simpler, + // - many malloc implementations will reuse the allocation in cases where + // we're bouncing accross the edge of a stack boundry, and + // - many malloc implemenations will already provide guard pages for + // allocations this large. + void *Stack = malloc(StackSize); + void *BottomOfStack = (char *)Stack + StackSize; + + runOnNewStackImpl(BottomOfStack, callback, &Fn); + + free(Stack); +} +#else +void llvm::runOnNewStack(unsigned StackSize, function_ref Fn) { + llvm::thread Thread( + StackSize == 0 ? 
std::nullopt : std::optional(StackSize), Fn); + Thread.join(); +} +#endif diff --git a/llvm/unittests/Support/CMakeLists.txt b/llvm/unittests/Support/CMakeLists.txt index 6c4e7cb689b20..e5bf820fb4d1c 100644 --- a/llvm/unittests/Support/CMakeLists.txt +++ b/llvm/unittests/Support/CMakeLists.txt @@ -70,6 +70,7 @@ add_llvm_unittest(SupportTests PerThreadBumpPtrAllocatorTest.cpp ProcessTest.cpp ProgramTest.cpp + ProgramStackTest.cpp RecyclerTest.cpp RegexTest.cpp ReverseIterationTest.cpp diff --git a/llvm/unittests/Support/ProgramStackTest.cpp b/llvm/unittests/Support/ProgramStackTest.cpp new file mode 100644 index 0000000000000..31dfb3b88ade6 --- /dev/null +++ b/llvm/unittests/Support/ProgramStackTest.cpp @@ -0,0 +1,35 @@ +//===- unittest/Support/ProgramStackTest.cpp ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/ProgramStack.h" +#include "llvm/Support/Process.h" +#include "gtest/gtest.h" + +using namespace llvm; + +static uintptr_t func(int &A) { + A = 7; + return getStackPointer(); +} + +static void func2(int &A) { + A = 5; +} + +TEST(ProgramStackTest, runOnNewStack) { + int A = 0; + uintptr_t Stack = runOnNewStack(0, function_ref(func), A); + EXPECT_EQ(A, 7); + intptr_t StackDiff = (intptr_t)llvm::getStackPointer() - (intptr_t)Stack; + size_t StackDistance = (size_t)std::abs(StackDiff); + // Page size is used as it's large enough to guarantee were not on the same + // stack but not too large to cause spurious failures. 
+ EXPECT_GT(StackDistance, llvm::sys::Process::getPageSizeEstimate()); + runOnNewStack(0, function_ref(func2), A); + EXPECT_EQ(A, 5); +} From 429a84f8a4bf559f43f50072747ef49d3e3b2cf1 Mon Sep 17 00:00:00 2001 From: Michael Spencer Date: Tue, 15 Apr 2025 11:34:26 -0700 Subject: [PATCH 022/710] [clang] Fix ambiguity in `runOnNewStack` --- llvm/include/llvm/Support/ProgramStack.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/include/llvm/Support/ProgramStack.h b/llvm/include/llvm/Support/ProgramStack.h index 3ce5de1c0e0d6..232a7b5670b44 100644 --- a/llvm/include/llvm/Support/ProgramStack.h +++ b/llvm/include/llvm/Support/ProgramStack.h @@ -45,7 +45,8 @@ unsigned getDefaultStackSize(); void runOnNewStack(unsigned StackSize, function_ref Fn); template -R runOnNewStack(unsigned StackSize, function_ref Fn, Ts &&...Args) { +std::enable_if_t, R> +runOnNewStack(unsigned StackSize, function_ref Fn, Ts &&...Args) { std::optional Ret; runOnNewStack(StackSize, [&]() { Ret = Fn(std::forward(Args)...); }); return std::move(*Ret); From 227f4066befadf0d10eddb39947e35dbf820b1bb Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Tue, 15 Apr 2025 18:36:11 +0000 Subject: [PATCH 023/710] [gn build] Port d0c973a7a014 --- llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn | 1 + llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn | 1 + 2 files changed, 2 insertions(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn index 3a9f43b1070a7..0d2330cba6a7a 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn @@ -124,6 +124,7 @@ static_library("Support") { "Parallel.cpp", "PluginLoader.cpp", "PrettyStackTrace.cpp", + "ProgramStack.cpp", "RISCVAttributeParser.cpp", "RISCVAttributes.cpp", "RISCVISAUtils.cpp", diff --git a/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn index 
bf6a0b7523279..19418ad52147b 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn @@ -71,6 +71,7 @@ unittest("SupportTests") { "Path.cpp", "PerThreadBumpPtrAllocatorTest.cpp", "ProcessTest.cpp", + "ProgramStackTest.cpp", "ProgramTest.cpp", "RISCVAttributeParserTest.cpp", "RecyclerTest.cpp", From 13615f7b506a693783764da87dc80e97cf59b95c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Tue, 15 Apr 2025 11:56:35 -0700 Subject: [PATCH 024/710] [flang][openacc] Allow if clause on atomic directives (#135451) The new version of the OpenACC specification will allow the if clause on the atomic directives. Allow it in `ACC.td` and update the parse node and parser in flang to support it. OpenACC dialect will need to be updated to support it as well. --- flang/include/flang/Parser/parse-tree.h | 10 +++--- flang/lib/Parser/openacc-parsers.cpp | 25 ++++++++------- .../Semantics/OpenACC/acc-atomic-validity.f90 | 32 +++++++++++++++++++ llvm/include/llvm/Frontend/OpenACC/ACC.td | 1 + 4 files changed, 53 insertions(+), 15 deletions(-) diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h index eeb438991feee..0c2a5de3b71d2 100644 --- a/flang/include/flang/Parser/parse-tree.h +++ b/flang/include/flang/Parser/parse-tree.h @@ -5244,21 +5244,23 @@ EMPTY_CLASS(AccEndAtomic); // ACC ATOMIC READ struct AccAtomicRead { TUPLE_CLASS_BOILERPLATE(AccAtomicRead); - std::tuple, std::optional> + std::tuple, + std::optional> t; }; // ACC ATOMIC WRITE struct AccAtomicWrite { TUPLE_CLASS_BOILERPLATE(AccAtomicWrite); - std::tuple, std::optional> + std::tuple, + std::optional> t; }; // ACC ATOMIC UPDATE struct AccAtomicUpdate { TUPLE_CLASS_BOILERPLATE(AccAtomicUpdate); - std::tuple, Statement, + std::tuple, AccClauseList, Statement, 
std::optional> t; }; @@ -5268,7 +5270,7 @@ struct AccAtomicCapture { TUPLE_CLASS_BOILERPLATE(AccAtomicCapture); WRAPPER_CLASS(Stmt1, Statement); WRAPPER_CLASS(Stmt2, Statement); - std::tuple t; + std::tuple t; }; struct OpenACCAtomicConstruct { diff --git a/flang/lib/Parser/openacc-parsers.cpp b/flang/lib/Parser/openacc-parsers.cpp index fb731ee52cbba..072eba99826a1 100644 --- a/flang/lib/Parser/openacc-parsers.cpp +++ b/flang/lib/Parser/openacc-parsers.cpp @@ -187,22 +187,25 @@ TYPE_PARSER(construct( // 2.12 Atomic constructs TYPE_PARSER(construct(startAccLine >> "END ATOMIC"_tok)) -TYPE_PARSER("ATOMIC" >> - construct(verbatim("READ"_tok) / endAccLine, - statement(assignmentStmt), maybe(Parser{} / endAccLine))) +TYPE_PARSER("ATOMIC" >> construct(verbatim("READ"_tok), + Parser{} / endAccLine, + statement(assignmentStmt), + maybe(Parser{} / endAccLine))) -TYPE_PARSER("ATOMIC" >> - construct(verbatim("WRITE"_tok) / endAccLine, - statement(assignmentStmt), maybe(Parser{} / endAccLine))) +TYPE_PARSER("ATOMIC" >> construct(verbatim("WRITE"_tok), + Parser{} / endAccLine, + statement(assignmentStmt), + maybe(Parser{} / endAccLine))) TYPE_PARSER("ATOMIC" >> - construct(maybe(verbatim("UPDATE"_tok)) / endAccLine, - statement(assignmentStmt), maybe(Parser{} / endAccLine))) + construct(maybe(verbatim("UPDATE"_tok)), + Parser{} / endAccLine, statement(assignmentStmt), + maybe(Parser{} / endAccLine))) TYPE_PARSER("ATOMIC" >> - construct(verbatim("CAPTURE"_tok) / endAccLine, - statement(assignmentStmt), statement(assignmentStmt), - Parser{} / endAccLine)) + construct(verbatim("CAPTURE"_tok), + Parser{} / endAccLine, statement(assignmentStmt), + statement(assignmentStmt), Parser{} / endAccLine)) TYPE_PARSER( sourced(construct(Parser{})) || diff --git a/flang/test/Semantics/OpenACC/acc-atomic-validity.f90 b/flang/test/Semantics/OpenACC/acc-atomic-validity.f90 index ba68031b0f18b..07fb864695737 100644 --- a/flang/test/Semantics/OpenACC/acc-atomic-validity.f90 +++ 
b/flang/test/Semantics/OpenACC/acc-atomic-validity.f90 @@ -10,6 +10,7 @@ program openacc_atomic_validity integer :: i integer, parameter :: N = 256 integer, dimension(N) :: c + logical :: l !$acc parallel @@ -23,27 +24,58 @@ program openacc_atomic_validity !$acc atomic write c(i) = 10 + !$acc atomic write if(l) + c(i) = 10 + !$acc atomic write c(i) = 10 !$acc end atomic + !$acc atomic write if(.true.) + c(i) = 10 + !$acc end atomic + !$acc atomic read i = c(i) + + !$acc atomic read if(.true.) + i = c(i) !$acc atomic read i = c(i) !$acc end atomic + !$acc atomic read if(l) + i = c(i) + !$acc end atomic + + !ERROR: FINALIZE clause is not allowed on the ATOMIC READ FINALIZE IF(L) + !$acc atomic read finalize if(l) + i = c(i) + !$acc end atomic + !$acc atomic capture c(i) = i i = i + 1 !$acc end atomic + !$acc atomic capture if(l .EQV. .false.) + c(i) = i + i = i + 1 + !$acc end atomic + !$acc atomic update !ERROR: RHS of atomic update statement must be scalar !ERROR: LHS of atomic update statement must be scalar c = c + 1 + !$acc atomic update if(i == 0) + c(i) = c(i) + 1 + + !ERROR: At most one IF clause can appear on the ATOMIC UPDATE IF(I == 0) IF(.TRUE.) + !$acc atomic update if(i == 0) if(.true.) 
+ c(i) = c(i) + 1 + !$acc end parallel end program openacc_atomic_validity diff --git a/llvm/include/llvm/Frontend/OpenACC/ACC.td b/llvm/include/llvm/Frontend/OpenACC/ACC.td index 2acee9bcc7195..e1a4183785d1f 100644 --- a/llvm/include/llvm/Frontend/OpenACC/ACC.td +++ b/llvm/include/llvm/Frontend/OpenACC/ACC.td @@ -270,6 +270,7 @@ def ACCC_Unknown : Clause<"unknown"> { // 2.12 def ACC_Atomic : Directive<"atomic"> { + let allowedOnceClauses = [VersionedClause]; let association = AS_Block; let category = CA_Executable; } From 8f25e43055058a6a16bf44573feb37a9ce51dc1a Mon Sep 17 00:00:00 2001 From: AdityaK Date: Tue, 15 Apr 2025 12:11:24 -0700 Subject: [PATCH 025/710] [NFC] Rename hasSameElementsOrSplat to hasSameNumElementsOrSplat (#133183) Makes it less confusing as this function only matches the number of elements --- mlir/lib/IR/BuiltinAttributes.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mlir/lib/IR/BuiltinAttributes.cpp b/mlir/lib/IR/BuiltinAttributes.cpp index daf79dc5de981..67d1ad927cacc 100644 --- a/mlir/lib/IR/BuiltinAttributes.cpp +++ b/mlir/lib/IR/BuiltinAttributes.cpp @@ -589,7 +589,7 @@ static APInt readBits(const char *rawData, size_t bitPos, size_t bitWidth) { /// Returns true if 'values' corresponds to a splat, i.e. one element, or has /// the same element count as 'type'. 
template -static bool hasSameElementsOrSplat(ShapedType type, const Values &values) { +static bool hasSameNumElementsOrSplat(ShapedType type, const Values &values) { return (values.size() == 1) || (type.getNumElements() == static_cast(values.size())); } @@ -901,7 +901,7 @@ bool DenseElementsAttr::classof(Attribute attr) { DenseElementsAttr DenseElementsAttr::get(ShapedType type, ArrayRef values) { - assert(hasSameElementsOrSplat(type, values)); + assert(hasSameNumElementsOrSplat(type, values)); Type eltType = type.getElementType(); @@ -985,7 +985,7 @@ DenseElementsAttr DenseElementsAttr::get(ShapedType type, DenseElementsAttr DenseElementsAttr::get(ShapedType type, ArrayRef values) { - assert(hasSameElementsOrSplat(type, values)); + assert(hasSameNumElementsOrSplat(type, values)); assert(type.getElementType().isInteger(1)); std::vector buff(llvm::divideCeil(values.size(), CHAR_BIT)); @@ -1020,7 +1020,7 @@ DenseElementsAttr DenseElementsAttr::get(ShapedType type, DenseElementsAttr DenseElementsAttr::get(ShapedType type, ArrayRef values) { assert(type.getElementType().isIntOrIndex()); - assert(hasSameElementsOrSplat(type, values)); + assert(hasSameNumElementsOrSplat(type, values)); size_t storageBitWidth = getDenseElementStorageWidth(type.getElementType()); return DenseIntOrFPElementsAttr::getRaw(type, storageBitWidth, values); } @@ -1028,7 +1028,7 @@ DenseElementsAttr DenseElementsAttr::get(ShapedType type, ArrayRef> values) { ComplexType complex = llvm::cast(type.getElementType()); assert(llvm::isa(complex.getElementType())); - assert(hasSameElementsOrSplat(type, values)); + assert(hasSameNumElementsOrSplat(type, values)); size_t storageBitWidth = getDenseElementStorageWidth(complex) / 2; ArrayRef intVals(reinterpret_cast(values.data()), values.size() * 2); @@ -1041,7 +1041,7 @@ DenseElementsAttr DenseElementsAttr::get(ShapedType type, DenseElementsAttr DenseElementsAttr::get(ShapedType type, ArrayRef values) { assert(llvm::isa(type.getElementType())); - 
assert(hasSameElementsOrSplat(type, values)); + assert(hasSameNumElementsOrSplat(type, values)); size_t storageBitWidth = getDenseElementStorageWidth(type.getElementType()); return DenseIntOrFPElementsAttr::getRaw(type, storageBitWidth, values); } @@ -1050,7 +1050,7 @@ DenseElementsAttr::get(ShapedType type, ArrayRef> values) { ComplexType complex = llvm::cast(type.getElementType()); assert(llvm::isa(complex.getElementType())); - assert(hasSameElementsOrSplat(type, values)); + assert(hasSameNumElementsOrSplat(type, values)); ArrayRef apVals(reinterpret_cast(values.data()), values.size() * 2); size_t storageBitWidth = getDenseElementStorageWidth(complex) / 2; From 9a1ece26126363c64c67d9a6e357076e814acf9e Mon Sep 17 00:00:00 2001 From: marius doerner Date: Tue, 15 Apr 2025 21:13:56 +0200 Subject: [PATCH 026/710] [clang] Clear `NeedsCleaning` flag after `ExpandBuiltinMacro` (#133574) After builtin macro expansion in `Preprocessor::ExpandBuiltinMacro` the result token may have the `Token::NeedsCleaning` flag set which causes an assertion failure later on when the lexer retrieves the spelling of the token in `getSpellingSlow`. This commit adds an `Tok.clearFlag(Token::NeedsCleaning)` call to the end of `ExpandBuiltinMacro`. 
Closes #128384 --- clang/docs/ReleaseNotes.rst | 2 + clang/lib/Lex/PPMacroExpansion.cpp | 1 + clang/test/Preprocessor/embed___has_embed.c | 19 +++ clang/test/Preprocessor/has_attribute.c | 20 ++++ clang/test/Preprocessor/has_attribute.cpp | 20 ++++ clang/test/Preprocessor/has_c_attribute.c | 20 ++++ clang/test/Preprocessor/has_include.c | 49 ++++++++ clang/test/Preprocessor/pr133574.c | 121 ++++++++++++++++++++ 8 files changed, 252 insertions(+) create mode 100644 clang/test/Preprocessor/pr133574.c diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 22543821ee4b3..6025e76029d19 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -404,6 +404,8 @@ Bug Fixes in This Version - Defining an integer literal suffix (e.g., ``LL``) before including ```` in a freestanding build no longer causes invalid token pasting when using the ``INTn_C`` macros. (#GH85995) +- Fixed an assertion failure in the expansion of builtin macros like ``__has_embed()`` with line breaks before the + closing paren. (#GH133574) - Clang no longer accepts invalid integer constants which are too large to fit into any (standard or extended) integer type when the constant is unevaluated. Merely forming the token is sufficient to render the program invalid. 
Code diff --git a/clang/lib/Lex/PPMacroExpansion.cpp b/clang/lib/Lex/PPMacroExpansion.cpp index 8e35d56d3f1a6..37ac1bf07e9c0 100644 --- a/clang/lib/Lex/PPMacroExpansion.cpp +++ b/clang/lib/Lex/PPMacroExpansion.cpp @@ -2089,6 +2089,7 @@ void Preprocessor::ExpandBuiltinMacro(Token &Tok) { CreateString(OS.str(), Tok, Tok.getLocation(), Tok.getLocation()); Tok.setFlagValue(Token::StartOfLine, IsAtStartOfLine); Tok.setFlagValue(Token::LeadingSpace, HasLeadingSpace); + Tok.clearFlag(Token::NeedsCleaning); } void Preprocessor::markMacroAsUsed(MacroInfo *MI) { diff --git a/clang/test/Preprocessor/embed___has_embed.c b/clang/test/Preprocessor/embed___has_embed.c index 43a3068b5f53a..2705b5ef6fd5b 100644 --- a/clang/test/Preprocessor/embed___has_embed.c +++ b/clang/test/Preprocessor/embed___has_embed.c @@ -58,3 +58,22 @@ unsigned char buffer[] = { #else #error 17 #endif + +#if __has_embed(__FILE__\ +) +#else +#error 18 +#endif + +#define F __FI\ +LE__ +#if __has_embed(F) +#else +#error 19 +#endif + +#if __has_embed(F\ +) +#else +#error 20 +#endif diff --git a/clang/test/Preprocessor/has_attribute.c b/clang/test/Preprocessor/has_attribute.c index 0ba664a53e649..6f6f519bd299d 100644 --- a/clang/test/Preprocessor/has_attribute.c +++ b/clang/test/Preprocessor/has_attribute.c @@ -68,3 +68,23 @@ int has_no_volatile_attribute(); int has_fallthrough; #endif // CHECK: int has_fallthrough; + +#if __has_attribute(F\ +) +int has_fallthrough_2; +#endif +// CHECK: int has_fallthrough_2; + +#define F_2 fall\ +through + +#if __has_attribute(F_2) +int has_fallthrough_3; +#endif +// CHECK: int has_fallthrough_3; + +#if __has_attribute(F_2\ +) +int has_fallthrough_4; +#endif +// CHECK: int has_fallthrough_4; diff --git a/clang/test/Preprocessor/has_attribute.cpp b/clang/test/Preprocessor/has_attribute.cpp index 00ec57615c84b..72af6de27e8bb 100644 --- a/clang/test/Preprocessor/has_attribute.cpp +++ b/clang/test/Preprocessor/has_attribute.cpp @@ -116,6 +116,26 @@ int funclike_1; int funclike_2; 
#endif // CHECK: int funclike_2; + +#if __has_cpp_attribute(CF\ +) +int has_clang_falthrough_5; +#endif +// CHECK: int has_clang_falthrough_5; + +#define CF_2 clang::\ +fallthrough + +#if __has_cpp_attribute(CF_2) +int has_clang_falthrough_6; +#endif +// CHECK: int has_clang_falthrough_6; + +#if __has_cpp_attribute(CF_2\ +) +int has_clang_falthrough_7; +#endif +// CHECK: int has_clang_falthrough_7; } // Test for Microsoft __declspec attributes diff --git a/clang/test/Preprocessor/has_c_attribute.c b/clang/test/Preprocessor/has_c_attribute.c index 3332571d758c8..d8be13f5898a9 100644 --- a/clang/test/Preprocessor/has_c_attribute.c +++ b/clang/test/Preprocessor/has_c_attribute.c @@ -84,3 +84,23 @@ int funclike_1; int funclike_2; #endif // CHECK: int funclike_2; + +#if __has_c_attribute(CL\ +) +int has_clang_likely_5; +#endif +// CHECK: int has_clang_likely_5; + +#define CL_2 clang::\ +likely + +#if __has_c_attribute(CL_2) +int has_clang_likely_6; +#endif +// CHECK: int has_clang_likely_6; + +#if __has_c_attribute(CL_2\ +) +int has_clang_likely_7; +#endif +// CHECK: int has_clang_likely_7; diff --git a/clang/test/Preprocessor/has_include.c b/clang/test/Preprocessor/has_include.c index c95025d83860a..ff199bf23063f 100644 --- a/clang/test/Preprocessor/has_include.c +++ b/clang/test/Preprocessor/has_include.c @@ -197,3 +197,52 @@ __has_include #ifdef FOO #elif __has_include() #endif + +#if __has_include(\ +) +#else + #error "__has_include failed (10)." +#endif + +#define MACRO6 +#if __has_include(MACRO6\ +) +#else + #error "__has_include failed (11)." +#endif + +#if __has_include_next(/*expected-warning {{#include_next in primary source file}}*/\ +) +#else + #error "__has_include_next failed (9)." +#endif + +#if __has_include_next(MACRO6/*expected-warning {{#include_next in primary source file}}*/\ +) +#else + #error "__has_include_next failed (10)." +#endif + +#define MACRO7 +#if __has_include(MACRO7) +#else + #error "__has_include failed (12)." 
+#endif + +#if __has_include(MACRO7\ +) +#else + #error "__has_include failed (13)." +#endif + +#if __has_include_next(MACRO7) //expected-warning {{#include_next in primary source file}} +#else + #error "__has_include_next failed (11)." +#endif + +#if __has_include_next(MACRO7/*expected-warning {{#include_next in primary source file}}*/\ +) +#else + #error "__has_include_next failed (12)." +#endif diff --git a/clang/test/Preprocessor/pr133574.c b/clang/test/Preprocessor/pr133574.c new file mode 100644 index 0000000000000..a34073e63b760 --- /dev/null +++ b/clang/test/Preprocessor/pr133574.c @@ -0,0 +1,121 @@ +// RUN: %clang_cc1 -E -verify %s +// expected-no-diagnostics + +#define DATE_LBR __D\ +ATE__ + +const char* test1(void) { + return __DATE\ +__; +} +const char* test2(void) { + return DATE_LBR; +} + +#define TIME_LBR __TIME_\ +_ + +const char* test3(void) { + return __TIM\ +E__; +} + +const char* test4(void) { + return TIME_LBR; +} + +#define LINE_LBR __LI\ +NE__ + +int test5(void) { + return _\ +_LINE__; +} + +int test6(void) { + return LINE_LBR; +} + +#define FILE_LBR __FI\ +LE__ + +const char* test7(void) { + return __\ +FILE__; +} + +const char* test8(void) { + return FILE_LBR; +} + +#define FILE_NAME_LBR __FILE_NA\ +ME__ + +const char* test9(void) { + return __FILE_NAM\ +E__; +} + +const char* test10(void) { + return FILE_NAME_LBR; +} + +#define BASE_FILE_LBR __BASE_FIL\ +E__ + +const char* test11(void) { + return __BASE_\ +FILE__; +} + +const char* test12(void) { + return BASE_FILE_LBR; +} + +#define INCLUDE_LEVEL_LBR __INCLUDE\ +_LEVEL__ + +int test13(void) { + return __IN\ +CLUDE_LEVEL__; +} + +int test14(void) { + return INCLUDE_LEVEL_LBR; +} + +#define TIMESTAMP_LBR __TIMESTA\ +MP__ + +const char* test15(void) { + return __TIMESTA\ +MP__; +} + +const char* test16(void) { + return TIMESTAMP_LBR; +} + +#define FLT_EVAL_METHOD_LBR __FLT_EVAL_METH\ +OD__ + +int test17(void) { + return __FL\ +T_EVAL_METHOD__; +} + +int test18(void) { + return 
FLT_EVAL_METHOD_LBR; +} + +#define COUNTER_LBR __COUNTE\ +R__ + +int test19(void) { + return _\ +_COUNTER__; +} + +int test20(void) { + return COUNTER_LBR; +} From b581bd3429b28420ff473f700fe96c18127a475d Mon Sep 17 00:00:00 2001 From: Scott Manley Date: Tue, 15 Apr 2025 14:21:19 -0500 Subject: [PATCH 027/710] [flang][OpenACC] use correct type when create private box init recipe (#135698) The recipe for initializing private box types was incorrect because hlfir::createTempFromMold() is not a suitable utility function when the box element type is a trivial type. --- flang/lib/Lower/OpenACC.cpp | 24 ++++++++++++------- flang/test/Lower/OpenACC/acc-private.f90 | 30 ++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 8 deletions(-) diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp index 3dd35ed9ae481..c83e277b996f3 100644 --- a/flang/lib/Lower/OpenACC.cpp +++ b/flang/lib/Lower/OpenACC.cpp @@ -522,13 +522,17 @@ static void genPrivateLikeInitRegion(mlir::OpBuilder &builder, RecipeOp recipe, mlir::Type ty, mlir::Location loc) { mlir::Value retVal = recipe.getInitRegion().front().getArgument(0); ty = fir::unwrapRefType(ty); - if (fir::isa_trivial(ty)) { + + auto getDeclareOpForType = [&](mlir::Type ty) -> hlfir::DeclareOp { auto alloca = builder.create(loc, ty); - auto declareOp = builder.create( + return builder.create( loc, alloca, accPrivateInitName, /*shape=*/nullptr, llvm::ArrayRef{}, /*dummy_scope=*/nullptr, fir::FortranVariableFlagsAttr{}); - retVal = declareOp.getBase(); + }; + + if (fir::isa_trivial(ty)) { + retVal = getDeclareOpForType(ty).getBase(); } else if (auto seqTy = mlir::dyn_cast_or_null(ty)) { if (fir::isa_trivial(seqTy.getEleTy())) { mlir::Value shape; @@ -552,12 +556,16 @@ static void genPrivateLikeInitRegion(mlir::OpBuilder &builder, RecipeOp recipe, } } else if (auto boxTy = mlir::dyn_cast_or_null(ty)) { mlir::Type innerTy = fir::unwrapRefType(boxTy.getEleTy()); - if (!fir::isa_trivial(innerTy) && !mlir::isa(innerTy)) + 
if (fir::isa_trivial(innerTy)) { + retVal = getDeclareOpForType(ty).getBase(); + } else if (mlir::isa(innerTy)) { + fir::FirOpBuilder firBuilder{builder, recipe.getOperation()}; + hlfir::Entity source = hlfir::Entity{retVal}; + auto [temp, cleanup] = hlfir::createTempFromMold(loc, firBuilder, source); + retVal = temp; + } else { TODO(loc, "Unsupported boxed type in OpenACC privatization"); - fir::FirOpBuilder firBuilder{builder, recipe.getOperation()}; - hlfir::Entity source = hlfir::Entity{retVal}; - auto [temp, cleanup] = hlfir::createTempFromMold(loc, firBuilder, source); - retVal = temp; + } } builder.create(loc, retVal); } diff --git a/flang/test/Lower/OpenACC/acc-private.f90 b/flang/test/Lower/OpenACC/acc-private.f90 index 50c7a258bb567..356bb9d825d8e 100644 --- a/flang/test/Lower/OpenACC/acc-private.f90 +++ b/flang/test/Lower/OpenACC/acc-private.f90 @@ -87,6 +87,13 @@ ! CHECK: acc.yield %[[DECLARE]]#0 : !fir.box> ! CHECK: } +! CHECK-LABEL: @privatization_ref_box_heap_i32 : !fir.ref>> init { +! CHECK: ^bb0(%arg0: !fir.ref>>): +! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.box> +! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.private.init"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) +! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref>> +! CHECK: } + ! CHECK-LABEL: acc.private.recipe @privatization_ref_box_heap_Uxi32 : !fir.ref>>> init { ! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref>>>): ! CHECK: %[[LOADBOX:.*]] = fir.load %[[ARG0]] : !fir.ref>>> @@ -292,6 +299,29 @@ subroutine acc_private_allocatable_array(a, n) ! CHECK: acc.loop {{.*}} private({{.*}}@privatization_ref_box_heap_Uxi32 -> %[[PRIVATE]] : !fir.ref>>>) ! 
CHECK: acc.serial private(@privatization_ref_box_heap_Uxi32 -> %{{.*}} : !fir.ref>>>) +subroutine acc_private_allocatable_scalar(b, a, n) + integer :: a(n) + integer, allocatable :: b + integer :: i, n + + !$acc parallel loop private(b) + do i = 1, n + a(i) = b + end do + + !$acc serial private(b) + a(i) = b + !$acc end serial +end subroutine + +! CHECK-LABEL: func.func @_QPacc_private_allocatable_scalar( +! CHECK-SAME: %[[ARG0:.*]]: !fir.ref>> {fir.bindc_name = "b"} +! CHECK: %[[DECLA_B:.*]]:2 = hlfir.declare %arg0 dummy_scope %0 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFacc_private_allocatable_scalarEb"} : (!fir.ref>>, !fir.dscope) -> (!fir.ref>>, !fir.ref>>) +! CHECK: acc.parallel {{.*}} { +! CHECK: %[[PRIVATE:.*]] = acc.private varPtr(%[[DECLA_B]]#0 : !fir.ref>>) -> !fir.ref>> {name = "b"} +! CHECK: acc.loop {{.*}} private({{.*}}@privatization_ref_box_heap_i32 -> %[[PRIVATE]] : !fir.ref>>) +! CHECK: acc.serial private(@privatization_ref_box_heap_i32 -> %{{.*}} : !fir.ref>>) { + subroutine acc_private_pointer_array(a, n) integer, pointer :: a(:) integer :: i, n From bd9c5112c750f07df2886562b227d9e2aeb338c8 Mon Sep 17 00:00:00 2001 From: Tai Ly Date: Tue, 15 Apr 2025 14:36:42 -0500 Subject: [PATCH 028/710] [mlir][tosa] Add error_if checks for Transpose (#135219) This adds missing error_if checking for Transpose Op also moved all transpose op's verifier tests from invalid.mlir to verifier.mlir Signed-off-by: Tai Ly --- mlir/lib/Dialect/Tosa/IR/TosaOps.cpp | 48 ++++++---- mlir/test/Dialect/Tosa/invalid.mlir | 112 ------------------------ mlir/test/Dialect/Tosa/verifier.mlir | 126 +++++++++++++++++++++++++++ 3 files changed, 155 insertions(+), 131 deletions(-) create mode 100644 mlir/test/Dialect/Tosa/verifier.mlir diff --git a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp index 9579d71a2afe9..bce5b226635f3 100644 --- a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp +++ b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp @@ -1981,23 +1981,28 @@ 
LogicalResult tosa::TransposeOp::verify() { .failed()) { return failure(); } - TensorType inputType = getInput1().getType(); - TensorType outputType = getOutput().getType(); + + const ShapeAdaptor inputShape(getInput1().getType()); + const ShapeAdaptor outputShape(getOutput().getType()); + const llvm::ArrayRef constantPerms = getPerms(); - if (inputType.hasRank() && - constantPerms.size() != static_cast(inputType.getRank())) + if (inputShape.hasRank() && + constantPerms.size() != static_cast(inputShape.getRank())) return emitOpError() << "expected perms attribute to have size " - << inputType.getRank() << " (input rank) but got size " + << inputShape.getRank() + << " (input rank) but got size " << constantPerms.size(); - if (inputType.hasRank() && outputType.hasRank() && - inputType.getRank() != outputType.getRank()) + + if (inputShape.hasRank() && outputShape.hasRank() && + inputShape.getRank() != outputShape.getRank()) return emitOpError() << "expected input tensor rank to equal result tensor rank"; - if (outputType.hasRank() && - constantPerms.size() != static_cast(outputType.getRank())) + + if (outputShape.hasRank() && + constantPerms.size() != static_cast(outputShape.getRank())) return emitOpError() << "expected perms attribute to have size " - << outputType.getRank() + << outputShape.getRank() << " (output rank) but got size " << constantPerms.size(); @@ -2010,22 +2015,27 @@ LogicalResult tosa::TransposeOp::verify() { constantPerms, [](int32_t v) -> int64_t { return v; })))) return emitOpError() << "expected valid permutation indices"; + // ERROR_IF(tensor_size(shape1) != tensor_size(shape)) + if (inputShape.hasStaticShape() && outputShape.hasStaticShape() && + inputShape.getNumElements() != outputShape.getNumElements()) + return emitOpError() << "expected input1 and output to have same numbers " + "of elements, got " + << inputShape.getNumElements() << " and " + << outputShape.getNumElements(); + // Verify that the types of the input and output tensors are 
properly // permuted. - if (inputType.hasRank() && outputType.hasRank()) { - assert(constantPerms.size() == static_cast(inputType.getRank()) && - inputType.getRank() == outputType.getRank()); - - for (auto i = 0; i < outputType.getRank(); i++) { - if (inputType.isDynamicDim(constantPerms[i]) || - outputType.isDynamicDim(i)) + if (inputShape.hasRank() && outputShape.hasRank()) { + for (auto i = 0; i < outputShape.getRank(); i++) { + if (inputShape.isDynamicDim(constantPerms[i]) || + outputShape.isDynamicDim(i)) continue; - if (inputType.getDimSize(constantPerms[i]) != outputType.getDimSize(i)) + if (inputShape.getDimSize(constantPerms[i]) != outputShape.getDimSize(i)) return emitOpError() << "expected output tensor dim " << i << " to match " << "input dim " << constantPerms[i] << " with value of " - << inputType.getDimSize(constantPerms[i]); + << inputShape.getDimSize(constantPerms[i]); } } diff --git a/mlir/test/Dialect/Tosa/invalid.mlir b/mlir/test/Dialect/Tosa/invalid.mlir index 8d8f4f562c4b6..c0b251885de5c 100644 --- a/mlir/test/Dialect/Tosa/invalid.mlir +++ b/mlir/test/Dialect/Tosa/invalid.mlir @@ -368,79 +368,6 @@ func.func @test_pad_padding_shape_mismatch(%arg0: tensor<13x21x3xf32>) -> tensor // ----- -func.func @test_transpose_io_rank_mismatch(%arg0: tensor<13x21x3xf32>, %arg1: tensor<3xi32>) -> tensor<3x13x21x1xf32> { - // expected-error@+1 {{'tosa.transpose' op expected input tensor rank to equal result tensor rank}} - %0 = tosa.transpose %arg0 {perms = array}: (tensor<13x21x3xf32>) -> tensor<3x13x21x1xf32> - return %0 : tensor<3x13x21x1xf32> -} - -// ----- - -func.func @test_transpose_rank0_perms() { - %14 = tensor.empty() : tensor<5x27xi64> - // expected-error@+1 {{'tosa.transpose' op expected perms attribute to have size 2 (input rank) but got size 0}} - %72 = tosa.transpose %14 {perms = array }: (tensor<5x27xi64>) -> tensor - return -} - -// ----- - -func.func @test_transpose_invalid_perms_size(%arg0: tensor<13x21x3xf32>) -> tensor<3x13x21xf32> { - // 
expected-error@+1 {{'tosa.transpose' op expected perms attribute to have size 3 (input rank) but got size 7}} - %0 = tosa.transpose %arg0 {perms = array }: (tensor<13x21x3xf32>) -> tensor<3x13x21xf32> - return %0 : tensor<3x13x21xf32> -} - -// ----- - -func.func @test_transpose_invalid_permutation_tensor(%arg0: tensor<13x21x3xf32>) -> tensor { - // expected-error@+1 {{'tosa.transpose' op expected valid permutation indices}} - %0 = tosa.transpose %arg0 {perms = array }: (tensor<13x21x3xf32>) -> tensor - return %0 : tensor -} - -// ----- - -func.func @test_transpose_invalid_permutation_negative(%arg0: tensor<3x2xi32>) -> tensor<*xi32> { - // expected-error@+1 {{'tosa.transpose' op expected valid permutation indices}} - %1 = tosa.transpose %arg0 {perms = array }: (tensor<3x2xi32>) -> tensor<*xi32> - return %1 : tensor<*xi32> -} - -// ----- - -func.func @test_transpose_invalid_permutation_tensor_above_range(%arg0: tensor<3x2xi32>) -> tensor<*xi32> { - // expected-error@+1 {{'tosa.transpose' op expected valid permutation indices}} - %1 = tosa.transpose %arg0 {perms = array }: (tensor<3x2xi32>) -> tensor<*xi32> - return %1 : tensor<*xi32> -} - -// ----- - -func.func @test_transpose_invalid_permutation_types(%arg0: tensor<3x2xi32>) -> tensor<3x4xi32> { - // expected-error@+1 {{'tosa.transpose' op expected output tensor dim 0 to match input dim 1 with value of 2}} - %1 = tosa.transpose %arg0 {perms = array }: (tensor<3x2xi32>) -> tensor<3x4xi32> - return %1 : tensor<3x4xi32> -} - -// ----- - -func.func @test_transpose_invalid_permutation_types_dynamic_dim_ok(%arg0: tensor<2x?xi32>) -> tensor<3x4xi32> { - // expected-error@+1 {{'tosa.transpose' op expected output tensor dim 1 to match input dim 0 with value of 2}} - %1 = tosa.transpose %arg0 {perms = array }: (tensor<2x?xi32>) -> tensor<3x4xi32> - return %1 : tensor<3x4xi32> -} - -// ----- - -func.func @test_transpose_element_type_mismatch(%arg0: tensor<2x3xi32>) -> tensor<3x2xf32> { - // expected-error@+1 
{{'tosa.transpose' op failed to verify that all of {input1, output} have same element type}} - %1 = tosa.transpose %arg0 {perms = array} : (tensor<2x3xi32>) -> tensor<3x2xf32> - return %1 : tensor<3x2xf32> -} - -// ----- - func.func @test_reduce_sum_type_mismatch(%arg0 : tensor<2x3x4x5xf32>) -> () { // expected-error@+2 {{failed to infer returned types}} // expected-error@+1 {{'tosa.reduce_sum' op inferred type(s) 'tensor<1x3x4x5xf32>' are incompatible with return type(s) of operation 'tensor<1x3x4x5xi32>'}} @@ -783,37 +710,6 @@ func.func @test_tile_io_rank_mismatch() { return } -// ----- - -// CHECK-LABEL: @test_invalid_constant_permutation -func.func @test_invalid_constant_permutation() { - %0 = tensor.empty() : tensor<3x4x5xi32> - // expected-error@+1 {{'tosa.transpose' op expected valid permutation indices}} - %2 = tosa.transpose %0 {perms = array}: (tensor<3x4x5xi32>) -> tensor<3x4x5xi32> - return -} - -// ----- - -// CHECK-LABEL: test_rank_size_constant_permutation -func.func @test_rank_size_constant_permutation() { - %0 = arith.constant 6 : index - %2 = tensor.empty(%0) : tensor - // expected-error@+1 {{'tosa.transpose' op expected valid permutation indices}} - %3 = tosa.transpose %2 {perms = array}: (tensor) -> tensor - return -} - -// ----- - -// CHECK-LABEL: test_large_constant_permutation -func.func @test_large_constant_permutation() { - %0 = arith.constant 6 : index - %2 = tensor.empty(%0) : tensor - // expected-error@+1 {{'tosa.transpose' op expected valid permutation indices}} - %3 = tosa.transpose %2 {perms = array}: (tensor) -> tensor - return -} // ----- @@ -2061,14 +1957,6 @@ func.func @test_scalar_tile(%arg0: tensor) -> tensor<*xf32> { // ----- -func.func @test_scalar_output_transpose(%arg0: tensor<*xf32>) -> tensor { - // expected-error@+1 {{'tosa.transpose' op result #0 must be tosa-conformant tensor of at least rank 1, but got 'tensor'}} - %1 = tosa.transpose %arg0 {perms = array} : (tensor<*xf32>) -> tensor - return %1 : tensor -} - -// ----- 
- // CHECK-LABEL: test_add_i1 func.func @test_add_i1(%arg0: tensor<13x21x1xi1>, %arg1: tensor<13x21x3xi1>) -> tensor<13x21x3xi1> { // expected-error@+1 {{'tosa.add' op illegal: operand/result data types not supported}} diff --git a/mlir/test/Dialect/Tosa/verifier.mlir b/mlir/test/Dialect/Tosa/verifier.mlir new file mode 100644 index 0000000000000..c49cbecd25c78 --- /dev/null +++ b/mlir/test/Dialect/Tosa/verifier.mlir @@ -0,0 +1,126 @@ +//-------------------------------------------------------------------------------------------------- +// Test expected errors generated by verifier checks. +//-------------------------------------------------------------------------------------------------- + +// RUN: mlir-opt %s -split-input-file -verify-diagnostics + +// ----- + +func.func @test_transpose_io_rank_mismatch(%arg0: tensor<13x21x3xf32>, %arg1: tensor<3xi32>) -> tensor<3x13x21x1xf32> { + // expected-error@+1 {{'tosa.transpose' op expected input tensor rank to equal result tensor rank}} + %0 = tosa.transpose %arg0 {perms = array}: (tensor<13x21x3xf32>) -> tensor<3x13x21x1xf32> + return %0 : tensor<3x13x21x1xf32> +} + +// ----- + +func.func @test_transpose_rank0_perms() { + %14 = tensor.empty() : tensor<5x27xi64> + // expected-error@+1 {{'tosa.transpose' op expected perms attribute to have size 2 (input rank) but got size 0}} + %72 = tosa.transpose %14 {perms = array }: (tensor<5x27xi64>) -> tensor + return +} + +// ----- + +func.func @test_transpose_invalid_perms_size(%arg0: tensor<13x21x3xf32>) -> tensor<3x13x21xf32> { + // expected-error@+1 {{'tosa.transpose' op expected perms attribute to have size 3 (input rank) but got size 7}} + %0 = tosa.transpose %arg0 {perms = array }: (tensor<13x21x3xf32>) -> tensor<3x13x21xf32> + return %0 : tensor<3x13x21xf32> +} + +// ----- + +func.func @test_transpose_invalid_permutation_tensor(%arg0: tensor<13x21x3xf32>) -> tensor { + // expected-error@+1 {{'tosa.transpose' op expected valid permutation indices}} + %0 = tosa.transpose 
%arg0 {perms = array }: (tensor<13x21x3xf32>) -> tensor + return %0 : tensor +} + +// ----- + +func.func @test_transpose_invalid_permutation_negative(%arg0: tensor<3x2xi32>) -> tensor<*xi32> { + // expected-error@+1 {{'tosa.transpose' op expected valid permutation indices}} + %1 = tosa.transpose %arg0 {perms = array }: (tensor<3x2xi32>) -> tensor<*xi32> + return %1 : tensor<*xi32> +} + +// ----- + +func.func @test_transpose_invalid_permutation_tensor_above_range(%arg0: tensor<3x2xi32>) -> tensor<*xi32> { + // expected-error@+1 {{'tosa.transpose' op expected valid permutation indices}} + %1 = tosa.transpose %arg0 {perms = array }: (tensor<3x2xi32>) -> tensor<*xi32> + return %1 : tensor<*xi32> +} + +// ----- + +func.func @test_transpose_invalid_num_elements(%arg0: tensor<3x2xi32>) -> tensor<3x4xi32> { + // expected-error@+1 {{'tosa.transpose' op expected input1 and output to have same numbers of elements, got 6 and 12}} + %1 = tosa.transpose %arg0 {perms = array }: (tensor<3x2xi32>) -> tensor<3x4xi32> + return %1 : tensor<3x4xi32> +} + +// ----- + +func.func @test_transpose_invalid_permutation_types(%arg0: tensor<3x2xi32>) -> tensor<3x2xi32> { + // expected-error@+1 {{'tosa.transpose' op expected output tensor dim 0 to match input dim 1 with value of 2}} + %1 = tosa.transpose %arg0 {perms = array }: (tensor<3x2xi32>) -> tensor<3x2xi32> + return %1 : tensor<3x2xi32> +} + +// ----- + +func.func @test_transpose_invalid_permutation_types_dynamic_dim_ok(%arg0: tensor<2x?xi32>) -> tensor<3x4xi32> { + // expected-error@+1 {{'tosa.transpose' op expected output tensor dim 1 to match input dim 0 with value of 2}} + %1 = tosa.transpose %arg0 {perms = array }: (tensor<2x?xi32>) -> tensor<3x4xi32> + return %1 : tensor<3x4xi32> +} + +// ----- + +func.func @test_transpose_element_type_mismatch(%arg0: tensor<2x3xi32>) -> tensor<3x2xf32> { + // expected-error@+1 {{'tosa.transpose' op failed to verify that all of {input1, output} have same element type}} + %1 = tosa.transpose %arg0 
{perms = array} : (tensor<2x3xi32>) -> tensor<3x2xf32> + return %1 : tensor<3x2xf32> +} + +// ----- + +// CHECK-LABEL: @test_invalid_constant_permutation +func.func @test_invalid_constant_permutation() { + %0 = tensor.empty() : tensor<3x4x5xi32> + // expected-error@+1 {{'tosa.transpose' op expected valid permutation indices}} + %2 = tosa.transpose %0 {perms = array}: (tensor<3x4x5xi32>) -> tensor<3x4x5xi32> + return +} + +// ----- + +// CHECK-LABEL: test_rank_size_constant_permutation +func.func @test_rank_size_constant_permutation() { + %0 = arith.constant 6 : index + %2 = tensor.empty(%0) : tensor + // expected-error@+1 {{'tosa.transpose' op expected valid permutation indices}} + %3 = tosa.transpose %2 {perms = array}: (tensor) -> tensor + return +} + +// ----- + +// CHECK-LABEL: test_large_constant_permutation +func.func @test_large_constant_permutation() { + %0 = arith.constant 6 : index + %2 = tensor.empty(%0) : tensor + // expected-error@+1 {{'tosa.transpose' op expected valid permutation indices}} + %3 = tosa.transpose %2 {perms = array}: (tensor) -> tensor + return +} + +// ----- + +func.func @test_scalar_output_transpose(%arg0: tensor<*xf32>) -> tensor { + // expected-error@+1 {{'tosa.transpose' op result #0 must be tosa-conformant tensor of at least rank 1, but got 'tensor'}} + %1 = tosa.transpose %arg0 {perms = array} : (tensor<*xf32>) -> tensor + return %1 : tensor +} From 96064e1b516aba4d7cbea2ab183b20b19b7eea86 Mon Sep 17 00:00:00 2001 From: Tai Ly Date: Tue, 15 Apr 2025 14:36:53 -0500 Subject: [PATCH 029/710] [mlir][tosa] Add table size check for Table Op (#135262) Add table size check for Table Op and add lit tests to error_if_check.mlir also corrected some existing tests that violated the table size checks Signed-off-by: Tai Ly --- .../Tosa/Transforms/TosaValidation.cpp | 24 ++++++++++++++++++- mlir/test/Dialect/Tosa/dynamic_extension.mlir | 4 ++-- mlir/test/Dialect/Tosa/error_if_check.mlir | 16 +++++++++++++ 
mlir/test/Dialect/Tosa/invalid_extension.mlir | 4 ++-- 4 files changed, 43 insertions(+), 5 deletions(-) diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp index 11eb0d969d78b..ef9d27f8df0ad 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp +++ b/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp @@ -1012,8 +1012,30 @@ bool checkErrorIfMul(Operation *op) { return true; } +bool checkErrorIfTable(Operation *op) { + auto table = dyn_cast(op); + if (!table) + return true; + + // REQUIRE(length(table) == TABLE_SIZE) where TABLE_SIZE is 256 or 513 + const auto inputElemType = getElementTypeOrSelf(table.getInput1().getType()); + const int tableSize = inputElemType.isInteger(8) ? 256 : 513; + + const ShapeAdaptor tableShape(table.getTable().getType()); + if (tableShape.hasStaticShape()) { + const auto numElements = tableShape.getNumElements(); + if (numElements != tableSize) { + op->emitOpError() << "requires table size of " << tableSize << ", got " + << numElements; + return false; + } + } + + return true; +} + LogicalResult TosaValidation::applyErrorIfCheck(Operation *op) { - if (!checkErrorIfResize(op) || !checkErrorIfMul(op)) + if (!checkErrorIfResize(op) || !checkErrorIfMul(op) || !checkErrorIfTable(op)) return failure(); return success(); } diff --git a/mlir/test/Dialect/Tosa/dynamic_extension.mlir b/mlir/test/Dialect/Tosa/dynamic_extension.mlir index 0ec46022157d7..25e1aa195c3a0 100644 --- a/mlir/test/Dialect/Tosa/dynamic_extension.mlir +++ b/mlir/test/Dialect/Tosa/dynamic_extension.mlir @@ -13,8 +13,8 @@ func.func @test_mul_non_const(%arg0: tensor<13x21x3xi8>, %arg1: tensor<13x1x3xi8 // ----- -func.func @test_table_non_const(%arg0 : tensor<4x5xi8>, %arg1 : tensor<513xi8>) -> () { - %0 = tosa.table %arg0, %arg1 : (tensor<4x5xi8>, tensor<513xi8>) -> tensor<4x5xi8> +func.func @test_table_non_const(%arg0 : tensor<4x5xi8>, %arg1 : tensor<256xi8>) -> () { + %0 = tosa.table %arg0, %arg1 : 
(tensor<4x5xi8>, tensor<256xi8>) -> tensor<4x5xi8> return } diff --git a/mlir/test/Dialect/Tosa/error_if_check.mlir b/mlir/test/Dialect/Tosa/error_if_check.mlir index f7ca0faa8bc9e..65a69be91e0c8 100644 --- a/mlir/test/Dialect/Tosa/error_if_check.mlir +++ b/mlir/test/Dialect/Tosa/error_if_check.mlir @@ -113,3 +113,19 @@ func.func @test_mul_non_zero_shift(%arg0: tensor<1x8x8x8xi16>, %arg1: tensor<1x8 %mul = tosa.mul %arg0, %arg1, %shift : (tensor<1x8x8x8xi16>, tensor<1x8x8x8xi16>, tensor<1xi8>) -> tensor<1x8x8x8xi32> return %mul : tensor<1x8x8x8xi32> } + +// ----- +// CHECK-LABEL: test_i16_table_size +func.func @test_i16_table_size(%arg0: tensor<2x64xi16>, %arg1: tensor<256xi16>) -> tensor<2x64xi32> { + // expected-error@+1 {{'tosa.table' op requires table size of 513, got 256}} + %0 = tosa.table %arg0, %arg1 : (tensor<2x64xi16>, tensor<256xi16>) -> tensor<2x64xi32> + return %0 : tensor<2x64xi32> +} + +// ----- +// CHECK-LABEL: test_i8_table_size +func.func @test_i8_table_size(%arg0: tensor<2x64xi8>, %arg1: tensor<513xi8>) -> tensor<2x64xi8> { + // expected-error@+1 {{'tosa.table' op requires table size of 256, got 513}} + %0 = tosa.table %arg0, %arg1 : (tensor<2x64xi8>, tensor<513xi8>) -> tensor<2x64xi8> + return %0 : tensor<2x64xi8> +} diff --git a/mlir/test/Dialect/Tosa/invalid_extension.mlir b/mlir/test/Dialect/Tosa/invalid_extension.mlir index 241e603e91c61..7386b1ba9df99 100644 --- a/mlir/test/Dialect/Tosa/invalid_extension.mlir +++ b/mlir/test/Dialect/Tosa/invalid_extension.mlir @@ -497,9 +497,9 @@ func.func @test_mul_non_const(%arg0: tensor<13x21x3xi8>, %arg1: tensor<13x1x3xi8 // ----- -func.func @test_table_non_const(%arg0 : tensor<4x5xi8>, %arg1 : tensor<513xi8>) -> () { +func.func @test_table_non_const(%arg0 : tensor<4x5xi8>, %arg1 : tensor<256xi8>) -> () { // expected-error@+1 {{'tosa.table' op expected compile time resolvable constant, but got variable value for operand #1}} - %0 = tosa.table %arg0, %arg1 : (tensor<4x5xi8>, tensor<513xi8>) -> 
tensor<4x5xi8> + %0 = tosa.table %arg0, %arg1 : (tensor<4x5xi8>, tensor<256xi8>) -> tensor<4x5xi8> return } From 4f64c80d5a23c244f942193e58ecac666c173308 Mon Sep 17 00:00:00 2001 From: Michael Spencer Date: Tue, 15 Apr 2025 12:39:00 -0700 Subject: [PATCH 030/710] [llvm] Add missing include for !LLVM_ENABLE_THREADS thread.h used report_fatal_error without including ErrorHandling.h --- llvm/include/llvm/Support/thread.h | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/include/llvm/Support/thread.h b/llvm/include/llvm/Support/thread.h index e3005fdb63175..ef2fba822cb1c 100644 --- a/llvm/include/llvm/Support/thread.h +++ b/llvm/include/llvm/Support/thread.h @@ -213,6 +213,7 @@ inline thread::id get_id() { return std::this_thread::get_id(); } #else // !LLVM_ENABLE_THREADS +#include "llvm/Support/ErrorHandling.h" #include namespace llvm { From e6e56f5b6a80c6ce55630d6075475cb363afb149 Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Tue, 15 Apr 2025 12:45:18 -0700 Subject: [PATCH 031/710] [MemProf] Handle recursion during stack node update (#135837) If we are replacing a sequence of stack nodes with a single node representing inlined IR, and the stack id sequence contains recursion, we may have already removed some edges. Handle this case correctly by skipping the now removed edge. --- .../Transforms/IPO/MemProfContextDisambiguation.cpp | 7 ++++++- .../MemProfContextDisambiguation/inlined2.ll | 11 +++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index a5e0251277d8f..561c01cb01f82 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -1711,7 +1711,12 @@ void CallsiteContextGraph:: // edge from the prior node. 
if (PrevNode) { auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode); - assert(PrevEdge); + // If the sequence contained recursion, we might have already removed + // some edges during the connectNewNode calls above. + if (!PrevEdge) { + PrevNode = CurNode; + continue; + } set_subtract(PrevEdge->getContextIds(), SavedContextIds); if (PrevEdge->getContextIds().empty()) removeEdgeFromGraph(PrevEdge); diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/inlined2.ll b/llvm/test/Transforms/MemProfContextDisambiguation/inlined2.ll index cc97b5290e25a..2cc655e927d12 100644 --- a/llvm/test/Transforms/MemProfContextDisambiguation/inlined2.ll +++ b/llvm/test/Transforms/MemProfContextDisambiguation/inlined2.ll @@ -40,6 +40,9 @@ ;; in the input IR to ensure that the MIB call chain is matched to the longer ;; inline sequences from main. ;; +;; Update: the inlined sequence of callsite ids was manually modified to +;; include some recursion, which reproduced an error before it was fixed. +;; ;; The IR was then reduced using llvm-reduce with the expected FileCheck input. 
; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \ @@ -96,13 +99,13 @@ attributes #7 = { builtin } !6 = !{i32 7, !"frame-pointer", i32 2} !7 = !{!8, !10} !8 = !{!9, !"notcold"} -!9 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} +!9 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -5964873800580613432, i64 8632435727821051414} !10 = !{!11, !"cold"} -!11 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} +!11 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -5964873800580613432, i64 -3421689549917153178} !12 = !{i64 9086428284934609951} !13 = !DIBasicType(name: "char", size: 8, encoding: DW_ATE_signed_char) -!14 = !{i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} -!15 = !{i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} +!14 = !{i64 -5964873800580613432, i64 2732490490862098848, i64 -5964873800580613432, i64 8632435727821051414} +!15 = !{i64 -5964873800580613432, i64 2732490490862098848, i64 -5964873800580613432, i64 -3421689549917153178} ; DUMP: CCG before cloning: From a3283a92aea147e89d9d404fa7c8500223c7c22a Mon Sep 17 00:00:00 2001 From: Akira Hatanaka Date: Tue, 15 Apr 2025 12:54:25 -0700 Subject: [PATCH 032/710] [PAC] Add support for __ptrauth type qualifier (#100830) The qualifier allows programmer to directly control how pointers are signed when they are stored in a particular variable. The qualifier takes three arguments: the signing key, a flag specifying whether address discrimination should be used, and a non-negative integer that is used for additional discrimination. 
``` typedef void (*my_callback)(const void*); my_callback __ptrauth(ptrauth_key_process_dependent_code, 1, 0xe27a) callback; ``` Co-Authored-By: John McCall rjmccall@apple.com --- clang/docs/PointerAuthentication.rst | 46 ++ clang/docs/ReleaseNotes.rst | 2 + clang/include/clang/AST/Type.h | 21 +- clang/include/clang/Basic/Attr.td | 8 + clang/include/clang/Basic/AttrDocs.td | 28 + .../clang/Basic/DiagnosticParseKinds.td | 3 + .../clang/Basic/DiagnosticSemaKinds.td | 43 +- clang/include/clang/Basic/Features.def | 1 + clang/include/clang/Basic/TokenKinds.def | 1 + clang/include/clang/Parse/Parser.h | 2 + clang/include/clang/Sema/Sema.h | 11 + clang/lib/AST/ASTContext.cpp | 1 + clang/lib/AST/DeclCXX.cpp | 27 + clang/lib/AST/ItaniumMangle.cpp | 20 + clang/lib/AST/MicrosoftMangle.cpp | 13 + clang/lib/AST/TypePrinter.cpp | 40 + clang/lib/CodeGen/CGClass.cpp | 3 + clang/lib/CodeGen/CGDebugInfo.cpp | 19 +- clang/lib/CodeGen/CGDecl.cpp | 12 +- clang/lib/CodeGen/CGExpr.cpp | 94 ++- clang/lib/CodeGen/CGExprConstant.cpp | 23 +- clang/lib/CodeGen/CGExprScalar.cpp | 62 ++ clang/lib/CodeGen/CGPointerAuth.cpp | 129 +++ clang/lib/CodeGen/CodeGenFunction.h | 26 +- clang/lib/Parse/ParseDecl.cpp | 52 ++ clang/lib/Sema/SemaCast.cpp | 19 +- clang/lib/Sema/SemaChecking.cpp | 57 ++ clang/lib/Sema/SemaDecl.cpp | 13 +- clang/lib/Sema/SemaDeclCXX.cpp | 38 + clang/lib/Sema/SemaExpr.cpp | 14 + clang/lib/Sema/SemaExprCXX.cpp | 5 + clang/lib/Sema/SemaObjCProperty.cpp | 3 + clang/lib/Sema/SemaOverload.cpp | 15 + clang/lib/Sema/SemaType.cpp | 79 ++ clang/lib/Sema/TreeTransform.h | 11 + clang/test/AST/ast-dump-ptrauth-json.cpp | 3 + clang/test/CodeGen/ptrauth-debuginfo.c | 35 + .../CodeGen/ptrauth-qualifier-const-init.c | 86 ++ .../test/CodeGen/ptrauth-qualifier-function.c | 145 ++++ .../CodeGen/ptrauth-qualifier-loadstore.c | 745 ++++++++++++++++++ .../CodeGenCXX/mangle-itanium-ptrauth.cpp | 12 + clang/test/CodeGenCXX/mangle-ms-ptrauth.cpp | 17 + .../CodeGenCXX/ptrauth-qualifier-struct.cpp | 168 
++++ .../CodeGenObjCXX/ptrauth-struct-cxx-abi.mm | 35 + clang/test/Parser/ptrauth-qualifier.c | 18 + clang/test/Preprocessor/ptrauth_extension.c | 13 + clang/test/Sema/ptrauth-atomic-ops.c | 118 +++ clang/test/Sema/ptrauth-qualifier.c | 103 +++ clang/test/SemaCXX/ptrauth-qualifier.cpp | 213 +++++ .../SemaCXX/ptrauth-template-parameters.cpp | 29 + clang/test/SemaObjC/ptrauth-qualifier.m | 56 ++ libcxxabi/test/test_demangle.pass.cpp | 3 + .../include/llvm/Demangle/MicrosoftDemangle.h | 8 + .../llvm/Demangle/MicrosoftDemangleNodes.h | 22 +- llvm/lib/Demangle/MicrosoftDemangle.cpp | 48 +- llvm/lib/Demangle/MicrosoftDemangleNodes.cpp | 10 + llvm/test/Demangle/ms-ptrauth.test | 12 + 57 files changed, 2801 insertions(+), 39 deletions(-) create mode 100644 clang/test/CodeGen/ptrauth-debuginfo.c create mode 100644 clang/test/CodeGen/ptrauth-qualifier-const-init.c create mode 100644 clang/test/CodeGen/ptrauth-qualifier-function.c create mode 100644 clang/test/CodeGen/ptrauth-qualifier-loadstore.c create mode 100644 clang/test/CodeGenCXX/mangle-itanium-ptrauth.cpp create mode 100644 clang/test/CodeGenCXX/mangle-ms-ptrauth.cpp create mode 100644 clang/test/CodeGenCXX/ptrauth-qualifier-struct.cpp create mode 100644 clang/test/CodeGenObjCXX/ptrauth-struct-cxx-abi.mm create mode 100644 clang/test/Parser/ptrauth-qualifier.c create mode 100644 clang/test/Preprocessor/ptrauth_extension.c create mode 100644 clang/test/Sema/ptrauth-atomic-ops.c create mode 100644 clang/test/Sema/ptrauth-qualifier.c create mode 100644 clang/test/SemaCXX/ptrauth-qualifier.cpp create mode 100644 clang/test/SemaCXX/ptrauth-template-parameters.cpp create mode 100644 clang/test/SemaObjC/ptrauth-qualifier.m create mode 100644 llvm/test/Demangle/ms-ptrauth.test diff --git a/clang/docs/PointerAuthentication.rst b/clang/docs/PointerAuthentication.rst index 68674f318c84f..41818d43ac687 100644 --- a/clang/docs/PointerAuthentication.rst +++ b/clang/docs/PointerAuthentication.rst @@ -280,6 +280,52 @@ a number of 
different tests. normal interface. This may be true even on targets where pointer authentication is not enabled by default. +__ptrauth Qualifier +^^^^^^^^^^^^^^^^^^^ + +``__ptrauth(key, address, discriminator)`` is an extended type +qualifier which causes so-qualified objects to hold pointers signed using the +specified schema rather than the default schema for such types. + +In the current implementation in Clang, the qualified type must be a C pointer +type, either to a function or to an object. It currently cannot be an +Objective-C pointer type, a C++ reference type, or a block pointer type; these +restrictions may be lifted in the future. + +The qualifier's operands are as follows: + +- ``key`` - an expression evaluating to a key value from ````; must + be a constant expression + +- ``address`` - whether to use address diversity (1) or not (0); must be + a constant expression with one of these two values + +- ``discriminator`` - a constant discriminator; must be a constant expression + +See `Discriminators`_ for more information about discriminators. + +Currently the operands must be constant-evaluable even within templates. In the +future this restriction may be lifted to allow value-dependent expressions as +long as they instantiate to a constant expression. + +Consistent with the ordinary C/C++ rule for parameters, top-level ``__ptrauth`` +qualifiers on a parameter (after parameter type adjustment) are ignored when +deriving the type of the function. The parameter will be passed using the +default ABI for the unqualified pointer type. 
+ +If ``x`` is an object of type ``__ptrauth(key, address, discriminator) T``, +then the signing schema of the value stored in ``x`` is a key of ``key`` and +a discriminator determined as follows: + +- if ``address`` is 0, then the discriminator is ``discriminator``; + +- if ``address`` is 1 and ``discriminator`` is 0, then the discriminator is + ``&x``; otherwise + +- if ``address`` is 1 and ``discriminator`` is non-zero, then the discriminator + is ``ptrauth_blend_discriminator(&x, discriminator)``; see + `ptrauth_blend_discriminator`_. + ```` ~~~~~~~~~~~~~~~ diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 6025e76029d19..38142ad32bea0 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -552,6 +552,8 @@ Arm and AArch64 Support ARM targets, however this will now disable NEON instructions being generated. The ``simd`` option is also now printed when the ``--print-supported-extensions`` option is used. +- Support for __ptrauth type qualifier has been added. 
+ Android Support ^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index 74886ef0cd824..5bf036e3347eb 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -312,6 +312,12 @@ class PointerAuthQualifier { return Result; } + std::string getAsString() const; + std::string getAsString(const PrintingPolicy &Policy) const; + + bool isEmptyWhenPrinted(const PrintingPolicy &Policy) const; + void print(raw_ostream &OS, const PrintingPolicy &Policy) const; + void Profile(llvm::FoldingSetNodeID &ID) const { ID.AddInteger(Data); } }; @@ -562,7 +568,7 @@ class Qualifiers { bool hasAddressSpace() const { return Mask & AddressSpaceMask; } LangAS getAddressSpace() const { - return static_cast(Mask >> AddressSpaceShift); + return static_cast((Mask & AddressSpaceMask) >> AddressSpaceShift); } bool hasTargetSpecificAddressSpace() const { return isTargetAddressSpace(getAddressSpace()); @@ -803,6 +809,9 @@ class Qualifiers { static_assert(sizeof(PointerAuthQualifier) == sizeof(uint32_t), "PointerAuthQualifier must be 32 bits"); + static constexpr uint64_t PtrAuthShift = 32; + static constexpr uint64_t PtrAuthMask = UINT64_C(0xffffffff) << PtrAuthShift; + static constexpr uint64_t UMask = 0x8; static constexpr uint64_t UShift = 3; static constexpr uint64_t GCAttrMask = 0x30; @@ -810,10 +819,8 @@ class Qualifiers { static constexpr uint64_t LifetimeMask = 0x1C0; static constexpr uint64_t LifetimeShift = 6; static constexpr uint64_t AddressSpaceMask = - ~(CVRMask | UMask | GCAttrMask | LifetimeMask); + ~(CVRMask | UMask | GCAttrMask | LifetimeMask | PtrAuthMask); static constexpr uint64_t AddressSpaceShift = 9; - static constexpr uint64_t PtrAuthShift = 32; - static constexpr uint64_t PtrAuthMask = uint64_t(0xffffffff) << PtrAuthShift; }; class QualifiersAndAtomic { @@ -1449,6 +1456,12 @@ class QualType { return getQualifiers().getPointerAuth(); } + bool hasAddressDiscriminatedPointerAuth() const { + if 
(PointerAuthQualifier PtrAuth = getPointerAuth()) + return PtrAuth.isAddressDiscriminated(); + return false; + } + enum PrimitiveDefaultInitializeKind { /// The type does not fall into any of the following categories. Note that /// this case is zero-valued so that values of this enum can be used as a diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index b7ad432738b29..9d4900f3029c8 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -3548,6 +3548,14 @@ def ObjCRequiresPropertyDefs : InheritableAttr { let SimpleHandler = 1; } +def PointerAuth : TypeAttr { + let Spellings = [CustomKeyword<"__ptrauth">]; + let Args = [IntArgument<"Key">, + BoolArgument<"AddressDiscriminated", 1>, + IntArgument<"ExtraDiscriminator", 1>]; + let Documentation = [PtrAuthDocs]; +} + def Unused : InheritableAttr { let Spellings = [CXX11<"", "maybe_unused", 201603>, GCC<"unused">, C23<"", "maybe_unused", 202106>]; diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 97a5f24d35d7d..76f805ef373dd 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -2179,6 +2179,34 @@ Also see the documentation for `@available }]; } +def PtrAuthDocs : Documentation { + let Category = DocCatVariable; + let Heading = "__ptrauth"; + let Content = [{ +The ``__ptrauth`` qualifier allows the programmer to directly control +how pointers are signed when they are stored in a particular variable. +This can be used to strengthen the default protections of pointer +authentication and make it more difficult for an attacker to escalate +an ability to alter memory into full control of a process. + +.. code-block:: c + + #include + + typedef void (*my_callback)(const void*); + my_callback __ptrauth(ptrauth_key_process_dependent_code, 1, 0xe27a) callback; + +The first argument to ``__ptrauth`` is the name of the signing key. 
+Valid key names for the target are defined in ````. + +The second argument to ``__ptrauth`` is a flag (0 or 1) specifying whether +the object should use address discrimination. + +The third argument to ``__ptrauth`` is a 16-bit non-negative integer which +allows additional discrimination between objects. + }]; +} + def ExternalSourceSymbolDocs : Documentation { let Category = DocCatDecl; let Content = [{ diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td index 7a3cac528a363..9975520f4f9ff 100644 --- a/clang/include/clang/Basic/DiagnosticParseKinds.td +++ b/clang/include/clang/Basic/DiagnosticParseKinds.td @@ -1721,6 +1721,9 @@ def warn_pragma_unroll_cuda_value_in_parens : Warning< "argument to '#pragma unroll' should not be in parentheses in CUDA C/C++">, InGroup; +def err_ptrauth_qualifier_bad_arg_count : Error< + "'__ptrauth' qualifier must take between 1 and 3 arguments">; + def warn_cuda_attr_lambda_position : Warning< "nvcc does not allow '__%0__' to appear after the parameter list in lambdas">, InGroup; diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index f4ab620ae61d2..3f7499d8656bd 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -1014,6 +1014,22 @@ def err_ptrauth_indirect_goto_addrlabel_arithmetic : Error< "%select{subtraction|addition}0 of address-of-label expressions is not " "supported with ptrauth indirect gotos">; +// __ptrauth qualifier +def err_ptrauth_qualifier_invalid : Error< + "%select{return type|parameter type|property}1 may not be qualified with '__ptrauth'; type is %0">; +def err_ptrauth_qualifier_cast : Error< + "cannot cast to '__ptrauth'-qualified type %0">; +def err_ptrauth_qualifier_nonpointer : Error< + "'__ptrauth' qualifier only applies to pointer types; %0 is invalid">; +def err_ptrauth_qualifier_redundant : Error< + "type %0 is 
already %1-qualified">; +def err_ptrauth_arg_not_ice : Error< + "argument to '__ptrauth' must be an integer constant expression">; +def err_ptrauth_address_discrimination_invalid : Error< + "invalid address discrimination flag '%0'; '__ptrauth' requires '0' or '1'">; +def err_ptrauth_extra_discriminator_invalid : Error< + "invalid extra discriminator flag '%0'; '__ptrauth' requires a value between '0' and '%1'">; + /// main() // static main() is not an error in C, just in C++. def warn_static_main : Warning<"'main' should not be declared static">, @@ -3923,7 +3939,8 @@ def note_cannot_use_trivial_abi_reason : Note< "its copy constructors and move constructors are all deleted|" "it is polymorphic|" "it has a base of a non-trivial class type|it has a virtual base|" - "it has a __weak field|it has a field of a non-trivial class type}1">; + "it has a __weak field|it has a field of a non-trivial class type|" + "it has an address-discriminated '__ptrauth' field}1">; // Availability attribute def warn_availability_unknown_platform : Warning< @@ -5021,6 +5038,10 @@ def note_ovl_candidate_bad_ownership : Note< "%select{no|__unsafe_unretained|__strong|__weak|__autoreleasing}4 ownership," " but parameter has %select{no|__unsafe_unretained|__strong|__weak|" "__autoreleasing}5 ownership">; +def note_ovl_candidate_bad_ptrauth : Note< + "candidate %sub{select_ovl_candidate_kind}0,1,2 not viable: " + "%ordinal8 argument (%3) has %select{no '__ptrauth'|%5}4 qualifier," + " but parameter has %select{no '__ptrauth'|%7}6 qualifier">; def note_ovl_candidate_bad_cvr_this : Note< "candidate %sub{select_ovl_candidate_kind}0,1,2 not viable: " "'this' argument has type %3, but method is not marked " @@ -6092,7 +6113,7 @@ def note_deleted_special_member_class_subobject : Note< "%select{default|corresponding|default|default|default}4 constructor}0|" "destructor}5" "%select{||s||}4" - "|is an ObjC pointer}6">; + "|is an ObjC pointer|has an address-discriminated '__ptrauth' qualifier}6">; def 
note_default_constructed_field : Note<"default constructed field %0 declared here">; def note_deleted_default_ctor_uninit_field : Note< @@ -8938,6 +8959,19 @@ def err_typecheck_incompatible_ownership : Error< "sending to parameter of different type}0,1" "|%diff{casting $ to type $|casting between types}0,1}2" " changes retain/release properties of pointer">; +def err_typecheck_incompatible_ptrauth : Error< + "%enum_select{%Assigning{%diff{assigning $ to $|assigning to different types}1,0}" + "|%Passing{%diff{passing $ to parameter of type $|" + "passing to parameter of different type}0,1}" + "|%Returning{%diff{returning $ from a function with result type $|" + "returning from function with different return type}0,1}" + "|%Converting{%diff{converting $ to type $|converting between types}0,1}" + "|%Initializing{%diff{initializing $ with an expression of type $|" + "initializing with expression of different type}0,1}" + "|%Sending{%diff{sending $ to parameter of type $|" + "sending to parameter of different type}0,1}" + "|%Casting{%diff{casting $ to type $|casting between types}0,1}}2" + " changes pointer authentication of pointee type">; def err_typecheck_comparison_of_distinct_blocks : Error< "comparison of distinct block types%diff{ ($ and $)|}0,1">; @@ -9066,6 +9100,9 @@ def err_atomic_op_needs_non_const_atomic : Error< def err_atomic_op_needs_non_const_pointer : Error< "address argument to atomic operation must be a pointer to non-const " "type (%0 invalid)">; +def err_atomic_op_needs_non_address_discriminated_pointer : Error< + "address argument to %select{atomic|__sync}0 operation must be a pointer to a non address discriminated " + "type (%1 invalid)">; def err_atomic_op_needs_trivial_copy : Error< "address argument to atomic operation must be a pointer to a " "trivially-copyable type (%0 invalid)">; @@ -9343,6 +9380,8 @@ def ext_typecheck_cond_pointer_integer_mismatch : ExtWarn< "pointer/integer type mismatch in conditional expression" "%diff{ ($ and 
$)|}0,1">, InGroup>; +def err_typecheck_cond_incompatible_ptrauth : Error< + "'__ptrauth' qualification mismatch%diff{ ($ and $)|}0,1">; def err_typecheck_choose_expr_requires_constant : Error< "'__builtin_choose_expr' requires a constant expression">; def warn_unused_expr : Warning<"expression result unused">, diff --git a/clang/include/clang/Basic/Features.def b/clang/include/clang/Basic/Features.def index b4409efaa9c04..14bff8a68846d 100644 --- a/clang/include/clang/Basic/Features.def +++ b/clang/include/clang/Basic/Features.def @@ -107,6 +107,7 @@ FEATURE(thread_sanitizer, LangOpts.Sanitize.has(SanitizerKind::Thread)) FEATURE(dataflow_sanitizer, LangOpts.Sanitize.has(SanitizerKind::DataFlow)) FEATURE(scudo, LangOpts.Sanitize.hasOneOf(SanitizerKind::Scudo)) FEATURE(ptrauth_intrinsics, LangOpts.PointerAuthIntrinsics) +EXTENSION(ptrauth_qualifier, LangOpts.PointerAuthIntrinsics) FEATURE(ptrauth_calls, LangOpts.PointerAuthCalls) FEATURE(ptrauth_returns, LangOpts.PointerAuthReturns) FEATURE(ptrauth_vtable_pointer_address_discrimination, LangOpts.PointerAuthVTPtrAddressDiscrimination) diff --git a/clang/include/clang/Basic/TokenKinds.def b/clang/include/clang/Basic/TokenKinds.def index 880928ae0447d..868e851342eb8 100644 --- a/clang/include/clang/Basic/TokenKinds.def +++ b/clang/include/clang/Basic/TokenKinds.def @@ -348,6 +348,7 @@ KEYWORD(_Thread_local , KEYALL) KEYWORD(__func__ , KEYALL) KEYWORD(__objc_yes , KEYALL) KEYWORD(__objc_no , KEYALL) +KEYWORD(__ptrauth , KEYALL) // C2y UNARY_EXPR_OR_TYPE_TRAIT(_Countof, CountOf, KEYNOCXX) diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index 53da6269a3b11..9ebcf144ba59e 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -3169,6 +3169,8 @@ class Parser : public CodeCompletionHandler { SourceLocation *endLoc = nullptr); ExprResult ParseExtIntegerArgument(); + void ParsePtrauthQualifier(ParsedAttributes &Attrs); + VirtSpecifiers::Specifier 
isCXX11VirtSpecifier(const Token &Tok) const; VirtSpecifiers::Specifier isCXX11VirtSpecifier() const { return isCXX11VirtSpecifier(Tok); diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 5ab0af8234e26..fe37fd7701ce3 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -3547,6 +3547,17 @@ class Sema final : public SemaBase { bool checkConstantPointerAuthKey(Expr *keyExpr, unsigned &key); + enum PointerAuthDiscArgKind { + // Address discrimination argument of __ptrauth. + PADAK_AddrDiscPtrAuth, + + // Extra discriminator argument of __ptrauth. + PADAK_ExtraDiscPtrAuth, + }; + + bool checkPointerAuthDiscriminatorArg(Expr *Arg, PointerAuthDiscArgKind Kind, + unsigned &IntVal); + /// Diagnose function specifiers on a declaration of an identifier that /// does not identify a function. void DiagnoseFunctionSpecifiers(const DeclSpec &DS); diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index dfae2f5511b43..c6ffe7bbf5257 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -11454,6 +11454,7 @@ QualType ASTContext::mergeTypes(QualType LHS, QualType RHS, bool OfBlockPointer, if (LQuals.getCVRQualifiers() != RQuals.getCVRQualifiers() || LQuals.getAddressSpace() != RQuals.getAddressSpace() || LQuals.getObjCLifetime() != RQuals.getObjCLifetime() || + !LQuals.getPointerAuth().isEquivalent(RQuals.getPointerAuth()) || LQuals.hasUnaligned() != RQuals.hasUnaligned()) return {}; diff --git a/clang/lib/AST/DeclCXX.cpp b/clang/lib/AST/DeclCXX.cpp index 36131d19cbcdf..4d07efd58f518 100644 --- a/clang/lib/AST/DeclCXX.cpp +++ b/clang/lib/AST/DeclCXX.cpp @@ -1118,6 +1118,33 @@ void CXXRecordDecl::addedMember(Decl *D) { } else if (!T.isCXX98PODType(Context)) data().PlainOldData = false; + // If a class has an address-discriminated signed pointer member, it is a + // non-POD type and its copy constructor, move constructor, copy assignment + // operator, move assignment 
operator are non-trivial. + if (PointerAuthQualifier Q = T.getPointerAuth()) { + if (Q.isAddressDiscriminated()) { + struct DefinitionData &Data = data(); + Data.PlainOldData = false; + Data.HasTrivialSpecialMembers &= + ~(SMF_CopyConstructor | SMF_MoveConstructor | SMF_CopyAssignment | + SMF_MoveAssignment); + setArgPassingRestrictions(RecordArgPassingKind::CanNeverPassInRegs); + + // Copy/move constructors/assignment operators of a union are deleted by + // default if it has an address-discriminated ptrauth field. + if (isUnion()) { + data().DefaultedCopyConstructorIsDeleted = true; + data().DefaultedMoveConstructorIsDeleted = true; + data().DefaultedCopyAssignmentIsDeleted = true; + data().DefaultedMoveAssignmentIsDeleted = true; + data().NeedOverloadResolutionForCopyConstructor = true; + data().NeedOverloadResolutionForMoveConstructor = true; + data().NeedOverloadResolutionForCopyAssignment = true; + data().NeedOverloadResolutionForMoveAssignment = true; + } + } + } + if (Field->hasAttr()) setHasUninitializedExplicitInitFields(true); diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index 140f29b431fc3..d0ab60700cb15 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -2895,6 +2895,26 @@ void CXXNameMangler::mangleQualifiers(Qualifiers Quals, const DependentAddressSp if (Quals.hasUnaligned()) mangleVendorQualifier("__unaligned"); + // __ptrauth. Note that this is parameterized. + if (PointerAuthQualifier PtrAuth = Quals.getPointerAuth()) { + mangleVendorQualifier("__ptrauth"); + // For now, since we only allow non-dependent arguments, we can just + // inline the mangling of those arguments as literals. We treat the + // key and extra-discriminator arguments as 'unsigned int' and the + // address-discriminated argument as 'bool'. 
+ Out << "I" + "Lj" + << PtrAuth.getKey() + << "E" + "Lb" + << unsigned(PtrAuth.isAddressDiscriminated()) + << "E" + "Lj" + << PtrAuth.getExtraDiscriminator() + << "E" + "E"; + } + // Remaining ARC ownership qualifiers. switch (Quals.getObjCLifetime()) { case Qualifiers::OCL_None: diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp index a6efd887d4e13..20bfb7f89625b 100644 --- a/clang/lib/AST/MicrosoftMangle.cpp +++ b/clang/lib/AST/MicrosoftMangle.cpp @@ -430,6 +430,7 @@ class MicrosoftCXXNameMangler { void mangleRefQualifier(RefQualifierKind RefQualifier); void manglePointerCVQualifiers(Qualifiers Quals); void manglePointerExtQualifiers(Qualifiers Quals, QualType PointeeType); + void manglePointerAuthQualifier(Qualifiers Quals); void mangleUnscopedTemplateName(GlobalDecl GD); void @@ -2340,6 +2341,17 @@ void MicrosoftCXXNameMangler::manglePointerExtQualifiers(Qualifiers Quals, Out << 'F'; } +void MicrosoftCXXNameMangler::manglePointerAuthQualifier(Qualifiers Quals) { + PointerAuthQualifier PointerAuth = Quals.getPointerAuth(); + if (!PointerAuth) + return; + + Out << "__ptrauth"; + mangleNumber(PointerAuth.getKey()); + mangleNumber(PointerAuth.isAddressDiscriminated()); + mangleNumber(PointerAuth.getExtraDiscriminator()); +} + void MicrosoftCXXNameMangler::manglePointerCVQualifiers(Qualifiers Quals) { // ::= P # no qualifiers // ::= Q # const @@ -3372,6 +3384,7 @@ void MicrosoftCXXNameMangler::mangleType(const PointerType *T, Qualifiers Quals, QualType PointeeType = T->getPointeeType(); manglePointerCVQualifiers(Quals); manglePointerExtQualifiers(Quals, PointeeType); + manglePointerAuthQualifier(Quals); // For pointer size address spaces, go down the same type mangling path as // non address space types. 
diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp index cc8c874140167..7b1d1c7ae2131 100644 --- a/clang/lib/AST/TypePrinter.cpp +++ b/clang/lib/AST/TypePrinter.cpp @@ -2016,6 +2016,7 @@ void TypePrinter::printAttributedAfter(const AttributedType *T, case attr::Ptr64: case attr::SPtr: case attr::UPtr: + case attr::PointerAuth: case attr::AddressSpace: case attr::CmseNSCall: case attr::AnnotateType: @@ -2512,6 +2513,33 @@ void clang::printTemplateArgumentList(raw_ostream &OS, printTo(OS, Args, InnerPolicy, TPL, /*isPack*/ false, /*parmIndex*/ 0); } +std::string PointerAuthQualifier::getAsString() const { + LangOptions LO; + return getAsString(PrintingPolicy(LO)); +} + +std::string PointerAuthQualifier::getAsString(const PrintingPolicy &P) const { + SmallString<64> Buf; + llvm::raw_svector_ostream StrOS(Buf); + print(StrOS, P); + return StrOS.str().str(); +} + +bool PointerAuthQualifier::isEmptyWhenPrinted(const PrintingPolicy &P) const { + return !isPresent(); +} + +void PointerAuthQualifier::print(raw_ostream &OS, + const PrintingPolicy &P) const { + if (!isPresent()) + return; + + OS << "__ptrauth("; + OS << getKey(); + OS << "," << unsigned(isAddressDiscriminated()) << "," + << getExtraDiscriminator() << ")"; +} + std::string Qualifiers::getAsString() const { LangOptions LO; return getAsString(PrintingPolicy(LO)); @@ -2541,6 +2569,10 @@ bool Qualifiers::isEmptyWhenPrinted(const PrintingPolicy &Policy) const { if (!(lifetime == Qualifiers::OCL_Strong && Policy.SuppressStrongLifetime)) return false; + if (PointerAuthQualifier PointerAuth = getPointerAuth(); + PointerAuth && !PointerAuth.isEmptyWhenPrinted(Policy)) + return false; + return true; } @@ -2651,6 +2683,14 @@ void Qualifiers::print(raw_ostream &OS, const PrintingPolicy& Policy, } } + if (PointerAuthQualifier PointerAuth = getPointerAuth()) { + if (addSpace) + OS << ' '; + addSpace = true; + + PointerAuth.print(OS, Policy); + } + if (appendSpaceIfNonEmpty && addSpace) OS << ' '; } 
diff --git a/clang/lib/CodeGen/CGClass.cpp b/clang/lib/CodeGen/CGClass.cpp index 1c73a5bf75f37..7176fe025b386 100644 --- a/clang/lib/CodeGen/CGClass.cpp +++ b/clang/lib/CodeGen/CGClass.cpp @@ -925,6 +925,9 @@ namespace { Qualifiers Qual = F->getType().getQualifiers(); if (Qual.hasVolatile() || Qual.hasObjCLifetime()) return false; + if (PointerAuthQualifier Q = F->getType().getPointerAuth(); + Q && Q.isAddressDiscriminated()) + return false; return true; } diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index e63341c180420..f3ec498d4064b 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -1059,8 +1059,23 @@ llvm::DIType *CGDebugInfo::CreateQualifiedType(QualType Ty, // additional ones. llvm::dwarf::Tag Tag = getNextQualifier(Qc); if (!Tag) { - assert(Qc.empty() && "Unknown type qualifier for debug info"); - return getOrCreateType(QualType(T, 0), Unit); + if (Qc.getPointerAuth()) { + unsigned Key = Qc.getPointerAuth().getKey(); + bool IsDiscr = Qc.getPointerAuth().isAddressDiscriminated(); + unsigned ExtraDiscr = Qc.getPointerAuth().getExtraDiscriminator(); + bool IsaPointer = Qc.getPointerAuth().isIsaPointer(); + bool AuthenticatesNullValues = + Qc.getPointerAuth().authenticatesNullValues(); + Qc.removePointerAuth(); + assert(Qc.empty() && "Unknown type qualifier for debug info"); + llvm::DIType *FromTy = getOrCreateType(QualType(T, 0), Unit); + return DBuilder.createPtrAuthQualifiedType(FromTy, Key, IsDiscr, + ExtraDiscr, IsaPointer, + AuthenticatesNullValues); + } else { + assert(Qc.empty() && "Unknown type qualifier for debug info"); + return getOrCreateType(QualType(T, 0), Unit); + } } auto *FromTy = getOrCreateType(Qc.apply(CGM.getContext(), T), Unit); diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp index db8dbf86eca4f..db34e2738b4cf 100644 --- a/clang/lib/CodeGen/CGDecl.cpp +++ b/clang/lib/CodeGen/CGDecl.cpp @@ -776,11 +776,17 @@ void 
CodeGenFunction::EmitScalarInit(const Expr *init, const ValueDecl *D, LValue lvalue, bool capturedByInit) { Qualifiers::ObjCLifetime lifetime = lvalue.getObjCLifetime(); if (!lifetime) { - llvm::Value *value = EmitScalarExpr(init); + llvm::Value *Value; + if (PointerAuthQualifier PtrAuth = lvalue.getQuals().getPointerAuth()) { + Value = EmitPointerAuthQualify(PtrAuth, init, lvalue.getAddress()); + lvalue.getQuals().removePointerAuth(); + } else { + Value = EmitScalarExpr(init); + } if (capturedByInit) drillIntoBlockVariable(*this, lvalue, cast(D)); - EmitNullabilityCheck(lvalue, value, init->getExprLoc()); - EmitStoreThroughLValue(RValue::get(value), lvalue, true); + EmitNullabilityCheck(lvalue, Value, init->getExprLoc()); + EmitStoreThroughLValue(RValue::get(Value), lvalue, true); return; } diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index 3da21cebd9d68..abb88477062fc 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -569,7 +569,15 @@ EmitMaterializeTemporaryExpr(const MaterializeTemporaryExpr *M) { // initialized it. if (!Var->hasInitializer()) { Var->setInitializer(CGM.EmitNullConstant(E->getType())); - EmitAnyExprToMem(E, Object, Qualifiers(), /*IsInit*/true); + QualType RefType = M->getType().withoutLocalFastQualifiers(); + if (RefType.getPointerAuth()) { + // Use the qualifier of the reference temporary to sign the pointer. + LValue LV = MakeRawAddrLValue(Object.getPointer(), RefType, + Object.getAlignment()); + EmitScalarInit(E, M->getExtendingDecl(), LV, false); + } else { + EmitAnyExprToMem(E, Object, Qualifiers(), /*IsInit*/ true); + } } } else { switch (M->getStorageDuration()) { @@ -1770,16 +1778,16 @@ static ConstantEmissionKind checkVarTypeForConstantEmission(QualType type) { /// for instance if a block or lambda or a member of a local class uses a /// const int variable or constexpr variable from an enclosing function. 
CodeGenFunction::ConstantEmission -CodeGenFunction::tryEmitAsConstant(DeclRefExpr *refExpr) { - ValueDecl *value = refExpr->getDecl(); +CodeGenFunction::tryEmitAsConstant(const DeclRefExpr *RefExpr) { + const ValueDecl *Value = RefExpr->getDecl(); // The value needs to be an enum constant or a constant variable. ConstantEmissionKind CEK; - if (isa(value)) { + if (isa(Value)) { CEK = CEK_None; - } else if (auto *var = dyn_cast(value)) { + } else if (const auto *var = dyn_cast(Value)) { CEK = checkVarTypeForConstantEmission(var->getType()); - } else if (isa(value)) { + } else if (isa(Value)) { CEK = CEK_AsValueOnly; } else { CEK = CEK_None; @@ -1792,15 +1800,15 @@ CodeGenFunction::tryEmitAsConstant(DeclRefExpr *refExpr) { // It's best to evaluate all the way as an r-value if that's permitted. if (CEK != CEK_AsReferenceOnly && - refExpr->EvaluateAsRValue(result, getContext())) { + RefExpr->EvaluateAsRValue(result, getContext())) { resultIsReference = false; - resultType = refExpr->getType(); + resultType = RefExpr->getType().getUnqualifiedType(); // Otherwise, try to evaluate as an l-value. } else if (CEK != CEK_AsValueOnly && - refExpr->EvaluateAsLValue(result, getContext())) { + RefExpr->EvaluateAsLValue(result, getContext())) { resultIsReference = true; - resultType = value->getType(); + resultType = Value->getType(); // Failure. } else { @@ -1819,7 +1827,7 @@ CodeGenFunction::tryEmitAsConstant(DeclRefExpr *refExpr) { // accessible on device. The DRE of the captured reference variable has to be // loaded from captures. if (CGM.getLangOpts().CUDAIsDevice && result.Val.isLValue() && - refExpr->refersToEnclosingVariableOrCapture()) { + RefExpr->refersToEnclosingVariableOrCapture()) { auto *MD = dyn_cast_or_null(CurCodeDecl); if (isLambdaMethod(MD) && MD->getOverloadedOperator() == OO_Call) { const APValue::LValueBase &base = result.Val.getLValueBase(); @@ -1834,17 +1842,17 @@ CodeGenFunction::tryEmitAsConstant(DeclRefExpr *refExpr) { } // Emit as a constant. 
- auto C = ConstantEmitter(*this).emitAbstract(refExpr->getLocation(), - result.Val, resultType); + llvm::Constant *C = ConstantEmitter(*this).emitAbstract( + RefExpr->getLocation(), result.Val, resultType); // Make sure we emit a debug reference to the global variable. // This should probably fire even for - if (isa(value)) { - if (!getContext().DeclMustBeEmitted(cast(value))) - EmitDeclRefExprDbgValue(refExpr, result.Val); + if (isa(Value)) { + if (!getContext().DeclMustBeEmitted(cast(Value))) + EmitDeclRefExprDbgValue(RefExpr, result.Val); } else { - assert(isa(value)); - EmitDeclRefExprDbgValue(refExpr, result.Val); + assert(isa(Value)); + EmitDeclRefExprDbgValue(RefExpr, result.Val); } // If we emitted a reference constant, we need to dereference that. @@ -2235,6 +2243,15 @@ RValue CodeGenFunction::EmitLoadOfAnyValue(LValue LV, AggValueSlot Slot, /// method emits the address of the lvalue, then loads the result as an rvalue, /// returning the rvalue. RValue CodeGenFunction::EmitLoadOfLValue(LValue LV, SourceLocation Loc) { + // Load from __ptrauth. + if (PointerAuthQualifier PtrAuth = LV.getQuals().getPointerAuth()) { + LV.getQuals().removePointerAuth(); + llvm::Value *Value = EmitLoadOfLValue(LV, Loc).getScalarVal(); + return RValue::get(EmitPointerAuthUnqualify(PtrAuth, Value, LV.getType(), + LV.getAddress(), + /*known nonnull*/ false)); + } + if (LV.isObjCWeak()) { // load of a __weak object. Address AddrWeakObj = LV.getAddress(); @@ -2490,6 +2507,13 @@ void CodeGenFunction::EmitStoreThroughLValue(RValue Src, LValue Dst, return EmitStoreThroughBitfieldLValue(Src, Dst); } + // Handle __ptrauth qualification by re-signing the value. + if (PointerAuthQualifier PointerAuth = Dst.getQuals().getPointerAuth()) { + Src = RValue::get(EmitPointerAuthQualify(PointerAuth, Src.getScalarVal(), + Dst.getType(), Dst.getAddress(), + /*known nonnull*/ false)); + } + // There's special magic for assigning into an ARC-qualified l-value. 
if (Qualifiers::ObjCLifetime Lifetime = Dst.getQuals().getObjCLifetime()) { switch (Lifetime) { @@ -5792,6 +5816,28 @@ CGCallee CodeGenFunction::EmitCallee(const Expr *E) { return EmitCallee(ICE->getSubExpr()); } + // Try to remember the original __ptrauth qualifier for loads of + // function pointers. + if (ICE->getCastKind() == CK_LValueToRValue) { + const Expr *SubExpr = ICE->getSubExpr(); + if (const auto *PtrType = SubExpr->getType()->getAs()) { + std::pair Result = + EmitOrigPointerRValue(E); + + QualType FunctionType = PtrType->getPointeeType(); + assert(FunctionType->isFunctionType()); + + GlobalDecl GD; + if (const auto *VD = + dyn_cast_or_null(E->getReferencedDeclOfCallee())) { + GD = GlobalDecl(VD); + } + CGCalleeInfo CalleeInfo(FunctionType->getAs(), GD); + CGCallee Callee(CalleeInfo, Result.first, Result.second); + return Callee; + } + } + // Resolve direct calls. } else if (auto DRE = dyn_cast(E)) { if (auto FD = dyn_cast(DRE->getDecl())) { @@ -5854,6 +5900,18 @@ LValue CodeGenFunction::EmitBinaryOperatorLValue(const BinaryOperator *E) { switch (getEvaluationKind(E->getType())) { case TEK_Scalar: { + if (PointerAuthQualifier PtrAuth = + E->getLHS()->getType().getPointerAuth()) { + LValue LV = EmitCheckedLValue(E->getLHS(), TCK_Store); + LValue CopiedLV = LV; + CopiedLV.getQuals().removePointerAuth(); + llvm::Value *RV = + EmitPointerAuthQualify(PtrAuth, E->getRHS(), CopiedLV.getAddress()); + EmitNullabilityCheck(CopiedLV, RV, E->getExprLoc()); + EmitStoreThroughLValue(RValue::get(RV), CopiedLV); + return LV; + } + switch (E->getLHS()->getType().getObjCLifetime()) { case Qualifiers::OCL_Strong: return EmitARCStoreStrong(E, /*ignored*/ false).first; diff --git a/clang/lib/CodeGen/CGExprConstant.cpp b/clang/lib/CodeGen/CGExprConstant.cpp index b016c6e36d1a8..b21ebeee4bed1 100644 --- a/clang/lib/CodeGen/CGExprConstant.cpp +++ b/clang/lib/CodeGen/CGExprConstant.cpp @@ -2049,10 +2049,13 @@ namespace { struct ConstantLValue { llvm::Constant *Value; bool 
HasOffsetApplied; + bool HasDestPointerAuth; /*implicit*/ ConstantLValue(llvm::Constant *value, - bool hasOffsetApplied = false) - : Value(value), HasOffsetApplied(hasOffsetApplied) {} + bool hasOffsetApplied = false, + bool hasDestPointerAuth = false) + : Value(value), HasOffsetApplied(hasOffsetApplied), + HasDestPointerAuth(hasDestPointerAuth) {} /*implicit*/ ConstantLValue(ConstantAddress address) : ConstantLValue(address.getPointer()) {} @@ -2157,6 +2160,14 @@ llvm::Constant *ConstantLValueEmitter::tryEmit() { value = applyOffset(value); } + // Apply pointer-auth signing from the destination type. + if (PointerAuthQualifier PointerAuth = DestType.getPointerAuth(); + PointerAuth && !result.HasDestPointerAuth) { + value = Emitter.tryEmitConstantSignedPointer(value, PointerAuth); + if (!value) + return nullptr; + } + // Convert to the appropriate type; this could be an lvalue for // an integer. FIXME: performAddrSpaceCast if (isa(destTy)) @@ -2200,6 +2211,12 @@ ConstantLValueEmitter::tryEmitBase(const APValue::LValueBase &base) { return CGM.GetWeakRefReference(D).getPointer(); auto PtrAuthSign = [&](llvm::Constant *C) { + if (PointerAuthQualifier PointerAuth = DestType.getPointerAuth()) { + C = applyOffset(C); + C = Emitter.tryEmitConstantSignedPointer(C, PointerAuth); + return ConstantLValue(C, /*applied offset*/ true, /*signed*/ true); + } + CGPointerAuthInfo AuthInfo; if (EnablePtrAuthFunctionTypeDiscrimination) @@ -2213,7 +2230,7 @@ ConstantLValueEmitter::tryEmitBase(const APValue::LValueBase &base) { C = CGM.getConstantSignedPointer( C, AuthInfo.getKey(), nullptr, cast_or_null(AuthInfo.getDiscriminator())); - return ConstantLValue(C, /*applied offset*/ true); + return ConstantLValue(C, /*applied offset*/ true, /*signed*/ true); } return ConstantLValue(C); diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index e9a7ba509350c..8dbbcdaef25d8 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ 
b/clang/lib/CodeGen/CGExprScalar.cpp @@ -2261,6 +2261,53 @@ Value *ScalarExprEmitter::VisitInitListExpr(InitListExpr *E) { return V; } +static bool isDeclRefKnownNonNull(CodeGenFunction &CGF, const ValueDecl *D) { + return !D->isWeak(); +} + +static bool isLValueKnownNonNull(CodeGenFunction &CGF, const Expr *E) { + E = E->IgnoreParens(); + + if (const auto *UO = dyn_cast(E)) + if (UO->getOpcode() == UO_Deref) + return CGF.isPointerKnownNonNull(UO->getSubExpr()); + + if (const auto *DRE = dyn_cast(E)) + return isDeclRefKnownNonNull(CGF, DRE->getDecl()); + + if (const auto *ME = dyn_cast(E)) { + if (isa(ME->getMemberDecl())) + return true; + return isDeclRefKnownNonNull(CGF, ME->getMemberDecl()); + } + + // Array subscripts? Anything else? + + return false; +} + +bool CodeGenFunction::isPointerKnownNonNull(const Expr *E) { + assert(E->getType()->isSignableType()); + + E = E->IgnoreParens(); + + if (isa(E)) + return true; + + if (const auto *UO = dyn_cast(E)) + if (UO->getOpcode() == UO_AddrOf) + return isLValueKnownNonNull(*this, UO->getSubExpr()); + + if (const auto *CE = dyn_cast(E)) + if (CE->getCastKind() == CK_FunctionToPointerDecay || + CE->getCastKind() == CK_ArrayToPointerDecay) + return isLValueKnownNonNull(*this, CE->getSubExpr()); + + // Maybe honor __nonnull? 
+ + return false; +} + bool CodeGenFunction::ShouldNullCheckClassCastValue(const CastExpr *CE) { const Expr *E = CE->getSubExpr(); @@ -4985,6 +5032,21 @@ Value *ScalarExprEmitter::VisitBinAssign(const BinaryOperator *E) { Value *RHS; LValue LHS; + if (PointerAuthQualifier PtrAuth = E->getLHS()->getType().getPointerAuth()) { + LValue LV = CGF.EmitCheckedLValue(E->getLHS(), CodeGenFunction::TCK_Store); + LV.getQuals().removePointerAuth(); + llvm::Value *RV = + CGF.EmitPointerAuthQualify(PtrAuth, E->getRHS(), LV.getAddress()); + CGF.EmitNullabilityCheck(LV, RV, E->getExprLoc()); + CGF.EmitStoreThroughLValue(RValue::get(RV), LV); + + if (Ignore) + return nullptr; + RV = CGF.EmitPointerAuthUnqualify(PtrAuth, RV, LV.getType(), + LV.getAddress(), /*nonnull*/ false); + return RV; + } + switch (E->getLHS()->getType().getObjCLifetime()) { case Qualifiers::OCL_Strong: std::tie(LHS, RHS) = CGF.EmitARCStoreStrong(E, Ignore); diff --git a/clang/lib/CodeGen/CGPointerAuth.cpp b/clang/lib/CodeGen/CGPointerAuth.cpp index 4b032306ead72..0a183a8524c17 100644 --- a/clang/lib/CodeGen/CGPointerAuth.cpp +++ b/clang/lib/CodeGen/CGPointerAuth.cpp @@ -125,6 +125,33 @@ CGPointerAuthInfo CodeGenFunction::EmitPointerAuthInfo( Schema.authenticatesNullValues(), Discriminator); } +CGPointerAuthInfo +CodeGenFunction::EmitPointerAuthInfo(PointerAuthQualifier Qual, + Address StorageAddress) { + assert(Qual && "don't call this if you don't know that the Qual is present"); + if (Qual.hasKeyNone()) + return CGPointerAuthInfo(); + + llvm::Value *Discriminator = nullptr; + if (unsigned Extra = Qual.getExtraDiscriminator()) + Discriminator = llvm::ConstantInt::get(IntPtrTy, Extra); + + if (Qual.isAddressDiscriminated()) { + assert(StorageAddress.isValid() && + "address discrimination without address"); + llvm::Value *StoragePtr = StorageAddress.emitRawPointer(*this); + if (Discriminator) + Discriminator = + EmitPointerAuthBlendDiscriminator(StoragePtr, Discriminator); + else + Discriminator = 
Builder.CreatePtrToInt(StoragePtr, IntPtrTy); + } + + return CGPointerAuthInfo(Qual.getKey(), Qual.getAuthenticationMode(), + Qual.isIsaPointer(), Qual.authenticatesNullValues(), + Discriminator); +} + /// Return the natural pointer authentication for values of the given /// pointee type. static CGPointerAuthInfo @@ -166,6 +193,91 @@ CGPointerAuthInfo CodeGenModule::getPointerAuthInfoForType(QualType T) { return ::getPointerAuthInfoForType(*this, T); } +static std::pair +emitLoadOfOrigPointerRValue(CodeGenFunction &CGF, const LValue &LV, + SourceLocation Loc) { + llvm::Value *Value = CGF.EmitLoadOfScalar(LV, Loc); + CGPointerAuthInfo AuthInfo; + if (PointerAuthQualifier PtrAuth = LV.getQuals().getPointerAuth()) + AuthInfo = CGF.EmitPointerAuthInfo(PtrAuth, LV.getAddress()); + else + AuthInfo = getPointerAuthInfoForType(CGF.CGM, LV.getType()); + return {Value, AuthInfo}; +} + +/// Retrieve a pointer rvalue and its ptrauth info. When possible, avoid +/// needlessly resigning the pointer. +std::pair +CodeGenFunction::EmitOrigPointerRValue(const Expr *E) { + assert(E->getType()->isSignableType()); + + E = E->IgnoreParens(); + if (const auto *Load = dyn_cast(E)) { + if (Load->getCastKind() == CK_LValueToRValue) { + E = Load->getSubExpr()->IgnoreParens(); + + // We're semantically required to not emit loads of certain DREs naively. + if (const auto *RefExpr = dyn_cast(E)) { + if (ConstantEmission Result = tryEmitAsConstant(RefExpr)) { + // Fold away a use of an intermediate variable. + if (!Result.isReference()) + return {Result.getValue(), + getPointerAuthInfoForType(CGM, RefExpr->getType())}; + + // Fold away a use of an intermediate reference. 
+ LValue LV = Result.getReferenceLValue(*this, RefExpr); + return emitLoadOfOrigPointerRValue(*this, LV, RefExpr->getLocation()); + } + } + + // Otherwise, load and use the pointer + LValue LV = EmitCheckedLValue(E, CodeGenFunction::TCK_Load); + return emitLoadOfOrigPointerRValue(*this, LV, E->getExprLoc()); + } + } + + // Fallback: just use the normal rules for the type. + llvm::Value *Value = EmitScalarExpr(E); + return {Value, getPointerAuthInfoForType(CGM, E->getType())}; +} + +llvm::Value * +CodeGenFunction::EmitPointerAuthQualify(PointerAuthQualifier DestQualifier, + const Expr *E, + Address DestStorageAddress) { + assert(DestQualifier); + auto [Value, CurAuthInfo] = EmitOrigPointerRValue(E); + + CGPointerAuthInfo DestAuthInfo = + EmitPointerAuthInfo(DestQualifier, DestStorageAddress); + return emitPointerAuthResign(Value, E->getType(), CurAuthInfo, DestAuthInfo, + isPointerKnownNonNull(E)); +} + +llvm::Value *CodeGenFunction::EmitPointerAuthQualify( + PointerAuthQualifier DestQualifier, llvm::Value *Value, + QualType PointerType, Address DestStorageAddress, bool IsKnownNonNull) { + assert(DestQualifier); + + CGPointerAuthInfo CurAuthInfo = getPointerAuthInfoForType(CGM, PointerType); + CGPointerAuthInfo DestAuthInfo = + EmitPointerAuthInfo(DestQualifier, DestStorageAddress); + return emitPointerAuthResign(Value, PointerType, CurAuthInfo, DestAuthInfo, + IsKnownNonNull); +} + +llvm::Value *CodeGenFunction::EmitPointerAuthUnqualify( + PointerAuthQualifier CurQualifier, llvm::Value *Value, QualType PointerType, + Address CurStorageAddress, bool IsKnownNonNull) { + assert(CurQualifier); + + CGPointerAuthInfo CurAuthInfo = + EmitPointerAuthInfo(CurQualifier, CurStorageAddress); + CGPointerAuthInfo DestAuthInfo = getPointerAuthInfoForType(CGM, PointerType); + return emitPointerAuthResign(Value, PointerType, CurAuthInfo, DestAuthInfo, + IsKnownNonNull); +} + static bool isZeroConstant(const llvm::Value *Value) { if (const auto *CI = dyn_cast(Value)) return 
CI->isZero(); @@ -288,6 +400,23 @@ llvm::Value *CodeGenFunction::emitPointerAuthResign( return Value; } +void CodeGenFunction::EmitPointerAuthCopy(PointerAuthQualifier Qual, QualType T, + Address DestAddress, + Address SrcAddress) { + assert(Qual); + llvm::Value *Value = Builder.CreateLoad(SrcAddress); + + // If we're using address-discrimination, we have to re-sign the value. + if (Qual.isAddressDiscriminated()) { + CGPointerAuthInfo SrcPtrAuth = EmitPointerAuthInfo(Qual, SrcAddress); + CGPointerAuthInfo DestPtrAuth = EmitPointerAuthInfo(Qual, DestAddress); + Value = emitPointerAuthResign(Value, T, SrcPtrAuth, DestPtrAuth, + /*IsKnownNonNull=*/false); + } + + Builder.CreateStore(Value, DestAddress); +} + llvm::Constant * CodeGenModule::getConstantSignedPointer(llvm::Constant *Pointer, unsigned Key, llvm::Constant *StorageAddress, diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index aa07e5d6c8099..4c5e8a8a44926 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -4432,10 +4432,10 @@ class CodeGenFunction : public CodeGenTypeCache { } bool isReference() const { return ValueAndIsReference.getInt(); } - LValue getReferenceLValue(CodeGenFunction &CGF, Expr *refExpr) const { + LValue getReferenceLValue(CodeGenFunction &CGF, const Expr *RefExpr) const { assert(isReference()); return CGF.MakeNaturalAlignAddrLValue(ValueAndIsReference.getPointer(), - refExpr->getType()); + RefExpr->getType()); } llvm::Constant *getValue() const { @@ -4444,7 +4444,7 @@ class CodeGenFunction : public CodeGenTypeCache { } }; - ConstantEmission tryEmitAsConstant(DeclRefExpr *refExpr); + ConstantEmission tryEmitAsConstant(const DeclRefExpr *RefExpr); ConstantEmission tryEmitAsConstant(const MemberExpr *ME); llvm::Value *emitScalarConstant(const ConstantEmission &Constant, Expr *E); @@ -4588,6 +4588,26 @@ class CodeGenFunction : public CodeGenTypeCache { const CGPointerAuthInfo &Info, SmallVectorImpl &Bundles); + 
CGPointerAuthInfo EmitPointerAuthInfo(PointerAuthQualifier Qualifier, + Address StorageAddress); + llvm::Value *EmitPointerAuthQualify(PointerAuthQualifier Qualifier, + llvm::Value *Pointer, QualType ValueType, + Address StorageAddress, + bool IsKnownNonNull); + llvm::Value *EmitPointerAuthQualify(PointerAuthQualifier Qualifier, + const Expr *PointerExpr, + Address StorageAddress); + llvm::Value *EmitPointerAuthUnqualify(PointerAuthQualifier Qualifier, + llvm::Value *Pointer, + QualType PointerType, + Address StorageAddress, + bool IsKnownNonNull); + void EmitPointerAuthCopy(PointerAuthQualifier Qualifier, QualType Type, + Address DestField, Address SrcField); + + std::pair + EmitOrigPointerRValue(const Expr *E); + llvm::Value *authPointerToPointerCast(llvm::Value *ResultPtr, QualType SourceType, QualType DestType); Address authPointerToPointerCast(Address Ptr, QualType SourceType, diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index d77400e0f8272..8fa74ecff19aa 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -3400,6 +3400,45 @@ void Parser::DistributeCLateParsedAttrs(Decl *Dcl, } } +/// type-qualifier: +/// ('__ptrauth') '(' constant-expression +/// (',' constant-expression)[opt] +/// (',' constant-expression)[opt] ')' +void Parser::ParsePtrauthQualifier(ParsedAttributes &Attrs) { + assert(Tok.is(tok::kw___ptrauth)); + + IdentifierInfo *KwName = Tok.getIdentifierInfo(); + SourceLocation KwLoc = ConsumeToken(); + + BalancedDelimiterTracker T(*this, tok::l_paren); + if (T.expectAndConsume()) + return; + + ArgsVector ArgExprs; + do { + ExprResult ER = ParseAssignmentExpression(); + if (ER.isInvalid()) { + T.skipToEnd(); + return; + } + ArgExprs.push_back(ER.get()); + } while (TryConsumeToken(tok::comma)); + + T.consumeClose(); + SourceLocation EndLoc = T.getCloseLocation(); + + if (ArgExprs.empty() || ArgExprs.size() > 3) { + Diag(KwLoc, diag::err_ptrauth_qualifier_bad_arg_count); + return; + } + + 
Attrs.addNew(KwName, SourceRange(KwLoc, EndLoc), + /*scope*/ nullptr, SourceLocation(), ArgExprs.data(), + ArgExprs.size(), + ParsedAttr::Form::Keyword(/*IsAlignAs=*/false, + /*IsRegularKeywordAttribute=*/false)); +} + /// Bounds attributes (e.g., counted_by): /// AttrName '(' expression ')' void Parser::ParseBoundsAttribute(IdentifierInfo &AttrName, @@ -4267,6 +4306,11 @@ void Parser::ParseDeclarationSpecifiers( getLangOpts()); break; + // __ptrauth qualifier. + case tok::kw___ptrauth: + ParsePtrauthQualifier(DS.getAttributes()); + continue; + case tok::kw___sptr: case tok::kw___uptr: case tok::kw___ptr64: @@ -5980,6 +6024,7 @@ bool Parser::isTypeSpecifierQualifier() { case tok::kw___ptr32: case tok::kw___pascal: case tok::kw___unaligned: + case tok::kw___ptrauth: case tok::kw__Nonnull: case tok::kw__Nullable: @@ -6269,6 +6314,7 @@ bool Parser::isDeclarationSpecifier( case tok::kw___forceinline: case tok::kw___pascal: case tok::kw___unaligned: + case tok::kw___ptrauth: case tok::kw__Nonnull: case tok::kw__Nullable: @@ -6533,6 +6579,12 @@ void Parser::ParseTypeQualifierListOpt( ParseHLSLQualifiers(DS.getAttributes()); continue; + // __ptrauth qualifier. + case tok::kw___ptrauth: + ParsePtrauthQualifier(DS.getAttributes()); + EndLoc = PrevTokLocation; + continue; + case tok::kw___unaligned: isInvalid = DS.SetTypeQual(DeclSpec::TQ_unaligned, Loc, PrevSpec, DiagID, getLangOpts()); diff --git a/clang/lib/Sema/SemaCast.cpp b/clang/lib/Sema/SemaCast.cpp index 1591075ff05d8..14e16bc39eb3a 100644 --- a/clang/lib/Sema/SemaCast.cpp +++ b/clang/lib/Sema/SemaCast.cpp @@ -72,8 +72,11 @@ namespace { // Preceding an expression by a parenthesized type name converts the // value of the expression to the unqualified, non-atomic version of // the named type. + // Don't drop __ptrauth qualifiers. We want to treat casting to a + // __ptrauth-qualified type as an error instead of implicitly ignoring + // the qualifier. 
if (!S.Context.getLangOpts().ObjC && !DestType->isRecordType() && - !DestType->isArrayType()) { + !DestType->isArrayType() && !DestType.getPointerAuth()) { DestType = DestType.getAtomicUnqualifiedType(); } @@ -168,6 +171,14 @@ namespace { SrcExpr = src; } + void checkQualifiedDestType() { + // Destination type may not be qualified with __ptrauth. + if (DestType.getPointerAuth()) { + Self.Diag(DestRange.getBegin(), diag::err_ptrauth_qualifier_cast) + << DestType << DestRange; + } + } + /// Check for and handle non-overload placeholder expressions. void checkNonOverloadPlaceholders() { if (!isPlaceholder() || isPlaceholder(BuiltinType::Overload)) @@ -309,6 +320,8 @@ Sema::BuildCXXNamedCast(SourceLocation OpLoc, tok::TokenKind Kind, Op.OpRange = SourceRange(OpLoc, Parens.getEnd()); Op.DestRange = AngleBrackets; + Op.checkQualifiedDestType(); + switch (Kind) { default: llvm_unreachable("Unknown C++ cast!"); @@ -3412,6 +3425,8 @@ ExprResult Sema::BuildCStyleCastExpr(SourceLocation LPLoc, // -Wcast-qual DiagnoseCastQual(Op.Self, Op.SrcExpr, Op.DestType); + Op.checkQualifiedDestType(); + return Op.complete(CStyleCastExpr::Create( Context, Op.ResultType, Op.ValueKind, Op.Kind, Op.SrcExpr.get(), &Op.BasePath, CurFPFeatureOverrides(), CastTypeInfo, LPLoc, RPLoc)); @@ -3431,6 +3446,8 @@ ExprResult Sema::BuildCXXFunctionalCastExpr(TypeSourceInfo *CastTypeInfo, if (Op.SrcExpr.isInvalid()) return ExprError(); + Op.checkQualifiedDestType(); + auto *SubExpr = Op.SrcExpr.get(); if (auto *BindExpr = dyn_cast(SubExpr)) SubExpr = BindExpr->getSubExpr(); diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index d0143d29a4bcc..42da9ba97e0d3 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -1550,6 +1550,48 @@ bool Sema::checkConstantPointerAuthKey(Expr *Arg, unsigned &Result) { return false; } +bool Sema::checkPointerAuthDiscriminatorArg(Expr *Arg, + PointerAuthDiscArgKind Kind, + unsigned &IntVal) { + if (!Arg) { + IntVal = 
0; + return true; + } + + std::optional Result = Arg->getIntegerConstantExpr(Context); + if (!Result) { + Diag(Arg->getExprLoc(), diag::err_ptrauth_arg_not_ice); + return false; + } + + unsigned Max; + bool IsAddrDiscArg = false; + + switch (Kind) { + case PADAK_AddrDiscPtrAuth: + Max = 1; + IsAddrDiscArg = true; + break; + case PADAK_ExtraDiscPtrAuth: + Max = PointerAuthQualifier::MaxDiscriminator; + break; + }; + + if (*Result < 0 || *Result > Max) { + if (IsAddrDiscArg) + Diag(Arg->getExprLoc(), diag::err_ptrauth_address_discrimination_invalid) + << Result->getExtValue(); + else + Diag(Arg->getExprLoc(), diag::err_ptrauth_extra_discriminator_invalid) + << Result->getExtValue() << Max; + + return false; + }; + + IntVal = Result->getZExtValue(); + return true; +} + static std::pair findConstantBaseAndOffset(Sema &S, Expr *E) { // Must evaluate as a pointer. @@ -3957,6 +3999,14 @@ ExprResult Sema::BuildAtomicExpr(SourceRange CallRange, SourceRange ExprRange, ValType = AtomTy; } + PointerAuthQualifier PointerAuth = AtomTy.getPointerAuth(); + if (PointerAuth && PointerAuth.isAddressDiscriminated()) { + Diag(ExprRange.getBegin(), + diag::err_atomic_op_needs_non_address_discriminated_pointer) + << 0 << Ptr->getType() << Ptr->getSourceRange(); + return ExprError(); + } + // For an arithmetic operation, the implied arithmetic must be well-formed. 
if (Form == Arithmetic) { // GCC does not enforce these rules for GNU atomics, but we do to help catch @@ -4329,6 +4379,13 @@ ExprResult Sema::BuiltinAtomicOverloaded(ExprResult TheCallResult) { << FirstArg->getType() << 0 << FirstArg->getSourceRange(); return ExprError(); } + PointerAuthQualifier PointerAuth = ValType.getPointerAuth(); + if (PointerAuth && PointerAuth.isAddressDiscriminated()) { + Diag(FirstArg->getBeginLoc(), + diag::err_atomic_op_needs_non_address_discriminated_pointer) + << 1 << ValType << FirstArg->getSourceRange(); + return ExprError(); + } if (ValType.isConstQualified()) { Diag(DRE->getBeginLoc(), diag::err_atomic_builtin_cannot_be_const) diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 240ce5391af81..5f811c824e11d 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -15454,6 +15454,12 @@ ParmVarDecl *Sema::CheckParameter(DeclContext *DC, SourceLocation StartLoc, New->setType(T); } + // __ptrauth is forbidden on parameters. + if (T.getPointerAuth()) { + Diag(NameLoc, diag::err_ptrauth_qualifier_invalid) << T << 1; + New->setInvalidDecl(); + } + // ISO/IEC TR 18037 S6.7.3: "The type of an object with automatic storage // duration shall not be qualified by an address-space qualifier." 
// Since all parameters have automatic store duration, they can not have @@ -19456,9 +19462,14 @@ void Sema::ActOnFields(Scope *S, SourceLocation RecLoc, Decl *EnclosingDecl, RecordArgPassingKind::CanNeverPassInRegs) Record->setArgPassingRestrictions( RecordArgPassingKind::CanNeverPassInRegs); - } else if (FT.getQualifiers().getObjCLifetime() == Qualifiers::OCL_Weak) + } else if (FT.getQualifiers().getObjCLifetime() == Qualifiers::OCL_Weak) { Record->setArgPassingRestrictions( RecordArgPassingKind::CanNeverPassInRegs); + } else if (PointerAuthQualifier Q = FT.getPointerAuth(); + Q && Q.isAddressDiscriminated()) { + Record->setArgPassingRestrictions( + RecordArgPassingKind::CanNeverPassInRegs); + } } if (Record && FD->getType().isVolatileQualified()) diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index 74f925f18560a..2247aded9384a 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -9471,6 +9471,8 @@ struct SpecialMemberDeletionInfo bool shouldDeleteForVariantObjCPtrMember(FieldDecl *FD, QualType FieldType); + bool shouldDeleteForVariantPtrAuthMember(const FieldDecl *FD); + bool visitBase(CXXBaseSpecifier *Base) { return shouldDeleteForBase(Base); } bool visitField(FieldDecl *Field) { return shouldDeleteForField(Field); } @@ -9639,6 +9641,30 @@ bool SpecialMemberDeletionInfo::shouldDeleteForVariantObjCPtrMember( return true; } +bool SpecialMemberDeletionInfo::shouldDeleteForVariantPtrAuthMember( + const FieldDecl *FD) { + QualType FieldType = S.Context.getBaseElementType(FD->getType()); + // Copy/move constructors/assignment operators are deleted if the field has an + // address-discriminated ptrauth qualifier. 
+ PointerAuthQualifier Q = FieldType.getPointerAuth(); + + if (!Q || !Q.isAddressDiscriminated()) + return false; + + if (CSM == CXXSpecialMemberKind::DefaultConstructor || + CSM == CXXSpecialMemberKind::Destructor) + return false; + + if (Diagnose) { + auto *ParentClass = cast(FD->getParent()); + S.Diag(FD->getLocation(), diag::note_deleted_special_member_class_subobject) + << llvm::to_underlying(getEffectiveCSM()) << ParentClass + << /*IsField*/ true << FD << 4 << /*IsDtorCallInCtor*/ false << 2; + } + + return true; +} + /// Check whether we should delete a special member function due to the class /// having a particular direct or virtual base class. bool SpecialMemberDeletionInfo::shouldDeleteForBase(CXXBaseSpecifier *Base) { @@ -9677,6 +9703,9 @@ bool SpecialMemberDeletionInfo::shouldDeleteForField(FieldDecl *FD) { if (inUnion() && shouldDeleteForVariantObjCPtrMember(FD, FieldType)) return true; + if (inUnion() && shouldDeleteForVariantPtrAuthMember(FD)) + return true; + if (CSM == CXXSpecialMemberKind::DefaultConstructor) { // For a default constructor, all references must be initialized in-class // and, if a union, it must have a non-const member. @@ -9740,6 +9769,9 @@ bool SpecialMemberDeletionInfo::shouldDeleteForField(FieldDecl *FD) { if (shouldDeleteForVariantObjCPtrMember(&*UI, UnionFieldType)) return true; + if (shouldDeleteForVariantPtrAuthMember(&*UI)) + return true; + if (!UnionFieldType.isConstQualified()) AllVariantFieldsAreConst = false; @@ -10589,6 +10621,12 @@ void Sema::checkIllFormedTrivialABIStruct(CXXRecordDecl &RD) { return; } + // Ill-formed if the field is an address-discriminated pointer. 
+ if (FT.hasAddressDiscriminatedPointerAuth()) { + PrintDiagAndRemoveAttr(6); + return; + } + if (const auto *RT = FT->getBaseElementTypeUnsafe()->getAs()) if (!RT->isDependentType() && !cast(RT->getDecl())->canPassInRegisters()) { diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index c25daaa022f49..3ac7d61546ceb 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -8107,6 +8107,13 @@ static QualType checkConditionalPointerCompatibility(Sema &S, ExprResult &LHS, lhQual.removeCVRQualifiers(); rhQual.removeCVRQualifiers(); + if (!lhQual.getPointerAuth().isEquivalent(rhQual.getPointerAuth())) { + S.Diag(Loc, diag::err_typecheck_cond_incompatible_ptrauth) + << LHSTy << RHSTy << LHS.get()->getSourceRange() + << RHS.get()->getSourceRange(); + return QualType(); + } + // OpenCL v2.0 specification doesn't extend compatibility of type qualifiers // (C99 6.7.3) for address spaces. We assume that the check should behave in // the same manner as it's defined for CVR qualifiers, so for OpenCL two @@ -9027,6 +9034,10 @@ checkPointerTypesForAssignment(Sema &S, QualType LHSType, QualType RHSType, else if (lhq.getObjCLifetime() != rhq.getObjCLifetime()) ConvTy = Sema::IncompatiblePointerDiscardsQualifiers; + // Treat pointer-auth mismatches as fatal. + else if (!lhq.getPointerAuth().isEquivalent(rhq.getPointerAuth())) + ConvTy = Sema::IncompatiblePointerDiscardsQualifiers; + // For GCC/MS compatibility, other qualifier mismatches are treated // as still compatible in C. 
else ConvTy = Sema::CompatiblePointerDiscardsQualifiers; @@ -17025,6 +17036,9 @@ bool Sema::DiagnoseAssignmentResult(AssignConvertType ConvTy, } else if (lhq.getObjCLifetime() != rhq.getObjCLifetime()) { DiagKind = diag::err_typecheck_incompatible_ownership; break; + } else if (!lhq.getPointerAuth().isEquivalent(rhq.getPointerAuth())) { + DiagKind = diag::err_typecheck_incompatible_ptrauth; + break; } llvm_unreachable("unknown error case for discarding qualifiers!"); diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index dfb5824a1c3d7..8df590fa624cf 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -7770,6 +7770,11 @@ QualType Sema::FindCompositePointerType(SourceLocation Loc, else return QualType(); + if (Q1.getPointerAuth().isEquivalent(Q2.getPointerAuth())) + Quals.setPointerAuth(Q1.getPointerAuth()); + else + return QualType(); + Steps.back().Quals = Quals; if (Q1 != Quals || Q2 != Quals) NeedConstBefore = Steps.size() - 1; diff --git a/clang/lib/Sema/SemaObjCProperty.cpp b/clang/lib/Sema/SemaObjCProperty.cpp index 6db2c246de791..f37982eddace9 100644 --- a/clang/lib/Sema/SemaObjCProperty.cpp +++ b/clang/lib/Sema/SemaObjCProperty.cpp @@ -180,6 +180,9 @@ Decl *SemaObjC::ActOnProperty(Scope *S, SourceLocation AtLoc, 0); TypeSourceInfo *TSI = SemaRef.GetTypeForDeclarator(FD.D); QualType T = TSI->getType(); + if (T.getPointerAuth().isPresent()) { + Diag(AtLoc, diag::err_ptrauth_qualifier_invalid) << T << 2; + } if (!getOwnershipRule(Attributes)) { Attributes |= deducePropertyOwnershipFromType(SemaRef, T); } diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 9c2df0b21d278..55634aa75ae25 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -3709,6 +3709,10 @@ static bool isQualificationConversionStep(QualType FromType, QualType ToType, ToQuals.removeObjCGCAttr(); } + // __ptrauth qualifiers must match exactly. 
+ if (FromQuals.getPointerAuth() != ToQuals.getPointerAuth()) + return false; + // -- for every j > 0, if const is in cv 1,j then const is in cv // 2,j, and similarly for volatile. if (!CStyle && !ToQuals.compatiblyIncludes(FromQuals, Ctx)) @@ -11467,6 +11471,17 @@ static void DiagnoseBadConversion(Sema &S, OverloadCandidate *Cand, return; } + if (!FromQs.getPointerAuth().isEquivalent(ToQs.getPointerAuth())) { + S.Diag(Fn->getLocation(), diag::note_ovl_candidate_bad_ptrauth) + << (unsigned)FnKindPair.first << (unsigned)FnKindPair.second << FnDesc + << FromTy << !!FromQs.getPointerAuth() + << FromQs.getPointerAuth().getAsString() << !!ToQs.getPointerAuth() + << ToQs.getPointerAuth().getAsString() << I + 1 + << (FromExpr ? FromExpr->getSourceRange() : SourceRange()); + MaybeEmitInheritedConstructorNote(S, Cand->FoundDecl); + return; + } + unsigned CVR = FromQs.getCVRQualifiers() & ~ToQs.getCVRQualifiers(); assert(CVR && "expected qualifiers mismatch"); diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index 33b1d8ca4dfa0..eba7267904fb2 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -2552,6 +2552,12 @@ bool Sema::CheckFunctionReturnType(QualType T, SourceLocation Loc) { return true; } + // __ptrauth is illegal on a function return type. + if (T.getPointerAuth()) { + Diag(Loc, diag::err_ptrauth_qualifier_invalid) << T << 0; + return true; + } + if (T.hasNonTrivialToPrimitiveDestructCUnion() || T.hasNonTrivialToPrimitiveCopyCUnion()) checkNonTrivialCUnion(T, Loc, NTCUC_FunctionReturn, @@ -2657,6 +2663,10 @@ QualType Sema::BuildFunctionType(QualType T, } else if (ParamType->isWebAssemblyTableType()) { Diag(Loc, diag::err_wasm_table_as_function_parameter); Invalid = true; + } else if (ParamType.getPointerAuth()) { + // __ptrauth is illegal on a function return type. 
+ Diag(Loc, diag::err_ptrauth_qualifier_invalid) << T << 1; + Invalid = true; } // C++2a [dcl.fct]p4: @@ -4974,6 +4984,11 @@ static TypeSourceInfo *GetFullTypeForDeclarator(TypeProcessingState &state, } } + // __ptrauth is illegal on a function return type. + if (T.getPointerAuth()) { + S.Diag(DeclType.Loc, diag::err_ptrauth_qualifier_invalid) << T << 0; + } + if (LangOpts.OpenCL) { // OpenCL v2.0 s6.12.5 - A block cannot be the return value of a // function. @@ -8333,6 +8348,65 @@ static void HandleNeonVectorTypeAttr(QualType &CurType, const ParsedAttr &Attr, CurType = S.Context.getVectorType(CurType, numElts, VecKind); } +/// Handle the __ptrauth qualifier. +static void HandlePtrAuthQualifier(ASTContext &Ctx, QualType &T, + const ParsedAttr &Attr, Sema &S) { + + assert((Attr.getNumArgs() > 0 && Attr.getNumArgs() <= 3) && + "__ptrauth qualifier takes between 1 and 3 arguments"); + Expr *KeyArg = Attr.getArgAsExpr(0); + Expr *IsAddressDiscriminatedArg = + Attr.getNumArgs() >= 2 ? Attr.getArgAsExpr(1) : nullptr; + Expr *ExtraDiscriminatorArg = + Attr.getNumArgs() >= 3 ? 
Attr.getArgAsExpr(2) : nullptr; + + unsigned Key; + if (S.checkConstantPointerAuthKey(KeyArg, Key)) { + Attr.setInvalid(); + return; + } + assert(Key <= PointerAuthQualifier::MaxKey && "ptrauth key is out of range"); + + bool IsInvalid = false; + unsigned IsAddressDiscriminated, ExtraDiscriminator; + IsInvalid |= !S.checkPointerAuthDiscriminatorArg(IsAddressDiscriminatedArg, + Sema::PADAK_AddrDiscPtrAuth, + IsAddressDiscriminated); + IsInvalid |= !S.checkPointerAuthDiscriminatorArg( + ExtraDiscriminatorArg, Sema::PADAK_ExtraDiscPtrAuth, ExtraDiscriminator); + + if (IsInvalid) { + Attr.setInvalid(); + return; + } + + if (!T->isSignableType() && !T->isDependentType()) { + S.Diag(Attr.getLoc(), diag::err_ptrauth_qualifier_nonpointer) << T; + Attr.setInvalid(); + return; + } + + if (T.getPointerAuth()) { + S.Diag(Attr.getLoc(), diag::err_ptrauth_qualifier_redundant) + << T << Attr.getAttrName()->getName(); + Attr.setInvalid(); + return; + } + + if (!S.getLangOpts().PointerAuthIntrinsics) { + S.Diag(Attr.getLoc(), diag::err_ptrauth_disabled) << Attr.getRange(); + Attr.setInvalid(); + return; + } + + assert((!IsAddressDiscriminatedArg || IsAddressDiscriminated <= 1) && + "address discriminator arg should be either 0 or 1"); + PointerAuthQualifier Qual = PointerAuthQualifier::Create( + Key, IsAddressDiscriminated, ExtraDiscriminator, + PointerAuthenticationMode::SignAndAuth, false, false); + T = S.Context.getPointerAuthType(T, Qual); +} + /// HandleArmSveVectorBitsTypeAttr - The "arm_sve_vector_bits" attribute is /// used to create fixed-length versions of sizeless SVE types defined by /// the ACLE, such as svint32_t and svbool_t. 
@@ -8788,6 +8862,11 @@ static void processTypeAttrs(TypeProcessingState &state, QualType &type, HandleOpenCLAccessAttr(type, attr, state.getSema()); attr.setUsedAsTypeAttr(); break; + case ParsedAttr::AT_PointerAuth: + HandlePtrAuthQualifier(state.getSema().Context, type, attr, + state.getSema()); + attr.setUsedAsTypeAttr(); + break; case ParsedAttr::AT_LifetimeBound: if (TAL == TAL_DeclChunk) HandleLifetimeBoundAttr(state, type, attr); diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index bb58ec49612c8..2469991bf2ce8 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -5287,6 +5287,17 @@ QualType TreeTransform::RebuildQualifiedType(QualType T, return QualType(); } + PointerAuthQualifier LocalPointerAuth = Quals.getPointerAuth(); + if (LocalPointerAuth.isPresent()) { + if (T.getPointerAuth().isPresent()) { + SemaRef.Diag(Loc, diag::err_ptrauth_qualifier_redundant) + << TL.getType() << "__ptrauth"; + return QualType(); + } else if (!T->isSignableType() && !T->isDependentType()) { + SemaRef.Diag(Loc, diag::err_ptrauth_qualifier_nonpointer) << T; + return QualType(); + } + } // C++ [dcl.fct]p7: // [When] adding cv-qualifications on top of the function type [...] the // cv-qualifiers are ignored. 
diff --git a/clang/test/AST/ast-dump-ptrauth-json.cpp b/clang/test/AST/ast-dump-ptrauth-json.cpp index 125cda0cff53a..8526598c491c1 100644 --- a/clang/test/AST/ast-dump-ptrauth-json.cpp +++ b/clang/test/AST/ast-dump-ptrauth-json.cpp @@ -1,5 +1,8 @@ // RUN: %clang_cc1 -triple arm64-apple-ios -fptrauth-calls -fptrauth-intrinsics -std=c++11 -ast-dump=json %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-linux-gnu -fptrauth-calls -fptrauth-intrinsics -std=c++11 -ast-dump=json %s | FileCheck %s // CHECK: "name": "__builtin_ptrauth_type_discriminator", +// CHECK: "qualType": "int *__ptrauth(1,1,123)" int d = __builtin_ptrauth_type_discriminator(int()); +int * __ptrauth(1,1,123) p; diff --git a/clang/test/CodeGen/ptrauth-debuginfo.c b/clang/test/CodeGen/ptrauth-debuginfo.c new file mode 100644 index 0000000000000..b76baffadd9a1 --- /dev/null +++ b/clang/test/CodeGen/ptrauth-debuginfo.c @@ -0,0 +1,35 @@ +// RUN: %clang_cc1 -triple arm64-apple-ios \ +// RUN: -fptrauth-calls -fptrauth-intrinsics -emit-llvm -fblocks \ +// RUN: %s -debug-info-kind=limited -o - | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-linux-gnu \ +// RUN: -fptrauth-calls -fptrauth-intrinsics -emit-llvm -fblocks \ +// RUN: %s -debug-info-kind=limited -o - | FileCheck %s + +// Constant initializers for data pointers. 
+extern int external_int; + +int *__ptrauth(1, 0, 1234) g1 = &external_int; +// CHECK: !DIDerivedType(tag: DW_TAG_LLVM_ptrauth_type, +// CHECK-SAME: ptrAuthKey: 1, +// CHECK-SAME: ptrAuthIsAddressDiscriminated: false, +// CHECK-SAME: ptrAuthExtraDiscriminator: 1234, +// CHECK-SAME: ptrAuthIsaPointer: false, +// CHECK-SAME: ptrAuthAuthenticatesNullValues: false) + +struct A { + int value; +}; +struct A *createA(void); + +void f() { + __block struct A *__ptrauth(0, 1, 1236) ptr = createA(); + ^{ + (void)ptr->value; + }(); +} +// CHECK: !DIDerivedType(tag: DW_TAG_LLVM_ptrauth_type, +// CHECK-NOT: ptrAuthKey +// CHECK-SAME: ptrAuthIsAddressDiscriminated: true, +// CHECK-SAME: ptrAuthExtraDiscriminator: 1236, +// CHECK-SAME: ptrAuthIsaPointer: false, +// CHECK-SAME: ptrAuthAuthenticatesNullValues: false) diff --git a/clang/test/CodeGen/ptrauth-qualifier-const-init.c b/clang/test/CodeGen/ptrauth-qualifier-const-init.c new file mode 100644 index 0000000000000..174f328628f19 --- /dev/null +++ b/clang/test/CodeGen/ptrauth-qualifier-const-init.c @@ -0,0 +1,86 @@ +// RUN: %clang_cc1 -triple arm64-apple-ios -fptrauth-calls -fptrauth-intrinsics -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-linux-gnu -fptrauth-calls -fptrauth-intrinsics -emit-llvm %s -o - | FileCheck %s + +// Constant initializers for data pointers. +extern int external_int; + +// CHECK: @g1 = global ptr ptrauth (ptr @external_int, i32 1, i64 56) +int * __ptrauth(1,0,56) g1 = &external_int; + +// CHECK: @g2 = global ptr ptrauth (ptr @external_int, i32 1, i64 1272, ptr @g2) +int * __ptrauth(1,1,1272) g2 = &external_int; + +// CHECK: @g3 = global ptr null +int * __ptrauth(1,1,871) g3 = 0; + +// FIXME: should we make a ptrauth constant for this absolute symbol? 
+// CHECK: @g4 = global ptr inttoptr (i64 1230 to ptr) +int * __ptrauth(1,1,1902) g4 = (int*) 1230; + +// CHECK: @ga = global [3 x ptr] [ +// CHECK-SAME: ptr ptrauth (ptr @external_int, i32 1, i64 712, ptr @ga), +// CHECK-SAME: ptr ptrauth (ptr @external_int, i32 1, i64 712, ptr getelementptr inbounds ([3 x ptr], ptr @ga, i32 0, i32 1)), +// CHECK-SAME: ptr ptrauth (ptr @external_int, i32 1, i64 712, ptr getelementptr inbounds ([3 x ptr], ptr @ga, i32 0, i32 2))] +int * __ptrauth(1,1,712) ga[3] = { &external_int, &external_int, &external_int }; + +struct A { + int * __ptrauth(1,0,431) f0; + int * __ptrauth(1,0,9182) f1; + int * __ptrauth(1,0,783) f2; +}; + +// CHECK: @gs1 = global %struct.A { +// CHECK-SAME: ptr ptrauth (ptr @external_int, i32 1, i64 431), +// CHECK-SAME: ptr ptrauth (ptr @external_int, i32 1, i64 9182), +// CHECK-SAME: ptr ptrauth (ptr @external_int, i32 1, i64 783) } +struct A gs1 = { &external_int, &external_int, &external_int }; + +struct B { + int * __ptrauth(1,1,1276) f0; + int * __ptrauth(1,1,23674) f1; + int * __ptrauth(1,1,163) f2; +}; + +// CHECK: @gs2 = global %struct.B { +// CHECK-SAME: ptr ptrauth (ptr @external_int, i32 1, i64 1276, ptr @gs2), +// CHECK-SAME: ptr ptrauth (ptr @external_int, i32 1, i64 23674, ptr getelementptr inbounds (%struct.B, ptr @gs2, i32 0, i32 1)), +// CHECK-SAME: ptr ptrauth (ptr @external_int, i32 1, i64 163, ptr getelementptr inbounds (%struct.B, ptr @gs2, i32 0, i32 2)) } +struct B gs2 = { &external_int, &external_int, &external_int }; + +// Constant initializers for function pointers. 
+extern void external_function(void); +typedef void (*fpt)(void); + +// CHECK: @f1 = global ptr ptrauth (ptr @external_function, i32 1, i64 56) +fpt __ptrauth(1,0,56) f1 = &external_function; + +// CHECK: @f2 = global ptr ptrauth (ptr @external_function, i32 1, i64 1272, ptr @f2) +fpt __ptrauth(1,1,1272) f2 = &external_function; + +// CHECK: @fa = global [3 x ptr] [ +// CHECK-SAME: ptr ptrauth (ptr @external_function, i32 1, i64 712, ptr @fa), +// CHECK-SAME: ptr ptrauth (ptr @external_function, i32 1, i64 712, ptr getelementptr inbounds ([3 x ptr], ptr @fa, i32 0, i32 1)), +// CHECK-SAME: ptr ptrauth (ptr @external_function, i32 1, i64 712, ptr getelementptr inbounds ([3 x ptr], ptr @fa, i32 0, i32 2))] +fpt __ptrauth(1,1,712) fa[3] = { &external_function, &external_function, &external_function }; + +struct C { + fpt __ptrauth(1,0,431) f0; + fpt __ptrauth(1,0,9182) f1; + fpt __ptrauth(1,0,783) f2; +}; +// CHECK: @fs1 = global %struct.C { +// CHECK-SAME: ptr ptrauth (ptr @external_function, i32 1, i64 431), +// CHECK-SAME: ptr ptrauth (ptr @external_function, i32 1, i64 9182), +// CHECK-SAME: ptr ptrauth (ptr @external_function, i32 1, i64 783) } +struct C fs1 = { &external_function, &external_function, &external_function }; + +struct D { + fpt __ptrauth(1,1,1276) f0; + fpt __ptrauth(1,1,23674) f1; + fpt __ptrauth(1,1,163) f2; +}; +// CHECK: @fs2 = global %struct.D { +// CHECK-SAME: ptr ptrauth (ptr @external_function, i32 1, i64 1276, ptr @fs2), +// CHECK-SAME: ptr ptrauth (ptr @external_function, i32 1, i64 23674, ptr getelementptr inbounds (%struct.D, ptr @fs2, i32 0, i32 1)), +// CHECK-SAME: ptr ptrauth (ptr @external_function, i32 1, i64 163, ptr getelementptr inbounds (%struct.D, ptr @fs2, i32 0, i32 2)) } +struct D fs2 = { &external_function, &external_function, &external_function }; diff --git a/clang/test/CodeGen/ptrauth-qualifier-function.c b/clang/test/CodeGen/ptrauth-qualifier-function.c new file mode 100644 index 0000000000000..cd25b77a01548 --- 
/dev/null +++ b/clang/test/CodeGen/ptrauth-qualifier-function.c @@ -0,0 +1,145 @@ +// RUN: %clang_cc1 %s -fptrauth-function-pointer-type-discrimination -triple arm64-apple-ios13 -fptrauth-calls -fptrauth-intrinsics -disable-llvm-passes -emit-llvm -o- | FileCheck --check-prefixes=CHECK,TYPE %s +// RUN: %clang_cc1 %s -fptrauth-function-pointer-type-discrimination -triple aarch64-linux-gnu -fptrauth-calls -fptrauth-intrinsics -disable-llvm-passes -emit-llvm -o- | FileCheck --check-prefixes=CHECK,TYPE %s +// RUN: %clang_cc1 %s -triple arm64-apple-ios13 -fptrauth-calls -fptrauth-intrinsics -disable-llvm-passes -emit-llvm -o- | FileCheck --check-prefixes=CHECK,ZERO %s +// RUN: %clang_cc1 %s -triple aarch64-linux-gnu -fptrauth-calls -fptrauth-intrinsics -disable-llvm-passes -emit-llvm -o- | FileCheck --check-prefixes=CHECK,ZERO %s +// RUN: %clang_cc1 -xc++ %s -fptrauth-function-pointer-type-discrimination -triple arm64-apple-ios13 -fptrauth-calls -fptrauth-intrinsics -disable-llvm-passes -emit-llvm -o- | FileCheck --check-prefixes=CHECK,TYPE,CHECK-CXX %s +// RUN: %clang_cc1 -xc++ %s -fptrauth-function-pointer-type-discrimination -triple aarch64-linux-gnu -fptrauth-calls -fptrauth-intrinsics -disable-llvm-passes -emit-llvm -o- | FileCheck --check-prefixes=CHECK,TYPE,CHECK-CXX %s + +#ifdef __cplusplus +extern "C" { +#endif + +void (*fptr)(void); +void (* __ptrauth(0, 0, 42) f2ptr_42_discm)(int); +void f(int); +void (* const __ptrauth(0, 0, 42) f_const_ptr)(int) = &f; + +// CHECK-LABEL: define {{.*}}void @test_assign_to_qualified +void test_assign_to_qualified() { + f2ptr_42_discm = (void (*)(int))fptr; + + // CHECK: [[ENTRY:.*]]:{{$}} + // CHECK: [[FPTR:%.*]] = load ptr, ptr @fptr + // CHECK-NEXT: [[CMP:%.*]] = icmp ne ptr [[FPTR]], null + // TYPE-NEXT: br i1 [[CMP]], label %[[RESIGN1:.*]], label %[[JOIN1:.*]] + // ZERO-NEXT: br i1 [[CMP]], label %[[RESIGN2:.*]], label %[[JOIN2:.*]] + + // TYPE: [[RESIGN1]]: + // TYPE-NEXT: [[FPTR2:%.*]] = ptrtoint ptr [[FPTR]] to i64 + // 
TYPE-NEXT: [[FPTR4:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[FPTR2]], i32 0, i64 18983, i32 0, i64 2712) + // TYPE-NEXT: [[FPTR5:%.*]] = inttoptr i64 [[FPTR4]] to ptr + // TYPE-NEXT: br label %[[JOIN1]] + + // TYPE: [[JOIN1]]: + // TYPE-NEXT: [[FPTR6:%.*]] = phi ptr [ null, %[[ENTRY]] ], [ [[FPTR5]], %[[RESIGN1]] ] + // TYPE-NEXT: [[CMP:%.*]] = icmp ne ptr [[FPTR6]], null + // TYPE-NEXT: br i1 [[CMP]], label %[[RESIGN2:.*]], label %[[JOIN2:.*]] + + // CHECK: [[RESIGN2]]: + // TYPE-NEXT: [[FPTR7:%.*]] = ptrtoint ptr [[FPTR6]] to i64 + // TYPE-NEXT: [[FPTR8:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[FPTR7]], i32 0, i64 2712, i32 0, i64 42) + // ZERO-NEXT: [[FPTR7:%.*]] = ptrtoint ptr [[FPTR]] to i64 + // ZERO-NEXT: [[FPTR8:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[FPTR7]], i32 0, i64 0, i32 0, i64 42) + // CHECK-NEXT: [[FPTR9:%.*]] = inttoptr i64 [[FPTR8]] to ptr + // CHECK-NEXT: br label %[[JOIN2]] + + // CHECK: [[JOIN2]] + // TYPE-NEXT: [[FPTR10:%.*]] = phi ptr [ null, %[[JOIN1]] ], [ [[FPTR9]], %[[RESIGN2]] ] + // ZERO-NEXT: [[FPTR10:%.*]] = phi ptr [ null, %[[ENTRY]] ], [ [[FPTR9]], %[[RESIGN2]] ] + // CHECK-NEXT store void (i32)* [[FPTR10]], void (i32)** @f2ptr_42_discm +} + +// CHECK-LABEL: define {{.*}}void @test_assign_from_qualified +void test_assign_from_qualified() { + fptr = (void (*)(void))f2ptr_42_discm; + + // CHECK: [[ENTRY:.*]]:{{$}} + // CHECK: [[FPTR:%.*]] = load ptr, ptr @f2ptr_42_discm + // CHECK-NEXT: [[CMP:%.*]] = icmp ne ptr [[FPTR]], null + // TYPE-NEXT: br i1 [[CMP]], label %[[RESIGN1:.*]], label %[[JOIN1:.*]] + // ZERO-NEXT: br i1 [[CMP]], label %[[RESIGN2:.*]], label %[[JOIN2:.*]] + + // TYPE: [[RESIGN1]]: + // TYPE-NEXT: [[FPTR1:%.*]] = ptrtoint ptr [[FPTR]] to i64 + // TYPE-NEXT: [[FPTR2:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[FPTR1]], i32 0, i64 42, i32 0, i64 2712) + // TYPE-NEXT: [[FPTR3:%.*]] = inttoptr i64 [[FPTR2]] to ptr + // TYPE-NEXT: br label %[[JOIN1]] + + // TYPE: [[JOIN1]]: + // TYPE-NEXT: [[FPTR4:%.*]] = phi ptr [ 
null, %[[ENTRY]] ], [ [[FPTR3]], %[[RESIGN1]] ] + // TYPE-NEXT: [[CMP:%.*]] = icmp ne ptr [[FPTR4]], null + // TYPE-NEXT: br i1 [[CMP]], label %[[RESIGN2:.*]], label %[[JOIN2:.*]] + + // CHECK: [[RESIGN2]]: + // TYPE-NEXT: [[FPTR6:%.*]] = ptrtoint ptr [[FPTR4]] to i64 + // TYPE-NEXT: [[FPTR7:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[FPTR6]], i32 0, i64 2712, i32 0, i64 18983) + // ZERO-NEXT: [[FPTR6:%.*]] = ptrtoint ptr [[FPTR]] to i64 + // ZERO-NEXT: [[FPTR7:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[FPTR6]], i32 0, i64 42, i32 0, i64 0) + // CHECK-NEXT: [[FPTR8:%.*]] = inttoptr i64 [[FPTR7]] to ptr + // CHECK-NEXT: br label %[[JOIN2]] + + // CHECK: [[JOIN2]] + // TYPE-NEXT: [[FPTR9:%.*]] = phi ptr [ null, %[[JOIN1]] ], [ [[FPTR8]], %[[RESIGN2]] ] + // ZERO-NEXT: [[FPTR9:%.*]] = phi ptr [ null, %[[ENTRY]] ], [ [[FPTR8]], %[[RESIGN2]] ] + // CHECK-NEXT store void ()* [[FPTR10]], void ()** @f2ptr_42_discm +} + +// CHECK-LABEL: define {{.*}}void @test_const_ptr_function_call() +void test_const_ptr_function_call(void) { + f_const_ptr(1); + + // TYPE: call void ptrauth (ptr @f, i32 0, i64 2712)(i32 noundef 1) [ "ptrauth"(i32 0, i64 2712) ] + // ZERO: call void ptrauth (ptr @f, i32 0)(i32 noundef 1) [ "ptrauth"(i32 0, i64 0) ] +} + +#ifdef __cplusplus +void (* get_fptr(void))(int); +void (* __ptrauth(0, 0, 42) f_const_ptr2)(int) = get_fptr(); +void (* const __ptrauth(0, 1, 43) &f_ref)(int) = f_const_ptr2; + +// CHECK-CXX-LABEL: define {{.*}}internal void @__cxx_global_var_init() +// CHECK-CXX: [[ENTRY:.*]]: +// CHECK-CXX: %[[CALL:.*]] = call ptr @get_fptr() +// CHECK-CXX: %[[V0:.*]] = icmp ne ptr %[[CALL]], null +// CHECK-CXX: br i1 %[[V0]], label %[[RESIGN_NONNULL:.*]], label %[[RESIGN_CONT:.*]] + +// CHECK-CXX: [[RESIGN_NONNULL]]: +// CHECK-CXX: %[[V1:.*]] = ptrtoint ptr %[[CALL]] to i64 +// CHECK-CXX: %[[V2:.*]] = call i64 @llvm.ptrauth.resign(i64 %[[V1]], i32 0, i64 2712, i32 0, i64 42) +// CHECK-CXX: %[[V3:.*]] = inttoptr i64 %[[V2]] to ptr +// CHECK-CXX: br 
label %[[RESIGN_CONT]] + +// CHECK-CXX: [[RESIGN_CONT]]: +// CHECK-CXX: %[[V4:.*]] = phi ptr [ null, %[[ENTRY]] ], [ %[[V3]], %[[RESIGN_NONNULL]] ] +// CHECK-CXX: store ptr %[[V4]], ptr @f_const_ptr2, align 8 + +// CHECK-CXX-LABEL: define {{.*}}internal void @__cxx_global_var_init.1() +// CHECK-CXX: [[ENTRY:.*]]: +// CHECK-CXX: %[[V0:.*]] = load ptr, ptr @f_const_ptr2, align 8 +// CHECK-CXX: %[[V1:.*]] = call i64 @llvm.ptrauth.blend(i64 ptrtoint (ptr @_ZGR5f_ref_ to i64), i64 43) +// CHECK-CXX: %[[V2:.*]] = icmp ne ptr %[[V0]], null +// CHECK-CXX: br i1 %[[V2]], label %[[RESIGN_NONNULL:.*]], label %[[RESIGN_CONT:.*]] + +// CHECK-CXX: [[RESIGN_NONNULL]]: +// CHECK-CXX: %[[V3:.*]] = ptrtoint ptr %[[V0]] to i64 +// CHECK-CXX: %[[V4:.*]] = call i64 @llvm.ptrauth.resign(i64 %[[V3]], i32 0, i64 42, i32 0, i64 %[[V1]]) +// CHECK-CXX: %[[V5:.*]] = inttoptr i64 %[[V4]] to ptr +// CHECK-CXX: br label %[[RESIGN_CONT]] + +// CHECK-CXX: [[RESIGN_CONT]]: +// CHECK-CXX: %[[V6:.*]] = phi ptr [ null, %[[ENTRY]] ], [ %[[V5]], %[[RESIGN_NONNULL]] ] +// CHECK-CXX: store ptr %[[V6]], ptr @_ZGR5f_ref_, align 8 +// CHECK-CXX: store ptr @_ZGR5f_ref_, ptr @f_ref, align 8 + +// CHECK-CXX-LABEL: define {{.*}}void @test_const_ptr_ref_function_call() +void test_const_ptr_ref_function_call(void) { + f_ref(1); + + // CHECK-CXX: %[[V0:.*]] = load ptr, ptr @f_ref, align 8 + // CHECK-CXX: %[[V1:.*]] = load ptr, ptr %[[V0]], align 8 + // CHECK-CXX: %[[V2:.*]] = ptrtoint ptr %[[V0]] to i64 + // CHECK-CXX: %[[V3:.*]] = call i64 @llvm.ptrauth.blend(i64 %[[V2]], i64 43) + // CHECK-CXX: call void %[[V1]](i32 noundef 1) [ "ptrauth"(i32 0, i64 %[[V3]]) ] +} +} +#endif diff --git a/clang/test/CodeGen/ptrauth-qualifier-loadstore.c b/clang/test/CodeGen/ptrauth-qualifier-loadstore.c new file mode 100644 index 0000000000000..db259ed950fec --- /dev/null +++ b/clang/test/CodeGen/ptrauth-qualifier-loadstore.c @@ -0,0 +1,745 @@ +// RUN: %clang_cc1 -fptrauth-function-pointer-type-discrimination -triple 
arm64-apple-ios -fptrauth-calls -fptrauth-intrinsics -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fptrauth-function-pointer-type-discrimination -triple aarch64-linux-gnu -fptrauth-calls -fptrauth-intrinsics -emit-llvm %s -o - | FileCheck %s + +#define IQ __ptrauth(1,0,50) +#define AQ __ptrauth(1,1,50) +#define DIFF_IQ __ptrauth(1,0,100) +#define DIFF_AQ __ptrauth(1,1,100) +#define ZERO_IQ __ptrauth(1,0,0) +#define ZERO_AQ __ptrauth(1,1,0) + +extern int external_int; +extern int * global_upi; +extern int * IQ global_iqpi; +extern int * AQ global_aqpi; +extern void use_upi(int *ptr); + +typedef void func_t(void); +extern void external_func(void); +extern func_t *global_upf; +extern func_t * IQ global_iqpf; +extern func_t * AQ global_aqpf; +extern void use_upf(func_t *ptr); + +// Data with address-independent qualifiers. + +// CHECK-LABEL: define {{.*}}void @test_store_data_i_constant() +void test_store_data_i_constant() { +// CHECK: [[V:%.*]] = alloca ptr, +// CHECK-NEXT: [[SIGN:%.*]] = call i64 @llvm.ptrauth.sign(i64 ptrtoint (ptr @external_int to i64), i32 1, i64 50) +// CHECK-NEXT: [[T0:%.*]] = inttoptr i64 [[SIGN]] to ptr +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + int * IQ iqpi = &external_int; +// CHECK-NEXT: [[T0:%.*]] = call i64 @llvm.ptrauth.sign(i64 ptrtoint (ptr @external_int to i64), i32 1, i64 50) +// CHECK-NEXT: [[SIGNED:%.*]] = inttoptr i64 [[T0]] to ptr +// CHECK-NEXT: store ptr [[SIGNED]], ptr [[V]], +// CHECK-NEXT: ret void + iqpi = &external_int; +} + +// CHECK-LABEL: define {{.*}}void @test_store_data_iu() +void test_store_data_iu() { +// CHECK: [[V:%.*]] = alloca ptr, +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_upi, +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.sign(i64 [[T0]], i32 1, i64 50) +// CHECK-NEXT: [[SIGNED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// 
CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[SIGNED]], {{.*}} ] +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + int * IQ iqpi = global_upi; +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_upi, +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.sign(i64 [[T0]], i32 1, i64 50) +// CHECK-NEXT: [[SIGNED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[SIGNED]], {{.*}} ] +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + iqpi = global_upi; +} + +// CHECK-LABEL: define {{.*}}void @test_store_data_ia() +void test_store_data_ia() { +// CHECK: [[V:%.*]] = alloca ptr, +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_aqpi, +// CHECK-NEXT: [[OLDDISC:%.*]] = call i64 @llvm.ptrauth.blend(i64 ptrtoint (ptr @global_aqpi to i64), i64 50) +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[T0]], i32 1, i64 [[OLDDISC]], i32 1, i64 50) +// CHECK-NEXT: [[SIGNED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[SIGNED]], {{.*}} ] +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + int * IQ iqpi = global_aqpi; +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_aqpi, +// CHECK-NEXT: [[OLDDISC:%.*]] = call i64 @llvm.ptrauth.blend(i64 ptrtoint (ptr @global_aqpi to i64), i64 50) +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[T0]], i32 1, i64 [[OLDDISC]], i32 1, i64 50) +// CHECK-NEXT: [[SIGNED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[SIGNED]], 
{{.*}} ] +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + iqpi = global_aqpi; +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_aqpi, +// CHECK-NEXT: [[OLDDISC:%.*]] = call i64 @llvm.ptrauth.blend(i64 ptrtoint (ptr @global_aqpi to i64), i64 50) +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[T0]], i32 1, i64 [[OLDDISC]], i32 1, i64 50) +// CHECK-NEXT: [[SIGNED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[RESULT:%.*]] = phi ptr [ null, {{.*}} ], [ [[SIGNED]], {{.*}} ] +// CHECK-NEXT: store ptr [[RESULT]], ptr [[V]], +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[RESULT]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[RESULT]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[T0]], i32 1, i64 50) +// CHECK-NEXT: [[AUTHED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[RESULT:%.*]] = phi ptr [ null, {{.*}} ], [ [[AUTHED]], {{.*}} ] +// CHECK-NEXT: call void @use_upi(ptr noundef [[RESULT]]) + use_upi(iqpi = global_aqpi); +} + +// CHECK-LABEL: define {{.*}}void @test_store_data_ii_same() +void test_store_data_ii_same() { +// CHECK: [[V:%.*]] = alloca ptr, +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_iqpi, +// CHECK-NEXT: store ptr [[LOAD]], ptr [[V]], + int * IQ iqpi = global_iqpi; +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_iqpi, +// CHECK-NEXT: store ptr [[LOAD]], ptr [[V]], + iqpi = global_iqpi; +} + +// CHECK-LABEL: define {{.*}}void @test_store_data_ii_different() +void test_store_data_ii_different() { +// CHECK: [[V:%.*]] = alloca ptr, +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_iqpi, +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.resign(i64 
[[T0]], i32 1, i64 50, i32 1, i64 100) +// CHECK-NEXT: [[SIGNED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[SIGNED]], {{.*}} ] +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + int * DIFF_IQ iqpi = global_iqpi; +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_iqpi, +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[T0]], i32 1, i64 50, i32 1, i64 100) +// CHECK-NEXT: [[SIGNED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[SIGNED]], {{.*}} ] +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + iqpi = global_iqpi; +} + +// CHECK-LABEL: define {{.*}}void @test_store_data_ii_zero() +void test_store_data_ii_zero() { +// CHECK: [[V:%.*]] = alloca ptr, +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_iqpi, +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[T0]], i32 1, i64 50, i32 1, i64 0) +// CHECK-NEXT: [[SIGNED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[SIGNED]], {{.*}} ] +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + int * ZERO_IQ iqpi = global_iqpi; +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr [[V]] +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[T0]], i32 1, i64 0, i32 1, i64 50) +// CHECK-NEXT: [[SIGNED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[SIGNED]], {{.*}} ] +// CHECK-NEXT: store ptr [[T0]], ptr @global_iqpi, + global_iqpi = iqpi; 
+} + +// CHECK-LABEL: define {{.*}}void @test_load_data_i() +void test_load_data_i() { +// CHECK: [[V:%.*]] = alloca ptr, +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_iqpi, +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[T0]], i32 1, i64 50) +// CHECK-NEXT: [[AUTHED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[AUTHED]], {{.*}} ] +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + int *upi = global_iqpi; +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_iqpi, +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[T0]], i32 1, i64 50) +// CHECK-NEXT: [[AUTHED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[AUTHED]], {{.*}} ] +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + upi = global_iqpi; +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_iqpi, +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[T0]], i32 1, i64 50) +// CHECK-NEXT: [[AUTHED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[AUTHED]], {{.*}} ] +// CHECK-NEXT: call void @use_upi(ptr noundef [[T0]]) + use_upi(global_iqpi); +} + +// Data with address-discriminated qualifiers. 
+ +// CHECK-LABEL: define {{.*}}void @test_store_data_a_constant() +void test_store_data_a_constant() { +// CHECK: [[V:%.*]] = alloca ptr, +// CHECK-NEXT: [[T0:%.*]] = ptrtoint ptr [[V]] to i64 +// CHECK-NEXT: [[NEWDISC:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[T0]], i64 50) +// CHECK-NEXT: [[SIGN:%.*]] = call i64 @llvm.ptrauth.sign(i64 ptrtoint (ptr @external_int to i64), i32 1, i64 [[NEWDISC]]) +// CHECK-NEXT: [[T0:%.*]] = inttoptr i64 [[SIGN]] to ptr +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + int * AQ aqpi = &external_int; +// CHECK-NEXT: [[T0:%.*]] = ptrtoint ptr [[V]] to i64 +// CHECK-NEXT: [[NEWDISC:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[T0]], i64 50) +// CHECK-NEXT: [[SIGN:%.*]] = call i64 @llvm.ptrauth.sign(i64 ptrtoint (ptr @external_int to i64), i32 1, i64 [[NEWDISC]]) +// CHECK-NEXT: [[T0:%.*]] = inttoptr i64 [[SIGN]] to ptr +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + aqpi = &external_int; +} + +// CHECK-LABEL: define {{.*}}void @test_store_data_au() +void test_store_data_au() { +// CHECK: [[V:%.*]] = alloca ptr, +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_upi, +// CHECK-NEXT: [[T0:%.*]] = ptrtoint ptr [[V]] to i64 +// CHECK-NEXT: [[NEWDISC:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[T0]], i64 50) +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.sign(i64 [[T0]], i32 1, i64 [[NEWDISC]]) +// CHECK-NEXT: [[SIGNED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[SIGNED]], {{.*}} ] +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + int * AQ aqpi = global_upi; +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_upi, +// CHECK-NEXT: [[T0:%.*]] = ptrtoint ptr [[V]] to i64 +// CHECK-NEXT: [[NEWDISC:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[T0]], i64 50) +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// 
CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.sign(i64 [[T0]], i32 1, i64 [[NEWDISC]]) +// CHECK-NEXT: [[SIGNED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[SIGNED]], {{.*}} ] +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + aqpi = global_upi; +} + +// CHECK-LABEL: define {{.*}}void @test_store_data_ai() +void test_store_data_ai() { +// CHECK: [[V:%.*]] = alloca ptr, +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_iqpi, +// CHECK-NEXT: [[T0:%.*]] = ptrtoint ptr [[V]] to i64 +// CHECK-NEXT: [[NEWDISC:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[T0]], i64 50) +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[T0]], i32 1, i64 50, i32 1, i64 [[NEWDISC]]) +// CHECK-NEXT: [[SIGNED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[SIGNED]], {{.*}} ] +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + int * AQ aqpi = global_iqpi; +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_iqpi, +// CHECK-NEXT: [[T0:%.*]] = ptrtoint ptr [[V]] to i64 +// CHECK-NEXT: [[NEWDISC:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[T0]], i64 50) +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[T0]], i32 1, i64 50, i32 1, i64 [[NEWDISC]]) +// CHECK-NEXT: [[SIGNED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[SIGNED]], {{.*}} ] +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + aqpi = global_iqpi; +} + +// CHECK-LABEL: define {{.*}}void @test_store_data_aa_same() +void test_store_data_aa_same() { +// CHECK: [[V:%.*]] = alloca ptr, +// CHECK-NEXT: 
[[LOAD:%.*]] = load ptr, ptr @global_aqpi, +// CHECK-NEXT: [[OLDDISC:%.*]] = call i64 @llvm.ptrauth.blend(i64 ptrtoint (ptr @global_aqpi to i64), i64 50) +// CHECK-NEXT: [[T0:%.*]] = ptrtoint ptr [[V]] to i64 +// CHECK-NEXT: [[NEWDISC:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[T0]], i64 50) +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[T0]], i32 1, i64 [[OLDDISC]], i32 1, i64 [[NEWDISC]]) +// CHECK-NEXT: [[SIGNED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[SIGNED]], {{.*}} ] +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + int * AQ aqpi = global_aqpi; +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_aqpi, +// CHECK-NEXT: [[OLDDISC:%.*]] = call i64 @llvm.ptrauth.blend(i64 ptrtoint (ptr @global_aqpi to i64), i64 50) +// CHECK-NEXT: [[T0:%.*]] = ptrtoint ptr [[V]] to i64 +// CHECK-NEXT: [[NEWDISC:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[T0]], i64 50) +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[T0]], i32 1, i64 [[OLDDISC]], i32 1, i64 [[NEWDISC]]) +// CHECK-NEXT: [[SIGNED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[SIGNED]], {{.*}} ] +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + aqpi = global_aqpi; +} + +// CHECK-LABEL: define {{.*}}void @test_store_data_aa_different() +void test_store_data_aa_different() { +// CHECK: [[V:%.*]] = alloca ptr, +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_aqpi, +// CHECK-NEXT: [[OLDDISC:%.*]] = call i64 @llvm.ptrauth.blend(i64 ptrtoint (ptr @global_aqpi to i64), i64 50) +// CHECK-NEXT: [[T0:%.*]] = ptrtoint ptr [[V]] to i64 +// CHECK-NEXT: [[NEWDISC:%.*]] = call i64 
@llvm.ptrauth.blend(i64 [[T0]], i64 100) +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[T0]], i32 1, i64 [[OLDDISC]], i32 1, i64 [[NEWDISC]]) +// CHECK-NEXT: [[SIGNED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[SIGNED]], {{.*}} ] +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + int * DIFF_AQ aqpi = global_aqpi; +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_aqpi, +// CHECK-NEXT: [[OLDDISC:%.*]] = call i64 @llvm.ptrauth.blend(i64 ptrtoint (ptr @global_aqpi to i64), i64 50) +// CHECK-NEXT: [[T0:%.*]] = ptrtoint ptr [[V]] to i64 +// CHECK-NEXT: [[NEWDISC:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[T0]], i64 100) +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[T0]], i32 1, i64 [[OLDDISC]], i32 1, i64 [[NEWDISC]]) +// CHECK-NEXT: [[SIGNED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[SIGNED]], {{.*}} ] +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + aqpi = global_aqpi; +} + +// CHECK-LABEL: define {{.*}}void @test_store_data_aa_zero() +void test_store_data_aa_zero() { +// CHECK: [[V:%.*]] = alloca ptr, +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_aqpi, +// CHECK-NEXT: [[OLDDISC:%.*]] = call i64 @llvm.ptrauth.blend(i64 ptrtoint (ptr @global_aqpi to i64), i64 50) +// CHECK-NEXT: [[NEWDISC:%.*]] = ptrtoint ptr [[V]] to i64 +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[T0]], i32 1, i64 [[OLDDISC]], i32 1, i64 [[NEWDISC]]) +// CHECK-NEXT: [[SIGNED:%.*]] = inttoptr i64 
[[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[SIGNED]], {{.*}} ] +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + int * ZERO_AQ aqpi = global_aqpi; +// CHECK: [[LOAD:%.*]] = load ptr, ptr [[V]], +// CHECK-NEXT: [[OLDDISC:%.*]] = ptrtoint ptr [[V]] to i64 +// CHECK-NEXT: [[NEWDISC:%.*]] = call i64 @llvm.ptrauth.blend(i64 ptrtoint (ptr @global_aqpi to i64), i64 50) +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[T0]], i32 1, i64 [[OLDDISC]], i32 1, i64 [[NEWDISC]]) +// CHECK-NEXT: [[SIGNED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[SIGNED]], {{.*}} ] +// CHECK-NEXT: store ptr [[T0]], ptr @global_aqpi, + global_aqpi = aqpi; +} + +// CHECK-LABEL: define {{.*}}void @test_load_data_a() +void test_load_data_a() { +// CHECK: [[V:%.*]] = alloca ptr, +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_aqpi, +// CHECK-NEXT: [[OLDDISC:%.*]] = call i64 @llvm.ptrauth.blend(i64 ptrtoint (ptr @global_aqpi to i64), i64 50) +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[T0]], i32 1, i64 [[OLDDISC]]) +// CHECK-NEXT: [[AUTHED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[AUTHED]], {{.*}} ] +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + int *upi = global_aqpi; +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_aqpi, +// CHECK-NEXT: [[OLDDISC:%.*]] = call i64 @llvm.ptrauth.blend(i64 ptrtoint (ptr @global_aqpi to i64), i64 50) +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call 
i64 @llvm.ptrauth.auth(i64 [[T0]], i32 1, i64 [[OLDDISC]]) +// CHECK-NEXT: [[AUTHED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[AUTHED]], {{.*}} ] +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + upi = global_aqpi; +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_aqpi, +// CHECK-NEXT: [[OLDDISC:%.*]] = call i64 @llvm.ptrauth.blend(i64 ptrtoint (ptr @global_aqpi to i64), i64 50) +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[T0]], i32 1, i64 [[OLDDISC]]) +// CHECK-NEXT: [[AUTHED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[AUTHED]], {{.*}} ] +// CHECK-NEXT: call void @use_upi(ptr noundef [[T0]]) + use_upi(global_aqpi); +} + +// Function with address-independent qualifiers. + +// CHECK-LABEL: define {{.*}}void @test_store_function_i_constant() +void test_store_function_i_constant() { +// CHECK: [[V:%.*]] = alloca ptr, +// CHECK-NEXT: [[SIGN:%.*]] = call i64 @llvm.ptrauth.resign(i64 ptrtoint (ptr ptrauth (ptr @external_func, i32 0, i64 18983) to i64), i32 0, i64 18983, i32 1, i64 50) +// CHECK-NEXT: [[T0:%.*]] = inttoptr i64 [[SIGN]] to ptr +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + func_t * IQ iqpf = &external_func; +// CHECK-NEXT: [[SIGN:%.*]] = call i64 @llvm.ptrauth.resign(i64 ptrtoint (ptr ptrauth (ptr @external_func, i32 0, i64 18983) to i64), i32 0, i64 18983, i32 1, i64 50) +// CHECK-NEXT: [[T0:%.*]] = inttoptr i64 [[SIGN]] to ptr +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + iqpf = &external_func; +} + +// CHECK-LABEL: define {{.*}}void @test_store_function_iu() +void test_store_function_iu() { +// CHECK: [[V:%.*]] = alloca ptr, +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_upf, +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 
[[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[T0]], i32 0, i64 18983, i32 1, i64 50) +// CHECK-NEXT: [[SIGNED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[SIGNED]], {{.*}} ] +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + func_t * IQ iqpf = global_upf; +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_upf, +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[T0]], i32 0, i64 18983, i32 1, i64 50) +// CHECK-NEXT: [[SIGNED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[SIGNED]], {{.*}} ] +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + iqpf = global_upf; +} + +// CHECK-LABEL: define {{.*}}void @test_store_function_ia() +void test_store_function_ia() { +// CHECK: [[V:%.*]] = alloca ptr, +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_aqpf, +// CHECK-NEXT: [[OLDDISC:%.*]] = call i64 @llvm.ptrauth.blend(i64 ptrtoint (ptr @global_aqpf to i64), i64 50) +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[T0]], i32 1, i64 [[OLDDISC]], i32 1, i64 50) +// CHECK-NEXT: [[SIGNED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[SIGNED]], {{.*}} ] +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + func_t * IQ iqpf = global_aqpf; +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_aqpf, +// CHECK-NEXT: [[OLDDISC:%.*]] = call i64 @llvm.ptrauth.blend(i64 ptrtoint (ptr @global_aqpf to i64), i64 50) +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = 
ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[T0]], i32 1, i64 [[OLDDISC]], i32 1, i64 50) +// CHECK-NEXT: [[SIGNED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[SIGNED]], {{.*}} ] +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + iqpf = global_aqpf; +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_aqpf, +// CHECK-NEXT: [[OLDDISC:%.*]] = call i64 @llvm.ptrauth.blend(i64 ptrtoint (ptr @global_aqpf to i64), i64 50) +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[T0]], i32 1, i64 [[OLDDISC]], i32 1, i64 50) +// CHECK-NEXT: [[SIGNED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[RESULT:%.*]] = phi ptr [ null, {{.*}} ], [ [[SIGNED]], {{.*}} ] +// CHECK-NEXT: store ptr [[RESULT]], ptr [[V]], +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[RESULT]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[RESULT]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[T0]], i32 1, i64 50, i32 0, i64 18983) +// CHECK-NEXT: [[SIGNED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[SIGNED]], {{.*}} ] +// CHECK-NEXT: call void @use_upf(ptr noundef [[T0]]) + use_upf(iqpf = global_aqpf); +} + +// CHECK-LABEL: define {{.*}}void @test_store_function_ii_same() +void test_store_function_ii_same() { +// CHECK: [[V:%.*]] = alloca ptr, +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_iqpf, +// CHECK-NEXT: store ptr [[LOAD]], ptr [[V]], + func_t * IQ iqpf = global_iqpf; +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_iqpf, +// CHECK-NEXT: store ptr [[LOAD]], ptr [[V]], + iqpf = global_iqpf; +} + +// CHECK-LABEL: define {{.*}}void @test_store_function_ii_different() +void 
test_store_function_ii_different() { +// CHECK: [[V:%.*]] = alloca ptr, +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_iqpf, +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[T0]], i32 1, i64 50, i32 1, i64 100) +// CHECK-NEXT: [[SIGNED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[SIGNED]], {{.*}} ] +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + func_t * DIFF_IQ iqpf = global_iqpf; +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_iqpf, +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[T0]], i32 1, i64 50, i32 1, i64 100) +// CHECK-NEXT: [[SIGNED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[SIGNED]], {{.*}} ] +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + iqpf = global_iqpf; +} + +// CHECK-LABEL: define {{.*}}void @test_load_function_i() +void test_load_function_i() { +// CHECK: [[V:%.*]] = alloca ptr, +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_iqpf, +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[T0]], i32 1, i64 50, i32 0, i64 18983) +// CHECK-NEXT: [[SIGNED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[SIGNED]], {{.*}} ] +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + func_t *upf = global_iqpf; +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_iqpf, +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to 
i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[T0]], i32 1, i64 50, i32 0, i64 18983) +// CHECK-NEXT: [[SIGNED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[SIGNED]], {{.*}} ] +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + upf = global_iqpf; +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_iqpf, +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[T0]], i32 1, i64 50, i32 0, i64 18983) +// CHECK-NEXT: [[SIGNED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[SIGNED]], {{.*}} ] +// CHECK-NEXT: call void @use_upf(ptr noundef [[T0]]) + use_upf(global_iqpf); +} + +// Function with address-discriminated qualifiers. + +// CHECK-LABEL: define {{.*}}void @test_store_function_a_constant() +void test_store_function_a_constant() { +// CHECK: [[V:%.*]] = alloca ptr, +// CHECK-NEXT: [[T0:%.*]] = ptrtoint ptr [[V]] to i64 +// CHECK-NEXT: [[NEWDISC:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[T0]], i64 50) +// CHECK-NEXT: [[SIGN:%.*]] = call i64 @llvm.ptrauth.resign(i64 ptrtoint (ptr ptrauth (ptr @external_func, i32 0, i64 18983) to i64), i32 0, i64 18983, i32 1, i64 [[NEWDISC]]) +// CHECK-NEXT: [[T0:%.*]] = inttoptr i64 [[SIGN]] to ptr +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + func_t * AQ aqpf = &external_func; +// CHECK-NEXT: [[T0:%.*]] = ptrtoint ptr [[V]] to i64 +// CHECK-NEXT: [[NEWDISC:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[T0]], i64 50) +// CHECK-NEXT: [[SIGN:%.*]] = call i64 @llvm.ptrauth.resign(i64 ptrtoint (ptr ptrauth (ptr @external_func, i32 0, i64 18983) to i64), i32 0, i64 18983, i32 1, i64 [[NEWDISC]]) +// CHECK-NEXT: [[T0:%.*]] = inttoptr i64 [[SIGN]] to ptr +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + aqpf = &external_func; +} + +// CHECK-LABEL: 
define {{.*}}void @test_store_function_au() +void test_store_function_au() { +// CHECK: [[V:%.*]] = alloca ptr, +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_upf, +// CHECK-NEXT: [[T0:%.*]] = ptrtoint ptr [[V]] to i64 +// CHECK-NEXT: [[NEWDISC:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[T0]], i64 50) +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[T0]], i32 0, i64 18983, i32 1, i64 [[NEWDISC]]) +// CHECK-NEXT: [[SIGNED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[SIGNED]], {{.*}} ] +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + func_t * AQ aqpf = global_upf; +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_upf, +// CHECK-NEXT: [[T0:%.*]] = ptrtoint ptr [[V]] to i64 +// CHECK-NEXT: [[NEWDISC:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[T0]], i64 50) +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[T0]], i32 0, i64 18983, i32 1, i64 [[NEWDISC]]) +// CHECK-NEXT: [[SIGNED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[SIGNED]], {{.*}} ] +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + aqpf = global_upf; +} + +// CHECK-LABEL: define {{.*}}void @test_store_function_ai() +void test_store_function_ai() { +// CHECK: [[V:%.*]] = alloca ptr, +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_iqpf, +// CHECK-NEXT: [[T0:%.*]] = ptrtoint ptr [[V]] to i64 +// CHECK-NEXT: [[NEWDISC:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[T0]], i64 50) +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 
@llvm.ptrauth.resign(i64 [[T0]], i32 1, i64 50, i32 1, i64 [[NEWDISC]]) +// CHECK-NEXT: [[SIGNED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[SIGNED]], {{.*}} ] +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + func_t * AQ aqpf = global_iqpf; +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_iqpf, +// CHECK-NEXT: [[T0:%.*]] = ptrtoint ptr [[V]] to i64 +// CHECK-NEXT: [[NEWDISC:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[T0]], i64 50) +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[T0]], i32 1, i64 50, i32 1, i64 [[NEWDISC]]) +// CHECK-NEXT: [[SIGNED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[SIGNED]], {{.*}} ] +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + aqpf = global_iqpf; +} + +// CHECK-LABEL: define {{.*}}void @test_store_function_aa_same() +void test_store_function_aa_same() { +// CHECK: [[V:%.*]] = alloca ptr, +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_aqpf, +// CHECK-NEXT: [[OLDDISC:%.*]] = call i64 @llvm.ptrauth.blend(i64 ptrtoint (ptr @global_aqpf to i64), i64 50) +// CHECK-NEXT: [[T0:%.*]] = ptrtoint ptr [[V]] to i64 +// CHECK-NEXT: [[NEWDISC:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[T0]], i64 50) +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[T0]], i32 1, i64 [[OLDDISC]], i32 1, i64 [[NEWDISC]]) +// CHECK-NEXT: [[SIGNED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[SIGNED]], {{.*}} ] +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + func_t * AQ aqpf = global_aqpf; +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_aqpf, +// 
CHECK-NEXT: [[OLDDISC:%.*]] = call i64 @llvm.ptrauth.blend(i64 ptrtoint (ptr @global_aqpf to i64), i64 50) +// CHECK-NEXT: [[T0:%.*]] = ptrtoint ptr [[V]] to i64 +// CHECK-NEXT: [[NEWDISC:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[T0]], i64 50) +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[T0]], i32 1, i64 [[OLDDISC]], i32 1, i64 [[NEWDISC]]) +// CHECK-NEXT: [[SIGNED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[SIGNED]], {{.*}} ] +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + aqpf = global_aqpf; +} + +// CHECK-LABEL: define {{.*}}void @test_store_function_aa_different() +void test_store_function_aa_different() { +// CHECK: [[V:%.*]] = alloca ptr, +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_aqpf, +// CHECK-NEXT: [[OLDDISC:%.*]] = call i64 @llvm.ptrauth.blend(i64 ptrtoint (ptr @global_aqpf to i64), i64 50) +// CHECK-NEXT: [[T0:%.*]] = ptrtoint ptr [[V]] to i64 +// CHECK-NEXT: [[NEWDISC:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[T0]], i64 100) +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[T0]], i32 1, i64 [[OLDDISC]], i32 1, i64 [[NEWDISC]]) +// CHECK-NEXT: [[SIGNED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[SIGNED]], {{.*}} ] +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + func_t * DIFF_AQ aqpf = global_aqpf; +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_aqpf, +// CHECK-NEXT: [[OLDDISC:%.*]] = call i64 @llvm.ptrauth.blend(i64 ptrtoint (ptr @global_aqpf to i64), i64 50) +// CHECK-NEXT: [[T0:%.*]] = ptrtoint ptr [[V]] to i64 +// CHECK-NEXT: [[NEWDISC:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[T0]], i64 100) 
+// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[T0]], i32 1, i64 [[OLDDISC]], i32 1, i64 [[NEWDISC]]) +// CHECK-NEXT: [[SIGNED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[SIGNED]], {{.*}} ] +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + aqpf = global_aqpf; +} + +// CHECK-LABEL: define {{.*}}void @test_load_function_a() +void test_load_function_a() { +// CHECK: [[V:%.*]] = alloca ptr, +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_aqpf, +// CHECK-NEXT: [[OLDDISC:%.*]] = call i64 @llvm.ptrauth.blend(i64 ptrtoint (ptr @global_aqpf to i64), i64 50) +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[T0]], i32 1, i64 [[OLDDISC]], i32 0, i64 18983) +// CHECK-NEXT: [[SIGNED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[SIGNED]], {{.*}} ] +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + func_t *upf = global_aqpf; +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_aqpf, +// CHECK-NEXT: [[OLDDISC:%.*]] = call i64 @llvm.ptrauth.blend(i64 ptrtoint (ptr @global_aqpf to i64), i64 50) +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[T0]], i32 1, i64 [[OLDDISC]], i32 0, i64 18983) +// CHECK-NEXT: [[SIGNED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[SIGNED]], {{.*}} ] +// CHECK-NEXT: store ptr [[T0]], ptr [[V]], + upf = global_aqpf; +// CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr @global_aqpf, +// CHECK-NEXT: 
[[OLDDISC:%.*]] = call i64 @llvm.ptrauth.blend(i64 ptrtoint (ptr @global_aqpf to i64), i64 50) +// CHECK-NEXT: [[T0:%.*]] = icmp ne ptr [[LOAD]], null +// CHECK-NEXT: br i1 [[T0]], +// CHECK: [[T0:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[T0]], i32 1, i64 [[OLDDISC]], i32 0, i64 18983) +// CHECK-NEXT: [[SIGNED:%.*]] = inttoptr i64 [[T1]] to ptr +// CHECK-NEXT: br label +// CHECK: [[T0:%.*]] = phi ptr [ null, {{.*}} ], [ [[SIGNED]], {{.*}} ] +// CHECK-NEXT: call void @use_upf(ptr noundef [[T0]]) + use_upf(global_aqpf); +} diff --git a/clang/test/CodeGenCXX/mangle-itanium-ptrauth.cpp b/clang/test/CodeGenCXX/mangle-itanium-ptrauth.cpp new file mode 100644 index 0000000000000..88d80423c3764 --- /dev/null +++ b/clang/test/CodeGenCXX/mangle-itanium-ptrauth.cpp @@ -0,0 +1,12 @@ +// RUN: %clang_cc1 -std=c++11 -fptrauth-intrinsics -fptrauth-calls -emit-llvm -o - -triple=arm64-apple-ios %s | FileCheck %s +// RUN: %clang_cc1 -std=c++11 -fptrauth-intrinsics -fptrauth-calls -emit-llvm -o - -triple=aarch64-linux-gnu %s | FileCheck %s + +// CHECK: define {{.*}}void @_Z3fooPU9__ptrauthILj3ELb1ELj234EEPi( +void foo(int * __ptrauth(3, 1, 234) *) {} + +template +void foo(T t) {} + +// CHECK: define weak_odr void @_Z3fooIPU9__ptrauthILj1ELb0ELj64EEPiEvT_( +template void foo(int * __ptrauth(1, 0, 64) *); + diff --git a/clang/test/CodeGenCXX/mangle-ms-ptrauth.cpp b/clang/test/CodeGenCXX/mangle-ms-ptrauth.cpp new file mode 100644 index 0000000000000..95e5efa472dfd --- /dev/null +++ b/clang/test/CodeGenCXX/mangle-ms-ptrauth.cpp @@ -0,0 +1,17 @@ +// RUN: %clang_cc1 -std=c++11 -fptrauth-intrinsics -fptrauth-calls -emit-llvm -o - -triple=aarch64-windows-msvc %s | FileCheck %s + +template +struct S {}; + +// CHECK: @"?s@@3U?$S@PE__ptrauth1A@ENC@AH@@A" = +S s; + +// CHECK: define dso_local void @"?foo@@YAXPEAPE__ptrauth20OK@AH@Z"( +void foo(int * __ptrauth(3, 1, 234) *) {} + +template +void foo(T t) {} + +// CHECK: define weak_odr 
dso_local void @"??$foo@PEAPE__ptrauth0A@EA@AH@@YAXPEAPE__ptrauth0A@EA@AH@Z"( +template void foo(int * __ptrauth(1, 0, 64) *); + diff --git a/clang/test/CodeGenCXX/ptrauth-qualifier-struct.cpp b/clang/test/CodeGenCXX/ptrauth-qualifier-struct.cpp new file mode 100644 index 0000000000000..7d6de50d926b5 --- /dev/null +++ b/clang/test/CodeGenCXX/ptrauth-qualifier-struct.cpp @@ -0,0 +1,168 @@ +// RUN: %clang_cc1 -triple arm64-apple-ios -fptrauth-calls -fptrauth-intrinsics -std=c++11 -emit-llvm %s -o - | FileCheck -check-prefixes=CHECK,IOS %s +// RUN: %clang_cc1 -triple aarch64-linux-gnu -fptrauth-calls -fptrauth-intrinsics -std=c++11 -emit-llvm %s -o - | FileCheck %s + +#define AQ __ptrauth(1,1,50) +#define IQ __ptrauth(1,0,50) + +// CHECK: %[[STRUCT_SA:.*]] = type { ptr, ptr } +// CHECK: %[[STRUCT_SI:.*]] = type { ptr } + +struct SA { + int * AQ m0; // Signed using address discrimination. + int * AQ m1; // Signed using address discrimination. +}; + +struct SI { + int * IQ m; // No address discrimination. +}; + +struct __attribute__((trivial_abi)) TrivialSA { + int * AQ m0; // Signed using address discrimination. + int * AQ m1; // Signed using address discrimination. +}; + +// Check that TrivialSA is passed indirectly despite being annotated with +// 'trivial_abi'. 
+ +// CHECK: define {{.*}}void @_Z18testParamTrivialSA9TrivialSA(ptr noundef %{{.*}}) + +void testParamTrivialSA(TrivialSA a) { +} + +// CHECK: define {{.*}}void @_Z19testCopyConstructor2SA(ptr +// CHECK: call {{.*}}@_ZN2SAC1ERKS_( + +// CHECK: define linkonce_odr {{.*}}@_ZN2SAC1ERKS_( +// CHECK: call {{.*}}@_ZN2SAC2ERKS_( + +void testCopyConstructor(SA a) { + SA t = a; +} + +// CHECK: define {{.*}}void @_Z19testMoveConstructor2SA(ptr +// CHECK: call {{.*}}@_ZN2SAC1EOS_( + +// CHECK: define linkonce_odr {{.*}}@_ZN2SAC1EOS_( +// CHECK: call {{.*}}@_ZN2SAC2EOS_( + +void testMoveConstructor(SA a) { + SA t = static_cast(a); +} + +// CHECK: define {{.*}}void @_Z18testCopyAssignment2SA(ptr +// CHECK: call noundef nonnull align 8 dereferenceable(16) ptr @_ZN2SAaSERKS_( + +// CHECK: define {{.*}}linkonce_odr noundef nonnull align 8 dereferenceable(16) ptr @_ZN2SAaSERKS_(ptr noundef nonnull align 8 dereferenceable(16) %[[THIS:.*]], ptr noundef nonnull align 8 dereferenceable(16) %0) +// CHECK: %[[THIS_ADDR:.*]] = alloca ptr, align 8 +// CHECK: %[[_ADDR:.*]] = alloca ptr, align 8 +// CHECK: store ptr %[[THIS]], ptr %[[THIS_ADDR]], align 8 +// CHECK: store ptr %[[V0:.*]], ptr %[[_ADDR]], align 8 +// CHECK: %[[THISI:.*]] = load ptr, ptr %[[THIS_ADDR]], align 8 +// CHECK: %[[M0:.*]] = getelementptr inbounds nuw %[[STRUCT_SA]], ptr %[[THISI]], i32 0, i32 0 +// CHECK: %[[V1:.*]] = load ptr, ptr %[[_ADDR]], align 8 +// CHECK: %[[M02:.*]] = getelementptr inbounds nuw %[[STRUCT_SA]], ptr %[[V1]], i32 0, i32 0 +// CHECK: %[[V2:.*]] = load ptr, ptr %[[M02]], align 8 +// CHECK: %[[V3:.*]] = ptrtoint ptr %[[M02]] to i64 +// CHECK: %[[V4:.*]] = call i64 @llvm.ptrauth.blend(i64 %[[V3]], i64 50) +// CHECK: %[[V5:.*]] = ptrtoint ptr %[[M0]] to i64 +// CHECK: %[[V6:.*]] = call i64 @llvm.ptrauth.blend(i64 %[[V5]], i64 50) +// CHECK: %[[V8:.*]] = ptrtoint ptr %[[V2]] to i64 +// CHECK: %[[V9:.*]] = call i64 @llvm.ptrauth.resign(i64 %[[V8]], i32 1, i64 %[[V4]], i32 1, i64 %[[V6]]) + +void 
testCopyAssignment(SA a) { + SA t; + t = a; +} + +// CHECK: define {{.*}}void @_Z18testMoveAssignment2SA(ptr +// CHECK: call noundef nonnull align 8 dereferenceable(16) ptr @_ZN2SAaSEOS_( + +// CHECK: define {{.*}}linkonce_odr noundef nonnull align 8 dereferenceable(16) ptr @_ZN2SAaSEOS_(ptr noundef nonnull align 8 dereferenceable(16) %[[THIS:.*]], ptr noundef nonnull align 8 dereferenceable(16) %0) +// CHECK: %[[THIS_ADDR:.*]] = alloca ptr, align 8 +// CHECK: %[[_ADDR:.*]] = alloca ptr, align 8 +// CHECK: store ptr %[[THIS]], ptr %[[THIS_ADDR]], align 8 +// CHECK: store ptr %[[V0:.*]], ptr %[[_ADDR]], align 8 +// CHECK: %[[THISI:.*]] = load ptr, ptr %[[THIS_ADDR]], align 8 +// CHECK: %[[M0:.*]] = getelementptr inbounds nuw %[[STRUCT_SA]], ptr %[[THISI]], i32 0, i32 0 +// CHECK: %[[V1:.*]] = load ptr, ptr %[[_ADDR]], align 8 +// CHECK: %[[M02:.*]] = getelementptr inbounds nuw %[[STRUCT_SA]], ptr %[[V1]], i32 0, i32 0 +// CHECK: %[[V2:.*]] = load ptr, ptr %[[M02]], align 8 +// CHECK: %[[V3:.*]] = ptrtoint ptr %[[M02]] to i64 +// CHECK: %[[V4:.*]] = call i64 @llvm.ptrauth.blend(i64 %[[V3]], i64 50) +// CHECK: %[[V5:.*]] = ptrtoint ptr %[[M0]] to i64 +// CHECK: %[[V6:.*]] = call i64 @llvm.ptrauth.blend(i64 %[[V5]], i64 50) +// CHECK: %[[V8:.*]] = ptrtoint ptr %[[V2]] to i64 +// CHECK: %[[V9:.*]] = call i64 @llvm.ptrauth.resign(i64 %[[V8]], i32 1, i64 %[[V4]], i32 1, i64 %[[V6]]) + +void testMoveAssignment(SA a) { + SA t; + t = static_cast(a); +} + +// CHECK: define {{.*}}void @_Z19testCopyConstructor2SI(i +// CHECK: call void @llvm.memcpy.p0.p0.i64( + +void testCopyConstructor(SI a) { + SI t = a; +} + +// CHECK: define {{.*}}void @_Z19testMoveConstructor2SI( +// CHECK: call void @llvm.memcpy.p0.p0.i64( + +void testMoveConstructor(SI a) { + SI t = static_cast(a); +} + +// CHECK: define {{.*}}void @_Z18testCopyAssignment2SI( +// CHECK: call void @llvm.memcpy.p0.p0.i64( + +void testCopyAssignment(SI a) { + SI t; + t = a; +} + +// CHECK: define {{.*}}void 
@_Z18testMoveAssignment2SI( +// CHECK: call void @llvm.memcpy.p0.p0.i64( + +void testMoveAssignment(SI a) { + SI t; + t = static_cast(a); +} + +// CHECK: define linkonce_odr {{.*}}@_ZN2SAC2ERKS_(ptr noundef nonnull align 8 dereferenceable(16) %[[THIS:.*]], ptr noundef nonnull align 8 dereferenceable(16) %0) +// IOS: %[[RETVAL:.*]] = alloca ptr, align 8 +// CHECK: %[[THIS_ADDR:.*]] = alloca ptr, align 8 +// CHECK: %[[_ADDR:.*]] = alloca ptr, align 8 +// CHECK: store ptr %[[THIS]], ptr %[[THIS_ADDR]], align 8 +// CHECK: store ptr %[[V0:.*]], ptr %[[_ADDR]], align 8 +// CHECK: %[[THIS1:.*]] = load ptr, ptr %[[THIS_ADDR]], align 8 +// IOS: store ptr %[[THIS1]], ptr %[[RETVAL]], align 8 +// CHECK: %[[M0:.*]] = getelementptr inbounds nuw %[[STRUCT_SA]], ptr %[[THIS1]], i32 0, i32 0 +// CHECK: %[[V1:.*]] = load ptr, ptr %[[_ADDR]], align 8 +// CHECK: %[[M02:.*]] = getelementptr inbounds nuw %[[STRUCT_SA]], ptr %[[V1]], i32 0, i32 0 +// CHECK: %[[V2:.*]] = load ptr, ptr %[[M02]], align 8 +// CHECK: %[[V3:.*]] = ptrtoint ptr %[[M02]] to i64 +// CHECK: %[[V4:.*]] = call i64 @llvm.ptrauth.blend(i64 %[[V3]], i64 50) +// CHECK: %[[V5:.*]] = ptrtoint ptr %[[M0]] to i64 +// CHECK: %[[V6:.*]] = call i64 @llvm.ptrauth.blend(i64 %[[V5]], i64 50) +// CHECK: %[[V8:.*]] = ptrtoint ptr %[[V2]] to i64 +// CHECK: %[[V9:.*]] = call i64 @llvm.ptrauth.resign(i64 %[[V8]], i32 1, i64 %[[V4]], i32 1, i64 %[[V6]]) + +// CHECK: define linkonce_odr {{.*}}@_ZN2SAC2EOS_(ptr noundef nonnull align 8 dereferenceable(16) %[[THIS:.*]], ptr noundef nonnull align 8 dereferenceable(16) %0) +// IOS: %[[RETVAL:.*]] = alloca ptr, align 8 +// CHECK: %[[THIS_ADDR:.*]] = alloca ptr, align 8 +// CHECK: %[[_ADDR:.*]] = alloca ptr, align 8 +// CHECK: store ptr %[[THIS]], ptr %[[THIS_ADDR]], align 8 +// CHECK: store ptr %[[V0:.*]], ptr %[[_ADDR]], align 8 +// CHECK: %[[THIS1:.*]] = load ptr, ptr %[[THIS_ADDR]], align 8 +// IOS: store ptr %[[THIS1]], ptr %[[RETVAL]], align 8 +// CHECK: %[[M0:.*]] = getelementptr 
inbounds nuw %[[STRUCT_SA]], ptr %[[THIS1]], i32 0, i32 0 +// CHECK: %[[V1:.*]] = load ptr, ptr %[[_ADDR]], align 8 +// CHECK: %[[M02:.*]] = getelementptr inbounds nuw %[[STRUCT_SA]], ptr %[[V1]], i32 0, i32 0 +// CHECK: %[[V2:.*]] = load ptr, ptr %[[M02]], align 8 +// CHECK: %[[V3:.*]] = ptrtoint ptr %[[M02]] to i64 +// CHECK: %[[V4:.*]] = call i64 @llvm.ptrauth.blend(i64 %[[V3]], i64 50) +// CHECK: %[[V5:.*]] = ptrtoint ptr %[[M0]] to i64 +// CHECK: %[[V6:.*]] = call i64 @llvm.ptrauth.blend(i64 %[[V5]], i64 50) +// CHECK: %[[V8:.*]] = ptrtoint ptr %[[V2]] to i64 +// CHECK: %[[V9:.*]] = call i64 @llvm.ptrauth.resign(i64 %[[V8]], i32 1, i64 %[[V4]], i32 1, i64 %[[V6]]) diff --git a/clang/test/CodeGenObjCXX/ptrauth-struct-cxx-abi.mm b/clang/test/CodeGenObjCXX/ptrauth-struct-cxx-abi.mm new file mode 100644 index 0000000000000..e5cb71bad47c0 --- /dev/null +++ b/clang/test/CodeGenObjCXX/ptrauth-struct-cxx-abi.mm @@ -0,0 +1,35 @@ +// RUN: %clang_cc1 -triple arm64-apple-ios11 -fptrauth-calls -fptrauth-intrinsics -std=c++11 -fobjc-arc -emit-llvm -o - %s | FileCheck %s + +#define AQ __ptrauth(1,1,50) + +struct AddrDiscStrong0 { + int * AQ f0; // Signed using address discrimination. + __strong id f1; +}; + +struct AddrDiscStrong1 { + AddrDiscStrong1(const AddrDiscStrong1 &); + int * AQ f0; // Signed using address discrimination. + __strong id f1; +}; + +// Check that AddrDiscStrong0 is destructed in the callee. + +// CHECK: define void @_Z24testParamAddrDiscStrong015AddrDiscStrong0(ptr noundef %[[A:.*]]) +// CHECK: call noundef ptr @_ZN15AddrDiscStrong0D1Ev(ptr noundef nonnull align {{[0-9]+}} dereferenceable(16) %[[A]]) +// CHECK: ret void + +// CHECK: define linkonce_odr noundef ptr @_ZN15AddrDiscStrong0D1Ev( + +void testParamAddrDiscStrong0(AddrDiscStrong0 a) { +} + +// Check that AddrDiscStrong1 is not destructed in the callee because it has a +// non-trivial copy constructor. 
+ +// CHECK: define void @_Z24testParamAddrDiscStrong115AddrDiscStrong1(ptr noundef %{{.*}}) +// CHECK-NOT: call +// CHECK: ret void + +void testParamAddrDiscStrong1(AddrDiscStrong1 a) { +} diff --git a/clang/test/Parser/ptrauth-qualifier.c b/clang/test/Parser/ptrauth-qualifier.c new file mode 100644 index 0000000000000..2071ac6c2d661 --- /dev/null +++ b/clang/test/Parser/ptrauth-qualifier.c @@ -0,0 +1,18 @@ +// RUN: %clang_cc1 -triple arm64-apple-ios -fsyntax-only -verify -fptrauth-intrinsics %s +// RUN: %clang_cc1 -triple aarch64-linux-gnu -fsyntax-only -verify -fptrauth-intrinsics %s + +#if __aarch64__ +#define VALID_DATA_KEY 2 +#else +#error Provide these constants if you port this test +#endif + +int * __ptrauth(VALID_DATA_KEY) valid0; + +typedef int *intp; + +int nonConstantGlobal = 5; + +__ptrauth int invalid0; // expected-error{{expected '('}} +__ptrauth() int invalid1; // expected-error{{expected expression}} +int * __ptrauth(VALID_DATA_KEY, 1, 1000, 12) invalid12; // expected-error{{qualifier must take between 1 and 3 arguments}} diff --git a/clang/test/Preprocessor/ptrauth_extension.c b/clang/test/Preprocessor/ptrauth_extension.c new file mode 100644 index 0000000000000..d6b79187ba62d --- /dev/null +++ b/clang/test/Preprocessor/ptrauth_extension.c @@ -0,0 +1,13 @@ +// RUN: %clang_cc1 -E %s -triple=aarch64 -fptrauth-intrinsics | \ +// RUN: FileCheck %s --check-prefixes=INTRIN + +// RUN: %clang_cc1 -E %s -triple=aarch64 -fptrauth-calls | \ +// RUN: FileCheck %s --check-prefixes=NOINTRIN + +#if __has_extension(ptrauth_qualifier) +// INTRIN: has_ptrauth_qualifier +void has_ptrauth_qualifier() {} +#else +// NOINTRIN: no_ptrauth_qualifier +void no_ptrauth_qualifier() {} +#endif diff --git a/clang/test/Sema/ptrauth-atomic-ops.c b/clang/test/Sema/ptrauth-atomic-ops.c new file mode 100644 index 0000000000000..ccb9a1abcc14d --- /dev/null +++ b/clang/test/Sema/ptrauth-atomic-ops.c @@ -0,0 +1,118 @@ +// RUN: %clang_cc1 -triple arm64-apple-ios -fsyntax-only -verify 
-fptrauth-intrinsics %s +// RUN: %clang_cc1 -triple aarch64-linux-gnu -fsyntax-only -verify -fptrauth-intrinsics %s + +#include + +int i; +int *__ptrauth(2, 1, 100) authenticated_ptr = &i; +int *__ptrauth(2, 0, 200) non_addr_discriminatedauthenticated_ptr = &i; +int * wat = &i; +#define ATOMIZE(p) (__typeof__(p) volatile _Atomic *)(long)(&p) + +void f() { + static int j = 1; + __c11_atomic_init(ATOMIZE(authenticated_ptr), 5); + // expected-error@-1 {{address argument to atomic operation must be a pointer to a non address discriminated type ('volatile __ptrauth(2,1,100) _Atomic(int *) *' invalid)}} + __c11_atomic_store(ATOMIZE(authenticated_ptr), 0, memory_order_relaxed); + // expected-error@-1 {{address argument to atomic operation must be a pointer to a non address discriminated type ('volatile __ptrauth(2,1,100) _Atomic(int *) *' invalid)}} + __c11_atomic_load(ATOMIZE(authenticated_ptr), memory_order_seq_cst); + // expected-error@-1 {{address argument to atomic operation must be a pointer to a non address discriminated type ('volatile __ptrauth(2,1,100) _Atomic(int *) *' invalid)}} + __c11_atomic_store(ATOMIZE(authenticated_ptr), 1, memory_order_seq_cst); + // expected-error@-1 {{address argument to atomic operation must be a pointer to a non address discriminated type ('volatile __ptrauth(2,1,100) _Atomic(int *) *' invalid)}} + __atomic_store_n(ATOMIZE(authenticated_ptr), 4, memory_order_release); + // expected-error@-1 {{address argument to atomic operation must be a pointer to a non address discriminated type ('volatile __ptrauth(2,1,100) _Atomic(int *) *' invalid)}} + __atomic_store(ATOMIZE(authenticated_ptr), j, memory_order_release); + // expected-error@-1 {{address argument to atomic operation must be a pointer to a non address discriminated type ('volatile __ptrauth(2,1,100) _Atomic(int *) *' invalid)}} + __c11_atomic_exchange(ATOMIZE(authenticated_ptr), 1, memory_order_seq_cst); + // expected-error@-1 {{address argument to atomic operation must be a 
pointer to a non address discriminated type ('volatile __ptrauth(2,1,100) _Atomic(int *) *' invalid)}} + __atomic_exchange(ATOMIZE(authenticated_ptr), &j, &j, memory_order_seq_cst); + // expected-error@-1 {{address argument to atomic operation must be a pointer to a non address discriminated type ('volatile __ptrauth(2,1,100) _Atomic(int *) *' invalid)}} + __c11_atomic_fetch_add(ATOMIZE(authenticated_ptr), 1, memory_order_seq_cst); + // expected-error@-1 {{address argument to atomic operation must be a pointer to a non address discriminated type ('volatile __ptrauth(2,1,100) _Atomic(int *) *' invalid)}} + __atomic_fetch_add(ATOMIZE(authenticated_ptr), 3, memory_order_seq_cst); + // expected-error@-1 {{address argument to atomic operation must be a pointer to a non address discriminated type ('volatile __ptrauth(2,1,100) _Atomic(int *) *' invalid)}} + __atomic_fetch_sub(ATOMIZE(authenticated_ptr), 3, memory_order_seq_cst); + // expected-error@-1 {{address argument to atomic operation must be a pointer to a non address discriminated type ('volatile __ptrauth(2,1,100) _Atomic(int *) *' invalid)}} + __atomic_fetch_min(ATOMIZE(authenticated_ptr), 3, memory_order_seq_cst); + // expected-error@-1 {{address argument to atomic operation must be a pointer to a non address discriminated type ('volatile __ptrauth(2,1,100) _Atomic(int *) *' invalid)}} + __atomic_fetch_max(ATOMIZE(authenticated_ptr), 3, memory_order_seq_cst); + // expected-error@-1 {{address argument to atomic operation must be a pointer to a non address discriminated type ('volatile __ptrauth(2,1,100) _Atomic(int *) *' invalid)}} + __c11_atomic_fetch_and(ATOMIZE(authenticated_ptr), 1, memory_order_seq_cst); + // expected-error@-1 {{address argument to atomic operation must be a pointer to a non address discriminated type ('volatile __ptrauth(2,1,100) _Atomic(int *) *' invalid)}} + __atomic_fetch_and(ATOMIZE(authenticated_ptr), 3, memory_order_seq_cst); + // expected-error@-1 {{address argument to atomic 
operation must be a pointer to a non address discriminated type ('volatile __ptrauth(2,1,100) _Atomic(int *) *' invalid)}} + __atomic_fetch_or(ATOMIZE(authenticated_ptr), 3, memory_order_seq_cst); + // expected-error@-1 {{address argument to atomic operation must be a pointer to a non address discriminated type ('volatile __ptrauth(2,1,100) _Atomic(int *) *' invalid)}} + __atomic_fetch_xor(ATOMIZE(authenticated_ptr), 3, memory_order_seq_cst); + // expected-error@-1 {{address argument to atomic operation must be a pointer to a non address discriminated type ('volatile __ptrauth(2,1,100) _Atomic(int *) *' invalid)}} + + __c11_atomic_init(ATOMIZE(non_addr_discriminatedauthenticated_ptr), &j); + __c11_atomic_store(ATOMIZE(non_addr_discriminatedauthenticated_ptr), 0, memory_order_relaxed); + __c11_atomic_load(ATOMIZE(non_addr_discriminatedauthenticated_ptr), memory_order_seq_cst); + __atomic_store(&j, ATOMIZE(non_addr_discriminatedauthenticated_ptr), memory_order_release); + // expected-warning@-1 {{incompatible pointer types passing 'volatile __ptrauth(2,0,200) _Atomic(int *) *' to parameter of type 'int *'}} + __c11_atomic_exchange(ATOMIZE(j), ATOMIZE(non_addr_discriminatedauthenticated_ptr), memory_order_seq_cst); + // expected-error@-1 {{incompatible pointer to integer conversion passing 'volatile __ptrauth(2,0,200) _Atomic(int *) *' to parameter of type 'typeof (j)' (aka 'int')}} + __c11_atomic_fetch_add(ATOMIZE(non_addr_discriminatedauthenticated_ptr), ATOMIZE(j), memory_order_seq_cst); + // expected-error@-1 {{incompatible pointer to integer conversion passing 'volatile _Atomic(typeof (j)) *' to parameter of type 'long'}} + __c11_atomic_fetch_and(ATOMIZE(j), ATOMIZE(non_addr_discriminatedauthenticated_ptr), memory_order_seq_cst); + // expected-error@-1 {{incompatible pointer to integer conversion passing 'volatile __ptrauth(2,0,200) _Atomic(int *) *' to parameter of type 'typeof (j)' (aka 'int')}} + + + __sync_fetch_and_add(&authenticated_ptr, 1); + // 
expected-error@-1 {{address argument to __sync operation must be a pointer to a non address discriminated type ('int *__ptrauth(2,1,100)' invalid)}} + __sync_fetch_and_sub(&authenticated_ptr, 1); + // expected-error@-1 {{address argument to __sync operation must be a pointer to a non address discriminated type ('int *__ptrauth(2,1,100)' invalid)}} + __sync_fetch_and_or(&authenticated_ptr, 1); + // expected-error@-1 {{address argument to __sync operation must be a pointer to a non address discriminated type ('int *__ptrauth(2,1,100)' invalid)}} + __sync_fetch_and_and(&authenticated_ptr, 1); + // expected-error@-1 {{address argument to __sync operation must be a pointer to a non address discriminated type ('int *__ptrauth(2,1,100)' invalid)}} + __sync_fetch_and_xor(&authenticated_ptr, 1); + // expected-error@-1 {{address argument to __sync operation must be a pointer to a non address discriminated type ('int *__ptrauth(2,1,100)' invalid)}} + __sync_fetch_and_nand(&authenticated_ptr, 1); + // expected-error@-1 {{address argument to __sync operation must be a pointer to a non address discriminated type ('int *__ptrauth(2,1,100)' invalid)}} + + __sync_add_and_fetch(&authenticated_ptr, 1); + // expected-error@-1 {{address argument to __sync operation must be a pointer to a non address discriminated type ('int *__ptrauth(2,1,100)' invalid)}} + __sync_sub_and_fetch(&authenticated_ptr, 1); + // expected-error@-1 {{address argument to __sync operation must be a pointer to a non address discriminated type ('int *__ptrauth(2,1,100)' invalid)}} + __sync_or_and_fetch(&authenticated_ptr, 1); + // expected-error@-1 {{address argument to __sync operation must be a pointer to a non address discriminated type ('int *__ptrauth(2,1,100)' invalid)}} + __sync_and_and_fetch(&authenticated_ptr, 1); + // expected-error@-1 {{address argument to __sync operation must be a pointer to a non address discriminated type ('int *__ptrauth(2,1,100)' invalid)}} + 
__sync_xor_and_fetch(&authenticated_ptr, 1); + // expected-error@-1 {{address argument to __sync operation must be a pointer to a non address discriminated type ('int *__ptrauth(2,1,100)' invalid)}} + __sync_nand_and_fetch(&authenticated_ptr, 1); + // expected-error@-1 {{address argument to __sync operation must be a pointer to a non address discriminated type ('int *__ptrauth(2,1,100)' invalid)}} + + __sync_bool_compare_and_swap(&authenticated_ptr, 1, 0); + // expected-error@-1 {{address argument to __sync operation must be a pointer to a non address discriminated type ('int *__ptrauth(2,1,100)' invalid)}} + __sync_val_compare_and_swap(&authenticated_ptr, 1, 1); + // expected-error@-1 {{address argument to __sync operation must be a pointer to a non address discriminated type ('int *__ptrauth(2,1,100)' invalid)}} + + __sync_lock_test_and_set(&authenticated_ptr, 1); + // expected-error@-1 {{address argument to __sync operation must be a pointer to a non address discriminated type ('int *__ptrauth(2,1,100)' invalid)}} + __sync_lock_release(&authenticated_ptr); + // expected-error@-1 {{address argument to __sync operation must be a pointer to a non address discriminated type ('int *__ptrauth(2,1,100)' invalid)}} + + +int i = 0; + + __sync_fetch_and_add(&non_addr_discriminatedauthenticated_ptr, &i); + __sync_fetch_and_sub(&non_addr_discriminatedauthenticated_ptr, &i); + __sync_fetch_and_or(&non_addr_discriminatedauthenticated_ptr, &i); + __sync_fetch_and_and(&non_addr_discriminatedauthenticated_ptr, &i); + __sync_fetch_and_xor(&non_addr_discriminatedauthenticated_ptr, &i); + + __sync_add_and_fetch(&non_addr_discriminatedauthenticated_ptr, &i); + __sync_sub_and_fetch(&non_addr_discriminatedauthenticated_ptr, &i); + __sync_or_and_fetch(&non_addr_discriminatedauthenticated_ptr, &i); + __sync_and_and_fetch(&non_addr_discriminatedauthenticated_ptr, &i); + __sync_xor_and_fetch(&non_addr_discriminatedauthenticated_ptr, &i); + + 
__sync_bool_compare_and_swap(&non_addr_discriminatedauthenticated_ptr, &i, &i); + __sync_val_compare_and_swap(&non_addr_discriminatedauthenticated_ptr, &i, &i); + + __sync_lock_test_and_set(&non_addr_discriminatedauthenticated_ptr, &i); + __sync_lock_release(&non_addr_discriminatedauthenticated_ptr); +} diff --git a/clang/test/Sema/ptrauth-qualifier.c b/clang/test/Sema/ptrauth-qualifier.c new file mode 100644 index 0000000000000..99d16b062ca6f --- /dev/null +++ b/clang/test/Sema/ptrauth-qualifier.c @@ -0,0 +1,103 @@ +// RUN: %clang_cc1 -triple arm64-apple-ios -std=c23 -fsyntax-only -verify -fptrauth-intrinsics %s +// RUN: %clang_cc1 -triple aarch64-linux-gnu -std=c23 -fsyntax-only -verify -fptrauth-intrinsics %s + +#if __has_feature(ptrauth_qualifier) +#warning __ptrauth qualifier enabled! +// expected-warning@-1 {{__ptrauth qualifier enabled!}} +#endif + +#if __aarch64__ +#define VALID_CODE_KEY 0 +#define VALID_DATA_KEY 2 +#define INVALID_KEY 200 +#else +#error Provide these constants if you port this test +#endif + +int * __ptrauth(VALID_DATA_KEY) valid0; +int *ptr0; + +typedef int *intp; + +int nonConstantGlobal = 5; + +__ptrauth(INVALID_KEY) int invalid2; // expected-error{{200 does not identify a valid pointer authentication key for the current target}} +__ptrauth(VALID_DATA_KEY) int invalid3; // expected-error {{'__ptrauth' qualifier only applies to pointer types; 'int' is invalid}} +__ptrauth(VALID_DATA_KEY) int *invalid4; // expected-error {{'__ptrauth' qualifier only applies to pointer types; 'int' is invalid}} +int * (__ptrauth(VALID_DATA_KEY) invalid5); // expected-error{{expected identifier or '('}} expected-error{{expected ')'}} expected-note {{to match this '('}} +int *__ptrauth(VALID_DATA_KEY) __ptrauth(VALID_DATA_KEY) invalid6; // expected-error{{type 'int *__ptrauth(2,0,0)' is already __ptrauth-qualified}} +int * __ptrauth(VALID_DATA_KEY, 2) invalid7; // expected-error {{invalid address discrimination flag '2'; '__ptrauth' requires '0' or '1'}} 
+int * __ptrauth(VALID_DATA_KEY, -1) invalid8; // expected-error {{invalid address discrimination flag '-1'; '__ptrauth' requires '0' or '1'}} +int * __ptrauth(VALID_DATA_KEY, 1, -1) invalid9; // expected-error {{invalid extra discriminator flag '-1'; '__ptrauth' requires a value between '0' and '65535'}} +int * __ptrauth(VALID_DATA_KEY, 1, 100000) invalid10; // expected-error {{invalid extra discriminator flag '100000'; '__ptrauth' requires a value between '0' and '65535'}} +int * __ptrauth(VALID_DATA_KEY, 1, nonConstantGlobal) invalid12; // expected-error {{argument to '__ptrauth' must be an integer constant expression}} +int * __ptrauth(VALID_DATA_KEY, nonConstantGlobal, 1000) invalid13; // expected-error {{argument to '__ptrauth' must be an integer constant expression}} +int * __ptrauth(nonConstantGlobal, 1, 1000) invalid14; // expected-error{{expression is not an integer constant expression}} + +int * __ptrauth(VALID_DATA_KEY) valid0; +int * __ptrauth(VALID_DATA_KEY) *valid1; +__ptrauth(VALID_DATA_KEY) intp valid2; +__ptrauth(VALID_DATA_KEY) intp *valid3; +intp __ptrauth(VALID_DATA_KEY) valid4; +intp __ptrauth(VALID_DATA_KEY) *valid5; +int * __ptrauth(VALID_DATA_KEY, 0) valid6; +int * __ptrauth(VALID_DATA_KEY, 1) valid7; +int * __ptrauth(VALID_DATA_KEY, (_Bool) 1) valid8; +int * __ptrauth(VALID_DATA_KEY, 1, 0) valid9; +int * __ptrauth(VALID_DATA_KEY, 1, 65535) valid10; + +int * __ptrauth(VALID_DATA_KEY) array0[10]; +int (* __ptrauth(VALID_DATA_KEY) array1)[10]; + +extern intp redeclaration0; // expected-note {{previous declaration}} +extern intp __ptrauth(VALID_DATA_KEY) redeclaration0; // expected-error{{redeclaration of 'redeclaration0' with a different type: '__ptrauth(2,0,0) intp' (aka 'int *__ptrauth(2,0,0)') vs 'intp' (aka 'int *')}} + +extern intp redeclaration1; // expected-note {{previous declaration}} +extern intp __ptrauth(VALID_DATA_KEY) redeclaration1; // expected-error{{redeclaration of 'redeclaration1' with a different type: '__ptrauth(2,0,0) 
intp' (aka 'int *__ptrauth(2,0,0)') vs 'intp' (aka 'int *')}} + +intp __ptrauth(VALID_DATA_KEY) redeclaration2; // expected-note {{previous definition}} +intp redeclaration2 = 0; // expected-error{{redefinition of 'redeclaration2' with a different type: 'intp' (aka 'int *') vs '__ptrauth(2,0,0) intp' (aka 'int *__ptrauth(2,0,0)')}} + +intp __ptrauth(VALID_DATA_KEY) redeclaration3; // expected-note {{previous definition}} +intp redeclaration3 = 0; // expected-error{{redefinition of 'redeclaration3' with a different type: 'intp' (aka 'int *') vs '__ptrauth(2,0,0) intp' (aka 'int *__ptrauth(2,0,0)')}} + +void illegal0(intp __ptrauth(VALID_DATA_KEY)); // expected-error {{parameter type may not be qualified with '__ptrauth'; type is '__ptrauth(2,0,0) intp' (aka 'int *__ptrauth(2,0,0)')}} +intp __ptrauth(VALID_DATA_KEY) illegal1(void); // expected-error {{return type may not be qualified with '__ptrauth'; type is '__ptrauth(2,0,0) intp' (aka 'int *__ptrauth(2,0,0)')}} + +static_assert(_Generic(typeof(valid0), int * __ptrauth(VALID_DATA_KEY) : 1, int * : 0, default : 0)); +static_assert(_Generic(typeof(valid0), int * __ptrauth(VALID_CODE_KEY) : 0, default : 1)); +static_assert(_Generic(typeof_unqual(valid0), int * __ptrauth(VALID_DATA_KEY) : 0, int * : 1, default : 0)); +static_assert(_Generic(valid0, int * __ptrauth(VALID_DATA_KEY) : 0, int * : 1, default : 0)); // expected-warning {{association of type 'int *__ptrauth(2,0,0)' will never be selected}} + +static_assert(_Generic(array0, int * __ptrauth(VALID_DATA_KEY) * : 1, default : 0)); +static_assert(_Generic(*array1, int * : 1, default : 0)); + +void test_code(intp p) { + p = (intp __ptrauth(VALID_DATA_KEY)) 0; // expected-error {{cannot cast to '__ptrauth'-qualified type '__ptrauth(2,0,0) intp' (aka 'int *__ptrauth(2,0,0)')}} + + __ptrauth(VALID_DATA_KEY) intp pSpecial = p; + pSpecial = p; + intp pNormal = pSpecial; + pNormal = pSpecial; + + intp __ptrauth(VALID_DATA_KEY) *ppSpecial0 = &pSpecial; + intp 
__ptrauth(VALID_DATA_KEY) *ppSpecial1 = &pNormal; // expected-error {{initializing '__ptrauth(2,0,0) intp *' (aka 'int *__ptrauth(2,0,0) *') with an expression of type 'intp *' (aka 'int **') changes pointer authentication of pointee type}} + intp *ppNormal0 = &pSpecial; // expected-error {{initializing 'intp *' (aka 'int **') with an expression of type '__ptrauth(2,0,0) intp *' (aka 'int *__ptrauth(2,0,0) *') changes pointer authentication of pointee type}} + intp *ppNormal1 = &pNormal; + + intp *pp5 = (p ? &pSpecial : &pNormal); // expected-error {{'__ptrauth' qualification mismatch ('__ptrauth(2,0,0) intp *' (aka 'int *__ptrauth(2,0,0) *') and 'intp *' (aka 'int **'))}} +} + +void test_array(void) { + intp __ptrauth(VALID_DATA_KEY) pSpecialArray[10]; + intp __ptrauth(VALID_DATA_KEY) *ppSpecial0 = pSpecialArray; + intp __ptrauth(VALID_DATA_KEY) *ppSpecial1 = &pSpecialArray[0]; +} + +__attribute__((overloadable)) int overload_func(int **); +__attribute__((overloadable)) float overload_func(int * __ptrauth(VALID_DATA_KEY) *); + +static_assert(_Generic(typeof(overload_func(&ptr0)), int : 1, default : 0)); +static_assert(_Generic(typeof(overload_func(&valid0)), float : 1, default : 0)); + +void func(int array[__ptrauth(VALID_DATA_KEY) 10]); // expected-error {{'__ptrauth' qualifier only applies to pointer types; 'int[10]' is invalid}} diff --git a/clang/test/SemaCXX/ptrauth-qualifier.cpp b/clang/test/SemaCXX/ptrauth-qualifier.cpp new file mode 100644 index 0000000000000..a7dc6ae2ffe86 --- /dev/null +++ b/clang/test/SemaCXX/ptrauth-qualifier.cpp @@ -0,0 +1,213 @@ +// RUN: %clang_cc1 -triple arm64-apple-ios -std=c++20 -fptrauth-calls -fptrauth-intrinsics -verify -fsyntax-only %s +// RUN: %clang_cc1 -triple aarch64-linux-gnu -std=c++20 -fptrauth-calls -fptrauth-intrinsics -verify -fsyntax-only %s + +#define AQ __ptrauth(1,1,50) +#define AQ2 __ptrauth(1,1,51) +#define IQ __ptrauth(1,0,50) + +struct __attribute__((trivial_abi)) AddrDisc { // expected-warning 
{{'trivial_abi' cannot be applied to 'AddrDisc'}} expected-note {{'trivial_abi' is disallowed on 'AddrDisc' because it has an address-discriminated '__ptrauth' field}} + int * AQ m0; +}; + +struct __attribute__((trivial_abi)) NoAddrDisc { + int * IQ m0; +}; + +namespace test_union { + + union U0 { + int * AQ f0; // expected-note 4 {{'U0' is implicitly deleted because variant field 'f0' has an address-discriminated '__ptrauth' qualifier}} + + // ptrauth fields that don't have an address-discriminated qualifier don't + // delete the special functions. + int * IQ f1; + }; + + union U1 { + int * AQ f0; // expected-note 8 {{'U1' is implicitly deleted because variant field 'f0' has an address-discriminated '__ptrauth' qualifier}} + U1() = default; + ~U1() = default; + U1(const U1 &) = default; // expected-warning {{explicitly defaulted copy constructor is implicitly deleted}} expected-note 2 {{explicitly defaulted function was implicitly deleted here}} expected-note{{replace 'default'}} + U1(U1 &&) = default; // expected-warning {{explicitly defaulted move constructor is implicitly deleted}} expected-note{{replace 'default'}} + U1 & operator=(const U1 &) = default; // expected-warning {{explicitly defaulted copy assignment operator is implicitly deleted}} expected-note 2 {{explicitly defaulted function was implicitly deleted here}} expected-note{{replace 'default'}} + U1 & operator=(U1 &&) = default; // expected-warning {{explicitly defaulted move assignment operator is implicitly deleted}} expected-note{{replace 'default'}} + }; + + // It's fine if the user has explicitly defined the special functions. + union U2 { + int * AQ f0; + U2() = default; + ~U2() = default; + U2(const U2 &); + U2(U2 &&); + U2 & operator=(const U2 &); + U2 & operator=(U2 &&); + }; + + // Address-discriminated ptrauth fields in anonymous union fields delete the + // defaulted copy/move constructors/assignment operators of the containing + // class. 
+ struct S0 { + union { + int * AQ f0; // expected-note 4 {{' is implicitly deleted because variant field 'f0' has an address-discriminated '__ptrauth' qualifier}} + char f1; + }; + }; + + struct S1 { + union { + union { + int * AQ f0; // expected-note 4 {{implicitly deleted because variant field 'f0' has an address-discriminated '__ptrauth' qualifier}} + char f1; + } u; // expected-note 4 {{'S1' is implicitly deleted because field 'u' has a deleted}} + int f2; + }; + }; + + U0 *x0; + U1 *x1; + U2 *x2; + S0 *x3; + S1 *x4; + + // No diagnostics since constructors/destructors of the unions aren't deleted by default. + void testDefaultConstructor() { + U0 u0; + U1 u1; + U2 u2; + S0 s0; + S1 s1; + } + + // No diagnostics since destructors of the unions aren't deleted by default. + void testDestructor(U0 *u0, U1 *u1, U2 *u2, S0 *s0, S1 *s1) { + delete u0; + delete u1; + delete u2; + delete s0; + delete s1; + } + + void testCopyConstructor(U0 *u0, U1 *u1, U2 *u2, S0 *s0, S1 *s1) { + U0 t0(*u0); // expected-error {{call to implicitly-deleted copy constructor}} + U1 t1(*u1); // expected-error {{call to implicitly-deleted copy constructor}} + U2 t2(*u2); + S0 t3(*s0); // expected-error {{call to implicitly-deleted copy constructor}} + S1 t4(*s1); // expected-error {{call to implicitly-deleted copy constructor}} + } + + void testCopyAssignment(U0 *u0, U1 *u1, U2 *u2, S0 *s0, S1 *s1) { + *x0 = *u0; // expected-error {{cannot be assigned because its copy assignment operator is implicitly deleted}} + *x1 = *u1; // expected-error {{cannot be assigned because its copy assignment operator is implicitly deleted}} + *x2 = *u2; + *x3 = *s0; // expected-error {{cannot be assigned because its copy assignment operator is implicitly deleted}} + *x4 = *s1; // expected-error {{cannot be assigned because its copy assignment operator is implicitly deleted}} + } + + void testMoveConstructor(U0 *u0, U1 *u1, U2 *u2, S0 *s0, S1 *s1) { + U0 t0(static_cast(*u0)); // expected-error {{call to 
implicitly-deleted copy constructor}} + U1 t1(static_cast(*u1)); // expected-error {{call to implicitly-deleted copy constructor}} + U2 t2(static_cast(*u2)); + S0 t3(static_cast(*s0)); // expected-error {{call to implicitly-deleted copy constructor}} + S1 t4(static_cast(*s1)); // expected-error {{call to implicitly-deleted copy constructor}} + } + + void testMoveAssignment(U0 *u0, U1 *u1, U2 *u2, S0 *s0, S1 *s1) { + *x0 = static_cast(*u0); // expected-error {{cannot be assigned because its copy assignment operator is implicitly deleted}} + *x1 = static_cast(*u1); // expected-error {{cannot be assigned because its copy assignment operator is implicitly deleted}} + *x2 = static_cast(*u2); + *x3 = static_cast(*s0); // expected-error {{cannot be assigned because its copy assignment operator is implicitly deleted}} + *x4 = static_cast(*s1); // expected-error {{cannot be assigned because its copy assignment operator is implicitly deleted}} + } +} + +bool test_composite_type0(bool c, int * AQ * a0, int * AQ * a1) { + auto t = c ? a0 : a1; + return a0 == a1; +} + +bool test_composite_type1(bool c, int * AQ * a0, int * AQ2 * a1) { + auto t = c ? 
a0 : a1; // expected-error {{incompatible operand types ('int *__ptrauth(1,1,50) *' and 'int *__ptrauth(1,1,51) *')}} + return a0 == a1; // expected-error {{comparison of distinct pointer types ('int *__ptrauth(1,1,50) *' and 'int *__ptrauth(1,1,51) *')}} +} + +void test_bad_call_diag(void *AQ *ptr); // expected-note{{candidate function not viable: 1st argument ('void *__ptrauth(1,1,51) *') has __ptrauth(1,1,51) qualifier, but parameter has __ptrauth(1,1,50) qualifier}} expected-note {{candidate function not viable: 1st argument ('void **') has no '__ptrauth' qualifier, but parameter has __ptrauth(1,1,50) qualifier}} +void test_bad_call_diag2(void **ptr); // expected-note {{candidate function not viable: 1st argument ('void *__ptrauth(1,1,50) *') has __ptrauth(1,1,50) qualifier, but parameter has no '__ptrauth' qualifier}} + +int test_call_diag() { + void *AQ ptr1, *AQ2 ptr2, *ptr3; + test_bad_call_diag(&ptr2); // expected-error {{no matching function for call to 'test_bad_call_diag'}} + test_bad_call_diag(&ptr3); // expected-error {{no matching function for call to 'test_bad_call_diag'}} + test_bad_call_diag2(&ptr1); // expected-error {{no matching function for call to 'test_bad_call_diag2'}} +} + +namespace test_constexpr { + constexpr int i = 100; + constexpr const int * AQ p = &i; + constexpr const int * const AQ *pp = &p; + constexpr int i1 = **((const int * const AQ *)pp); + constexpr int i2 = **((const int * const AQ2 *)pp); + // expected-error@-1 {{constexpr variable 'i2' must be initialized by a constant expression}} + // expected-note@-2 {{cast that performs the conversions of a reinterpret_cast is not allowed in a constant expression}} +} + +namespace test_lambda { + void test() { + int * AQ v0; + int * AQ *v1; + + [v0, v1]() { + static_assert(__is_same(decltype(v0), int * AQ)); + static_assert(__is_same(decltype(v1), int * AQ *)); + }(); + + [v2 = v0, v3 = v1]() { + static_assert(__is_same(decltype(v2), int *)); + static_assert(__is_same(decltype(v3), 
int * AQ *)); + }(); + } +} + +namespace test_concept { + template struct is_qualified { + static constexpr bool value = false; + }; + + template struct is_qualified { + static constexpr bool value = true; + }; + + template + concept Ptrauthable = is_qualified::value; + // expected-note@-1 2 {{because 'is_qualified::value' evaluated to false}} + // expected-note@-2 2 {{because 'is_qualified::value' evaluated to false}} + + template + requires(Ptrauthable) + struct S {}; + // expected-note@-2 {{because 'int *' does not satisfy 'Ptrauthable'}} + // expected-note@-3 {{because 'int *__ptrauth(1,1,51)' does not satisfy 'Ptrauthable'}} + + S s0; + S s1; + // expected-error@-1 {{constraints not satisfied for class template 'S' [with T = int *]}} + S s1; + // expected-error@-1 {{constraints not satisfied for class template 'S' [with T = int *__ptrauth(1,1,51)]}} + + template + requires(Ptrauthable) + void func(T *); + // expected-note@-1 {{candidate template ignored: constraints not satisfied [with T = int *]}} + // expected-note@-3 {{because 'int *' does not satisfy 'Ptrauthable'}} + // expected-note@-3 {{candidate template ignored: constraints not satisfied [with T = int *__ptrauth(1,1,51)]}} + // expected-note@-5 {{because 'int *__ptrauth(1,1,51)' does not satisfy 'Ptrauthable'}} + + void test() { + int * AQ p0; + int *p1; + int * AQ2 p2; + func(&p0); + func(&p1); // expected-error {{no matching function for call to 'func'}} + func(&p2); // expected-error {{no matching function for call to 'func'}} + } +} diff --git a/clang/test/SemaCXX/ptrauth-template-parameters.cpp b/clang/test/SemaCXX/ptrauth-template-parameters.cpp new file mode 100644 index 0000000000000..ee23d3f2ec456 --- /dev/null +++ b/clang/test/SemaCXX/ptrauth-template-parameters.cpp @@ -0,0 +1,29 @@ +// RUN: %clang_cc1 -triple arm64-apple-ios -fsyntax-only -verify -fptrauth-intrinsics -std=c++11 %s +// RUN: %clang_cc1 -triple aarch64-linux-gnu -fsyntax-only -verify -fptrauth-intrinsics -std=c++11 %s + 
+template struct G { + T __ptrauth(0,0,1234) test; + // expected-error@-1 2 {{type '__ptrauth(0,0,1234) T' is already __ptrauth-qualified}} +}; + +template struct Indirect { + G layers; + // expected-note@-1{{in instantiation of template class 'G' requested here}} + // expected-note@-2{{in instantiation of template class 'G' requested here}} +}; + +template +struct TemplateParameters { + void * __ptrauth(K, 0, 100) m1; // expected-error {{expression is not an integer constant expression}} + void * __ptrauth(0, A, 100) m2; // expected-error {{argument to '__ptrauth' must be an integer constant expression}} + void * __ptrauth(0, 0, D) m3; // expected-error {{argument to '__ptrauth' must be an integer constant expression}} +}; + +void f3() { + // FIXME: consider loosening the restrictions so that the first two cases are accepted. + Indirect one; + // expected-note@-1{{in instantiation of template class 'Indirect' requested here}} + Indirect two; + // expected-note@-1{{in instantiation of template class 'Indirect' requested here}} + Indirect three; +} diff --git a/clang/test/SemaObjC/ptrauth-qualifier.m b/clang/test/SemaObjC/ptrauth-qualifier.m new file mode 100644 index 0000000000000..4836a653dd02f --- /dev/null +++ b/clang/test/SemaObjC/ptrauth-qualifier.m @@ -0,0 +1,56 @@ +// RUN: %clang_cc1 -triple arm64-apple-ios -fsyntax-only -verify -fptrauth-intrinsics %s +// RUN: %clang_cc1 -triple aarch64-linux-gnu -fsyntax-only -verify -fptrauth-intrinsics %s + +#if __has_feature(ptrauth_qualifier) +#warning __ptrauth qualifier enabled! 
+// expected-warning@-1 {{__ptrauth qualifier enabled!}} +#endif + +@interface Foo +// expected-warning@-1 {{class 'Foo' defined without specifying a base class}} +// expected-note@-2 {{add a super class to fix this problem}} + +@property void *__ptrauth(1, 1, 1) invalid1; +// expected-error@-1 {{property may not be qualified with '__ptrauth'; type is 'void *__ptrauth(1,1,1)'}} + +@property void *__ptrauth(1, 0, 1) invalid2; +// expected-error@-1 {{property may not be qualified with '__ptrauth'; type is 'void *__ptrauth(1,0,1)'}} + +- (void *__ptrauth(1, 1, 1))invalid5; +// expected-error@-1 {{return type may not be qualified with '__ptrauth'; type is 'void *__ptrauth(1,1,1)'}} + +- (void *__ptrauth(1, 0, 1))invalid6; +// expected-error@-1 {{return type may not be qualified with '__ptrauth'; type is 'void *__ptrauth(1,0,1)'}} + +- (void)invalid9:(void *__ptrauth(1, 1, 1))a; +// expected-error@-1 {{parameter type may not be qualified with '__ptrauth'; type is 'void *__ptrauth(1,1,1)'}} +// expected-note@-2 {{method 'invalid9:' declared here}} + +- (void)invalid10:(void *__ptrauth(1, 0, 1))a; +// expected-error@-1 {{parameter type may not be qualified with '__ptrauth'; type is 'void *__ptrauth(1,0,1)'}} +// expected-note@-2 {{method 'invalid10:' declared here}} + +@end + +@implementation Foo +// expected-warning@-1 2{{method definition for}} + +- (void *__ptrauth(1, 1, 1))invalid13 { +// expected-error@-1 {{return type may not be qualified with '__ptrauth'; type is 'void *__ptrauth(1,1,1)'}} + return 0; +} + +- (void *__ptrauth(1, 0, 1))invalid14 { +// expected-error@-1 {{return type may not be qualified with '__ptrauth'; type is 'void *__ptrauth(1,0,1)'}} + return 0; +} + +- (void)invalid17:(void *__ptrauth(1, 1, 1))a { +// expected-error@-1 {{parameter type may not be qualified with '__ptrauth'; type is 'void *__ptrauth(1,1,1)'}} +} + +- (void)invalid18:(void *__ptrauth(1, 0, 1))a { +// expected-error@-1 {{parameter type may not be qualified with '__ptrauth'; type 
is 'void *__ptrauth(1,0,1)'}} +} + +@end diff --git a/libcxxabi/test/test_demangle.pass.cpp b/libcxxabi/test/test_demangle.pass.cpp index abaa787f5432b..53da1bf6765e7 100644 --- a/libcxxabi/test/test_demangle.pass.cpp +++ b/libcxxabi/test/test_demangle.pass.cpp @@ -30243,6 +30243,9 @@ const char* cases[][2] = { {"_Z1fDSDRm", "f(_Sat unsigned long _Fract)"}, {"_Z11bfloat16addDF16bDF16b", "bfloat16add(std::bfloat16_t, std::bfloat16_t)"}, + + {"_Z3fooPU9__ptrauthILj3ELb1ELj234EEPi", "foo(int* __ptrauth<3u, true, 234u>*)"}, + {"_Z3fooIPU9__ptrauthILj1ELb0ELj64EEPiEvT_", "void foo*>(int* __ptrauth<1u, false, 64u>*)"}, // clang-format on }; diff --git a/llvm/include/llvm/Demangle/MicrosoftDemangle.h b/llvm/include/llvm/Demangle/MicrosoftDemangle.h index 276efa7603690..b9a25e361eec0 100644 --- a/llvm/include/llvm/Demangle/MicrosoftDemangle.h +++ b/llvm/include/llvm/Demangle/MicrosoftDemangle.h @@ -173,6 +173,14 @@ class Demangler { Qualifiers demanglePointerExtQualifiers(std::string_view &MangledName); + bool isMemberPointer(std::string_view MangledName, bool &Error); + + std::optional + demanglePointerAuthQualifier(std::string_view &MangledName); + + PointerAuthQualifierNode * + createPointerAuthQualifier(std::string_view &MangledName); + // Parser functions. This is a recursive-descent parser. 
TypeNode *demangleType(std::string_view &MangledName, QualifierMangleMode QMM); diff --git a/llvm/include/llvm/Demangle/MicrosoftDemangleNodes.h b/llvm/include/llvm/Demangle/MicrosoftDemangleNodes.h index 09b9d947464ae..d72fb47cd9b04 100644 --- a/llvm/include/llvm/Demangle/MicrosoftDemangleNodes.h +++ b/llvm/include/llvm/Demangle/MicrosoftDemangleNodes.h @@ -253,7 +253,8 @@ enum class NodeKind { LocalStaticGuardVariable, FunctionSymbol, VariableSymbol, - SpecialTableSymbol + SpecialTableSymbol, + PointerAuthQualifier, }; struct Node { @@ -295,6 +296,7 @@ struct SymbolNode; struct FunctionSymbolNode; struct VariableSymbolNode; struct SpecialTableSymbolNode; +struct PointerAuthQualifierNode; struct TypeNode : public Node { explicit TypeNode(NodeKind K) : Node(K) {} @@ -467,6 +469,8 @@ struct PointerTypeNode : public TypeNode { // If this is a member pointer, this is the class that the member is in. QualifiedNameNode *ClassParent = nullptr; + PointerAuthQualifierNode *PointerAuthQualifier = nullptr; + // Represents a type X in "a pointer to X", "a reference to X", or // "rvalue-reference to X" TypeNode *Pointee = nullptr; @@ -625,6 +629,22 @@ struct FunctionSymbolNode : public SymbolNode { FunctionSignatureNode *Signature = nullptr; }; +struct PointerAuthQualifierNode : public Node { + PointerAuthQualifierNode() : Node(NodeKind::PointerAuthQualifier) {} + + // __ptrauth takes three arguments: + // - key + // - isAddressDiscriminated + // - extra discriminator + static constexpr unsigned NumArgs = 3; + typedef std::array ArgArray; + + void output(OutputBuffer &OB, OutputFlags Flags) const override; + + // List of arguments. 
+ NodeArrayNode *Components = nullptr; +}; + } // namespace ms_demangle } // namespace llvm diff --git a/llvm/lib/Demangle/MicrosoftDemangle.cpp b/llvm/lib/Demangle/MicrosoftDemangle.cpp index 8d5f6b21e2e76..b22928be3be50 100644 --- a/llvm/lib/Demangle/MicrosoftDemangle.cpp +++ b/llvm/lib/Demangle/MicrosoftDemangle.cpp @@ -66,7 +66,7 @@ static bool startsWith(std::string_view S, std::string_view PrefixA, return llvm::itanium_demangle::starts_with(S, Prefix); } -static bool isMemberPointer(std::string_view MangledName, bool &Error) { +bool Demangler::isMemberPointer(std::string_view MangledName, bool &Error) { Error = false; const char F = MangledName.front(); MangledName.remove_prefix(1); @@ -107,6 +107,7 @@ static bool isMemberPointer(std::string_view MangledName, bool &Error) { consumeFront(MangledName, 'E'); // 64-bit consumeFront(MangledName, 'I'); // restrict consumeFront(MangledName, 'F'); // unaligned + demanglePointerAuthQualifier(MangledName); if (MangledName.empty()) { Error = true; @@ -2099,6 +2100,8 @@ PointerTypeNode *Demangler::demanglePointerType(std::string_view &MangledName) { Qualifiers ExtQuals = demanglePointerExtQualifiers(MangledName); Pointer->Quals = Qualifiers(Pointer->Quals | ExtQuals); + Pointer->PointerAuthQualifier = createPointerAuthQualifier(MangledName); + Pointer->Pointee = demangleType(MangledName, QualifierMangleMode::Mangle); return Pointer; } @@ -2147,6 +2150,49 @@ Demangler::demanglePointerExtQualifiers(std::string_view &MangledName) { return Quals; } +std::optional +Demangler::demanglePointerAuthQualifier(std::string_view &MangledName) { + if (!consumeFront(MangledName, "__ptrauth")) + return std::nullopt; + + constexpr unsigned NumArgs = PointerAuthQualifierNode::NumArgs; + PointerAuthQualifierNode::ArgArray Array; + + for (unsigned I = 0; I < NumArgs; ++I) { + bool IsNegative = false; + uint64_t Value = 0; + std::tie(Value, IsNegative) = demangleNumber(MangledName); + if (IsNegative) + return std::nullopt; + + Array[I] = 
Value; + } + + return Array; +} + +PointerAuthQualifierNode * +Demangler::createPointerAuthQualifier(std::string_view &MangledName) { + constexpr unsigned NumArgs = PointerAuthQualifierNode::NumArgs; + std::optional Vals = + demanglePointerAuthQualifier(MangledName); + + if (!Vals) + return nullptr; + + PointerAuthQualifierNode *PtrAuthQual = + Arena.alloc(); + NodeArrayNode *Array = Arena.alloc(); + PtrAuthQual->Components = Array; + Array->Count = NumArgs; + Array->Nodes = Arena.allocArray(NumArgs); + + for (unsigned I = 0; I < NumArgs; ++I) + Array->Nodes[I] = Arena.alloc((*Vals)[I], false); + + return PtrAuthQual; +} + ArrayTypeNode *Demangler::demangleArrayType(std::string_view &MangledName) { assert(MangledName.front() == 'Y'); MangledName.remove_prefix(1); diff --git a/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp b/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp index ec6e67058c683..61e4961c714bc 100644 --- a/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp +++ b/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp @@ -521,6 +521,9 @@ void PointerTypeNode::outputPre(OutputBuffer &OB, OutputFlags Flags) const { assert(false); } outputQualifiers(OB, Quals, false, false); + + if (PointerAuthQualifier) + PointerAuthQualifier->output(OB, Flags); } void PointerTypeNode::outputPost(OutputBuffer &OB, OutputFlags Flags) const { @@ -591,6 +594,13 @@ void FunctionSymbolNode::output(OutputBuffer &OB, OutputFlags Flags) const { Signature->outputPost(OB, Flags); } +void PointerAuthQualifierNode::output(OutputBuffer &OB, + OutputFlags Flags) const { + OB << "__ptrauth("; + Components->output(OB, Flags); + OB << ")"; +} + void VariableSymbolNode::output(OutputBuffer &OB, OutputFlags Flags) const { const char *AccessSpec = nullptr; bool IsStatic = true; diff --git a/llvm/test/Demangle/ms-ptrauth.test b/llvm/test/Demangle/ms-ptrauth.test new file mode 100644 index 0000000000000..18a9f37bec67a --- /dev/null +++ b/llvm/test/Demangle/ms-ptrauth.test @@ -0,0 +1,12 @@ +; RUN: llvm-undname < %s | 
FileCheck %s + +; CHECK-NOT: Invalid mangled name + +?s@@3U?$S@PE__ptrauth1A@ENC@AH@@A +; CHECK: struct S s + +?foo@@YAXPEAPE__ptrauth20OK@AH@Z +; CHECK: void __cdecl foo(int *__ptrauth(3, 1, 234)*) + +??$foo@PEAPE__ptrauth0A@EA@AH@@YAXPEAPE__ptrauth0A@EA@AH@Z +; CHECK: void __cdecl foo(int *__ptrauth(1, 0, 64)*) From a1d52fcdca8bda4fe8c6652b2de83f408f4ad4f2 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Tue, 15 Apr 2025 13:03:58 -0700 Subject: [PATCH 033/710] Clarify documentation for -funique-source-file-names. Reviewers: efriedma-quic, teresajohnson Reviewed By: teresajohnson, efriedma-quic Pull Request: https://github.com/llvm/llvm-project/pull/135832 --- clang/docs/UsersManual.rst | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst index d4656a7e63c99..69256527f40c9 100644 --- a/clang/docs/UsersManual.rst +++ b/clang/docs/UsersManual.rst @@ -2301,11 +2301,31 @@ are listed below. When enabled, allows the compiler to assume that each object file passed to the linker has been compiled using a unique source file - name. This is useful for reducing link times when doing ThinLTO + path. This is useful for reducing link times when doing ThinLTO in combination with whole-program devirtualization or CFI. - A misuse of this flag will generally result in a duplicate symbol - error at link time. + The full source path passed to the compiler must be unique. This + means that, for example, the following is a usage error: + + .. code-block:: console + + $ cd foo + $ clang -funique-source-file-names -c foo.c + $ cd ../bar + $ clang -funique-source-file-names -c foo.c + $ cd .. + $ clang foo/foo.o bar/foo.o + + but this is not: + + .. code-block:: console + + $ clang -funique-source-file-names -c foo/foo.c + $ clang -funique-source-file-names -c bar/foo.c + $ clang foo/foo.o bar/foo.o + + A misuse of this flag may result in a duplicate symbol error at + link time. .. 
option:: -fforce-emit-vtables From 2271f0bebd48c9ed8b16b500886a819c4f269a6a Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Tue, 15 Apr 2025 13:02:32 -0700 Subject: [PATCH 034/710] [SLP]Check for perfect/shuffled match for the split node If the potential split node is a perfect/shuffled match of another split node, need to skip creation of the another split node with the same scalars, it should be a buildvector. Fixes #135800 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 18 +++++ .../X86/split-node-full-match.ll | 74 +++++++++++++++++++ .../X86/split-node-no-reorder-copy.ll | 3 +- 3 files changed, 94 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/split-node-full-match.ll diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 253933a2438cd..234cd340ebc13 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -9575,6 +9575,24 @@ bool BoUpSLP::canBuildSplitNode(ArrayRef VL, !SplitAlternateInstructions) return false; + // Check if this is a duplicate of another split entry. 
+ LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp() + << ".\n"); + for (TreeEntry *E : getSplitTreeEntries(LocalState.getMainOp())) { + if (E->isSame(VL)) { + LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " + << *LocalState.getMainOp() << ".\n"); + return false; + } + SmallPtrSet Values(llvm::from_range, E->Scalars); + if (all_of(VL, [&](Value *V) { + return isa(V) || Values.contains(V); + })) { + LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n"); + return false; + } + } + ReorderIndices.assign(VL.size(), VL.size()); SmallBitVector Op1Indices(VL.size()); for (auto [Idx, V] : enumerate(VL)) { diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-node-full-match.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-node-full-match.ll new file mode 100644 index 0000000000000..10e73b042f19b --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-node-full-match.ll @@ -0,0 +1,74 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +define void @test(double %0) { +; CHECK-LABEL: define void @test( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: [[_THREAD:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = call double null(ptr null, ptr null, ptr null) +; CHECK-NEXT: [[TMP2:%.*]] = call double null(ptr null, ptr null, ptr null) +; CHECK-NEXT: br i1 false, label %[[BB3:.*]], label %[[BB7:.*]] +; CHECK: [[BB3]]: +; CHECK-NEXT: [[TMP4:%.*]] = call double null(ptr null, ptr null, ptr null) +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x double> , double [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x double> [[TMP5]], double [[TMP4]], i32 0 +; CHECK-NEXT: br label %[[BB7]] +; CHECK: [[BB7]]: +; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x double> [ [[TMP6]], %[[BB3]] ], [ zeroinitializer, [[DOTTHREAD:%.*]] ] +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> poison, double 
[[TMP2]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x double> [[TMP9]], double [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x double> [[TMP8]], <4 x double> poison, <6 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = call <6 x double> @llvm.vector.insert.v6f64.v2f64(<6 x double> [[TMP11]], <2 x double> [[TMP10]], i64 4) +; CHECK-NEXT: br i1 false, label %[[DOTLR_PH272_PREHEADER:.*]], [[DOT_CRIT_EDGE:label %.*]] +; CHECK: [[_LR_PH272_PREHEADER:.*:]] +; CHECK-NEXT: br i1 false, [[DOT_CRIT_EDGE]], label %[[BB13:.*]] +; CHECK: [[BB13]]: +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <6 x double> [[TMP12]], <6 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x double> [[TMP14]], <4 x double> poison, <6 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = call <6 x double> @llvm.vector.insert.v6f64.v2f64(<6 x double> [[TMP15]], <2 x double> splat (double 0x7FF8000000000000), i64 4) +; CHECK-NEXT: br i1 false, label %[[BB17:.*]], [[DOT_CRIT_EDGE]] +; CHECK: [[BB17]]: +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <6 x double> , double [[TMP0]], i32 3 +; CHECK-NEXT: br [[DOT_CRIT_EDGE]] +; CHECK: [[__CRIT_EDGE:.*:]] +; CHECK-NEXT: [[TMP19:%.*]] = phi <6 x double> [ [[TMP12]], %[[BB7]] ], [ [[TMP18]], %[[BB17]] ], [ [[TMP16]], %[[BB13]] ], [ [[TMP12]], %[[DOTLR_PH272_PREHEADER]] ] +; CHECK-NEXT: ret void +; +.thread: + %1 = call double null(ptr null, ptr null, ptr null) + %2 = call double null(ptr null, ptr null, ptr null) + br i1 false, label %3, label %5 + +3: + %4 = call double null(ptr null, ptr null, ptr null) + br label %5 + +5: + %.1226 = phi double [ %4, %3 ], [ 0.000000e+00, %.thread ] + %.1222 = phi double [ 0.000000e+00, %3 ], [ 0.000000e+00, %.thread ] + %.1218 = phi double [ %0, %3 ], [ 0.000000e+00, %.thread ] + %.1216 = phi double [ 0.000000e+00, %3 ], [ 0.000000e+00, %.thread ] + br i1 false, label %.lr.ph272.preheader, label %._crit_edge + +.lr.ph272.preheader: + br i1 false, label %._crit_edge, label %6 + +6: + %7 = fdiv double 
0.000000e+00, 0.000000e+00 + %8 = fsub double 0.000000e+00, %7 + %9 = fdiv double 0.000000e+00, 0.000000e+00 + %10 = fsub double 0.000000e+00, %9 + br i1 false, label %11, label %._crit_edge + +11: + br label %._crit_edge + +._crit_edge: + %.2227.lcssa = phi double [ %.1226, %5 ], [ 0.000000e+00, %11 ], [ %.1226, %6 ], [ %.1226, %.lr.ph272.preheader ] + %.2223.lcssa = phi double [ %.1222, %5 ], [ 0.000000e+00, %11 ], [ %.1222, %6 ], [ %.1222, %.lr.ph272.preheader ] + %.2219.lcssa = phi double [ %.1218, %5 ], [ 0.000000e+00, %11 ], [ %.1218, %6 ], [ %.1218, %.lr.ph272.preheader ] + %.2.lcssa = phi double [ %.1216, %5 ], [ %0, %11 ], [ %.1216, %6 ], [ %.1216, %.lr.ph272.preheader ] + %.0213.lcssa = phi double [ %2, %5 ], [ 0.000000e+00, %11 ], [ %10, %6 ], [ %2, %.lr.ph272.preheader ] + %.0211.lcssa = phi double [ %1, %5 ], [ 0.000000e+00, %11 ], [ %8, %6 ], [ %1, %.lr.ph272.preheader ] + ret void +} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-node-no-reorder-copy.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-node-no-reorder-copy.ll index b7b6c10137b64..9abb994db1e73 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/split-node-no-reorder-copy.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-node-no-reorder-copy.ll @@ -15,8 +15,9 @@ define i1 @test(ptr %0, ptr %1, <2 x float> %2, <2 x float> %3, <2 x float> %4) ; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP3]], <8 x i32> ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x float> [[TMP14]], float [[TMP9]], i32 7 ; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <8 x float> [[TMP13]], <8 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> [[TMP16]], <8 x float> [[TMP15]], i64 8) ; CHECK-NEXT: [[TMP18:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> [[TMP16]], <8 x float> [[TMP15]], i64 8) +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <8 x float> [[TMP14]], <8 x float> 
[[TMP12]], <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x float> [[TMP19]], float [[TMP9]], i32 15 ; CHECK-NEXT: [[TMP20:%.*]] = fmul <16 x float> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP21:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP20]]) ; CHECK-NEXT: [[TMP22:%.*]] = call float @foo(float [[TMP21]]) From 823adc7a2dc90cdd0f953f3dc9684481368f2b62 Mon Sep 17 00:00:00 2001 From: YongKang Zhu Date: Tue, 15 Apr 2025 13:19:15 -0700 Subject: [PATCH 035/710] [BOLT] Validate secondary entry point (#135731) Some functions have their sizes as zero in input binary's symbol table, like those compiled by assembler. When figuring out function sizes, we may create label symbol if it doesn't point to any constant island. However, before function size is known, marker symbol can not be correctly associated to a function and therefore all such checks would fail and we could end up adding a code label pointing to constant island as secondary entry point and later mistakenly marking the function as not simple. Querying the global marker symbol array has big throughput overhead. Instead we can run an extra check when post processing entry points to identify such label symbols that actually point to constant islands. 
--- bolt/include/bolt/Core/BinaryFunction.h | 9 +++++ bolt/lib/Core/BinaryFunction.cpp | 9 +++++ .../AArch64/validate-secondary-entry-point.s | 34 +++++++++++++++++++ .../RISCV/validate-secondary-entry-point.s | 34 +++++++++++++++++++ 4 files changed, 86 insertions(+) create mode 100644 bolt/test/AArch64/validate-secondary-entry-point.s create mode 100644 bolt/test/RISCV/validate-secondary-entry-point.s diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h index d3d11f8c5fb73..a52998564ee1b 100644 --- a/bolt/include/bolt/Core/BinaryFunction.h +++ b/bolt/include/bolt/Core/BinaryFunction.h @@ -1174,6 +1174,11 @@ class BinaryFunction { return getSecondaryEntryPointSymbol(BB.getLabel()); } + /// Remove a label from the secondary entry point map. + void removeSymbolFromSecondaryEntryPointMap(const MCSymbol *Label) { + SecondaryEntryPoints.erase(Label); + } + /// Return true if the basic block is an entry point into the function /// (either primary or secondary). bool isEntryPoint(const BinaryBasicBlock &BB) const { @@ -2126,6 +2131,10 @@ class BinaryFunction { return Islands && !Islands->DataOffsets.empty(); } + bool isStartOfConstantIsland(uint64_t Offset) const { + return hasConstantIsland() && Islands->DataOffsets.count(Offset); + } + /// Return true iff the symbol could be seen inside this function otherwise /// it is probably another function. bool isSymbolValidInScope(const SymbolRef &Symbol, uint64_t SymbolSize) const; diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp index c4f4d234b30c0..184a4462b356a 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -1896,6 +1896,15 @@ void BinaryFunction::postProcessEntryPoints() { if (BC.isAArch64() && Offset == getSize()) continue; + // If we have grabbed a wrong code label which actually points to some + // constant island inside the function, ignore this label and remove it + // from the secondary entry point map. 
+ if (isStartOfConstantIsland(Offset)) { + BC.SymbolToFunctionMap.erase(Label); + removeSymbolFromSecondaryEntryPointMap(Label); + continue; + } + BC.errs() << "BOLT-WARNING: reference in the middle of instruction " "detected in function " << *this << " at offset 0x" << Twine::utohexstr(Offset) << '\n'; diff --git a/bolt/test/AArch64/validate-secondary-entry-point.s b/bolt/test/AArch64/validate-secondary-entry-point.s new file mode 100644 index 0000000000000..0099a0ee4fe99 --- /dev/null +++ b/bolt/test/AArch64/validate-secondary-entry-point.s @@ -0,0 +1,34 @@ +# This test is to verify that BOLT won't take a label pointing to constant +# island as a secondary entry point (function `_start` doesn't have ELF size +# set originally) and the function won't otherwise be mistaken as non-simple. + +# RUN: %clang %cflags -pie %s -o %t.so -Wl,-q -Wl,--init=_foo -Wl,--fini=_foo +# RUN: llvm-bolt %t.so -o %t.bolt.so --print-cfg 2>&1 | FileCheck %s +# CHECK-NOT: BOLT-WARNING: reference in the middle of instruction detected \ +# CHECK-NOT: function _start at offset 0x{{[0-9a-f]+}} +# CHECK: Binary Function "_start" after building cfg + + .text + + .global _foo + .type _foo, %function +_foo: + ret + + .global _start + .type _start, %function +_start: + b _foo + + .balign 16 +_random_consts: + .long 0x12345678 + .long 0x90abcdef + + .global _bar + .type _bar, %function +_bar: + ret + + # Dummy relocation to force relocation mode + .reloc 0, R_AARCH64_NONE diff --git a/bolt/test/RISCV/validate-secondary-entry-point.s b/bolt/test/RISCV/validate-secondary-entry-point.s new file mode 100644 index 0000000000000..0c29f5c97c689 --- /dev/null +++ b/bolt/test/RISCV/validate-secondary-entry-point.s @@ -0,0 +1,34 @@ +# This test is to verify that BOLT won't take a label pointing to constant +# island as a secondary entry point (function `_start` doesn't have ELF size +# set originally) and the function won't otherwise be mistaken as non-simple. 
+ +# RUN: %clang %cflags -pie %s -o %t.so -Wl,-q -Wl,--init=_foo -Wl,--fini=_foo +# RUN: llvm-bolt %t.so -o %t.bolt.so --print-cfg 2>&1 | FileCheck %s +# CHECK-NOT: BOLT-WARNING: reference in the middle of instruction detected \ +# CHECK-NOT: function _start at offset 0x{{[0-9a-f]+}} +# CHECK: Binary Function "_start" after building cfg + + .text + + .global _foo + .type _foo, %function +_foo: + ret + + .global _start + .type _start, %function +_start: + j _foo + + .balign 16 +_random_consts: + .long 0x12345678 + .long 0x90abcdef + + .global _bar + .type _bar, %function +_bar: + ret + + # Dummy relocation to force relocation mode + .reloc 0, R_RISCV_NONE From 14cb6566d6701feaef2ffd686af5de4ff9e3eb29 Mon Sep 17 00:00:00 2001 From: John Harrison Date: Tue, 15 Apr 2025 13:30:33 -0700 Subject: [PATCH 036/710] [lldb-dap] Improve error reporting for dap command arguments. (#135684) Previously the error only contained the failed to parse JSON message, which has no additional context. This improves the error messages and improves the consistency of handling properties in protocol structures. Updating the fields to use 'ObjectMapper.map' instead of 'ObjectMapper.mapOptional' caught that adapterID was misspelled as well. 
For example, previously: ``` $ echo 'Content-Length: 81\r\n\r\n{"type":"request","command":"initialize","seq":1,"arguments":{"adapterID":12345}} | lldb-dap ``` Worked without an error but now it reports: ``` invalid arguments for request 'initialize': expected string at arguments.adapterID { "adapterID": /* error: expected string */ 12345 } ``` --- lldb/tools/lldb-dap/Handler/RequestHandler.h | 8 ++-- lldb/tools/lldb-dap/Protocol/ProtocolBase.cpp | 2 +- .../lldb-dap/Protocol/ProtocolRequests.cpp | 42 ++++++++++++------- .../lldb-dap/Protocol/ProtocolRequests.h | 4 +- .../tools/lldb-dap/Protocol/ProtocolTypes.cpp | 6 +-- 5 files changed, 37 insertions(+), 25 deletions(-) diff --git a/lldb/tools/lldb-dap/Handler/RequestHandler.h b/lldb/tools/lldb-dap/Handler/RequestHandler.h index 50795f8252de3..7e56c258ad78a 100644 --- a/lldb/tools/lldb-dap/Handler/RequestHandler.h +++ b/lldb/tools/lldb-dap/Handler/RequestHandler.h @@ -120,11 +120,13 @@ class RequestHandler : public BaseRequestHandler { } Args arguments; - llvm::json::Path::Root root; - if (request.arguments && !fromJSON(request.arguments, arguments, root)) { + llvm::json::Path::Root root("arguments"); + if (request.arguments && !fromJSON(*request.arguments, arguments, root)) { std::string parse_failure; llvm::raw_string_ostream OS(parse_failure); - root.printErrorContext(request.arguments, OS); + OS << "invalid arguments for request '" << request.command + << "': " << llvm::toString(root.getError()) << "\n"; + root.printErrorContext(*request.arguments, OS); protocol::ErrorMessage error_message; error_message.format = parse_failure; diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolBase.cpp b/lldb/tools/lldb-dap/Protocol/ProtocolBase.cpp index af63cc803e545..bfd68448fb483 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolBase.cpp +++ b/lldb/tools/lldb-dap/Protocol/ProtocolBase.cpp @@ -178,7 +178,7 @@ bool fromJSON(json::Value const &Params, Response &R, json::Path P) { return false; } - return O.map("success", 
R.success) && O.mapOptional("message", R.message) && + return O.map("success", R.success) && O.map("message", R.message) && mapRaw(Params, "body", R.body, P); } diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp index 7163399899f7e..3523f8ac87ec9 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp +++ b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp @@ -20,16 +20,16 @@ namespace lldb_dap::protocol { bool fromJSON(const llvm::json::Value &Params, CancelArguments &CA, llvm::json::Path P) { llvm::json::ObjectMapper O(Params, P); - return O && O.mapOptional("requestId", CA.requestId) && - O.mapOptional("progressId", CA.progressId); + return O && O.map("requestId", CA.requestId) && + O.map("progressId", CA.progressId); } bool fromJSON(const json::Value &Params, DisconnectArguments &DA, json::Path P) { json::ObjectMapper O(Params, P); - return O && O.mapOptional("restart", DA.restart) && - O.mapOptional("terminateDebuggee", DA.terminateDebuggee) && - O.mapOptional("suspendDebuggee", DA.suspendDebuggee); + return O && O.map("restart", DA.restart) && + O.map("terminateDebuggee", DA.terminateDebuggee) && + O.map("suspendDebuggee", DA.suspendDebuggee); } bool fromJSON(const llvm::json::Value &Params, PathFormat &PF, @@ -75,23 +75,33 @@ bool fromJSON(const llvm::json::Value &Params, InitializeRequestArguments &IRA, const json::Object *O = Params.getAsObject(); - for (auto &kv : ClientFeatureByKey) - if (std::optional v = O->getBoolean(kv.first()); v && *v) + for (auto &kv : ClientFeatureByKey) { + const json::Value *value_ref = O->get(kv.first()); + if (!value_ref) + continue; + + const std::optional value = value_ref->getAsBoolean(); + if (!value) { + P.field(kv.first()).report("expected bool"); + return false; + } + + if (*value) IRA.supportedFeatures.insert(kv.second); + } - return OM.mapOptional("adatperID", IRA.adatperID) && - OM.mapOptional("clientID", IRA.clientID) && - 
OM.mapOptional("clientName", IRA.clientName) && - OM.mapOptional("locale", IRA.locale) && - OM.mapOptional("linesStartAt1", IRA.linesStartAt1) && - OM.mapOptional("columnsStartAt1", IRA.columnsStartAt1) && - OM.mapOptional("pathFormat", IRA.pathFormat) && - OM.mapOptional("$__lldb_sourceInitFile", IRA.lldbExtSourceInitFile); + return OM.map("adapterID", IRA.adapterID) && + OM.map("clientID", IRA.clientID) && + OM.map("clientName", IRA.clientName) && OM.map("locale", IRA.locale) && + OM.map("linesStartAt1", IRA.linesStartAt1) && + OM.map("columnsStartAt1", IRA.columnsStartAt1) && + OM.map("pathFormat", IRA.pathFormat) && + OM.map("$__lldb_sourceInitFile", IRA.lldbExtSourceInitFile); } bool fromJSON(const json::Value &Params, SourceArguments &SA, json::Path P) { json::ObjectMapper O(Params, P); - return O && O.mapOptional("source", SA.source) && + return O && O.map("source", SA.source) && O.map("sourceReference", SA.sourceReference); } diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h index 22d400fd494a5..6623dfa0db05c 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h +++ b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h @@ -100,7 +100,7 @@ enum PathFormat : unsigned { ePatFormatPath, ePathFormatURI }; /// Arguments for `initialize` request. struct InitializeRequestArguments { /// The ID of the debug adapter. - std::string adatperID; + std::string adapterID; /// The ID of the client using this adapter. std::optional clientID; @@ -113,7 +113,7 @@ struct InitializeRequestArguments { /// Determines in what format paths are specified. The default is `path`, /// which is the native format. - std::optional pathFormat = ePatFormatPath; + PathFormat pathFormat = ePatFormatPath; /// If true all line numbers are 1-based (default). 
std::optional linesStartAt1; diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp index f4f0bf8dcea84..4d1e90215bbb4 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp +++ b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp @@ -38,9 +38,9 @@ bool fromJSON(const json::Value &Params, PresentationHint &PH, json::Path P) { bool fromJSON(const json::Value &Params, Source &S, json::Path P) { json::ObjectMapper O(Params, P); - return O && O.mapOptional("name", S.name) && O.mapOptional("path", S.path) && - O.mapOptional("presentationHint", S.presentationHint) && - O.mapOptional("sourceReference", S.sourceReference); + return O && O.map("name", S.name) && O.map("path", S.path) && + O.map("presentationHint", S.presentationHint) && + O.map("sourceReference", S.sourceReference); } json::Value toJSON(const ExceptionBreakpointsFilter &EBF) { From 85eb44e304e0a0a7da78448ceee60fdfec235edb Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Tue, 15 Apr 2025 13:29:11 -0700 Subject: [PATCH 037/710] [SLP]Fix number of operands for the split node FOr the split node number of operands should be requested via getNumOperands() function, even if the main op is CallInst. 
--- .../Transforms/Vectorize/SLPVectorizer.cpp | 2 + .../X86/split-node-num-operands.ll | 121 ++++++++++++++++++ 2 files changed, 123 insertions(+) create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/split-node-num-operands.ll diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 234cd340ebc13..b174f0f03fca6 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -7577,6 +7577,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { return Res.takeVector(); }; auto GetNumOperands = [](const TreeEntry *TE) { + if (TE->State == TreeEntry::SplitVectorize) + return TE->getNumOperands(); if (auto *CI = dyn_cast(TE->getMainOp()); CI) return CI->arg_size(); return TE->getNumOperands(); diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-node-num-operands.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-node-num-operands.ll new file mode 100644 index 0000000000000..5aa4dba2b8a1b --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-node-num-operands.ll @@ -0,0 +1,121 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -mattr=+avx -slp-threshold=-1000 < %s | FileCheck %s + +define i64 @Foo(ptr align 8 dereferenceable(344) %0, i64 %1) { +; CHECK-LABEL: define i64 @Foo( +; CHECK-SAME: ptr align 8 dereferenceable(344) [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 104 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP0]], i64 112 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP0]], i64 24 +; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP3]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP0]], 
align 8 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP8]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> poison, i64 0, i32 0 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i64> , i64 [[TMP1]], i32 1 +; CHECK-NEXT: br label %[[BB16:.*]] +; CHECK: [[BB16]]: +; CHECK-NEXT: [[TMP17:%.*]] = phi <2 x i64> [ [[TMP11]], [[TMP2:%.*]] ], [ zeroinitializer, %[[TMP25:.*]] ] +; CHECK-NEXT: [[TMP18:%.*]] = phi <2 x i64> [ [[TMP13]], [[TMP2]] ], [ [[TMP29:%.*]], %[[TMP25]] ] +; CHECK-NEXT: switch i32 0, label %[[BB19:.*]] [ +; CHECK-NEXT: i32 0, label %[[TMP25]] +; CHECK-NEXT: ] +; CHECK: [[BB19]]: +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x i64> [[TMP18]], <2 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 0, i32 1 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 0, i32 2 +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i64> [[TMP22]], <4 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <2 x i64> [[TMP14]], <2 x i64> [[TMP18]], <2 x i32> +; CHECK-NEXT: br label %[[TMP25]] +; CHECK: [[TMP25]]: +; CHECK-NEXT: [[TMP26:%.*]] = phi <2 x i64> [ [[TMP17]], %[[BB19]] ], [ zeroinitializer, %[[BB16]] ] +; CHECK-NEXT: [[TMP27:%.*]] = phi <4 x i64> [ [[TMP23]], %[[BB19]] ], [ zeroinitializer, %[[BB16]] ] +; CHECK-NEXT: [[TMP28:%.*]] = phi <2 x i64> [ [[TMP24]], %[[BB19]] ], [ [[TMP15]], %[[BB16]] ] +; CHECK-NEXT: [[TMP29]] = shufflevector <2 x i64> [[TMP18]], <2 x i64> , <2 x i32> +; CHECK-NEXT: br i1 false, label %[[DOTLOOPEXIT206:.*]], label %[[BB16]] +; CHECK: [[_LOOPEXIT206:.*:]] +; CHECK-NEXT: switch i32 0, label %[[BB32:.*]] [ +; CHECK-NEXT: i32 0, [[DOTCONT174:label %.*]] +; CHECK-NEXT: i32 1, 
label %[[BB30:.*]] +; CHECK-NEXT: ] +; CHECK: [[BB30]]: +; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <4 x i64> [[TMP27]], <4 x i64> , <4 x i32> +; CHECK-NEXT: br [[DOTCONT174]] +; CHECK: [[BB32]]: +; CHECK-NEXT: [[TMP33:%.*]] = insertelement <4 x i64> [[TMP27]], i64 0, i32 1 +; CHECK-NEXT: [[TMP34:%.*]] = insertelement <4 x i64> [[TMP33]], i64 0, i32 2 +; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i64> [[TMP34]], <4 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = insertelement <2 x i64> [[TMP28]], i64 0, i32 0 +; CHECK-NEXT: br [[DOTCONT174]] +; CHECK: [[_CONT174:.*:]] +; CHECK-NEXT: [[TMP37:%.*]] = phi <2 x i64> [ [[TMP26]], %[[BB32]] ], [ zeroinitializer, %[[BB30]] ], [ [[TMP26]], %[[DOTLOOPEXIT206]] ] +; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x i64> [ [[TMP35]], %[[BB32]] ], [ [[TMP31]], %[[BB30]] ], [ [[TMP27]], %[[DOTLOOPEXIT206]] ] +; CHECK-NEXT: [[TMP39:%.*]] = phi <2 x i64> [ [[TMP36]], %[[BB32]] ], [ zeroinitializer, %[[BB30]] ], [ [[TMP28]], %[[DOTLOOPEXIT206]] ] +; CHECK-NEXT: ret i64 0 +; + %3 = getelementptr i8, ptr %0, i64 104 + %4 = getelementptr i8, ptr %0, i64 112 + %5 = getelementptr i8, ptr %0, i64 24 + %6 = load i64, ptr %3, align 8 + %7 = load i64, ptr %4, align 8 + %8 = load i64, ptr %5, align 8 + %9 = load i64, ptr %0, align 8 + br label %10 + +10: + %11 = phi i64 [ %9, %2 ], [ 0, %18 ] + %12 = phi i64 [ %8, %2 ], [ %12, %18 ] + %13 = phi i64 [ %7, %2 ], [ 0, %18 ] + %14 = phi i64 [ %6, %2 ], [ 0, %18 ] + switch i32 0, label %15 [ + i32 0, label %18 + ] + +15: + %16 = tail call i64 @llvm.umin.i64(i64 0, i64 0) + %17 = tail call i64 @llvm.umax.i64(i64 0, i64 0) + br label %18 + +18: + %19 = phi i64 [ %17, %15 ], [ 0, %10 ] + %20 = phi i64 [ %16, %15 ], [ 0, %10 ] + %21 = phi i64 [ %11, %15 ], [ 0, %10 ] + %22 = phi i64 [ %12, %15 ], [ 0, %10 ] + %23 = phi i64 [ %13, %15 ], [ %1, %10 ] + %24 = phi i64 [ %14, %15 ], [ 0, %10 ] + br i1 false, label %.loopexit206, label %10 + +.loopexit206: + switch i32 0, label %26 [ + i32 0, label 
%.cont174 + i32 1, label %25 + ] + +25: + br label %.cont174 + +26: + %27 = tail call i64 @llvm.umin.i64(i64 0, i64 0) + %28 = tail call i64 @llvm.umax.i64(i64 0, i64 0) + br label %.cont174 + +.cont174: + %.sroa.139.1 = phi i64 [ %28, %26 ], [ %19, %25 ], [ %19, %.loopexit206 ] + %.sroa.133.1 = phi i64 [ %27, %26 ], [ 0, %25 ], [ %20, %.loopexit206 ] + %.sroa.81.1 = phi i64 [ %23, %26 ], [ 0, %25 ], [ %23, %.loopexit206 ] + %.sroa.75.1 = phi i64 [ %24, %26 ], [ 0, %25 ], [ %24, %.loopexit206 ] + %.sroa.21.1 = phi i64 [ %21, %26 ], [ 0, %25 ], [ %21, %.loopexit206 ] + %.sroa.15.1 = phi i64 [ %22, %26 ], [ 0, %25 ], [ %22, %.loopexit206 ] + %29 = phi i64 [ %28, %26 ], [ 0, %25 ], [ %19, %.loopexit206 ] + %30 = phi i64 [ %27, %26 ], [ 0, %25 ], [ %20, %.loopexit206 ] + ret i64 0 +} + +declare i64 @llvm.umax.i64(i64, i64) + +declare i64 @llvm.umin.i64(i64, i64) + From 2b983a24583dd4e131d727717872a56712b5dd52 Mon Sep 17 00:00:00 2001 From: Zhuoran Yin Date: Tue, 15 Apr 2025 16:36:25 -0400 Subject: [PATCH 038/710] [MLIR][AMDGPU] Adding dynamic size check to avoid subword buffer load (#135014) Motivation: amdgpu buffer load instruction will return all zeros when loading sub-word values. For example, assuming the buffer size is exactly one word and we attempt to invoke `llvm.amdgcn.raw.ptr.buffer.load.v2i32` starting from byte 2 of the word, we will not receive the actual value of the buffer but all zeros for the first word. This is because the boundary has been crossed for the first word. This PR come up with a fix to this problem, such that, it creates a bounds check against the buffer load instruction. It will compare the offset + vector size to see if the upper bound of the address will exceed the buffer size. If it does, masked transfer read will be optimized to `vector.load` + `arith.select`, else, it will continue to fall back to default lowering of the masked vector load. 
--- .../mlir/Dialect/AMDGPU/Transforms/Passes.td | 13 +- .../Dialect/AMDGPU/Transforms/CMakeLists.txt | 1 + .../AMDGPU/Transforms/TransferReadToLoad.cpp | 159 ++++++++++++++++-- .../Dialect/AMDGPU/transfer-read-to-load.mlir | 94 +++++++++-- .../llvm-project-overlay/mlir/BUILD.bazel | 1 + 5 files changed, 233 insertions(+), 35 deletions(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td index 761caa448a57c..0e858108acf35 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td @@ -54,15 +54,20 @@ def AmdgpuResolveStridedMetadataPass : Pass<"amdgpu-resolve-strided-metadata"> { def AmdgpuTransferReadToLoadPass : Pass<"amdgpu-transfer-read-to-load"> { let summary = "Lower the operations from the vector transfer_read to vector load"; let description = [{ - This pass creates a transfer read op lowering. A vector trasfer read op - will be lowered to a combination of vector.load, arith.select and - vector.broadcast. + This pass creates a transfer read op lowering optimization. The lowering + will produce a conditional check at runtime. If within bounds, a vector + trasfer read op will be lowered to a combination of vector.load, arith.select + and vector.broadcast. If not, it will fallback to the default lowering + of the transfer_read op. This pattern will make it possible for masked transfer_read to be lowered towards buffer load with bounds check, allowing a more optimized global load accessing pattern compared with existing implementation of llvm.intr.masked.load on vectors. 
}]; - let dependentDialects = []; + let dependentDialects = [ + "scf::SCFDialect", + "memref::MemRefDialect" + ]; } #endif // MLIR_DIALECT_AMDGPU_TRANSFORMS_PASSES_TD_ diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt index bc5b6e9186449..8709a27e0168e 100644 --- a/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt @@ -14,6 +14,7 @@ add_mlir_dialect_library(MLIRAMDGPUTransforms MLIRAMDGPUUtils MLIRArithDialect MLIRMemRefDialect + MLIRSCFDialect MLIRVectorDialect MLIRControlFlowDialect MLIRFuncDialect diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/TransferReadToLoad.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/TransferReadToLoad.cpp index 3c1a2eb962037..f665c1794cdd4 100644 --- a/mlir/lib/Dialect/AMDGPU/Transforms/TransferReadToLoad.cpp +++ b/mlir/lib/Dialect/AMDGPU/Transforms/TransferReadToLoad.cpp @@ -9,13 +9,22 @@ #include "mlir/Dialect/AMDGPU/Transforms/Passes.h" #include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Arith/Utils/Utils.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/MemRef/Utils/MemRefUtils.h" +#include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h" #include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/OpDefinition.h" #include "mlir/IR/PatternMatch.h" #include "mlir/IR/TypeUtilities.h" #include "mlir/Pass/Pass.h" #include "mlir/Support/LogicalResult.h" -#include "mlir/Transforms/WalkPatternRewriteDriver.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "llvm/Support/MathExtras.h" namespace mlir::amdgpu { #define GEN_PASS_DEF_AMDGPUTRANSFERREADTOLOADPASS @@ -67,6 +76,9 @@ static LogicalResult transferPreconditions( if (!memRefType.isLastDimUnitStride()) return rewriter.notifyMatchFailure(xferOp, 
"!= 1 stride needs VectorToSCF"); + if (memRefType.getElementTypeBitWidth() < 8) + return rewriter.notifyMatchFailure(xferOp, "unsupported sub-byte type"); + // If there is broadcasting involved then we first load the unbroadcasted // vector, and then broadcast it with `vector.broadcast`. ArrayRef vectorShape = xferOp.getVectorType().getShape(); @@ -101,6 +113,26 @@ static LogicalResult transferPreconditions( return success(); } +static Value createVectorLoadForMaskedLoad(OpBuilder &builder, Location loc, + vector::TransferReadOp readOp, + bool requiresBroadcasting, + VectorType unbroadcastedVectorType) { + Value fill = builder.create(loc, unbroadcastedVectorType, + readOp.getPadding()); + Value load = builder.create( + loc, unbroadcastedVectorType, readOp.getSource(), readOp.getIndices()); + Value res = builder.create(loc, unbroadcastedVectorType, + readOp.getMask(), load, fill); + // Insert a broadcasting op if required. + if (requiresBroadcasting) { + res = builder.create(loc, readOp.getVectorType(), res); + } + return res; +} + +static constexpr char kTransferReadNeedsMask[] = + "amdgpu.buffer_transfer_read_needs_mask"; + namespace { struct TransferReadLowering final : OpRewritePattern { @@ -108,6 +140,8 @@ struct TransferReadLowering final : OpRewritePattern { LogicalResult matchAndRewrite(vector::TransferReadOp readOp, PatternRewriter &rewriter) const override { + if (readOp->hasAttr(kTransferReadNeedsMask)) + return failure(); bool requiresBroadcasting = false; VectorType unbroadcastedVectorType; @@ -117,20 +151,115 @@ struct TransferReadLowering final : OpRewritePattern { } Location loc = readOp.getLoc(); - Value fill = rewriter.create(loc, unbroadcastedVectorType, - readOp.getPadding()); - Value load = rewriter.create( - loc, unbroadcastedVectorType, readOp.getSource(), readOp.getIndices()); - Value res = rewriter.create(loc, unbroadcastedVectorType, - readOp.getMask(), load, fill); - - // Insert a broadcasting op if required. 
- if (requiresBroadcasting) { - res = rewriter.create(loc, readOp.getVectorType(), - res); + Value src = readOp.getSource(); + + VectorType vectorType = readOp.getVectorType(); + int64_t vectorSize = vectorType.getNumElements(); + int64_t elementBitWidth = vectorType.getElementTypeBitWidth(); + SmallVector indices = readOp.getIndices(); + + auto stridedMetadata = + rewriter.create(loc, src); + SmallVector strides = + stridedMetadata.getConstifiedMixedStrides(); + SmallVector sizes = stridedMetadata.getConstifiedMixedSizes(); + OpFoldResult offset = stridedMetadata.getConstifiedMixedOffset(); + OpFoldResult linearizedIndices; + std::tie(std::ignore, linearizedIndices) = + memref::getLinearizedMemRefOffsetAndSize(rewriter, loc, elementBitWidth, + elementBitWidth, offset, sizes, + strides, indices); + + // TODO(jerryyin): Fix the getLinearizedMemRefOffsetAndSize() function + // Note below doesn't give the correct result for the linearized size. + // Value totalSize = getValueOrCreateConstantIndexOp( + // rewriter, loc, linearizedInfo.linearizedSize); + // It computes the multiplied sizes of all dimensions instead of taking + // the maximum of each dimension size * stride. 
+ SmallVector productExpressions; + SmallVector productResults; + unsigned sourceRank = cast(src.getType()).getRank(); + + SmallVector symbols(2 * sourceRank); + SmallVector offsetValues; + bindSymbolsList(rewriter.getContext(), MutableArrayRef{symbols}); + + size_t symbolIndex = 0; + for (size_t i = 0; i < sourceRank; ++i) { + AffineExpr strideExpr, sizeExpr; + OpFoldResult stride = strides[i]; + OpFoldResult size = sizes[i]; + if (auto constantStride = getConstantIntValue(stride)) { + strideExpr = rewriter.getAffineConstantExpr(*constantStride); + } else { + strideExpr = symbols[symbolIndex++]; + offsetValues.push_back( + getValueOrCreateConstantIndexOp(rewriter, loc, stride)); + } + + if (auto constantSize = getConstantIntValue(size)) { + sizeExpr = rewriter.getAffineConstantExpr(*constantSize); + } else { + sizeExpr = symbols[symbolIndex++]; + offsetValues.push_back( + getValueOrCreateConstantIndexOp(rewriter, loc, size)); + } + + productExpressions.push_back(strideExpr * sizeExpr); } - rewriter.replaceOp(readOp, res); + AffineMap maxMap = AffineMap::get( + /*dimCount=*/0, /*symbolCount=*/symbolIndex, productExpressions, + rewriter.getContext()); + Value totalSize = + rewriter.create(loc, maxMap, offsetValues); + + // delta = bufferSize - linearizedOffset + Value vectorSizeOffset = + rewriter.create(loc, vectorSize); + Value linearIndex = + getValueOrCreateConstantIndexOp(rewriter, loc, linearizedIndices); + Value delta = rewriter.create(loc, totalSize, linearIndex); + + // 1) check if delta < vectorSize + Value isOutofBounds = rewriter.create( + loc, arith::CmpIPredicate::ult, delta, vectorSizeOffset); + + // 2) check if (detla_bytes % (32 / elementBitwidth) != 0) + Value deltaBytes = rewriter.create( + loc, delta, + rewriter.create(loc, elementBitWidth / 8)); + Value elementsPerWord = rewriter.create( + loc, llvm::divideCeil(32, elementBitWidth)); + Value isNotWordAligned = rewriter.create( + loc, arith::CmpIPredicate::ne, + rewriter.create(loc, deltaBytes, 
elementsPerWord), + rewriter.create(loc, 0)); + + // We take the fallback of transfer_read default lowering only it is both + // out-of-bounds and not word aligned. The fallback ensures correct results + // when loading at the boundary of the buffer since buffer load returns + // inconsistent zeros for the whole word when boundary is crossed. + Value ifCondition = + rewriter.create(loc, isOutofBounds, isNotWordAligned); + + auto thenBuilder = [&](OpBuilder &builder, Location loc) { + Operation *read = builder.clone(*readOp.getOperation()); + read->setAttr(kTransferReadNeedsMask, builder.getUnitAttr()); + Value readResult = read->getResult(0); + builder.create(loc, readResult); + }; + + auto elseBuilder = [&](OpBuilder &builder, Location loc) { + Value res = createVectorLoadForMaskedLoad( + builder, loc, readOp, requiresBroadcasting, unbroadcastedVectorType); + rewriter.create(loc, res); + }; + + auto ifOp = + rewriter.create(loc, ifCondition, thenBuilder, elseBuilder); + + rewriter.replaceOp(readOp, ifOp); return success(); } @@ -149,6 +278,8 @@ struct AmdgpuTransferReadToLoadPass final void runOnOperation() override { RewritePatternSet patterns(&getContext()); populateAmdgpuTransferReadToLoadPatterns(patterns); - walkAndApplyPatterns(getOperation(), std::move(patterns)); + if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) { + return signalPassFailure(); + } } }; diff --git a/mlir/test/Dialect/AMDGPU/transfer-read-to-load.mlir b/mlir/test/Dialect/AMDGPU/transfer-read-to-load.mlir index 3e1283579f2b1..d0805b6b8a973 100644 --- a/mlir/test/Dialect/AMDGPU/transfer-read-to-load.mlir +++ b/mlir/test/Dialect/AMDGPU/transfer-read-to-load.mlir @@ -9,11 +9,71 @@ func.func @transfer_to_maskedload_fatrawbuffer(%mem : memref<8x8xf32, #amdgpu.ad %res = vector.transfer_read %mem[%idx, %idx], %cf0, %mask {in_bounds = [true]} : memref<8x8xf32, #amdgpu.address_space>, vector<4xf32> return %res : vector<4xf32> } -// CHECK: %[[CST:.*]] = arith.constant 0.0 -// 
CHECK: %[[SPLAT:.*]] = vector.splat %[[CST]] + +// CHECK: %[[FALSE:.*]] = arith.constant false +// CHECK: %[[IF:.*]] = scf.if %[[FALSE]] -> (vector<4xf32>) { +// CHECK: vector.transfer_read %[[ARG0]][%[[ARG1]], %[[ARG1]]] + +// CHECK: } else { // CHECK: %[[LOAD:.*]] = vector.load %arg0[%arg1, %arg1] -// CHECK: %[[SELECT:.*]] = arith.select %arg2, %[[LOAD]], %[[SPLAT]] -// CHECK: return %[[SELECT]] : vector<4xf32> +// CHECK: %[[SELECT:.*]] = arith.select %[[ARG2]], %[[LOAD]] + +// CHECK: return %[[IF]] : vector<4xf32> + +// ----- + +// CHECK: #map = affine_map<()[s0, s1] -> (s0 * 8 + s1)> +// CHECK-LABEL: func @transfer_to_maskedload_fatrawbuffer_f16( +// CHECK-SAME: %[[ARG0:.+]]: memref<8x8xf16, #amdgpu.address_space>, +// CHECK-SAME: %[[ARG1:.+]]: index, %[[ARG2:.+]]: index, +// CHECK-SAME: %[[ARG3:.+]]: vector<4xi1>) +func.func @transfer_to_maskedload_fatrawbuffer_f16(%mem : memref<8x8xf16, #amdgpu.address_space>, %idx0 : index, %idx1 : index, %mask : vector<4xi1>) -> vector<4xf16> { + %cf0 = arith.constant 0.0 : f16 + %res = vector.transfer_read %mem[%idx0, %idx1], %cf0, %mask {in_bounds = [true]} : memref<8x8xf16, #amdgpu.address_space>, vector<4xf16> + return %res : vector<4xf16> +} +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 +// CHECK-DAG: %[[SIZE:.*]] = arith.constant 64 +// CHECK-DAG: %[[BYTES:.*]] = arith.constant 2 +// CHECK-DAG: %[[VECTORSIZE:.*]] = arith.constant 4 + +// CHECK: %[[LINEAR:.*]] = affine.apply #map()[%[[ARG1]], %[[ARG2]]] +// CHECK: %[[DELTA:.*]] = arith.subi %[[SIZE]], %[[LINEAR]] +// CHECK: %[[COND1:.*]] = arith.cmpi ult, %[[DELTA]], %[[VECTORSIZE]] + +// CHECK: %[[DELTABYTES:.*]] = arith.muli %[[DELTA]], %[[BYTES]] +// CHECK: %[[REM:.*]] = arith.remui %[[DELTABYTES]], %[[BYTES]] +// CHECK: %[[COND2:.*]] = arith.cmpi ne, %[[REM]], %[[C0]] + +// CHECK: %[[COND:.*]] = arith.andi %[[COND1]], %[[COND2]] +// CHECK: %[[IF:.*]] = scf.if %[[COND]] -> (vector<4xf16>) { +// CHECK: vector.transfer_read %[[ARG0]][%[[ARG1]], %[[ARG2]]] +// CHECK: } 
else { +// CHECK: %[[LOAD:.*]] = vector.load %[[ARG0]][%[[ARG1]], %[[ARG2]]] +// CHECK: return %[[IF]] : vector<4xf16> + +// ----- + +// CHECK: #map = affine_map<()[s0, s1, s2] -> (s0 * s1 + s2)> +// CHECK: #map1 = affine_map<()[s0, s1, s2] -> (s0 * s1, s2)> +// CHECK-LABEL: func @transfer_to_maskedload_fatrawbuffer_dynamic_i8( +// CHECK-SAME: %[[ARG0:.*]]: memref> +// CHECK-SAME: %[[ARG1:.*]]: index, %[[ARG2:.*]]: index +// CHECK-SAME: %[[ARG3:.*]]: vector<4xi1> +func.func @transfer_to_maskedload_fatrawbuffer_dynamic_i8(%mem : memref>, %idx0 : index, %idx1 : index, %mask : vector<4xi1>) -> vector<4xi8> { + %cf0 = arith.constant 0 : i8 + %res = vector.transfer_read %mem[%idx0, %idx1], %cf0, %mask {in_bounds = [true]} : memref>, vector<4xi8> + return %res : vector<4xi8> +} + +// CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<4xi8> +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[C4:.*]] = arith.constant 4 : index +// CHECK: %[[BASE:.*]], %[[OFFSET:.*]], %[[SIZES:.*]]:2, %[[STRIDES:.*]]:2 = memref.extract_strided_metadata %[[ARG0]] +// CHECK: %[[LINEAR:.*]] = affine.apply #map()[%[[ARG1]], %[[STRIDES]]#0, %[[ARG2]]] +// CHECK: %[[SIZE:.*]] = affine.max #map1()[%[[STRIDES]]#0, %[[SIZES]]#0, %[[SIZES]]#1] +// CHECK: %[[IF:.*]] = scf.if +// CHECK: return // ----- @@ -26,8 +86,8 @@ func.func @transfer_to_maskedload_regular(%mem : memref<8x8xf32>, %idx : index, %res = vector.transfer_read %mem[%idx, %idx], %cf0, %mask {in_bounds = [true]} : memref<8x8xf32>, vector<4xf32> return %res : vector<4xf32> } -// CHECK: %[[CST:.*]] = arith.constant 0.0 -// CHECK: %[[RES:.*]] = vector.transfer_read %arg0[%arg1, %arg1], %[[CST]], %arg2 {in_bounds = [true]} : memref<8x8xf32>, vector<4xf32> +// CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 +// CHECK: %[[RES:.*]] = vector.transfer_read %[[ARG0]][%[[ARG1]], %[[ARG1]]], %[[CST]], %[[ARG2]] // CHECK: return %[[RES]] : vector<4xf32> // ----- @@ -41,8 +101,8 @@ func.func @transfer_to_maskedload_addrspace(%mem : 
memref<8x8xf32, #gpu.address_ %res = vector.transfer_read %mem[%idx, %idx], %cf0, %mask {in_bounds = [true]} : memref<8x8xf32, #gpu.address_space>, vector<4xf32> return %res : vector<4xf32> } -// CHECK: %[[CST:.*]] = arith.constant 0.0 -// CHECK: %[[RES:.*]] = vector.transfer_read %arg0[%arg1, %arg1], %[[CST]], %arg2 {in_bounds = [true]} : memref<8x8xf32, #gpu.address_space>, vector<4xf32> +// CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 +// CHECK: %[[RES:.*]] = vector.transfer_read %[[ARG0]][%[[ARG1]], %[[ARG1]]], %[[CST]], %[[ARG2]] // CHECK: return %[[RES]] : vector<4xf32> // ----- @@ -59,12 +119,12 @@ func.func @transfer_broadcasting(%mem : memref<8x8xf32, #amdgpu.address_space>, vector<4xf32> return %res : vector<4xf32> } -// CHECK: %[[CST:.*]] = arith.constant 0.0 -// CHECK: %[[SPLAT:.*]] = vector.splat %[[CST]] +// CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<1xf32> +// CHECK: %[[FALSE:.*]] = arith.constant false +// CHECK: %[[IF:.*]] = scf.if %[[FALSE]] -> (vector<4xf32>) { // CHECK: %[[LOAD:.*]] = vector.load %arg0[%arg1, %arg1] -// CHECK: %[[SELECT:.*]] = arith.select %arg2, %[[LOAD]], %[[SPLAT]] +// CHECK: %[[SELECT:.*]] = arith.select %arg2, %[[LOAD]], %[[CST]] // CHECK: %[[BROADCAST:.*]] = vector.broadcast %[[SELECT]] : vector<1xf32> to vector<4xf32> -// CHECK: return %[[BROADCAST]] : vector<4xf32> // ----- @@ -79,8 +139,8 @@ func.func @transfer_scalar(%mem : memref<8x8xf32, #amdgpu.address_space>, vector<1xf32> return %res : vector<1xf32> } -// CHECK: %[[CST:.*]] = arith.constant 0.0 -// CHECK: %[[SPLAT:.*]] = vector.splat %[[CST]] -// CHECK: %[[LOAD:.*]] = vector.load %arg0[%arg1, %arg1] -// CHECK: %[[SELECT:.*]] = arith.select %arg2, %[[LOAD]], %[[SPLAT]] -// CHECK: return %[[SELECT]] : vector<1xf32> +// CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<1xf32> +// CHECK: %[[FALSE:.*]] = arith.constant false +// CHECK: %[[IF:.*]] = scf.if %[[FALSE]] -> (vector<1xf32>) { +// CHECK: %[[LOAD:.*]] = vector.load 
%[[ARG0]][%[[ARG1]], %[[ARG1]]] +// CHECK: %[[SELECT:.*]] = arith.select %arg2, %[[LOAD]], %[[CST]] diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 07c522a237a8a..10503fe1d123b 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -1569,6 +1569,7 @@ cc_library( ":IR", ":MemRefDialect", ":Pass", + ":SCFDialect", ":SideEffectInterfaces", ":Support", ":TransformUtils", From f83c5fe01fbee0f53ecf69d887e7a7b054f2a9ae Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Tue, 15 Apr 2025 13:55:40 -0700 Subject: [PATCH 039/710] [nfc] Expose `canReturn` from FunctionAttrs (#135650) This is a fairly light-weight traversal and is needed in instrumentation. No need to run the whole `FunctionAttrs` pass at this stage. To avoid layering issues, this patch factors `canRun` and related under Analysis/CFG. --- llvm/include/llvm/Analysis/CFG.h | 1 + llvm/lib/Analysis/CFG.cpp | 34 ++++++++++++++++++++++ llvm/lib/Transforms/IPO/FunctionAttrs.cpp | 35 ----------------------- 3 files changed, 35 insertions(+), 35 deletions(-) diff --git a/llvm/include/llvm/Analysis/CFG.h b/llvm/include/llvm/Analysis/CFG.h index 23bc10a4a9d1b..8451e88146d7c 100644 --- a/llvm/include/llvm/Analysis/CFG.h +++ b/llvm/include/llvm/Analysis/CFG.h @@ -174,6 +174,7 @@ bool containsIrreducibleCFG(RPOTraversalT &RPOTraversal, const LoopInfoT &LI) { return false; } +bool canReturn(const Function &F); } // End llvm namespace #endif diff --git a/llvm/lib/Analysis/CFG.cpp b/llvm/lib/Analysis/CFG.cpp index 841b835052380..8ced4a901557d 100644 --- a/llvm/lib/Analysis/CFG.cpp +++ b/llvm/lib/Analysis/CFG.cpp @@ -322,3 +322,37 @@ bool llvm::isPotentiallyReachable( return isPotentiallyReachable( A->getParent(), B->getParent(), ExclusionSet, DT, LI); } + +static bool instructionDoesNotReturn(const Instruction &I) { + if (auto *CB = dyn_cast(&I)) + return CB->hasFnAttr(Attribute::NoReturn); 
+ return false; +} + +// A basic block can only return if it terminates with a ReturnInst and does not +// contain calls to noreturn functions. +static bool basicBlockCanReturn(const BasicBlock &BB) { + if (!isa(BB.getTerminator())) + return false; + return none_of(BB, instructionDoesNotReturn); +} + +// FIXME: this doesn't handle recursion. +bool llvm::canReturn(const Function &F) { + SmallVector Worklist; + SmallPtrSet Visited; + + Visited.insert(&F.front()); + Worklist.push_back(&F.front()); + + do { + const BasicBlock *BB = Worklist.pop_back_val(); + if (basicBlockCanReturn(*BB)) + return true; + for (const BasicBlock *Succ : successors(BB)) + if (Visited.insert(Succ).second) + Worklist.push_back(Succ); + } while (!Worklist.empty()); + + return false; +} diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index ef7989507c89f..bbfed2ac2c090 100644 --- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -2090,41 +2090,6 @@ static void addNoRecurseAttrs(const SCCNodeSet &SCCNodes, Changed.insert(F); } -static bool instructionDoesNotReturn(Instruction &I) { - if (auto *CB = dyn_cast(&I)) - return CB->hasFnAttr(Attribute::NoReturn); - return false; -} - -// A basic block can only return if it terminates with a ReturnInst and does not -// contain calls to noreturn functions. -static bool basicBlockCanReturn(BasicBlock &BB) { - if (!isa(BB.getTerminator())) - return false; - return none_of(BB, instructionDoesNotReturn); -} - -// FIXME: this doesn't handle recursion. 
-static bool canReturn(Function &F) { - SmallVector Worklist; - SmallPtrSet Visited; - - Visited.insert(&F.front()); - Worklist.push_back(&F.front()); - - do { - BasicBlock *BB = Worklist.pop_back_val(); - if (basicBlockCanReturn(*BB)) - return true; - for (BasicBlock *Succ : successors(BB)) - if (Visited.insert(Succ).second) - Worklist.push_back(Succ); - } while (!Worklist.empty()); - - return false; -} - - // Set the noreturn function attribute if possible. static void addNoReturnAttrs(const SCCNodeSet &SCCNodes, SmallSet &Changed) { From 12697c5516f8a9e4407e01d99324ce6958910184 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 15 Apr 2025 13:56:07 -0700 Subject: [PATCH 040/710] [LegalizeTypes] Check getTypeAction before calling GetScalarizedVector. (#135838) Use getTypeAction instead of trying to guess how a type will be legalized. On AArch64, v1f16 is scalarized but v1f16 is widened. Fixes #135776 --- .../SelectionDAG/LegalizeVectorTypes.cpp | 4 +--- llvm/test/CodeGen/AArch64/pr135776.ll | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 3 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/pr135776.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index f934d8b37561e..a01e1cff74564 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -400,9 +400,7 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_MERGE_VALUES(SDNode *N, SDValue DAGTypeLegalizer::ScalarizeVecRes_BITCAST(SDNode *N) { SDValue Op = N->getOperand(0); - if (Op.getValueType().isVector() - && Op.getValueType().getVectorNumElements() == 1 - && !isSimpleLegalType(Op.getValueType())) + if (getTypeAction(Op.getValueType()) == TargetLowering::TypeScalarizeVector) Op = GetScalarizedVector(Op); EVT NewVT = N->getValueType(0).getVectorElementType(); return DAG.getNode(ISD::BITCAST, SDLoc(N), diff --git a/llvm/test/CodeGen/AArch64/pr135776.ll 
b/llvm/test/CodeGen/AArch64/pr135776.ll new file mode 100644 index 0000000000000..6f026234664fe --- /dev/null +++ b/llvm/test/CodeGen/AArch64/pr135776.ll @@ -0,0 +1,17 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 + +; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s + +define i32 @bitcast_failure(ptr %0, <1 x i16> %1) { +; CHECK-LABEL: bitcast_failure: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: str h0, [x8] +; CHECK-NEXT: ret + %3 = bitcast <1 x i16> %1 to <1 x half> + %4 = extractelement <1 x half> %3, i64 0 + store half %4, ptr %0, align 2 + ret i32 0 +} From ddb12674300eb1af5e6945b5447e7bff7cff4cd8 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Tue, 15 Apr 2025 13:53:16 -0700 Subject: [PATCH 041/710] [SLP]Insert vector instruction after landingpad If the node must be emitted in the landingpad block, need to insert the instructions after the landingpad instruction to avoid a crash. Fixes #135781 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 6 +- .../X86/landing-pad-for-split-node.ll | 77 +++++++++++++++++++ 2 files changed, 82 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/landing-pad-for-split-node.ll diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index b174f0f03fca6..b48674f6993e3 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -16010,8 +16010,12 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) { BasicBlock::iterator LastInstIt = LastInst->getIterator(); // If the instruction is PHI, set the insert point after all the PHIs. 
bool IsPHI = isa(LastInst); - if (IsPHI) + if (IsPHI) { LastInstIt = LastInst->getParent()->getFirstNonPHIIt(); + if (LastInstIt != LastInst->getParent()->end() && + LastInstIt->getParent()->isLandingPad()) + LastInstIt = std::next(LastInstIt); + } if (IsPHI || (!E->isGather() && E->State != TreeEntry::SplitVectorize && doesNotNeedToSchedule(E->Scalars)) || diff --git a/llvm/test/Transforms/SLPVectorizer/X86/landing-pad-for-split-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/landing-pad-for-split-node.ll new file mode 100644 index 0000000000000..d6552adbd4abf --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/landing-pad-for-split-node.ll @@ -0,0 +1,77 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-9999 < %s | FileCheck %s + +define void @test(i32 %arg) personality ptr null { +; CHECK-LABEL: define void @test( +; CHECK-SAME: i32 [[ARG:%.*]]) personality ptr null { +; CHECK-NEXT: [[BB:.*]]: +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(3) null, align 4 +; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(3) null, align 4 +; CHECK-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(3) null, align 4 +; CHECK-NEXT: [[LOAD3:%.*]] = load i32, ptr addrspace(3) null, align 4 +; CHECK-NEXT: [[INVOKE:%.*]] = invoke i32 null(ptr addrspace(1) null, i32 0) +; CHECK-NEXT: to label %[[BB4:.*]] unwind label %[[BB5:.*]] +; CHECK: [[BB4]]: +; CHECK-NEXT: ret void +; CHECK: [[BB5]]: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 0, %[[BB]] ] +; CHECK-NEXT: [[PHI6:%.*]] = phi i32 [ 0, %[[BB]] ] +; CHECK-NEXT: [[PHI7:%.*]] = phi i32 [ 0, %[[BB]] ] +; CHECK-NEXT: [[PHI8:%.*]] = phi i32 [ 0, %[[BB]] ] +; CHECK-NEXT: [[LANDINGPAD:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: cleanup +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[LOAD]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[LOAD1]], 
i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[LOAD3]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[LOAD2]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> poison, i32 [[PHI]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[PHI8]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[PHI6]], i32 2 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[PHI7]], i32 3 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP8]], <4 x i32> [[TMP7]], i64 4) +; CHECK-NEXT: br label %[[BB11:.*]] +; CHECK: [[BB9:.*]]: +; CHECK-NEXT: [[LANDINGPAD10:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: cleanup +; CHECK-NEXT: br label %[[BB11]] +; CHECK: [[BB11]]: +; CHECK-NEXT: [[TMP10:%.*]] = phi <8 x i32> [ poison, %[[BB9]] ], [ [[TMP9]], %[[BB5]] ] +; CHECK-NEXT: ret void +; +bb: + %load = load i32, ptr addrspace(3) null, align 4 + %load1 = load i32, ptr addrspace(3) null, align 4 + %load2 = load i32, ptr addrspace(3) null, align 4 + %load3 = load i32, ptr addrspace(3) null, align 4 + %invoke = invoke i32 null(ptr addrspace(1) null, i32 0) + to label %bb4 unwind label %bb5 + +bb4: ; preds = %bb + ret void + +bb5: ; preds = %bb + %phi = phi i32 [ 0, %bb ] + %phi6 = phi i32 [ 0, %bb ] + %phi7 = phi i32 [ 0, %bb ] + %phi8 = phi i32 [ 0, %bb ] + %landingpad = landingpad { ptr, i32 } + cleanup + br label %bb11 + +bb9: ; No predecessors! 
+ %landingpad10 = landingpad { ptr, i32 } + cleanup + br label %bb11 + +bb11: ; preds = %bb9, %bb5 + %phi12 = phi i32 [ 0, %bb9 ], [ %phi, %bb5 ] + %phi13 = phi i32 [ 0, %bb9 ], [ %phi8, %bb5 ] + %phi14 = phi i32 [ 0, %bb9 ], [ %phi6, %bb5 ] + %phi15 = phi i32 [ %arg, %bb9 ], [ %phi7, %bb5 ] + %phi16 = phi i32 [ 0, %bb9 ], [ %load, %bb5 ] + %phi17 = phi i32 [ 0, %bb9 ], [ %load1, %bb5 ] + %phi18 = phi i32 [ %arg, %bb9 ], [ %load2, %bb5 ] + %phi19 = phi i32 [ 0, %bb9 ], [ %load3, %bb5 ] + ret void +} From 0f3e460e06e03ce37445546457a16d6f1eee1e21 Mon Sep 17 00:00:00 2001 From: MaheshRavishankar <1663364+MaheshRavishankar@users.noreply.github.com> Date: Tue, 15 Apr 2025 14:10:18 -0700 Subject: [PATCH 042/710] [mlir][Tensor] Generalize the pattern to swap `tensor.collapse_shape` -> `tensor.expand_shape`. (#133819) The current patterns compared the reassocation indices for the two ops and failed if neither of them were of size 1. This patch relaxes this restriction by handling a new case where the reassociation indices might be of the same size. Also generalizes to cases where when generating the swapped `tensor.expand_shape` -> `tensor.collapse_shape` if one of them is degenerate, those are not generated. Signed-off-by: MaheshRavishankar --- .../Tensor/Transforms/ReshapePatterns.cpp | 112 ++++++++++++++---- mlir/test/Dialect/Tensor/bubble-reshapes.mlir | 63 +++++++++- 2 files changed, 149 insertions(+), 26 deletions(-) diff --git a/mlir/lib/Dialect/Tensor/Transforms/ReshapePatterns.cpp b/mlir/lib/Dialect/Tensor/Transforms/ReshapePatterns.cpp index eed44e60d6591..a3de7f9b44ae6 100644 --- a/mlir/lib/Dialect/Tensor/Transforms/ReshapePatterns.cpp +++ b/mlir/lib/Dialect/Tensor/Transforms/ReshapePatterns.cpp @@ -167,10 +167,39 @@ struct BubbleUpExpandThroughParallelCollapse return failure(); } - // Reshapes are parallel to each other if none of the reassociation indices - // have greater than 1 index for both reshapes. 
+ // Reshapes are parallel to each other (by construction the number of + // reassociations specified in the collapse and expand are the same), if at + // any position + // 1. either the reassociation indices are of the same size, or + // 2. either the reassociation in the collapse or the expand is of size 1. + ArrayRef staticSourceSize = collapseOp.getSrcType().getShape(); + ArrayRef staticResultSize = expandOp.getStaticOutputShape(); for (auto [expandReassociation, collapseReassociation] : llvm::zip_equal(expandReInds, collapseReInds)) { + if (collapseReassociation.size() == expandReassociation.size()) { + // Even if the reassociations are the same, the collapse/expand should + // result in the same dimensions. i.e 4x8x2 into 64 should be expanded + // into 4x8x2 again. In presense of dynamic dimensions one can only + // verify "equality" when there is only one dynamic dimension present, + // and all other static dimensions are equal. + ArrayRef collapsedStaticShapes = staticSourceSize.slice( + collapseReassociation.front(), collapseReassociation.size()); + int64_t numCollapsedDynamic = + llvm::count_if(collapsedStaticShapes, + [](int64_t d) { return ShapedType::isDynamic(d); }); + ArrayRef expandedStaticShapes = staticResultSize.slice( + expandReassociation.front(), expandReassociation.size()); + int64_t numExpandedDynamic = + llvm::count_if(expandedStaticShapes, + [](int64_t d) { return ShapedType::isDynamic(d); }); + if (numCollapsedDynamic > 1 || numExpandedDynamic > 1 || + collapsedStaticShapes != expandedStaticShapes) { + return failure(); + } + continue; + } + // If the reassociations are not same, one or the other needs to be of + // size one. if (collapseReassociation.size() != 1 && expandReassociation.size() != 1) return failure(); } @@ -178,33 +207,60 @@ struct BubbleUpExpandThroughParallelCollapse // Compute new reassociation indices and expanded/collaped shapes. 
SmallVector newExpandReInds, newCollapseReInds; Location loc = expandOp->getLoc(); - SmallVector collapseSizes = + SmallVector sourceSizes = tensor::getMixedSizes(rewriter, loc, collapseOp.getSrc()); - SmallVector expandSizes(getMixedValues( - expandOp.getStaticOutputShape(), expandOp.getOutputShape(), rewriter)); + SmallVector resultSizes = expandOp.getMixedOutputShape(); SmallVector newExpandSizes; - int64_t index = 0, expandIndex = 0, collapseIndex = 0; - for (auto [idx, collapseReassociation] : llvm::enumerate(collapseReInds)) { + + int64_t newExpandIndex = 0, newCollapseIndex = 0, sourceSizeIndex = 0, + resultSizeIndex = 0; + + for (size_t idx = 0, idxEnd = collapseReInds.size(); idx < idxEnd; idx++) { + auto &collapseReassociation = collapseReInds[idx]; + auto &expandReassociation = expandReInds[idx]; + + // Case 1. The reassociations are same in the collapse producer + // and expand consumer. In the swapped expand, each of the final + // dimensions are kept as is in the expand and the collapse. So, + // for every element in the `ReassocationIndices` vector add a new + // `ReassociationIndices` vector for the swapped expand and collapse + // (of size 1). + if (collapseReassociation.size() == expandReassociation.size()) { + for (size_t i = 0; i < collapseReassociation.size(); ++i) { + newCollapseReInds.push_back({newCollapseIndex++}); + newExpandReInds.push_back({newExpandIndex++}); + newExpandSizes.push_back(resultSizes[resultSizeIndex++]); + sourceSizeIndex++; + } + continue; + } + + // Case 2. The `ReassociationIndices` in the collapse is of size > 1 (and + // in the expand is of size == 1). In this case, the original dimensions + // are preserved on expansion and collapsed subsequently. 
if (collapseReassociation.size() != 1) { ReassociationIndices newCollapseReassociation; for (size_t i = 0; i < collapseReassociation.size(); ++i) { - newCollapseReassociation.push_back(index); - newExpandReInds.push_back({index++}); - newExpandSizes.push_back(collapseSizes[collapseIndex++]); + newCollapseReassociation.push_back(newCollapseIndex++); + newExpandReInds.push_back({newExpandIndex++}); + newExpandSizes.push_back(sourceSizes[sourceSizeIndex++]); } + resultSizeIndex++; newCollapseReInds.push_back(newCollapseReassociation); - expandIndex++; continue; } + + // Case 3. The `ReassociationIndices` in the expand is of size > 1 (and + // in the collapse is of size == 1). In this case, the expansion happens + // first and the expanded dimensions are preserved on collapse. ReassociationIndices newExpandReassociation; - auto expandReassociation = expandReInds[idx]; for (size_t i = 0; i < expandReassociation.size(); ++i) { - newExpandReassociation.push_back(index); - newCollapseReInds.push_back({index++}); - newExpandSizes.push_back(expandSizes[expandIndex++]); + newExpandReassociation.push_back(newExpandIndex++); + newCollapseReInds.push_back({newCollapseIndex++}); + newExpandSizes.push_back(resultSizes[resultSizeIndex++]); } newExpandReInds.push_back(newExpandReassociation); - collapseIndex++; + sourceSizeIndex++; } // Swap reshape order. 
@@ -212,11 +268,25 @@ struct BubbleUpExpandThroughParallelCollapse SmallVector staticSizes; dispatchIndexOpFoldResults(newExpandSizes, dynamicSizes, staticSizes); auto expandResultType = expandOp.getResultType().clone(staticSizes); - auto newExpand = rewriter.create( - loc, expandResultType, collapseOp.getSrc(), newExpandReInds, - newExpandSizes); - rewriter.replaceOpWithNewOp( - expandOp, newExpand.getResult(), newCollapseReInds); + Value newCollapseSrc = collapseOp.getSrc(); + // If the number of reassociation indices in the new `expand_shape` op + // matches the number of dimensions of the result, then the expand_shape + // is a no-op. + if (newExpandReInds.size() != newExpandSizes.size()) { + newCollapseSrc = rewriter.create( + loc, expandResultType, newCollapseSrc, newExpandReInds, + newExpandSizes); + } + + // If the number of reassociation indices in the new `collapse_shape` op + // matches the number of dimensions of the source, then the collapse_shape + // is a no-op. + Value replacement = newCollapseSrc; + if (newCollapseReInds.size() != newExpandSizes.size()) { + replacement = rewriter.create( + loc, newCollapseSrc, newCollapseReInds); + } + rewriter.replaceOp(expandOp, replacement); return success(); } }; diff --git a/mlir/test/Dialect/Tensor/bubble-reshapes.mlir b/mlir/test/Dialect/Tensor/bubble-reshapes.mlir index eeed794884942..81bf8e3f60e2c 100644 --- a/mlir/test/Dialect/Tensor/bubble-reshapes.mlir +++ b/mlir/test/Dialect/Tensor/bubble-reshapes.mlir @@ -48,14 +48,67 @@ func.func @no_bubble_partial_intersecting_reshapes(%arg0: tensor, % // ----- -func.func @no_bubble_0d_tensor_reshapes(%arg0: tensor, %s0: index, %s1: index, %s2: index, %s3: index) -> tensor { - %collapse = tensor.collapse_shape %arg0 [] : tensor into tensor +func.func @no_bubble_0d_tensor_reshapes(%arg0: tensor<1x1xf32>) -> tensor<1x1x1xf32> { + %collapse = tensor.collapse_shape %arg0 [] : tensor<1x1xf32> into tensor %expand = tensor.expand_shape %collapse [] - output_shape [%s0, 
%s1, %s2, %s3] : tensor into tensor - return %expand : tensor + output_shape [1, 1, 1] : tensor into tensor<1x1x1xf32> + return %expand : tensor<1x1x1xf32> } // CHECK: func @no_bubble_0d_tensor_reshapes -// CHECK-SAME: %[[ARG0:.+]]: tensor +// CHECK-SAME: %[[ARG0:.+]]: tensor<1x1xf32> // CHECK: %[[COLLAPSE:.+]] = tensor.collapse_shape %[[ARG0]] {{\[}}] // CHECK: %[[EXPAND:.+]] = tensor.expand_shape %[[COLLAPSE]] {{\[}}] // CHECK: return %[[EXPAND]] + +// ----- + +// Test the case where the reassocation indices in the collapse and expand +// are of same size. +func.func @bubble_expand_match_non_unit_size_reassocation( + %arg0 : tensor<4x?x4x32x4x?xf16>, %arg1 : index, %arg2 : index) -> tensor<4x?x4x128x?x32xf16> { + %collapsed = tensor.collapse_shape %arg0 [[0, 1, 2], [3, 4], [5]] + : tensor<4x?x4x32x4x?xf16> into tensor + %expanded = tensor.expand_shape %collapsed [[0, 1, 2], [3], [4, 5]] output_shape [4, %arg1, 4, 128, %arg2, 32] + : tensor into tensor<4x?x4x128x?x32xf16> + return %expanded : tensor<4x?x4x128x?x32xf16> +} +// CHECK: func @bubble_expand_match_non_unit_size_reassocation +// CHECK-SAME: %[[ARG0:.+]]: tensor<4x?x4x32x4x?xf16> +// CHECK-SAME: %[[ARG1:[a-zA-z0-9]+]]: index +// CHECK-SAME: %[[ARG2:[a-zA-z0-9]+]]: index +// CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] +// CHECK-SAME: {{\[}}[0], [1], [2], [3], [4], [5, 6]{{\]}} +// CHECK-SAME: [4, %[[ARG1]], 4, 32, 4, %[[ARG2]], 32] +// CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[EXPANDED]] +// CHECK-SAME: {{\[}}[0], [1], [2], [3, 4], [5], [6]{{\]}} +// CHECK: return %[[COLLAPSED]] + +// ----- + +// Test the case where the trailing collapse isnt needed. 
+func.func @no_collapse_generated( + %arg0 : tensor<4x?x4x128x?xf16>, %arg1 : index, %arg2 : index) -> tensor<4x?x4x128x?x32xf16> { + %collapsed = tensor.collapse_shape %arg0 [[0, 1, 2], [3], [4]] + : tensor<4x?x4x128x?xf16> into tensor + %expanded = tensor.expand_shape %collapsed [[0, 1, 2], [3], [4, 5]] output_shape [4, %arg1, 4, 128, %arg2, 32] + : tensor into tensor<4x?x4x128x?x32xf16> + return %expanded : tensor<4x?x4x128x?x32xf16> +} +// CHECK: func @no_collapse_generated +// CHECK: %[[EXPANDED:.+]] = tensor.expand_shape +// CHECK: return %[[EXPANDED]] + +// ----- + +// Test the case where the leading expand isnt needed. +func.func @no_expand_generated( + %arg0 : tensor<4x?x4x128x?x?x?xf16>, %arg1 : index, %arg2 : index, %arg3 : index) -> tensor<4x?x4x128x?x?xf16> { + %collapsed = tensor.collapse_shape %arg0 [[0, 1, 2], [3], [4], [5, 6]] + : tensor<4x?x4x128x?x?x?xf16> into tensor + %expanded = tensor.expand_shape %collapsed [[0, 1, 2], [3], [4], [5]] output_shape [4, %arg1, 4, 128, %arg2, %arg3] + : tensor into tensor<4x?x4x128x?x?xf16> + return %expanded : tensor<4x?x4x128x?x?xf16> +} +// CHECK: func @no_expand_generated +// CHECK: %[[EXPANDED:.+]] = tensor.collapse_shape +// CHECK: return %[[EXPANDED]] From d30a5b41fe72a1dd83714d3e21fd539b91e63c8c Mon Sep 17 00:00:00 2001 From: Djordje Todorovic Date: Tue, 15 Apr 2025 23:17:03 +0200 Subject: [PATCH 043/710] [RISCV] Fix xmipscmov extension name (#135647) The right name was used in riscv-toolchain-conventions docs. 
--- clang/test/Driver/print-supported-extensions-riscv.c | 2 +- llvm/docs/RISCVUsage.rst | 2 +- .../Target/RISCV/Disassembler/RISCVDisassembler.cpp | 4 ++-- llvm/lib/Target/RISCV/RISCVFeatures.td | 12 ++++++------ llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td | 4 ++-- llvm/lib/Target/RISCV/RISCVProcessors.td | 2 +- llvm/lib/Target/RISCV/RISCVSubtarget.cpp | 2 +- llvm/test/CodeGen/RISCV/features-info.ll | 2 +- llvm/test/CodeGen/RISCV/select-and.ll | 2 +- llvm/test/CodeGen/RISCV/select-bare.ll | 2 +- llvm/test/CodeGen/RISCV/select-cc.ll | 2 +- llvm/test/CodeGen/RISCV/select-or.ll | 2 +- llvm/test/MC/RISCV/xmips-invalid.s | 4 ++-- llvm/test/MC/RISCV/xmips-valid.s | 6 +++--- llvm/unittests/TargetParser/RISCVISAInfoTest.cpp | 2 +- 15 files changed, 25 insertions(+), 25 deletions(-) diff --git a/clang/test/Driver/print-supported-extensions-riscv.c b/clang/test/Driver/print-supported-extensions-riscv.c index d06cedac5b1eb..39002d7b4780a 100644 --- a/clang/test/Driver/print-supported-extensions-riscv.c +++ b/clang/test/Driver/print-supported-extensions-riscv.c @@ -161,7 +161,7 @@ // CHECK-NEXT: xcvmac 1.0 'XCVmac' (CORE-V Multiply-Accumulate) // CHECK-NEXT: xcvmem 1.0 'XCVmem' (CORE-V Post-incrementing Load & Store) // CHECK-NEXT: xcvsimd 1.0 'XCVsimd' (CORE-V SIMD ALU) -// CHECK-NEXT: xmipscmove 1.0 'XMIPSCMove' (MIPS conditional move instruction(s) (ccmov)) +// CHECK-NEXT: xmipscmov 1.0 'XMIPSCMov' (MIPS conditional move instruction (mips.ccmov)) // CHECK-NEXT: xmipslsp 1.0 'XMIPSLSP' (MIPS optimization for hardware load-store bonding) // CHECK-NEXT: xsfcease 1.0 'XSfcease' (SiFive sf.cease Instruction) // CHECK-NEXT: xsfvcp 1.0 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions) diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst index cda7e5fec8488..137b537f00ea0 100644 --- a/llvm/docs/RISCVUsage.rst +++ b/llvm/docs/RISCVUsage.rst @@ -491,7 +491,7 @@ The current vendor extensions supported are: ``experimental-Xqcisync`` LLVM implements 
`version 0.2 of the Qualcomm uC Sync Delay extension specification `__ by Qualcomm. All instructions are prefixed with `qc.` as described in the specification. These instructions are only available for riscv32. -``Xmipscmove`` +``Xmipscmov`` LLVM implements conditional move for the `p8700 processor ` by MIPS. ``Xmipslsp`` diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index 366291b53bebb..27809d96b647c 100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -737,8 +737,8 @@ static constexpr DecoderListEntry DecoderList32[]{ {DecoderTableXSfsystem32, XSfSystemGroup, "SiFive system extensions"}, {DecoderTableXSfcease32, {RISCV::FeatureVendorXSfcease}, "SiFive sf.cease"}, {DecoderTableXmipslsp32, {RISCV::FeatureVendorXMIPSLSP}, "MIPS mips.lsp"}, - {DecoderTableXmipscmove32, - {RISCV::FeatureVendorXMIPSCMove}, + {DecoderTableXmipscmov32, + {RISCV::FeatureVendorXMIPSCMov}, "MIPS mips.ccmov"}, // Standard Extensions {DecoderTableXCV32, XCVFeatureGroup, "CORE-V extensions"}, diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 21bcf343139c2..f51fcf82077f4 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1300,12 +1300,12 @@ def HasVendorXCVbi "'XCVbi' (CORE-V Immediate Branching)">; // MIPS Extensions -def FeatureVendorXMIPSCMove - : RISCVExtension<1, 0, "MIPS conditional move instruction(s) (ccmov)">; -def HasVendorXMIPSCMove - : Predicate<"Subtarget->hasVendorXMIPSCMove()">, - AssemblerPredicate<(all_of FeatureVendorXMIPSCMove), - "'Xmipscmove' ('mips.ccmov' instruction)">; +def FeatureVendorXMIPSCMov + : RISCVExtension<1, 0, "MIPS conditional move instruction (mips.ccmov)">; +def HasVendorXMIPSCMov + : Predicate<"Subtarget->hasVendorXMIPSCMov()">, + AssemblerPredicate<(all_of FeatureVendorXMIPSCMov), + "'Xmipscmov' 
('mips.ccmov' instruction)">; def UseCCMovInsn : Predicate<"Subtarget->useCCMovInsn()">; def FeatureVendorXMIPSLSP : RISCVExtension<1, 0, "MIPS optimization for hardware load-store bonding">; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td index 9be424310d660..ff751994b89b9 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td @@ -107,8 +107,8 @@ class SWPFormat // MIPS extensions //===----------------------------------------------------------------------===// -let Predicates = [HasVendorXMIPSCMove], hasSideEffects = 0, mayLoad = 0, mayStore = 0, - DecoderNamespace = "Xmipscmove" in { +let Predicates = [HasVendorXMIPSCMov], hasSideEffects = 0, mayLoad = 0, mayStore = 0, + DecoderNamespace = "Xmipscmov" in { def MIPS_CCMOV : RVInstR4<0b11, 0b011, OPC_CUSTOM_0, (outs GPR:$rd), (ins GPR:$rs1, GPR:$rs2, GPR:$rs3), "mips.ccmov", "$rd, $rs2, $rs1, $rs3">, diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td index 9d48adeec5e86..ece12c48b1cd9 100644 --- a/llvm/lib/Target/RISCV/RISCVProcessors.td +++ b/llvm/lib/Target/RISCV/RISCVProcessors.td @@ -119,7 +119,7 @@ def MIPS_P8700 : RISCVProcessorModel<"mips-p8700", FeatureStdExtZbb, FeatureStdExtZifencei, FeatureStdExtZicsr, - FeatureVendorXMIPSCMove, + FeatureVendorXMIPSCMov, FeatureVendorXMIPSLSP], [TuneMIPSP8700]>; diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp index 3c996c82fcec4..b3c313f2ed394 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp @@ -253,5 +253,5 @@ bool RISCVSubtarget::useLoadStorePairs() const { } bool RISCVSubtarget::useCCMovInsn() const { - return UseCCMovInsn && HasVendorXMIPSCMove; + return UseCCMovInsn && HasVendorXMIPSCMov; } diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll index 
5d2f0881048bd..d377bda059d33 100644 --- a/llvm/test/CodeGen/RISCV/features-info.ll +++ b/llvm/test/CodeGen/RISCV/features-info.ll @@ -174,7 +174,7 @@ ; CHECK-NEXT: xcvmac - 'XCVmac' (CORE-V Multiply-Accumulate). ; CHECK-NEXT: xcvmem - 'XCVmem' (CORE-V Post-incrementing Load & Store). ; CHECK-NEXT: xcvsimd - 'XCVsimd' (CORE-V SIMD ALU). -; CHECK-NEXT: xmipscmove - 'XMIPSCMove' (MIPS conditional move instruction(s) (ccmov)). +; CHECK-NEXT: xmipscmov - 'XMIPSCMov' (MIPS conditional move instruction (mips.ccmov)). ; CHECK-NEXT: xmipslsp - 'XMIPSLSP' (MIPS optimization for hardware load-store bonding). ; CHECK-NEXT: xsfcease - 'XSfcease' (SiFive sf.cease Instruction). ; CHECK-NEXT: xsfvcp - 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions). diff --git a/llvm/test/CodeGen/RISCV/select-and.ll b/llvm/test/CodeGen/RISCV/select-and.ll index f827e840f4a36..2c9d0a8b56425 100644 --- a/llvm/test/CodeGen/RISCV/select-and.ll +++ b/llvm/test/CodeGen/RISCV/select-and.ll @@ -3,7 +3,7 @@ ; RUN: | FileCheck -check-prefix=RV32I %s ; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefix=RV64I %s -; RUN: llc -mtriple=riscv64 -mattr=+xmipscmove -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv64 -mattr=+xmipscmov -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefix=RV64I-CCMOV %s ;; There are a few different ways to lower (select (and A, B), X, Y). 
This test diff --git a/llvm/test/CodeGen/RISCV/select-bare.ll b/llvm/test/CodeGen/RISCV/select-bare.ll index c9e108a1ca9d0..fc8eaa480b116 100644 --- a/llvm/test/CodeGen/RISCV/select-bare.ll +++ b/llvm/test/CodeGen/RISCV/select-bare.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ ; RUN: | FileCheck %s -check-prefix=RV32I -; RUN: llc -mtriple=riscv64 -mattr=+xmipscmove -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv64 -mattr=+xmipscmov -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefix=RV64I-CCMOV %s define i32 @bare_select(i1 %a, i32 %b, i32 %c) nounwind { diff --git a/llvm/test/CodeGen/RISCV/select-cc.ll b/llvm/test/CodeGen/RISCV/select-cc.ll index 1c2a0cf007d11..e69dc303d85dc 100644 --- a/llvm/test/CodeGen/RISCV/select-cc.ll +++ b/llvm/test/CodeGen/RISCV/select-cc.ll @@ -3,7 +3,7 @@ ; RUN: | FileCheck -check-prefixes=RV32I %s ; RUN: llc -mtriple=riscv64 -disable-block-placement -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=RV64I %s -; RUN: llc -mtriple=riscv64 -mattr=+xmipscmove -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv64 -mattr=+xmipscmov -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefix=RV64I-CCMOV %s define signext i32 @foo(i32 signext %a, ptr %b) nounwind { diff --git a/llvm/test/CodeGen/RISCV/select-or.ll b/llvm/test/CodeGen/RISCV/select-or.ll index 338c7c06c3ab8..091c8b1a11e71 100644 --- a/llvm/test/CodeGen/RISCV/select-or.ll +++ b/llvm/test/CodeGen/RISCV/select-or.ll @@ -3,7 +3,7 @@ ; RUN: | FileCheck -check-prefix=RV32I %s ; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefix=RV64I %s -; RUN: llc -mtriple=riscv64 -mattr=+xmipscmove -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv64 -mattr=+xmipscmov -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefix=RV64I-CCMOV %s ;; There are a few different ways to lower (select (or A, B), X, Y). 
This test diff --git a/llvm/test/MC/RISCV/xmips-invalid.s b/llvm/test/MC/RISCV/xmips-invalid.s index a1c1fd0666e0a..b3834e7b3407f 100644 --- a/llvm/test/MC/RISCV/xmips-invalid.s +++ b/llvm/test/MC/RISCV/xmips-invalid.s @@ -1,5 +1,5 @@ # RUN: not llvm-mc -triple=riscv64 < %s 2>&1 | FileCheck %s -check-prefixes=CHECK-FEATURE -# RUN: not llvm-mc -triple=riscv64 -mattr=+xmipslsp,+xmipscmove < %s 2>&1 | FileCheck %s +# RUN: not llvm-mc -triple=riscv64 -mattr=+xmipslsp,+xmipscmov < %s 2>&1 | FileCheck %s mips.ccmov x0, x1, 0x10 # CHECK: error: invalid operand for instruction @@ -8,7 +8,7 @@ mips.ccmov x10 # CHECK: error: too few operands for instruction mips.ccmov s0, s1, s2, s3 -# CHECK-FEATURE: error: instruction requires the following: 'Xmipscmove' ('mips.ccmov' instruction) +# CHECK-FEATURE: error: instruction requires the following: 'Xmipscmov' ('mips.ccmov' instruction) mips.lwp x10, x11 # CHECK: error: too few operands for instruction diff --git a/llvm/test/MC/RISCV/xmips-valid.s b/llvm/test/MC/RISCV/xmips-valid.s index ba256a823f511..9f31e4fa2038c 100644 --- a/llvm/test/MC/RISCV/xmips-valid.s +++ b/llvm/test/MC/RISCV/xmips-valid.s @@ -1,7 +1,7 @@ -# RUN: llvm-mc %s -triple=riscv64 -mattr=+xmipslsp,+xmipscmove -M no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+xmipslsp,+xmipscmov -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s -# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+xmipslsp,+xmipscmove < %s \ -# RUN: | llvm-objdump --mattr=+xmipslsp,+xmipscmove -M no-aliases -d - \ +# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+xmipslsp,+xmipscmov < %s \ +# RUN: | llvm-objdump --mattr=+xmipslsp,+xmipscmov -M no-aliases -d - \ # RUN: | FileCheck -check-prefix=CHECK-DIS %s # CHECK-INST: mips.ccmov s0, s1, s2, s3 diff --git a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp index 2ec27ba9d91b7..ff0a5e64ab3e1 100644 --- 
a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp +++ b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp @@ -1114,7 +1114,7 @@ R"(All available -march extensions for RISC-V xcvmac 1.0 xcvmem 1.0 xcvsimd 1.0 - xmipscmove 1.0 + xmipscmov 1.0 xmipslsp 1.0 xsfcease 1.0 xsfvcp 1.0 From 9c73eba8aa17cb7ca4248ab1c7f67ea7ec9b50b1 Mon Sep 17 00:00:00 2001 From: Aaron Puchert Date: Tue, 15 Apr 2025 23:21:34 +0200 Subject: [PATCH 044/710] Merge similar Clang Thread Safety attributes (#135561) Some of the old lock-based and new capability-based spellings behave basically in the same way, so merging them simplifies the code significantly. There are two minor functional changes: we only warn (instead of an error) when the try_acquire_capability attribute is used on something else than a function. The alternative would have been to produce an error for the old spelling, but we seem to only warn for all function attributes, so this is arguably more consistent. The second change is that we also check the first argument (which is the value returned for a successful try-acquire) for `this`. But from what I can tell, this code is defunct anyway at the moment (see #31414). 
--- clang/include/clang/Basic/Attr.td | 65 +++++-------------------- clang/lib/AST/ASTImporter.cpp | 28 ----------- clang/lib/Analysis/ThreadSafety.cpp | 60 ++--------------------- clang/lib/Sema/SemaDeclAttr.cpp | 55 --------------------- clang/lib/Sema/SemaDeclCXX.cpp | 13 ++--- clang/test/Sema/attr-capabilities.c | 2 +- clang/unittests/AST/ASTImporterTest.cpp | 36 -------------- 7 files changed, 21 insertions(+), 238 deletions(-) diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 9d4900f3029c8..9465451cbfe1f 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -3818,7 +3818,9 @@ def Capability : InheritableAttr { def AssertCapability : InheritableAttr { let Spellings = [Clang<"assert_capability", 0>, - Clang<"assert_shared_capability", 0>]; + Clang<"assert_shared_capability", 0>, + GNU<"assert_exclusive_lock">, + GNU<"assert_shared_lock">]; let Subjects = SubjectList<[Function]>; let LateParsed = LateAttrParseStandard; let TemplateDependent = 1; @@ -3826,7 +3828,8 @@ def AssertCapability : InheritableAttr { let InheritEvenIfAlreadyPresent = 1; let Args = [VariadicExprArgument<"Args">]; let Accessors = [Accessor<"isShared", - [Clang<"assert_shared_capability", 0>]>]; + [Clang<"assert_shared_capability", 0>, + GNU<"assert_shared_lock">]>]; let Documentation = [AssertCapabilityDocs]; } @@ -3849,16 +3852,18 @@ def AcquireCapability : InheritableAttr { def TryAcquireCapability : InheritableAttr { let Spellings = [Clang<"try_acquire_capability", 0>, - Clang<"try_acquire_shared_capability", 0>]; - let Subjects = SubjectList<[Function], - ErrorDiag>; + Clang<"try_acquire_shared_capability", 0>, + GNU<"exclusive_trylock_function">, + GNU<"shared_trylock_function">]; + let Subjects = SubjectList<[Function]>; let LateParsed = LateAttrParseStandard; let TemplateDependent = 1; let ParseArgumentsAsUnevaluated = 1; let InheritEvenIfAlreadyPresent = 1; let Args = [ExprArgument<"SuccessValue">, 
VariadicExprArgument<"Args">]; let Accessors = [Accessor<"isShared", - [Clang<"try_acquire_shared_capability", 0>]>]; + [Clang<"try_acquire_shared_capability", 0>, + GNU<"shared_trylock_function">]>]; let Documentation = [TryAcquireCapabilityDocs]; } @@ -3948,54 +3953,6 @@ def AcquiredBefore : InheritableAttr { let Documentation = [Undocumented]; } -def AssertExclusiveLock : InheritableAttr { - let Spellings = [GNU<"assert_exclusive_lock">]; - let Args = [VariadicExprArgument<"Args">]; - let LateParsed = LateAttrParseStandard; - let TemplateDependent = 1; - let ParseArgumentsAsUnevaluated = 1; - let InheritEvenIfAlreadyPresent = 1; - let Subjects = SubjectList<[Function]>; - let Documentation = [Undocumented]; -} - -def AssertSharedLock : InheritableAttr { - let Spellings = [GNU<"assert_shared_lock">]; - let Args = [VariadicExprArgument<"Args">]; - let LateParsed = LateAttrParseStandard; - let TemplateDependent = 1; - let ParseArgumentsAsUnevaluated = 1; - let InheritEvenIfAlreadyPresent = 1; - let Subjects = SubjectList<[Function]>; - let Documentation = [Undocumented]; -} - -// The first argument is an integer or boolean value specifying the return value -// of a successful lock acquisition. -def ExclusiveTrylockFunction : InheritableAttr { - let Spellings = [GNU<"exclusive_trylock_function">]; - let Args = [ExprArgument<"SuccessValue">, VariadicExprArgument<"Args">]; - let LateParsed = LateAttrParseStandard; - let TemplateDependent = 1; - let ParseArgumentsAsUnevaluated = 1; - let InheritEvenIfAlreadyPresent = 1; - let Subjects = SubjectList<[Function]>; - let Documentation = [Undocumented]; -} - -// The first argument is an integer or boolean value specifying the return value -// of a successful lock acquisition. 
-def SharedTrylockFunction : InheritableAttr { - let Spellings = [GNU<"shared_trylock_function">]; - let Args = [ExprArgument<"SuccessValue">, VariadicExprArgument<"Args">]; - let LateParsed = LateAttrParseStandard; - let TemplateDependent = 1; - let ParseArgumentsAsUnevaluated = 1; - let InheritEvenIfAlreadyPresent = 1; - let Subjects = SubjectList<[Function]>; - let Documentation = [Undocumented]; -} - def LockReturned : InheritableAttr { let Spellings = [GNU<"lock_returned">]; let Args = [ExprArgument<"Arg">]; diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index b55b8f2c14147..00628602e61fa 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -9425,34 +9425,6 @@ Expected ASTImporter::Import(const Attr *FromAttr) { From->args_size()); break; } - case attr::AssertExclusiveLock: { - const auto *From = cast(FromAttr); - AI.importAttr(From, - AI.importArrayArg(From->args(), From->args_size()).value(), - From->args_size()); - break; - } - case attr::AssertSharedLock: { - const auto *From = cast(FromAttr); - AI.importAttr(From, - AI.importArrayArg(From->args(), From->args_size()).value(), - From->args_size()); - break; - } - case attr::ExclusiveTrylockFunction: { - const auto *From = cast(FromAttr); - AI.importAttr(From, AI.importArg(From->getSuccessValue()).value(), - AI.importArrayArg(From->args(), From->args_size()).value(), - From->args_size()); - break; - } - case attr::SharedTrylockFunction: { - const auto *From = cast(FromAttr); - AI.importAttr(From, AI.importArg(From->getSuccessValue()).value(), - AI.importArrayArg(From->args(), From->args_size()).value(), - From->args_size()); - break; - } case attr::LockReturned: { const auto *From = cast(FromAttr); AI.importAttr(From, AI.importArg(From->getArg()).value()); diff --git a/clang/lib/Analysis/ThreadSafety.cpp b/clang/lib/Analysis/ThreadSafety.cpp index 6b5b49377fa08..42fb0fe7dcdaa 100644 --- a/clang/lib/Analysis/ThreadSafety.cpp +++ 
b/clang/lib/Analysis/ThreadSafety.cpp @@ -1511,38 +1511,17 @@ void ThreadSafetyAnalyzer::getEdgeLockset(FactSet& Result, return; auto *FunDecl = dyn_cast_or_null(Exp->getCalleeDecl()); - if(!FunDecl || !FunDecl->hasAttrs()) + if (!FunDecl || !FunDecl->hasAttr()) return; CapExprSet ExclusiveLocksToAdd; CapExprSet SharedLocksToAdd; // If the condition is a call to a Trylock function, then grab the attributes - for (const auto *Attr : FunDecl->attrs()) { - switch (Attr->getKind()) { - case attr::TryAcquireCapability: { - auto *A = cast(Attr); - getMutexIDs(A->isShared() ? SharedLocksToAdd : ExclusiveLocksToAdd, A, - Exp, FunDecl, PredBlock, CurrBlock, A->getSuccessValue(), - Negate); - break; - }; - case attr::ExclusiveTrylockFunction: { - const auto *A = cast(Attr); - getMutexIDs(ExclusiveLocksToAdd, A, Exp, FunDecl, PredBlock, CurrBlock, - A->getSuccessValue(), Negate); - break; - } - case attr::SharedTrylockFunction: { - const auto *A = cast(Attr); - getMutexIDs(SharedLocksToAdd, A, Exp, FunDecl, PredBlock, CurrBlock, - A->getSuccessValue(), Negate); - break; - } - default: - break; - } - } + for (const auto *Attr : FunDecl->specific_attrs()) + getMutexIDs(Attr->isShared() ? SharedLocksToAdd : ExclusiveLocksToAdd, Attr, + Exp, FunDecl, PredBlock, CurrBlock, Attr->getSuccessValue(), + Negate); // Add and remove locks. SourceLocation Loc = Exp->getExprLoc(); @@ -1882,29 +1861,6 @@ void BuildLockset::handleCall(const Expr *Exp, const NamedDecl *D, // An assert will add a lock to the lockset, but will not generate // a warning if it is already there, and will not generate a warning // if it is not removed. 
- case attr::AssertExclusiveLock: { - const auto *A = cast(At); - - CapExprSet AssertLocks; - Analyzer->getMutexIDs(AssertLocks, A, Exp, D, Self); - for (const auto &AssertLock : AssertLocks) - Analyzer->addLock( - FSet, std::make_unique( - AssertLock, LK_Exclusive, Loc, FactEntry::Asserted)); - break; - } - case attr::AssertSharedLock: { - const auto *A = cast(At); - - CapExprSet AssertLocks; - Analyzer->getMutexIDs(AssertLocks, A, Exp, D, Self); - for (const auto &AssertLock : AssertLocks) - Analyzer->addLock( - FSet, std::make_unique( - AssertLock, LK_Shared, Loc, FactEntry::Asserted)); - break; - } - case attr::AssertCapability: { const auto *A = cast(At); CapExprSet AssertLocks; @@ -2499,12 +2455,6 @@ void ThreadSafetyAnalyzer::runAnalysis(AnalysisDeclContext &AC) { getMutexIDs(A->isShared() ? SharedLocksAcquired : ExclusiveLocksAcquired, A, nullptr, D); - } else if (isa(Attr)) { - // Don't try to check trylock functions for now. - return; - } else if (isa(Attr)) { - // Don't try to check trylock functions for now. - return; } else if (isa(Attr)) { // Don't try to check trylock functions for now. return; diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 20ea38b7e05db..bc891fb009410 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -538,29 +538,6 @@ static bool checkLockFunAttrCommon(Sema &S, Decl *D, const ParsedAttr &AL, return true; } -static void handleAssertSharedLockAttr(Sema &S, Decl *D, const ParsedAttr &AL) { - SmallVector Args; - if (!checkLockFunAttrCommon(S, D, AL, Args)) - return; - - unsigned Size = Args.size(); - Expr **StartArg = Size == 0 ? nullptr : &Args[0]; - D->addAttr(::new (S.Context) - AssertSharedLockAttr(S.Context, AL, StartArg, Size)); -} - -static void handleAssertExclusiveLockAttr(Sema &S, Decl *D, - const ParsedAttr &AL) { - SmallVector Args; - if (!checkLockFunAttrCommon(S, D, AL, Args)) - return; - - unsigned Size = Args.size(); - Expr **StartArg = Size == 0 ? 
nullptr : &Args[0]; - D->addAttr(::new (S.Context) - AssertExclusiveLockAttr(S.Context, AL, StartArg, Size)); -} - /// Checks to be sure that the given parameter number is in bounds, and /// is an integral type. Will emit appropriate diagnostics if this returns /// false. @@ -640,26 +617,6 @@ static bool checkTryLockFunAttrCommon(Sema &S, Decl *D, const ParsedAttr &AL, return true; } -static void handleSharedTrylockFunctionAttr(Sema &S, Decl *D, - const ParsedAttr &AL) { - SmallVector Args; - if (!checkTryLockFunAttrCommon(S, D, AL, Args)) - return; - - D->addAttr(::new (S.Context) SharedTrylockFunctionAttr( - S.Context, AL, AL.getArgAsExpr(0), Args.data(), Args.size())); -} - -static void handleExclusiveTrylockFunctionAttr(Sema &S, Decl *D, - const ParsedAttr &AL) { - SmallVector Args; - if (!checkTryLockFunAttrCommon(S, D, AL, Args)) - return; - - D->addAttr(::new (S.Context) ExclusiveTrylockFunctionAttr( - S.Context, AL, AL.getArgAsExpr(0), Args.data(), Args.size())); -} - static void handleLockReturnedAttr(Sema &S, Decl *D, const ParsedAttr &AL) { // check that the argument is lockable object SmallVector Args; @@ -7528,12 +7485,6 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL, break; // Thread safety attributes: - case ParsedAttr::AT_AssertExclusiveLock: - handleAssertExclusiveLockAttr(S, D, AL); - break; - case ParsedAttr::AT_AssertSharedLock: - handleAssertSharedLockAttr(S, D, AL); - break; case ParsedAttr::AT_PtGuardedVar: handlePtGuardedVarAttr(S, D, AL); break; @@ -7549,18 +7500,12 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL, case ParsedAttr::AT_PtGuardedBy: handlePtGuardedByAttr(S, D, AL); break; - case ParsedAttr::AT_ExclusiveTrylockFunction: - handleExclusiveTrylockFunctionAttr(S, D, AL); - break; case ParsedAttr::AT_LockReturned: handleLockReturnedAttr(S, D, AL); break; case ParsedAttr::AT_LocksExcluded: handleLocksExcludedAttr(S, D, AL); break; - case ParsedAttr::AT_SharedTrylockFunction: 
- handleSharedTrylockFunctionAttr(S, D, AL); - break; case ParsedAttr::AT_AcquiredBefore: handleAcquiredBeforeAttr(S, D, AL); break; diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index 2247aded9384a..05991228dbfc2 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -19401,13 +19401,7 @@ bool Sema::checkThisInStaticMemberFunctionAttributes(CXXMethodDecl *Method) { Args = llvm::ArrayRef(AA->args_begin(), AA->args_size()); else if (const auto *AB = dyn_cast(A)) Args = llvm::ArrayRef(AB->args_begin(), AB->args_size()); - else if (const auto *ETLF = dyn_cast(A)) { - Arg = ETLF->getSuccessValue(); - Args = llvm::ArrayRef(ETLF->args_begin(), ETLF->args_size()); - } else if (const auto *STLF = dyn_cast(A)) { - Arg = STLF->getSuccessValue(); - Args = llvm::ArrayRef(STLF->args_begin(), STLF->args_size()); - } else if (const auto *LR = dyn_cast(A)) + else if (const auto *LR = dyn_cast(A)) Arg = LR->getArg(); else if (const auto *LE = dyn_cast(A)) Args = llvm::ArrayRef(LE->args_begin(), LE->args_size()); @@ -19415,9 +19409,10 @@ bool Sema::checkThisInStaticMemberFunctionAttributes(CXXMethodDecl *Method) { Args = llvm::ArrayRef(RC->args_begin(), RC->args_size()); else if (const auto *AC = dyn_cast(A)) Args = llvm::ArrayRef(AC->args_begin(), AC->args_size()); - else if (const auto *AC = dyn_cast(A)) + else if (const auto *AC = dyn_cast(A)) { + Arg = AC->getSuccessValue(); Args = llvm::ArrayRef(AC->args_begin(), AC->args_size()); - else if (const auto *RC = dyn_cast(A)) + } else if (const auto *RC = dyn_cast(A)) Args = llvm::ArrayRef(RC->args_begin(), RC->args_size()); if (Arg && !Finder.TraverseStmt(Arg)) diff --git a/clang/test/Sema/attr-capabilities.c b/clang/test/Sema/attr-capabilities.c index 5138803bd5eb7..12b18b687803e 100644 --- a/clang/test/Sema/attr-capabilities.c +++ b/clang/test/Sema/attr-capabilities.c @@ -14,7 +14,7 @@ struct __attribute__((capability("custom"))) CustomName {}; int Test1 
__attribute__((capability("test1"))); // expected-error {{'capability' attribute only applies to structs, unions, classes, and typedefs}} int Test2 __attribute__((shared_capability("test2"))); // expected-error {{'shared_capability' attribute only applies to structs, unions, classes, and typedefs}} int Test3 __attribute__((acquire_capability("test3"))); // expected-warning {{'acquire_capability' attribute only applies to functions}} -int Test4 __attribute__((try_acquire_capability("test4"))); // expected-error {{'try_acquire_capability' attribute only applies to functions}} +int Test4 __attribute__((try_acquire_capability("test4"))); // expected-warning {{'try_acquire_capability' attribute only applies to functions}} int Test5 __attribute__((release_capability("test5"))); // expected-warning {{'release_capability' attribute only applies to functions}} struct __attribute__((capability(12))) Test3 {}; // expected-error {{expected string literal as argument of 'capability' attribute}} diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp index 40e1197bc21f1..4192faee1af80 100644 --- a/clang/unittests/AST/ASTImporterTest.cpp +++ b/clang/unittests/AST/ASTImporterTest.cpp @@ -7977,42 +7977,6 @@ TEST_P(ImportAttributes, ImportAcquiredBefore) { checkImportVariadicArg(FromAttr->args(), ToAttr->args()); } -TEST_P(ImportAttributes, ImportAssertExclusiveLock) { - AssertExclusiveLockAttr *FromAttr, *ToAttr; - importAttr("void test(int A1, int A2) " - "__attribute__((assert_exclusive_lock(A1, A2)));", - FromAttr, ToAttr); - checkImportVariadicArg(FromAttr->args(), ToAttr->args()); -} - -TEST_P(ImportAttributes, ImportAssertSharedLock) { - AssertSharedLockAttr *FromAttr, *ToAttr; - importAttr( - "void test(int A1, int A2) __attribute__((assert_shared_lock(A1, A2)));", - FromAttr, ToAttr); - checkImportVariadicArg(FromAttr->args(), ToAttr->args()); -} - -TEST_P(ImportAttributes, ImportExclusiveTrylockFunction) { - 
ExclusiveTrylockFunctionAttr *FromAttr, *ToAttr; - importAttr( - "void test(int A1, int A2) __attribute__((exclusive_trylock_function(1, " - "A1, A2)));", - FromAttr, ToAttr); - checkImported(FromAttr->getSuccessValue(), ToAttr->getSuccessValue()); - checkImportVariadicArg(FromAttr->args(), ToAttr->args()); -} - -TEST_P(ImportAttributes, ImportSharedTrylockFunction) { - SharedTrylockFunctionAttr *FromAttr, *ToAttr; - importAttr( - "void test(int A1, int A2) __attribute__((shared_trylock_function(1, A1, " - "A2)));", - FromAttr, ToAttr); - checkImported(FromAttr->getSuccessValue(), ToAttr->getSuccessValue()); - checkImportVariadicArg(FromAttr->args(), ToAttr->args()); -} - TEST_P(ImportAttributes, ImportLockReturned) { LockReturnedAttr *FromAttr, *ToAttr; importAttr( From 7cb7b2d39d3b4aa984bfaeaf5e69fbfb074edd41 Mon Sep 17 00:00:00 2001 From: Alexandre Ganea Date: Tue, 15 Apr 2025 17:37:01 -0400 Subject: [PATCH 045/710] [llvm] Build Windows release package with clang-cl if possible (#135446) If `clang-cl.exe` and `lld-link.exe` are installed in `%PATH%`, the Windows release build script will now use these by default, in place of MSVC. The reason for doing this is that MSVC still has, for the past year(s), a O(N^2) behavior when building certain LLVM source files, which leads to long build times (minutes per file). A report was filled here: https://developercommunity.visualstudio.com/t/ON2-in-SparseBitVectorBase-when-com/10657991 Also added a `--force-msvc` option to the script, to use MSVC even if clang-cl is installed. 
--- llvm/utils/release/build_llvm_release.bat | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/llvm/utils/release/build_llvm_release.bat b/llvm/utils/release/build_llvm_release.bat index 588d7201fcb92..3042fc2d77dd1 100755 --- a/llvm/utils/release/build_llvm_release.bat +++ b/llvm/utils/release/build_llvm_release.bat @@ -7,7 +7,7 @@ goto begin echo Script for building the LLVM installer on Windows, echo used for the releases at https://github.com/llvm/llvm-project/releases echo. -echo Usage: build_llvm_release.bat --version ^ [--x86,--x64, --arm64] [--skip-checkout] [--local-python] +echo Usage: build_llvm_release.bat --version ^ [--x86,--x64, --arm64] [--skip-checkout] [--local-python] [--force-msvc] echo. echo Options: echo --version: [required] version to build @@ -17,6 +17,7 @@ echo --x64: build and test x64 variant echo --arm64: build and test arm64 variant echo --skip-checkout: use local git checkout instead of downloading src.zip echo --local-python: use installed Python and does not try to use a specific version (3.10) +echo --force-msvc: use MSVC compiler for stage0, even if clang-cl is present echo. echo Note: At least one variant to build is required. echo. 
@@ -34,6 +35,7 @@ set x64= set arm64= set skip-checkout= set local-python= +set force-msvc= call :parse_args %* if "%help%" NEQ "" goto usage @@ -165,6 +167,24 @@ set common_cmake_flags=^ -DLLVM_ENABLE_RPMALLOC=ON ^ -DLLVM_ENABLE_PROJECTS="clang;clang-tools-extra;lld;compiler-rt;lldb;openmp" +if "%force-msvc%" == "" ( + where /q clang-cl + if errorlevel 0 ( + where /q lld-link + if errorlevel 0 ( + set common_compiler_flags=%common_compiler_flags% -fuse-ld=lld + + set common_cmake_flags=%common_cmake_flags%^ + -DCMAKE_C_COMPILER=clang-cl.exe ^ + -DCMAKE_CXX_COMPILER=clang-cl.exe ^ + -DCMAKE_LINKER=lld-link.exe ^ + -DLLVM_ENABLE_LLD=ON ^ + -DCMAKE_C_FLAGS="%common_compiler_flags%" ^ + -DCMAKE_CXX_FLAGS="%common_compiler_flags%" + ) + ) +) + set cmake_profile_flags="" REM Preserve original path From d0372179fbbcb7b3fa680a78919a980fa4384c46 Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Tue, 15 Apr 2025 14:43:28 -0700 Subject: [PATCH 046/710] [nfc] Add doc comment for `canReturn` in Analysis/CFG.h (#135862) --- llvm/include/llvm/Analysis/CFG.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/llvm/include/llvm/Analysis/CFG.h b/llvm/include/llvm/Analysis/CFG.h index 8451e88146d7c..052ffb2872af7 100644 --- a/llvm/include/llvm/Analysis/CFG.h +++ b/llvm/include/llvm/Analysis/CFG.h @@ -174,7 +174,10 @@ bool containsIrreducibleCFG(RPOTraversalT &RPOTraversal, const LoopInfoT &LI) { return false; } + +/// Return true if there is at least a path through which F can return, false if +/// there is no such path. 
bool canReturn(const Function &F); -} // End llvm namespace +} // namespace llvm #endif From 2d98bdc12c291523c3543ceaf1c526e25dcaedc6 Mon Sep 17 00:00:00 2001 From: Daniel Thornburgh Date: Tue, 15 Apr 2025 14:46:55 -0700 Subject: [PATCH 047/710] =?UTF-8?q?Revert=20"[llvm][clang]=20Allocate=20a?= =?UTF-8?q?=20new=20stack=20instead=20of=20spawning=20a=20new=20=E2=80=A6?= =?UTF-8?q?=20(#135865)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …thread to get more stack space (#133173)" This change breaks the Clang build on Mac AArch64. This reverts commit d0c973a7a0149db3b71767d4c5a20a31e6a8ed5b. This reverts commit 429a84f8a4bf559f43f50072747ef49d3e3b2cf1. This reverts commit 4f64c80d5a23c244f942193e58ecac666c173308. --- clang/docs/ReleaseNotes.rst | 4 - clang/include/clang/Basic/Stack.h | 5 +- clang/lib/Basic/Stack.cpp | 40 ++++-- clang/lib/Frontend/CompilerInstance.cpp | 2 +- .../llvm/Support/CrashRecoveryContext.h | 3 - llvm/include/llvm/Support/ProgramStack.h | 63 ---------- llvm/include/llvm/Support/thread.h | 1 - llvm/lib/Support/CMakeLists.txt | 1 - llvm/lib/Support/CrashRecoveryContext.cpp | 11 -- llvm/lib/Support/ProgramStack.cpp | 114 ------------------ llvm/unittests/Support/CMakeLists.txt | 1 - llvm/unittests/Support/ProgramStackTest.cpp | 35 ------ 12 files changed, 30 insertions(+), 250 deletions(-) delete mode 100644 llvm/include/llvm/Support/ProgramStack.h delete mode 100644 llvm/lib/Support/ProgramStack.cpp delete mode 100644 llvm/unittests/Support/ProgramStackTest.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 38142ad32bea0..166f26921cb71 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -195,10 +195,6 @@ Non-comprehensive list of changes in this release - Added `__builtin_elementwise_exp10`. - For AMDPGU targets, added `__builtin_v_cvt_off_f32_i4` that maps to the `v_cvt_off_f32_i4` instruction. 
- Added `__builtin_elementwise_minnum` and `__builtin_elementwise_maxnum`. -- Clang itself now uses split stacks instead of threads for allocating more - stack space when running on Apple AArch64 based platforms. This means that - stack traces of Clang from debuggers, crashes, and profilers may look - different than before. New Compiler Flags ------------------ diff --git a/clang/include/clang/Basic/Stack.h b/clang/include/clang/Basic/Stack.h index 9674b9d9b62c3..30ebd94aedd1f 100644 --- a/clang/include/clang/Basic/Stack.h +++ b/clang/include/clang/Basic/Stack.h @@ -27,10 +27,7 @@ namespace clang { /// Call this once on each thread, as soon after starting the thread as /// feasible, to note the approximate address of the bottom of the stack. - /// - /// \param ForceSet set to true if you know the call is near the bottom of a - /// new stack. Used for split stacks. - void noteBottomOfStack(bool ForceSet = false); + void noteBottomOfStack(); /// Determine whether the stack is nearly exhausted. 
bool isStackNearlyExhausted(); diff --git a/clang/lib/Basic/Stack.cpp b/clang/lib/Basic/Stack.cpp index 8cbb84943f8d3..aa15d8e66950f 100644 --- a/clang/lib/Basic/Stack.cpp +++ b/clang/lib/Basic/Stack.cpp @@ -13,13 +13,33 @@ #include "clang/Basic/Stack.h" #include "llvm/Support/CrashRecoveryContext.h" -#include "llvm/Support/ProgramStack.h" -static LLVM_THREAD_LOCAL uintptr_t BottomOfStack = 0; +#ifdef _MSC_VER +#include // for _AddressOfReturnAddress +#endif -void clang::noteBottomOfStack(bool ForceSet) { - if (!BottomOfStack || ForceSet) - BottomOfStack = llvm::getStackPointer(); +static LLVM_THREAD_LOCAL void *BottomOfStack = nullptr; + +static void *getStackPointer() { +#if __GNUC__ || __has_builtin(__builtin_frame_address) + return __builtin_frame_address(0); +#elif defined(_MSC_VER) + return _AddressOfReturnAddress(); +#else + char CharOnStack = 0; + // The volatile store here is intended to escape the local variable, to + // prevent the compiler from optimizing CharOnStack into anything other + // than a char on the stack. + // + // Tested on: MSVC 2015 - 2019, GCC 4.9 - 9, Clang 3.2 - 9, ICC 13 - 19. + char *volatile Ptr = &CharOnStack; + return Ptr; +#endif +} + +void clang::noteBottomOfStack() { + if (!BottomOfStack) + BottomOfStack = getStackPointer(); } bool clang::isStackNearlyExhausted() { @@ -31,8 +51,7 @@ bool clang::isStackNearlyExhausted() { if (!BottomOfStack) return false; - intptr_t StackDiff = - (intptr_t)llvm::getStackPointer() - (intptr_t)BottomOfStack; + intptr_t StackDiff = (intptr_t)getStackPointer() - (intptr_t)BottomOfStack; size_t StackUsage = (size_t)std::abs(StackDiff); // If the stack pointer has a surprising value, we do not understand this @@ -47,12 +66,9 @@ bool clang::isStackNearlyExhausted() { void clang::runWithSufficientStackSpaceSlow(llvm::function_ref Diag, llvm::function_ref Fn) { llvm::CrashRecoveryContext CRC; - // Preserve the BottomOfStack in case RunSafelyOnNewStack uses split stacks. 
- uintptr_t PrevBottom = BottomOfStack; - CRC.RunSafelyOnNewStack([&] { - noteBottomOfStack(true); + CRC.RunSafelyOnThread([&] { + noteBottomOfStack(); Diag(); Fn(); }, DesiredStackSize); - BottomOfStack = PrevBottom; } diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp index 5fe80fc16482e..243e0a3c15b05 100644 --- a/clang/lib/Frontend/CompilerInstance.cpp +++ b/clang/lib/Frontend/CompilerInstance.cpp @@ -1265,7 +1265,7 @@ bool CompilerInstance::compileModule(SourceLocation ImportLoc, // Execute the action to actually build the module in-place. Use a separate // thread so that we get a stack large enough. - bool Crashed = !llvm::CrashRecoveryContext().RunSafelyOnNewStack( + bool Crashed = !llvm::CrashRecoveryContext().RunSafelyOnThread( [&]() { GenerateModuleFromModuleMapAction Action; Instance.ExecuteAction(Action); diff --git a/llvm/include/llvm/Support/CrashRecoveryContext.h b/llvm/include/llvm/Support/CrashRecoveryContext.h index 31293d6715757..26ddf97b3ef02 100644 --- a/llvm/include/llvm/Support/CrashRecoveryContext.h +++ b/llvm/include/llvm/Support/CrashRecoveryContext.h @@ -97,9 +97,6 @@ class CrashRecoveryContext { return RunSafelyOnThread([&]() { Fn(UserData); }, RequestedStackSize); } - bool RunSafelyOnNewStack(function_ref, - unsigned RequestedStackSize = 0); - /// Explicitly trigger a crash recovery in the current process, and /// return failure from RunSafely(). This function does not return. [[noreturn]] void HandleExit(int RetCode); diff --git a/llvm/include/llvm/Support/ProgramStack.h b/llvm/include/llvm/Support/ProgramStack.h deleted file mode 100644 index 232a7b5670b44..0000000000000 --- a/llvm/include/llvm/Support/ProgramStack.h +++ /dev/null @@ -1,63 +0,0 @@ -//===--- ProgramStack.h -----------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_SUPPORT_PROGRAMSTACK_H -#define LLVM_SUPPORT_PROGRAMSTACK_H - -#include "llvm/ADT/STLFunctionalExtras.h" - -// LLVM_HAS_SPLIT_STACKS is exposed in the header because CrashRecoveryContext -// needs to know if it's running on another thread or not. -// -// Currently only Apple AArch64 is known to support split stacks in the debugger -// and other tooling. -#if defined(__APPLE__) && defined(__aarch64__) && \ - LLVM_HAS_CPP_ATTRIBUTE(gnu::naked) && __has_extension(gnu_asm) -# define LLVM_HAS_SPLIT_STACKS -# define LLVM_HAS_SPLIT_STACKS_AARCH64 -#endif - -namespace llvm { - -/// \returns an address close to the current value of the stack pointer. -/// -/// The value is not guaranteed to point to anything specific. It can be used to -/// estimate how much stack space has been used since the previous call. -uintptr_t getStackPointer(); - -/// \returns the default stack size for this platform. -/// -/// Based on \p RLIMIT_STACK or the equivalent. -unsigned getDefaultStackSize(); - -/// Runs Fn on a new stack of at least the given size. -/// -/// \param StackSize requested stack size. A size of 0 uses the default stack -/// size of the platform. -/// -/// The preferred implementation is split stacks on platforms that have a good -/// debugging experience for them. On other platforms a new thread is used. 
-void runOnNewStack(unsigned StackSize, function_ref Fn); - -template -std::enable_if_t, R> -runOnNewStack(unsigned StackSize, function_ref Fn, Ts &&...Args) { - std::optional Ret; - runOnNewStack(StackSize, [&]() { Ret = Fn(std::forward(Args)...); }); - return std::move(*Ret); -} - -template -void runOnNewStack(unsigned StackSize, function_ref Fn, - Ts &&...Args) { - runOnNewStack(StackSize, [&]() { Fn(std::forward(Args)...); }); -} - -} // namespace llvm - -#endif // LLVM_SUPPORT_PROGRAMSTACK_H diff --git a/llvm/include/llvm/Support/thread.h b/llvm/include/llvm/Support/thread.h index ef2fba822cb1c..e3005fdb63175 100644 --- a/llvm/include/llvm/Support/thread.h +++ b/llvm/include/llvm/Support/thread.h @@ -213,7 +213,6 @@ inline thread::id get_id() { return std::this_thread::get_id(); } #else // !LLVM_ENABLE_THREADS -#include "llvm/Support/ErrorHandling.h" #include namespace llvm { diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt index def37f3f278d0..98ffd829d80b8 100644 --- a/llvm/lib/Support/CMakeLists.txt +++ b/llvm/lib/Support/CMakeLists.txt @@ -295,7 +295,6 @@ add_llvm_component_library(LLVMSupport Path.cpp Process.cpp Program.cpp - ProgramStack.cpp RWMutex.cpp Signals.cpp Threading.cpp diff --git a/llvm/lib/Support/CrashRecoveryContext.cpp b/llvm/lib/Support/CrashRecoveryContext.cpp index 88c38d7526e71..f53aea177d612 100644 --- a/llvm/lib/Support/CrashRecoveryContext.cpp +++ b/llvm/lib/Support/CrashRecoveryContext.cpp @@ -10,7 +10,6 @@ #include "llvm/Config/llvm-config.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/ExitCodes.h" -#include "llvm/Support/ProgramStack.h" #include "llvm/Support/Signals.h" #include "llvm/Support/thread.h" #include @@ -524,13 +523,3 @@ bool CrashRecoveryContext::RunSafelyOnThread(function_ref Fn, CRC->setSwitchedThread(); return Info.Result; } - -bool CrashRecoveryContext::RunSafelyOnNewStack(function_ref Fn, - unsigned RequestedStackSize) { -#ifdef LLVM_HAS_SPLIT_STACKS - return 
runOnNewStack(RequestedStackSize, - function_ref([&]() { return RunSafely(Fn); })); -#else - return RunSafelyOnThread(Fn, RequestedStackSize); -#endif -} diff --git a/llvm/lib/Support/ProgramStack.cpp b/llvm/lib/Support/ProgramStack.cpp deleted file mode 100644 index 9e5a546b34974..0000000000000 --- a/llvm/lib/Support/ProgramStack.cpp +++ /dev/null @@ -1,114 +0,0 @@ -//===--- RunOnNewStack.cpp - Crash Recovery -------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "llvm/Support/ProgramStack.h" -#include "llvm/Config/config.h" -#include "llvm/Support/Compiler.h" - -#ifdef LLVM_ON_UNIX -# include // for getrlimit -#endif - -#ifdef _MSC_VER -# include // for _AddressOfReturnAddress -#endif - -#ifndef LLVM_HAS_SPLIT_STACKS -# include "llvm/Support/thread.h" -#endif - -using namespace llvm; - -uintptr_t llvm::getStackPointer() { -#if __GNUC__ || __has_builtin(__builtin_frame_address) - return (uintptr_t)__builtin_frame_address(0); -#elif defined(_MSC_VER) - return (uintptr_t)_AddressOfReturnAddress(); -#else - volatile char CharOnStack = 0; - // The volatile store here is intended to escape the local variable, to - // prevent the compiler from optimizing CharOnStack into anything other - // than a char on the stack. - // - // Tested on: MSVC 2015 - 2019, GCC 4.9 - 9, Clang 3.2 - 9, ICC 13 - 19. - char *volatile Ptr = &CharOnStack; - return (uintptr_t)Ptr; -#endif -} - -unsigned llvm::getDefaultStackSize() { -#ifdef LLVM_ON_UNIX - rlimit RL; - getrlimit(RLIMIT_STACK, &RL); - return RL.rlim_cur; -#else - // Clang recursively parses, instantiates templates, and evaluates constant - // expressions. 
We've found 8MiB to be a reasonable stack size given the way - // Clang works and the way C++ is commonly written. - return 8 << 20; -#endif -} - -namespace { -#ifdef LLVM_HAS_SPLIT_STACKS_AARCH64 -[[gnu::naked]] void runOnNewStackImpl(void *Stack, void (*Fn)(void *), - void *Ctx) { - __asm__ volatile( - "mov x16, sp\n\t" - "sub x0, x0, #0x20\n\t" // subtract space from stack - "stp xzr, x16, [x0, #0x00]\n\t" // save old sp - "stp x29, x30, [x0, #0x10]\n\t" // save fp, lr - "mov sp, x0\n\t" // switch to new stack - "add x29, x0, #0x10\n\t" // switch to new frame - ".cfi_def_cfa w29, 16\n\t" - ".cfi_offset w30, -8\n\t" // lr - ".cfi_offset w29, -16\n\t" // fp - - "mov x0, x2\n\t" // Ctx is the only argument - "blr x1\n\t" // call Fn - - "ldp x29, x30, [sp, #0x10]\n\t" // restore fp, lr - "ldp xzr, x16, [sp, #0x00]\n\t" // load old sp - "mov sp, x16\n\t" - "ret" - ); -} -#endif - -#ifdef LLVM_HAS_SPLIT_STACKS -void callback(void *Ctx) { - (*reinterpret_cast *>(Ctx))(); -} -#endif -} // namespace - -#ifdef LLVM_HAS_SPLIT_STACKS -void llvm::runOnNewStack(unsigned StackSize, function_ref Fn) { - if (StackSize == 0) - StackSize = getDefaultStackSize(); - - // We use malloc here instead of mmap because: - // - it's simpler, - // - many malloc implementations will reuse the allocation in cases where - // we're bouncing accross the edge of a stack boundry, and - // - many malloc implemenations will already provide guard pages for - // allocations this large. - void *Stack = malloc(StackSize); - void *BottomOfStack = (char *)Stack + StackSize; - - runOnNewStackImpl(BottomOfStack, callback, &Fn); - - free(Stack); -} -#else -void llvm::runOnNewStack(unsigned StackSize, function_ref Fn) { - llvm::thread Thread( - StackSize == 0 ? 
std::nullopt : std::optional(StackSize), Fn); - Thread.join(); -} -#endif diff --git a/llvm/unittests/Support/CMakeLists.txt b/llvm/unittests/Support/CMakeLists.txt index e5bf820fb4d1c..6c4e7cb689b20 100644 --- a/llvm/unittests/Support/CMakeLists.txt +++ b/llvm/unittests/Support/CMakeLists.txt @@ -70,7 +70,6 @@ add_llvm_unittest(SupportTests PerThreadBumpPtrAllocatorTest.cpp ProcessTest.cpp ProgramTest.cpp - ProgramStackTest.cpp RecyclerTest.cpp RegexTest.cpp ReverseIterationTest.cpp diff --git a/llvm/unittests/Support/ProgramStackTest.cpp b/llvm/unittests/Support/ProgramStackTest.cpp deleted file mode 100644 index 31dfb3b88ade6..0000000000000 --- a/llvm/unittests/Support/ProgramStackTest.cpp +++ /dev/null @@ -1,35 +0,0 @@ -//===- unittest/Support/ProgramStackTest.cpp ------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "llvm/Support/ProgramStack.h" -#include "llvm/Support/Process.h" -#include "gtest/gtest.h" - -using namespace llvm; - -static uintptr_t func(int &A) { - A = 7; - return getStackPointer(); -} - -static void func2(int &A) { - A = 5; -} - -TEST(ProgramStackTest, runOnNewStack) { - int A = 0; - uintptr_t Stack = runOnNewStack(0, function_ref(func), A); - EXPECT_EQ(A, 7); - intptr_t StackDiff = (intptr_t)llvm::getStackPointer() - (intptr_t)Stack; - size_t StackDistance = (size_t)std::abs(StackDiff); - // Page size is used as it's large enough to guarantee were not on the same - // stack but not too large to cause spurious failures. 
- EXPECT_GT(StackDistance, llvm::sys::Process::getPageSizeEstimate()); - runOnNewStack(0, function_ref(func2), A); - EXPECT_EQ(A, 5); -} From 8ed397d8e4d014ecc5df89a9d908c5808f201b65 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 15 Apr 2025 14:58:15 -0700 Subject: [PATCH 048/710] [DAGCombiner] Disable narrowExtractedVectorLoad for indexed loads. (#135847) The later code does not expect or preserve the index output. Fixes #135821 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 2 +- llvm/test/CodeGen/AArch64/pr135821.ll | 27 +++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AArch64/pr135821.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index b322fe670d4a7..d72be359867ca 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -25183,7 +25183,7 @@ static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) { return SDValue(); auto *Ld = dyn_cast(Extract->getOperand(0)); - if (!Ld || Ld->getExtensionType() || !Ld->isSimple()) + if (!Ld || !ISD::isNormalLoad(Ld) || !Ld->isSimple()) return SDValue(); // Allow targets to opt-out. diff --git a/llvm/test/CodeGen/AArch64/pr135821.ll b/llvm/test/CodeGen/AArch64/pr135821.ll new file mode 100644 index 0000000000000..cfd6cd086e130 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/pr135821.ll @@ -0,0 +1,27 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=aarch64-unknown-linux-gnu | FileCheck %s + +define <4 x float> @f(ptr %0) { +; CHECK-LABEL: f: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: ldr q1, [x0, #56]! 
+; CHECK-NEXT: ldr d0, [x0, #16] +; CHECK-NEXT: mov v1.d[1], v0.d[0] +; CHECK-NEXT: str q1, [sp] // 16-byte Folded Spill +; CHECK-NEXT: bl use +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: ret + %2 = getelementptr inbounds nuw i8, ptr %0, i64 56 + %3 = load <6 x float>, ptr %2, align 4 + %4 = shufflevector <6 x float> %3, <6 x float> poison, <4 x i32> + tail call void @use(ptr %2) + ret <4 x float> %4 +} + +declare void @use(ptr) From a6208ce4c15142c26c6b73651bf466ae6b470cb0 Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Tue, 15 Apr 2025 15:07:03 -0700 Subject: [PATCH 049/710] [nfc] move `isPresplitCoroSuspendExitEdge` to Analysis/CFG (#135849) --- llvm/include/llvm/Analysis/CFG.h | 19 +++++++++++++++++++ .../llvm/Transforms/Utils/BasicBlockUtils.h | 18 ------------------ llvm/lib/Analysis/CFG.cpp | 13 +++++++++++++ .../Instrumentation/InstrProfiling.cpp | 1 + .../Instrumentation/PGOCtxProfFlattening.cpp | 1 + llvm/lib/Transforms/Utils/BasicBlockUtils.cpp | 12 ------------ 6 files changed, 34 insertions(+), 30 deletions(-) diff --git a/llvm/include/llvm/Analysis/CFG.h b/llvm/include/llvm/Analysis/CFG.h index 052ffb2872af7..64e2079df9db2 100644 --- a/llvm/include/llvm/Analysis/CFG.h +++ b/llvm/include/llvm/Analysis/CFG.h @@ -175,6 +175,25 @@ bool containsIrreducibleCFG(RPOTraversalT &RPOTraversal, const LoopInfoT &LI) { return false; } +// Returns true if these basic blocks belong to a presplit coroutine and the +// edge corresponds to the 'default' case in the switch statement in the +// pattern: +// +// %0 = call i8 @llvm.coro.suspend(token none, i1 false) +// switch i8 %0, label %suspend [i8 0, label %resume +// i8 1, label %cleanup] +// +// i.e. the edge to the `%suspend` BB. This edge is special in that it will +// be elided by coroutine lowering (coro-split), and the `%suspend` BB needs +// to be kept as-is. 
It's not a real CFG edge - post-lowering, it will end +// up being a `ret`, and it must be thus lowerable to support symmetric +// transfer. For example: +// - this edge is not a loop exit edge if encountered in a loop (and should +// be ignored) +// - must not be split for PGO instrumentation, for example. +bool isPresplitCoroSuspendExitEdge(const BasicBlock &Src, + const BasicBlock &Dest); + /// Return true if there is at least a path through which F can return, false if /// there is no such path. bool canReturn(const Function &F); diff --git a/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h b/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h index 6faff3d1fd8e3..adc1851c2ec2f 100644 --- a/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h +++ b/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h @@ -610,24 +610,6 @@ void InvertBranch(BranchInst *PBI, IRBuilderBase &Builder); // br/brcond/unreachable/ret bool hasOnlySimpleTerminator(const Function &F); -// Returns true if these basic blocks belong to a presplit coroutine and the -// edge corresponds to the 'default' case in the switch statement in the -// pattern: -// -// %0 = call i8 @llvm.coro.suspend(token none, i1 false) -// switch i8 %0, label %suspend [i8 0, label %resume -// i8 1, label %cleanup] -// -// i.e. the edge to the `%suspend` BB. This edge is special in that it will -// be elided by coroutine lowering (coro-split), and the `%suspend` BB needs -// to be kept as-is. It's not a real CFG edge - post-lowering, it will end -// up being a `ret`, and it must be thus lowerable to support symmetric -// transfer. For example: -// - this edge is not a loop exit edge if encountered in a loop (and should -// be ignored) -// - must not be split for PGO instrumentation, for example. 
-bool isPresplitCoroSuspendExitEdge(const BasicBlock &Src, - const BasicBlock &Dest); } // end namespace llvm #endif // LLVM_TRANSFORMS_UTILS_BASICBLOCKUTILS_H diff --git a/llvm/lib/Analysis/CFG.cpp b/llvm/lib/Analysis/CFG.cpp index 8ced4a901557d..0d32e101ee0b4 100644 --- a/llvm/lib/Analysis/CFG.cpp +++ b/llvm/lib/Analysis/CFG.cpp @@ -14,6 +14,7 @@ #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/Dominators.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/CommandLine.h" using namespace llvm; @@ -356,3 +357,15 @@ bool llvm::canReturn(const Function &F) { return false; } + +bool llvm::isPresplitCoroSuspendExitEdge(const BasicBlock &Src, + const BasicBlock &Dest) { + assert(Src.getParent() == Dest.getParent()); + if (!Src.getParent()->isPresplitCoroutine()) + return false; + if (auto *SW = dyn_cast(Src.getTerminator())) + if (auto *Intr = dyn_cast(SW->getCondition())) + return Intr->getIntrinsicID() == Intrinsic::coro_suspend && + SW->getDefaultDest() == &Dest; + return false; +} \ No newline at end of file diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index 008c1faf0a0c3..84bf4c62c7aad 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -20,6 +20,7 @@ #include "llvm/ADT/Twine.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Analysis/CFG.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/Attributes.h" diff --git a/llvm/lib/Transforms/Instrumentation/PGOCtxProfFlattening.cpp b/llvm/lib/Transforms/Instrumentation/PGOCtxProfFlattening.cpp index 05f364a974c6c..508a41684ed20 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOCtxProfFlattening.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOCtxProfFlattening.cpp @@ -21,6 +21,7 @@ #include 
"llvm/Transforms/Instrumentation/PGOCtxProfFlattening.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/ScopeExit.h" +#include "llvm/Analysis/CFG.h" #include "llvm/Analysis/CtxProfAnalysis.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/IR/Analysis.h" diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp index 6f36e24000aa5..b78270f6309ff 100644 --- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -1916,15 +1916,3 @@ bool llvm::hasOnlySimpleTerminator(const Function &F) { } return true; } - -bool llvm::isPresplitCoroSuspendExitEdge(const BasicBlock &Src, - const BasicBlock &Dest) { - assert(Src.getParent() == Dest.getParent()); - if (!Src.getParent()->isPresplitCoroutine()) - return false; - if (auto *SW = dyn_cast(Src.getTerminator())) - if (auto *Intr = dyn_cast(SW->getCondition())) - return Intr->getIntrinsicID() == Intrinsic::coro_suspend && - SW->getDefaultDest() == &Dest; - return false; -} From 6e2bca840df9dfcffc5068c1ad0c9575f0c57e76 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Tue, 15 Apr 2025 22:07:57 +0000 Subject: [PATCH 050/710] [gn build] Port 2d98bdc12c29 --- llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn | 1 - llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn | 1 - 2 files changed, 2 deletions(-) diff --git a/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn index 0d2330cba6a7a..3a9f43b1070a7 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn @@ -124,7 +124,6 @@ static_library("Support") { "Parallel.cpp", "PluginLoader.cpp", "PrettyStackTrace.cpp", - "ProgramStack.cpp", "RISCVAttributeParser.cpp", "RISCVAttributes.cpp", "RISCVISAUtils.cpp", diff --git a/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn index 
19418ad52147b..bf6a0b7523279 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn @@ -71,7 +71,6 @@ unittest("SupportTests") { "Path.cpp", "PerThreadBumpPtrAllocatorTest.cpp", "ProcessTest.cpp", - "ProgramStackTest.cpp", "ProgramTest.cpp", "RISCVAttributeParserTest.cpp", "RecyclerTest.cpp", From 31f39c83259401a26b3660dd75f645002258571d Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Tue, 15 Apr 2025 15:17:33 -0700 Subject: [PATCH 051/710] [AMDGPU] Remove the AnnotateKernelFeatures pass (#130198) Previously the AnnotateKernelFeatures pass infers two attributes: amdgpu-calls and amdgpu-stack-objects, which are used to help determine if flat scratch init is allowed. PR #118907 created the amdgpu-no-flat-scratch-init attribute. Continuing with that work, this patch makes use of this attribute to determine flat scratch init, replacing amdgpu-calls and amdgpu-stack-objects. This also leads to the removal of the AnnotateKernelFeatures pass. 
--- llvm/lib/Target/AMDGPU/AMDGPU.h | 3 - .../AMDGPU/AMDGPUAnnotateKernelFeatures.cpp | 9 - llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 6 - .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 7 - llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 15 +- .../AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll | 368 ++++- .../AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll | 390 ++++- .../AMDGPU/GlobalISel/extractelement.ll | 71 +- .../AMDGPU/GlobalISel/flat-scratch-init.ll | 4 +- ...licit-kernarg-backend-usage-global-isel.ll | 30 +- .../GlobalISel/insertelement-stack-lower.ll | 2 +- .../AMDGPU/GlobalISel/lds-global-value.ll | 5 +- .../GlobalISel/llvm.amdgcn.if.break.i64.ll | 3 + .../GlobalISel/llvm.amdgcn.trig.preop.ll | 24 + .../test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll | 33 + .../test/CodeGen/AMDGPU/GlobalISel/udivrem.ll | 33 + .../abi-attribute-hints-undefined-behavior.ll | 18 +- .../AMDGPU/addrspacecast-constantexpr.ll | 62 - llvm/test/CodeGen/AMDGPU/always-uniform.ll | 3 + ...amdgpu-codegenprepare-fold-binop-select.ll | 3 + .../CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll | 4 +- .../annotate-kernel-features-hsa-call.ll | 331 ---- .../AMDGPU/annotate-kernel-features-hsa.ll | 165 -- .../AMDGPU/annotate-kernel-features.ll | 103 -- .../attr-amdgpu-flat-work-group-size.ll | 4 +- .../CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll | 4 +- .../AMDGPU/attr-amdgpu-waves-per-eu.ll | 4 +- .../attributor-flatscratchinit-globalisel.ll | 21 +- ...utor-flatscratchinit-undefined-behavior.ll | 63 + ...tor-flatscratchinit-undefined-behavior2.ll | 870 +++++++++++ llvm/test/CodeGen/AMDGPU/attributor-noopt.ll | 4 +- .../callee-special-input-sgprs-fixed-abi.ll | 40 +- llvm/test/CodeGen/AMDGPU/code-object-v3.ll | 8 +- .../CodeGen/AMDGPU/combine-reg-or-const.ll | 3 + ...dagcomb-extract-vec-elt-different-sizes.ll | 2 + .../AMDGPU/duplicate-attribute-indirect.ll | 13 - ...cannot-create-empty-or-backward-segment.ll | 2 +- .../expand-scalar-carry-out-select-user.ll | 3 + .../CodeGen/AMDGPU/extract_vector_elt-i8.ll | 100 +- 
llvm/test/CodeGen/AMDGPU/fabs.f16.ll | 66 + .../fast-unaligned-load-store.global.ll | 20 +- llvm/test/CodeGen/AMDGPU/fcanonicalize.ll | 236 ++- .../flat-for-global-subtarget-feature.ll | 8 +- llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll | 80 +- .../AMDGPU/fmul-2-combine-multi-use.ll | 48 + llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll | 60 + .../CodeGen/AMDGPU/fneg-modifier-casting.ll | 3 + llvm/test/CodeGen/AMDGPU/fneg.f16.ll | 62 +- llvm/test/CodeGen/AMDGPU/half.ll | 231 +++ .../AMDGPU/hsa-metadata-kernel-code-props.ll | 4 +- llvm/test/CodeGen/AMDGPU/hsa.ll | 4 +- .../AMDGPU/implicit-kernarg-backend-usage.ll | 10 +- llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll | 37 +- .../AMDGPU/insert_vector_elt.v2bf16.ll | 58 +- .../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll | 214 ++- .../CodeGen/AMDGPU/invalid-addrspacecast.ll | 3 + .../CodeGen/AMDGPU/invalid-cast-load-i1.ll | 2 + llvm/test/CodeGen/AMDGPU/kernarg-size.ll | 2 +- llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 30 +- .../CodeGen/AMDGPU/llvm.amdgcn.is.private.ll | 12 + .../CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll | 12 + .../AMDGPU/llvm.amdgcn.lds.kernel.id.ll | 8 +- .../AMDGPU/llvm.amdgcn.readfirstlane.ll | 70 +- .../CodeGen/AMDGPU/llvm.amdgcn.readlane.ll | 114 +- .../CodeGen/AMDGPU/llvm.amdgcn.writelane.ll | 126 +- llvm/test/CodeGen/AMDGPU/load-constant-f64.ll | 6 + llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 125 +- llvm/test/CodeGen/AMDGPU/load-constant-i32.ll | 83 +- llvm/test/CodeGen/AMDGPU/load-constant-i64.ll | 18 + llvm/test/CodeGen/AMDGPU/load-constant-i8.ll | 164 +- llvm/test/CodeGen/AMDGPU/load-global-i16.ll | 129 +- llvm/test/CodeGen/AMDGPU/load-global-i32.ll | 105 +- llvm/test/CodeGen/AMDGPU/load-select-ptr.ll | 3 +- .../CodeGen/AMDGPU/mad24-get-global-id.ll | 2 +- .../match-perm-extract-vector-elt-bug.ll | 8 +- llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll | 300 ++-- .../AMDGPU/memory-legalizer-flat-agent.ll | 1380 +++++++++++++++++ .../memory-legalizer-flat-nontemporal.ll | 75 + 
.../memory-legalizer-flat-singlethread.ll | 1380 +++++++++++++++++ .../AMDGPU/memory-legalizer-flat-system.ll | 1380 +++++++++++++++++ .../AMDGPU/memory-legalizer-flat-volatile.ll | 66 + .../AMDGPU/memory-legalizer-flat-wavefront.ll | 1365 ++++++++++++++++ .../AMDGPU/memory-legalizer-flat-workgroup.ll | 1320 ++++++++++++++++ .../AMDGPU/memory-legalizer-global-agent.ll | 273 ++++ .../memory-legalizer-global-nontemporal.ll | 15 + .../memory-legalizer-global-singlethread.ll | 276 ++++ .../AMDGPU/memory-legalizer-global-system.ll | 261 ++++ .../memory-legalizer-global-volatile.ll | 18 + .../memory-legalizer-global-wavefront.ll | 276 ++++ .../memory-legalizer-global-workgroup.ll | 276 ++++ .../memory-legalizer-local-nontemporal.ll | 9 + .../AMDGPU/memory-legalizer-local-volatile.ll | 6 + .../memory-legalizer-private-nontemporal.ll | 59 +- .../memory-legalizer-private-volatile.ll | 30 +- llvm/test/CodeGen/AMDGPU/min.ll | 210 +++ llvm/test/CodeGen/AMDGPU/pack.v2f16.ll | 21 + llvm/test/CodeGen/AMDGPU/pack.v2i16.ll | 18 + .../AMDGPU/pal-simple-indirect-call.ll | 8 - ...al-regcopy-and-spill-missed-at-regalloc.ll | 51 +- .../AMDGPU/preload-implicit-kernargs.ll | 178 +-- llvm/test/CodeGen/AMDGPU/preload-kernargs.ll | 379 +++-- llvm/test/CodeGen/AMDGPU/sad.ll | 114 +- .../CodeGen/AMDGPU/scalar_to_vector.v8i16.ll | 16 + .../scc-clobbered-sgpr-to-vmem-spill.ll | 464 +++--- .../CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll | 2 +- llvm/test/CodeGen/AMDGPU/shift-i128.ll | 24 +- .../CodeGen/AMDGPU/simple-indirect-call.ll | 15 - llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll | 70 +- .../CodeGen/AMDGPU/spill-vector-superclass.ll | 6 +- llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll | 6 + llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll | 2 +- llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll | 2 +- llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll | 2 +- llvm/test/CodeGen/AMDGPU/trap-abis.ll | 16 +- llvm/test/CodeGen/AMDGPU/udiv.ll | 45 + llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll | 91 +- 
.../AMDGPU/vgpr-spill-placement-issue61083.ll | 2 +- ...ine-function-info-long-branch-reg-debug.ll | 2 +- .../machine-function-info-long-branch-reg.ll | 2 +- 119 files changed, 13654 insertions(+), 1853 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior.ll create mode 100644 llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 03cd45d7de6f2..4ff761ec19b3c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -95,11 +95,8 @@ void initializeAMDGPUDAGToDAGISelLegacyPass(PassRegistry &); void initializeAMDGPUAlwaysInlinePass(PassRegistry&); -Pass *createAMDGPUAnnotateKernelFeaturesPass(); Pass *createAMDGPUAttributorLegacyPass(); void initializeAMDGPUAttributorLegacyPass(PassRegistry &); -void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &); -extern char &AMDGPUAnnotateKernelFeaturesID; // DPP/Iterative option enables the atomic optimizer with given strategy // whereas None disables the atomic optimizer. 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp index a9bd41382c255..9c9fa5c6e2f0f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp @@ -52,11 +52,6 @@ class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass { char AMDGPUAnnotateKernelFeatures::ID = 0; -char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID; - -INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE, - "Add AMDGPU function attributes", false, false) - bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) { bool HaveStackObjects = false; bool Changed = false; @@ -131,7 +126,3 @@ bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) { TM = &TPC->getTM(); return false; } - -Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() { - return new AMDGPUAnnotateKernelFeatures(); -} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index 538b1b181f643..60d27a7fbef29 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -149,9 +149,3 @@ DUMMY_MACHINE_FUNCTION_PASS("amdgpu-regbanklegalize", AMDGPURegBankLegalizePass( DUMMY_MACHINE_FUNCTION_PASS("amdgpu-regbank-combiner", AMDGPURegBankCombinerPass()) #undef DUMMY_MACHINE_FUNCTION_PASS - - -#define DUMMY_CGSCC_PASS(NAME, CREATE_PASS) -DUMMY_CGSCC_PASS("amdgpu-annotate-kernel-features", AMDGPUAnnotateKernelFeaturesPass()) - -#undef DUMMY_CGSCC_PASS diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 5b2e0558d5664..34dacd5f9209d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -515,7 +515,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPUAlwaysInlinePass(*PR); 
initializeAMDGPUSwLowerLDSLegacyPass(*PR); initializeAMDGPUAttributorLegacyPass(*PR); - initializeAMDGPUAnnotateKernelFeaturesPass(*PR); initializeAMDGPUAnnotateUniformValuesLegacyPass(*PR); initializeAMDGPUArgumentUsageInfoPass(*PR); initializeAMDGPUAtomicOptimizerPass(*PR); @@ -1317,12 +1316,6 @@ void AMDGPUPassConfig::addIRPasses() { } void AMDGPUPassConfig::addCodeGenPrepare() { - if (TM->getTargetTriple().isAMDGCN()) { - // FIXME: This pass adds 2 hacky attributes that can be replaced with an - // analysis, and should be removed. - addPass(createAMDGPUAnnotateKernelFeaturesPass()); - } - if (TM->getTargetTriple().isAMDGCN() && EnableLowerKernelArguments) addPass(createAMDGPULowerKernelArgumentsPass()); diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp index 53f5c1efd14eb..d6153ce93b451 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp @@ -601,12 +601,6 @@ GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F, const CallingConv::ID CC = F.getCallingConv(); const bool IsKernel = CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL; - // FIXME: Should have analysis or something rather than attribute to detect - // calls. - const bool HasCalls = F.hasFnAttribute("amdgpu-calls"); - // FIXME: This attribute is a hack, we just need an analysis on the function - // to look for allocas. - const bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects"); if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0)) KernargSegmentPtr = true; @@ -629,12 +623,13 @@ GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F, DispatchID = true; } - // TODO: This could be refined a lot. The attribute is a poor way of - // detecting calls or stack objects that may require it before argument - // lowering. 
if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) && (IsAmdHsaOrMesa || ST.enableFlatScratch()) && - (HasCalls || HasStackObjects || ST.enableFlatScratch()) && + // FlatScratchInit cannot be true for graphics CC if enableFlatScratch() + // is false. + (ST.enableFlatScratch() || + (!AMDGPU::isGraphics(CC) && + !F.hasFnAttribute("amdgpu-no-flat-scratch-init"))) && !ST.flatScratchIsArchitected()) { FlatScratchInit = true; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll index d9be677a0e58d..aeb301939e986 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll @@ -20,11 +20,14 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s2 ; CI-NEXT: ds_dec_rtn_u32 v2, v1, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm @@ -35,11 +38,14 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: ds_dec_rtn_u32 v2, v1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -97,11 +103,14 @@ define amdgpu_kernel void 
@lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s2 ; CI-NEXT: ds_dec_rtn_u32 v2, v1, v0 offset:16 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm @@ -112,11 +121,14 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: ds_dec_rtn_u32 v2, v1, v0 offset:16 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -287,6 +299,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr ; CI-LABEL: global_atomic_dec_ret_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -302,6 +317,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr ; VI-LABEL: global_atomic_dec_ret_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ 
-359,6 +377,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou ; CI-LABEL: global_atomic_dec_ret_i32_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -376,6 +397,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou ; VI-LABEL: global_atomic_dec_ret_i32_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -436,6 +460,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace ; CI-LABEL: global_atomic_dec_ret_i32_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -453,6 +480,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace ; VI-LABEL: global_atomic_dec_ret_i32_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -513,6 +543,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1 ; CI-LABEL: global_atomic_dec_noret_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 
flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -525,6 +558,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1 ; VI-LABEL: global_atomic_dec_noret_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -575,6 +611,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) % ; CI-LABEL: global_atomic_dec_noret_i32_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -589,6 +628,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) % ; VI-LABEL: global_atomic_dec_noret_i32_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -642,6 +684,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa ; CI-LABEL: global_atomic_dec_noret_i32_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -656,6 +701,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa ; 
VI-LABEL: global_atomic_dec_noret_i32_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -710,7 +758,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; CI-NEXT: v_mov_b32_e32 v3, 42 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -718,6 +768,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v3, 42 ; CI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol @@ -732,7 +783,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 42 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -740,6 +793,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ 
-802,6 +856,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -819,6 +876,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -878,6 +938,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_ret_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -893,6 +956,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { ; VI-LABEL: flat_atomic_dec_ret_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -908,6 +974,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_dec_ret_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: 
v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -922,6 +990,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_ret_i32: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -958,6 +1030,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 ; CI-LABEL: flat_atomic_dec_ret_i32_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -975,6 +1050,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 ; VI-LABEL: flat_atomic_dec_ret_i32_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -992,6 +1070,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 ; GFX9-LABEL: flat_atomic_dec_ret_i32_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -1006,6 +1086,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_ret_i32_offset: ; GFX10: ; 
%bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1045,6 +1129,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % ; CI-LABEL: flat_atomic_dec_ret_i32_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -1062,6 +1149,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % ; VI-LABEL: flat_atomic_dec_ret_i32_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -1079,6 +1169,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % ; GFX9-LABEL: flat_atomic_dec_ret_i32_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -1093,6 +1185,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_dec_ret_i32_offset_system: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; 
GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1132,6 +1228,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1144,6 +1243,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 { ; VI-LABEL: flat_atomic_dec_noret_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1156,6 +1258,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_dec_noret_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -1167,6 +1271,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_noret_i32: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1199,6 +1307,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i32_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], 
s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -1213,6 +1324,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { ; VI-LABEL: flat_atomic_dec_noret_i32_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -1227,6 +1341,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_dec_noret_i32_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -1238,6 +1354,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_noret_i32_offset: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1273,6 +1393,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 ; CI-LABEL: flat_atomic_dec_noret_i32_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ 
-1287,6 +1410,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 ; VI-LABEL: flat_atomic_dec_noret_i32_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -1301,6 +1427,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 ; GFX9-LABEL: flat_atomic_dec_noret_i32_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -1312,6 +1440,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_noret_i32_offset_system: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1348,7 +1480,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; CI-NEXT: v_mov_b32_e32 v3, 42 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1356,6 +1490,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v3, 42 ; CI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol @@ -1370,7 +1505,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 42 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1378,6 +1515,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -1392,6 +1530,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -1410,6 +1550,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_dec_ret_i32_offset_addr64: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, 42 @@ -1466,6 +1610,9 @@ define amdgpu_kernel void 
@flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -1483,6 +1630,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1500,6 +1650,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -1513,6 +1665,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_noret_i32_offset_addr64: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1559,10 +1715,13 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_ret_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: 
v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1580,10 +1739,13 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; VI-LABEL: flat_atomic_dec_ret_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1601,7 +1763,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_dec_ret_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s2 @@ -1616,6 +1780,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_ret_i64: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -1654,12 +1822,15 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 ; CI-LABEL: 
flat_atomic_dec_ret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1677,12 +1848,15 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 ; VI-LABEL: flat_atomic_dec_ret_i64_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1700,7 +1874,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 ; GFX9-LABEL: flat_atomic_dec_ret_i64_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s2 @@ -1715,6 +1891,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_ret_i64_offset: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -1756,10 +1936,13 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1769,10 +1952,13 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; VI-LABEL: flat_atomic_dec_noret_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1782,7 +1968,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_dec_noret_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, s1 @@ -1794,6 +1982,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_noret_i64: ; GFX10: ; %bb.0: +; GFX10-NEXT: 
s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -1828,12 +2020,15 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1843,12 +2038,15 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; VI-LABEL: flat_atomic_dec_noret_i64_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1858,7 +2056,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_dec_noret_i64_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; 
GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, s1 @@ -1870,6 +2070,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_noret_i64_offset: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -1907,12 +2111,15 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; CI-LABEL: flat_atomic_dec_noret_i64_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1922,12 +2129,15 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; VI-LABEL: flat_atomic_dec_noret_i64_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1937,7 +2147,9 @@ define amdgpu_kernel void 
@flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; GFX9-LABEL: flat_atomic_dec_noret_i64_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, s1 @@ -1949,6 +2161,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_noret_i64_offset_system: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -1987,6 +2203,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2013,6 +2232,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2039,12 +2261,14 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0 +; GFX9-NEXT: s_add_u32 
flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, s3 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[3:4], v[1:2] offset:40 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2058,6 +2282,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_dec_ret_i64_offset_addr64: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2116,6 +2344,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -2134,6 +2365,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -2152,12 +2386,14 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 
0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: flat_atomic_dec_x2 v[3:4], v[1:2] offset:40 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2166,6 +2402,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_noret_i64_offset_addr64: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2219,8 +2459,11 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: ds_dec_rtn_u32 v2, v1, v2 offset:8 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: v_add_i32_e32 v3, vcc, 2, v0 ; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: flat_store_dword v[0:1], v3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2237,8 +2480,11 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: ds_dec_rtn_u32 v2, v1, v2 offset:8 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: v_add_u32_e32 v3, vcc, 2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, 
s3 ; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2312,7 +2558,10 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -2328,7 +2577,10 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -2394,7 +2646,10 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -2410,7 +2665,10 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -2594,10 +2852,13 @@ define 
amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr ; CI-LABEL: global_atomic_dec_ret_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -2610,10 +2871,13 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr ; VI-LABEL: global_atomic_dec_ret_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2671,12 +2935,15 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou ; CI-LABEL: global_atomic_dec_ret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -2689,12 +2956,15 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou ; VI-LABEL: 
global_atomic_dec_ret_i64_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2753,12 +3023,15 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace ; CI-LABEL: global_atomic_dec_ret_i64_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -2771,12 +3044,15 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace ; VI-LABEL: global_atomic_dec_ret_i64_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2835,10 +3111,13 @@ define amdgpu_kernel void 
@global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1 ; CI-LABEL: global_atomic_dec_noret_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) @@ -2848,10 +3127,13 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1 ; VI-LABEL: global_atomic_dec_noret_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2902,12 +3184,15 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) % ; CI-LABEL: global_atomic_dec_noret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) @@ -2917,12 +3202,15 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) % ; VI-LABEL: global_atomic_dec_noret_i64_offset: ; VI: ; %bb.0: ; VI-NEXT: 
s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2974,12 +3262,15 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa ; CI-LABEL: global_atomic_dec_noret_i64_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) @@ -2989,12 +3280,15 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa ; VI-LABEL: global_atomic_dec_noret_i64_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3047,6 +3341,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], 
s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -3070,6 +3367,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -3144,6 +3444,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -3162,6 +3465,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -3232,7 +3538,10 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: v_mov_b32_e32 v4, s3 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_add_i32_e32 v0, vcc, 2, v0 ; CI-NEXT: v_mov_b32_e32 v3, s2 
; CI-NEXT: flat_store_dword v[3:4], v0 @@ -3251,7 +3560,10 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: v_mov_b32_e32 v4, s3 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, s2 ; VI-NEXT: flat_store_dword v[3:4], v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll index 92a7de9aaefd2..1d401a4ee33d8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll @@ -21,11 +21,14 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s2 ; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm @@ -36,11 +39,14 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; 
VI-NEXT: s_endpgm @@ -110,11 +116,14 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s2 ; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0 offset:16 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm @@ -125,11 +134,14 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0 offset:16 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -332,6 +344,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr ; CI-LABEL: global_atomic_inc_ret_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -347,6 +362,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr ; VI-LABEL: global_atomic_inc_ret_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; 
VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -415,6 +433,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou ; CI-LABEL: global_atomic_inc_ret_i32_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -432,6 +453,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou ; VI-LABEL: global_atomic_inc_ret_i32_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -503,6 +527,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace ; CI-LABEL: global_atomic_inc_ret_i32_offset_sistem: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -520,6 +547,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace ; VI-LABEL: global_atomic_inc_ret_i32_offset_sistem: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -592,6 +622,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1 ; CI-LABEL: global_atomic_inc_noret_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 
+; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -604,6 +637,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1 ; VI-LABEL: global_atomic_inc_noret_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -664,6 +700,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) % ; CI-LABEL: global_atomic_inc_noret_i32_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -678,6 +717,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) % ; VI-LABEL: global_atomic_inc_noret_i32_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -741,6 +783,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa ; CI-LABEL: global_atomic_inc_noret_i32_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -755,6 +800,9 @@ define amdgpu_kernel void 
@global_atomic_inc_noret_i32_offset_system(ptr addrspa ; VI-LABEL: global_atomic_inc_noret_i32_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -820,7 +868,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; CI-NEXT: v_mov_b32_e32 v3, 42 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -828,6 +878,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v3, 42 ; CI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol @@ -842,7 +893,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 42 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -850,6 +903,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc ; 
VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -925,6 +979,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -942,6 +999,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1019,8 +1079,11 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: ds_inc_rtn_u32 v2, v1, v2 offset:8 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: v_add_i32_e32 v3, vcc, 2, v0 ; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: flat_store_dword v[0:1], v3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1037,8 +1100,11 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: ds_inc_rtn_u32 v2, v1, v2 offset:8 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: v_add_u32_e32 v3, vcc, 2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1129,7 +1195,10 @@ define amdgpu_kernel void 
@lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -1145,7 +1214,10 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -1224,7 +1296,10 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -1240,7 +1315,10 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -1459,10 +1537,13 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr ; CI-LABEL: global_atomic_inc_ret_i64: ; CI: ; %bb.0: 
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1475,10 +1556,13 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr ; VI-LABEL: global_atomic_inc_ret_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1548,12 +1632,15 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou ; CI-LABEL: global_atomic_inc_ret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1566,12 +1653,15 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou ; VI-LABEL: global_atomic_inc_ret_i64_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; 
VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1642,12 +1732,15 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace ; CI-LABEL: global_atomic_inc_ret_i64_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1660,12 +1753,15 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace ; VI-LABEL: global_atomic_inc_ret_i64_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1737,10 +1833,13 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1 ; CI-LABEL: global_atomic_inc_noret_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 
s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1750,10 +1849,13 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1 ; VI-LABEL: global_atomic_inc_noret_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1815,12 +1917,15 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) % ; CI-LABEL: global_atomic_inc_noret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1830,12 +1935,15 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) % ; VI-LABEL: global_atomic_inc_noret_i64_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: 
v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1898,12 +2006,15 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa ; CI-LABEL: global_atomic_inc_noret_i64_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1913,12 +2024,15 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa ; VI-LABEL: global_atomic_inc_noret_i64_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1983,6 +2097,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, 
s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2006,6 +2123,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2094,6 +2214,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -2112,6 +2235,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -2188,6 +2314,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -2203,6 +2332,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { ; VI-LABEL: flat_atomic_inc_ret_i32: ; VI: ; %bb.0: ; 
VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -2218,6 +2350,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_inc_ret_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -2232,6 +2366,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_ret_i32: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2281,6 +2419,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 ; CI-LABEL: flat_atomic_inc_ret_i32_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -2298,6 +2439,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 ; VI-LABEL: flat_atomic_inc_ret_i32_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 
16 @@ -2315,6 +2459,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 ; GFX9-LABEL: flat_atomic_inc_ret_i32_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -2329,6 +2475,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_ret_i32_offset: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2381,6 +2531,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; CI-LABEL: flat_atomic_inc_ret_i32_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -2398,6 +2551,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; VI-LABEL: flat_atomic_inc_ret_i32_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -2415,6 +2571,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; GFX9-LABEL: flat_atomic_inc_ret_i32_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -2429,6 +2587,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_inc_ret_i32_offset_system: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2482,6 +2644,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2494,6 +2659,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { ; VI-LABEL: flat_atomic_inc_noret_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2506,6 +2674,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_inc_noret_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -2517,6 +2687,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) 
#1 { ; ; GFX10-LABEL: flat_atomic_inc_noret_i32: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2560,6 +2734,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i32_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -2574,6 +2751,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; VI-LABEL: flat_atomic_inc_noret_i32_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -2588,6 +2768,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -2599,6 +2781,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_noret_i32_offset: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2645,6 +2831,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; CI-LABEL: flat_atomic_inc_noret_i32_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -2659,6 +2848,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; VI-LABEL: flat_atomic_inc_noret_i32_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -2673,6 +2865,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -2684,6 +2878,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_noret_i32_offset_system: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2732,7 +2930,9 @@ define amdgpu_kernel void 
@flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; CI-NEXT: v_mov_b32_e32 v3, 42 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2740,6 +2940,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v3, 42 ; CI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol @@ -2754,7 +2955,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 42 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2762,6 +2965,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -2776,6 +2980,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 42 ; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -2794,6 +3000,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_inc_ret_i32_offset_addr64: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, 42 @@ -2871,6 +3081,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -2888,6 +3101,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -2905,6 +3121,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -2918,6 +3136,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_noret_i32_offset_addr64: ; GFX10: ; %bb.0: +; 
GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2988,7 +3210,10 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: v_mov_b32_e32 v4, s3 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_add_i32_e32 v0, vcc, 2, v0 ; CI-NEXT: v_mov_b32_e32 v3, s2 ; CI-NEXT: flat_store_dword v[3:4], v0 @@ -3007,7 +3232,10 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: v_mov_b32_e32 v4, s3 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, s2 ; VI-NEXT: flat_store_dword v[3:4], v0 @@ -3097,10 +3325,13 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3118,10 +3349,13 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, 
ptr %ptr) #1 { ; VI-LABEL: flat_atomic_inc_ret_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3139,7 +3373,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_inc_ret_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s2 @@ -3154,6 +3390,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_ret_i64: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -3206,12 +3446,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 ; CI-LABEL: flat_atomic_inc_ret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; 
CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3229,12 +3472,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 ; VI-LABEL: flat_atomic_inc_ret_i64_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3252,7 +3498,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s2 @@ -3267,6 +3515,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -3322,12 +3574,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; CI-LABEL: flat_atomic_inc_ret_i64_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, 
s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3345,12 +3600,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; VI-LABEL: flat_atomic_inc_ret_i64_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3368,7 +3626,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s2 @@ -3383,6 +3643,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset_system: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: 
s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -3439,10 +3703,13 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3452,10 +3719,13 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; VI-LABEL: flat_atomic_inc_noret_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3465,7 +3735,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_inc_noret_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, s1 @@ -3477,6 +3749,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_noret_i64: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -3523,12 +3799,15 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3538,12 +3817,15 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; VI-LABEL: flat_atomic_inc_noret_i64_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3553,7 +3835,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, s1 
@@ -3565,6 +3849,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -3614,12 +3902,15 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; CI-LABEL: flat_atomic_inc_noret_i64_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3629,12 +3920,15 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; VI-LABEL: flat_atomic_inc_noret_i64_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3644,7 +3938,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; GFX9-LABEL: 
flat_atomic_inc_noret_i64_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, s1 @@ -3656,6 +3952,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_system: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -3707,6 +4007,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -3733,6 +4036,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -3759,12 +4065,14 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 -; GFX9-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, s3 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[3:4], v[1:2] offset:40 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3778,6 +4086,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -3858,6 +4170,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -3876,6 +4191,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -3894,12 +4212,14 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_add_u32 
flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: flat_atomic_inc_x2 v[3:4], v[1:2] offset:40 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3908,6 +4228,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -3975,6 +4299,7 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s4 ; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0 @@ -3982,6 +4307,8 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ; CI-NEXT: ds_inc_rtn_u32 v3, v1, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -3995,6 +4322,7 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; 
VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0 @@ -4002,6 +4330,8 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ; VI-NEXT: ds_inc_rtn_u32 v3, v1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: v_mov_b32_e32 v0, s2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll index 31a229a908142..9ef16aef0dd16 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -3016,7 +3016,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GPRIDX-NEXT: kernel_code_entry_byte_offset = 256 ; GPRIDX-NEXT: kernel_code_prefetch_byte_size = 0 ; GPRIDX-NEXT: granulated_workitem_vgpr_count = 0 -; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 1 +; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 2 ; GPRIDX-NEXT: priority = 0 ; GPRIDX-NEXT: float_mode = 240 ; GPRIDX-NEXT: priv = 0 @@ -3027,7 +3027,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GPRIDX-NEXT: enable_mem_ordered = 0 ; GPRIDX-NEXT: enable_fwd_progress = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GPRIDX-NEXT: user_sgpr_count = 12 +; GPRIDX-NEXT: user_sgpr_count = 14 ; GPRIDX-NEXT: enable_trap_handler = 0 ; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1 ; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -3042,7 +3042,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GPRIDX-NEXT: enable_sgpr_queue_ptr = 1 ; GPRIDX-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; GPRIDX-NEXT: enable_sgpr_dispatch_id = 1 -; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 0 +; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 1 ; GPRIDX-NEXT: 
enable_sgpr_private_segment_size = 0 ; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_x = 0 ; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_y = 0 @@ -3059,7 +3059,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GPRIDX-NEXT: gds_segment_byte_size = 0 ; GPRIDX-NEXT: kernarg_segment_byte_size = 28 ; GPRIDX-NEXT: workgroup_fbarrier_count = 0 -; GPRIDX-NEXT: wavefront_sgpr_count = 15 +; GPRIDX-NEXT: wavefront_sgpr_count = 17 ; GPRIDX-NEXT: workitem_vgpr_count = 3 ; GPRIDX-NEXT: reserved_vgpr_first = 0 ; GPRIDX-NEXT: reserved_vgpr_count = 0 @@ -3107,7 +3107,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; MOVREL-NEXT: kernel_code_entry_byte_offset = 256 ; MOVREL-NEXT: kernel_code_prefetch_byte_size = 0 ; MOVREL-NEXT: granulated_workitem_vgpr_count = 0 -; MOVREL-NEXT: granulated_wavefront_sgpr_count = 1 +; MOVREL-NEXT: granulated_wavefront_sgpr_count = 2 ; MOVREL-NEXT: priority = 0 ; MOVREL-NEXT: float_mode = 240 ; MOVREL-NEXT: priv = 0 @@ -3118,7 +3118,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; MOVREL-NEXT: enable_mem_ordered = 0 ; MOVREL-NEXT: enable_fwd_progress = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; MOVREL-NEXT: user_sgpr_count = 12 +; MOVREL-NEXT: user_sgpr_count = 14 ; MOVREL-NEXT: enable_trap_handler = 0 ; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1 ; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -3133,7 +3133,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; MOVREL-NEXT: enable_sgpr_queue_ptr = 1 ; MOVREL-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; MOVREL-NEXT: enable_sgpr_dispatch_id = 1 -; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 0 +; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 1 ; MOVREL-NEXT: enable_sgpr_private_segment_size = 0 ; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_x = 0 ; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_y = 0 @@ -3150,7 +3150,7 @@ define 
amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; MOVREL-NEXT: gds_segment_byte_size = 0 ; MOVREL-NEXT: kernarg_segment_byte_size = 28 ; MOVREL-NEXT: workgroup_fbarrier_count = 0 -; MOVREL-NEXT: wavefront_sgpr_count = 10 +; MOVREL-NEXT: wavefront_sgpr_count = 24 ; MOVREL-NEXT: workitem_vgpr_count = 4 ; MOVREL-NEXT: reserved_vgpr_first = 0 ; MOVREL-NEXT: reserved_vgpr_count = 0 @@ -3168,21 +3168,24 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; MOVREL-NEXT: ; %bb.0: ; %entry ; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; MOVREL-NEXT: s_load_dword s8, s[8:9], 0x8 +; MOVREL-NEXT: s_add_i32 s12, s12, s17 +; MOVREL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; MOVREL-NEXT: s_mov_b32 s4, 0 ; MOVREL-NEXT: s_mov_b32 s5, 0x40080000 -; MOVREL-NEXT: s_mov_b32 s2, 0 -; MOVREL-NEXT: s_mov_b32 s3, 0x40140000 ; MOVREL-NEXT: s_waitcnt lgkmcnt(0) ; MOVREL-NEXT: s_cmp_eq_u32 s8, 1 ; MOVREL-NEXT: s_cselect_b64 s[6:7], 2.0, 1.0 ; MOVREL-NEXT: s_cmp_eq_u32 s8, 2 +; MOVREL-NEXT: s_mov_b32 s2, 0 ; MOVREL-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] ; MOVREL-NEXT: s_cmp_eq_u32 s8, 3 +; MOVREL-NEXT: s_mov_b32 s3, 0x40140000 ; MOVREL-NEXT: s_cselect_b64 s[4:5], 4.0, s[4:5] ; MOVREL-NEXT: s_cmp_eq_u32 s8, 4 ; MOVREL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] ; MOVREL-NEXT: v_mov_b32_e32 v0, s2 ; MOVREL-NEXT: v_mov_b32_e32 v3, s1 +; MOVREL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; MOVREL-NEXT: v_mov_b32_e32 v1, s3 ; MOVREL-NEXT: v_mov_b32_e32 v2, s0 ; MOVREL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -3210,7 +3213,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX10-NEXT: enable_mem_ordered = 1 ; GFX10-NEXT: enable_fwd_progress = 0 ; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GFX10-NEXT: user_sgpr_count = 12 +; GFX10-NEXT: user_sgpr_count = 14 ; GFX10-NEXT: enable_trap_handler = 0 ; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1 ; GFX10-NEXT: enable_sgpr_workgroup_id_y = 1 
@@ -3225,7 +3228,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX10-NEXT: enable_sgpr_queue_ptr = 1 ; GFX10-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; GFX10-NEXT: enable_sgpr_dispatch_id = 1 -; GFX10-NEXT: enable_sgpr_flat_scratch_init = 0 +; GFX10-NEXT: enable_sgpr_flat_scratch_init = 1 ; GFX10-NEXT: enable_sgpr_private_segment_size = 0 ; GFX10-NEXT: enable_sgpr_grid_workgroup_count_x = 0 ; GFX10-NEXT: enable_sgpr_grid_workgroup_count_y = 0 @@ -4042,7 +4045,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: enable_mem_ordered = 0 ; GPRIDX-NEXT: enable_fwd_progress = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GPRIDX-NEXT: user_sgpr_count = 12 +; GPRIDX-NEXT: user_sgpr_count = 14 ; GPRIDX-NEXT: enable_trap_handler = 0 ; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1 ; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -4057,7 +4060,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: enable_sgpr_queue_ptr = 1 ; GPRIDX-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; GPRIDX-NEXT: enable_sgpr_dispatch_id = 1 -; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 0 +; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 1 ; GPRIDX-NEXT: enable_sgpr_private_segment_size = 0 ; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_x = 0 ; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_y = 0 @@ -4074,7 +4077,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: gds_segment_byte_size = 0 ; GPRIDX-NEXT: kernarg_segment_byte_size = 28 ; GPRIDX-NEXT: workgroup_fbarrier_count = 0 -; GPRIDX-NEXT: wavefront_sgpr_count = 14 +; GPRIDX-NEXT: wavefront_sgpr_count = 16 ; GPRIDX-NEXT: workitem_vgpr_count = 2 ; GPRIDX-NEXT: reserved_vgpr_first = 0 ; GPRIDX-NEXT: reserved_vgpr_count = 0 @@ -4115,7 +4118,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: 
kernel_code_entry_byte_offset = 256 ; MOVREL-NEXT: kernel_code_prefetch_byte_size = 0 ; MOVREL-NEXT: granulated_workitem_vgpr_count = 0 -; MOVREL-NEXT: granulated_wavefront_sgpr_count = 1 +; MOVREL-NEXT: granulated_wavefront_sgpr_count = 2 ; MOVREL-NEXT: priority = 0 ; MOVREL-NEXT: float_mode = 240 ; MOVREL-NEXT: priv = 0 @@ -4126,7 +4129,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: enable_mem_ordered = 0 ; MOVREL-NEXT: enable_fwd_progress = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; MOVREL-NEXT: user_sgpr_count = 12 +; MOVREL-NEXT: user_sgpr_count = 14 ; MOVREL-NEXT: enable_trap_handler = 0 ; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1 ; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -4141,7 +4144,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: enable_sgpr_queue_ptr = 1 ; MOVREL-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; MOVREL-NEXT: enable_sgpr_dispatch_id = 1 -; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 0 +; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 1 ; MOVREL-NEXT: enable_sgpr_private_segment_size = 0 ; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_x = 0 ; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_y = 0 @@ -4158,7 +4161,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: gds_segment_byte_size = 0 ; MOVREL-NEXT: kernarg_segment_byte_size = 28 ; MOVREL-NEXT: workgroup_fbarrier_count = 0 -; MOVREL-NEXT: wavefront_sgpr_count = 10 +; MOVREL-NEXT: wavefront_sgpr_count = 24 ; MOVREL-NEXT: workitem_vgpr_count = 3 ; MOVREL-NEXT: reserved_vgpr_first = 0 ; MOVREL-NEXT: reserved_vgpr_count = 0 @@ -4176,6 +4179,9 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: ; %bb.0: ; %entry ; MOVREL-NEXT: s_load_dword s2, s[8:9], 0x8 ; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; MOVREL-NEXT: s_add_i32 s12, s12, s17 +; MOVREL-NEXT: 
s_lshr_b32 flat_scratch_hi, s12, 8 +; MOVREL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; MOVREL-NEXT: s_waitcnt lgkmcnt(0) ; MOVREL-NEXT: s_cmp_eq_u32 s2, 1 ; MOVREL-NEXT: s_cselect_b32 s3, 2.0, 1.0 @@ -4211,7 +4217,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: enable_mem_ordered = 1 ; GFX10-NEXT: enable_fwd_progress = 0 ; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GFX10-NEXT: user_sgpr_count = 12 +; GFX10-NEXT: user_sgpr_count = 14 ; GFX10-NEXT: enable_trap_handler = 0 ; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1 ; GFX10-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -4226,7 +4232,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: enable_sgpr_queue_ptr = 1 ; GFX10-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; GFX10-NEXT: enable_sgpr_dispatch_id = 1 -; GFX10-NEXT: enable_sgpr_flat_scratch_init = 0 +; GFX10-NEXT: enable_sgpr_flat_scratch_init = 1 ; GFX10-NEXT: enable_sgpr_private_segment_size = 0 ; GFX10-NEXT: enable_sgpr_grid_workgroup_count_x = 0 ; GFX10-NEXT: enable_sgpr_grid_workgroup_count_y = 0 @@ -4387,7 +4393,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: enable_mem_ordered = 0 ; GPRIDX-NEXT: enable_fwd_progress = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GPRIDX-NEXT: user_sgpr_count = 12 +; GPRIDX-NEXT: user_sgpr_count = 14 ; GPRIDX-NEXT: enable_trap_handler = 0 ; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1 ; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -4402,7 +4408,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: enable_sgpr_queue_ptr = 1 ; GPRIDX-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; GPRIDX-NEXT: enable_sgpr_dispatch_id = 1 -; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 0 +; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 1 ; GPRIDX-NEXT: enable_sgpr_private_segment_size = 0 ; GPRIDX-NEXT: 
enable_sgpr_grid_workgroup_count_x = 0 ; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_y = 0 @@ -4419,7 +4425,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: gds_segment_byte_size = 0 ; GPRIDX-NEXT: kernarg_segment_byte_size = 28 ; GPRIDX-NEXT: workgroup_fbarrier_count = 0 -; GPRIDX-NEXT: wavefront_sgpr_count = 14 +; GPRIDX-NEXT: wavefront_sgpr_count = 16 ; GPRIDX-NEXT: workitem_vgpr_count = 3 ; GPRIDX-NEXT: reserved_vgpr_first = 0 ; GPRIDX-NEXT: reserved_vgpr_count = 0 @@ -4463,7 +4469,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: kernel_code_entry_byte_offset = 256 ; MOVREL-NEXT: kernel_code_prefetch_byte_size = 0 ; MOVREL-NEXT: granulated_workitem_vgpr_count = 0 -; MOVREL-NEXT: granulated_wavefront_sgpr_count = 1 +; MOVREL-NEXT: granulated_wavefront_sgpr_count = 2 ; MOVREL-NEXT: priority = 0 ; MOVREL-NEXT: float_mode = 240 ; MOVREL-NEXT: priv = 0 @@ -4474,7 +4480,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: enable_mem_ordered = 0 ; MOVREL-NEXT: enable_fwd_progress = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; MOVREL-NEXT: user_sgpr_count = 12 +; MOVREL-NEXT: user_sgpr_count = 14 ; MOVREL-NEXT: enable_trap_handler = 0 ; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1 ; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -4489,7 +4495,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: enable_sgpr_queue_ptr = 1 ; MOVREL-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; MOVREL-NEXT: enable_sgpr_dispatch_id = 1 -; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 0 +; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 1 ; MOVREL-NEXT: enable_sgpr_private_segment_size = 0 ; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_x = 0 ; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_y = 0 @@ -4506,7 +4512,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr 
addrspace(1) %out, i32 %s ; MOVREL-NEXT: gds_segment_byte_size = 0 ; MOVREL-NEXT: kernarg_segment_byte_size = 28 ; MOVREL-NEXT: workgroup_fbarrier_count = 0 -; MOVREL-NEXT: wavefront_sgpr_count = 10 +; MOVREL-NEXT: wavefront_sgpr_count = 24 ; MOVREL-NEXT: workitem_vgpr_count = 4 ; MOVREL-NEXT: reserved_vgpr_first = 0 ; MOVREL-NEXT: reserved_vgpr_count = 0 @@ -4524,10 +4530,12 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: ; %bb.0: ; %entry ; MOVREL-NEXT: s_load_dword s6, s[8:9], 0x8 ; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; MOVREL-NEXT: s_add_i32 s12, s12, s17 +; MOVREL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; MOVREL-NEXT: s_mov_b32 s2, 0 -; MOVREL-NEXT: s_mov_b32 s3, 0x40080000 ; MOVREL-NEXT: s_waitcnt lgkmcnt(0) ; MOVREL-NEXT: s_cmp_eq_u32 s6, 1 +; MOVREL-NEXT: s_mov_b32 s3, 0x40080000 ; MOVREL-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0 ; MOVREL-NEXT: s_cmp_eq_u32 s6, 2 ; MOVREL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] @@ -4535,6 +4543,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3] ; MOVREL-NEXT: v_mov_b32_e32 v0, s2 ; MOVREL-NEXT: v_mov_b32_e32 v3, s1 +; MOVREL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; MOVREL-NEXT: v_mov_b32_e32 v1, s3 ; MOVREL-NEXT: v_mov_b32_e32 v2, s0 ; MOVREL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -4562,7 +4571,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: enable_mem_ordered = 1 ; GFX10-NEXT: enable_fwd_progress = 0 ; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GFX10-NEXT: user_sgpr_count = 12 +; GFX10-NEXT: user_sgpr_count = 14 ; GFX10-NEXT: enable_trap_handler = 0 ; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1 ; GFX10-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -4577,7 +4586,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: enable_sgpr_queue_ptr = 1 ; GFX10-NEXT: 
enable_sgpr_kernarg_segment_ptr = 1 ; GFX10-NEXT: enable_sgpr_dispatch_id = 1 -; GFX10-NEXT: enable_sgpr_flat_scratch_init = 0 +; GFX10-NEXT: enable_sgpr_flat_scratch_init = 1 ; GFX10-NEXT: enable_sgpr_private_segment_size = 0 ; GFX10-NEXT: enable_sgpr_grid_workgroup_count_x = 0 ; GFX10-NEXT: enable_sgpr_grid_workgroup_count_y = 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll index 00c44c27257bb..e207d95287783 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll @@ -35,7 +35,7 @@ define amdgpu_kernel void @stack_object_addrspacecast_in_kernel_no_calls() { ; RO-FLAT: scratch_store_dword ; RW-FLAT: .amdhsa_user_sgpr_private_segment_buffer 1 ; RO-FLAT-NOT: .amdhsa_user_sgpr_private_segment_buffer -; RW-FLAT: .amdhsa_user_sgpr_flat_scratch_init 1 +; RW-FLAT: .amdhsa_user_sgpr_flat_scratch_init 0 ; RO-FLAT-NOT: .amdhsa_user_sgpr_flat_scratch_init ; RW-FLAT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 ; RW-FLAT-NOT: .amdhsa_enable_private_segment @@ -43,7 +43,7 @@ define amdgpu_kernel void @stack_object_addrspacecast_in_kernel_no_calls() { ; RO-FLAT: .amdhsa_enable_private_segment 1 ; RW-FLAT: .amdhsa_reserve_flat_scratch 0 ; GCN: COMPUTE_PGM_RSRC2:SCRATCH_EN: 1 -; RW-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 6 +; RW-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 4 ; RO-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 0 define amdgpu_kernel void @stack_object_in_kernel_no_calls() { %alloca = alloca i32, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll index 676035735d0af..86766e2904619 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll @@ -12,7 +12,9 @@ define 
amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX8V4: ; %bb.0: ; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX8V4-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x40 -; GFX8V4-NEXT: v_mov_b32_e32 v2, 1 +; GFX8V4-NEXT: s_add_i32 s12, s12, s17 +; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_mov_b32 s4, s0 ; GFX8V4-NEXT: s_mov_b32 s5, s3 @@ -23,6 +25,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX8V4-NEXT: s_cmp_lg_u32 s1, -1 ; GFX8V4-NEXT: v_mov_b32_e32 v0, s4 ; GFX8V4-NEXT: s_cselect_b64 s[0:1], s[6:7], 0 +; GFX8V4-NEXT: v_mov_b32_e32 v2, 1 ; GFX8V4-NEXT: v_mov_b32_e32 v1, s5 ; GFX8V4-NEXT: flat_store_dword v[0:1], v2 ; GFX8V4-NEXT: s_waitcnt vmcnt(0) @@ -37,7 +40,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX8V5: ; %bb.0: ; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX8V5-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0xc8 -; GFX8V5-NEXT: v_mov_b32_e32 v2, 1 +; GFX8V5-NEXT: s_add_i32 s12, s12, s17 +; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_mov_b32 s4, s0 ; GFX8V5-NEXT: s_mov_b32 s5, s2 @@ -47,6 +52,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX8V5-NEXT: s_cmp_lg_u32 s1, -1 ; GFX8V5-NEXT: v_mov_b32_e32 v0, s4 ; GFX8V5-NEXT: s_cselect_b64 s[0:1], s[2:3], 0 +; GFX8V5-NEXT: v_mov_b32_e32 v2, 1 ; GFX8V5-NEXT: v_mov_b32_e32 v1, s5 ; GFX8V5-NEXT: flat_store_dword v[0:1], v2 ; GFX8V5-NEXT: s_waitcnt vmcnt(0) @@ -60,9 +66,10 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX9V4-LABEL: addrspacecast: ; GFX9V4: ; %bb.0: ; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9V4-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9V4-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; 
GFX9V4-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX9V4-NEXT: s_mov_b64 s[4:5], src_shared_base -; GFX9V4-NEXT: v_mov_b32_e32 v2, 1 ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V4-NEXT: s_mov_b32 s2, s0 ; GFX9V4-NEXT: s_cmp_lg_u32 s0, -1 @@ -71,6 +78,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX9V4-NEXT: s_cmp_lg_u32 s1, -1 ; GFX9V4-NEXT: v_mov_b32_e32 v0, s2 ; GFX9V4-NEXT: s_cselect_b64 s[0:1], s[4:5], 0 +; GFX9V4-NEXT: v_mov_b32_e32 v2, 1 ; GFX9V4-NEXT: v_mov_b32_e32 v1, s3 ; GFX9V4-NEXT: flat_store_dword v[0:1], v2 ; GFX9V4-NEXT: s_waitcnt vmcnt(0) @@ -84,9 +92,10 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX9V5-LABEL: addrspacecast: ; GFX9V5: ; %bb.0: ; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9V5-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9V5-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9V5-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX9V5-NEXT: s_mov_b64 s[4:5], src_shared_base -; GFX9V5-NEXT: v_mov_b32_e32 v2, 1 ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V5-NEXT: s_mov_b32 s2, s0 ; GFX9V5-NEXT: s_cmp_lg_u32 s0, -1 @@ -95,6 +104,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX9V5-NEXT: s_cmp_lg_u32 s1, -1 ; GFX9V5-NEXT: v_mov_b32_e32 v0, s2 ; GFX9V5-NEXT: s_cselect_b64 s[0:1], s[4:5], 0 +; GFX9V5-NEXT: v_mov_b32_e32 v2, 1 ; GFX9V5-NEXT: v_mov_b32_e32 v1, s3 ; GFX9V5-NEXT: flat_store_dword v[0:1], v2 ; GFX9V5-NEXT: s_waitcnt vmcnt(0) @@ -111,7 +121,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ret void } -define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { +define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) #0 { ; GFX8V4-LABEL: llvm_amdgcn_is_shared: ; GFX8V4: ; %bb.0: ; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 @@ -167,7 +177,7 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { ret void } -define amdgpu_kernel void 
@llvm_amdgcn_is_private(ptr %ptr) { +define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) #0 { ; GFX8V4-LABEL: llvm_amdgcn_is_private: ; GFX8V4: ; %bb.0: ; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 @@ -223,7 +233,7 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { ret void } -define amdgpu_kernel void @llvm_trap() { +define amdgpu_kernel void @llvm_trap() #0 { ; GFX8V4-LABEL: llvm_trap: ; GFX8V4: ; %bb.0: ; GFX8V4-NEXT: s_mov_b64 s[0:1], s[6:7] @@ -246,7 +256,7 @@ define amdgpu_kernel void @llvm_trap() { unreachable } -define amdgpu_kernel void @llvm_debugtrap() { +define amdgpu_kernel void @llvm_debugtrap() #0 { ; GFX8V4-LABEL: llvm_debugtrap: ; GFX8V4: ; %bb.0: ; GFX8V4-NEXT: s_trap 3 @@ -266,7 +276,7 @@ define amdgpu_kernel void @llvm_debugtrap() { unreachable } -define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) { +define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) #0 { ; GFX8V4-LABEL: llvm_amdgcn_queue_ptr: ; GFX8V4: ; %bb.0: ; GFX8V4-NEXT: v_mov_b32_e32 v0, s6 @@ -374,3 +384,5 @@ declare void @llvm.debugtrap() !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 CODE_OBJECT_VERSION} + +attributes #0 = { "amdgpu-no-flat-scratch-init" } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll index 378c6312c52be..94853767ccfac 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[20:23], s[8:9], 0x0 ; GCN-NEXT: s_load_dwordx2 s[24:25], s[8:9], 0x10 -; GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: s_add_u32 s0, s0, s17 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: v_mov_b32_e32 v64, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll index a6a7f35a774db..859f7ef16e395 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll @@ -11,13 +11,16 @@ define amdgpu_kernel void @use_lds_globals(ptr addrspace(1) %out, ptr addrspace( ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v0, 4 ; CHECK-NEXT: s_mov_b32 m0, -1 +; CHECK-NEXT: s_add_i32 s12, s12, s17 ; CHECK-NEXT: ds_read_b32 v2, v0 -; CHECK-NEXT: v_mov_b32_e32 v3, 9 +; CHECK-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_add_u32 s0, s0, 4 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-NEXT: v_mov_b32_e32 v3, 9 ; CHECK-NEXT: flat_store_dword v[0:1], v2 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x200 ; CHECK-NEXT: ds_write_b32 v0, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll index dcc2c23cae046..a5a75f74833f1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll @@ -6,6 +6,9 @@ define amdgpu_kernel void @test_wave64(i32 %arg0, [8 x i32], i64 %saved) { ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dword s2, s[8:9], 0x0 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0xa +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_eq_u32 s2, 0 ; GCN-NEXT: s_cselect_b32 s2, 1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll index ad588ebee2f9e..1deee215e522b 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll @@ -42,6 +42,9 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0 @@ -59,6 +62,9 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) { ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0 @@ -76,6 +82,8 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0 @@ -85,6 +93,10 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) { ; ; GFX10-LABEL: s_trig_preop_f64: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8 @@ -113,6 +125,9 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) { ; CI-LABEL: s_trig_preop_f64_imm: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 
s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7 ; CI-NEXT: s_add_u32 s0, s0, 4 @@ -128,6 +143,9 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) { ; VI-LABEL: s_trig_preop_f64_imm: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7 ; VI-NEXT: s_add_u32 s0, s0, 4 @@ -143,6 +161,8 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) { ; GFX9-LABEL: s_trig_preop_f64_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7 ; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[0:1] @@ -151,6 +171,10 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) { ; ; GFX10-LABEL: s_trig_preop_f64_imm: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll index 40f29c56c8f12..b59f85b2dfa38 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -7,6 +7,9 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-LABEL: sdivrem_i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[4:5], 
s[8:9], 0x10 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_ashr_i32 s6, s5, 31 ; GFX8-NEXT: s_add_i32 s0, s5, s6 @@ -146,6 +149,9 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-LABEL: sdivrem_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx8 s[4:11], s[8:9], 0x0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_ashr_i32 s2, s9, 31 ; GFX8-NEXT: s_ashr_i32 s12, s11, 31 @@ -617,6 +623,9 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-LABEL: sdivrem_v2i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx8 s[4:11], s[8:9], 0x0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_ashr_i32 s2, s10, 31 ; GFX8-NEXT: s_add_i32 s0, s10, s2 @@ -845,6 +854,9 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) { ; GFX8-LABEL: sdivrem_v4i32: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x10 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1271,6 +1283,9 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) { ; GFX8-LABEL: sdivrem_v2i64: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; 
GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x0 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2187,6 +2202,9 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; GFX8-LABEL: sdiv_i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s4, s[8:9], 0x10 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_i32 s0, s4, 0x80008 ; GFX8-NEXT: s_ashr_i32 s5, s0, 31 @@ -2332,6 +2350,9 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-LABEL: sdivrem_v2i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s2, s[8:9], 0x10 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_i32 s0, s2, 0x80010 ; GFX8-NEXT: s_ashr_i32 s3, s0, 31 @@ -2596,6 +2617,9 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; GFX8-LABEL: sdiv_i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s4, s[8:9], 0x10 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_i32 s0, s4, 0x100010 ; GFX8-NEXT: s_ashr_i32 s5, s0, 31 @@ -2741,6 +2765,9 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-LABEL: sdivrem_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x10 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_sext_i32_i16 s0, s3 ; GFX8-NEXT: s_ashr_i32 s10, s0, 31 @@ -3002,6 +3029,9 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) 
% ; GFX8-LABEL: sdivrem_i3: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s4, s[8:9], 0x10 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_i32 s0, s4, 0x30008 ; GFX8-NEXT: s_ashr_i32 s5, s0, 31 @@ -3153,6 +3183,9 @@ define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-LABEL: sdivrem_i27: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_i32 s0, s5, 0x1b0000 ; GFX8-NEXT: s_ashr_i32 s5, s0, 31 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll index e3c1a52696b47..ff0114cfc3ddb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -7,6 +7,9 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-LABEL: udivrem_i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5 ; GFX8-NEXT: s_sub_i32 s0, 0, s5 @@ -113,6 +116,9 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-LABEL: udivrem_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx8 s[4:11], s[8:9], 0x0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s11 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s10 @@ -523,6 +529,9 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-LABEL: 
udivrem_v2i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx8 s[4:11], s[8:9], 0x0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s11 @@ -685,6 +694,9 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) { ; GFX8-LABEL: udivrem_v4i32: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x10 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -980,6 +992,9 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) { ; GFX8-LABEL: udivrem_v2i64: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0x20 ; GFX8-NEXT: s_load_dwordx8 s[4:11], s[8:9], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1772,6 +1787,9 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; GFX8-LABEL: udiv_i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s4, s[8:9], 0x10 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s5, s4, 0x80008 ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 @@ -1885,6 +1903,9 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s0, s[8:9], 0x10 ; GFX8-NEXT: 
s_load_dwordx4 s[4:7], s[8:9], 0x0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s2, s0, 0x80010 ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 @@ -2081,6 +2102,9 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; GFX8-LABEL: udiv_i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s4, s[8:9], 0x10 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshr_b32 s5, s4, 16 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5 @@ -2194,6 +2218,9 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x10 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_and_b32 s2, s1, 0xffff ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2 @@ -2387,6 +2414,9 @@ define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; GFX8-LABEL: udivrem_i3: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s4, s[8:9], 0x10 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s5, s4, 0x30008 ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 @@ -2505,6 +2535,9 @@ define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-LABEL: udivrem_i27: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_and_b32 s5, s5, 
0x7ffffff ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5 diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll index 29fb320bf1283..c78f0a4eb61e9 100644 --- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll +++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll @@ -135,6 +135,9 @@ define amdgpu_kernel void @marked_kernel_use_workitem_id(ptr addrspace(1) %ptr) ; FIXEDABI-LABEL: marked_kernel_use_workitem_id: ; FIXEDABI: ; %bb.0: ; FIXEDABI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; FIXEDABI-NEXT: s_add_i32 s6, s6, s11 +; FIXEDABI-NEXT: s_mov_b32 flat_scratch_lo, s7 +; FIXEDABI-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; FIXEDABI-NEXT: s_waitcnt lgkmcnt(0) ; FIXEDABI-NEXT: v_mov_b32_e32 v4, s1 ; FIXEDABI-NEXT: v_mov_b32_e32 v3, s0 @@ -181,16 +184,19 @@ define amdgpu_kernel void @marked_kernel_use_workgroup_id(ptr addrspace(1) %ptr) ; FIXEDABI-LABEL: marked_kernel_use_workgroup_id: ; FIXEDABI: ; %bb.0: ; FIXEDABI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; FIXEDABI-NEXT: v_mov_b32_e32 v2, s6 +; FIXEDABI-NEXT: s_add_i32 s6, s6, s11 +; FIXEDABI-NEXT: s_mov_b32 flat_scratch_lo, s7 +; FIXEDABI-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; FIXEDABI-NEXT: v_mov_b32_e32 v2, s8 ; FIXEDABI-NEXT: s_waitcnt lgkmcnt(0) ; FIXEDABI-NEXT: v_mov_b32_e32 v0, s0 ; FIXEDABI-NEXT: v_mov_b32_e32 v1, s1 ; FIXEDABI-NEXT: flat_store_dword v[0:1], v2 ; FIXEDABI-NEXT: s_waitcnt vmcnt(0) -; FIXEDABI-NEXT: v_mov_b32_e32 v2, s7 +; FIXEDABI-NEXT: v_mov_b32_e32 v2, s9 ; FIXEDABI-NEXT: flat_store_dword v[0:1], v2 ; FIXEDABI-NEXT: s_waitcnt vmcnt(0) -; FIXEDABI-NEXT: v_mov_b32_e32 v2, s8 +; FIXEDABI-NEXT: v_mov_b32_e32 v2, s10 ; FIXEDABI-NEXT: flat_store_dword v[0:1], v2 ; FIXEDABI-NEXT: s_waitcnt vmcnt(0) ; FIXEDABI-NEXT: s_endpgm @@ -238,6 +244,9 @@ define void @marked_func_use_other_sgpr(ptr addrspace(1) %ptr) #0 { define amdgpu_kernel void @marked_kernel_use_other_sgpr(ptr 
addrspace(1) %ptr) #0 { ; FIXEDABI-LABEL: marked_kernel_use_other_sgpr: ; FIXEDABI: ; %bb.0: +; FIXEDABI-NEXT: s_add_i32 s6, s6, s11 +; FIXEDABI-NEXT: s_mov_b32 flat_scratch_lo, s7 +; FIXEDABI-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; FIXEDABI-NEXT: s_add_u32 s0, s4, 8 ; FIXEDABI-NEXT: flat_load_ubyte v0, v[0:1] glc ; FIXEDABI-NEXT: s_addc_u32 s1, s5, 0 @@ -261,7 +270,10 @@ define amdgpu_kernel void @marked_kernel_use_other_sgpr(ptr addrspace(1) %ptr) # define amdgpu_kernel void @marked_kernel_nokernargs_implicitarg_ptr() #0 { ; FIXEDABI-LABEL: marked_kernel_nokernargs_implicitarg_ptr: ; FIXEDABI: ; %bb.0: +; FIXEDABI-NEXT: s_add_i32 s4, s4, s9 ; FIXEDABI-NEXT: v_mov_b32_e32 v0, 0 +; FIXEDABI-NEXT: s_mov_b32 flat_scratch_lo, s5 +; FIXEDABI-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; FIXEDABI-NEXT: v_mov_b32_e32 v1, 0 ; FIXEDABI-NEXT: flat_load_ubyte v0, v[0:1] glc ; FIXEDABI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll index 59bd4e9ac8ce6..3eba47d7d7852 100644 --- a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll @@ -1,5 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals -; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefixes=HSA,AKF_HSA %s ; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=HSA,ATTRIBUTOR_HSA %s declare void @llvm.memcpy.p1.p4.i32(ptr addrspace(1) nocapture, ptr addrspace(4) nocapture, i32, i1) #0 @@ -27,11 +26,6 @@ define amdgpu_kernel void @store_cast_0_flat_to_group_addrspacecast() #1 { } define amdgpu_kernel void @store_cast_0_group_to_flat_addrspacecast() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@store_cast_0_group_to_flat_addrspacecast -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: store i32 7, ptr 
addrspace(4) addrspacecast (ptr addrspace(3) null to ptr addrspace(4)), align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@store_cast_0_group_to_flat_addrspacecast ; ATTRIBUTOR_HSA-SAME: () #[[ATTR2:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: store i32 7, ptr addrspace(4) addrspacecast (ptr addrspace(3) null to ptr addrspace(4)), align 4 @@ -42,11 +36,6 @@ define amdgpu_kernel void @store_cast_0_group_to_flat_addrspacecast() #1 { } define amdgpu_kernel void @store_constant_cast_group_gv_to_flat() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@store_constant_cast_group_gv_to_flat -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: store i32 7, ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.i32 to ptr addrspace(4)), align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@store_constant_cast_group_gv_to_flat ; ATTRIBUTOR_HSA-SAME: () #[[ATTR2]] { ; ATTRIBUTOR_HSA-NEXT: store i32 7, ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.i32 to ptr addrspace(4)), align 4 @@ -57,11 +46,6 @@ define amdgpu_kernel void @store_constant_cast_group_gv_to_flat() #1 { } define amdgpu_kernel void @store_constant_cast_group_gv_gep_to_flat() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@store_constant_cast_group_gv_gep_to_flat -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: store i32 7, ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@store_constant_cast_group_gv_gep_to_flat ; ATTRIBUTOR_HSA-SAME: () #[[ATTR2]] { ; ATTRIBUTOR_HSA-NEXT: store i32 7, ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), align 4 @@ -92,12 +76,6 @@ define amdgpu_kernel void @store_constant_cast_global_gv_gep_to_flat() #1 { } define amdgpu_kernel void @load_constant_cast_group_gv_gep_to_flat(ptr addrspace(1) %out) #1 { -; 
AKF_HSA-LABEL: define {{[^@]+}}@load_constant_cast_group_gv_gep_to_flat -; AKF_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), align 4 -; AKF_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[OUT]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@load_constant_cast_group_gv_gep_to_flat ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), align 4 @@ -110,12 +88,6 @@ define amdgpu_kernel void @load_constant_cast_group_gv_gep_to_flat(ptr addrspace } define amdgpu_kernel void @atomicrmw_constant_cast_group_gv_gep_to_flat(ptr addrspace(1) %out) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@atomicrmw_constant_cast_group_gv_gep_to_flat -; AKF_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = atomicrmw add ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), i32 1 seq_cst, align 4 -; AKF_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[OUT]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@atomicrmw_constant_cast_group_gv_gep_to_flat ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = atomicrmw add ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), i32 1 seq_cst, align 4 @@ -128,13 +100,6 @@ define amdgpu_kernel void @atomicrmw_constant_cast_group_gv_gep_to_flat(ptr addr } define amdgpu_kernel void @cmpxchg_constant_cast_group_gv_gep_to_flat(ptr addrspace(1) %out) #1 { -; 
AKF_HSA-LABEL: define {{[^@]+}}@cmpxchg_constant_cast_group_gv_gep_to_flat -; AKF_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = cmpxchg ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), i32 0, i32 1 seq_cst seq_cst, align 4 -; AKF_HSA-NEXT: [[VAL0:%.*]] = extractvalue { i32, i1 } [[VAL]], 0 -; AKF_HSA-NEXT: store i32 [[VAL0]], ptr addrspace(1) [[OUT]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@cmpxchg_constant_cast_group_gv_gep_to_flat ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = cmpxchg ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), i32 0, i32 1 seq_cst seq_cst, align 4 @@ -149,11 +114,6 @@ define amdgpu_kernel void @cmpxchg_constant_cast_group_gv_gep_to_flat(ptr addrsp } define amdgpu_kernel void @memcpy_constant_cast_group_gv_gep_to_flat(ptr addrspace(1) %out) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@memcpy_constant_cast_group_gv_gep_to_flat -; AKF_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: call void @llvm.memcpy.p1.p4.i32(ptr addrspace(1) align 4 [[OUT]], ptr addrspace(4) align 4 getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), i32 32, i1 false) -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@memcpy_constant_cast_group_gv_gep_to_flat ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2]] { ; ATTRIBUTOR_HSA-NEXT: call void @llvm.memcpy.p1.p4.i32(ptr addrspace(1) align 4 [[OUT]], ptr addrspace(4) align 4 getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), i32 32, i1 false) @@ -165,11 +125,6 @@ define amdgpu_kernel void 
@memcpy_constant_cast_group_gv_gep_to_flat(ptr addrspa ; Can't just search the pointer value define amdgpu_kernel void @store_value_constant_cast_lds_gv_gep_to_flat(ptr addrspace(1) %out) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@store_value_constant_cast_lds_gv_gep_to_flat -; AKF_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: store ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), ptr addrspace(1) [[OUT]], align 8 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@store_value_constant_cast_lds_gv_gep_to_flat ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2]] { ; ATTRIBUTOR_HSA-NEXT: store ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), ptr addrspace(1) [[OUT]], align 8 @@ -181,11 +136,6 @@ define amdgpu_kernel void @store_value_constant_cast_lds_gv_gep_to_flat(ptr addr ; Can't just search pointer types define amdgpu_kernel void @store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat(ptr addrspace(1) %out) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat -; AKF_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: store i64 ptrtoint (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to i64), ptr addrspace(1) [[OUT]], align 8 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2]] { ; ATTRIBUTOR_HSA-NEXT: store i64 ptrtoint (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to i64), ptr addrspace(1) [[OUT]], align 8 @@ -197,11 +147,6 @@ define amdgpu_kernel void 
@store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat ; Cast group to flat, do GEP, cast back to group define amdgpu_kernel void @store_constant_cast_group_gv_gep_to_flat_to_group() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@store_constant_cast_group_gv_gep_to_flat_to_group -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: store i32 7, ptr addrspace(3) addrspacecast (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to ptr addrspace(3)), align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@store_constant_cast_group_gv_gep_to_flat_to_group ; ATTRIBUTOR_HSA-SAME: () #[[ATTR2]] { ; ATTRIBUTOR_HSA-NEXT: store i32 7, ptr addrspace(3) addrspacecast (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to ptr addrspace(3)), align 4 @@ -212,10 +157,6 @@ define amdgpu_kernel void @store_constant_cast_group_gv_gep_to_flat_to_group() # } define ptr addrspace(3) @ret_constant_cast_group_gv_gep_to_flat_to_group() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@ret_constant_cast_group_gv_gep_to_flat_to_group -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: ret ptr addrspace(3) addrspacecast (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to ptr addrspace(3)) -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@ret_constant_cast_group_gv_gep_to_flat_to_group ; ATTRIBUTOR_HSA-SAME: () #[[ATTR2]] { ; ATTRIBUTOR_HSA-NEXT: ret ptr addrspace(3) addrspacecast (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to ptr addrspace(3)) @@ -229,14 +170,11 @@ attributes #1 = { nounwind } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 500} ;. 
-; AKF_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } -; AKF_HSA: attributes #[[ATTR1]] = { nounwind } ;. ; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } ; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. -; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} ;. ; ATTRIBUTOR_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/always-uniform.ll b/llvm/test/CodeGen/AMDGPU/always-uniform.ll index b6c0271e5f56f..4e7022710c671 100644 --- a/llvm/test/CodeGen/AMDGPU/always-uniform.ll +++ b/llvm/test/CodeGen/AMDGPU/always-uniform.ll @@ -8,8 +8,10 @@ define amdgpu_kernel void @readfirstlane_uniform(ptr addrspace(1) noalias nocapt ; GCN-LABEL: readfirstlane_uniform: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 ; GCN-NEXT: v_readfirstlane_b32 s4, v0 ; GCN-NEXT: s_mov_b32 s5, 0 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s0, s0, s4 @@ -18,6 +20,7 @@ define amdgpu_kernel void @readfirstlane_uniform(ptr addrspace(1) noalias nocapt ; GCN-NEXT: s_add_u32 s0, s2, 40 ; GCN-NEXT: s_addc_u32 s1, s3, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s4 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll index 7fdc012d4f1b5..e71bf15384727 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll @@ -393,6 +393,9 @@ define amdgpu_kernel void @select_add_lhs_const_i16(i1 %cond) { ; GCN-LABEL: select_add_lhs_const_i16: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s0, s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bitcmp1_b32 s0, 0 ; GCN-NEXT: s_movk_i32 s0, 0x80 diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll index 3e19ee5567929..85b5c7c870b23 100644 --- a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll 
+++ b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll @@ -34,7 +34,7 @@ define amdgpu_kernel void @amdhsa_trap_num_sgprs( ptr addrspace(1) %out26, i32 %in26, ptr addrspace(1) %out27, i32 %in27, ptr addrspace(1) %out28, i32 %in28, - ptr addrspace(1) %out29, i32 %in29) { + ptr addrspace(1) %out29, i32 %in29) #0 { entry: store i32 %in0, ptr addrspace(1) %out0 store i32 %in1, ptr addrspace(1) %out1 @@ -68,3 +68,5 @@ entry: store i32 %in29, ptr addrspace(1) %out29 ret void } + +attributes #0 = { "amdgpu-no-flat-scratch-init" } diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll index 6d205921923d3..8389a8e86cb44 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll @@ -1,5 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals -; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefixes=AKF_HSA %s ; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=ATTRIBUTOR_HSA %s ; TODO: The test contains UB which is refined by the Attributor and should be removed. 
@@ -19,12 +18,6 @@ declare ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #0 declare i64 @llvm.amdgcn.dispatch.id() #0 define void @use_workitem_id_x() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_workitem_id_x -; AKF_HSA-SAME: () #[[ATTR1:[0-9]+]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() -; AKF_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) poison, align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_workitem_id_x ; ATTRIBUTOR_HSA-SAME: () #[[ATTR1:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() @@ -37,12 +30,6 @@ define void @use_workitem_id_x() #1 { } define void @use_workitem_id_y() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_workitem_id_y -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() -; AKF_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) poison, align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_workitem_id_y ; ATTRIBUTOR_HSA-SAME: () #[[ATTR2:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() @@ -55,12 +42,6 @@ define void @use_workitem_id_y() #1 { } define void @use_workitem_id_z() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_workitem_id_z -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() -; AKF_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) poison, align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_workitem_id_z ; ATTRIBUTOR_HSA-SAME: () #[[ATTR3:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() @@ -73,12 +54,6 @@ define void @use_workitem_id_z() #1 { } define void @use_workgroup_id_x() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_x -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() -; AKF_HSA-NEXT: store volatile i32 [[VAL]], ptr 
addrspace(1) poison, align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_x ; ATTRIBUTOR_HSA-SAME: () #[[ATTR4:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() @@ -91,12 +66,6 @@ define void @use_workgroup_id_x() #1 { } define void @use_workgroup_id_y() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_y -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; AKF_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) poison, align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_y ; ATTRIBUTOR_HSA-SAME: () #[[ATTR5:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() @@ -109,12 +78,6 @@ define void @use_workgroup_id_y() #1 { } define void @use_workgroup_id_z() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_z -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() -; AKF_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) poison, align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_z ; ATTRIBUTOR_HSA-SAME: () #[[ATTR6:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() @@ -127,12 +90,6 @@ define void @use_workgroup_id_z() #1 { } define void @use_dispatch_ptr() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_dispatch_ptr -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() -; AKF_HSA-NEXT: store volatile ptr addrspace(4) [[DISPATCH_PTR]], ptr addrspace(1) poison, align 8 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_dispatch_ptr ; ATTRIBUTOR_HSA-SAME: () #[[ATTR7:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() @@ -145,12 +102,6 @@ define void @use_dispatch_ptr() #1 { } define 
void @use_queue_ptr() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_queue_ptr -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[QUEUE_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.queue.ptr() -; AKF_HSA-NEXT: store volatile ptr addrspace(4) [[QUEUE_PTR]], ptr addrspace(1) poison, align 8 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_queue_ptr ; ATTRIBUTOR_HSA-SAME: () #[[ATTR8:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[QUEUE_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.queue.ptr() @@ -163,12 +114,6 @@ define void @use_queue_ptr() #1 { } define void @use_dispatch_id() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_dispatch_id -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = call i64 @llvm.amdgcn.dispatch.id() -; AKF_HSA-NEXT: store volatile i64 [[VAL]], ptr addrspace(1) poison, align 8 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_dispatch_id ; ATTRIBUTOR_HSA-SAME: () #[[ATTR9:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i64 @llvm.amdgcn.dispatch.id() @@ -181,14 +126,6 @@ define void @use_dispatch_id() #1 { } define void @use_workgroup_id_y_workgroup_id_z() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_y_workgroup_id_z -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() -; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) poison, align 4 -; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) poison, align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_y_workgroup_id_z ; ATTRIBUTOR_HSA-SAME: () #[[ATTR10:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() @@ -205,11 +142,6 @@ define void @use_workgroup_id_y_workgroup_id_z() #1 { } define void @func_indirect_use_workitem_id_x() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workitem_id_x -; AKF_HSA-SAME: () 
#[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_workitem_id_x() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workitem_id_x ; ATTRIBUTOR_HSA-SAME: () #[[ATTR1]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_workitem_id_x() @@ -220,11 +152,6 @@ define void @func_indirect_use_workitem_id_x() #1 { } define void @kernel_indirect_use_workitem_id_x() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@kernel_indirect_use_workitem_id_x -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_workitem_id_x() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kernel_indirect_use_workitem_id_x ; ATTRIBUTOR_HSA-SAME: () #[[ATTR1]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_workitem_id_x() @@ -235,11 +162,6 @@ define void @kernel_indirect_use_workitem_id_x() #1 { } define void @func_indirect_use_workitem_id_y() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workitem_id_y -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_workitem_id_y() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workitem_id_y ; ATTRIBUTOR_HSA-SAME: () #[[ATTR2]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_workitem_id_y() @@ -250,11 +172,6 @@ define void @func_indirect_use_workitem_id_y() #1 { } define void @func_indirect_use_workitem_id_z() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workitem_id_z -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_workitem_id_z() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workitem_id_z ; ATTRIBUTOR_HSA-SAME: () #[[ATTR3]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_workitem_id_z() @@ -265,11 +182,6 @@ define void @func_indirect_use_workitem_id_z() #1 { } define void @func_indirect_use_workgroup_id_x() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_x -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_workgroup_id_x() -; AKF_HSA-NEXT: ret void -; ; 
ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_x ; ATTRIBUTOR_HSA-SAME: () #[[ATTR4]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_workgroup_id_x() @@ -280,11 +192,6 @@ define void @func_indirect_use_workgroup_id_x() #1 { } define void @kernel_indirect_use_workgroup_id_x() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@kernel_indirect_use_workgroup_id_x -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_workgroup_id_x() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kernel_indirect_use_workgroup_id_x ; ATTRIBUTOR_HSA-SAME: () #[[ATTR4]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_workgroup_id_x() @@ -295,11 +202,6 @@ define void @kernel_indirect_use_workgroup_id_x() #1 { } define void @func_indirect_use_workgroup_id_y() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_y -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_workgroup_id_y() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_y ; ATTRIBUTOR_HSA-SAME: () #[[ATTR5]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_workgroup_id_y() @@ -310,11 +212,6 @@ define void @func_indirect_use_workgroup_id_y() #1 { } define void @func_indirect_use_workgroup_id_z() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_z -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_workgroup_id_z() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_z ; ATTRIBUTOR_HSA-SAME: () #[[ATTR6]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_workgroup_id_z() @@ -325,11 +222,6 @@ define void @func_indirect_use_workgroup_id_z() #1 { } define void @func_indirect_indirect_use_workgroup_id_y() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_indirect_use_workgroup_id_y -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @func_indirect_use_workgroup_id_y() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define 
{{[^@]+}}@func_indirect_indirect_use_workgroup_id_y ; ATTRIBUTOR_HSA-SAME: () #[[ATTR5]] { ; ATTRIBUTOR_HSA-NEXT: call void @func_indirect_use_workgroup_id_y() @@ -340,11 +232,6 @@ define void @func_indirect_indirect_use_workgroup_id_y() #1 { } define void @indirect_x2_use_workgroup_id_y() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@indirect_x2_use_workgroup_id_y -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @func_indirect_indirect_use_workgroup_id_y() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@indirect_x2_use_workgroup_id_y ; ATTRIBUTOR_HSA-SAME: () #[[ATTR5]] { ; ATTRIBUTOR_HSA-NEXT: call void @func_indirect_indirect_use_workgroup_id_y() @@ -355,11 +242,6 @@ define void @indirect_x2_use_workgroup_id_y() #1 { } define void @func_indirect_use_dispatch_ptr() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_dispatch_ptr -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_dispatch_ptr() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_dispatch_ptr ; ATTRIBUTOR_HSA-SAME: () #[[ATTR7]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_dispatch_ptr() @@ -370,11 +252,6 @@ define void @func_indirect_use_dispatch_ptr() #1 { } define void @func_indirect_use_queue_ptr() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_queue_ptr -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_queue_ptr() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_queue_ptr ; ATTRIBUTOR_HSA-SAME: () #[[ATTR8]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_queue_ptr() @@ -385,11 +262,6 @@ define void @func_indirect_use_queue_ptr() #1 { } define void @func_indirect_use_dispatch_id() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_dispatch_id -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_dispatch_id() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_dispatch_id ; ATTRIBUTOR_HSA-SAME: () #[[ATTR9]] 
{ ; ATTRIBUTOR_HSA-NEXT: call void @use_dispatch_id() @@ -400,11 +272,6 @@ define void @func_indirect_use_dispatch_id() #1 { } define void @func_indirect_use_workgroup_id_y_workgroup_id_z() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_y_workgroup_id_z -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @func_indirect_use_workgroup_id_y_workgroup_id_z() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_y_workgroup_id_z ; ATTRIBUTOR_HSA-SAME: () #[[ATTR11:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: call void @func_indirect_use_workgroup_id_y_workgroup_id_z() @@ -415,13 +282,6 @@ define void @func_indirect_use_workgroup_id_y_workgroup_id_z() #1 { } define void @recursive_use_workitem_id_y() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@recursive_use_workitem_id_y -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() -; AKF_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) poison, align 4 -; AKF_HSA-NEXT: call void @recursive_use_workitem_id_y() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@recursive_use_workitem_id_y ; ATTRIBUTOR_HSA-SAME: () #[[ATTR2]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() @@ -436,11 +296,6 @@ define void @recursive_use_workitem_id_y() #1 { } define void @call_recursive_use_workitem_id_y() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@call_recursive_use_workitem_id_y -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @recursive_use_workitem_id_y() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@call_recursive_use_workitem_id_y ; ATTRIBUTOR_HSA-SAME: () #[[ATTR2]] { ; ATTRIBUTOR_HSA-NEXT: call void @recursive_use_workitem_id_y() @@ -451,12 +306,6 @@ define void @call_recursive_use_workitem_id_y() #1 { } define void @use_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast -; 
AKF_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr addrspace(4) -; AKF_HSA-NEXT: store volatile i32 0, ptr addrspace(4) [[STOF]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR12:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr addrspace(4) @@ -470,12 +319,6 @@ define void @use_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) #1 { define void @use_group_to_flat_addrspacecast_gfx9(ptr addrspace(3) %ptr) #2 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast_gfx9 -; AKF_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR2:[0-9]+]] { -; AKF_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr addrspace(4) -; AKF_HSA-NEXT: store volatile i32 0, ptr addrspace(4) [[STOF]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast_gfx9 ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR13:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr addrspace(4) @@ -488,13 +331,6 @@ define void @use_group_to_flat_addrspacecast_gfx9(ptr addrspace(3) %ptr) #2 { } define void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(ptr addrspace(3) %ptr) #2 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast_queue_ptr_gfx9 -; AKF_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR2]] { -; AKF_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr addrspace(4) -; AKF_HSA-NEXT: store volatile i32 0, ptr addrspace(4) [[STOF]], align 4 -; AKF_HSA-NEXT: call void @func_indirect_use_queue_ptr() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast_queue_ptr_gfx9 ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR14:[0-9]+]] { 
; ATTRIBUTOR_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr addrspace(4) @@ -509,11 +345,6 @@ define void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(ptr addrspace(3) %pt } define void @indirect_use_group_to_flat_addrspacecast() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_group_to_flat_addrspacecast(ptr addrspace(3) null) -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast ; ATTRIBUTOR_HSA-SAME: () #[[ATTR12]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_group_to_flat_addrspacecast(ptr addrspace(3) null) @@ -524,11 +355,6 @@ define void @indirect_use_group_to_flat_addrspacecast() #1 { } define void @indirect_use_group_to_flat_addrspacecast_gfx9() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast_gfx9 -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_group_to_flat_addrspacecast_gfx9(ptr addrspace(3) null) -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast_gfx9 ; ATTRIBUTOR_HSA-SAME: () #[[ATTR11]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_group_to_flat_addrspacecast_gfx9(ptr addrspace(3) null) @@ -539,11 +365,6 @@ define void @indirect_use_group_to_flat_addrspacecast_gfx9() #1 { } define void @indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9 -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(ptr addrspace(3) null) -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9 ; ATTRIBUTOR_HSA-SAME: () #[[ATTR8]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(ptr addrspace(3) null) @@ -554,12 +375,6 @@ define void 
@indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9() #1 { } define void @use_kernarg_segment_ptr() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_kernarg_segment_ptr -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[KERNARG_SEGMENT_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; AKF_HSA-NEXT: store volatile ptr addrspace(4) [[KERNARG_SEGMENT_PTR]], ptr addrspace(1) poison, align 8 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_kernarg_segment_ptr ; ATTRIBUTOR_HSA-SAME: () #[[ATTR11]] { ; ATTRIBUTOR_HSA-NEXT: [[KERNARG_SEGMENT_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() @@ -571,11 +386,6 @@ define void @use_kernarg_segment_ptr() #1 { ret void } define void @func_indirect_use_kernarg_segment_ptr() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_kernarg_segment_ptr -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_kernarg_segment_ptr() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_kernarg_segment_ptr ; ATTRIBUTOR_HSA-SAME: () #[[ATTR11]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_kernarg_segment_ptr() @@ -586,12 +396,6 @@ define void @func_indirect_use_kernarg_segment_ptr() #1 { } define amdgpu_kernel void @kern_use_implicitarg_ptr() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@kern_use_implicitarg_ptr -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() -; AKF_HSA-NEXT: store volatile ptr addrspace(4) [[IMPLICITARG_PTR]], ptr addrspace(1) poison, align 8 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_use_implicitarg_ptr ; ATTRIBUTOR_HSA-SAME: () #[[ATTR12]] { ; ATTRIBUTOR_HSA-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() @@ -604,12 +408,6 @@ define amdgpu_kernel void @kern_use_implicitarg_ptr() #1 { } define void @use_implicitarg_ptr() #1 { -; AKF_HSA-LABEL: define 
{{[^@]+}}@use_implicitarg_ptr -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() -; AKF_HSA-NEXT: store volatile ptr addrspace(4) [[IMPLICITARG_PTR]], ptr addrspace(1) poison, align 8 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_implicitarg_ptr ; ATTRIBUTOR_HSA-SAME: () #[[ATTR12]] { ; ATTRIBUTOR_HSA-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() @@ -622,11 +420,6 @@ define void @use_implicitarg_ptr() #1 { } define void @func_indirect_use_implicitarg_ptr() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_implicitarg_ptr -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_implicitarg_ptr() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_implicitarg_ptr ; ATTRIBUTOR_HSA-SAME: () #[[ATTR12]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_implicitarg_ptr() @@ -640,10 +433,6 @@ declare void @external.func() #3 ; This function gets deleted. 
define internal void @defined.func() #3 { -; AKF_HSA-LABEL: define {{[^@]+}}@defined.func -; AKF_HSA-SAME: () #[[ATTR3:[0-9]+]] { -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@defined.func ; ATTRIBUTOR_HSA-SAME: () #[[ATTR16:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: ret void @@ -652,11 +441,6 @@ define internal void @defined.func() #3 { } define void @func_call_external() #3 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_call_external -; AKF_HSA-SAME: () #[[ATTR3]] { -; AKF_HSA-NEXT: call void @external.func() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_external ; ATTRIBUTOR_HSA-SAME: () #[[ATTR15:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: call void @external.func() @@ -667,11 +451,6 @@ define void @func_call_external() #3 { } define void @func_call_defined() #3 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_call_defined -; AKF_HSA-SAME: () #[[ATTR3]] { -; AKF_HSA-NEXT: call void @defined.func() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_defined ; ATTRIBUTOR_HSA-SAME: () #[[ATTR16]] { ; ATTRIBUTOR_HSA-NEXT: call void @defined.func() @@ -681,11 +460,6 @@ define void @func_call_defined() #3 { ret void } define void @func_call_asm() #3 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_call_asm -; AKF_HSA-SAME: () #[[ATTR3]] { -; AKF_HSA-NEXT: call void asm sideeffect "", ""() #[[ATTR3]] -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_asm ; ATTRIBUTOR_HSA-SAME: () #[[ATTR16]] { ; ATTRIBUTOR_HSA-NEXT: call void asm sideeffect "", ""() #[[ATTR26:[0-9]+]] @@ -696,11 +470,6 @@ define void @func_call_asm() #3 { } define amdgpu_kernel void @kern_call_external() #3 { -; AKF_HSA-LABEL: define {{[^@]+}}@kern_call_external -; AKF_HSA-SAME: () #[[ATTR4:[0-9]+]] { -; AKF_HSA-NEXT: call void @external.func() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_external ; ATTRIBUTOR_HSA-SAME: () #[[ATTR15]] { ; ATTRIBUTOR_HSA-NEXT: call void @external.func() @@ 
-711,11 +480,6 @@ define amdgpu_kernel void @kern_call_external() #3 { } define amdgpu_kernel void @func_kern_defined() #3 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_kern_defined -; AKF_HSA-SAME: () #[[ATTR4]] { -; AKF_HSA-NEXT: call void @defined.func() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_kern_defined ; ATTRIBUTOR_HSA-SAME: () #[[ATTR17:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: call void @defined.func() @@ -726,12 +490,6 @@ define amdgpu_kernel void @func_kern_defined() #3 { } define i32 @use_dispatch_ptr_ret_type() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_dispatch_ptr_ret_type -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() -; AKF_HSA-NEXT: store volatile ptr addrspace(4) [[DISPATCH_PTR]], ptr addrspace(1) poison, align 8 -; AKF_HSA-NEXT: ret i32 0 -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_dispatch_ptr_ret_type ; ATTRIBUTOR_HSA-SAME: () #[[ATTR7]] { ; ATTRIBUTOR_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() @@ -744,12 +502,6 @@ define i32 @use_dispatch_ptr_ret_type() #1 { } define float @func_indirect_use_dispatch_ptr_constexpr_cast_func() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_dispatch_ptr_constexpr_cast_func -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[F:%.*]] = call float @use_dispatch_ptr_ret_type() -; AKF_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 -; AKF_HSA-NEXT: ret float [[FADD]] -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_dispatch_ptr_constexpr_cast_func ; ATTRIBUTOR_HSA-SAME: () #[[ATTR7]] { ; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float @use_dispatch_ptr_ret_type() @@ -762,12 +514,6 @@ define float @func_indirect_use_dispatch_ptr_constexpr_cast_func() #1 { } define float @func_indirect_call(ptr %fptr) #3 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_call -; AKF_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR3]] { -; AKF_HSA-NEXT: [[F:%.*]] = call float 
[[FPTR]]() -; AKF_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 -; AKF_HSA-NEXT: ret float [[FADD]] -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_call ; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR15]] { ; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float [[FPTR]]() @@ -781,12 +527,6 @@ define float @func_indirect_call(ptr %fptr) #3 { declare float @extern() #3 define float @func_extern_call() #3 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_extern_call -; AKF_HSA-SAME: () #[[ATTR3]] { -; AKF_HSA-NEXT: [[F:%.*]] = call float @extern() -; AKF_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 -; AKF_HSA-NEXT: ret float [[FADD]] -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_extern_call ; ATTRIBUTOR_HSA-SAME: () #[[ATTR15]] { ; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float @extern() @@ -799,12 +539,6 @@ define float @func_extern_call() #3 { } define float @func_null_call(ptr %fptr) #3 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_null_call -; AKF_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR3]] { -; AKF_HSA-NEXT: [[F:%.*]] = call float null() -; AKF_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 -; AKF_HSA-NEXT: ret float [[FADD]] -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_null_call ; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR15]] { ; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float null() @@ -820,12 +554,6 @@ declare float @llvm.amdgcn.rcp.f32(float) #0 ; Calls some other recognized intrinsic define float @func_other_intrinsic_call(float %arg) #3 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_other_intrinsic_call -; AKF_HSA-SAME: (float [[ARG:%.*]]) #[[ATTR3]] { -; AKF_HSA-NEXT: [[F:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[ARG]]) -; AKF_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 -; AKF_HSA-NEXT: ret float [[FADD]] -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_other_intrinsic_call ; ATTRIBUTOR_HSA-SAME: (float [[ARG:%.*]]) #[[ATTR16]] { ; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[ARG]]) @@ -839,11 
+567,6 @@ define float @func_other_intrinsic_call(float %arg) #3 { ; Hostcall needs to be enabled for sanitizers define amdgpu_kernel void @kern_sanitize_address() #4 { -; AKF_HSA-LABEL: define {{[^@]+}}@kern_sanitize_address -; AKF_HSA-SAME: () #[[ATTR5:[0-9]+]] { -; AKF_HSA-NEXT: store volatile i32 0, ptr addrspace(1) null, align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_sanitize_address ; ATTRIBUTOR_HSA-SAME: () #[[ATTR18:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, ptr addrspace(1) null, align 4 @@ -855,11 +578,6 @@ define amdgpu_kernel void @kern_sanitize_address() #4 { ; Hostcall needs to be enabled for sanitizers define void @func_sanitize_address() #4 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_sanitize_address -; AKF_HSA-SAME: () #[[ATTR5]] { -; AKF_HSA-NEXT: store volatile i32 0, ptr addrspace(1) null, align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_sanitize_address ; ATTRIBUTOR_HSA-SAME: () #[[ATTR18]] { ; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, ptr addrspace(1) null, align 4 @@ -871,11 +589,6 @@ define void @func_sanitize_address() #4 { ; Hostcall needs to be enabled for sanitizers define void @func_indirect_sanitize_address() #3 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_sanitize_address -; AKF_HSA-SAME: () #[[ATTR3]] { -; AKF_HSA-NEXT: call void @func_sanitize_address() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_sanitize_address ; ATTRIBUTOR_HSA-SAME: () #[[ATTR19:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: call void @func_sanitize_address() @@ -887,11 +600,6 @@ define void @func_indirect_sanitize_address() #3 { ; Hostcall needs to be enabled for sanitizers define amdgpu_kernel void @kern_indirect_sanitize_address() #3 { -; AKF_HSA-LABEL: define {{[^@]+}}@kern_indirect_sanitize_address -; AKF_HSA-SAME: () #[[ATTR4]] { -; AKF_HSA-NEXT: call void @func_sanitize_address() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define 
{{[^@]+}}@kern_indirect_sanitize_address ; ATTRIBUTOR_HSA-SAME: () #[[ATTR19]] { ; ATTRIBUTOR_HSA-NEXT: call void @func_sanitize_address() @@ -906,11 +614,6 @@ define amdgpu_kernel void @kern_indirect_sanitize_address() #3 { declare void @extern_func_sanitize_address() #5 define amdgpu_kernel void @kern_decl_sanitize_address() #3 { -; AKF_HSA-LABEL: define {{[^@]+}}@kern_decl_sanitize_address -; AKF_HSA-SAME: () #[[ATTR4]] { -; AKF_HSA-NEXT: call void @extern_func_sanitize_address() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_decl_sanitize_address ; ATTRIBUTOR_HSA-SAME: () #[[ATTR15]] { ; ATTRIBUTOR_HSA-NEXT: call void @extern_func_sanitize_address() @@ -923,10 +626,6 @@ define amdgpu_kernel void @kern_decl_sanitize_address() #3 { declare void @enqueue_block_decl() #6 define internal void @enqueue_block_def() #6 { -; AKF_HSA-LABEL: define {{[^@]+}}@enqueue_block_def -; AKF_HSA-SAME: () #[[ATTR7:[0-9]+]] { -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@enqueue_block_def ; ATTRIBUTOR_HSA-SAME: () #[[ATTR22:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: ret void @@ -935,11 +634,6 @@ define internal void @enqueue_block_def() #6 { } define amdgpu_kernel void @kern_call_enqueued_block_decl() { -; AKF_HSA-LABEL: define {{[^@]+}}@kern_call_enqueued_block_decl -; AKF_HSA-SAME: () #[[ATTR8:[0-9]+]] { -; AKF_HSA-NEXT: call void @enqueue_block_decl() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_enqueued_block_decl ; ATTRIBUTOR_HSA-SAME: () #[[ATTR23:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: call void @enqueue_block_decl() @@ -950,11 +644,6 @@ define amdgpu_kernel void @kern_call_enqueued_block_decl() { } define amdgpu_kernel void @kern_call_enqueued_block_def() { -; AKF_HSA-LABEL: define {{[^@]+}}@kern_call_enqueued_block_def -; AKF_HSA-SAME: () #[[ATTR8]] { -; AKF_HSA-NEXT: call void @enqueue_block_def() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_enqueued_block_def ; 
ATTRIBUTOR_HSA-SAME: () #[[ATTR24:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: call void @enqueue_block_def() @@ -965,9 +654,6 @@ define amdgpu_kernel void @kern_call_enqueued_block_def() { } define void @unused_enqueue_block() { -; AKF_HSA-LABEL: define {{[^@]+}}@unused_enqueue_block() { -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@unused_enqueue_block ; ATTRIBUTOR_HSA-SAME: () #[[ATTR25:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: ret void @@ -976,9 +662,6 @@ define void @unused_enqueue_block() { } define internal void @known_func() { -; AKF_HSA-LABEL: define {{[^@]+}}@known_func() { -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@known_func ; ATTRIBUTOR_HSA-SAME: () #[[ATTR25]] { ; ATTRIBUTOR_HSA-NEXT: ret void @@ -988,11 +671,6 @@ define internal void @known_func() { ; Should never happen define amdgpu_kernel void @kern_callsite_enqueue_block() { -; AKF_HSA-LABEL: define {{[^@]+}}@kern_callsite_enqueue_block -; AKF_HSA-SAME: () #[[ATTR8]] { -; AKF_HSA-NEXT: call void @known_func() #[[ATTR7]] -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_callsite_enqueue_block ; ATTRIBUTOR_HSA-SAME: () #[[ATTR24]] { ; ATTRIBUTOR_HSA-NEXT: call void @known_func() #[[ATTR27:[0-9]+]] @@ -1014,15 +692,6 @@ attributes #6 = { "enqueued-block" } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 500} ;. 
-; AKF_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; AKF_HSA: attributes #[[ATTR1]] = { nounwind "target-cpu"="fiji" } -; AKF_HSA: attributes #[[ATTR2]] = { nounwind "target-cpu"="gfx900" } -; AKF_HSA: attributes #[[ATTR3]] = { nounwind } -; AKF_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-calls" } -; AKF_HSA: attributes #[[ATTR5]] = { nounwind sanitize_address } -; AKF_HSA: attributes #[[ATTR6:[0-9]+]] = { nounwind sanitize_address "amdgpu-no-implicitarg-ptr" } -; AKF_HSA: attributes #[[ATTR7]] = { "enqueued-block" } -; AKF_HSA: attributes #[[ATTR8]] = { "amdgpu-calls" } ;. ; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll index 2809f0957462a..32bb22b699b61 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll @@ -1,5 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals -; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefixes=HSA,AKF_HSA %s ; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -passes=amdgpu-attributor < %s | FileCheck 
-check-prefixes=HSA,ATTRIBUTOR_HSA %s target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" @@ -33,12 +32,6 @@ define amdgpu_kernel void @use_tgid_x(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tgid_y(ptr addrspace(1) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_tgid_y -; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; AKF_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tgid_y ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR2:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() @@ -51,14 +44,6 @@ define amdgpu_kernel void @use_tgid_y(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @multi_use_tgid_y(ptr addrspace(1) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@multi_use_tgid_y -; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@multi_use_tgid_y ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR2]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() @@ -75,14 +60,6 @@ define amdgpu_kernel void @multi_use_tgid_y(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tgid_x_y(ptr addrspace(1) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_tgid_x_y -; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL0:%.*]] = 
call i32 @llvm.amdgcn.workgroup.id.x() -; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tgid_x_y ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR2]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() @@ -99,12 +76,6 @@ define amdgpu_kernel void @use_tgid_x_y(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tgid_z(ptr addrspace(1) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_tgid_z -; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() -; AKF_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tgid_z ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR3:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() @@ -117,14 +88,6 @@ define amdgpu_kernel void @use_tgid_z(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tgid_x_z(ptr addrspace(1) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_tgid_x_z -; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() -; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() -; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tgid_x_z ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR3]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() @@ -141,14 +104,6 @@ define amdgpu_kernel void @use_tgid_x_z(ptr 
addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tgid_y_z(ptr addrspace(1) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_tgid_y_z -; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() -; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tgid_y_z ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR4:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() @@ -165,16 +120,6 @@ define amdgpu_kernel void @use_tgid_y_z(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tgid_x_y_z(ptr addrspace(1) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_tgid_x_y_z -; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() -; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; AKF_HSA-NEXT: [[VAL2:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() -; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tgid_x_y_z ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR4]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() @@ -207,12 +152,6 @@ define amdgpu_kernel void @use_tidig_x(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tidig_y(ptr addrspace(1) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_tidig_y -; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 
@llvm.amdgcn.workitem.id.y() -; AKF_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tidig_y ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR5:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() @@ -225,12 +164,6 @@ define amdgpu_kernel void @use_tidig_y(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tidig_z(ptr addrspace(1) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_tidig_z -; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() -; AKF_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tidig_z ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR6:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() @@ -259,14 +192,6 @@ define amdgpu_kernel void @use_tidig_x_tgid_x(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tidig_y_tgid_y(ptr addrspace(1) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_tidig_y_tgid_y -; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() -; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tidig_y_tgid_y ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR7:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() @@ -283,16 +208,6 @@ define amdgpu_kernel void @use_tidig_y_tgid_y(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tidig_x_y_z(ptr addrspace(1) %ptr) #1 { -; AKF_HSA-LABEL: define 
{{[^@]+}}@use_tidig_x_y_z -; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() -; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() -; AKF_HSA-NEXT: [[VAL2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() -; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tidig_x_y_z ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR8:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() @@ -313,22 +228,6 @@ define amdgpu_kernel void @use_tidig_x_y_z(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_all_workitems(ptr addrspace(1) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_all_workitems -; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() -; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() -; AKF_HSA-NEXT: [[VAL2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() -; AKF_HSA-NEXT: [[VAL3:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() -; AKF_HSA-NEXT: [[VAL4:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; AKF_HSA-NEXT: [[VAL5:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() -; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: store volatile i32 [[VAL3]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: store volatile i32 [[VAL4]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: store volatile i32 [[VAL5]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: ret void -; ; 
ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_all_workitems ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR9:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() @@ -361,13 +260,6 @@ define amdgpu_kernel void @use_all_workitems(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_dispatch_ptr(ptr addrspace(1) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_dispatch_ptr -; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() -; AKF_HSA-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(4) [[DISPATCH_PTR]], align 4 -; AKF_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_dispatch_ptr ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR10:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() @@ -382,13 +274,6 @@ define amdgpu_kernel void @use_dispatch_ptr(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_queue_ptr(ptr addrspace(1) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_queue_ptr -; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.queue.ptr() -; AKF_HSA-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(4) [[DISPATCH_PTR]], align 4 -; AKF_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_queue_ptr ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR11:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.queue.ptr() @@ -417,12 +302,6 @@ define amdgpu_kernel void @use_kernarg_segment_ptr(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) #1 { -; AKF_HSA-LABEL: define 
{{[^@]+}}@use_group_to_flat_addrspacecast -; AKF_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr -; AKF_HSA-NEXT: store volatile i32 0, ptr [[STOF]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR12:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr @@ -435,12 +314,6 @@ define amdgpu_kernel void @use_group_to_flat_addrspacecast(ptr addrspace(3) %ptr } define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_private_to_flat_addrspacecast -; AKF_HSA-SAME: (ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr -; AKF_HSA-NEXT: store volatile i32 0, ptr [[STOF]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_private_to_flat_addrspacecast ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(5) [[PTR:%.*]]) #[[ATTR13:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr @@ -526,13 +399,6 @@ define amdgpu_kernel void @use_flat_to_constant_addrspacecast(ptr %ptr) #1 { } define amdgpu_kernel void @use_is_shared(ptr %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_is_shared -; AKF_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[PTR]]) -; AKF_HSA-NEXT: [[EXT:%.*]] = zext i1 [[IS_SHARED]] to i32 -; AKF_HSA-NEXT: store i32 [[EXT]], ptr addrspace(1) poison, align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_is_shared ; ATTRIBUTOR_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR12]] { ; ATTRIBUTOR_HSA-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[PTR]]) @@ -547,13 +413,6 @@ define amdgpu_kernel void @use_is_shared(ptr %ptr) #1 { } 
define amdgpu_kernel void @use_is_private(ptr %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_is_private -; AKF_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) -; AKF_HSA-NEXT: [[EXT:%.*]] = zext i1 [[IS_PRIVATE]] to i32 -; AKF_HSA-NEXT: store i32 [[EXT]], ptr addrspace(1) poison, align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_is_private ; ATTRIBUTOR_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR12]] { ; ATTRIBUTOR_HSA-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) @@ -568,12 +427,6 @@ define amdgpu_kernel void @use_is_private(ptr %ptr) #1 { } define amdgpu_kernel void @use_alloca() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_alloca -; AKF_HSA-SAME: () #[[ATTR2:[0-9]+]] { -; AKF_HSA-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5) -; AKF_HSA-NEXT: store i32 0, ptr addrspace(5) [[ALLOCA]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_alloca ; ATTRIBUTOR_HSA-SAME: () #[[ATTR1]] { ; ATTRIBUTOR_HSA-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5) @@ -586,15 +439,6 @@ define amdgpu_kernel void @use_alloca() #1 { } define amdgpu_kernel void @use_alloca_non_entry_block() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_alloca_non_entry_block -; AKF_HSA-SAME: () #[[ATTR2]] { -; AKF_HSA-NEXT: entry: -; AKF_HSA-NEXT: br label [[BB:%.*]] -; AKF_HSA: bb: -; AKF_HSA-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5) -; AKF_HSA-NEXT: store i32 0, ptr addrspace(5) [[ALLOCA]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_alloca_non_entry_block ; ATTRIBUTOR_HSA-SAME: () #[[ATTR1]] { ; ATTRIBUTOR_HSA-NEXT: entry: @@ -614,12 +458,6 @@ bb: } define void @use_alloca_func() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_alloca_func -; AKF_HSA-SAME: () #[[ATTR2]] { -; AKF_HSA-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5) -; AKF_HSA-NEXT: store i32 0, ptr 
addrspace(5) [[ALLOCA]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_alloca_func ; ATTRIBUTOR_HSA-SAME: () #[[ATTR1]] { ; ATTRIBUTOR_HSA-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5) @@ -638,9 +476,6 @@ attributes #1 = { nounwind } !0 = !{i32 1, !"amdhsa_code_object_version", i32 500} ;. -; AKF_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; AKF_HSA: attributes #[[ATTR1]] = { nounwind } -; AKF_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-stack-objects" } ;. ; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll index 20ce05278d213..15dc1a0529254 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll @@ -1,5 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals -; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefixes=CHECK,AKF_CHECK %s ; RUN: opt -S -mtriple=amdgcn-unknown-unknown -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=CHECK,ATTRIBUTOR_CHECK %s declare i32 @llvm.r600.read.tgid.x() #0 @@ -27,12 
+26,6 @@ define amdgpu_kernel void @use_tgid_x(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tgid_y(ptr addrspace(1) %ptr) #1 { -; AKF_CHECK-LABEL: define {{[^@]+}}@use_tgid_y -; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tgid.y() -; AKF_CHECK-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: ret void -; ; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tgid_y ; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR2:[0-9]+]] { ; ATTRIBUTOR_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tgid.y() @@ -45,14 +38,6 @@ define amdgpu_kernel void @use_tgid_y(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @multi_use_tgid_y(ptr addrspace(1) %ptr) #1 { -; AKF_CHECK-LABEL: define {{[^@]+}}@multi_use_tgid_y -; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.y() -; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.y() -; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: ret void -; ; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@multi_use_tgid_y ; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR2]] { ; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.y() @@ -69,14 +54,6 @@ define amdgpu_kernel void @multi_use_tgid_y(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tgid_x_y(ptr addrspace(1) %ptr) #1 { -; AKF_CHECK-LABEL: define {{[^@]+}}@use_tgid_x_y -; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.x() -; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.y() -; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) 
[[PTR]], align 4 -; AKF_CHECK-NEXT: ret void -; ; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tgid_x_y ; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR2]] { ; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.x() @@ -93,12 +70,6 @@ define amdgpu_kernel void @use_tgid_x_y(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tgid_z(ptr addrspace(1) %ptr) #1 { -; AKF_CHECK-LABEL: define {{[^@]+}}@use_tgid_z -; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tgid.z() -; AKF_CHECK-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: ret void -; ; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tgid_z ; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR3:[0-9]+]] { ; ATTRIBUTOR_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tgid.z() @@ -111,14 +82,6 @@ define amdgpu_kernel void @use_tgid_z(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tgid_x_z(ptr addrspace(1) %ptr) #1 { -; AKF_CHECK-LABEL: define {{[^@]+}}@use_tgid_x_z -; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.x() -; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.z() -; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: ret void -; ; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tgid_x_z ; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR3]] { ; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.x() @@ -135,14 +98,6 @@ define amdgpu_kernel void @use_tgid_x_z(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tgid_y_z(ptr addrspace(1) %ptr) #1 { -; AKF_CHECK-LABEL: define {{[^@]+}}@use_tgid_y_z -; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 
@llvm.r600.read.tgid.y() -; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.z() -; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: ret void -; ; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tgid_y_z ; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR4:[0-9]+]] { ; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.y() @@ -159,16 +114,6 @@ define amdgpu_kernel void @use_tgid_y_z(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tgid_x_y_z(ptr addrspace(1) %ptr) #1 { -; AKF_CHECK-LABEL: define {{[^@]+}}@use_tgid_x_y_z -; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.x() -; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.y() -; AKF_CHECK-NEXT: [[VAL2:%.*]] = call i32 @llvm.r600.read.tgid.z() -; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: ret void -; ; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tgid_x_y_z ; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR4]] { ; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.x() @@ -201,12 +146,6 @@ define amdgpu_kernel void @use_tidig_x(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tidig_y(ptr addrspace(1) %ptr) #1 { -; AKF_CHECK-LABEL: define {{[^@]+}}@use_tidig_y -; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tidig.y() -; AKF_CHECK-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: ret void -; ; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tidig_y ; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) 
[[PTR:%.*]]) #[[ATTR5:[0-9]+]] { ; ATTRIBUTOR_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tidig.y() @@ -219,12 +158,6 @@ define amdgpu_kernel void @use_tidig_y(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tidig_z(ptr addrspace(1) %ptr) #1 { -; AKF_CHECK-LABEL: define {{[^@]+}}@use_tidig_z -; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tidig.z() -; AKF_CHECK-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: ret void -; ; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tidig_z ; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR6:[0-9]+]] { ; ATTRIBUTOR_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tidig.z() @@ -253,14 +186,6 @@ define amdgpu_kernel void @use_tidig_x_tgid_x(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tidig_y_tgid_y(ptr addrspace(1) %ptr) #1 { -; AKF_CHECK-LABEL: define {{[^@]+}}@use_tidig_y_tgid_y -; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tidig.y() -; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.y() -; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: ret void -; ; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tidig_y_tgid_y ; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR7:[0-9]+]] { ; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tidig.y() @@ -277,16 +202,6 @@ define amdgpu_kernel void @use_tidig_y_tgid_y(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tidig_x_y_z(ptr addrspace(1) %ptr) #1 { -; AKF_CHECK-LABEL: define {{[^@]+}}@use_tidig_x_y_z -; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tidig.x() -; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 
@llvm.r600.read.tidig.y() -; AKF_CHECK-NEXT: [[VAL2:%.*]] = call i32 @llvm.r600.read.tidig.z() -; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: ret void -; ; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tidig_x_y_z ; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR8:[0-9]+]] { ; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tidig.x() @@ -307,22 +222,6 @@ define amdgpu_kernel void @use_tidig_x_y_z(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_all_workitems(ptr addrspace(1) %ptr) #1 { -; AKF_CHECK-LABEL: define {{[^@]+}}@use_all_workitems -; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tidig.x() -; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tidig.y() -; AKF_CHECK-NEXT: [[VAL2:%.*]] = call i32 @llvm.r600.read.tidig.z() -; AKF_CHECK-NEXT: [[VAL3:%.*]] = call i32 @llvm.r600.read.tgid.x() -; AKF_CHECK-NEXT: [[VAL4:%.*]] = call i32 @llvm.r600.read.tgid.y() -; AKF_CHECK-NEXT: [[VAL5:%.*]] = call i32 @llvm.r600.read.tgid.z() -; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: store volatile i32 [[VAL3]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: store volatile i32 [[VAL4]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: store volatile i32 [[VAL5]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: ret void -; ; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_all_workitems ; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR9:[0-9]+]] { ; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 
@llvm.r600.read.tidig.x() @@ -394,8 +293,6 @@ attributes #0 = { nounwind readnone } attributes #1 = { nounwind } ;. -; AKF_CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; AKF_CHECK: attributes #[[ATTR1]] = { nounwind } ;. ; ATTRIBUTOR_CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ; ATTRIBUTOR_CHECK: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll index fc13b86566f76..22cc5af30da66 100644 --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll @@ -35,9 +35,9 @@ entry: attributes #2 = {"amdgpu-flat-work-group-size"="128,128"} ; CHECK-LABEL: {{^}}min_1024_max_1024 -; CHECK: SGPRBlocks: 0 +; CHECK: SGPRBlocks: 2 ; CHECK: VGPRBlocks: 10 -; CHECK: NumSGPRsForWavesPerEU: 2{{$}} +; CHECK: NumSGPRsForWavesPerEU: 24{{$}} ; CHECK: NumVGPRsForWavesPerEU: 43 @var = addrspace(1) global float 0.0 define amdgpu_kernel void @min_1024_max_1024() #3 { diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll index 46edf06c3b62c..d0107eb3ade27 100644 --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll @@ -4,8 +4,8 @@ ; ALL-LABEL: 
{{^}}max_10_sgprs: -; ALL: SGPRBlocks: 1 -; ALL: NumSGPRsForWavesPerEU: 10 +; ALL: SGPRBlocks: 2 +; ALL: NumSGPRsForWavesPerEU: 24 define amdgpu_kernel void @max_10_sgprs() #0 { %one = load volatile i32, ptr addrspace(4) poison %two = load volatile i32, ptr addrspace(4) poison diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll index 14519f5a5e77c..4507fd5865989 100644 --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll @@ -116,9 +116,9 @@ attributes #8 = {"amdgpu-waves-per-eu"="5,10"} ; Exactly 10 waves per execution unit. ; CHECK-LABEL: {{^}}exactly_10: -; CHECK: SGPRBlocks: 2 +; CHECK: SGPRBlocks: 3 ; CHECK: VGPRBlocks: 5 -; CHECK: NumSGPRsForWavesPerEU: 20 +; CHECK: NumSGPRsForWavesPerEU: 30 ; CHECK: NumVGPRsForWavesPerEU: 24 define amdgpu_kernel void @exactly_10() #9 { %val0 = load volatile float, ptr addrspace(1) @var diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll index 682a57571d11e..35f0ccf5ba62f 100644 --- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll +++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll @@ -392,7 +392,8 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() { ; GFX10: argumentInfo: ; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } ; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' } ; ; GFX10: name: call_without_private_to_flat_addrspacecast ; GFX10: argumentInfo: @@ -420,7 +421,8 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() { ; GFX10: argumentInfo: ; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } ; GFX10-NEXT: 
kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' } ; ; GFX10: name: call_both_with_and_without_private_to_flat_addrspacecast ; GFX10: argumentInfo: @@ -434,7 +436,8 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() { ; GFX10: argumentInfo: ; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } ; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' } ; ; GFX10: name: call_call_without_private_to_flat_addrspacecast ; GFX10: argumentInfo: @@ -462,7 +465,8 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() { ; GFX10: argumentInfo: ; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } ; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' } ; ; GFX10: name: call_call_both_with_and_without_private_to_flat_addrspacecast ; GFX10: argumentInfo: @@ -476,7 +480,8 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() { ; GFX10: argumentInfo: ; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } ; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' } ; ; GFX10: name: with_cast_call_without_private_to_flat_addrspacecast ; GFX10: argumentInfo: @@ -490,7 +495,8 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() { ; GFX10: argumentInfo: ; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } ; GFX10-NEXT: kernargSegmentPtr: { reg: 
'$sgpr4_sgpr5' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' } ; ; GFX10: name: with_cast_call_with_private_to_flat_addrspacecast ; GFX10: argumentInfo: @@ -504,7 +510,8 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() { ; GFX10: argumentInfo: ; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } ; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' } ; ; GFX10: name: with_indirect_call ; GFX10: argumentInfo: diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior.ll new file mode 100644 index 0000000000000..1b422252573db --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior.ll @@ -0,0 +1,63 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=GFX10 %s + +; +; None of these functions should have the attribute amdgpu-no-flat-scratch-init. In these tests +; we manually set the attribute for the functions. The purpose is to test how the amdgpu-attributor pass +; handles this situation. 
+; +;; tests of addrspacecast + +define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 { + %stof = addrspacecast ptr addrspace(5) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) #0 { + %stof = addrspacecast ptr addrspace(5) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 { + call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) #0 { + call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +;; tests of addrspacecast in a constant + +define amdgpu_kernel void @private_constant_expression_use(ptr addrspace(1) nocapture %out) #0 { + store volatile ptr addrspacecast (ptr addrspace(5) inttoptr (i32 123 to ptr addrspace(5)) to ptr), ptr addrspace(1) %out, align 8 + ret void +} + +;; tests of intrinsics + +define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) #0 { + %1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p3(ptr addrspace(3) %ptr) + store volatile i32 7, ptr %1, align 4 + ret void +} + +define void @calls_intrin_ascast(ptr addrspace(3) %ptr) #0 { + %1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p3(ptr addrspace(3) %ptr) + store volatile i32 7, ptr %1, align 4 + ret void +} + +define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) #0 { + call void @calls_intrin_ascast(ptr addrspace(3) %ptr) + ret void +} + +attributes #0 = { "amdgpu-no-flat-scratch-init" } + +; GFX9: attributes #0 = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" 
"amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX10: attributes #0 = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll new file mode 100644 index 0000000000000..51caa84450ff3 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll @@ -0,0 +1,870 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=+architected-flat-scratch < %s | FileCheck -check-prefixes=GFX8-ARCH-FLAT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch < %s | FileCheck -check-prefixes=GFX9-ARCH-FLAT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -mattr=+architected-flat-scratch < %s | FileCheck -check-prefixes=GFX942-ARCH-FLAT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s + +; +; None of 
these functions should have the attribute amdgpu-no-flat-scratch-init. In these tests +; we manually set the attribute for the functions. The purpose is to test how llc handles this. +; + +;; tests of addrspacecast + +define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 { +; GFX8-LABEL: with_private_to_flat_addrspacecast: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-ARCH-FLAT-LABEL: with_private_to_flat_addrspacecast: +; GFX8-ARCH-FLAT: ; %bb.0: +; GFX8-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-ARCH-FLAT-NEXT: s_mov_b64 s[0:1], 0xc0 +; GFX8-ARCH-FLAT-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX8-ARCH-FLAT-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0 +; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-ARCH-FLAT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX8-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-ARCH-FLAT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX8-ARCH-FLAT-NEXT: flat_store_dword v[0:1], v2 +; GFX8-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-ARCH-FLAT-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: with_private_to_flat_addrspacecast: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX9-NEXT: flat_store_dword v[0:1], v2 +; 
GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-ARCH-FLAT-LABEL: with_private_to_flat_addrspacecast: +; GFX9-ARCH-FLAT: ; %bb.0: +; GFX9-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-ARCH-FLAT-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0 +; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-ARCH-FLAT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-ARCH-FLAT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX9-ARCH-FLAT-NEXT: flat_store_dword v[0:1], v2 +; GFX9-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-ARCH-FLAT-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-ARCH-FLAT-LABEL: with_private_to_flat_addrspacecast: +; GFX942-ARCH-FLAT: ; %bb.0: +; GFX942-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-ARCH-FLAT-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0 +; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-ARCH-FLAT-NEXT: s_nop 0 +; GFX942-ARCH-FLAT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX942-ARCH-FLAT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX942-ARCH-FLAT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 +; GFX942-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-ARCH-FLAT-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: with_private_to_flat_addrspacecast: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, -1, v0 +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo +; GFX10-NEXT: flat_store_dword v[0:1], v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] + %stof = addrspacecast ptr addrspace(5) %ptr to ptr + store volatile i32 0, ptr %stof + ret 
void +} + +define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) #0 { +; GFX8-LABEL: with_private_to_flat_addrspacecast_cc_kernel: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s0, s[8:9], 0x0 +; GFX8-NEXT: s_load_dword s1, s[8:9], 0xc8 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_cmp_lg_u32 s0, -1 +; GFX8-NEXT: s_cselect_b32 s1, s1, 0 +; GFX8-NEXT: s_cselect_b32 s0, s0, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX8-ARCH-FLAT-LABEL: with_private_to_flat_addrspacecast_cc_kernel: +; GFX8-ARCH-FLAT: ; %bb.0: +; GFX8-ARCH-FLAT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-ARCH-FLAT-NEXT: s_load_dword s1, s[4:5], 0xc8 +; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-ARCH-FLAT-NEXT: s_cmp_lg_u32 s0, -1 +; GFX8-ARCH-FLAT-NEXT: s_cselect_b32 s1, s1, 0 +; GFX8-ARCH-FLAT-NEXT: s_cselect_b32 s0, s0, 0 +; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-ARCH-FLAT-NEXT: flat_store_dword v[0:1], v2 +; GFX8-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) +; GFX8-ARCH-FLAT-NEXT: s_endpgm +; +; GFX9-LABEL: with_private_to_flat_addrspacecast_cc_kernel: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s2, -1 +; GFX9-NEXT: s_cselect_b32 s0, s1, 0 +; GFX9-NEXT: s_cselect_b32 s1, s2, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX9-ARCH-FLAT-LABEL: with_private_to_flat_addrspacecast_cc_kernel: +; GFX9-ARCH-FLAT: ; %bb.0: +; GFX9-ARCH-FLAT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[0:1], 
src_private_base +; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-ARCH-FLAT-NEXT: s_cmp_lg_u32 s2, -1 +; GFX9-ARCH-FLAT-NEXT: s_cselect_b32 s0, s1, 0 +; GFX9-ARCH-FLAT-NEXT: s_cselect_b32 s1, s2, 0 +; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-ARCH-FLAT-NEXT: flat_store_dword v[0:1], v2 +; GFX9-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) +; GFX9-ARCH-FLAT-NEXT: s_endpgm +; +; GFX942-ARCH-FLAT-LABEL: with_private_to_flat_addrspacecast_cc_kernel: +; GFX942-ARCH-FLAT: ; %bb.0: +; GFX942-ARCH-FLAT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-ARCH-FLAT-NEXT: s_cmp_lg_u32 s2, -1 +; GFX942-ARCH-FLAT-NEXT: s_cselect_b32 s0, s1, 0 +; GFX942-ARCH-FLAT-NEXT: s_cselect_b32 s1, s2, 0 +; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-ARCH-FLAT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 +; GFX942-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) +; GFX942-ARCH-FLAT-NEXT: s_endpgm +; +; GFX10-LABEL: with_private_to_flat_addrspacecast_cc_kernel: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dword s2, s[8:9], 0x0 +; GFX10-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_cmp_lg_u32 s2, -1 +; GFX10-NEXT: s_cselect_b32 s0, s2, 0 +; GFX10-NEXT: s_cselect_b32 s1, s1, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: flat_store_dword v[0:1], v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_endpgm + %stof = addrspacecast ptr addrspace(5) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 { +; GFX8-LABEL: call_with_private_to_flat_addrspacecast: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s18, s33 +; GFX8-NEXT: s_mov_b32 s33, s32 +; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX8-NEXT: s_mov_b64 exec, s[16:17] +; GFX8-NEXT: s_addk_i32 s32, 0x400 +; GFX8-NEXT: s_getpc_b64 s[16:17] +; GFX8-NEXT: s_add_u32 s16, s16, with_private_to_flat_addrspacecast@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s17, s17, with_private_to_flat_addrspacecast@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX8-NEXT: v_writelane_b32 v3, s30, 0 +; GFX8-NEXT: v_writelane_b32 v3, s31, 1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX8-NEXT: v_readlane_b32 s31, v3, 1 +; GFX8-NEXT: v_readlane_b32 s30, v3, 0 +; GFX8-NEXT: s_mov_b32 s32, s33 +; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_mov_b32 s33, s18 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-ARCH-FLAT-LABEL: call_with_private_to_flat_addrspacecast: +; GFX8-ARCH-FLAT: ; %bb.0: +; GFX8-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s2, s33 +; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s33, s32 +; GFX8-ARCH-FLAT-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX8-ARCH-FLAT-NEXT: s_add_i32 s3, s33, 8 +; GFX8-ARCH-FLAT-NEXT: scratch_store_dword off, v3, s3 ; 4-byte Folded Spill +; GFX8-ARCH-FLAT-NEXT: s_mov_b64 exec, s[0:1] +; GFX8-ARCH-FLAT-NEXT: s_add_i32 s32, s32, 16 +; GFX8-ARCH-FLAT-NEXT: s_getpc_b64 s[0:1] +; GFX8-ARCH-FLAT-NEXT: s_add_u32 s0, s0, with_private_to_flat_addrspacecast@gotpcrel32@lo+4 +; GFX8-ARCH-FLAT-NEXT: s_addc_u32 s1, s1, with_private_to_flat_addrspacecast@gotpcrel32@hi+12 +; GFX8-ARCH-FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX8-ARCH-FLAT-NEXT: v_writelane_b32 v3, s30, 0 +; GFX8-ARCH-FLAT-NEXT: v_writelane_b32 v3, s31, 1 +; 
GFX8-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-ARCH-FLAT-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX8-ARCH-FLAT-NEXT: v_readlane_b32 s31, v3, 1 +; GFX8-ARCH-FLAT-NEXT: v_readlane_b32 s30, v3, 0 +; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s32, s33 +; GFX8-ARCH-FLAT-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX8-ARCH-FLAT-NEXT: s_add_i32 s3, s33, 8 +; GFX8-ARCH-FLAT-NEXT: scratch_load_dword v3, off, s3 ; 4-byte Folded Reload +; GFX8-ARCH-FLAT-NEXT: s_mov_b64 exec, s[0:1] +; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s33, s2 +; GFX8-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) +; GFX8-ARCH-FLAT-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: call_with_private_to_flat_addrspacecast: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s18, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[16:17] +; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[16:17] +; GFX9-NEXT: s_add_u32 s16, s16, with_private_to_flat_addrspacecast@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s17, s17, with_private_to_flat_addrspacecast@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX9-NEXT: v_writelane_b32 v3, s30, 0 +; GFX9-NEXT: v_writelane_b32 v3, s31, 1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: v_readlane_b32 s31, v3, 1 +; GFX9-NEXT: v_readlane_b32 s30, v3, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b32 s33, s18 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-ARCH-FLAT-LABEL: call_with_private_to_flat_addrspacecast: +; GFX9-ARCH-FLAT: ; %bb.0: +; GFX9-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s2, s33 +; 
GFX9-ARCH-FLAT-NEXT: s_mov_b32 s33, s32 +; GFX9-ARCH-FLAT-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX9-ARCH-FLAT-NEXT: scratch_store_dword off, v3, s33 ; 4-byte Folded Spill +; GFX9-ARCH-FLAT-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-ARCH-FLAT-NEXT: s_add_i32 s32, s32, 16 +; GFX9-ARCH-FLAT-NEXT: s_getpc_b64 s[0:1] +; GFX9-ARCH-FLAT-NEXT: s_add_u32 s0, s0, with_private_to_flat_addrspacecast@gotpcrel32@lo+4 +; GFX9-ARCH-FLAT-NEXT: s_addc_u32 s1, s1, with_private_to_flat_addrspacecast@gotpcrel32@hi+12 +; GFX9-ARCH-FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-ARCH-FLAT-NEXT: v_writelane_b32 v3, s30, 0 +; GFX9-ARCH-FLAT-NEXT: v_writelane_b32 v3, s31, 1 +; GFX9-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-ARCH-FLAT-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX9-ARCH-FLAT-NEXT: v_readlane_b32 s31, v3, 1 +; GFX9-ARCH-FLAT-NEXT: v_readlane_b32 s30, v3, 0 +; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s32, s33 +; GFX9-ARCH-FLAT-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX9-ARCH-FLAT-NEXT: scratch_load_dword v3, off, s33 ; 4-byte Folded Reload +; GFX9-ARCH-FLAT-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s33, s2 +; GFX9-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) +; GFX9-ARCH-FLAT-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-ARCH-FLAT-LABEL: call_with_private_to_flat_addrspacecast: +; GFX942-ARCH-FLAT: ; %bb.0: +; GFX942-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s2, s33 +; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s33, s32 +; GFX942-ARCH-FLAT-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-ARCH-FLAT-NEXT: scratch_store_dword off, v3, s33 ; 4-byte Folded Spill +; GFX942-ARCH-FLAT-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-ARCH-FLAT-NEXT: s_add_i32 s32, s32, 16 +; GFX942-ARCH-FLAT-NEXT: s_getpc_b64 s[0:1] +; GFX942-ARCH-FLAT-NEXT: s_add_u32 s0, s0, with_private_to_flat_addrspacecast@gotpcrel32@lo+4 +; GFX942-ARCH-FLAT-NEXT: s_addc_u32 s1, s1, with_private_to_flat_addrspacecast@gotpcrel32@hi+12 +; GFX942-ARCH-FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 
+; GFX942-ARCH-FLAT-NEXT: v_writelane_b32 v3, s30, 0 +; GFX942-ARCH-FLAT-NEXT: v_writelane_b32 v3, s31, 1 +; GFX942-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-ARCH-FLAT-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX942-ARCH-FLAT-NEXT: v_readlane_b32 s31, v3, 1 +; GFX942-ARCH-FLAT-NEXT: v_readlane_b32 s30, v3, 0 +; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s32, s33 +; GFX942-ARCH-FLAT-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-ARCH-FLAT-NEXT: scratch_load_dword v3, off, s33 ; 4-byte Folded Reload +; GFX942-ARCH-FLAT-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s33, s2 +; GFX942-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) +; GFX942-ARCH-FLAT-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: call_with_private_to_flat_addrspacecast: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s18, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_xor_saveexec_b32 s16, -1 +; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s16 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[16:17] +; GFX10-NEXT: s_add_u32 s16, s16, with_private_to_flat_addrspacecast@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s17, s17, with_private_to_flat_addrspacecast@gotpcrel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v3, s30, 0 +; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX10-NEXT: v_writelane_b32 v3, s31, 1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX10-NEXT: v_readlane_b32 s31, v3, 1 +; GFX10-NEXT: v_readlane_b32 s30, v3, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 +; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 s33, s18 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] + call void 
@with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) #0 { +; GFX8-LABEL: call_with_private_to_flat_addrspacecast_cc_kernel: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_add_u32 s0, s0, s15 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: s_load_dword s15, s[8:9], 0x0 +; GFX8-NEXT: s_add_u32 s8, s8, 8 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 +; GFX8-NEXT: s_getpc_b64 s[16:17] +; GFX8-NEXT: s_add_u32 s16, s16, with_private_to_flat_addrspacecast@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s17, s17, with_private_to_flat_addrspacecast@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s15 +; GFX8-NEXT: s_mov_b32 s32, 0 +; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX8-NEXT: s_endpgm +; +; GFX8-ARCH-FLAT-LABEL: call_with_private_to_flat_addrspacecast_cc_kernel: +; GFX8-ARCH-FLAT: ; %bb.0: +; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s12, s8 +; GFX8-ARCH-FLAT-NEXT: s_add_u32 s8, s4, 8 +; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s13, s9 +; GFX8-ARCH-FLAT-NEXT: s_addc_u32 s9, s5, 0 +; GFX8-ARCH-FLAT-NEXT: s_load_dword s15, s[4:5], 0x0 +; GFX8-ARCH-FLAT-NEXT: s_getpc_b64 s[4:5] +; GFX8-ARCH-FLAT-NEXT: s_add_u32 s4, s4, with_private_to_flat_addrspacecast@gotpcrel32@lo+4 +; GFX8-ARCH-FLAT-NEXT: s_addc_u32 s5, s5, with_private_to_flat_addrspacecast@gotpcrel32@hi+12 +; GFX8-ARCH-FLAT-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX8-ARCH-FLAT-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX8-ARCH-FLAT-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX8-ARCH-FLAT-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s14, s10 +; GFX8-ARCH-FLAT-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX8-ARCH-FLAT-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX8-ARCH-FLAT-NEXT: s_mov_b64 
s[4:5], s[0:1] +; GFX8-ARCH-FLAT-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX8-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s15 +; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s32, 0 +; GFX8-ARCH-FLAT-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX8-ARCH-FLAT-NEXT: s_endpgm +; +; GFX9-LABEL: call_with_private_to_flat_addrspacecast_cc_kernel: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_add_u32 s0, s0, s15 +; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: s_load_dword s15, s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 s8, s8, 8 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_getpc_b64 s[16:17] +; GFX9-NEXT: s_add_u32 s16, s16, with_private_to_flat_addrspacecast@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s17, s17, with_private_to_flat_addrspacecast@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s15 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_endpgm +; +; GFX9-ARCH-FLAT-LABEL: call_with_private_to_flat_addrspacecast_cc_kernel: +; GFX9-ARCH-FLAT: ; %bb.0: +; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s12, s8 +; GFX9-ARCH-FLAT-NEXT: s_add_u32 s8, s4, 8 +; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s13, s9 +; GFX9-ARCH-FLAT-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-ARCH-FLAT-NEXT: s_load_dword s15, s[4:5], 0x0 +; GFX9-ARCH-FLAT-NEXT: s_getpc_b64 s[4:5] +; GFX9-ARCH-FLAT-NEXT: s_add_u32 s4, s4, with_private_to_flat_addrspacecast@gotpcrel32@lo+4 +; GFX9-ARCH-FLAT-NEXT: s_addc_u32 s5, s5, with_private_to_flat_addrspacecast@gotpcrel32@hi+12 +; GFX9-ARCH-FLAT-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-ARCH-FLAT-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-ARCH-FLAT-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s14, s10 +; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-ARCH-FLAT-NEXT: v_or3_b32 v31, v0, v1, v2 +; 
GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s15 +; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s32, 0 +; GFX9-ARCH-FLAT-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-ARCH-FLAT-NEXT: s_endpgm +; +; GFX942-ARCH-FLAT-LABEL: call_with_private_to_flat_addrspacecast_cc_kernel: +; GFX942-ARCH-FLAT: ; %bb.0: +; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s12, s8 +; GFX942-ARCH-FLAT-NEXT: s_add_u32 s8, s4, 8 +; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s13, s9 +; GFX942-ARCH-FLAT-NEXT: s_addc_u32 s9, s5, 0 +; GFX942-ARCH-FLAT-NEXT: s_load_dword s15, s[4:5], 0x0 +; GFX942-ARCH-FLAT-NEXT: s_getpc_b64 s[4:5] +; GFX942-ARCH-FLAT-NEXT: s_add_u32 s4, s4, with_private_to_flat_addrspacecast@gotpcrel32@lo+4 +; GFX942-ARCH-FLAT-NEXT: s_addc_u32 s5, s5, with_private_to_flat_addrspacecast@gotpcrel32@hi+12 +; GFX942-ARCH-FLAT-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s14, s10 +; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v31, v0 +; GFX942-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s15 +; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s32, 0 +; GFX942-ARCH-FLAT-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX942-ARCH-FLAT-NEXT: s_endpgm +; +; GFX10-LABEL: call_with_private_to_flat_addrspacecast_cc_kernel: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s0, s0, s15 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_load_dword s15, s[8:9], 0x0 +; GFX10-NEXT: s_add_u32 s8, s8, 8 +; GFX10-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-NEXT: s_getpc_b64 s[16:17] +; GFX10-NEXT: s_add_u32 s16, s16, with_private_to_flat_addrspacecast@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s17, s17, with_private_to_flat_addrspacecast@gotpcrel32@hi+12 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX10-NEXT: 
s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX10-NEXT: s_mov_b32 s32, 0 +; GFX10-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s15 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX10-NEXT: s_endpgm + call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +;; tests of addrspacecast in a constant + +define amdgpu_kernel void @private_constant_expression_use(ptr addrspace(1) nocapture %out) #0 { +; GFX8-LABEL: private_constant_expression_use: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s2, s[8:9], 0xc8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX8-ARCH-FLAT-LABEL: private_constant_expression_use: +; GFX8-ARCH-FLAT: ; %bb.0: +; GFX8-ARCH-FLAT-NEXT: s_load_dword s2, s[4:5], 0xc8 +; GFX8-ARCH-FLAT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX8-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-ARCH-FLAT-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) +; GFX8-ARCH-FLAT-NEXT: s_endpgm +; +; GFX9-LABEL: private_constant_expression_use: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX9-ARCH-FLAT-LABEL: 
private_constant_expression_use: +; GFX9-ARCH-FLAT: ; %bb.0: +; GFX9-ARCH-FLAT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-ARCH-FLAT-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) +; GFX9-ARCH-FLAT-NEXT: s_endpgm +; +; GFX942-ARCH-FLAT-LABEL: private_constant_expression_use: +; GFX942-ARCH-FLAT: ; %bb.0: +; GFX942-ARCH-FLAT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s3 +; GFX942-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-ARCH-FLAT-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX942-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) +; GFX942-ARCH-FLAT-NEXT: s_endpgm +; +; GFX10-LABEL: private_constant_expression_use: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX10-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_endpgm + store volatile ptr addrspacecast (ptr addrspace(5) inttoptr (i32 123 to ptr addrspace(5)) to ptr), ptr addrspace(1) %out, align 8 + ret void +} + +;; tests of intrinsics + +define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) #0 { +; GFX8-LABEL: calls_intrin_ascast_cc_kernel: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s0, s[8:9], 0x0 +; GFX8-NEXT: s_load_dword s1, s[8:9], 0xcc +; GFX8-NEXT: v_mov_b32_e32 v2, 7 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX8-ARCH-FLAT-LABEL: calls_intrin_ascast_cc_kernel: +; GFX8-ARCH-FLAT: ; %bb.0: +; GFX8-ARCH-FLAT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-ARCH-FLAT-NEXT: s_load_dword s1, s[4:5], 0xcc +; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 7 +; GFX8-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-ARCH-FLAT-NEXT: flat_store_dword v[0:1], v2 +; GFX8-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) +; GFX8-ARCH-FLAT-NEXT: s_endpgm +; +; GFX9-LABEL: calls_intrin_ascast_cc_kernel: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, 7 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX9-ARCH-FLAT-LABEL: calls_intrin_ascast_cc_kernel: +; GFX9-ARCH-FLAT: ; %bb.0: +; GFX9-ARCH-FLAT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 7 +; GFX9-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-ARCH-FLAT-NEXT: flat_store_dword v[0:1], v2 +; GFX9-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) +; GFX9-ARCH-FLAT-NEXT: s_endpgm +; +; GFX942-ARCH-FLAT-LABEL: calls_intrin_ascast_cc_kernel: +; GFX942-ARCH-FLAT: ; %bb.0: +; GFX942-ARCH-FLAT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 7 +; GFX942-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-ARCH-FLAT-NEXT: flat_store_dword v[0:1], 
v2 sc0 sc1 +; GFX942-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) +; GFX942-ARCH-FLAT-NEXT: s_endpgm +; +; GFX10-LABEL: calls_intrin_ascast_cc_kernel: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dword s2, s[8:9], 0x0 +; GFX10-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX10-NEXT: v_mov_b32_e32 v2, 7 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: flat_store_dword v[0:1], v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_endpgm + %1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p3(ptr addrspace(3) %ptr) + store volatile i32 7, ptr %1, align 4 + ret void +} + +define void @calls_intrin_ascast(ptr addrspace(3) %ptr) #0 { +; GFX8-LABEL: calls_intrin_ascast: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc4 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v2, 7 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-ARCH-FLAT-LABEL: calls_intrin_ascast: +; GFX8-ARCH-FLAT: ; %bb.0: +; GFX8-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-ARCH-FLAT-NEXT: s_mov_b64 s[0:1], 0xc4 +; GFX8-ARCH-FLAT-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 7 +; GFX8-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-ARCH-FLAT-NEXT: flat_store_dword v[0:1], v2 +; GFX8-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-ARCH-FLAT-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: calls_intrin_ascast: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, 7 +; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX9-ARCH-FLAT-LABEL: calls_intrin_ascast: +; GFX9-ARCH-FLAT: ; %bb.0: +; GFX9-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 7 +; GFX9-ARCH-FLAT-NEXT: flat_store_dword v[0:1], v2 +; GFX9-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-ARCH-FLAT-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-ARCH-FLAT-LABEL: calls_intrin_ascast: +; GFX942-ARCH-FLAT: ; %bb.0: +; GFX942-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 7 +; GFX942-ARCH-FLAT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 +; GFX942-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-ARCH-FLAT-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: calls_intrin_ascast: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX10-NEXT: v_mov_b32_e32 v2, 7 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: flat_store_dword v[0:1], v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] + %1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p3(ptr addrspace(3) %ptr) + store volatile i32 7, ptr %1, align 4 + ret void +} + +define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) #0 { +; GFX8-LABEL: call_calls_intrin_ascast_cc_kernel: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_add_u32 s0, s0, s15 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: s_load_dword s15, s[8:9], 0x0 +; GFX8-NEXT: s_add_u32 s8, s8, 8 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 +; GFX8-NEXT: s_getpc_b64 s[16:17] +; GFX8-NEXT: s_add_u32 s16, s16, calls_intrin_ascast@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s17, s17, calls_intrin_ascast@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s15 +; GFX8-NEXT: s_mov_b32 s32, 0 +; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX8-NEXT: s_endpgm +; +; GFX8-ARCH-FLAT-LABEL: call_calls_intrin_ascast_cc_kernel: +; GFX8-ARCH-FLAT: ; %bb.0: +; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s12, s8 +; GFX8-ARCH-FLAT-NEXT: s_add_u32 s8, s4, 8 +; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s13, s9 +; GFX8-ARCH-FLAT-NEXT: s_addc_u32 s9, s5, 0 +; GFX8-ARCH-FLAT-NEXT: s_load_dword s15, s[4:5], 0x0 +; GFX8-ARCH-FLAT-NEXT: s_getpc_b64 s[4:5] +; GFX8-ARCH-FLAT-NEXT: s_add_u32 s4, s4, calls_intrin_ascast@gotpcrel32@lo+4 +; GFX8-ARCH-FLAT-NEXT: s_addc_u32 s5, s5, calls_intrin_ascast@gotpcrel32@hi+12 +; GFX8-ARCH-FLAT-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX8-ARCH-FLAT-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX8-ARCH-FLAT-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX8-ARCH-FLAT-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s14, s10 +; GFX8-ARCH-FLAT-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX8-ARCH-FLAT-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX8-ARCH-FLAT-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX8-ARCH-FLAT-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX8-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s15 +; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s32, 0 +; GFX8-ARCH-FLAT-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX8-ARCH-FLAT-NEXT: s_endpgm +; +; GFX9-LABEL: call_calls_intrin_ascast_cc_kernel: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_add_u32 s0, s0, s15 +; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: s_load_dword s15, s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 s8, s8, 8 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_getpc_b64 s[16:17] +; GFX9-NEXT: s_add_u32 s16, s16, calls_intrin_ascast@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s17, s17, calls_intrin_ascast@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 
s[16:17], s[16:17], 0x0 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s15 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_endpgm +; +; GFX9-ARCH-FLAT-LABEL: call_calls_intrin_ascast_cc_kernel: +; GFX9-ARCH-FLAT: ; %bb.0: +; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s12, s8 +; GFX9-ARCH-FLAT-NEXT: s_add_u32 s8, s4, 8 +; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s13, s9 +; GFX9-ARCH-FLAT-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-ARCH-FLAT-NEXT: s_load_dword s15, s[4:5], 0x0 +; GFX9-ARCH-FLAT-NEXT: s_getpc_b64 s[4:5] +; GFX9-ARCH-FLAT-NEXT: s_add_u32 s4, s4, calls_intrin_ascast@gotpcrel32@lo+4 +; GFX9-ARCH-FLAT-NEXT: s_addc_u32 s5, s5, calls_intrin_ascast@gotpcrel32@hi+12 +; GFX9-ARCH-FLAT-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-ARCH-FLAT-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-ARCH-FLAT-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s14, s10 +; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-ARCH-FLAT-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s15 +; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s32, 0 +; GFX9-ARCH-FLAT-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-ARCH-FLAT-NEXT: s_endpgm +; +; GFX942-ARCH-FLAT-LABEL: call_calls_intrin_ascast_cc_kernel: +; GFX942-ARCH-FLAT: ; %bb.0: +; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s12, s8 +; GFX942-ARCH-FLAT-NEXT: s_add_u32 s8, s4, 8 +; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s13, s9 +; GFX942-ARCH-FLAT-NEXT: s_addc_u32 s9, s5, 0 +; GFX942-ARCH-FLAT-NEXT: s_load_dword s15, s[4:5], 0x0 +; GFX942-ARCH-FLAT-NEXT: s_getpc_b64 s[4:5] +; GFX942-ARCH-FLAT-NEXT: s_add_u32 s4, s4, calls_intrin_ascast@gotpcrel32@lo+4 +; GFX942-ARCH-FLAT-NEXT: s_addc_u32 s5, s5, 
calls_intrin_ascast@gotpcrel32@hi+12 +; GFX942-ARCH-FLAT-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s14, s10 +; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v31, v0 +; GFX942-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s15 +; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s32, 0 +; GFX942-ARCH-FLAT-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX942-ARCH-FLAT-NEXT: s_endpgm +; +; GFX10-LABEL: call_calls_intrin_ascast_cc_kernel: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s0, s0, s15 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_load_dword s15, s[8:9], 0x0 +; GFX10-NEXT: s_add_u32 s8, s8, 8 +; GFX10-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-NEXT: s_getpc_b64 s[16:17] +; GFX10-NEXT: s_add_u32 s16, s16, calls_intrin_ascast@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s17, s17, calls_intrin_ascast@gotpcrel32@hi+12 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX10-NEXT: s_mov_b32 s32, 0 +; GFX10-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s15 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX10-NEXT: s_endpgm + call void @calls_intrin_ascast(ptr addrspace(3) %ptr) + ret void +} + +attributes #0 = { "amdgpu-no-flat-scratch-init" } diff --git a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll index 55ed11ac62972..4f341fa71cf68 100644 --- a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll +++ b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll @@ -30,9 +30,11 @@ ; NOOPT: .amdhsa_system_sgpr_workgroup_id_z 1 ; NOOPT: .amdhsa_system_sgpr_workgroup_info 0 ; NOOPT: .amdhsa_system_vgpr_workitem_id 2 -define amdgpu_kernel void @foo() { +define amdgpu_kernel void @foo() #0 { ret void } 
+attributes #0 = { "amdgpu-no-flat-scratch-init" } + !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 CODE_OBJECT_VERSION} diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll index d5da3e00df1a6..10ca3c9d5f2c8 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll @@ -198,11 +198,11 @@ define hidden void @use_workgroup_id_yz() #1 { ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_x: ; GCN-NOT: s6 -; GCN: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, use_workgroup_id_x@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, use_workgroup_id_x@rel32@hi+12 +; GCN: s_getpc_b64 s[6:7] +; GCN-NEXT: s_add_u32 s6, s6, use_workgroup_id_x@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s7, s7, use_workgroup_id_x@rel32@hi+12 ; GCN-NOT: s6 -; GCN: s_mov_b32 s12, s6 +; GCN: s_mov_b32 s12, s4 ; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 ; GCN-NEXT: s_endpgm @@ -217,7 +217,7 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_x() #1 { ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_y: ; GCN-NOT: s12 -; GCN: s_mov_b32 s13, s7 +; GCN: s_mov_b32 s13, s5 ; GCN-NOT: s12 ; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 @@ -233,7 +233,7 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_y() #1 { ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_z: ; GCN-NOT: s12 ; GCN-NOT: s13 -; GCN: s_mov_b32 s14, s7 +; GCN: s_mov_b32 s14, s5 ; GCN-NOT: s12 ; GCN-NOT: s13 @@ -250,8 +250,8 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_z() #1 { ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xy: ; GCN-NOT: s14 -; GCN: s_mov_b32 s12, s6 -; GCN-NEXT: s_mov_b32 s13, s7 +; GCN: s_mov_b32 s12, s4 +; GCN-NEXT: s_mov_b32 s13, s5 ; GCN-NOT: s14 ; GCN: s_mov_b32 s32, 0 @@ -266,9 +266,9 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_xy() #1 { } ; GCN-LABEL: 
{{^}}kern_indirect_use_workgroup_id_xyz: -; GCN: s_mov_b32 s12, s6 -; GCN: s_mov_b32 s13, s7 -; GCN: s_mov_b32 s14, s8 +; GCN: s_mov_b32 s12, s4 +; GCN: s_mov_b32 s13, s5 +; GCN: s_mov_b32 s14, s6 ; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 @@ -283,8 +283,8 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_xyz() #1 { ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xz: ; GCN-NOT: s13 -; GCN: s_mov_b32 s12, s6 -; GCN-NEXT: s_mov_b32 s14, s7 +; GCN: s_mov_b32 s12, s4 +; GCN-NEXT: s_mov_b32 s14, s5 ; GCN-NOT: s13 ; GCN: s_mov_b32 s32, 0 @@ -300,8 +300,8 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_xz() #1 { ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_yz: -; GCN: s_mov_b32 s13, s7 -; GCN: s_mov_b32 s14, s8 +; GCN: s_mov_b32 s13, s5 +; GCN: s_mov_b32 s14, s6 ; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 @@ -382,7 +382,7 @@ define hidden void @other_arg_use_workgroup_id_z(i32 %arg0) #1 { ; GCN-NOT: s13 ; GCN-NOT: s14 -; GCN-DAG: s_mov_b32 s12, s6 +; GCN-DAG: s_mov_b32 s12, s4 ; GCN-DAG: v_mov_b32_e32 v0, 0x22b ; GCN-NOT: s13 ; GCN-NOT: s14 @@ -400,7 +400,7 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_x() #1 { ; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workgroup_id_y: ; GCN-DAG: v_mov_b32_e32 v0, 0x22b -; GCN-DAG: s_mov_b32 s13, s7 +; GCN-DAG: s_mov_b32 s13, s5 ; GCN-DAG: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 @@ -415,7 +415,7 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_y() #1 { ; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workgroup_id_z: ; GCN-DAG: v_mov_b32_e32 v0, 0x22b -; GCN-DAG: s_mov_b32 s14, s7 +; GCN-DAG: s_mov_b32 s14, s5 ; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 @@ -474,7 +474,7 @@ define hidden void @use_every_sgpr_input() #1 { ; GCN: .amdhsa_user_sgpr_queue_ptr 1 ; GCN: .amdhsa_user_sgpr_kernarg_segment_ptr 1 ; GCN: .amdhsa_user_sgpr_dispatch_id 1 -; GCN: .amdhsa_user_sgpr_flat_scratch_init 1 +; GCN: .amdhsa_user_sgpr_flat_scratch_init 0 ; GCN: 
.amdhsa_user_sgpr_private_segment_size 0 ; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 ; GCN: .amdhsa_system_sgpr_workgroup_id_x 1 @@ -499,7 +499,7 @@ define amdgpu_kernel void @kern_indirect_use_every_sgpr_input(i8) #1 { ; GCN: .amdhsa_user_sgpr_queue_ptr 1 ; GCN: .amdhsa_user_sgpr_kernarg_segment_ptr 0 ; GCN: .amdhsa_user_sgpr_dispatch_id 1 -; GCN: .amdhsa_user_sgpr_flat_scratch_init 1 +; GCN: .amdhsa_user_sgpr_flat_scratch_init 0 ; GCN: .amdhsa_user_sgpr_private_segment_size 0 ; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 ; GCN: .amdhsa_system_sgpr_workgroup_id_x 1 diff --git a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll index ee4a2ed883b63..3fe3cafd729a7 100644 --- a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll +++ b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll @@ -68,7 +68,7 @@ define amdgpu_kernel void @fadd( ptr addrspace(1) %r, ptr addrspace(1) %a, - ptr addrspace(1) %b) { + ptr addrspace(1) %b) #0 { entry: %a.val = load float, ptr addrspace(1) %a %b.val = load float, ptr addrspace(1) %b @@ -80,7 +80,7 @@ entry: define amdgpu_kernel void @fsub( ptr addrspace(1) %r, ptr addrspace(1) %a, - ptr addrspace(1) %b) { + ptr addrspace(1) %b) #0 { entry: %a.val = load float, ptr addrspace(1) %a %b.val = load float, ptr addrspace(1) %b @@ -99,7 +99,9 @@ define amdgpu_kernel void @empty( i32 %i, ptr addrspace(1) %r, ptr addrspace(1) %a, - ptr addrspace(1) %b) { + ptr addrspace(1) %b) #0 { entry: ret void } + +attributes #0 = { "amdgpu-no-flat-scratch-init" } diff --git a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll index c17cf1cd6bca4..c167834470e3b 100644 --- a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll @@ -5,6 +5,9 @@ define protected amdgpu_kernel void @_Z11test_kernelPii(ptr addrspace(1) nocapture %Ad.coerce, i32 %s) local_unnamed_addr #5 { ; CHECK-LABEL: _Z11test_kernelPii: ; 
CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-NEXT: s_add_i32 s12, s12, s17 +; CHECK-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-NEXT: s_load_dword s0, s[8:9], 0x2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmp_lg_u32 s0, 3 diff --git a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll index fcb8fa5997b7e..fc17d9288bf40 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll @@ -6,6 +6,8 @@ define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr %arg3, ptr %arg4, ptr %arg5, ptr %arg6, ptr %arg7, ptr %arg8, ptr %arg9) { ; CHECK-LABEL: eggs: ; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; CHECK-NEXT: s_load_dword s0, s[8:9], 0x0 ; CHECK-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x8 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll index 39554e05c96b4..f964170ccdda5 100644 --- a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll +++ b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll @@ -1,11 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-annotate-kernel-features %s | FileCheck -check-prefix=AKF_GCN %s ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck -check-prefix=ATTRIBUTOR_GCN %s define internal void @indirect() { -; AKF_GCN-LABEL: define {{[^@]+}}@indirect() { -; AKF_GCN-NEXT: ret void -; ; ATTRIBUTOR_GCN-LABEL: define {{[^@]+}}@indirect ; ATTRIBUTOR_GCN-SAME: () #[[ATTR0:[0-9]+]] { ; ATTRIBUTOR_GCN-NEXT: ret void @@ -14,14 +10,6 @@ define internal 
void @indirect() { } define amdgpu_kernel void @test_simple_indirect_call() #0 { -; AKF_GCN-LABEL: define {{[^@]+}}@test_simple_indirect_call -; AKF_GCN-SAME: () #[[ATTR0:[0-9]+]] { -; AKF_GCN-NEXT: [[FPTR:%.*]] = alloca ptr, align 8, addrspace(5) -; AKF_GCN-NEXT: store ptr @indirect, ptr addrspace(5) [[FPTR]], align 8 -; AKF_GCN-NEXT: [[FP:%.*]] = load ptr, ptr addrspace(5) [[FPTR]], align 8 -; AKF_GCN-NEXT: call void [[FP]]() -; AKF_GCN-NEXT: ret void -; ; ATTRIBUTOR_GCN-LABEL: define {{[^@]+}}@test_simple_indirect_call ; ATTRIBUTOR_GCN-SAME: () #[[ATTR1:[0-9]+]] { ; ATTRIBUTOR_GCN-NEXT: [[FPTR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -40,7 +28,6 @@ define amdgpu_kernel void @test_simple_indirect_call() #0 { attributes #0 = { "amdgpu-no-dispatch-id" } ;. -; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-no-dispatch-id" "amdgpu-stack-objects" } ;. ; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } diff --git 
a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll index 9104dc68eb9b4..72913d2596ebf 100644 --- a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll +++ b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1 ; CHECK-NEXT: s_load_dword s2, s[8:9], 0x0 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-NEXT: s_load_dword s6, s[8:9], 0x4 -; CHECK-NEXT: s_add_u32 s24, s24, s15 +; CHECK-NEXT: s_add_u32 s24, s24, s17 ; CHECK-NEXT: s_addc_u32 s25, s25, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_bitcmp1_b32 s2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll index f3aec696abdee..e6f02295e67d5 100644 --- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll +++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll @@ -94,6 +94,9 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; GFX7-LABEL: s_add_co_br_user: ; GFX7: ; %bb.0: ; %bb ; GFX7-NEXT: s_load_dword s2, s[8:9], 0x0 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_i32 s0, s2, s2 ; GFX7-NEXT: s_cmp_lt_u32 s0, s2 diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll index 0c25ca5076790..fac9f5bf826a6 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll @@ -5,6 +5,9 @@ define amdgpu_kernel void @extract_vector_elt_v1i8(ptr addrspace(1) %out, <1 x i8> %foo) #0 { ; SI-LABEL: extract_vector_elt_v1i8: ; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 flat_scratch_lo, 
s13 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; SI-NEXT: s_load_dword s2, s[8:9], 0x2 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -18,6 +21,9 @@ define amdgpu_kernel void @extract_vector_elt_v1i8(ptr addrspace(1) %out, <1 x i ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -32,6 +38,9 @@ define amdgpu_kernel void @extract_vector_elt_v1i8(ptr addrspace(1) %out, <1 x i define amdgpu_kernel void @extract_vector_elt_v2i8(ptr addrspace(1) %out, <2 x i8> %foo) #0 { ; SI-LABEL: extract_vector_elt_v2i8: ; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; SI-NEXT: s_load_dword s2, s[8:9], 0x2 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -54,6 +63,9 @@ define amdgpu_kernel void @extract_vector_elt_v2i8(ptr addrspace(1) %out, <2 x i ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 8 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -80,6 +92,9 @@ define amdgpu_kernel void @extract_vector_elt_v2i8(ptr addrspace(1) %out, <2 x i define amdgpu_kernel void @extract_vector_elt_v3i8(ptr addrspace(1) %out, <3 x i8> %foo) #0 { ; SI-LABEL: extract_vector_elt_v3i8: ; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; SI-NEXT: s_load_dword s2, s[8:9], 0x2 ; 
SI-NEXT: s_waitcnt lgkmcnt(0) @@ -102,6 +117,9 @@ define amdgpu_kernel void @extract_vector_elt_v3i8(ptr addrspace(1) %out, <3 x i ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -128,6 +146,9 @@ define amdgpu_kernel void @extract_vector_elt_v3i8(ptr addrspace(1) %out, <3 x i define amdgpu_kernel void @extract_vector_elt_v4i8(ptr addrspace(1) %out, <4 x i8> %foo) #0 { ; SI-LABEL: extract_vector_elt_v4i8: ; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; SI-NEXT: s_load_dword s2, s[8:9], 0x2 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -150,6 +171,9 @@ define amdgpu_kernel void @extract_vector_elt_v4i8(ptr addrspace(1) %out, <4 x i ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -176,6 +200,9 @@ define amdgpu_kernel void @extract_vector_elt_v4i8(ptr addrspace(1) %out, <4 x i define amdgpu_kernel void @extract_vector_elt_v8i8(<8 x i8> %foo) #0 { ; SI-LABEL: extract_vector_elt_v8i8: ; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-NEXT: s_load_dword s0, s[8:9], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s1, s0, 16 @@ -192,10 +219,13 @@ define amdgpu_kernel void @extract_vector_elt_v8i8(<8 x i8> %foo) #0 { ; VI-LABEL: extract_vector_elt_v8i8: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s0, 
s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: v_mov_b32_e32 v0, 0 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s1, s0, 16 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_byte v[0:1], v3 @@ -213,6 +243,9 @@ define amdgpu_kernel void @extract_vector_elt_v8i8(<8 x i8> %foo) #0 { define amdgpu_kernel void @extract_vector_elt_v16i8(ptr addrspace(1) %out, <16 x i8> %foo) #0 { ; SI-LABEL: extract_vector_elt_v16i8: ; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; SI-NEXT: s_load_dword s2, s[8:9], 0x4 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -235,6 +268,9 @@ define amdgpu_kernel void @extract_vector_elt_v16i8(ptr addrspace(1) %out, <16 x ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x10 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -261,6 +297,9 @@ define amdgpu_kernel void @extract_vector_elt_v16i8(ptr addrspace(1) %out, <16 x define amdgpu_kernel void @extract_vector_elt_v32i8(<32 x i8> %foo) #0 { ; SI-LABEL: extract_vector_elt_v32i8: ; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-NEXT: s_load_dword s0, s[8:9], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s1, s0, 16 @@ -277,10 +316,13 @@ define amdgpu_kernel void @extract_vector_elt_v32i8(<32 x i8> %foo) #0 { ; VI-LABEL: extract_vector_elt_v32i8: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s0, s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 ; 
VI-NEXT: v_mov_b32_e32 v0, 0 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s1, s0, 16 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_byte v[0:1], v3 @@ -298,6 +340,9 @@ define amdgpu_kernel void @extract_vector_elt_v32i8(<32 x i8> %foo) #0 { define amdgpu_kernel void @extract_vector_elt_v64i8(ptr addrspace(1) %out, <64 x i8> %foo) #0 { ; SI-LABEL: extract_vector_elt_v64i8: ; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; SI-NEXT: s_load_dword s2, s[8:9], 0x10 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -320,6 +365,9 @@ define amdgpu_kernel void @extract_vector_elt_v64i8(ptr addrspace(1) %out, <64 x ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x40 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -351,6 +399,9 @@ define amdgpu_kernel void @extract_vector_elt_v64i8(ptr addrspace(1) %out, <64 x define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(ptr addrspace(1) %out, [8 x i32], <2 x i8> %foo, [8 x i32], i32 %idx) #0 { ; SI-LABEL: dynamic_extract_vector_elt_v2i8: ; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-NEXT: s_load_dword s2, s[8:9], 0xa ; SI-NEXT: s_load_dword s3, s[8:9], 0x13 ; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 @@ -370,11 +421,14 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(ptr addrspace(1) %out ; VI-NEXT: s_load_dword s2, s[8:9], 0x4c ; VI-NEXT: s_load_dword s3, s[8:9], 0x28 ; 
VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b32 s2, s2, 3 ; VI-NEXT: s_and_b32 s3, s3, 0xffff ; VI-NEXT: s_lshr_b32 s2, s3, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_byte v[0:1], v2 @@ -388,6 +442,9 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(ptr addrspace(1) %out define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(ptr addrspace(1) %out, [8 x i32], <3 x i8> %foo, [8 x i32], i32 %idx) #0 { ; SI-LABEL: dynamic_extract_vector_elt_v3i8: ; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-NEXT: s_load_dword s2, s[8:9], 0x13 ; SI-NEXT: s_load_dword s3, s[8:9], 0xa ; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 @@ -406,10 +463,13 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(ptr addrspace(1) %out ; VI-NEXT: s_load_dword s2, s[8:9], 0x4c ; VI-NEXT: s_load_dword s3, s[8:9], 0x28 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b32 s2, s2, 3 ; VI-NEXT: s_lshr_b32 s2, s3, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_byte v[0:1], v2 @@ -424,6 +484,9 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(ptr addrspace(1) %out define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i32 %idx) #0 { ; SI-LABEL: dynamic_extract_vector_elt_v4i8: ; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; 
SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; SI-NEXT: s_load_dword s4, s[8:9], 0xc ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -442,6 +505,9 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(ptr addrspace(1) %out ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x30 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -463,6 +529,9 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(ptr addrspace(1) %out define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, i32 %idx) #0 { ; SI-LABEL: dynamic_extract_vector_elt_v8i8: ; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; SI-NEXT: s_load_dword s4, s[8:9], 0x4 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -481,6 +550,9 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(ptr addrspace(1) %out ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -502,6 +574,9 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(ptr addrspace(1) %out define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0123() #0 { ; SI-LABEL: reduce_load_vector_v8i8_extract_0123: ; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_load_dword s0, s[0:1], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -526,6 +601,9 
@@ define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0123() #0 { ; VI: ; %bb.0: ; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s1, s0, 8 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -558,6 +636,9 @@ define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0123() #0 { define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0145() #0 { ; SI-LABEL: reduce_load_vector_v8i8_extract_0145: ; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -581,6 +662,9 @@ define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0145() #0 { ; VI: ; %bb.0: ; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s2, s0, 8 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -612,6 +696,9 @@ define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0145() #0 { define amdgpu_kernel void @reduce_load_vector_v8i8_extract_45() #0 { ; SI-LABEL: reduce_load_vector_v8i8_extract_45: ; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-NEXT: s_mov_b64 s[0:1], 4 ; SI-NEXT: s_load_dword s0, s[0:1], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -628,6 +715,9 @@ define amdgpu_kernel void @reduce_load_vector_v8i8_extract_45() #0 { ; VI: ; %bb.0: ; VI-NEXT: s_mov_b64 s[0:1], 4 ; VI-NEXT: s_load_dword s0, s[0:1], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 
8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s1, s0, 8 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -649,6 +739,9 @@ define amdgpu_kernel void @reduce_load_vector_v8i8_extract_45() #0 { define amdgpu_kernel void @reduce_load_vector_v16i8_extract_0145() #0 { ; SI-LABEL: reduce_load_vector_v16i8_extract_0145: ; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -672,6 +765,9 @@ define amdgpu_kernel void @reduce_load_vector_v16i8_extract_0145() #0 { ; VI: ; %bb.0: ; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s2, s0, 8 ; VI-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll index 32f75f3835226..7b6a363c42708 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll @@ -14,6 +14,9 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0x7fff ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -26,6 +29,9 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0x7fff ; 
VI-NEXT: v_mov_b32_e32 v0, s0 @@ -80,6 +86,9 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0x7fff ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -92,6 +101,9 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) { ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0x7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -145,6 +157,9 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -157,6 +172,9 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -196,6 +214,9 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; CI-LABEL: s_fabs_v4f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s3, s3, 0x7fff7fff ; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff @@ -209,6 +230,9 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; VI-LABEL: s_fabs_v4f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s3, s3, 0x7fff7fff ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff @@ -251,6 +275,9 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half ; CI-LABEL: fabs_fold_f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s0, s[8:9], 0x2 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0| ; CI-NEXT: s_lshr_b32 s0, s0, 16 @@ -268,6 +295,9 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -325,6 +355,9 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 @@ -339,6 +372,9 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], 
s[8:9], 0x8 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -386,6 +422,9 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -398,6 +437,9 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -441,6 +483,9 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v0 @@ -467,6 +512,9 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, 
vcc, s2, v0 @@ -523,9 +571,12 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v0 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dword v0, v[0:1] ; CI-NEXT: s_lshr_b32 s2, s4, 16 @@ -551,9 +602,12 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v2, v[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -610,6 +664,9 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 { ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 @@ -633,6 +690,9 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 { ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: 
v_add_u32_e32 v0, vcc, s0, v0 @@ -718,6 +778,9 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 @@ -736,6 +799,9 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll index 6496b70b4d697..60334e46a4454 100644 --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll @@ -74,6 +74,9 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad ; GFX7-ALIGNED-LABEL: global_store_2xi16_align2: ; GFX7-ALIGNED: ; %bb.0: ; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2 +; GFX7-ALIGNED-NEXT: s_add_i32 s12, s12, s17 +; GFX7-ALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-ALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 1 ; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 @@ -90,6 +93,9 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad ; GFX7-UNALIGNED-LABEL: global_store_2xi16_align2: ; GFX7-UNALIGNED: ; %bb.0: ; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2 +; GFX7-UNALIGNED-NEXT: s_add_i32 s12, s12, s17 
+; GFX7-UNALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-UNALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001 ; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 @@ -216,8 +222,10 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad ; GFX7-ALIGNED-LABEL: global_store_2xi16_align1: ; GFX7-ALIGNED: ; %bb.0: ; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2 +; GFX7-ALIGNED-NEXT: s_add_i32 s12, s12, s17 +; GFX7-ALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-ALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 1 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-ALIGNED-NEXT: s_add_u32 s2, s0, 2 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 @@ -227,6 +235,7 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad ; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v2 ; GFX7-ALIGNED-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-ALIGNED-NEXT: s_add_u32 s0, s0, 3 ; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v3 @@ -243,6 +252,9 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad ; GFX7-UNALIGNED-LABEL: global_store_2xi16_align1: ; GFX7-UNALIGNED: ; %bb.0: ; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2 +; GFX7-UNALIGNED-NEXT: s_add_i32 s12, s12, s17 +; GFX7-UNALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-UNALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001 ; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 @@ -351,6 +363,9 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad ; GFX7-ALIGNED-LABEL: global_store_2xi16_align4: ; GFX7-ALIGNED: ; %bb.0: ; GFX7-ALIGNED-NEXT: s_load_dwordx2 
s[0:1], s[8:9], 0x2 +; GFX7-ALIGNED-NEXT: s_add_i32 s12, s12, s17 +; GFX7-ALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-ALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001 ; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 @@ -361,6 +376,9 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad ; GFX7-UNALIGNED-LABEL: global_store_2xi16_align4: ; GFX7-UNALIGNED: ; %bb.0: ; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2 +; GFX7-UNALIGNED-NEXT: s_add_i32 s12, s12, s17 +; GFX7-UNALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-UNALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001 ; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll index 4e12a30c6f6f4..9919497acea73 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll @@ -24,6 +24,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1 ; GFX678-LABEL: v_test_canonicalize_var_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -76,6 +79,9 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dword s2, s[8:9], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX6-NEXT: s_add_i32 s12, s12, s17 +; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -87,6 +93,9 
@@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s2, s[8:9], 0x8 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -132,6 +141,9 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou ; GFX678-LABEL: v_test_canonicalize_fabs_var_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -184,6 +196,9 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1 ; GFX678-LABEL: v_test_canonicalize_fneg_fabs_var_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -237,6 +252,9 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou ; GFX678-LABEL: v_test_canonicalize_fneg_var_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -289,6 +307,9 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou ; GFX678-LABEL: test_fold_canonicalize_undef_f32: ; GFX678: ; 
%bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -328,6 +349,9 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out) ; GFX678-LABEL: test_fold_canonicalize_p0_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -367,6 +391,9 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out) ; GFX678-LABEL: test_fold_canonicalize_n0_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -409,6 +436,9 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out) ; GFX678-LABEL: test_fold_canonicalize_p1_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 1.0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -449,6 +479,9 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out) ; GFX678-LABEL: test_fold_canonicalize_n1_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; 
GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, -1.0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -489,6 +522,9 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) % ; GFX678-LABEL: test_fold_canonicalize_literal_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x41800000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -529,6 +565,9 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -568,10 +607,13 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 ; GFX678-NEXT: s_mov_b32 s2, 0x7fffff -; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 +; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 ; GFX678-NEXT: flat_store_dword v[0:1], v2 ; GFX678-NEXT: s_endpgm @@ -612,10 +654,13 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out: ; GFX678: ; %bb.0: 
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 ; GFX678-NEXT: s_mov_b32 s2, 0x7fffff -; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 +; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 ; GFX678-NEXT: flat_store_dword v[0:1], v2 ; GFX678-NEXT: s_endpgm @@ -656,10 +701,13 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 ; GFX678-NEXT: s_mov_b32 s2, 0x7fffff -; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 +; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 ; GFX678-NEXT: flat_store_dword v[0:1], v2 ; GFX678-NEXT: s_endpgm @@ -700,6 +748,9 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr ad ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -740,6 +791,9 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: 
s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -782,6 +836,9 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr ad ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal1_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x807fffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -822,6 +879,9 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out ; GFX678-LABEL: test_fold_canonicalize_qnan_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -862,6 +922,9 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrsp ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -902,6 +965,9 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrsp ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 
0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -942,6 +1008,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace( ; GFX678-LABEL: test_fold_canonicalize_snan0_value_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -982,6 +1051,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace( ; GFX678-LABEL: test_fold_canonicalize_snan1_value_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -1022,6 +1094,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace( ; GFX678-LABEL: test_fold_canonicalize_snan2_value_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -1062,6 +1137,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace( ; GFX678-LABEL: test_fold_canonicalize_snan3_value_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: 
v_mov_b32_e32 v0, s0 @@ -1102,6 +1180,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1 ; GFX678-LABEL: v_test_canonicalize_var_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -1153,6 +1234,9 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do ; GFX6-LABEL: s_test_canonicalize_var_f64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX6-NEXT: s_add_i32 s12, s12, s17 +; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_max_f64 v[2:3], s[2:3], s[2:3] ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1163,6 +1247,9 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do ; GFX8-LABEL: s_test_canonicalize_var_f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], s[2:3], s[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, s0 @@ -1205,6 +1292,9 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %ou ; GFX678-LABEL: v_test_canonicalize_fabs_var_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -1257,6 +1347,9 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1 ; GFX678-LABEL: 
v_test_canonicalize_fneg_fabs_var_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -1310,6 +1403,9 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %ou ; GFX678-LABEL: v_test_canonicalize_fneg_var_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -1362,10 +1458,13 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) ; GFX678-LABEL: test_fold_canonicalize_p0_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: v_mov_b32_e32 v1, v0 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v1, v0 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1407,10 +1506,13 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out) ; GFX678-LABEL: test_fold_canonicalize_n0_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: 
v_bfrev_b32_e32 v1, 1 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1450,10 +1552,13 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out) ; GFX678-LABEL: test_fold_canonicalize_p1_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1491,10 +1596,13 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out) ; GFX678-LABEL: test_fold_canonicalize_n1_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: v_mov_b32_e32 v1, 0xbff00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v1, 0xbff00000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1532,10 +1640,13 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) % ; GFX678-LABEL: test_fold_canonicalize_literal_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x40300000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: 
v_mov_b32_e32 v1, 0x40300000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1573,10 +1684,13 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: v_mov_b32_e32 v1, v0 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v1, v0 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1618,10 +1732,13 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, -1 -; GFX678-NEXT: v_mov_b32_e32 v1, 0xfffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v1, 0xfffff ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1662,10 +1779,13 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: 
v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1705,10 +1825,13 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal1_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, -1 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x800fffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x800fffff ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1749,10 +1872,13 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out ; GFX678-LABEL: test_fold_canonicalize_qnan_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1790,10 +1916,13 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1831,10 +1960,13 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1872,10 +2004,13 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace( ; GFX678-LABEL: test_fold_canonicalize_snan0_value_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1913,10 +2048,13 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace( ; GFX678-LABEL: test_fold_canonicalize_snan1_value_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: v_mov_b32_e32 v1, 
0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1954,10 +2092,13 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace( ; GFX678-LABEL: test_fold_canonicalize_snan2_value_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1995,10 +2136,13 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace( ; GFX678-LABEL: test_fold_canonicalize_snan3_value_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -2037,6 +2181,9 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX6-NEXT: s_add_i32 s12, s12, s17 +; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; 
GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 @@ -2054,6 +2201,9 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -2117,6 +2267,9 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX6-NEXT: s_add_i32 s12, s12, s17 +; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 @@ -2134,6 +2287,9 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -2197,6 +2353,9 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; GFX6-NEXT: s_add_i32 s12, s12, s17 +; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 @@ -2215,6 +2374,9 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 
s[0:3], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -2279,6 +2441,9 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX6-NEXT: s_add_i32 s12, s12, s17 +; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 @@ -2302,6 +2467,9 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -2368,6 +2536,9 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) % ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX6-NEXT: s_add_i32 s12, s12, s17 +; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 @@ -2385,6 +2556,9 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) % ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 
; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -2448,6 +2622,9 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) % ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX6-NEXT: s_add_i32 s12, s12, s17 +; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 @@ -2465,6 +2642,9 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) % ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -2529,6 +2709,9 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) % ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; GFX6-NEXT: s_add_i32 s12, s12, s17 +; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 @@ -2547,6 +2730,9 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) % ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -2612,6 +2798,9 @@ define amdgpu_kernel void 
@test_canonicalize_value_v2f16_denorm(ptr addrspace(1) ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX6-NEXT: s_add_i32 s12, s12, s17 +; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 @@ -2635,6 +2824,9 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -2700,6 +2892,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX6-NEXT: s_add_i32 s12, s12, s17 +; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 @@ -2717,6 +2912,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll b/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll index fee6540f43c64..fc316b736d5f1 100644 --- 
a/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll @@ -22,12 +22,14 @@ ; NOHSA-DEFAULT: buffer_store_dword ; NOHSA-NODEFAULT: flat_store_dword ; NOHSA-NOADDR64: flat_store_dword -define amdgpu_kernel void @test(ptr addrspace(1) %out) { +define amdgpu_kernel void @test(ptr addrspace(1) %out) #0 { entry: store i32 0, ptr addrspace(1) %out ret void } +; ALL-LABEL: {{^}}test_addr64: + ; HSA-DEFAULT: flat_store_dword ; HSA-NODEFAULT: buffer_store_dword ; HSA-NOADDR64: flat_store_dword @@ -35,7 +37,7 @@ entry: ; NOHSA-DEFAULT: buffer_store_dword ; NOHSA-NODEFAULT: flat_store_dword ; NOHSA-NOADDR64: flat_store_dword -define amdgpu_kernel void @test_addr64(ptr addrspace(1) %out) { +define amdgpu_kernel void @test_addr64(ptr addrspace(1) %out) #0 { entry: %out.addr = alloca ptr addrspace(1), align 4, addrspace(5) @@ -51,5 +53,7 @@ entry: ret void } +attributes #0 = { "amdgpu-no-flat-scratch-init" } + !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll index 45223a24e021a..a59382ba20dc5 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll @@ -8,28 +8,34 @@ ; RUN: llc < %s -mtriple=amdgcn -mcpu=stoney -mattr=+xnack | FileCheck -check-prefix=VI-XNACK -check-prefix=GCN %s ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck -check-prefixes=GCN %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-xnack | FileCheck -check-prefixes=VI-NOXNACK,HSA-VI-NOXNACK,GCN %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=+xnack | FileCheck -check-prefixes=VI-XNACK,HSA-VI-XNACK,GCN %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-xnack | FileCheck -check-prefixes=HSA-VI-NOXNACK,GCN %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=+xnack | FileCheck 
-check-prefixes=HSA-VI-XNACK,GCN %s ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch | FileCheck -check-prefixes=GCN %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,-xnack | FileCheck -check-prefixes=HSA-VI-NOXNACK,GFX9-ARCH-FLAT,GCN %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,+xnack | FileCheck -check-prefixes=HSA-VI-XNACK,GFX9-ARCH-FLAT,GCN %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,-xnack | FileCheck -check-prefixes=GFX9-ARCH-FLAT-NOXNACK,GCN %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,+xnack | FileCheck -check-prefixes=GFX9-ARCH-FLAT-XNACK,GCN %s ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch | FileCheck -check-prefixes=GCN %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,-xnack | FileCheck -check-prefixes=HSA-VI-NOXNACK,GFX10-ARCH-FLAT,GCN %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,+xnack | FileCheck -check-prefixes=HSA-VI-XNACK,GFX10-ARCH-FLAT,GCN %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,-xnack | FileCheck -check-prefixes=GFX10-ARCH-FLAT-NOXNACK,GCN %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,+xnack | FileCheck -check-prefixes=GFX10-ARCH-FLAT-XNACK,GCN %s ; GCN-LABEL: {{^}}no_vcc_no_flat: ; NOT-HSA-CI: .amdhsa_reserve_xnack_mask ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1 +; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1 +; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1 ; CI: ; TotalNumSgprs: 8 ; VI-NOXNACK: ; TotalNumSgprs: 8 +; HSA-VI-NOXNACK: ; TotalNumSgprs: 8 ; VI-XNACK: ; TotalNumSgprs: 12 -; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14 -; GFX10-ARCH-FLAT: ; 
TotalNumSgprs: 8 +; HSA-VI-XNACK: ; TotalNumSgprs: 12 +; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14 +; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14 +; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 8 +; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 8 define amdgpu_kernel void @no_vcc_no_flat() { entry: call void asm sideeffect "", "~{s7}"() @@ -41,12 +47,18 @@ entry: ; NOT-HSA-CI: .amdhsa_reserve_xnack_mask ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1 +; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1 +; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1 ; CI: ; TotalNumSgprs: 10 ; VI-NOXNACK: ; TotalNumSgprs: 10 +; HSA-VI-NOXNACK: ; TotalNumSgprs: 10 ; VI-XNACK: ; TotalNumSgprs: 12 -; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14 -; GFX10-ARCH-FLAT: ; TotalNumSgprs: 10 +; HSA-VI-XNACK: ; TotalNumSgprs: 12 +; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14 +; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14 +; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 10 +; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 10 define amdgpu_kernel void @vcc_no_flat() { entry: call void asm sideeffect "", "~{s7},~{vcc}"() @@ -58,12 +70,18 @@ entry: ; NOT-HSA-CI: .amdhsa_reserve_xnack_mask ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1 +; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1 +; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1 ; CI: ; TotalNumSgprs: 12 ; VI-NOXNACK: ; TotalNumSgprs: 14 +; HSA-VI-NOXNACK: ; TotalNumSgprs: 24 ; VI-XNACK: ; TotalNumSgprs: 14 -; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14 -; GFX10-ARCH-FLAT: ; TotalNumSgprs: 8 +; HSA-VI-XNACK: ; TotalNumSgprs: 24 +; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14 +; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14 +; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 8 +; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 8 define amdgpu_kernel void @no_vcc_flat() { entry: call void asm sideeffect "", "~{s7},~{flat_scratch}"() @@ -75,12 +93,18 @@ entry: ; NOT-HSA-CI: .amdhsa_reserve_xnack_mask 
; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1 +; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1 +; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1 ; CI: ; TotalNumSgprs: 12 ; VI-NOXNACK: ; TotalNumSgprs: 14 +; HSA-VI-NOXNACK: ; TotalNumSgprs: 24 ; VI-XNACK: ; TotalNumSgprs: 14 -; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14 -; GFX10-ARCH-FLAT: ; TotalNumSgprs: 10 +; HSA-VI-XNACK: ; TotalNumSgprs: 24 +; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14 +; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14 +; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 10 +; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 10 define amdgpu_kernel void @vcc_flat() { entry: call void asm sideeffect "", "~{s7},~{vcc},~{flat_scratch}"() @@ -95,12 +119,18 @@ entry: ; NOT-HSA-CI: .amdhsa_reserve_xnack_mask ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1 +; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1 +; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1 ; CI: NumSgprs: 4 ; VI-NOXNACK: NumSgprs: 6 +; HSA-VI-NOXNACK: NumSgprs: 24 ; VI-XNACK: NumSgprs: 6 -; GFX9-ARCH-FLAT: ; TotalNumSgprs: 6 -; GFX10-ARCH-FLAT: ; TotalNumSgprs: 0 +; HSA-VI-XNACK: NumSgprs: 24 +; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 6 +; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 6 +; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 0 +; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 0 define amdgpu_kernel void @use_flat_scr() #0 { entry: call void asm sideeffect "; clobber ", "~{flat_scratch}"() @@ -115,9 +145,13 @@ entry: ; CI: NumSgprs: 4 ; VI-NOXNACK: NumSgprs: 6 +; HSA-VI-NOXNACK: NumSgprs: 24 ; VI-XNACK: NumSgprs: 6 -; GFX9-ARCH-FLAT: ; TotalNumSgprs: 6 -; GFX10-ARCH-FLAT: ; TotalNumSgprs: 0 +; HSA-VI-XNACK: NumSgprs: 24 +; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 6 +; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 6 +; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 0 +; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 0 define amdgpu_kernel void @use_flat_scr_lo() #0 { entry: call void 
asm sideeffect "; clobber ", "~{flat_scratch_lo}"() @@ -129,12 +163,18 @@ entry: ; NOT-HSA-CI: .amdhsa_reserve_xnack_mask ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1 +; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1 +; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1 ; CI: NumSgprs: 4 ; VI-NOXNACK: NumSgprs: 6 +; HSA-VI-NOXNACK: NumSgprs: 24 ; VI-XNACK: NumSgprs: 6 -; GFX9-ARCH-FLAT: ; TotalNumSgprs: 6 -; GFX10-ARCH-FLAT: ; TotalNumSgprs: 0 +; HSA-VI-XNACK: NumSgprs: 24 +; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 6 +; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 6 +; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 0 +; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 0 define amdgpu_kernel void @use_flat_scr_hi() #0 { entry: call void asm sideeffect "; clobber ", "~{flat_scratch_hi}"() diff --git a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll index 64be9cb72a6ee..fb2448fb80744 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll @@ -16,6 +16,9 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, flo ; VI-LABEL: multiple_fadd_use_test_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f32_e64 v0, s3, -1.0 ; VI-NEXT: v_add_f32_e64 v1, s2, -1.0 @@ -80,8 +83,11 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, flo ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x8 ; VI-NEXT: s_load_dword s3, s[8:9], 0x2c +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_add_u32 s2, s0, 4 ; 
VI-NEXT: v_add_f32_e64 v2, s4, s4 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -139,6 +145,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, flo ; VI-LABEL: multiple_use_fadd_fmad_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s4, s0, 4 @@ -194,6 +203,9 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(ptr addrspace(1) %ou ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s6, s4, 4 ; VI-NEXT: v_mov_b32_e32 v0, s1 @@ -255,6 +267,9 @@ define amdgpu_kernel void @fmul_x2_xn2_f32(ptr addrspace(1) %out, float %x, floa ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f32_e64 v0, s2, -4.0 ; VI-NEXT: v_mul_f32_e32 v2, s2, v0 @@ -303,10 +318,13 @@ define amdgpu_kernel void @fmul_x2_xn3_f32(ptr addrspace(1) %out, float %x, floa ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0xc0c00000 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f32_e32 v0, s2, v0 ; VI-NEXT: v_mul_f32_e32 v2, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -350,6 +368,9 @@ define amdgpu_kernel void 
@multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; VI-DENORM: ; %bb.0: ; VI-DENORM-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-DENORM-NEXT: s_add_i32 s12, s12, s17 +; VI-DENORM-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-DENORM-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: s_lshr_b32 s3, s2, 16 ; VI-DENORM-NEXT: v_add_f16_e64 v0, s2, -1.0 @@ -368,6 +389,9 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; VI-FLUSH: ; %bb.0: ; VI-FLUSH-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-FLUSH-NEXT: s_add_i32 s12, s12, s17 +; VI-FLUSH-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-FLUSH-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: s_lshr_b32 s3, s2, 16 ; VI-FLUSH-NEXT: v_add_f16_e64 v0, s2, -1.0 @@ -482,6 +506,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; VI-DENORM: ; %bb.0: ; VI-DENORM-NEXT: s_load_dword s4, s[8:9], 0x8 ; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-DENORM-NEXT: s_add_i32 s12, s12, s17 +; VI-DENORM-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-DENORM-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: s_lshr_b32 s3, s4, 16 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s3 @@ -503,6 +530,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; VI-FLUSH: ; %bb.0: ; VI-FLUSH-NEXT: s_load_dword s4, s[8:9], 0x8 ; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-FLUSH-NEXT: s_add_i32 s12, s12, s17 +; VI-FLUSH-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-FLUSH-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: s_lshr_b32 s3, s4, 16 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0 @@ -599,6 +629,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, 
i16 ; VI-DENORM: ; %bb.0: ; VI-DENORM-NEXT: s_load_dword s4, s[8:9], 0x8 ; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-DENORM-NEXT: s_add_i32 s12, s12, s17 +; VI-DENORM-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-DENORM-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: s_lshr_b32 s3, s4, 16 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s3 @@ -620,6 +653,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; VI-FLUSH: ; %bb.0: ; VI-FLUSH-NEXT: s_load_dword s4, s[8:9], 0x8 ; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-FLUSH-NEXT: s_add_i32 s12, s12, s17 +; VI-FLUSH-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-FLUSH-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: s_lshr_b32 s3, s4, 16 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s3 @@ -718,6 +754,8 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 ; VI-DENORM-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 ; VI-DENORM-NEXT: s_load_dword s6, s[8:9], 0x8 +; VI-DENORM-NEXT: s_add_i32 s12, s12, s17 +; VI-DENORM-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: s_lshr_b32 s0, s0, 16 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0 @@ -725,6 +763,7 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s1 ; VI-DENORM-NEXT: v_fma_f16 v3, |s6|, 2.0, v0 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s2 +; VI-DENORM-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-DENORM-NEXT: s_add_u32 s4, s2, 2 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3 ; VI-DENORM-NEXT: s_addc_u32 s5, s3, 0 @@ -741,6 +780,8 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 ; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 ; VI-FLUSH-NEXT: s_load_dword s6, s[8:9], 0x8 +; VI-FLUSH-NEXT: 
s_add_i32 s12, s12, s17 +; VI-FLUSH-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: s_lshr_b32 s0, s0, 16 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0 @@ -748,6 +789,7 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s1 ; VI-FLUSH-NEXT: v_mad_f16 v3, |s6|, 2.0, v0 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2 +; VI-FLUSH-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-FLUSH-NEXT: s_add_u32 s4, s2, 2 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 ; VI-FLUSH-NEXT: s_addc_u32 s5, s3, 0 @@ -847,6 +889,9 @@ define amdgpu_kernel void @fmul_x2_xn2_f16(ptr addrspace(1) %out, i16 zeroext %x ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f16_e64 v0, s2, -4.0 ; VI-NEXT: v_mul_f16_e32 v2, s2, v0 @@ -898,10 +943,13 @@ define amdgpu_kernel void @fmul_x2_xn3_f16(ptr addrspace(1) %out, i16 zeroext %x ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0xc600 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f16_e32 v0, s2, v0 ; VI-NEXT: v_mul_f16_e32 v2, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll index 67bec43078803..eca8c2837b0fc 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -9,6 +9,9 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha ; CI-LABEL: fneg_fabs_fadd_f16: ; CI: ; %bb.0: ; CI-NEXT: 
s_load_dword s0, s[8:9], 0x2 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0| ; CI-NEXT: s_lshr_b32 s0, s0, 16 @@ -26,6 +29,9 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -83,6 +89,9 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha ; CI-LABEL: fneg_fabs_fmul_f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s0, s[8:9], 0x2 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s1, s0, 0x7fff ; CI-NEXT: s_lshr_b32 s0, s0, 16 @@ -101,6 +110,9 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -162,6 +174,9 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_bitset1_b32 s2, 15 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -174,6 +189,9 @@ define amdgpu_kernel void 
@fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset1_b32 s2, 15 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -229,6 +247,9 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_bitset1_b32 s2, 15 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -241,6 +262,9 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) { ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset1_b32 s2, 15 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -294,6 +318,9 @@ define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace( ; CIVI-LABEL: v_fneg_fabs_f16: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -348,6 +375,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, < ; CI-LABEL: s_fneg_fabs_v2f16_non_bc_src: ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s0, s[8:9], 0x2 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt 
lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s1, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s1 @@ -370,7 +400,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, < ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: v_mov_b32_e32 v0, 0x4000 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v2, s3 @@ -379,6 +411,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, < ; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_or_b32_e32 v2, 0x80008000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -421,6 +454,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_or_b32 s2, s2, 0x80008000 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -433,6 +469,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_or_b32 s2, s2, 0x80008000 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -473,6 +512,9 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in ; CIVI-LABEL: fneg_fabs_v4f16: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: 
s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_or_b32 s3, s3, 0x80008000 ; CIVI-NEXT: s_or_b32 s2, s2, 0x80008000 @@ -516,6 +558,9 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x ; CI-LABEL: fold_user_fneg_fabs_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s0, s[8:9], 0x2 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s1, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e64 v1, |s1| @@ -537,7 +582,9 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: v_mov_b32_e32 v0, 0xc400 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v2, s3 @@ -545,6 +592,7 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x ; VI-NEXT: v_mul_f16_sdwa v0, |v2|, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v1, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -584,6 +632,9 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_and_b32 s0, s4, 0x7fff7fff @@ -601,6 +652,9 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; 
VI-NEXT: s_load_dword s4, s[8:9], 0x10 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_and_b32 s0, s4, 0x7fff7fff @@ -655,6 +709,9 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_bfe_u32 s0, s4, 0xf0010 @@ -679,7 +736,9 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10 +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: v_mov_b32_e32 v5, 0xc400 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_lshr_b32 s1, s4, 16 @@ -688,6 +747,7 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac ; VI-NEXT: s_and_b32 s0, s4, 0x7fff7fff ; VI-NEXT: v_mul_f16_sdwa v4, |v4|, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_mul_f16_e64 v5, |s4|, -4.0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_or_b32_e32 v4, v5, v4 ; VI-NEXT: v_mov_b32_e32 v5, s0 ; VI-NEXT: v_mov_b32_e32 v2, s2 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll index 781a2ca3146f5..058c273a65d99 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll @@ -1477,6 +1477,8 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x4 ; GFX7-NEXT: s_load_dwordx4 s[0:3], 
s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x6 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_bitcmp1_b32 s6, 0 ; GFX7-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1488,6 +1490,7 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i ; GFX7-NEXT: s_cselect_b32 s0, s0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, -v0, vcc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll index 23e4ba9fd4ed7..98e0b27cd955d 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll @@ -11,6 +11,9 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_xor_b32 s2, s2, 0x8000 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -23,6 +26,9 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s2, s[8:9], 0x8 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s2, s2, 0x8000 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -78,6 +84,9 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 
flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 @@ -92,6 +101,9 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -152,6 +164,9 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_xor_b32 s2, s2, 0x8000 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -164,6 +179,9 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s2, s[8:9], 0x8 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s2, s2, 0x8000 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -217,6 +235,9 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace( ; CI-LABEL: v_fneg_fold_f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -234,6 +255,9 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX8-LABEL: v_fneg_fold_f16: ; 
GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -289,6 +313,9 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) # ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_xor_b32 s2, s2, 0x80008000 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -301,6 +328,9 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) # ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s2, s[8:9], 0x8 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s2, s2, 0x80008000 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -340,14 +370,17 @@ define amdgpu_kernel void @s_fneg_v2f16_nonload(ptr addrspace(1) %out) #0 { ; CIVI-LABEL: s_fneg_v2f16_nonload: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 ; CIVI-NEXT: ;;#ASMSTART ; CIVI-NEXT: ; def s2 ; CIVI-NEXT: ;;#ASMEND ; CIVI-NEXT: s_xor_b32 s2, s2, 0x80008000 -; CIVI-NEXT: v_mov_b32_e32 v2, s2 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 +; CIVI-NEXT: v_mov_b32_e32 v2, s2 ; CIVI-NEXT: flat_store_dword v[0:1], v2 ; CIVI-NEXT: s_endpgm ; @@ -388,6 +421,9 @@ define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 
s[0:1], s[8:9], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 @@ -402,6 +438,9 @@ define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -449,6 +488,9 @@ define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_xor_b32 s2, s2, 0x80008000 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -461,6 +503,9 @@ define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s2, s[8:9], 0x8 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s2, s2, 0x80008000 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -501,6 +546,9 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspac ; CI-LABEL: v_fneg_fold_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, 
s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -527,6 +575,9 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspac ; GFX8-LABEL: v_fneg_fold_v2f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -572,6 +623,9 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 { ; CI-LABEL: v_extract_fneg_fold_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -593,6 +647,9 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 { ; GFX8-LABEL: v_extract_fneg_fold_v2f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -672,6 +729,9 @@ define amdgpu_kernel void @v_extract_fneg_no_fold_v2f16(ptr addrspace(1) %in) #0 ; CIVI-LABEL: v_extract_fneg_no_fold_v2f16: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll index a2fca33af1046..10573aad38a51 100644 --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -10,6 +10,9 @@ define 
amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 { ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -21,6 +24,9 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 { ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -46,6 +52,9 @@ define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -57,6 +66,9 @@ define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -81,6 +93,9 @@ define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg ; CIVI-LABEL: load_v3f16_arg: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CIVI-NEXT: 
s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_add_u32 s4, s0, 4 ; CIVI-NEXT: s_addc_u32 s5, s1, 0 @@ -114,6 +129,9 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg ; CIVI-LABEL: load_v4f16_arg: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 ; CIVI-NEXT: v_mov_b32_e32 v2, s2 @@ -139,6 +157,9 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v4, s4 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -153,6 +174,9 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -183,6 +207,9 @@ define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> % ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s3, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 @@ -196,6 +223,9 @@ define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> % ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], 
s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 @@ -227,6 +257,9 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -238,6 +271,9 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f32_f16_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -265,6 +301,9 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2 ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s3, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 @@ -278,6 +317,9 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2 ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 @@ -308,6 +350,9 @@ define amdgpu_kernel void 
@extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 ; CI-LABEL: extload_v3f16_to_v3f32_arg: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s3 @@ -321,6 +366,9 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 ; VI-LABEL: extload_v3f16_to_v3f32_arg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s2, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 @@ -351,6 +399,9 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 ; CI-LABEL: extload_v4f16_to_v4f32_arg: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s3, 16 ; CI-NEXT: s_lshr_b32 s5, s2, 16 @@ -366,6 +417,9 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 ; VI-LABEL: extload_v4f16_to_v4f32_arg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s3, 16 ; VI-NEXT: s_lshr_b32 s5, s2, 16 @@ -401,6 +455,9 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; 
CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s6, s1, 16 ; CI-NEXT: s_lshr_b32 s7, s0, 16 @@ -429,6 +486,9 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s6, s1, 16 ; VI-NEXT: s_lshr_b32 s7, s0, 16 @@ -485,6 +545,9 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a ; CI-LABEL: extload_f16_to_f64_arg: ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s0, s[8:9], 0x2 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 @@ -498,6 +561,9 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a ; VI-LABEL: extload_f16_to_f64_arg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s0, s[8:9], 0x8 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 @@ -529,6 +595,9 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2 ; CI-LABEL: extload_v2f16_to_v2f64_arg: ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s0, s[8:9], 0x2 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s1, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s1 @@ -545,6 +614,9 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2 ; VI-LABEL: extload_v2f16_to_v2f64_arg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s0, 
s[8:9], 0x8 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s1, s0, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s1 @@ -582,6 +654,9 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 ; CI-LABEL: extload_v3f16_to_v3f64_arg: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 ; CI-NEXT: s_lshr_b32 s4, s2, 16 @@ -603,6 +678,9 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 ; VI-LABEL: extload_v3f16_to_v3f64_arg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 ; VI-NEXT: s_lshr_b32 s4, s2, 16 @@ -648,6 +726,9 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 ; CI-LABEL: extload_v4f16_to_v4f64_arg: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s3, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 @@ -673,6 +754,9 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 ; VI-LABEL: extload_v4f16_to_v4f64_arg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s5, s3, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s3 @@ -726,6 +810,9 @@ define amdgpu_kernel void 
@extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s6, s3, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s6 @@ -773,6 +860,9 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s6, s0, 16 ; VI-NEXT: s_lshr_b32 s8, s2, 16 @@ -858,6 +948,9 @@ define amdgpu_kernel void @global_load_store_f16(ptr addrspace(1) %out, ptr addr ; CIVI-LABEL: global_load_store_f16: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -886,6 +979,9 @@ define amdgpu_kernel void @global_load_store_v2f16(ptr addrspace(1) %out, ptr ad ; CIVI-LABEL: global_load_store_v2f16: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -914,6 +1010,9 @@ define amdgpu_kernel void @global_load_store_v4f16(ptr addrspace(1) %in, ptr add ; CIVI-LABEL: global_load_store_v4f16: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: 
s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 @@ -942,6 +1041,9 @@ define amdgpu_kernel void @global_load_store_v8f16(ptr addrspace(1) %out, ptr ad ; CIVI-LABEL: global_load_store_v8f16: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -970,6 +1072,9 @@ define amdgpu_kernel void @global_extload_f16_to_f32(ptr addrspace(1) %out, ptr ; CIVI-LABEL: global_extload_f16_to_f32: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -1001,6 +1106,9 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out, ; CI-LABEL: global_extload_v2f16_to_v2f32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1017,6 +1125,9 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out, ; VI-LABEL: global_extload_v2f16_to_v2f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1052,6 +1163,9 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out, ; CI-LABEL: 
global_extload_v3f16_to_v3f32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1069,6 +1183,9 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out, ; VI-LABEL: global_extload_v3f16_to_v3f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1106,6 +1223,9 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out, ; CI-LABEL: global_extload_v4f16_to_v4f32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1125,6 +1245,9 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out, ; VI-LABEL: global_extload_v4f16_to_v4f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1165,6 +1288,9 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out, ; CI-LABEL: global_extload_v8f16_to_v8f32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; 
CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1195,6 +1321,9 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out, ; VI-LABEL: global_extload_v8f16_to_v8f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1251,6 +1380,9 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out ; CI-LABEL: global_extload_v16f16_to_v16f32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s4, s2, 16 ; CI-NEXT: v_mov_b32_e32 v5, s3 @@ -1309,6 +1441,9 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out ; VI-LABEL: global_extload_v16f16_to_v16f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1406,6 +1541,9 @@ define amdgpu_kernel void @global_extload_f16_to_f64(ptr addrspace(1) %out, ptr ; CIVI-LABEL: global_extload_f16_to_f64: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -1440,6 +1578,9 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out, ; CI-LABEL: global_extload_v2f16_to_v2f64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, 
s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1458,6 +1599,9 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out, ; VI-LABEL: global_extload_v2f16_to_v2f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1498,6 +1642,9 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out, ; CI-LABEL: global_extload_v3f16_to_v3f64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1523,6 +1670,9 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out, ; VI-LABEL: global_extload_v3f16_to_v3f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1574,6 +1724,9 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, ; CI-LABEL: global_extload_v4f16_to_v4f64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1602,6 +1755,9 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) 
%out, ; VI-LABEL: global_extload_v4f16_to_v4f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1659,6 +1815,9 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ; CI-LABEL: global_extload_v8f16_to_v8f64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1707,6 +1866,9 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ; VI-LABEL: global_extload_v8f16_to_v8f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1791,6 +1953,9 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; CI-LABEL: global_extload_v16f16_to_v16f64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1885,6 +2050,9 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; VI-LABEL: global_extload_v16f16_to_v16f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: 
v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2039,6 +2207,9 @@ define amdgpu_kernel void @global_truncstore_f32_to_f16(ptr addrspace(1) %out, p ; CIVI-LABEL: global_truncstore_f32_to_f16: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -2070,6 +2241,9 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou ; CI-LABEL: global_truncstore_v2f32_to_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2087,6 +2261,9 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou ; VI-LABEL: global_truncstore_v2f32_to_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2123,6 +2300,9 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou ; CI-LABEL: global_truncstore_v3f32_to_v3f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2146,6 +2326,9 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou ; VI-LABEL: global_truncstore_v3f32_to_v3f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], 
s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2191,6 +2374,9 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou ; CI-LABEL: global_truncstore_v4f32_to_v4f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2212,6 +2398,9 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou ; VI-LABEL: global_truncstore_v4f32_to_v4f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2254,6 +2443,9 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou ; CI-LABEL: global_truncstore_v8f32_to_v8f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2289,6 +2481,9 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou ; VI-LABEL: global_truncstore_v8f32_to_v8f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2352,6 +2547,9 @@ define 
amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % ; CI-LABEL: global_truncstore_v16f32_to_v16f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s4, s2, 32 ; CI-NEXT: s_addc_u32 s5, s3, 0 @@ -2420,6 +2618,9 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % ; VI-LABEL: global_truncstore_v16f32_to_v16f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 32 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -2530,6 +2731,9 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0 ; CI-LABEL: fadd_f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s0, s[8:9], 0x2 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; CI-NEXT: s_lshr_b32 s0, s0, 16 @@ -2547,6 +2751,9 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0 ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -2577,6 +2784,9 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x ; CI-LABEL: fadd_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: 
s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 @@ -2598,6 +2808,9 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x ; VI-LABEL: fadd_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s3, 16 ; VI-NEXT: s_lshr_b32 s5, s2, 16 @@ -2629,6 +2842,9 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-LABEL: fadd_v4f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2666,6 +2882,9 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-LABEL: fadd_v4f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2706,6 +2925,9 @@ define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x4 ; CI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s10, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v4, s0 @@ -2764,6 +2986,9 @@ define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10 ; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, 
s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s10, s7, 16 ; VI-NEXT: s_lshr_b32 s11, s3, 16 @@ -2824,6 +3049,9 @@ define amdgpu_kernel void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addr ; CIVI-LABEL: test_bitcast_from_half: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 @@ -2853,6 +3081,9 @@ define amdgpu_kernel void @test_bitcast_to_half(ptr addrspace(1) %out, ptr addrs ; CIVI-LABEL: test_bitcast_to_half: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll index 8c017fa5ec263..cd89a36fe538b 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll @@ -23,7 +23,7 @@ define amdgpu_kernel void @test( ptr addrspace(1) %r, ptr addrspace(1) %a, - ptr addrspace(1) %b) "amdgpu-no-implicitarg-ptr" { + ptr addrspace(1) %b) "amdgpu-no-implicitarg-ptr" "amdgpu-no-flat-scratch-init" { entry: %a.val = load half, ptr addrspace(1) %a %b.val = load half, ptr addrspace(1) %b @@ -170,7 +170,7 @@ define amdgpu_kernel void @num_spilled_vgprs() #1 { ; CHECK-NEXT: - 1 ; CHECK-NEXT: - 1 -attributes #0 = { "amdgpu-num-sgpr"="20" } +attributes #0 = { "amdgpu-num-sgpr"="20" "amdgpu-no-flat-scratch-init" } attributes #1 = { "amdgpu-num-vgpr"="20" } attributes #2 = { 
"amdgpu-flat-work-group-size"="1,256" } diff --git a/llvm/test/CodeGen/AMDGPU/hsa.ll b/llvm/test/CodeGen/AMDGPU/hsa.ll index 5a2a976e23846..024593c49dba1 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa.ll @@ -43,7 +43,7 @@ ; ELF: 00E0: 6E616D65 A673696D 706C65BB 2E707269 ; ELF: 00F0: 76617465 5F736567 6D656E74 5F666978 ; ELF: 0100: 65645F73 697A6500 AB2E7367 70725F63 -; ELF: 0110: 6F756E74 06B12E73 6770725F 7370696C +; ELF: 0110: 6F756E74 0EB12E73 6770725F 7370696C ; ELF: 0120: 6C5F636F 756E7400 A72E7379 6D626F6C ; ELF: 0130: A973696D 706C652E 6B64AB2E 76677072 ; ELF: 0140: 5F636F75 6E7403B1 2E766770 725F7370 @@ -59,7 +59,7 @@ ; ELF: 01E0: 73696D70 6C655F6E 6F5F6B65 726E6172 ; ELF: 01F0: 6773BB2E 70726976 6174655F 7365676D ; ELF: 0200: 656E745F 66697865 645F7369 7A6500AB -; ELF: 0210: 2E736770 725F636F 756E7400 B12E7367 +; ELF: 0210: 2E736770 725F636F 756E740C B12E7367 ; ELF: 0220: 70725F73 70696C6C 5F636F75 6E7400A7 ; ELF: 0230: 2E73796D 626F6CB5 73696D70 6C655F6E ; ELF: 0240: 6F5F6B65 726E6172 67732E6B 64AB2E76 diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll index b4b6bef7a7ed3..ec80efc5f0362 100644 --- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll @@ -7,7 +7,7 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck --check-prefixes=GFX9V5 %s ; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck --check-prefixes=GFX9V5 %s -define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addrspace(3) %ptr.local) { +define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addrspace(3) %ptr.local) #0 { ; GFX8V4-LABEL: addrspacecast: ; GFX8V4: ; %bb.0: ; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 @@ -109,7 +109,7 @@ define amdgpu_kernel void 
@addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ret void } -define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { +define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) #0 { ; GFX8V4-LABEL: llvm_amdgcn_is_shared: ; GFX8V4: ; %bb.0: ; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x40 @@ -163,7 +163,7 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { ret void } -define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { +define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) #0 { ; GFX8V4-LABEL: llvm_amdgcn_is_private: ; GFX8V4: ; %bb.0: ; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x44 @@ -283,7 +283,7 @@ define amdgpu_kernel void @llvm_ubsantrap() { unreachable } -define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) { +define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) #0 { ; GFX8V4-LABEL: llvm_amdgcn_queue_ptr: ; GFX8V4: ; %bb.0: ; GFX8V4-NEXT: v_mov_b32_e32 v0, s6 @@ -389,3 +389,5 @@ declare void @llvm.debugtrap() !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 CODE_OBJECT_VERSION} + +attributes #0 = { "amdgpu-no-flat-scratch-init" } diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll index 9eb966b4e2a94..0ca180ed6e105 100644 --- a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll @@ -8,16 +8,16 @@ define amdgpu_kernel void @s_input_output_i128() { ; GFX908-LABEL: name: s_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7208970 /* regdef:SGPR_128 */, def %12 - ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %12 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7208969 /* reguse:SGPR_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7208970 /* regdef:SGPR_128 */, def %13 + ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %13 + ; GFX908-NEXT: 
INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7208969 /* reguse:SGPR_128 */, %14 ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: s_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7208970 /* regdef:SGPR_128 */, def %10 - ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %10 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7208969 /* reguse:SGPR_128 */, [[COPY]] + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7208970 /* regdef:SGPR_128 */, def %11 + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %11 + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7208969 /* reguse:SGPR_128 */, %12 ; GFX90A-NEXT: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=s"() call void asm sideeffect "; use $0", "s"(i128 %val) @@ -27,16 +27,16 @@ define amdgpu_kernel void @s_input_output_i128() { define amdgpu_kernel void @v_input_output_i128() { ; GFX908-LABEL: name: v_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6094858 /* regdef:VReg_128 */, def %12 - ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %12 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6094857 /* reguse:VReg_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6094858 /* regdef:VReg_128 */, def %13 + ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %13 + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6094857 /* reguse:VReg_128 */, %14 ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: v_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:VReg_128_Align2 */, def %10 - ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %10 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6422537 /* reguse:VReg_128_Align2 */, 
[[COPY]] + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:VReg_128_Align2 */, def %11 + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %11 + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6422537 /* reguse:VReg_128_Align2 */, %12 ; GFX90A-NEXT: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=v"() call void asm sideeffect "; use $0", "v"(i128 %val) @@ -46,16 +46,17 @@ define amdgpu_kernel void @v_input_output_i128() { define amdgpu_kernel void @a_input_output_i128() { ; GFX908-LABEL: name: a_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6029322 /* regdef:AReg_128 */, def %12 - ; GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY %12 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6029321 /* reguse:AReg_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6029322 /* regdef:AReg_128 */, def %13 + ; GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY %13 + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6029321 /* reguse:AReg_128 */, %14 + ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: a_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6291466 /* regdef:AReg_128_Align2 */, def %10 - ; GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %10 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6291465 /* reguse:AReg_128_Align2 */, [[COPY]] + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6291466 /* regdef:AReg_128_Align2 */, def %11 + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %11 + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6291465 /* reguse:AReg_128_Align2 */, %12 ; GFX90A-NEXT: S_ENDPGM 0 %val = call i128 asm sideeffect "; def $0", "=a"() call void asm sideeffect "; use $0", "a"(i128 %val) diff 
--git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll index 75db7571444bc..b51cb9df8d784 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll @@ -22,6 +22,9 @@ define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a ; VI-LABEL: s_insertelement_v2bf16_0: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -82,6 +85,9 @@ define amdgpu_kernel void @s_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a ; VI-LABEL: s_insertelement_v2bf16_1: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -144,6 +150,9 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -216,6 +225,9 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0_inlineimm(ptr addrspace(1) % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ 
-286,6 +298,9 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -358,6 +373,9 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1_inlineimm(ptr addrspace(1) % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -435,11 +453,14 @@ define amdgpu_kernel void @v_insertelement_v2bf16_dynamic_vgpr(ptr addrspace(1) ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc @@ -531,14 +552,17 @@ define amdgpu_kernel void @v_insertelement_v4bf16_0(ptr addrspace(1) %out, ptr a ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x30 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 
v0, vcc, s2, v2 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_perm_b32 v0, s4, v0, v4 @@ -611,14 +635,17 @@ define amdgpu_kernel void @v_insertelement_v4bf16_1(ptr addrspace(1) %out, ptr a ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_perm_b32 v0, v0, s4, v4 @@ -689,14 +716,17 @@ define amdgpu_kernel void @v_insertelement_v4bf16_2(ptr addrspace(1) %out, ptr a ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x30 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_perm_b32 v1, 
s4, v1, v4 @@ -769,14 +799,17 @@ define amdgpu_kernel void @v_insertelement_v4bf16_3(ptr addrspace(1) %out, ptr a ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_perm_b32 v1, v1, s4, v4 @@ -853,9 +886,12 @@ define amdgpu_kernel void @v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1) ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 @@ -948,9 +984,12 @@ define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr a ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, 
v4 @@ -1065,9 +1104,12 @@ define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out, ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_mov_b32_e32 v5, s1 @@ -1245,11 +1287,14 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -1417,11 +1462,14 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v8 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v4 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] diff --git 
a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index 97c97ac8a7ad3..e11900ac0ca68 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -21,6 +21,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad ; CIVI-LABEL: s_insertelement_v2i16_0: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v0, s0 @@ -68,6 +71,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, pt ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x30 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -84,6 +90,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, pt ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dword s4, s[8:9], 0xc +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -152,6 +161,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x30 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 
v0, s0 @@ -172,6 +184,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dword s4, s[8:9], 0xc +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -251,6 +266,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x30 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -266,6 +284,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dword s4, s[8:9], 0xc +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -320,6 +341,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -339,6 +363,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 
flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -407,6 +434,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -429,6 +459,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -498,6 +531,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad ; CIVI-LABEL: s_insertelement_v2i16_1: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v0, s0 @@ -544,6 +580,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, pt ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x30 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -560,6 +599,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr 
addrspace(1) %out, pt ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dword s4, s[8:9], 0xc +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -623,6 +665,9 @@ define amdgpu_kernel void @s_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad ; CIVI-LABEL: s_insertelement_v2f16_0: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v0, s0 @@ -668,6 +713,9 @@ define amdgpu_kernel void @s_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad ; CIVI-LABEL: s_insertelement_v2f16_1: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v0, s0 @@ -714,6 +762,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -732,6 +783,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, 
s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 @@ -788,9 +842,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -807,9 +864,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dword v3, v[0:1] ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -880,6 +940,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -898,6 +961,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 
flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 @@ -953,6 +1019,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -971,6 +1040,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 @@ -1038,6 +1110,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %o ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -1056,6 +1131,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %o ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 @@ -1123,6 +1201,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad ; VI: ; %bb.0: ; VI-NEXT: 
s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -1141,6 +1222,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 @@ -1195,6 +1279,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -1213,6 +1300,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 @@ -1267,6 +1357,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -1285,6 +1378,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 @@ -1353,6 +1449,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -1371,6 +1470,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 @@ -1445,6 +1547,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out, ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[4:5], 0x0 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1464,6 +1569,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out, ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 
+; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s4, s[4:5], 0x0 ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1526,9 +1634,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) % ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -1547,9 +1658,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) % ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dword v3, v[0:1] ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 @@ -1612,11 +1726,14 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) % ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: 
flat_load_dword v4, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc @@ -1639,11 +1756,14 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) % ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_load_dword v4, v[0:1] ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc @@ -1712,14 +1832,17 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x30 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_perm_b32 v0, s4, v0, v4 @@ -1731,9 +1854,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dword s4, s[8:9], 0xc ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: 
v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: v_mov_b32_e32 v3, s1 @@ -1790,14 +1916,17 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr ad ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_perm_b32 v0, v0, s4, v4 @@ -1809,9 +1938,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr ad ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: v_mov_b32_e32 v3, s1 @@ -1883,14 +2015,17 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x30 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: 
s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_perm_b32 v1, s4, v1, v4 @@ -1902,9 +2037,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dword s4, s[8:9], 0xc ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: v_mov_b32_e32 v3, s1 @@ -1961,14 +2099,17 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr ad ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_perm_b32 v1, v1, s4, v4 @@ -1980,9 +2121,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr ad ; CI-NEXT: s_load_dwordx4 s[0:3], 
s[8:9], 0x0 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: v_mov_b32_e32 v3, s1 @@ -2054,14 +2198,17 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_perm_b32 v1, s4, v1, v4 @@ -2073,9 +2220,12 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: v_mov_b32_e32 v3, s1 @@ -2138,6 +2288,9 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], 
s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: flat_load_dword v4, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 @@ -2165,6 +2318,9 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) % ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: flat_load_dword v4, v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 @@ -2268,9 +2424,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) % ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 @@ -2294,9 +2453,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) % ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: v_mov_b32_e32 v3, s1 @@ -2363,9 +2525,12 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) 
%out, ptr ad ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 @@ -2383,9 +2548,12 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CI-NEXT: v_mov_b32_e32 v5, s1 @@ -2457,9 +2625,12 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_mov_b32_e32 v5, s1 @@ -2477,9 +2648,12 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt 
lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CI-NEXT: v_mov_b32_e32 v5, s1 @@ -2568,9 +2742,12 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_mov_b32_e32 v5, s1 @@ -2622,9 +2799,12 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CI-NEXT: v_mov_b32_e32 v5, s1 @@ -2803,11 +2983,14 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: 
v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -2830,9 +3013,12 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s3 ; CI-NEXT: v_add_i32_e32 v4, vcc, s2, v8 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc ; CI-NEXT: flat_load_dwordx4 v[0:3], v[4:5] ; CI-NEXT: v_add_i32_e32 v4, vcc, 16, v4 @@ -2923,12 +3109,14 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 -; VI-NEXT: v_mov_b32_e32 v12, 0x3020504 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -2936,6 +3124,7 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a ; VI-NEXT: v_add_u32_e32 v8, vcc, s0, v8 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; VI-NEXT: v_add_u32_e32 v10, vcc, 16, v8 +; VI-NEXT: v_mov_b32_e32 v12, 0x3020504 ; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_perm_b32 v3, s4, v3, v12 @@ -2949,11 +3138,14 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: 
s_load_dword s4, s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v4, vcc, 16, v0 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -3087,11 +3279,14 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v8 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v4 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -3184,11 +3379,14 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 5, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx4 v[7:10], v[2:3] ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll 
b/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll index f0609f62a9024..5dff7372ab561 100644 --- a/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll +++ b/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll @@ -7,6 +7,9 @@ define amdgpu_kernel void @use_group_to_global_addrspacecast(ptr addrspace(3) %ptr) { ; CHECK-LABEL: use_group_to_global_addrspacecast: ; CHECK: ; %bb.0: +; CHECK-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-NEXT: s_add_i32 s12, s12, s17 +; CHECK-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: flat_store_dword v[0:1], v0 ; CHECK-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/invalid-cast-load-i1.ll b/llvm/test/CodeGen/AMDGPU/invalid-cast-load-i1.ll index 621187100f323..55a5d50f06bbd 100644 --- a/llvm/test/CodeGen/AMDGPU/invalid-cast-load-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/invalid-cast-load-i1.ll @@ -6,6 +6,8 @@ define amdgpu_kernel void @load_idx_idy(ptr addrspace(4) %disp, ptr %g) { ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dword s6, s[4:5], 0x4 ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_lshr_b32 s4, s6, 16 diff --git a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll index 496a1c652da25..1a32953305bbc 100644 --- a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll +++ b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll @@ -7,7 +7,7 @@ declare void @llvm.trap() #0 ; DOORBELL-NEXT: .amdhsa_group_segment_fixed_size 0 ; DOORBELL-NEXT: .amdhsa_private_segment_fixed_size 0 ; DOORBELL-NEXT: .amdhsa_kernarg_size 8 -; DOORBELL-NEXT: .amdhsa_user_sgpr_count 12 +; DOORBELL-NEXT: .amdhsa_user_sgpr_count 14 ; DOORBELL-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 ; DOORBELL: .end_amdhsa_kernel diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll 
b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 4b6cc32522f5b..7179f687c70f2 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -47,11 +47,7 @@ ; GCN-O0-NEXT: Instrument function entry/exit with calls to e.g. mcount() (post inlining) ; GCN-O0-NEXT: Scalarize Masked Memory Intrinsics ; GCN-O0-NEXT: Expand reduction intrinsics -; GCN-O0-NEXT: CallGraph Construction -; GCN-O0-NEXT: Call Graph SCC Pass Manager -; GCN-O0-NEXT: AMDGPU Annotate Kernel Features -; GCN-O0-NEXT: FunctionPass Manager -; GCN-O0-NEXT: AMDGPU Lower Kernel Arguments +; GCN-O0-NEXT: AMDGPU Lower Kernel Arguments ; GCN-O0-NEXT: Lower buffer fat pointer operations to buffer resources ; GCN-O0-NEXT: CallGraph Construction ; GCN-O0-NEXT: Call Graph SCC Pass Manager @@ -232,11 +228,7 @@ ; GCN-O1-NEXT: Instrument function entry/exit with calls to e.g. mcount() (post inlining) ; GCN-O1-NEXT: Scalarize Masked Memory Intrinsics ; GCN-O1-NEXT: Expand reduction intrinsics -; GCN-O1-NEXT: CallGraph Construction -; GCN-O1-NEXT: Call Graph SCC Pass Manager -; GCN-O1-NEXT: AMDGPU Annotate Kernel Features -; GCN-O1-NEXT: FunctionPass Manager -; GCN-O1-NEXT: AMDGPU Lower Kernel Arguments +; GCN-O1-NEXT: AMDGPU Lower Kernel Arguments ; GCN-O1-NEXT: Lower buffer fat pointer operations to buffer resources ; GCN-O1-NEXT: CallGraph Construction ; GCN-O1-NEXT: Call Graph SCC Pass Manager @@ -531,11 +523,7 @@ ; GCN-O1-OPTS-NEXT: Scalarize Masked Memory Intrinsics ; GCN-O1-OPTS-NEXT: Expand reduction intrinsics ; GCN-O1-OPTS-NEXT: Early CSE -; GCN-O1-OPTS-NEXT: CallGraph Construction -; GCN-O1-OPTS-NEXT: Call Graph SCC Pass Manager -; GCN-O1-OPTS-NEXT: AMDGPU Annotate Kernel Features -; GCN-O1-OPTS-NEXT: FunctionPass Manager -; GCN-O1-OPTS-NEXT: AMDGPU Lower Kernel Arguments +; GCN-O1-OPTS-NEXT: AMDGPU Lower Kernel Arguments ; GCN-O1-OPTS-NEXT: Lower buffer fat pointer operations to buffer resources ; GCN-O1-OPTS-NEXT: CallGraph Construction ; GCN-O1-OPTS-NEXT: Call 
Graph SCC Pass Manager @@ -848,11 +836,7 @@ ; GCN-O2-NEXT: Scalarize Masked Memory Intrinsics ; GCN-O2-NEXT: Expand reduction intrinsics ; GCN-O2-NEXT: Early CSE -; GCN-O2-NEXT: CallGraph Construction -; GCN-O2-NEXT: Call Graph SCC Pass Manager -; GCN-O2-NEXT: AMDGPU Annotate Kernel Features -; GCN-O2-NEXT: FunctionPass Manager -; GCN-O2-NEXT: AMDGPU Lower Kernel Arguments +; GCN-O2-NEXT: AMDGPU Lower Kernel Arguments ; GCN-O2-NEXT: Lower buffer fat pointer operations to buffer resources ; GCN-O2-NEXT: CallGraph Construction ; GCN-O2-NEXT: Call Graph SCC Pass Manager @@ -1180,11 +1164,7 @@ ; GCN-O3-NEXT: Lazy Block Frequency Analysis ; GCN-O3-NEXT: Optimization Remark Emitter ; GCN-O3-NEXT: Global Value Numbering -; GCN-O3-NEXT: CallGraph Construction -; GCN-O3-NEXT: Call Graph SCC Pass Manager -; GCN-O3-NEXT: AMDGPU Annotate Kernel Features -; GCN-O3-NEXT: FunctionPass Manager -; GCN-O3-NEXT: AMDGPU Lower Kernel Arguments +; GCN-O3-NEXT: AMDGPU Lower Kernel Arguments ; GCN-O3-NEXT: Lower buffer fat pointer operations to buffer resources ; GCN-O3-NEXT: CallGraph Construction ; GCN-O3-NEXT: Call Graph SCC Pass Manager diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll index f93d80cc7adf8..4edd0357c6e7a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll @@ -30,9 +30,12 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; CI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-SDAG-NEXT: s_load_dword s2, s[8:9], 0x32 ; CI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; CI-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CI-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; CI-SDAG-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; CI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-SDAG-NEXT: flat_load_dwordx2 v[0:1], 
v[0:1] glc ; CI-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -59,10 +62,13 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-GISEL-NEXT: s_load_dword s2, s[8:9], 0x32 ; CI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-GISEL-NEXT: s_add_i32 s12, s12, s17 +; CI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; CI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; CI-GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc ; CI-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -133,6 +139,9 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; CI-SDAG: ; %bb.0: ; CI-SDAG-NEXT: s_load_dword s0, s[8:9], 0x1 ; CI-SDAG-NEXT: s_load_dword s1, s[8:9], 0x32 +; CI-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CI-SDAG-NEXT: s_cmp_eq_u32 s0, s1 ; CI-SDAG-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -166,6 +175,9 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CI-GISEL-NEXT: s_load_dword s0, s[8:9], 0x32 +; CI-GISEL-NEXT: s_add_i32 s12, s12, s17 +; CI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CI-GISEL-NEXT: s_cmp_lg_u32 s1, s0 ; CI-GISEL-NEXT: s_cbranch_scc1 .LBB1_2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll index 637d8388cddf1..9d078f7906b4d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll @@ -63,9 +63,12 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) 
{ ; CI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-SDAG-NEXT: s_load_dword s2, s[8:9], 0x33 ; CI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; CI-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CI-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; CI-SDAG-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; CI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-SDAG-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc ; CI-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -92,10 +95,13 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { ; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-GISEL-NEXT: s_load_dword s2, s[8:9], 0x33 ; CI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-GISEL-NEXT: s_add_i32 s12, s12, s17 +; CI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; CI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; CI-GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc ; CI-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -200,6 +206,9 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { ; CI-SDAG: ; %bb.0: ; CI-SDAG-NEXT: s_load_dword s0, s[8:9], 0x1 ; CI-SDAG-NEXT: s_load_dword s1, s[8:9], 0x33 +; CI-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CI-SDAG-NEXT: s_cmp_eq_u32 s0, s1 ; CI-SDAG-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -233,6 +242,9 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { ; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CI-GISEL-NEXT: s_load_dword s0, s[8:9], 0x33 +; CI-GISEL-NEXT: s_add_i32 s12, s12, s17 +; CI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-GISEL-NEXT: s_mov_b32 
flat_scratch_lo, s13 ; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CI-GISEL-NEXT: s_cmp_lg_u32 s1, s0 ; CI-GISEL-NEXT: s_cbranch_scc1 .LBB1_2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll index 97219a8f143ce..0fe371c1b51fe 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll @@ -23,8 +23,11 @@ define void @function_lds_id(ptr addrspace(1) %out) { define amdgpu_kernel void @kernel_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.lds.kernel.id !0 { ; GCN-LABEL: kernel_lds_id: ; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GCN-NEXT: s_add_i32 s2, s12, 42 +; GCN-NEXT: s_add_i32 s2, s14, 42 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 @@ -74,6 +77,9 @@ define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.l define amdgpu_kernel void @doesnt_use_it(ptr addrspace(1) %out) !llvm.amdgcn.lds.kernel.id !0 { ; GCN-LABEL: doesnt_use_it: ; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GCN-NEXT: v_mov_b32_e32 v2, 0x64 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll index 55fa02a0c582c..cc9e34be209b4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll @@ -284,6 +284,9 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out ; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-SDAG-NEXT: 
s_add_i32 s12, s12, s17 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, 32 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -294,6 +297,9 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out ; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_i32: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, 32 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -309,10 +315,13 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out ; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 32 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -321,10 +330,13 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, 
s12, 8 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -337,10 +349,13 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out ; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_f64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -349,11 +364,14 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 ; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -366,12 +384,15 @@ define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) { ; CHECK-SDAG-LABEL: test_readfirstlane_m0: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, 
s12, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b32 m0, -1 ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2 ; CHECK-SDAG-NEXT: s_endpgm @@ -379,12 +400,15 @@ define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) { ; CHECK-GISEL-LABEL: test_readfirstlane_m0: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b32 m0, -1 ; CHECK-GISEL-NEXT: ;;#ASMEND -; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 ; CHECK-GISEL-NEXT: s_endpgm @@ -398,25 +422,31 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1 ; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b32 s2, 0 ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2 ; CHECK-SDAG-NEXT: s_endpgm ; ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i32: ; CHECK-GISEL: ; %bb.0: ; 
CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 ; CHECK-GISEL-NEXT: ;;#ASMEND -; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 ; CHECK-GISEL-NEXT: s_endpgm @@ -430,13 +460,16 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1 ; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -444,13 +477,16 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1 ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i64: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-GISEL-NEXT: s_waitcnt 
lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -464,13 +500,16 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1 ; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_f64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -478,13 +517,16 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1 ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_f64: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm diff --git 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll index edb6ebcee1325..f2b0959cc706e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll @@ -179,6 +179,9 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i32(ptr addrspace(1) %out, i32 ; CHECK-SDAG-LABEL: test_readlane_imm_sreg_i32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, 32 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -189,6 +192,9 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i32(ptr addrspace(1) %out, i32 ; CHECK-GISEL-LABEL: test_readlane_imm_sreg_i32: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, 32 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -204,10 +210,13 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32 ; CHECK-SDAG-LABEL: test_readlane_imm_sreg_i64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 32 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -216,10 +225,13 @@ define amdgpu_kernel 
void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32 ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -232,10 +244,13 @@ define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32 ; CHECK-SDAG-LABEL: test_readlane_imm_sreg_f64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -244,11 +259,14 @@ define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32 ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 ; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, 
s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -262,6 +280,9 @@ define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr ad ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -281,6 +302,9 @@ define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr ad ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -311,6 +335,9 @@ define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr ad ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -332,6 +359,9 @@ define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr ad ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: s_lshr_b32 
flat_scratch_hi, s12, 8 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -365,6 +395,9 @@ define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr ad ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -386,6 +419,9 @@ define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr ad ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -419,12 +455,15 @@ define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src ; CHECK-SDAG-LABEL: test_readlane_m0_sreg: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b32 m0, -1 ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2 ; CHECK-SDAG-NEXT: s_endpgm @@ -432,12 +471,15 @@ define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 
%src ; CHECK-GISEL-LABEL: test_readlane_m0_sreg: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b32 m0, -1 ; CHECK-GISEL-NEXT: ;;#ASMEND -; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 ; CHECK-GISEL-NEXT: s_endpgm @@ -454,11 +496,14 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; def v0 ; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 ; CHECK-SDAG-NEXT: v_readlane_b32 s2, v0, 32 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2 ; CHECK-SDAG-NEXT: s_endpgm ; @@ -468,10 +513,13 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; def v0 ; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 ; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 ; CHECK-GISEL-NEXT: 
s_endpgm @@ -485,14 +533,17 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1 ; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_i64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; def v[0:1] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: v_readlane_b32 s2, v1, 32 ; CHECK-SDAG-NEXT: v_readlane_b32 s3, v0, 32 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -505,10 +556,13 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1 ; CHECK-GISEL-NEXT: ; def v[0:1] ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 ; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, 32 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -523,14 +577,17 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1 ; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_f64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; def v[0:1] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: v_readlane_b32 s2, v1, 32 ; CHECK-SDAG-NEXT: v_readlane_b32 s3, v0, 32 -; 
CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -543,10 +600,13 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1 ; CHECK-GISEL-NEXT: ; def v[0:1] ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 ; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, 32 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -561,25 +621,31 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i32(ptr addrspace(1) %ou ; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_i32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b32 s2, 0 ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2 ; CHECK-SDAG-NEXT: s_endpgm ; ; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i32: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, 
s17 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 ; CHECK-GISEL-NEXT: ;;#ASMEND -; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 ; CHECK-GISEL-NEXT: s_endpgm @@ -593,13 +659,16 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %ou ; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_i64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -607,13 +676,16 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %ou ; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i64: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -627,13 +699,16 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %ou ; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_f64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -641,13 +716,16 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %ou ; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_f64: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll index 04d179478590b..4ac2cc98970b5 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll @@ -15,6 +15,9 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s ; GFX802-SDAG-LABEL: test_writelane_sreg_i32: ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_mov_b32 m0, s3 ; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -53,6 +56,9 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s ; GFX802-GISEL-LABEL: test_writelane_sreg_i32: ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_mov_b32 m0, s3 ; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -98,6 +104,9 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-SDAG-NEXT: s_load_dword s6, s[8:9], 0x10 +; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s6 @@ -147,6 +156,9 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-GISEL-NEXT: s_load_dword s6, s[8:9], 0x10 +; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-GISEL-NEXT: 
s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 @@ -202,6 +214,9 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-SDAG-NEXT: s_load_dword s6, s[8:9], 0x10 +; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s6 @@ -251,6 +266,9 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-GISEL-NEXT: s_load_dword s6, s[8:9], 0x10 +; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 @@ -306,6 +324,9 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3 ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX802-SDAG-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -348,6 +369,9 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3 ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX802-GISEL-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX802-GISEL-NEXT: s_add_i32 s12, s12, 
s17 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -396,6 +420,9 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3 ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX802-SDAG-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 @@ -444,6 +471,9 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3 ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX802-GISEL-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -498,11 +528,14 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3 ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX802-SDAG-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17 ; GFX802-SDAG-NEXT: s_mov_b32 s5, 0x40400000 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: 
v_mov_b32_e32 v1, s3 @@ -551,11 +584,14 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3 ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX802-GISEL-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17 ; GFX802-GISEL-NEXT: s_mov_b32 s5, 0x40400000 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s4 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -609,6 +645,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -668,6 +707,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -738,6 +780,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-SDAG-NEXT: 
v_lshlrev_b32_e32 v0, 4, v0 +; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -803,6 +848,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 +; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -877,7 +925,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX802-SDAG-NEXT: s_mov_b32 s4, 0x40280000 +; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -886,6 +936,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p ; GFX802-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX802-SDAG-NEXT: flat_load_dword v2, v[0:1] ; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX802-SDAG-NEXT: s_mov_b32 s4, 0x40280000 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -946,7 +997,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; 
GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 -; GFX802-GISEL-NEXT: s_mov_b32 s4, 0x40280000 +; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -956,6 +1009,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p ; GFX802-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX802-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX802-GISEL-NEXT: s_mov_b32 s4, 0x40280000 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v4, s1 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s0 ; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1028,15 +1082,18 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32 ; GFX802-SDAG-NEXT: ;;#ASMSTART ; GFX802-SDAG-NEXT: s_mov_b32 m0, -1 ; GFX802-SDAG-NEXT: ;;#ASMEND +; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17 ; GFX802-SDAG-NEXT: s_mov_b32 s4, m0 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s2 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s3 ; GFX802-SDAG-NEXT: v_writelane_b32 v2, s4, m0 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v2 ; GFX802-SDAG-NEXT: s_endpgm ; @@ -1081,15 +1138,18 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32 ; GFX802-GISEL-NEXT: ;;#ASMSTART ; GFX802-GISEL-NEXT: s_mov_b32 m0, -1 ; GFX802-GISEL-NEXT: ;;#ASMEND +; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17 ; GFX802-GISEL-NEXT: s_mov_b32 s4, m0 ; GFX802-GISEL-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s2 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s3 ; GFX802-GISEL-NEXT: v_writelane_b32 v2, s4, m0 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX802-GISEL-NEXT: s_endpgm ; @@ -1138,6 +1198,9 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX802-SDAG-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -1180,6 +1243,9 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX802-GISEL-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -1227,6 +1293,9 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr ; GFX802-SDAG-LABEL: test_writelane_imm_i64: ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-SDAG-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 @@ -1270,6 +1339,9 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr ; GFX802-GISEL-LABEL: test_writelane_imm_i64: ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -1319,6 +1391,9 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double ; GFX802-SDAG-LABEL: test_writelane_imm_f64: ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 @@ -1362,6 +1437,9 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double ; GFX802-GISEL-LABEL: test_writelane_imm_f64: ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -1412,6 +1490,9 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 +; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17 +; GFX802-SDAG-NEXT: s_mov_b32 
flat_scratch_lo, s13 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s4 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s3 @@ -1449,6 +1530,9 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 +; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s3 @@ -1492,10 +1576,13 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-SDAG-NEXT: s_load_dword s6, s[8:9], 0x18 ; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s6 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; GFX802-SDAG-NEXT: v_writelane_b32 v3, s5, m0 @@ -1538,11 +1625,14 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-GISEL-NEXT: s_load_dword s6, s[8:9], 0x18 ; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; 
GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: v_writelane_b32 v0, s4, m0 ; GFX802-GISEL-NEXT: v_writelane_b32 v1, s5, m0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s3 @@ -1589,10 +1679,13 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval, ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-SDAG-NEXT: s_load_dword s6, s[8:9], 0x18 ; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s6 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; GFX802-SDAG-NEXT: v_writelane_b32 v3, s5, m0 @@ -1635,11 +1728,14 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval, ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-GISEL-NEXT: s_load_dword s6, s[8:9], 0x18 ; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: v_writelane_b32 v0, s4, m0 ; GFX802-GISEL-NEXT: v_writelane_b32 v1, s5, m0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s3 @@ -1684,7 +1780,10 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, ; GFX802-SDAG-LABEL: test_writelane_imm_oldval_i32: ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, 42 +; 
GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_mov_b32 m0, s3 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -1716,7 +1815,10 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, ; GFX802-GISEL-LABEL: test_writelane_imm_oldval_i32: ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, 42 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_mov_b32 m0, s3 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -1754,11 +1856,14 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out, ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-SDAG-NEXT: s_load_dword s4, s[8:9], 0x10 +; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, 42 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; GFX802-SDAG-NEXT: v_writelane_b32 v1, s3, m0 ; GFX802-SDAG-NEXT: v_writelane_b32 v0, s2, m0 @@ -1797,11 +1902,14 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out, ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dword s4, s[8:9], 0x10 ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, 42 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_mov_b32 m0, s4 ; 
GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, m0 ; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, m0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -1845,11 +1953,14 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out, ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-SDAG-NEXT: s_load_dword s4, s[8:9], 0x10 +; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, 0x40450000 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; GFX802-SDAG-NEXT: v_writelane_b32 v1, s3, m0 ; GFX802-SDAG-NEXT: v_writelane_b32 v0, s2, m0 @@ -1888,11 +1999,14 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out, ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dword s4, s[8:9], 0x10 ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, 0x40450000 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_mov_b32 m0, s4 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, m0 ; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, m0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll index 6f95364ac3644..919c1dfd4694e 100644 --- 
a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll @@ -22,6 +22,9 @@ define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspac ; GFX7-HSA-LABEL: constant_load_f64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -90,7 +93,10 @@ define amdgpu_kernel void @constant_load_2v4f64(ptr addrspace(4) noalias nocaptu ; ; GFX7-HSA-LABEL: constant_load_2v4f64: ; GFX7-HSA: ; %bb.0: ; %entry +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 ; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[20:21], s[18:19], 0x0 ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index 51dfbda53ad4c..817c5def5614f 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -27,6 +27,9 @@ define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspac ; GCN-HSA-LABEL: constant_load_i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -117,6 +120,9 @@ define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrsp ; GCN-HSA-LABEL: constant_load_v2i16: ; GCN-HSA: ; %bb.0: ; %entry ; 
GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -188,6 +194,9 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp ; GCN-HSA-LABEL: constant_load_v3i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 4 @@ -286,6 +295,9 @@ define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrsp ; GCN-HSA-LABEL: constant_load_v4i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -360,6 +372,9 @@ define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrsp ; GCN-HSA-LABEL: constant_load_v8i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -445,6 +460,9 @@ define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrs ; GCN-HSA-LABEL: constant_load_v16i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 +; 
GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GCN-HSA-NEXT: s_add_u32 s10, s8, 16 @@ -584,6 +602,9 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; GCN-HSA-LABEL: constant_load_v16i16_align2: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 @@ -831,6 +852,9 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, p ; GCN-HSA-LABEL: constant_zextload_i16_to_i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -906,6 +930,9 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, p ; GCN-HSA-LABEL: constant_sextload_i16_to_i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -982,6 +1009,9 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_zextload_v1i16_to_v1i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1057,6 +1087,9 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_sextload_v1i16_to_v1i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1131,6 +1164,9 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_zextload_v2i16_to_v2i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -1216,6 +1252,9 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_sextload_v2i16_to_v2i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -1305,6 +1344,9 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_zextload_v3i16_to_v3i32: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; 
GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0 @@ -1402,6 +1444,9 @@ define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_sextload_v3i16_to_v3i32: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0 @@ -1504,6 +1549,9 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_zextload_v4i16_to_v4i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1610,6 +1658,9 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_sextload_v4i16_to_v4i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1727,6 +1778,9 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_zextload_v8i16_to_v8i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt 
lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1885,6 +1939,9 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_sextload_v8i16_to_v8i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2062,6 +2119,9 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) % ; GCN-HSA-LABEL: constant_zextload_v16i16_to_v16i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2324,6 +2384,9 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) % ; GCN-HSA-LABEL: constant_sextload_v16i16_to_v16i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2631,7 +2694,10 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i32: ; GCN-HSA: ; %bb.0: +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 ; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; 
GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3112,7 +3178,10 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i32: ; GCN-HSA: ; %bb.0: +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 ; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3680,7 +3749,10 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_zextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 ; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4596,7 +4668,10 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_sextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 ; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -5383,6 +5458,9 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, p ; GCN-HSA-LABEL: constant_zextload_i16_to_i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; 
GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5480,6 +5558,9 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, p ; GCN-HSA-LABEL: constant_sextload_i16_to_i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5578,6 +5659,9 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_zextload_v1i16_to_v1i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5670,6 +5754,9 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_sextload_v1i16_to_v1i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5767,12 +5854,15 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_zextload_v2i16_to_v2i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: 
s_load_dword s2, s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_lshr_b32 s0, s2, 16 ; GCN-HSA-NEXT: s_and_b32 s1, s2, 0xffff @@ -5877,6 +5967,9 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_sextload_v2i16_to_v2i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -5980,10 +6073,13 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_zextload_v4i16_to_v4i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_lshr_b32 s4, s3, 16 ; GCN-HSA-NEXT: s_lshr_b32 s5, s2, 16 @@ -6136,6 +6232,9 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_sextload_v4i16_to_v4i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6292,10 +6391,13 @@ define amdgpu_kernel void 
@constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_zextload_v8i16_to_v8i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_lshr_b32 s8, s5, 16 ; GCN-HSA-NEXT: s_lshr_b32 s2, s7, 16 @@ -6510,6 +6612,9 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_sextload_v8i16_to_v8i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6771,10 +6876,13 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-HSA-LABEL: constant_zextload_v16i16_to_v16i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_lshr_b32 s12, s5, 16 ; GCN-HSA-NEXT: s_lshr_b32 s13, s7, 16 @@ -7156,6 +7264,9 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-HSA-LABEL: constant_sextload_v16i16_to_v16i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: 
s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[12:19], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -7631,7 +7742,10 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 ; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -8354,7 +8468,10 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 ; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll index 120f47a277ee6..68a6a148819e8 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll @@ -23,6 +23,9 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspac ; GFX7-HSA-LABEL: constant_load_i32: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; 
GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -103,6 +106,9 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp ; GFX7-HSA-LABEL: constant_load_v2i32: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -190,6 +196,9 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp ; GFX7-HSA-LABEL: constant_load_v3i32: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0 @@ -284,6 +293,9 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp ; GFX7-HSA-LABEL: constant_load_v4i32: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -383,6 +395,9 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp ; GFX7-HSA-LABEL: constant_load_v8i32: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: 
s_add_u32 s10, s8, 16 @@ -517,6 +532,9 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp ; GFX7-HSA-LABEL: constant_load_v9i32: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s12, s[10:11], 0x8 ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -678,6 +696,9 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs ; GFX7-HSA-LABEL: constant_load_v10i32: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[12:13], s[10:11], 0x8 ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -847,6 +868,9 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs ; GFX7-HSA-LABEL: constant_load_v11i32: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x8 ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -1023,6 +1047,9 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs ; GFX7-HSA-LABEL: constant_load_v12i32: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: 
s_load_dwordx4 s[12:15], s[10:11], 0x8 ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -1202,7 +1229,10 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs ; ; GFX7-HSA-LABEL: constant_load_v16i32: ; GFX7-HSA: ; %bb.0: ; %entry +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 ; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX7-HSA-NEXT: s_add_u32 s18, s16, 48 @@ -1389,6 +1419,9 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p ; GFX7-HSA-LABEL: constant_zextload_i32_to_i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1473,6 +1506,9 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p ; GFX7-HSA-LABEL: constant_sextload_i32_to_i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -1563,6 +1599,9 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; GFX7-HSA-LABEL: constant_zextload_v1i32_to_v1i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, 0 ; 
GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1647,6 +1686,9 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; GFX7-HSA-LABEL: constant_sextload_v1i32_to_v1i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -1739,12 +1781,15 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; GFX7-HSA-LABEL: constant_zextload_v2i32_to_v2i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 @@ -1837,6 +1882,9 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; GFX7-HSA-LABEL: constant_sextload_v2i32_to_v2i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1949,13 +1997,16 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; GFX7-HSA-LABEL: 
constant_zextload_v4i32_to_v4i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 @@ -2082,6 +2133,9 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; GFX7-HSA-LABEL: constant_sextload_v4i32_to_v4i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2244,8 +2298,10 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; GFX7-HSA-LABEL: constant_zextload_v8i32_to_v8i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 48 @@ -2253,6 +2309,7 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 32 +; GFX7-HSA-NEXT: 
v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s11 @@ -2452,6 +2509,9 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; GFX7-HSA-LABEL: constant_sextload_v8i32_to_v8i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2748,7 +2808,10 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; ; GFX7-HSA-LABEL: constant_sextload_v16i32_to_v16i64: ; GFX7-HSA: ; %bb.0: +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 ; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3196,7 +3259,10 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) % ; ; GFX7-HSA-LABEL: constant_zextload_v16i32_to_v16i64: ; GFX7-HSA: ; %bb.0: +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 ; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3628,7 +3694,10 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; ; GFX7-HSA-LABEL: constant_sextload_v32i32_to_v32i64: ; GFX7-HSA: ; %bb.0: +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 ; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 
flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4479,8 +4548,10 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX7-HSA-LABEL: constant_zextload_v32i32_to_v32i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0xf0 ; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0 @@ -4509,6 +4580,7 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0x90 ; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s31 @@ -5097,7 +5169,10 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; ; GFX7-HSA-LABEL: constant_load_v32i32: ; GFX7-HSA: ; %bb.0: +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 ; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll index b3e75e767ae64..2219ceea7ec9b 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll @@ -22,6 +22,9 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspac ; GFX7-LABEL: constant_load_i64: ; GFX7: ; 
%bb.0: ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -95,6 +98,9 @@ define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrsp ; GFX7-LABEL: constant_load_v2i64: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-NEXT: v_mov_b32_e32 v4, s0 @@ -179,6 +185,9 @@ define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrsp ; GFX7-LABEL: constant_load_v3i64: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x4 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 @@ -294,6 +303,9 @@ define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrsp ; GFX7-LABEL: constant_load_v4i64: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-NEXT: s_add_u32 s10, s8, 16 @@ -421,7 +433,10 @@ define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-LABEL: constant_load_v8i64: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; 
GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX7-NEXT: s_add_u32 s18, s16, 48 @@ -638,7 +653,10 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; ; GFX7-LABEL: constant_load_v16i64: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index c608bef3f726e..4031be65fab61 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -27,6 +27,9 @@ define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace ; GFX7-HSA-LABEL: constant_load_i8: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -112,6 +115,9 @@ define amdgpu_kernel void @constant_load_v2i8(ptr addrspace(1) %out, ptr addrspa ; GFX7-HSA-LABEL: constant_load_v2i8: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -195,6 +201,9 @@ define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspa ; GFX7-HSA-LABEL: constant_load_v3i8: ; GFX7-HSA: ; %bb.0: 
; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -305,6 +314,9 @@ define amdgpu_kernel void @constant_load_v4i8(ptr addrspace(1) %out, ptr addrspa ; GFX7-HSA-LABEL: constant_load_v4i8: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -374,6 +386,9 @@ define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspa ; GFX7-HSA-LABEL: constant_load_v8i8: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -448,6 +463,9 @@ define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrsp ; GFX7-HSA-LABEL: constant_load_v16i8: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -529,6 +547,9 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, pt ; GFX7-HSA-LABEL: constant_zextload_i8_to_i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: 
s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -604,6 +625,9 @@ define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, pt ; GFX7-HSA-LABEL: constant_sextload_i8_to_i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -680,6 +704,9 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -755,6 +782,9 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -834,6 +864,9 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, 
s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -933,6 +966,9 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1030,6 +1066,9 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v3i8_to_v3i32: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0 @@ -1131,6 +1170,9 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v3i8_to_v3i32: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0 @@ -1232,6 +1274,9 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: 
s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1336,6 +1381,9 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1453,6 +1501,9 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1612,6 +1663,9 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1794,6 +1848,9 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 
flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2060,6 +2117,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2374,6 +2434,9 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2856,6 +2919,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3437,7 +3503,10 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_zextload_v64i8_to_v64i32: ; GFX7-HSA: ; %bb.0: +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 ; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 +; 
GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4353,7 +4422,10 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_sextload_v64i8_to_v64i32: ; GFX7-HSA: ; %bb.0: +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 ; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -5161,6 +5233,9 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt ; GFX7-HSA-LABEL: constant_zextload_i8_to_i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5243,6 +5318,9 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, pt ; GFX7-HSA-LABEL: constant_sextload_i8_to_i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5328,6 +5406,9 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 
flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5408,6 +5489,9 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5496,6 +5580,9 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5603,6 +5690,9 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5716,10 +5806,13 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: v_mov_b32_e32 
v1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_bfe_u32 s4, s2, 0x80008 ; GFX7-HSA-NEXT: s_lshr_b32 s3, s2, 24 @@ -5854,6 +5947,9 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6013,10 +6109,13 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_lshr_b32 s4, s2, 24 ; GFX7-HSA-NEXT: s_lshr_b32 s5, s3, 24 @@ -6235,6 +6334,9 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; 
GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6504,10 +6606,13 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_lshr_b32 s8, s5, 24 ; GFX7-HSA-NEXT: s_lshr_b32 s9, s4, 24 @@ -6898,6 +7003,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -7387,10 +7495,13 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_lshr_b32 s16, s8, 24 ; GFX7-HSA-NEXT: s_lshr_b32 s17, s9, 24 @@ -8128,6 +8239,9 @@ define amdgpu_kernel void 
@constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -8898,6 +9012,9 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, pt ; GFX7-HSA-LABEL: constant_zextload_i8_to_i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -8982,6 +9099,9 @@ define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, pt ; GFX7-HSA-LABEL: constant_sextload_i8_to_i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9068,6 +9188,9 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9152,6 +9275,9 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr 
addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9241,6 +9367,9 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9340,6 +9469,9 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9452,6 +9584,9 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -9560,6 +9695,9 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out ; GFX7-HSA-LABEL: 
constant_sextload_v4i8_to_v4i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -9683,6 +9821,9 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -9832,6 +9973,9 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -10014,6 +10158,9 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -10261,6 +10408,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o ; GFX7-HSA-LABEL: 
constant_sextload_v16i8_to_v16i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -10574,6 +10724,9 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -11018,6 +11171,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index c5771bc73b945..9054e509cde8e 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -28,6 +28,9 @@ define amdgpu_kernel void @global_load_i16(ptr addrspace(1) %out, ptr addrspace( ; GCN-HSA-LABEL: global_load_i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; 
GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -133,6 +136,9 @@ define amdgpu_kernel void @global_load_v2i16(ptr addrspace(1) %out, ptr addrspac ; GCN-HSA-LABEL: global_load_v2i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -219,6 +225,9 @@ define amdgpu_kernel void @global_load_v3i16(ptr addrspace(1) %out, ptr addrspac ; GCN-HSA-LABEL: global_load_v3i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -339,6 +348,9 @@ define amdgpu_kernel void @global_load_v4i16(ptr addrspace(1) %out, ptr addrspac ; GCN-HSA-LABEL: global_load_v4i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -424,6 +436,9 @@ define amdgpu_kernel void @global_load_v8i16(ptr addrspace(1) %out, ptr addrspac ; GCN-HSA-LABEL: global_load_v8i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -512,6 
+527,9 @@ define amdgpu_kernel void @global_load_v16i16(ptr addrspace(1) %out, ptr addrspa ; GCN-HSA-LABEL: global_load_v16i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -662,6 +680,9 @@ define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr a ; GCN-HSA-LABEL: global_load_v16i16_align2: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -811,6 +832,9 @@ define amdgpu_kernel void @global_zextload_i16_to_i32(ptr addrspace(1) %out, ptr ; GCN-HSA-LABEL: global_zextload_i16_to_i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -896,6 +920,9 @@ define amdgpu_kernel void @global_sextload_i16_to_i32(ptr addrspace(1) %out, ptr ; GCN-HSA-LABEL: global_sextload_i16_to_i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -984,6 +1011,9 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i32(ptr addrspace(1) %out, ; GCN-HSA-LABEL: 
global_zextload_v1i16_to_v1i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1069,6 +1099,9 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i32(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_sextload_v1i16_to_v1i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1159,6 +1192,9 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_zextload_v2i16_to_v2i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1258,6 +1294,9 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_sextload_v2i16_to_v2i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1359,6 +1398,9 @@ define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_zextload_v3i16_to_v3i32: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], 
s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1469,6 +1511,9 @@ define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_sextload_v3i16_to_v3i32: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1586,6 +1631,9 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_zextload_v4i16_to_v4i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1701,6 +1749,9 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_sextload_v4i16_to_v4i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1823,6 +1874,9 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 
flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1972,6 +2026,9 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_sextload_v8i16_to_v8i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -2136,6 +2193,9 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -2372,6 +2432,9 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -2643,6 +2706,9 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: 
s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -3054,6 +3120,9 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -3573,6 +3642,9 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: global_zextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -4377,6 +4449,9 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: global_sextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5142,6 +5217,9 @@ define amdgpu_kernel void @global_zextload_i16_to_i64(ptr addrspace(1) %out, ptr ; GCN-HSA-LABEL: global_zextload_i16_to_i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ 
-5239,6 +5317,9 @@ define amdgpu_kernel void @global_sextload_i16_to_i64(ptr addrspace(1) %out, ptr ; GCN-HSA-LABEL: global_sextload_i16_to_i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5334,6 +5415,9 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i64(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_zextload_v1i16_to_v1i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5426,6 +5510,9 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i64(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_sextload_v1i16_to_v1i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5524,6 +5611,9 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_zextload_v2i16_to_v2i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5633,6 +5723,9 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(ptr addrspace(1) %out, ; 
GCN-HSA-LABEL: global_sextload_v2i16_to_v2i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5751,6 +5844,9 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_zextload_v4i16_to_v4i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5896,6 +5992,9 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_sextload_v4i16_to_v4i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -6056,10 +6155,10 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, v4 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -6074,8 +6173,11 @@ define amdgpu_kernel void 
@global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, v4 @@ -6275,6 +6377,9 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_sextload_v8i16_to_v8i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -6525,10 +6630,10 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, v8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -6545,7 +6650,10 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, v8 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, v8 ; 
GCN-HSA-NEXT: v_mov_b32_e32 v18, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, v8 @@ -6905,6 +7013,9 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -7376,6 +7487,9 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -8078,6 +8192,9 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll index 033a66abcedb9..e8c862a3cb93c 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll @@ -27,6 +27,9 @@ define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace( ; GCNX3-HSA-LABEL: global_load_i32: ; GCNX3-HSA: ; %bb.0: ; %entry ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 
s12, s12, s17 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -106,6 +109,9 @@ define amdgpu_kernel void @global_load_v2i32(ptr addrspace(1) %out, ptr addrspac ; GCNX3-HSA-LABEL: global_load_v2i32: ; GCNX3-HSA: ; %bb.0: ; %entry ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -186,6 +192,9 @@ define amdgpu_kernel void @global_load_v3i32(ptr addrspace(1) %out, ptr addrspac ; GCNX3-HSA-LABEL: global_load_v3i32: ; GCNX3-HSA: ; %bb.0: ; %entry ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -270,6 +279,9 @@ define amdgpu_kernel void @global_load_v4i32(ptr addrspace(1) %out, ptr addrspac ; GCNX3-HSA-LABEL: global_load_v4i32: ; GCNX3-HSA: ; %bb.0: ; %entry ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -352,6 +364,9 @@ define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspac ; GCNX3-HSA-LABEL: global_load_v8i32: ; GCNX3-HSA: ; %bb.0: ; %entry ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_lshr_b32 
flat_scratch_hi, s12, 8 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 @@ -458,6 +473,9 @@ define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspac ; GCNX3-HSA-LABEL: global_load_v9i32: ; GCNX3-HSA: ; %bb.0: ; %entry ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 @@ -589,6 +607,9 @@ define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspa ; GCNX3-HSA-LABEL: global_load_v10i32: ; GCNX3-HSA: ; %bb.0: ; %entry ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 @@ -719,6 +740,9 @@ define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspa ; GCNX3-HSA-LABEL: global_load_v11i32: ; GCNX3-HSA: ; %bb.0: ; %entry ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 @@ -854,6 +878,9 @@ define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspa ; GCNX3-HSA-LABEL: global_load_v12i32: ; GCNX3-HSA: ; %bb.0: ; %entry ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCNX3-HSA-NEXT: 
s_mov_b32 flat_scratch_lo, s13 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 @@ -987,6 +1014,9 @@ define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspa ; GCNX3-HSA-LABEL: global_load_v16i32: ; GCNX3-HSA: ; %bb.0: ; %entry ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -1134,6 +1164,9 @@ define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr ; GCNX3-HSA-LABEL: global_zextload_i32_to_i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1217,6 +1250,9 @@ define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr ; GCNX3-HSA-LABEL: global_sextload_i32_to_i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1301,6 +1337,9 @@ define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out, ; GCNX3-HSA-LABEL: global_zextload_v1i32_to_v1i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; 
GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1384,6 +1423,9 @@ define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out, ; GCNX3-HSA-LABEL: global_sextload_v1i32_to_v1i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1471,6 +1513,9 @@ define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out, ; GCNX3-HSA-LABEL: global_zextload_v2i32_to_v2i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1569,6 +1614,9 @@ define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, ; GCNX3-HSA-LABEL: global_sextload_v2i32_to_v2i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1674,8 +1722,10 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ; GCNX3-HSA-LABEL: global_zextload_v4i32_to_v4i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCNX3-HSA-NEXT: 
v_mov_b32_e32 v5, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, v5 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1683,6 +1733,7 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, v5 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v2 @@ -1800,6 +1851,9 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, ; GCNX3-HSA-LABEL: global_sextload_v4i32_to_v4i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1941,8 +1995,10 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ; GCNX3-HSA-LABEL: global_zextload_v8i32_to_v8i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, v9 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1957,6 +2013,7 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 48 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, v9 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s0 
@@ -2134,6 +2191,9 @@ define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ; GCNX3-HSA-LABEL: global_sextload_v8i32_to_v8i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -2370,6 +2430,9 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; GCNX3-HSA-LABEL: global_sextload_v16i32_to_v16i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -2731,8 +2794,10 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; GCNX3-HSA-LABEL: global_zextload_v16i32_to_v16i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, v17 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -2766,6 +2831,7 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, v17 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s0 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3) @@ -3122,6 +3188,9 @@ define amdgpu_kernel void 
@global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-LABEL: global_sextload_v32i32_to_v32i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -3589,12 +3658,12 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; ; GCN-GFX900-HSA-LABEL: global_sextload_v32i32_to_v32i64: ; GCN-GFX900-HSA: ; %bb.0: -; GCN-GFX900-HSA-NEXT: s_mov_b64 s[18:19], s[2:3] -; GCN-GFX900-HSA-NEXT: s_mov_b64 s[16:17], s[0:1] +; GCN-GFX900-HSA-NEXT: s_mov_b64 s[22:23], s[2:3] +; GCN-GFX900-HSA-NEXT: s_mov_b64 s[20:21], s[0:1] ; GCN-GFX900-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v8, 0 -; GCN-GFX900-HSA-NEXT: s_add_u32 s16, s16, s15 -; GCN-GFX900-HSA-NEXT: s_addc_u32 s17, s17, 0 +; GCN-GFX900-HSA-NEXT: s_add_u32 s20, s20, s17 +; GCN-GFX900-HSA-NEXT: s_addc_u32 s21, s21, 0 ; GCN-GFX900-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:96 ; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:112 @@ -3620,11 +3689,11 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v0 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v4, v0 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v6, v1 -; GCN-GFX900-HSA-NEXT: buffer_store_dword v25, off, s[16:19], 0 ; 4-byte Folded Spill +; GCN-GFX900-HSA-NEXT: buffer_store_dword v25, off, s[20:23], 0 ; 4-byte Folded Spill ; GCN-GFX900-HSA-NEXT: s_nop 0 -; GCN-GFX900-HSA-NEXT: buffer_store_dword v26, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill -; GCN-GFX900-HSA-NEXT: buffer_store_dword v27, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill -; GCN-GFX900-HSA-NEXT: 
buffer_store_dword v28, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill +; GCN-GFX900-HSA-NEXT: buffer_store_dword v26, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill +; GCN-GFX900-HSA-NEXT: buffer_store_dword v27, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill +; GCN-GFX900-HSA-NEXT: buffer_store_dword v28, off, s[20:23], 0 offset:12 ; 4-byte Folded Spill ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(7) ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v12 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v11 @@ -3667,11 +3736,11 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[33:36], s[0:1] offset:224 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[29:32], s[0:1] offset:240 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:192 -; GCN-GFX900-HSA-NEXT: buffer_load_dword v32, off, s[16:19], 0 ; 4-byte Folded Reload +; GCN-GFX900-HSA-NEXT: buffer_load_dword v32, off, s[20:23], 0 ; 4-byte Folded Reload ; GCN-GFX900-HSA-NEXT: s_nop 0 -; GCN-GFX900-HSA-NEXT: buffer_load_dword v33, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload -; GCN-GFX900-HSA-NEXT: buffer_load_dword v34, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload -; GCN-GFX900-HSA-NEXT: buffer_load_dword v35, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload +; GCN-GFX900-HSA-NEXT: buffer_load_dword v33, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload +; GCN-GFX900-HSA-NEXT: buffer_load_dword v34, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload +; GCN-GFX900-HSA-NEXT: buffer_load_dword v35, off, s[20:23], 0 offset:12 ; 4-byte Folded Reload ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(8) ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v60, 31, v52 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v58, 31, v51 @@ -3913,6 +3982,9 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-LABEL: global_zextload_v32i32_to_v32i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], 
s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -4437,6 +4509,9 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa ; GCNX3-HSA-LABEL: global_load_v32i32: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 diff --git a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll index 4dfc773d615e4..1a6fa3c518ca7 100644 --- a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll @@ -13,7 +13,8 @@ ; GCN: s_cselect_b32 ; GCN-NOT: load_dword -; GCN: flat_load_dwordx2 +; GCN: flat_load_dword +; GCN: flat_load_dword ; GCN-NOT: load_dword ; GCN: flat_store_dwordx2 diff --git a/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll b/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll index 245a2775d9f2f..07b5e1610cfc0 100644 --- a/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll +++ b/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll @@ -9,7 +9,7 @@ declare ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #0 ; GCN-LABEL: {{^}}get_global_id_0: ; GCN: s_and_b32 [[WGSIZEX:s[0-9]+]], {{s[0-9]+}}, 0xffff -; GCN: s_mul_i32 [[MUL:s[0-9]+]], s12, [[WGSIZEX]] +; GCN: s_mul_i32 [[MUL:s[0-9]+]], s14, [[WGSIZEX]] ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, [[MUL]], v0 define amdgpu_kernel void @get_global_id_0(ptr addrspace(1) %out) #1 { %dispatch.ptr = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() diff --git a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll 
b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll index b3b529d4e5e5b..4896e504cfdf4 100644 --- a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll @@ -11,8 +11,8 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) { ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: s_mul_i32 s12, s12, s4 -; GFX9-NEXT: s_add_i32 s5, s5, s12 +; GFX9-NEXT: s_mul_i32 s14, s14, s4 +; GFX9-NEXT: s_add_i32 s5, s5, s14 ; GFX9-NEXT: v_add_u32_e32 v0, s5, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1] @@ -39,8 +39,8 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) { ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s4, s4, 0xffff -; GFX10-NEXT: s_mul_i32 s12, s12, s4 -; GFX10-NEXT: v_add3_u32 v0, s5, s12, v0 +; GFX10-NEXT: s_mul_i32 s14, s14, s4 +; GFX10-NEXT: v_add3_u32 v0, s5, s14, v0 ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX10-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1] ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v4 diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll index 88ee2a34dd49f..8d020b9e1a603 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll @@ -9,6 +9,8 @@ define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0 ; CHECK-LABEL: memcpy_p0_p0_minsize: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v12, s3 ; CHECK-NEXT: v_mov_b32_e32 v11, s2 @@ -94,12 +96,12 @@ entry: define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) 
%local, ptr addrspace(4) %0) #0 { ; CHECK-LABEL: memcpy_p5_p4_minsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3] -; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1] +; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3] +; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 ; CHECK-NEXT: s_load_dword s2, s[8:9], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v24, 0 -; CHECK-NEXT: s_add_u32 s16, s16, s15 +; CHECK-NEXT: s_add_u32 s20, s20, s17 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112 ; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96 @@ -107,50 +109,50 @@ define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64 ; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48 ; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:32 -; CHECK-NEXT: s_addc_u32 s17, s17, 0 +; CHECK-NEXT: s_addc_u32 s21, s21, 0 ; CHECK-NEXT: v_mov_b32_e32 v25, s2 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:124 -; CHECK-NEXT: buffer_store_dword v2, v25, s[16:19], 0 offen offset:120 -; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:116 -; CHECK-NEXT: buffer_store_dword v0, v25, s[16:19], 0 offen offset:112 +; CHECK-NEXT: buffer_store_dword v3, v25, s[20:23], 0 offen offset:124 +; CHECK-NEXT: buffer_store_dword v2, v25, s[20:23], 0 offen offset:120 +; CHECK-NEXT: buffer_store_dword v1, v25, s[20:23], 0 offen offset:116 +; CHECK-NEXT: buffer_store_dword v0, v25, s[20:23], 0 offen offset:112 ; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:108 -; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:104 -; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:100 -; CHECK-NEXT: 
buffer_store_dword v4, v25, s[16:19], 0 offen offset:96 +; CHECK-NEXT: buffer_store_dword v7, v25, s[20:23], 0 offen offset:108 +; CHECK-NEXT: buffer_store_dword v6, v25, s[20:23], 0 offen offset:104 +; CHECK-NEXT: buffer_store_dword v5, v25, s[20:23], 0 offen offset:100 +; CHECK-NEXT: buffer_store_dword v4, v25, s[20:23], 0 offen offset:96 ; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] ; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_dword v11, v25, s[16:19], 0 offen offset:92 -; CHECK-NEXT: buffer_store_dword v10, v25, s[16:19], 0 offen offset:88 -; CHECK-NEXT: buffer_store_dword v9, v25, s[16:19], 0 offen offset:84 -; CHECK-NEXT: buffer_store_dword v8, v25, s[16:19], 0 offen offset:80 +; CHECK-NEXT: buffer_store_dword v11, v25, s[20:23], 0 offen offset:92 +; CHECK-NEXT: buffer_store_dword v10, v25, s[20:23], 0 offen offset:88 +; CHECK-NEXT: buffer_store_dword v9, v25, s[20:23], 0 offen offset:84 +; CHECK-NEXT: buffer_store_dword v8, v25, s[20:23], 0 offen offset:80 ; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_dword v15, v25, s[16:19], 0 offen offset:76 -; CHECK-NEXT: buffer_store_dword v14, v25, s[16:19], 0 offen offset:72 -; CHECK-NEXT: buffer_store_dword v13, v25, s[16:19], 0 offen offset:68 -; CHECK-NEXT: buffer_store_dword v12, v25, s[16:19], 0 offen offset:64 +; CHECK-NEXT: buffer_store_dword v15, v25, s[20:23], 0 offen offset:76 +; CHECK-NEXT: buffer_store_dword v14, v25, s[20:23], 0 offen offset:72 +; CHECK-NEXT: buffer_store_dword v13, v25, s[20:23], 0 offen offset:68 +; CHECK-NEXT: buffer_store_dword v12, v25, s[20:23], 0 offen offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_dword v19, v25, s[16:19], 0 offen offset:60 -; CHECK-NEXT: buffer_store_dword v18, v25, s[16:19], 0 offen offset:56 -; CHECK-NEXT: buffer_store_dword v17, v25, s[16:19], 0 offen offset:52 -; CHECK-NEXT: buffer_store_dword v16, v25, s[16:19], 0 offen offset:48 +; CHECK-NEXT: buffer_store_dword v19, v25, s[20:23], 0 offen 
offset:60 +; CHECK-NEXT: buffer_store_dword v18, v25, s[20:23], 0 offen offset:56 +; CHECK-NEXT: buffer_store_dword v17, v25, s[20:23], 0 offen offset:52 +; CHECK-NEXT: buffer_store_dword v16, v25, s[20:23], 0 offen offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_dword v23, v25, s[16:19], 0 offen offset:44 -; CHECK-NEXT: buffer_store_dword v22, v25, s[16:19], 0 offen offset:40 -; CHECK-NEXT: buffer_store_dword v21, v25, s[16:19], 0 offen offset:36 -; CHECK-NEXT: buffer_store_dword v20, v25, s[16:19], 0 offen offset:32 +; CHECK-NEXT: buffer_store_dword v23, v25, s[20:23], 0 offen offset:44 +; CHECK-NEXT: buffer_store_dword v22, v25, s[20:23], 0 offen offset:40 +; CHECK-NEXT: buffer_store_dword v21, v25, s[20:23], 0 offen offset:36 +; CHECK-NEXT: buffer_store_dword v20, v25, s[20:23], 0 offen offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:28 -; CHECK-NEXT: buffer_store_dword v2, v25, s[16:19], 0 offen offset:24 -; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:20 -; CHECK-NEXT: buffer_store_dword v0, v25, s[16:19], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v3, v25, s[20:23], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v2, v25, s[20:23], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v1, v25, s[20:23], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v0, v25, s[20:23], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:12 -; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:8 -; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:4 -; CHECK-NEXT: buffer_store_dword v4, v25, s[16:19], 0 offen +; CHECK-NEXT: buffer_store_dword v7, v25, s[20:23], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v6, v25, s[20:23], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v5, v25, s[20:23], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v4, v25, 
s[20:23], 0 offen ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false) @@ -160,55 +162,57 @@ entry: define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %src) #0 { ; CHECK-LABEL: memcpy_p0_p5_minsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3] -; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1] +; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3] +; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] ; CHECK-NEXT: s_load_dword s0, s[8:9], 0x8 -; CHECK-NEXT: s_add_u32 s16, s16, s15 -; CHECK-NEXT: s_addc_u32 s17, s17, 0 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; CHECK-NEXT: s_add_u32 s20, s20, s17 +; CHECK-NEXT: s_addc_u32 s21, s21, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v26, s0 -; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:124 -; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:120 -; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:116 -; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:112 -; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:108 -; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:104 -; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:100 -; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen offset:96 +; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:124 +; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:120 +; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:116 +; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:112 +; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108 +; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:104 +; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100 +; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen 
offset:96 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CHECK-NEXT: buffer_load_dword v8, v26, s[16:19], 0 offen offset:16 -; CHECK-NEXT: buffer_load_dword v9, v26, s[16:19], 0 offen offset:20 -; CHECK-NEXT: buffer_load_dword v10, v26, s[16:19], 0 offen offset:24 -; CHECK-NEXT: buffer_load_dword v11, v26, s[16:19], 0 offen offset:28 -; CHECK-NEXT: buffer_load_dword v12, v26, s[16:19], 0 offen offset:32 -; CHECK-NEXT: buffer_load_dword v13, v26, s[16:19], 0 offen offset:36 -; CHECK-NEXT: buffer_load_dword v14, v26, s[16:19], 0 offen offset:40 -; CHECK-NEXT: buffer_load_dword v15, v26, s[16:19], 0 offen offset:44 -; CHECK-NEXT: buffer_load_dword v16, v26, s[16:19], 0 offen offset:48 -; CHECK-NEXT: buffer_load_dword v17, v26, s[16:19], 0 offen offset:52 -; CHECK-NEXT: buffer_load_dword v18, v26, s[16:19], 0 offen offset:56 -; CHECK-NEXT: buffer_load_dword v19, v26, s[16:19], 0 offen offset:60 -; CHECK-NEXT: buffer_load_dword v23, v26, s[16:19], 0 offen offset:92 -; CHECK-NEXT: buffer_load_dword v22, v26, s[16:19], 0 offen offset:88 -; CHECK-NEXT: buffer_load_dword v21, v26, s[16:19], 0 offen offset:84 -; CHECK-NEXT: buffer_load_dword v20, v26, s[16:19], 0 offen offset:80 +; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:32 +; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:36 +; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:40 +; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:44 +; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:48 +; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:52 +; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:56 +; CHECK-NEXT: 
buffer_load_dword v19, v26, s[20:23], 0 offen offset:60 +; CHECK-NEXT: buffer_load_dword v23, v26, s[20:23], 0 offen offset:92 +; CHECK-NEXT: buffer_load_dword v22, v26, s[20:23], 0 offen offset:88 +; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:84 +; CHECK-NEXT: buffer_load_dword v20, v26, s[20:23], 0 offen offset:80 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v25, s1 ; CHECK-NEXT: v_mov_b32_e32 v24, s0 ; CHECK-NEXT: s_waitcnt vmcnt(20) ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112 -; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:76 +; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:76 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:72 -; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:68 -; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:64 +; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:72 +; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:68 +; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96 -; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:8 -; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:12 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:80 ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:64 @@ -268,6 +272,8 @@ define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr 
%generic) #0 { ; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:2 offset1:3 ; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:4 offset1:5 ; CHECK-NEXT: ds_read2_b64 v[12:15], v16 offset0:6 offset1:7 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v21, s1 ; CHECK-NEXT: v_mov_b32_e32 v20, s0 @@ -294,6 +300,8 @@ define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 { ; CHECK-LABEL: memcpy_p0_p0_optsize: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v12, s3 ; CHECK-NEXT: v_mov_b32_e32 v11, s2 @@ -379,12 +387,12 @@ entry: define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr addrspace(4) %0) #1 { ; CHECK-LABEL: memcpy_p5_p4_optsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3] -; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1] +; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3] +; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 ; CHECK-NEXT: s_load_dword s2, s[8:9], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v24, 0 -; CHECK-NEXT: s_add_u32 s16, s16, s15 +; CHECK-NEXT: s_add_u32 s20, s20, s17 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112 ; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96 @@ -392,50 +400,50 @@ define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64 ; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48 ; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:32 -; CHECK-NEXT: s_addc_u32 s17, s17, 0 +; CHECK-NEXT: s_addc_u32 s21, s21, 0 ; CHECK-NEXT: v_mov_b32_e32 v25, s2 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; 
CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:124 -; CHECK-NEXT: buffer_store_dword v2, v25, s[16:19], 0 offen offset:120 -; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:116 -; CHECK-NEXT: buffer_store_dword v0, v25, s[16:19], 0 offen offset:112 +; CHECK-NEXT: buffer_store_dword v3, v25, s[20:23], 0 offen offset:124 +; CHECK-NEXT: buffer_store_dword v2, v25, s[20:23], 0 offen offset:120 +; CHECK-NEXT: buffer_store_dword v1, v25, s[20:23], 0 offen offset:116 +; CHECK-NEXT: buffer_store_dword v0, v25, s[20:23], 0 offen offset:112 ; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:108 -; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:104 -; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:100 -; CHECK-NEXT: buffer_store_dword v4, v25, s[16:19], 0 offen offset:96 +; CHECK-NEXT: buffer_store_dword v7, v25, s[20:23], 0 offen offset:108 +; CHECK-NEXT: buffer_store_dword v6, v25, s[20:23], 0 offen offset:104 +; CHECK-NEXT: buffer_store_dword v5, v25, s[20:23], 0 offen offset:100 +; CHECK-NEXT: buffer_store_dword v4, v25, s[20:23], 0 offen offset:96 ; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] ; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_dword v11, v25, s[16:19], 0 offen offset:92 -; CHECK-NEXT: buffer_store_dword v10, v25, s[16:19], 0 offen offset:88 -; CHECK-NEXT: buffer_store_dword v9, v25, s[16:19], 0 offen offset:84 -; CHECK-NEXT: buffer_store_dword v8, v25, s[16:19], 0 offen offset:80 +; CHECK-NEXT: buffer_store_dword v11, v25, s[20:23], 0 offen offset:92 +; CHECK-NEXT: buffer_store_dword v10, v25, s[20:23], 0 offen offset:88 +; CHECK-NEXT: buffer_store_dword v9, v25, s[20:23], 0 offen offset:84 +; CHECK-NEXT: buffer_store_dword v8, v25, s[20:23], 0 offen offset:80 ; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_dword v15, v25, s[16:19], 0 offen 
offset:76 -; CHECK-NEXT: buffer_store_dword v14, v25, s[16:19], 0 offen offset:72 -; CHECK-NEXT: buffer_store_dword v13, v25, s[16:19], 0 offen offset:68 -; CHECK-NEXT: buffer_store_dword v12, v25, s[16:19], 0 offen offset:64 +; CHECK-NEXT: buffer_store_dword v15, v25, s[20:23], 0 offen offset:76 +; CHECK-NEXT: buffer_store_dword v14, v25, s[20:23], 0 offen offset:72 +; CHECK-NEXT: buffer_store_dword v13, v25, s[20:23], 0 offen offset:68 +; CHECK-NEXT: buffer_store_dword v12, v25, s[20:23], 0 offen offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_dword v19, v25, s[16:19], 0 offen offset:60 -; CHECK-NEXT: buffer_store_dword v18, v25, s[16:19], 0 offen offset:56 -; CHECK-NEXT: buffer_store_dword v17, v25, s[16:19], 0 offen offset:52 -; CHECK-NEXT: buffer_store_dword v16, v25, s[16:19], 0 offen offset:48 +; CHECK-NEXT: buffer_store_dword v19, v25, s[20:23], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dword v18, v25, s[20:23], 0 offen offset:56 +; CHECK-NEXT: buffer_store_dword v17, v25, s[20:23], 0 offen offset:52 +; CHECK-NEXT: buffer_store_dword v16, v25, s[20:23], 0 offen offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_dword v23, v25, s[16:19], 0 offen offset:44 -; CHECK-NEXT: buffer_store_dword v22, v25, s[16:19], 0 offen offset:40 -; CHECK-NEXT: buffer_store_dword v21, v25, s[16:19], 0 offen offset:36 -; CHECK-NEXT: buffer_store_dword v20, v25, s[16:19], 0 offen offset:32 +; CHECK-NEXT: buffer_store_dword v23, v25, s[20:23], 0 offen offset:44 +; CHECK-NEXT: buffer_store_dword v22, v25, s[20:23], 0 offen offset:40 +; CHECK-NEXT: buffer_store_dword v21, v25, s[20:23], 0 offen offset:36 +; CHECK-NEXT: buffer_store_dword v20, v25, s[20:23], 0 offen offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:28 -; CHECK-NEXT: buffer_store_dword v2, v25, s[16:19], 0 offen offset:24 -; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:20 -; CHECK-NEXT: 
buffer_store_dword v0, v25, s[16:19], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v3, v25, s[20:23], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v2, v25, s[20:23], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v1, v25, s[20:23], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v0, v25, s[20:23], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:12 -; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:8 -; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:4 -; CHECK-NEXT: buffer_store_dword v4, v25, s[16:19], 0 offen +; CHECK-NEXT: buffer_store_dword v7, v25, s[20:23], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v6, v25, s[20:23], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v5, v25, s[20:23], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v4, v25, s[20:23], 0 offen ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false) @@ -445,55 +453,57 @@ entry: define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %src) #1 { ; CHECK-LABEL: memcpy_p0_p5_optsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3] -; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1] +; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3] +; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] ; CHECK-NEXT: s_load_dword s0, s[8:9], 0x8 -; CHECK-NEXT: s_add_u32 s16, s16, s15 -; CHECK-NEXT: s_addc_u32 s17, s17, 0 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; CHECK-NEXT: s_add_u32 s20, s20, s17 +; CHECK-NEXT: s_addc_u32 s21, s21, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v26, s0 -; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:124 -; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:120 -; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:116 -; CHECK-NEXT: 
buffer_load_dword v0, v26, s[16:19], 0 offen offset:112 -; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:108 -; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:104 -; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:100 -; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen offset:96 +; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:124 +; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:120 +; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:116 +; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:112 +; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108 +; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:104 +; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100 +; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:96 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CHECK-NEXT: buffer_load_dword v8, v26, s[16:19], 0 offen offset:16 -; CHECK-NEXT: buffer_load_dword v9, v26, s[16:19], 0 offen offset:20 -; CHECK-NEXT: buffer_load_dword v10, v26, s[16:19], 0 offen offset:24 -; CHECK-NEXT: buffer_load_dword v11, v26, s[16:19], 0 offen offset:28 -; CHECK-NEXT: buffer_load_dword v12, v26, s[16:19], 0 offen offset:32 -; CHECK-NEXT: buffer_load_dword v13, v26, s[16:19], 0 offen offset:36 -; CHECK-NEXT: buffer_load_dword v14, v26, s[16:19], 0 offen offset:40 -; CHECK-NEXT: buffer_load_dword v15, v26, s[16:19], 0 offen offset:44 -; CHECK-NEXT: buffer_load_dword v16, v26, s[16:19], 0 offen offset:48 -; CHECK-NEXT: buffer_load_dword v17, v26, s[16:19], 0 offen offset:52 -; CHECK-NEXT: buffer_load_dword v18, v26, s[16:19], 0 offen offset:56 -; CHECK-NEXT: buffer_load_dword v19, v26, s[16:19], 0 offen offset:60 -; CHECK-NEXT: buffer_load_dword v23, v26, s[16:19], 0 offen offset:92 -; CHECK-NEXT: buffer_load_dword v22, v26, s[16:19], 0 offen offset:88 -; CHECK-NEXT: buffer_load_dword v21, v26, s[16:19], 
0 offen offset:84 -; CHECK-NEXT: buffer_load_dword v20, v26, s[16:19], 0 offen offset:80 +; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:32 +; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:36 +; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:40 +; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:44 +; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:48 +; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:52 +; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:56 +; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:60 +; CHECK-NEXT: buffer_load_dword v23, v26, s[20:23], 0 offen offset:92 +; CHECK-NEXT: buffer_load_dword v22, v26, s[20:23], 0 offen offset:88 +; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:84 +; CHECK-NEXT: buffer_load_dword v20, v26, s[20:23], 0 offen offset:80 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v25, s1 ; CHECK-NEXT: v_mov_b32_e32 v24, s0 ; CHECK-NEXT: s_waitcnt vmcnt(20) ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112 -; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:76 +; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:76 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:72 -; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:68 -; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:64 +; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:72 +; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:68 +; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen 
offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96 -; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:8 -; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:12 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:80 ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:64 @@ -553,6 +563,8 @@ define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 { ; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:2 offset1:3 ; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:4 offset1:5 ; CHECK-NEXT: ds_read2_b64 v[12:15], v16 offset0:6 offset1:7 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v21, s1 ; CHECK-NEXT: v_mov_b32_e32 v20, s0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll index 5af37809443e0..07ad8cb0c4a3d 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll @@ -15,6 +15,9 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; GFX7-LABEL: flat_agent_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -29,6 +32,10 @@ define 
amdgpu_kernel void @flat_agent_unordered_load( ; ; GFX10-WGP-LABEL: flat_agent_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -43,6 +50,10 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; ; GFX10-CU-LABEL: flat_agent_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -71,6 +82,8 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -83,6 +96,8 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -182,6 +197,9 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_load( ; GFX7-LABEL: 
flat_agent_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -196,6 +214,10 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; ; GFX10-WGP-LABEL: flat_agent_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -210,6 +232,10 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; ; GFX10-CU-LABEL: flat_agent_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -238,6 +264,8 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -250,6 +278,8 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 
flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -349,6 +379,9 @@ entry: define amdgpu_kernel void @flat_agent_acquire_load( ; GFX7-LABEL: flat_agent_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -364,6 +397,10 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; ; GFX10-WGP-LABEL: flat_agent_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -380,6 +417,10 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; ; GFX10-CU-LABEL: flat_agent_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -410,6 +451,8 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], 
s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -423,6 +466,8 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -531,6 +576,9 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_load( ; GFX7-LABEL: flat_agent_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -547,6 +595,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -565,6 +617,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -598,6 +654,8 @@ define 
amdgpu_kernel void @flat_agent_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -612,6 +670,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -739,6 +799,9 @@ entry: define amdgpu_kernel void @flat_agent_unordered_store( ; GFX7-LABEL: flat_agent_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -750,6 +813,10 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; ; GFX10-WGP-LABEL: flat_agent_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -761,6 +828,10 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; ; GFX10-CU-LABEL: flat_agent_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: 
s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -783,6 +854,8 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -793,6 +866,8 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -873,6 +948,9 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_store( ; GFX7-LABEL: flat_agent_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -884,6 +962,10 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; ; GFX10-WGP-LABEL: flat_agent_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -895,6 +977,10 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; ; GFX10-CU-LABEL: flat_agent_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -917,6 +1003,8 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -927,6 +1015,8 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1007,6 +1097,9 @@ entry: define amdgpu_kernel void @flat_agent_release_store( ; GFX7-LABEL: flat_agent_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1019,6 +1112,10 @@ define amdgpu_kernel void @flat_agent_release_store( ; ; GFX10-WGP-LABEL: 
flat_agent_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1032,6 +1129,10 @@ define amdgpu_kernel void @flat_agent_release_store( ; ; GFX10-CU-LABEL: flat_agent_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1057,6 +1158,8 @@ define amdgpu_kernel void @flat_agent_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1068,6 +1171,8 @@ define amdgpu_kernel void @flat_agent_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1165,6 +1270,9 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_store( ; GFX7-LABEL: flat_agent_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; 
GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1177,6 +1285,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1190,6 +1302,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1215,6 +1331,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1226,6 +1344,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1323,6 +1443,9 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; GFX7-LABEL: flat_agent_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1334,6 +1457,10 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1345,6 +1472,10 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1367,6 +1498,8 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) @@ -1377,6 +1510,8 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1457,6 +1592,9 @@ entry: define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; GFX7-LABEL: flat_agent_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1470,6 +1608,10 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1485,6 +1627,10 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1512,6 +1658,8 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: 
flat_agent_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1524,6 +1672,8 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1622,6 +1772,9 @@ entry: define amdgpu_kernel void @flat_agent_release_atomicrmw( ; GFX7-LABEL: flat_agent_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1634,6 +1787,10 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1647,6 +1804,10 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1672,6 +1833,8 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1683,6 +1846,8 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1780,6 +1945,9 @@ entry: define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; GFX7-LABEL: flat_agent_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1794,6 +1962,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: 
s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1811,6 +1983,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1841,6 +2017,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1854,6 +2032,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1969,6 +2149,9 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; GFX7-LABEL: flat_agent_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1983,6 +2166,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: 
flat_agent_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2000,6 +2187,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2030,6 +2221,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2043,6 +2236,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2158,6 +2353,9 @@ entry: define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_agent_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; 
GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2174,6 +2372,10 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2191,6 +2393,10 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2223,6 +2429,8 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2237,6 +2445,8 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; 
GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2352,6 +2562,9 @@ entry: define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_agent_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2369,6 +2582,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2388,6 +2605,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2423,6 +2644,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2438,6 +2661,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2574,6 +2799,9 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_agent_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2591,6 +2819,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2610,6 +2842,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 
0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2645,6 +2881,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2660,6 +2898,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2796,6 +3036,9 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -2821,6 +3064,10 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2846,6 +3093,10 @@ define amdgpu_kernel void 
@flat_agent_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2896,6 +3147,8 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -2910,6 +3163,8 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3019,6 +3274,9 @@ entry: define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3046,6 +3304,10 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: 
flat_agent_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3075,6 +3337,10 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3130,6 +3396,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3146,6 +3414,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3273,6 +3543,9 @@ entry: define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; 
GFX7-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3299,6 +3572,10 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3326,6 +3603,10 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3379,6 +3660,8 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3394,6 +3677,8 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: 
flat_agent_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3520,6 +3805,9 @@ entry: define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3548,6 +3836,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3579,6 +3871,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3637,6 +3933,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; 
GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3654,6 +3952,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3798,6 +4098,9 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3826,6 +4129,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3857,6 +4164,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; 
GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3915,6 +4226,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3932,6 +4245,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4076,6 +4391,9 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4103,6 +4421,10 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4132,6 +4454,10 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4187,6 +4513,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4203,6 +4531,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4330,6 +4660,9 @@ entry: define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 
flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4357,6 +4690,10 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4386,6 +4723,10 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4441,6 +4782,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4457,6 +4800,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4584,6 +4929,9 @@ entry: define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4612,6 +4960,10 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4643,6 +4995,10 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4701,6 +5057,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], 
s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4718,6 +5076,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4862,6 +5222,9 @@ entry: define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4890,6 +5253,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4921,6 +5288,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 
s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4979,6 +5350,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4996,6 +5369,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5140,6 +5515,9 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5168,6 +5546,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5199,6 +5581,10 @@ 
define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5257,6 +5643,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5274,6 +5662,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5418,6 +5808,9 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5446,6 +5839,10 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: 
flat_agent_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5477,6 +5874,10 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5535,6 +5936,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5552,6 +5955,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5696,6 +6101,9 @@ entry: define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; 
GFX7-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5724,6 +6132,10 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5755,6 +6167,10 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5813,6 +6229,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5830,6 +6248,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: 
flat_agent_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5974,6 +6394,9 @@ entry: define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6002,6 +6425,10 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6033,6 +6460,10 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6091,6 +6522,8 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: 
; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6108,6 +6541,8 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6252,6 +6687,9 @@ entry: define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6280,6 +6718,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6311,6 +6753,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; 
GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6369,6 +6815,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6386,6 +6834,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6530,6 +6980,9 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6558,6 +7011,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6589,6 +7046,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6647,6 +7108,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6664,6 +7127,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6808,6 +7273,9 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; 
GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6837,6 +7305,10 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6866,6 +7338,10 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6924,6 +7400,8 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6941,6 +7419,8 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7077,6 +7557,9 @@ entry: define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7107,6 +7590,10 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7138,6 +7625,10 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7198,6 +7689,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7216,6 +7709,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7361,6 +7856,9 @@ entry: define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7391,6 +7889,10 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7422,6 +7924,10 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7483,6 +7989,8 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7501,6 +8009,8 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7654,6 +8164,9 @@ entry: define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7685,6 +8198,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: 
s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7718,6 +8235,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7781,6 +8302,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7800,6 +8323,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7966,6 +8491,9 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: 
s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7997,6 +8525,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8030,6 +8562,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8093,6 +8629,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8112,6 +8650,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 
s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8278,6 +8818,9 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8308,6 +8851,10 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8339,6 +8886,10 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8399,6 +8950,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8417,6 +8970,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8566,6 +9121,9 @@ entry: define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8596,6 +9154,10 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8627,6 +9189,10 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: 
s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8687,6 +9253,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8705,6 +9273,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8850,6 +9420,9 @@ entry: define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8881,6 +9454,10 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 
s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8914,6 +9491,10 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8977,6 +9558,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8996,6 +9579,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9162,6 +9747,9 @@ entry: define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ 
-9193,6 +9781,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9226,6 +9818,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9289,6 +9885,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9308,6 +9906,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: 
s_load_dword s6, s[8:9], 0xc @@ -9474,6 +10074,9 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9505,6 +10108,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9538,6 +10145,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9601,6 +10212,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dword s6, s[8:9], 0xc @@ -9620,6 +10233,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9786,6 +10401,9 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9817,6 +10435,10 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9850,6 +10472,10 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword 
s9, s[6:7], 0x8 @@ -9913,6 +10539,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9932,6 +10560,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10098,6 +10728,9 @@ entry: define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -10129,6 +10762,10 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -10162,6 +10799,10 @@ define 
amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -10225,6 +10866,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10244,6 +10887,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10406,6 +11051,9 @@ entry: define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -10437,6 +11085,10 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; ; 
GFX10-WGP-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -10470,6 +11122,10 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -10533,6 +11189,8 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10552,6 +11210,8 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10718,6 +11378,9 @@ entry: define amdgpu_kernel void 
@flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -10749,6 +11412,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -10782,6 +11449,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -10845,6 +11516,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10864,6 +11537,8 @@ define amdgpu_kernel void 
@flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11030,6 +11705,9 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -11061,6 +11739,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -11094,6 +11776,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -11157,6 +11843,8 @@ define amdgpu_kernel void 
@flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11176,6 +11864,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11342,6 +12032,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; GFX7-LABEL: flat_agent_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11356,6 +12049,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; ; GFX10-WGP-LABEL: flat_agent_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11370,6 +12067,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; ; GFX10-CU-LABEL: 
flat_agent_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11398,6 +12099,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11410,6 +12113,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11509,6 +12214,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; GFX7-LABEL: flat_agent_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11523,6 +12231,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; 
GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11537,6 +12249,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11565,6 +12281,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11577,6 +12295,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11676,6 +12396,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; GFX7-LABEL: flat_agent_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: 
s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11692,6 +12415,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11709,6 +12436,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11741,6 +12472,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11755,6 +12488,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 
flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11868,6 +12603,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; GFX7-LABEL: flat_agent_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11885,6 +12623,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11904,6 +12646,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11939,6 +12685,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 
s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11954,6 +12702,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12086,6 +12836,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; GFX7-LABEL: flat_agent_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12097,6 +12850,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; ; GFX10-WGP-LABEL: flat_agent_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12108,6 +12865,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; ; GFX10-CU-LABEL: flat_agent_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], 
s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12130,6 +12891,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12140,6 +12903,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12220,6 +12985,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; GFX7-LABEL: flat_agent_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12231,6 +12999,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12242,6 +13014,10 @@ define amdgpu_kernel void 
@flat_agent_one_as_monotonic_store( ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12264,6 +13040,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12274,6 +13052,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12354,6 +13134,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_release_store( ; GFX7-LABEL: flat_agent_one_as_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12366,6 +13149,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; 
GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12379,6 +13166,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; ; GFX10-CU-LABEL: flat_agent_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12404,6 +13195,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12415,6 +13208,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12512,6 +13307,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; GFX7-LABEL: flat_agent_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; 
GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12524,6 +13322,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12537,6 +13339,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12562,6 +13368,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12573,6 +13381,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 
flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12670,6 +13480,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; GFX7-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12681,6 +13494,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12692,6 +13509,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12714,6 +13535,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12724,6 +13547,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12804,6 +13629,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; GFX7-LABEL: flat_agent_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12817,6 +13645,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12831,6 +13663,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 
s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12857,6 +13693,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12869,6 +13707,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12965,6 +13805,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; GFX7-LABEL: flat_agent_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12977,6 +13820,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt 
lgkmcnt(0) @@ -12990,6 +13837,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13015,6 +13866,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13026,6 +13879,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13123,6 +13978,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; GFX7-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13137,6 +13995,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; ; 
GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13153,6 +14015,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13182,6 +14048,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13195,6 +14063,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13308,6 +14178,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; 
GFX7-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13322,6 +14195,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13338,6 +14215,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13367,6 +14248,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13380,6 +14263,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: 
flat_agent_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13493,6 +14378,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13510,6 +14398,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13528,6 +14420,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13562,6 +14458,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: 
flat_agent_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13577,6 +14475,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13697,6 +14597,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13715,6 +14618,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13735,6 +14642,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; 
GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13772,6 +14683,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13788,6 +14701,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13929,6 +14844,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13947,6 +14865,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, 
s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13967,6 +14889,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14004,6 +14930,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14020,6 +14948,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14161,6 +15091,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; 
GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14186,6 +15119,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14211,6 +15148,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14261,6 +15202,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14275,6 +15218,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; 
GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14384,6 +15329,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14411,6 +15359,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14439,6 +15391,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14493,6 +15449,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: 
flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14509,6 +15467,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14634,6 +15594,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14660,6 +15623,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14687,6 +15654,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: 
flat_agent_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14740,6 +15711,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14755,6 +15728,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14881,6 +15856,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14909,6 +15887,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: 
flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14939,6 +15921,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14996,6 +15982,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15013,6 +16001,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15155,6 +16145,9 @@ entry: define 
amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15183,6 +16176,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15213,6 +16210,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15270,6 +16271,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 
0xc @@ -15287,6 +16290,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15429,6 +16434,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15456,6 +16464,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15484,6 +16496,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: 
s_load_dword s7, s[4:5], 0x8 @@ -15538,6 +16554,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15554,6 +16572,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15679,6 +16699,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15706,6 +16729,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, 
s[4:5], 0x8 @@ -15734,6 +16761,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15788,6 +16819,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15804,6 +16837,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15929,6 +16964,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15957,6 +16995,10 @@ 
define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15987,6 +17029,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16044,6 +17090,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16061,6 +17109,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; 
GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16203,6 +17253,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16231,6 +17284,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16261,6 +17318,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16318,6 +17379,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16335,6 +17398,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16477,6 +17542,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16505,6 +17573,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16535,6 +17607,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; 
GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16592,6 +17668,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16609,6 +17687,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16751,6 +17831,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16779,6 +17862,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: 
s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16809,6 +17896,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16866,6 +17957,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16883,6 +17976,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17025,6 +18120,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], 
s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17053,6 +18151,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17083,6 +18185,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17140,6 +18246,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17157,6 +18265,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], 
s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17299,6 +18409,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17327,6 +18440,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17357,6 +18474,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17414,6 +18535,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17431,6 +18554,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17573,6 +18698,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17601,6 +18729,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17631,6 +18763,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17688,6 +18824,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17705,6 +18843,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17847,6 +18987,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17875,6 +19018,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; 
GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17905,6 +19052,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17962,6 +19113,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17979,6 +19132,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18121,6 +19276,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; 
GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18150,6 +19308,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18179,6 +19341,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18237,6 +19403,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18254,6 +19422,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: 
s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18390,6 +19560,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18421,6 +19594,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18453,6 +19630,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18515,6 +19696,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; 
GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18534,6 +19717,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18684,6 +19869,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18714,6 +19902,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18745,6 +19937,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: 
flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18806,6 +20002,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18824,6 +20022,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18977,6 +20177,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19009,6 +20212,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; 
GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19043,6 +20250,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19108,6 +20319,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19128,6 +20341,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 
0xc @@ -19299,6 +20514,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19331,6 +20549,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19365,6 +20587,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19430,6 +20656,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19450,6 +20678,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19621,6 +20851,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19652,6 +20885,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19684,6 +20921,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; 
GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19746,6 +20987,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19765,6 +21008,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19919,6 +21164,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19950,6 +21198,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), 
s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19982,6 +21234,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -20044,6 +21300,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20063,6 +21321,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20213,6 +21473,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 
flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -20245,6 +21508,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -20279,6 +21546,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -20344,6 +21615,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20364,6 +21637,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 
flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20535,6 +21810,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -20567,6 +21845,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -20601,6 +21883,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -20666,6 +21952,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; 
%entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20686,6 +21974,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20857,6 +22147,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -20889,6 +22182,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -20923,6 +22220,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; 
GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -20988,6 +22289,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21008,6 +22311,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21179,6 +22484,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -21211,6 +22519,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry 
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -21245,6 +22557,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -21310,6 +22626,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21330,6 +22648,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21501,6 +22821,9 @@ entry: define amdgpu_kernel void 
@flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -21533,6 +22856,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -21567,6 +22894,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -21632,6 +22963,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 
@@ -21652,6 +22985,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21819,6 +23154,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -21851,6 +23189,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -21885,6 +23227,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; 
GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -21950,6 +23296,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21970,6 +23318,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22141,6 +23491,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -22173,6 +23526,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; 
GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -22207,6 +23564,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -22272,6 +23633,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22292,6 +23655,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22463,6 +23828,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; 
GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -22495,6 +23863,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -22529,6 +23901,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -22594,6 +23970,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22614,6 +23992,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll index 30c0a322d7ddc..3c24c36ec547d 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll @@ -15,6 +15,9 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX7-LABEL: flat_nontemporal_load_0: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -29,6 +32,10 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX10-WGP-LABEL: flat_nontemporal_load_0: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -43,6 +50,10 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX10-CU-LABEL: flat_nontemporal_load_0: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -71,6 +82,8 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_load_0: ; GFX90A-NOTTGSPLIT: ; 
%bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -83,6 +96,8 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_load_0: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -182,6 +197,9 @@ entry: define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX7-LABEL: flat_nontemporal_load_1: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -211,6 +229,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; GFX10-WGP-LABEL: flat_nontemporal_load_1: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -240,6 +262,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; GFX10-CU-LABEL: flat_nontemporal_load_1: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; 
GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -298,6 +324,8 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_load_1: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -329,6 +357,8 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_load_1: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -537,6 +567,9 @@ entry: define amdgpu_kernel void @flat_nontemporal_store_0( ; GFX7-LABEL: flat_nontemporal_store_0: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -551,6 +584,10 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX10-WGP-LABEL: flat_nontemporal_store_0: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -565,6 +602,10 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX10-CU-LABEL: 
flat_nontemporal_store_0: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -593,6 +634,8 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_store_0: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -605,6 +648,8 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_store_0: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -704,6 +749,9 @@ entry: define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX7-LABEL: flat_nontemporal_store_1: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -732,6 +780,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; GFX10-WGP-LABEL: flat_nontemporal_store_1: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -759,6 +811,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; GFX10-CU-LABEL: flat_nontemporal_store_1: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -814,6 +870,8 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_store_1: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -843,6 +901,8 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_store_1: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1047,6 +1107,9 @@ entry: define amdgpu_kernel void @flat_nontemporal_volatile_load( ; GFX7-LABEL: flat_nontemporal_volatile_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: 
s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1062,6 +1125,10 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; ; GFX10-WGP-LABEL: flat_nontemporal_volatile_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1077,6 +1144,10 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; ; GFX10-CU-LABEL: flat_nontemporal_volatile_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1107,6 +1178,8 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_volatile_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1120,6 +1193,8 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_volatile_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; 
GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll index b80dfaea01653..b88a10ab24a98 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll @@ -15,6 +15,9 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; GFX7-LABEL: flat_singlethread_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -29,6 +32,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; ; GFX10-WGP-LABEL: flat_singlethread_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -43,6 +50,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; ; GFX10-CU-LABEL: flat_singlethread_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -71,6 +82,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -83,6 +96,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -182,6 +197,9 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_load( ; GFX7-LABEL: flat_singlethread_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -196,6 +214,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -210,6 +232,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), 
s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -238,6 +264,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -250,6 +278,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -349,6 +379,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_load( ; GFX7-LABEL: flat_singlethread_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -363,6 +396,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; 
GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -377,6 +414,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -405,6 +446,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -417,6 +460,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -516,6 +561,9 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; GFX7-LABEL: flat_singlethread_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -530,6 +578,10 @@ define amdgpu_kernel void 
@flat_singlethread_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -544,6 +596,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -572,6 +628,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -584,6 +642,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -683,6 +743,9 @@ entry: define amdgpu_kernel void @flat_singlethread_unordered_store( ; 
GFX7-LABEL: flat_singlethread_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -694,6 +757,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; ; GFX10-WGP-LABEL: flat_singlethread_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -705,6 +772,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; ; GFX10-CU-LABEL: flat_singlethread_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -727,6 +798,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -737,6 +810,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_store: ; GFX90A-TGSPLIT: ; 
%bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -817,6 +892,9 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_store( ; GFX7-LABEL: flat_singlethread_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -828,6 +906,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -839,6 +921,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -861,6 +947,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; 
GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -871,6 +959,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -951,6 +1041,9 @@ entry: define amdgpu_kernel void @flat_singlethread_release_store( ; GFX7-LABEL: flat_singlethread_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -962,6 +1055,10 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; ; GFX10-WGP-LABEL: flat_singlethread_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -973,6 +1070,10 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; ; GFX10-CU-LABEL: flat_singlethread_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: 
s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -995,6 +1096,8 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1005,6 +1108,8 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1085,6 +1190,9 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; GFX7-LABEL: flat_singlethread_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1096,6 +1204,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1107,6 +1219,10 @@ 
define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1129,6 +1245,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1139,6 +1257,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1219,6 +1339,9 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; GFX7-LABEL: flat_singlethread_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1230,6 +1353,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_atomicrmw: ; GFX10-WGP: 
; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1241,6 +1368,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1263,6 +1394,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1273,6 +1406,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1353,6 +1488,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; GFX7-LABEL: flat_singlethread_acquire_atomicrmw: ; GFX7: ; %bb.0: ; 
%entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1364,6 +1502,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1375,6 +1517,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1397,6 +1543,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1407,6 +1555,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: 
s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1487,6 +1637,9 @@ entry: define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; GFX7-LABEL: flat_singlethread_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1498,6 +1651,10 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1509,6 +1666,10 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1531,6 +1692,8 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: 
s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1541,6 +1704,8 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1621,6 +1786,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; GFX7-LABEL: flat_singlethread_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1632,6 +1800,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1643,6 +1815,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; 
GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1665,6 +1841,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1675,6 +1853,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1755,6 +1935,9 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; GFX7-LABEL: flat_singlethread_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1766,6 +1949,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; 
GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1777,6 +1964,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1799,6 +1990,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1809,6 +2002,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1889,6 +2084,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_singlethread_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1904,6 +2102,10 @@ define amdgpu_kernel void 
@flat_singlethread_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1919,6 +2121,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1949,6 +2155,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1962,6 +2170,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2068,6 +2278,9 @@ entry: define 
amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2083,6 +2296,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2098,6 +2315,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2128,6 +2349,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2141,6 +2364,8 @@ define amdgpu_kernel void 
@flat_singlethread_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2247,6 +2472,9 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2262,6 +2490,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2277,6 +2509,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2307,6 +2543,8 @@ define amdgpu_kernel void 
@flat_singlethread_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2320,6 +2558,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2426,6 +2666,9 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -2451,6 +2694,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2476,6 +2723,10 @@ define amdgpu_kernel void 
@flat_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2526,6 +2777,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -2540,6 +2793,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -2649,6 +2904,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -2674,6 +2932,10 @@ define amdgpu_kernel void 
@flat_singlethread_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2699,6 +2961,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2749,6 +3015,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -2763,6 +3031,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: 
s_load_dword s6, s[8:9], 0xc @@ -2872,6 +3142,9 @@ entry: define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -2897,6 +3170,10 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2922,6 +3199,10 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2972,6 +3253,8 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, 
s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -2986,6 +3269,8 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3095,6 +3380,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3120,6 +3408,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3145,6 +3437,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: 
s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3195,6 +3491,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3209,6 +3507,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3318,6 +3618,9 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3343,6 +3646,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], 
s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3368,6 +3675,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3418,6 +3729,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3432,6 +3745,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3541,6 +3856,9 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: 
s_load_dword s7, s[4:5], 0x2 @@ -3566,6 +3884,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3591,6 +3913,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3641,6 +3967,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3655,6 +3983,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; 
GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3764,6 +4094,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3789,6 +4122,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3814,6 +4151,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3864,6 +4205,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], 
s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3878,6 +4221,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3987,6 +4332,9 @@ entry: define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4012,6 +4360,10 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4037,6 +4389,10 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: 
s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4087,6 +4443,8 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4101,6 +4459,8 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4210,6 +4570,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4235,6 +4598,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; 
GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4260,6 +4627,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4310,6 +4681,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4324,6 +4697,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4433,6 +4808,9 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], 
s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4458,6 +4836,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4483,6 +4865,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4533,6 +4919,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4547,6 +4935,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 
0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4656,6 +5046,9 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4681,6 +5074,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4706,6 +5103,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4756,6 +5157,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4770,6 +5173,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4879,6 +5284,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4904,6 +5312,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4929,6 +5341,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), 
s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4979,6 +5395,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4993,6 +5411,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5102,6 +5522,9 @@ entry: define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5127,6 +5550,10 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], 
s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5152,6 +5579,10 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5202,6 +5633,8 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5216,6 +5649,8 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5325,6 +5760,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 
s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5350,6 +5788,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5375,6 +5817,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5425,6 +5871,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5439,6 +5887,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], 
s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5548,6 +5998,9 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5573,6 +6026,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5598,6 +6055,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5648,6 +6109,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5662,6 +6125,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5771,6 +6236,9 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -5800,6 +6268,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -5829,6 +6301,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; 
GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -5887,6 +6363,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5904,6 +6382,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6040,6 +6520,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6069,6 +6552,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6098,6 +6585,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6156,6 +6647,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6173,6 +6666,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6309,6 +6804,9 @@ entry: define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: 
s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6338,6 +6836,10 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6367,6 +6869,10 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6425,6 +6931,8 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6442,6 +6950,8 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: 
flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6578,6 +7088,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6607,6 +7120,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6636,6 +7153,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6694,6 +7215,8 @@ define amdgpu_kernel void 
@flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6711,6 +7234,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6847,6 +7372,9 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6876,6 +7404,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6905,6 +7437,10 @@ define 
amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6963,6 +7499,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6980,6 +7518,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7116,6 +7656,9 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7145,6 +7688,10 @@ define 
amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7174,6 +7721,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7232,6 +7783,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7249,6 +7802,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: 
s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7385,6 +7940,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7414,6 +7972,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7443,6 +8005,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7501,6 +8067,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 
s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7518,6 +8086,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7654,6 +8224,9 @@ entry: define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7683,6 +8256,10 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7712,6 +8289,10 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7770,6 +8351,8 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7787,6 +8370,8 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7923,6 +8508,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7952,6 +8540,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7981,6 +8573,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8039,6 +8635,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8056,6 +8654,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8192,6 +8792,9 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: 
s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8221,6 +8824,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8250,6 +8857,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8308,6 +8919,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8325,6 +8938,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 
flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8461,6 +9076,9 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8490,6 +9108,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8519,6 +9141,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8577,6 +9203,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; 
%bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8594,6 +9222,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8730,6 +9360,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8759,6 +9392,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8788,6 +9425,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; 
GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8846,6 +9487,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8863,6 +9506,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8999,6 +9644,9 @@ entry: define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9028,6 +9676,10 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; 
GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9057,6 +9709,10 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9115,6 +9771,8 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9132,6 +9790,8 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9268,6 +9928,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; 
GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9297,6 +9960,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9326,6 +9993,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9384,6 +10055,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9401,6 +10074,8 @@ define amdgpu_kernel void 
@flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9537,6 +10212,9 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9566,6 +10244,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9595,6 +10277,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9653,6 
+10339,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9670,6 +10358,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9806,6 +10496,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; GFX7-LABEL: flat_singlethread_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9820,6 +10513,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9834,6 +10531,10 @@ define 
amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9862,6 +10563,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9874,6 +10577,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9973,6 +10678,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9987,6 +10695,10 @@ define amdgpu_kernel void 
@flat_singlethread_one_as_monotonic_load( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10001,6 +10713,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10029,6 +10745,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10041,6 +10759,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ 
-10140,6 +10860,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; GFX7-LABEL: flat_singlethread_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10154,6 +10877,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10168,6 +10895,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10196,6 +10927,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ 
-10208,6 +10941,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10307,6 +11042,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10321,6 +11059,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10335,6 +11077,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10363,6 +11109,8 @@ 
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10375,6 +11123,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10474,6 +11224,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; GFX7-LABEL: flat_singlethread_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10485,6 +11238,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10496,6 +11253,10 @@ define amdgpu_kernel void 
@flat_singlethread_one_as_unordered_store( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10518,6 +11279,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10528,6 +11291,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10608,6 +11373,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10619,6 +11387,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; ; GFX10-WGP-LABEL: 
flat_singlethread_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10630,6 +11402,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10652,6 +11428,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10662,6 +11440,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10742,6 +11522,9 @@ entry: define amdgpu_kernel void 
@flat_singlethread_one_as_release_store( ; GFX7-LABEL: flat_singlethread_one_as_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10753,6 +11536,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10764,6 +11551,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10786,6 +11577,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10796,6 +11589,8 @@ define amdgpu_kernel void 
@flat_singlethread_one_as_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10876,6 +11671,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10887,6 +11685,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10898,6 +11700,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10920,6 +11726,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( 
; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10930,6 +11738,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11010,6 +11820,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11021,6 +11834,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11032,6 +11849,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: 
flat_singlethread_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11054,6 +11875,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11064,6 +11887,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11144,6 +11969,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; GFX7-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11155,6 +11983,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw: 
; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11166,6 +11998,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11188,6 +12024,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11198,6 +12036,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11278,6 +12118,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; GFX7-LABEL: 
flat_singlethread_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11289,6 +12132,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11300,6 +12147,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11322,6 +12173,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11332,6 +12185,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; ; 
GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11412,6 +12267,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11423,6 +12281,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11434,6 +12296,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11456,6 +12322,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; ; 
GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11466,6 +12334,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11546,6 +12416,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11557,6 +12430,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11568,6 +12445,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: 
flat_singlethread_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11590,6 +12471,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11600,6 +12483,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11680,6 +12565,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11695,6 +12583,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: 
flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11710,6 +12602,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11740,6 +12636,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11753,6 +12651,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11859,6 +12759,9 @@ entry: define 
amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11874,6 +12777,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11889,6 +12796,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11919,6 +12830,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) @@ -11932,6 +12845,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12038,6 +12953,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12053,6 +12971,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12068,6 +12990,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; 
GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12098,6 +13024,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12111,6 +13039,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12217,6 +13147,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -12242,6 +13175,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], 
s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12267,6 +13204,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12317,6 +13258,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -12331,6 +13274,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -12440,6 +13385,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 
s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -12465,6 +13413,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12490,6 +13442,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12540,6 +13496,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -12554,6 +13512,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 
flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -12663,6 +13623,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -12688,6 +13651,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12713,6 +13680,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12763,6 +13734,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: 
flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -12777,6 +13750,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -12886,6 +13861,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -12911,6 +13889,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12936,6 +13918,10 @@ define amdgpu_kernel void 
@flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12986,6 +13972,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13000,6 +13988,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13109,6 +14099,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13134,6 
+14127,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13159,6 +14156,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13209,6 +14210,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13223,6 +14226,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 
s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13332,6 +14337,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13357,6 +14365,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13382,6 +14394,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13432,6 +14448,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; 
GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13446,6 +14464,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13555,6 +14575,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13580,6 +14603,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13605,6 +14632,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; 
GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13655,6 +14686,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13669,6 +14702,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13778,6 +14813,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13803,6 +14841,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: 
s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13828,6 +14870,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13878,6 +14924,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13892,6 +14940,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14001,6 +15051,9 @@ entry: define amdgpu_kernel void 
@flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14026,6 +15079,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14051,6 +15108,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14101,6 +15162,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dword s6, s[8:9], 0xc @@ -14115,6 +15178,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14224,6 +15289,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14249,6 +15317,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14274,6 +15346,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; 
GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14324,6 +15400,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14338,6 +15416,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14447,6 +15527,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14472,6 +15555,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; 
GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14497,6 +15584,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14547,6 +15638,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14561,6 +15654,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14670,6 +15765,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; 
GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14695,6 +15793,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14720,6 +15822,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14770,6 +15876,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14784,6 +15892,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: 
; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14893,6 +16003,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14918,6 +16031,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14943,6 +16060,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14993,6 +16114,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: 
flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15007,6 +16130,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15116,6 +16241,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15141,6 +16269,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15166,6 +16298,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; 
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15216,6 +16352,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15230,6 +16368,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15339,6 +16479,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15364,6 +16507,10 @@ define amdgpu_kernel void 
@flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15389,6 +16536,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15439,6 +16590,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15453,6 +16606,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword 
s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15562,6 +16717,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -15591,6 +16749,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -15620,6 +16782,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -15678,6 +16844,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: 
s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15695,6 +16863,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15831,6 +17001,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -15860,6 +17033,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -15889,6 +17066,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, 
s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -15947,6 +17128,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15964,6 +17147,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16100,6 +17285,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -16129,6 +17317,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; 
GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16158,6 +17350,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16216,6 +17412,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16233,6 +17431,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16369,6 +17569,9 @@ 
entry: define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -16398,6 +17601,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16427,6 +17634,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16485,6 +17696,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16502,6 +17715,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16638,6 +17853,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -16667,6 +17885,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16696,6 +17918,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16754,6 +17980,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16771,6 +17999,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16907,6 +18137,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -16936,6 +18169,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; 
GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16965,6 +18202,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -17023,6 +18264,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17040,6 +18283,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17176,6 +18421,9 @@ entry: define amdgpu_kernel void 
@flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17205,6 +18453,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -17234,6 +18486,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -17292,6 +18548,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17309,6 +18567,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17445,6 +18705,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17474,6 +18737,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -17503,6 +18770,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -17561,6 +18832,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17578,6 +18851,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17714,6 +18989,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17743,6 +19021,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -17772,6 +19054,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -17830,6 +19116,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17847,6 +19135,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17983,6 +19273,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: 
; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18012,6 +19305,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18041,6 +19338,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18099,6 +19400,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18116,6 +19419,8 @@ define amdgpu_kernel void 
@flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18252,6 +19557,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18281,6 +19589,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18310,6 +19622,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 
0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18368,6 +19684,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18385,6 +19703,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18521,6 +19841,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18550,6 +19873,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 
s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18579,6 +19906,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18637,6 +19968,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18654,6 +19987,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18790,6 +20125,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: 
s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18819,6 +20157,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18848,6 +20190,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18906,6 +20252,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18923,6 +20271,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; 
GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19059,6 +20409,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19088,6 +20441,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19117,6 +20474,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19175,6 +20536,8 @@ define amdgpu_kernel void 
@flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19192,6 +20555,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19328,6 +20693,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19357,6 +20725,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, 
s[6:7], 0x8 @@ -19386,6 +20758,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19444,6 +20820,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19461,6 +20839,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll index 1ec942ea5f47b..919fc3e8f4e4f 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll @@ -15,6 +15,9 @@ define amdgpu_kernel void @flat_system_unordered_load( ; GFX7-LABEL: 
flat_system_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -29,6 +32,10 @@ define amdgpu_kernel void @flat_system_unordered_load( ; ; GFX10-WGP-LABEL: flat_system_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -43,6 +50,10 @@ define amdgpu_kernel void @flat_system_unordered_load( ; ; GFX10-CU-LABEL: flat_system_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -71,6 +82,8 @@ define amdgpu_kernel void @flat_system_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -83,6 +96,8 @@ define amdgpu_kernel void @flat_system_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_system_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 
flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -182,6 +197,9 @@ entry: define amdgpu_kernel void @flat_system_monotonic_load( ; GFX7-LABEL: flat_system_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -196,6 +214,10 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; ; GFX10-WGP-LABEL: flat_system_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -210,6 +232,10 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; ; GFX10-CU-LABEL: flat_system_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -238,6 +264,8 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -250,6 +278,8 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -349,6 +379,9 @@ entry: define amdgpu_kernel void @flat_system_acquire_load( ; GFX7-LABEL: flat_system_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -364,6 +397,10 @@ define amdgpu_kernel void @flat_system_acquire_load( ; ; GFX10-WGP-LABEL: flat_system_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -380,6 +417,10 @@ define amdgpu_kernel void @flat_system_acquire_load( ; ; GFX10-CU-LABEL: flat_system_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt 
lgkmcnt(0) @@ -410,6 +451,8 @@ define amdgpu_kernel void @flat_system_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -424,6 +467,8 @@ define amdgpu_kernel void @flat_system_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -533,6 +578,9 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_load( ; GFX7-LABEL: flat_system_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -549,6 +597,10 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -567,6 +619,10 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; ; GFX10-CU-LABEL: flat_system_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: 
s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -600,6 +656,8 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -615,6 +673,8 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -743,6 +803,9 @@ entry: define amdgpu_kernel void @flat_system_unordered_store( ; GFX7-LABEL: flat_system_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -754,6 +817,10 @@ define amdgpu_kernel void @flat_system_unordered_store( ; ; GFX10-WGP-LABEL: flat_system_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -765,6 +832,10 @@ define amdgpu_kernel void @flat_system_unordered_store( ; ; GFX10-CU-LABEL: flat_system_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -787,6 +858,8 @@ define amdgpu_kernel void @flat_system_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -797,6 +870,8 @@ define amdgpu_kernel void @flat_system_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_system_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -877,6 +952,9 @@ entry: define amdgpu_kernel void @flat_system_monotonic_store( ; GFX7-LABEL: flat_system_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -888,6 +966,10 @@ 
define amdgpu_kernel void @flat_system_monotonic_store( ; ; GFX10-WGP-LABEL: flat_system_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -899,6 +981,10 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; ; GFX10-CU-LABEL: flat_system_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -921,6 +1007,8 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -931,6 +1019,8 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1011,6 +1101,9 @@ entry: define amdgpu_kernel void @flat_system_release_store( ; GFX7-LABEL: 
flat_system_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1023,6 +1116,10 @@ define amdgpu_kernel void @flat_system_release_store( ; ; GFX10-WGP-LABEL: flat_system_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1036,6 +1133,10 @@ define amdgpu_kernel void @flat_system_release_store( ; ; GFX10-CU-LABEL: flat_system_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1061,6 +1162,8 @@ define amdgpu_kernel void @flat_system_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1073,6 +1176,8 @@ define amdgpu_kernel void @flat_system_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; 
GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1173,6 +1278,9 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_store( ; GFX7-LABEL: flat_system_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1185,6 +1293,10 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1198,6 +1310,10 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; ; GFX10-CU-LABEL: flat_system_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1223,6 +1339,8 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1235,6 +1353,8 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1335,6 +1455,9 @@ entry: define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; GFX7-LABEL: flat_system_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1346,6 +1469,10 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1357,6 +1484,10 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1379,6 +1510,8 @@ define 
amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1389,6 +1522,8 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1469,6 +1604,9 @@ entry: define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; GFX7-LABEL: flat_system_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1482,6 +1620,10 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1497,6 +1639,10 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: 
s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1524,6 +1670,8 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1537,6 +1685,8 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1636,6 +1786,9 @@ entry: define amdgpu_kernel void @flat_system_release_atomicrmw( ; GFX7-LABEL: flat_system_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1648,6 +1801,10 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1661,6 +1818,10 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1686,6 +1847,8 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1698,6 +1861,8 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1798,6 +1963,9 @@ entry: define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; GFX7-LABEL: flat_system_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt 
lgkmcnt(0) @@ -1812,6 +1980,10 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1829,6 +2001,10 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1859,6 +2035,8 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1874,6 +2052,8 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1993,6 +2173,9 @@ entry: define amdgpu_kernel void 
@flat_system_seq_cst_atomicrmw( ; GFX7-LABEL: flat_system_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2007,6 +2190,10 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2024,6 +2211,10 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2054,6 +2245,8 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2069,6 +2262,8 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: 
; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2188,6 +2383,9 @@ entry: define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2204,6 +2402,10 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2221,6 +2423,10 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2253,6 +2459,8 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2268,6 +2476,8 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2384,6 +2594,9 @@ entry: define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_system_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2401,6 +2614,10 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2420,6 +2637,10 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2455,6 +2676,8 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2472,6 +2695,8 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2612,6 +2837,9 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2629,6 +2857,10 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt 
lgkmcnt(0) @@ -2648,6 +2880,10 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2683,6 +2919,8 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2700,6 +2938,8 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2840,6 +3080,9 @@ entry: define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -2865,6 +3108,10 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; ; 
GFX10-WGP-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2890,6 +3137,10 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2940,6 +3191,8 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -2954,6 +3207,8 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3063,6 +3318,9 @@ entry: define amdgpu_kernel void 
@flat_system_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3090,6 +3348,10 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3119,6 +3381,10 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3174,6 +3440,8 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3191,6 +3459,8 @@ define amdgpu_kernel void 
@flat_system_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3319,6 +3589,9 @@ entry: define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3345,6 +3618,10 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3372,6 +3649,10 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3425,6 +3706,8 @@ define amdgpu_kernel void 
@flat_system_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3441,6 +3724,8 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3570,6 +3855,9 @@ entry: define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3598,6 +3886,10 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3629,6 +3921,10 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: 
flat_system_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3687,6 +3983,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3706,6 +4004,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3854,6 +4154,9 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3882,6 +4185,10 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; 
GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3913,6 +4220,10 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3971,6 +4282,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3990,6 +4303,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4138,6 +4453,9 @@ entry: define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX7: ; 
%bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4165,6 +4483,10 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4194,6 +4516,10 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4249,6 +4575,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4266,6 +4594,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; 
%bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4394,6 +4724,9 @@ entry: define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4421,6 +4754,10 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4450,6 +4787,10 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4505,6 +4846,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: 
s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4522,6 +4865,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4650,6 +4995,9 @@ entry: define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; GFX7-LABEL: flat_system_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4678,6 +5026,10 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4709,6 +5061,10 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4767,6 +5123,8 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4786,6 +5144,8 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4934,6 +5294,9 @@ entry: define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4962,6 +5325,10 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), 
s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4993,6 +5360,10 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5051,6 +5422,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5070,6 +5443,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5218,6 +5593,9 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], 
s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5246,6 +5624,10 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5277,6 +5659,10 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5335,6 +5721,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5354,6 +5742,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, 
s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5502,6 +5892,9 @@ entry: define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5530,6 +5923,10 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5561,6 +5958,10 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5619,6 +6020,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5638,6 +6041,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5786,6 +6191,9 @@ entry: define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5814,6 +6222,10 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5845,6 +6257,10 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 
0x8 @@ -5903,6 +6319,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5922,6 +6340,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6070,6 +6490,9 @@ entry: define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6098,6 +6521,10 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6129,6 +6556,10 @@ define amdgpu_kernel void 
@flat_system_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6187,6 +6618,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6206,6 +6639,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6354,6 +6789,9 @@ entry: define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6382,6 +6820,10 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: 
; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6413,6 +6855,10 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6471,6 +6917,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6490,6 +6938,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6638,6 +7088,9 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: 
flat_system_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6666,6 +7119,10 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6697,6 +7154,10 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6755,6 +7216,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6774,6 +7237,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: 
flat_system_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6922,6 +7387,9 @@ entry: define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6951,6 +7419,10 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6980,6 +7452,10 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7038,6 +7514,8 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; ; 
GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7055,6 +7533,8 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7191,6 +7671,9 @@ entry: define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7221,6 +7704,10 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7252,6 +7739,10 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: 
flat_system_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7312,6 +7803,8 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7331,6 +7824,8 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7477,6 +7972,9 @@ entry: define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7507,6 +8005,10 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; 
GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7538,6 +8040,10 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7599,6 +8105,8 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7618,6 +8126,8 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7774,6 +8284,9 @@ entry: define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; 
GFX7-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7805,6 +8318,10 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7838,6 +8355,10 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7901,6 +8422,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7922,6 +8445,8 @@ define amdgpu_kernel void 
@flat_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8092,6 +8617,9 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8123,6 +8651,10 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8156,6 +8688,10 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8219,6 +8755,8 @@ define amdgpu_kernel void 
@flat_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8240,6 +8778,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8410,6 +8950,9 @@ entry: define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8440,6 +8983,10 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8471,6 +9018,10 @@ define amdgpu_kernel void 
@flat_system_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8531,6 +9082,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8550,6 +9103,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8700,6 +9255,9 @@ entry: define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8730,6 +9288,10 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; ; 
GFX10-WGP-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8761,6 +9323,10 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8821,6 +9387,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8840,6 +9408,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8986,6 +9556,9 @@ entry: define amdgpu_kernel void 
@flat_system_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9017,6 +9590,10 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9050,6 +9627,10 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9113,6 +9694,8 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9134,6 +9717,8 @@ define amdgpu_kernel void 
@flat_system_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9304,6 +9889,9 @@ entry: define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9335,6 +9923,10 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9368,6 +9960,10 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9431,6 +10027,8 @@ define amdgpu_kernel void 
@flat_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9452,6 +10050,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9622,6 +10222,9 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9653,6 +10256,10 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9686,6 +10293,10 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; ; 
GFX10-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9749,6 +10360,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9770,6 +10383,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9940,6 +10555,9 @@ entry: define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9971,6 +10589,10 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: 
flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -10004,6 +10626,10 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -10067,6 +10693,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10088,6 +10716,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10258,6 +10888,9 @@ entry: define amdgpu_kernel 
void @flat_system_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -10289,6 +10922,10 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -10322,6 +10959,10 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -10385,6 +11026,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10406,6 +11049,8 @@ define 
amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10572,6 +11217,9 @@ entry: define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -10603,6 +11251,10 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -10636,6 +11288,10 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -10699,6 +11355,8 @@ define 
amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10720,6 +11378,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10890,6 +11550,9 @@ entry: define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -10921,6 +11584,10 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -10954,6 +11621,10 @@ define amdgpu_kernel void 
@flat_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -11017,6 +11688,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11038,6 +11711,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11208,6 +11883,9 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -11239,6 +11917,10 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; ; 
GFX10-WGP-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -11272,6 +11954,10 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -11335,6 +12021,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11356,6 +12044,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11526,6 +12216,9 @@ entry: define amdgpu_kernel 
void @flat_system_one_as_unordered_load( ; GFX7-LABEL: flat_system_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11540,6 +12233,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; ; GFX10-WGP-LABEL: flat_system_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11554,6 +12251,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; ; GFX10-CU-LABEL: flat_system_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11582,6 +12283,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11594,6 +12297,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; 
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11693,6 +12398,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; GFX7-LABEL: flat_system_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11707,6 +12415,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11721,6 +12433,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11749,6 +12465,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: 
flat_system_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11761,6 +12479,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11860,6 +12580,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_load( ; GFX7-LABEL: flat_system_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11876,6 +12599,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11893,6 +12620,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; 
GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11925,6 +12656,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11940,6 +12673,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12054,6 +12789,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; GFX7-LABEL: flat_system_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12071,6 +12809,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; 
GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12090,6 +12832,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12125,6 +12871,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12141,6 +12889,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12274,6 +13024,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_unordered_store( ; GFX7-LABEL: flat_system_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 
0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12285,6 +13038,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; ; GFX10-WGP-LABEL: flat_system_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12296,6 +13053,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; ; GFX10-CU-LABEL: flat_system_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12318,6 +13079,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12328,6 +13091,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: 
s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12408,6 +13173,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; GFX7-LABEL: flat_system_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12419,6 +13187,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12430,6 +13202,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12452,6 +13228,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12462,6 +13240,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12542,6 +13322,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_release_store( ; GFX7-LABEL: flat_system_one_as_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12554,6 +13337,10 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; ; GFX10-WGP-LABEL: flat_system_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12567,6 +13354,10 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; ; GFX10-CU-LABEL: flat_system_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12592,6 +13383,8 @@ define 
amdgpu_kernel void @flat_system_one_as_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12604,6 +13397,8 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12704,6 +13499,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; GFX7-LABEL: flat_system_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12716,6 +13514,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12729,6 +13531,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_store: ; GFX10-CU: ; 
%bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12754,6 +13560,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12766,6 +13574,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12866,6 +13676,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; GFX7-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12877,6 +13690,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; 
GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12888,6 +13705,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12910,6 +13731,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12920,6 +13743,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13000,6 +13825,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; GFX7-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; 
GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13013,6 +13841,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13027,6 +13859,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13053,6 +13889,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13066,6 +13904,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 
flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13163,6 +14003,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; GFX7-LABEL: flat_system_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13175,6 +14018,10 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13188,6 +14035,10 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13213,6 +14064,8 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13225,6 +14078,8 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13325,6 +14180,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; GFX7-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13339,6 +14197,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13355,6 +14217,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 
s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13384,6 +14250,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13399,6 +14267,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13516,6 +14386,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; GFX7-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13530,6 +14403,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt 
lgkmcnt(0) @@ -13546,6 +14423,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13575,6 +14456,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13590,6 +14473,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13707,6 +14592,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13724,6 +14612,10 @@ define amdgpu_kernel void 
@flat_system_one_as_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13742,6 +14634,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13776,6 +14672,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13792,6 +14690,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13913,6 +14813,9 
@@ entry: define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13931,6 +14834,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13951,6 +14858,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13988,6 +14899,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14006,6 +14919,8 @@ 
define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14151,6 +15066,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14169,6 +15087,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14189,6 +15111,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14226,6 +15152,8 @@ define amdgpu_kernel 
void @flat_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14244,6 +15172,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14389,6 +15319,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14414,6 +15347,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14439,6 +15376,10 @@ define amdgpu_kernel void 
@flat_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14489,6 +15430,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14503,6 +15446,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14612,6 +15557,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14639,6 +15587,10 @@ define amdgpu_kernel void 
@flat_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14667,6 +15619,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14721,6 +15677,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14738,6 +15696,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; 
GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14864,6 +15824,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14890,6 +15853,10 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14917,6 +15884,10 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14970,6 +15941,8 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14986,6 +15959,8 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15115,6 +16090,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15143,6 +16121,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15173,6 +16155,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; 
GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15230,6 +16216,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15249,6 +16237,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15395,6 +16385,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15423,6 +16416,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; 
GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15453,6 +16450,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15510,6 +16511,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15529,6 +16532,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15675,6 +16680,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; 
GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15702,6 +16710,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15730,6 +16742,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15784,6 +16800,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15801,6 +16819,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; 
GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15927,6 +16947,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15954,6 +16977,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15982,6 +17009,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16036,6 +17067,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 
flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16053,6 +17086,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16179,6 +17214,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX7-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16207,6 +17245,10 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16237,6 +17279,10 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, 
s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16294,6 +17340,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16313,6 +17361,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16459,6 +17509,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16487,6 +17540,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: 
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16517,6 +17574,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16574,6 +17635,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16593,6 +17656,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16739,6 +17804,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, 
s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16767,6 +17835,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16797,6 +17869,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16854,6 +17930,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16873,6 +17951,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; 
%entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17019,6 +18099,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17047,6 +18130,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17077,6 +18164,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17134,6 +18225,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: 
flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17153,6 +18246,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17299,6 +18394,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17327,6 +18425,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17357,6 +18459,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: 
flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17414,6 +18520,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17433,6 +18541,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17579,6 +18689,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17607,6 +18720,10 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: 
flat_system_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17637,6 +18754,10 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17694,6 +18815,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17713,6 +18836,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17859,6 +18984,9 @@ entry: define 
amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17887,6 +19015,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17917,6 +19049,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17974,6 +19110,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ 
-17993,6 +19131,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18139,6 +19279,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -18167,6 +19310,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -18197,6 +19344,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, 
s[4:5], 0x8 @@ -18254,6 +19405,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18273,6 +19426,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18419,6 +19574,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18448,6 +19606,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: 
s_load_dword s9, s[6:7], 0x8 @@ -18477,6 +19639,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18535,6 +19701,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18552,6 +19720,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18688,6 +19858,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 
s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18719,6 +19892,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18751,6 +19928,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18813,6 +19994,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18833,6 +20016,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 
flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18984,6 +20169,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19014,6 +20202,10 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19045,6 +20237,10 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19106,6 +20302,8 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: 
s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19125,6 +20323,8 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19281,6 +20481,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19313,6 +20516,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19347,6 +20554,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: 
s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19412,6 +20623,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19434,6 +20647,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19609,6 +20824,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19641,6 +20859,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry 
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19675,6 +20897,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19740,6 +20966,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19762,6 +20990,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19937,6 +21167,9 @@ entry: define amdgpu_kernel void 
@flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19968,6 +21201,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -20000,6 +21237,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -20062,6 +21303,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dword s6, s[8:9], 0xc @@ -20082,6 +21325,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20237,6 +21482,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -20268,6 +21516,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -20300,6 +21552,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: 
s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -20362,6 +21618,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20382,6 +21640,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20533,6 +21793,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -20565,6 +21828,10 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; 
GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -20599,6 +21866,10 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -20664,6 +21935,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20686,6 +21959,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20861,6 +22136,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 
s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -20893,6 +22171,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -20927,6 +22209,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -20992,6 +22278,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21014,6 +22302,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; 
GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21189,6 +22479,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -21221,6 +22514,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -21255,6 +22552,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -21320,6 +22621,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21342,6 +22645,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21517,6 +22822,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -21549,6 +22857,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -21583,6 +22895,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; 
%entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -21648,6 +22964,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21670,6 +22988,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21845,6 +23165,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -21877,6 +23200,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: 
; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -21911,6 +23238,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -21976,6 +23307,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21998,6 +23331,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22169,6 +23504,9 @@ entry: define amdgpu_kernel void 
@flat_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -22201,6 +23539,10 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -22235,6 +23577,10 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -22300,6 +23646,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, 
s[8:9], 0xc @@ -22322,6 +23670,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22497,6 +23847,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -22529,6 +23882,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -22563,6 +23920,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], 
s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -22628,6 +23989,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22650,6 +24013,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22825,6 +24190,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -22857,6 +24225,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 
s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -22891,6 +24263,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -22956,6 +24332,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22978,6 +24356,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll index e1f82a70b4c0a..a88e0e217fdb4 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll @@ -11,6 +11,9 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; 
GFX7-LABEL: flat_nontemporal_load_0: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -26,6 +29,10 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX10-WGP-LABEL: flat_nontemporal_load_0: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -41,6 +48,10 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX10-CU-LABEL: flat_nontemporal_load_0: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -142,6 +153,9 @@ entry: define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX7-LABEL: flat_nontemporal_load_1: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -172,6 +186,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; GFX10-WGP-LABEL: flat_nontemporal_load_1: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: 
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -202,6 +220,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; GFX10-CU-LABEL: flat_nontemporal_load_1: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -405,6 +427,9 @@ entry: define amdgpu_kernel void @flat_nontemporal_store_0( ; GFX7-LABEL: flat_nontemporal_store_0: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -420,6 +445,10 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX10-WGP-LABEL: flat_nontemporal_store_0: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -435,6 +464,10 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX10-CU-LABEL: flat_nontemporal_store_0: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: 
s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -540,6 +573,9 @@ entry: define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX7-LABEL: flat_nontemporal_store_1: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -569,6 +605,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; GFX10-WGP-LABEL: flat_nontemporal_store_1: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -597,6 +637,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; GFX10-CU-LABEL: flat_nontemporal_store_1: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -799,6 +843,9 @@ entry: define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; GFX7-LABEL: flat_volatile_workgroup_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -814,6 +861,10 @@ define 
amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; ; GFX10-WGP-LABEL: flat_volatile_workgroup_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -829,6 +880,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; ; GFX10-CU-LABEL: flat_volatile_workgroup_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -926,6 +981,9 @@ entry: define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; GFX7-LABEL: flat_volatile_workgroup_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -938,6 +996,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; ; GFX10-WGP-LABEL: flat_volatile_workgroup_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -951,6 
+1013,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; ; GFX10-CU-LABEL: flat_volatile_workgroup_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll index 588f06f1be054..7c637a20ab47b 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll @@ -15,6 +15,9 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; GFX7-LABEL: flat_wavefront_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -29,6 +32,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; ; GFX10-WGP-LABEL: flat_wavefront_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -43,6 +50,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; ; GFX10-CU-LABEL: flat_wavefront_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; 
GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -71,6 +82,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -83,6 +96,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -182,6 +197,9 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_load( ; GFX7-LABEL: flat_wavefront_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -196,6 +214,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: 
s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -210,6 +232,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -238,6 +264,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -250,6 +278,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -349,6 +379,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_load( ; GFX7-LABEL: flat_wavefront_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -363,6 +396,10 @@ define 
amdgpu_kernel void @flat_wavefront_acquire_load( ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -377,6 +414,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -405,6 +446,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -417,6 +460,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -516,6 +561,9 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; 
GFX7-LABEL: flat_wavefront_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -530,6 +578,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -544,6 +596,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -572,6 +628,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -584,6 +642,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -683,6 +743,9 @@ entry: define amdgpu_kernel void @flat_wavefront_unordered_store( ; GFX7-LABEL: flat_wavefront_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -694,6 +757,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; ; GFX10-WGP-LABEL: flat_wavefront_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -705,6 +772,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; ; GFX10-CU-LABEL: flat_wavefront_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -727,6 +798,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 
flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -737,6 +810,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -817,6 +892,9 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_store( ; GFX7-LABEL: flat_wavefront_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -828,6 +906,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -839,6 +921,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: 
s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -861,6 +947,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -871,6 +959,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -951,6 +1041,9 @@ entry: define amdgpu_kernel void @flat_wavefront_release_store( ; GFX7-LABEL: flat_wavefront_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -962,6 +1055,10 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; ; GFX10-WGP-LABEL: flat_wavefront_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -973,6 +1070,10 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; ; 
GFX10-CU-LABEL: flat_wavefront_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -995,6 +1096,8 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1005,6 +1108,8 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1085,6 +1190,9 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; GFX7-LABEL: flat_wavefront_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1096,6 +1204,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; 
GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1107,6 +1219,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1129,6 +1245,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1139,6 +1257,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1219,6 +1339,9 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; GFX7-LABEL: flat_wavefront_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: 
s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1230,6 +1353,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1241,6 +1368,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1263,6 +1394,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1273,6 +1406,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; 
GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1353,6 +1488,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; GFX7-LABEL: flat_wavefront_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1364,6 +1502,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1375,6 +1517,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1397,6 +1543,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) @@ -1407,6 +1555,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1487,6 +1637,9 @@ entry: define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; GFX7-LABEL: flat_wavefront_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1498,6 +1651,10 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1509,6 +1666,10 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1531,6 +1692,8 @@ define amdgpu_kernel void 
@flat_wavefront_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1541,6 +1704,8 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1621,6 +1786,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; GFX7-LABEL: flat_wavefront_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1632,6 +1800,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1643,6 +1815,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; 
GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1665,6 +1841,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1675,6 +1853,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1755,6 +1935,9 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; GFX7-LABEL: flat_wavefront_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1766,6 +1949,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), 
s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1777,6 +1964,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1799,6 +1990,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1809,6 +2002,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1889,6 +2084,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_wavefront_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; 
GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1904,6 +2102,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1919,6 +2121,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1949,6 +2155,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1962,6 +2170,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: 
s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2068,6 +2278,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2083,6 +2296,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2098,6 +2315,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2128,6 +2349,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2141,6 +2364,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2247,6 +2472,9 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2262,6 +2490,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2277,6 +2509,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2307,6 +2543,8 
@@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2320,6 +2558,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2426,6 +2666,9 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -2451,6 +2694,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2476,6 +2723,10 @@ define amdgpu_kernel void 
@flat_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2526,6 +2777,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -2540,6 +2793,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -2649,6 +2904,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -2674,6 +2932,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( 
; ; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2699,6 +2961,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2749,6 +3015,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -2763,6 +3031,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -2872,6 +3142,9 @@ entry: define 
amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -2897,6 +3170,10 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2922,6 +3199,10 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2972,6 +3253,8 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -2986,6 +3269,8 @@ 
define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3095,6 +3380,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3120,6 +3408,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3145,6 +3437,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3195,6 +3491,8 @@ define 
amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3209,6 +3507,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3318,6 +3618,9 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3343,6 +3646,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3368,6 +3675,10 @@ define amdgpu_kernel void 
@flat_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3418,6 +3729,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3432,6 +3745,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3541,6 +3856,9 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3566,6 +3884,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; ; 
GFX10-WGP-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3591,6 +3913,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3641,6 +3967,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3655,6 +3983,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3764,6 +4094,9 @@ entry: define amdgpu_kernel 
void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3789,6 +4122,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3814,6 +4151,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3864,6 +4205,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3878,6 +4221,8 @@ define amdgpu_kernel void 
@flat_wavefront_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3987,6 +4332,9 @@ entry: define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4012,6 +4360,10 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4037,6 +4389,10 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4087,6 +4443,8 @@ define amdgpu_kernel void 
@flat_wavefront_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4101,6 +4459,8 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4210,6 +4570,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4235,6 +4598,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4260,6 +4627,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: 
flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4310,6 +4681,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4324,6 +4697,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4433,6 +4808,9 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4458,6 +4836,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry 
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4483,6 +4865,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4533,6 +4919,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4547,6 +4935,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4656,6 +5046,9 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: 
; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4681,6 +5074,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4706,6 +5103,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4756,6 +5157,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4770,6 +5173,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: 
flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4879,6 +5284,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4904,6 +5312,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4929,6 +5341,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4979,6 +5395,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: 
flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4993,6 +5411,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5102,6 +5522,9 @@ entry: define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5127,6 +5550,10 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5152,6 +5579,10 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry 
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5202,6 +5633,8 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5216,6 +5649,8 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5325,6 +5760,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5350,6 +5788,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: 
s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5375,6 +5817,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5425,6 +5871,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5439,6 +5887,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5548,6 +5998,9 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 
flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5573,6 +6026,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5598,6 +6055,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5648,6 +6109,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5662,6 +6125,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5771,6 +6236,9 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -5800,6 +6268,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -5829,6 +6301,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -5887,6 +6363,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: 
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5904,6 +6382,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6040,6 +6520,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6069,6 +6552,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6098,6 +6585,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; 
%bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6156,6 +6647,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6173,6 +6666,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6309,6 +6804,9 @@ entry: define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6338,6 +6836,10 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; 
GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6367,6 +6869,10 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6425,6 +6931,8 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6442,6 +6950,8 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6578,6 +7088,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: 
flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6607,6 +7120,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6636,6 +7153,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6694,6 +7215,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6711,6 +7234,8 @@ define amdgpu_kernel void 
@flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6847,6 +7372,9 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6876,6 +7404,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6905,6 +7437,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6963,6 +7499,8 @@ define 
amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6980,6 +7518,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7116,6 +7656,9 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7145,6 +7688,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7174,6 +7721,10 @@ define amdgpu_kernel 
void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7232,6 +7783,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7249,6 +7802,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7385,6 +7940,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7414,6 +7972,10 @@ define amdgpu_kernel void 
@flat_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7443,6 +8005,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7501,6 +8067,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7518,6 +8086,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, 
s[8:9], 0xc @@ -7654,6 +8224,9 @@ entry: define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7683,6 +8256,10 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7712,6 +8289,10 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7770,6 +8351,8 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7787,6 +8370,8 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7923,6 +8508,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7952,6 +8540,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7981,6 +8573,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], 
s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8039,6 +8635,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8056,6 +8654,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8192,6 +8792,9 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8221,6 +8824,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: 
s_load_dword s9, s[6:7], 0x8 @@ -8250,6 +8857,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8308,6 +8919,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8325,6 +8938,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8461,6 +9076,9 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ 
-8490,6 +9108,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8519,6 +9141,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8577,6 +9203,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8594,6 +9222,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: 
s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8730,6 +9360,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8759,6 +9392,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8788,6 +9425,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8846,6 +9487,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8863,6 +9506,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8999,6 +9644,9 @@ entry: define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9028,6 +9676,10 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9057,6 +9709,10 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 
s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9115,6 +9771,8 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9132,6 +9790,8 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9268,6 +9928,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9297,6 +9960,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: 
s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9326,6 +9993,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9384,6 +10055,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9401,6 +10074,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9537,6 +10212,9 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 
0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9566,6 +10244,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9595,6 +10277,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9653,6 +10339,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9670,6 +10358,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], 
s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9806,6 +10496,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; GFX7-LABEL: flat_wavefront_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9820,6 +10513,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9834,6 +10531,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9862,6 +10563,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9874,6 +10577,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9973,6 +10678,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9987,6 +10695,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10001,6 +10713,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: 
s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10029,6 +10745,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10041,6 +10759,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10140,6 +10860,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; GFX7-LABEL: flat_wavefront_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10154,6 +10877,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ 
-10168,6 +10895,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10196,6 +10927,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10208,6 +10941,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10307,6 +11042,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10321,6 +11059,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; ; 
GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10335,6 +11077,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10363,6 +11109,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10375,6 +11123,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10474,6 +11224,9 @@ entry: define amdgpu_kernel void 
@flat_wavefront_one_as_unordered_store( ; GFX7-LABEL: flat_wavefront_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10485,6 +11238,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10496,6 +11253,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10518,6 +11279,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10528,6 +11291,8 @@ define amdgpu_kernel void 
@flat_wavefront_one_as_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10608,6 +11373,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10619,6 +11387,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10630,6 +11402,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10652,6 +11428,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; ; 
GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10662,6 +11440,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10742,6 +11522,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; GFX7-LABEL: flat_wavefront_one_as_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10753,6 +11536,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10764,6 +11551,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; 
GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10786,6 +11577,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10796,6 +11589,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10876,6 +11671,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10887,6 +11685,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: 
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10898,6 +11700,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10920,6 +11726,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10930,6 +11738,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11010,6 +11820,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 
flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11021,6 +11834,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11032,6 +11849,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11054,6 +11875,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11064,6 +11887,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: 
s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11144,6 +11969,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; GFX7-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11155,6 +11983,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11166,6 +11998,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11188,6 +12024,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 
flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11198,6 +12036,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11278,6 +12118,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; GFX7-LABEL: flat_wavefront_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11289,6 +12132,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11300,6 +12147,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11322,6 +12173,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11332,6 +12185,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11412,6 +12267,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11423,6 +12281,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], 
s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11434,6 +12296,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11456,6 +12322,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11466,6 +12334,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11546,6 +12416,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: 
s_waitcnt lgkmcnt(0) @@ -11557,6 +12430,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11568,6 +12445,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11590,6 +12471,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11600,6 +12483,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; 
GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11680,6 +12565,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11695,6 +12583,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11710,6 +12602,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11740,6 +12636,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword 
s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11753,6 +12651,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11859,6 +12759,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11874,6 +12777,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11889,6 +12796,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: 
s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11919,6 +12830,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11932,6 +12845,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12038,6 +12953,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12053,6 +12971,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; 
GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12068,6 +12990,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12098,6 +13024,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12111,6 +13039,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12217,6 +13147,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, 
s[4:5], 0x2 @@ -12242,6 +13175,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12267,6 +13204,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12317,6 +13258,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -12331,6 +13274,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -12440,6 +13385,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -12465,6 +13413,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12490,6 +13442,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12540,6 +13496,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; 
GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -12554,6 +13512,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -12663,6 +13623,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -12688,6 +13651,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12713,6 +13680,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: 
s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12763,6 +13734,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -12777,6 +13750,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -12886,6 +13861,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -12911,6 +13889,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; 
GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12936,6 +13918,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12986,6 +13972,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13000,6 +13988,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13109,6 +14099,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: 
flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13134,6 +14127,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13159,6 +14156,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13209,6 +14210,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13223,6 +14226,8 @@ define amdgpu_kernel 
void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13332,6 +14337,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13357,6 +14365,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13382,6 +14394,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, 
s[4:5], 0x8 @@ -13432,6 +14448,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13446,6 +14464,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13555,6 +14575,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13580,6 +14603,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: 
s_load_dword s7, s[4:5], 0x8 @@ -13605,6 +14632,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13655,6 +14686,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13669,6 +14702,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13778,6 +14813,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: 
s_load_dword s7, s[4:5], 0x2 @@ -13803,6 +14841,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13828,6 +14870,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13878,6 +14924,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13892,6 +14940,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 
s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14001,6 +15051,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14026,6 +15079,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14051,6 +15108,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14101,6 +15162,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 
flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14115,6 +15178,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14224,6 +15289,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14249,6 +15317,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14274,6 +15346,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14324,6 +15400,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14338,6 +15416,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14447,6 +15527,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14472,6 +15555,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: 
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14497,6 +15584,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14547,6 +15638,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14561,6 +15654,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14670,6 +15765,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; 
GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14695,6 +15793,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14720,6 +15822,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14770,6 +15876,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14784,6 +15892,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: 
flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14893,6 +16003,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14918,6 +16031,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14943,6 +16060,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14993,6 +16114,8 @@ define amdgpu_kernel void 
@flat_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15007,6 +16130,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15116,6 +16241,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15141,6 +16269,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15166,6 +16298,10 @@ define amdgpu_kernel 
void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15216,6 +16352,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15230,6 +16368,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15339,6 +16479,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15364,6 +16507,10 @@ define amdgpu_kernel void 
@flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15389,6 +16536,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15439,6 +16590,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15453,6 +16606,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; 
GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15562,6 +16717,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -15591,6 +16749,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -15620,6 +16782,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -15678,6 +16844,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15695,6 +16863,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15831,6 +17001,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -15860,6 +17033,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -15889,6 +17066,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; 
GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -15947,6 +17128,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15964,6 +17147,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16100,6 +17285,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -16129,6 +17317,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16158,6 +17350,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16216,6 +17412,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16233,6 +17431,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16369,6 +17569,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; 
GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -16398,6 +17601,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16427,6 +17634,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16485,6 +17696,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ 
-16502,6 +17715,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16638,6 +17853,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -16667,6 +17885,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16696,6 +17918,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: 
s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16754,6 +17980,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16771,6 +17999,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16907,6 +18137,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -16936,6 +18169,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: 
s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16965,6 +18202,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -17023,6 +18264,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17040,6 +18283,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17176,6 +18421,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 
flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17205,6 +18453,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -17234,6 +18486,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -17292,6 +18548,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17309,6 +18567,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17445,6 +18705,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17474,6 +18737,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -17503,6 +18770,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -17561,6 +18832,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: 
flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17578,6 +18851,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17714,6 +18989,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17743,6 +19021,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -17772,6 +19054,10 @@ define amdgpu_kernel void 
@flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -17830,6 +19116,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17847,6 +19135,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17983,6 +19273,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18012,6 +19305,10 
@@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18041,6 +19338,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18099,6 +19400,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18116,6 +19419,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 
s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18252,6 +19557,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18281,6 +19589,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18310,6 +19622,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18368,6 +19684,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; 
GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18385,6 +19703,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18521,6 +19841,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18550,6 +19873,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18579,6 +19906,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; 
GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18637,6 +19968,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18654,6 +19987,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18790,6 +20125,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18819,6 +20157,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; 
GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18848,6 +20190,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18906,6 +20252,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18923,6 +20271,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19059,6 +20409,9 @@ entry: define amdgpu_kernel void 
@flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19088,6 +20441,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19117,6 +20474,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19175,6 +20536,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19192,6 +20555,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll index ee7d79a8a8cbb..0fd4aa4a7a93f 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll @@ -15,6 +15,9 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; GFX7-LABEL: flat_workgroup_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -29,6 +32,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; ; GFX10-WGP-LABEL: flat_workgroup_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -43,6 +50,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; ; GFX10-CU-LABEL: flat_workgroup_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; 
GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -71,6 +82,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -83,6 +96,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -182,6 +197,9 @@ entry: define amdgpu_kernel void @flat_workgroup_monotonic_load( ; GFX7-LABEL: flat_workgroup_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -196,6 +214,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -210,6 +232,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -238,6 +264,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -250,6 +278,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -349,6 +379,9 @@ entry: define amdgpu_kernel void @flat_workgroup_acquire_load( ; GFX7-LABEL: flat_workgroup_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: 
s_waitcnt lgkmcnt(0) @@ -364,6 +397,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -379,6 +416,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; ; GFX10-CU-LABEL: flat_workgroup_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -409,6 +450,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -422,6 +465,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -528,6 +573,9 @@ entry: define 
amdgpu_kernel void @flat_workgroup_seq_cst_load( ; GFX7-LABEL: flat_workgroup_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -544,6 +592,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -561,6 +613,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -593,6 +649,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -607,6 +665,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_load: ; 
GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -726,6 +786,9 @@ entry: define amdgpu_kernel void @flat_workgroup_unordered_store( ; GFX7-LABEL: flat_workgroup_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -737,6 +800,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; ; GFX10-WGP-LABEL: flat_workgroup_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -748,6 +815,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; ; GFX10-CU-LABEL: flat_workgroup_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -770,6 +841,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; 
GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -780,6 +853,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -860,6 +935,9 @@ entry: define amdgpu_kernel void @flat_workgroup_monotonic_store( ; GFX7-LABEL: flat_workgroup_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -871,6 +949,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -882,6 +964,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, 
s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -904,6 +990,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -914,6 +1002,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -994,6 +1084,9 @@ entry: define amdgpu_kernel void @flat_workgroup_release_store( ; GFX7-LABEL: flat_workgroup_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1006,6 +1099,10 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; ; GFX10-WGP-LABEL: flat_workgroup_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1019,6 +1116,10 @@ define amdgpu_kernel void 
@flat_workgroup_release_store( ; ; GFX10-CU-LABEL: flat_workgroup_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1043,6 +1144,8 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1054,6 +1157,8 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1145,6 +1250,9 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; GFX7-LABEL: flat_workgroup_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1157,6 +1265,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: 
s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1170,6 +1282,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1194,6 +1310,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1205,6 +1323,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1296,6 +1416,9 @@ entry: define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; GFX7-LABEL: flat_workgroup_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1307,6 +1430,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1318,6 +1445,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1340,6 +1471,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1350,6 +1483,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1430,6 +1565,9 @@ entry: define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; GFX7-LABEL: flat_workgroup_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1442,6 +1580,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1456,6 +1598,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1480,6 +1626,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) @@ -1491,6 +1639,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1583,6 +1733,9 @@ entry: define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; GFX7-LABEL: flat_workgroup_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1595,6 +1748,10 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1608,6 +1765,10 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1632,6 +1793,8 @@ define amdgpu_kernel void 
@flat_workgroup_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1643,6 +1806,8 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1734,6 +1899,9 @@ entry: define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; GFX7-LABEL: flat_workgroup_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1747,6 +1915,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1763,6 +1935,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; 
GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1789,6 +1965,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1801,6 +1979,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1904,6 +2084,9 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; GFX7-LABEL: flat_workgroup_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1917,6 +2100,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), 
s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1933,6 +2120,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1959,6 +2150,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1971,6 +2164,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2074,6 +2269,9 @@ entry: define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_workgroup_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; 
GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2090,6 +2288,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2106,6 +2308,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2138,6 +2344,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2152,6 +2360,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: 
s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2265,6 +2475,9 @@ entry: define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2282,6 +2495,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2300,6 +2517,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2334,6 +2555,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2349,6 +2572,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2475,6 +2700,9 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2492,6 +2720,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2510,6 +2742,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2544,6 +2780,8 
@@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2559,6 +2797,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2685,6 +2925,9 @@ entry: define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -2710,6 +2953,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2735,6 +2982,10 @@ define amdgpu_kernel void 
@flat_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2785,6 +3036,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -2799,6 +3052,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -2908,6 +3163,9 @@ entry: define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -2934,6 +3192,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( 
; ; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2962,6 +3224,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3014,6 +3280,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3029,6 +3297,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3150,6 +3420,9 @@ entry: define 
amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3176,6 +3449,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3203,6 +3480,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3255,6 +3536,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3270,6 +3553,8 @@ 
define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3390,6 +3675,9 @@ entry: define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3417,6 +3705,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3447,6 +3739,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3501,6 +3797,8 @@ define 
amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3517,6 +3815,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3649,6 +3949,9 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3676,6 +3979,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3706,6 +4013,10 @@ define amdgpu_kernel void 
@flat_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3760,6 +4071,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3776,6 +4089,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3908,6 +4223,9 @@ entry: define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3934,6 +4252,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; ; 
GFX10-WGP-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3962,6 +4284,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4014,6 +4340,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4029,6 +4357,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4150,6 +4480,9 @@ entry: define amdgpu_kernel 
void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4176,6 +4509,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4204,6 +4541,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4256,6 +4597,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4271,6 +4614,8 @@ define amdgpu_kernel void 
@flat_workgroup_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4392,6 +4737,9 @@ entry: define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4419,6 +4767,10 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4449,6 +4801,10 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4503,6 +4859,8 @@ define amdgpu_kernel void 
@flat_workgroup_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4519,6 +4877,8 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4651,6 +5011,9 @@ entry: define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4678,6 +5041,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4708,6 +5075,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: 
flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4762,6 +5133,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4778,6 +5151,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4910,6 +5285,9 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4937,6 +5315,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry 
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4967,6 +5349,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5021,6 +5407,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5037,6 +5425,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5169,6 +5559,9 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; 
GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5196,6 +5589,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5226,6 +5623,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5280,6 +5681,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5296,6 +5699,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; 
GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5428,6 +5833,9 @@ entry: define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -5457,6 +5865,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -5486,6 +5898,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -5544,6 +5960,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: 
flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5561,6 +5979,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5697,6 +6117,9 @@ entry: define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -5727,6 +6150,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -5757,6 +6184,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: 
flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -5817,6 +6248,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5835,6 +6268,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5978,6 +6413,9 @@ entry: define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6008,6 +6446,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: 
flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6039,6 +6481,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6099,6 +6545,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6117,6 +6565,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6264,6 +6714,9 @@ entry: define 
amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6295,6 +6748,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6327,6 +6784,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6389,6 +6850,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 
0xc @@ -6408,6 +6871,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6564,6 +7029,9 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6595,6 +7063,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6627,6 +7099,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: 
s_load_dword s9, s[6:7], 0x8 @@ -6689,6 +7165,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6708,6 +7186,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6864,6 +7344,9 @@ entry: define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6894,6 +7377,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword 
s9, s[6:7], 0x8 @@ -6924,6 +7411,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6984,6 +7475,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7002,6 +7495,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7147,6 +7642,9 @@ entry: define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7177,6 
+7675,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7207,6 +7709,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7267,6 +7773,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7285,6 +7793,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; 
GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7428,6 +7938,9 @@ entry: define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7459,6 +7972,10 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7491,6 +8008,10 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7553,6 +8074,8 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7572,6 +8095,8 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7728,6 +8253,9 @@ entry: define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7759,6 +8287,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7791,6 +8323,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; 
GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7853,6 +8389,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7872,6 +8410,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8028,6 +8568,9 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8059,6 +8602,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 
s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8091,6 +8638,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8153,6 +8704,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8172,6 +8725,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8328,6 +8883,9 @@ entry: define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; 
GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8359,6 +8917,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8391,6 +8953,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8453,6 +9019,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8472,6 +9040,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], 
s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8628,6 +9198,9 @@ entry: define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8659,6 +9232,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8691,6 +9268,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8753,6 +9334,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8772,6 +9355,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8926,6 +9511,9 @@ entry: define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8957,6 +9545,10 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8989,6 +9581,10 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9051,6 +9647,8 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9070,6 +9668,8 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9226,6 +9826,9 @@ entry: define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9257,6 +9860,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; 
GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9289,6 +9896,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9351,6 +9962,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9370,6 +9983,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9526,6 +10141,9 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 
s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9557,6 +10175,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9589,6 +10211,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9651,6 +10277,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9670,6 +10298,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, 
s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9826,6 +10456,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; GFX7-LABEL: flat_workgroup_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9840,6 +10473,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9854,6 +10491,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9882,6 +10523,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9894,6 +10537,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9993,6 +10638,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10007,6 +10655,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10021,6 +10673,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; 
GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10049,6 +10705,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10061,6 +10719,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10160,6 +10820,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; GFX7-LABEL: flat_workgroup_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10174,6 +10837,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: 
s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10190,6 +10857,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10218,6 +10889,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10230,6 +10903,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10335,6 +11010,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10349,6 +11027,10 
@@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10367,6 +11049,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10395,6 +11081,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10407,6 +11095,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10522,6 
+11212,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; GFX7-LABEL: flat_workgroup_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10533,6 +11226,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10544,6 +11241,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10566,6 +11267,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10576,6 +11279,8 @@ define 
amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10656,6 +11361,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10667,6 +11375,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10678,6 +11390,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10700,6 +11416,8 @@ define amdgpu_kernel void 
@flat_workgroup_one_as_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10710,6 +11428,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10790,6 +11510,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX7-LABEL: flat_workgroup_one_as_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10801,6 +11524,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10814,6 +11541,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; ; GFX10-CU-LABEL: 
flat_workgroup_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10836,6 +11567,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10846,6 +11579,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10934,6 +11669,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10945,6 +11683,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10958,6 +11700,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10980,6 +11726,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10990,6 +11738,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11078,6 +11828,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; 
GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11089,6 +11842,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11100,6 +11857,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11122,6 +11883,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11132,6 +11895,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: 
s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11212,6 +11977,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; GFX7-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11223,6 +11991,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11236,6 +12008,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11258,6 +12034,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 
flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11268,6 +12046,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11356,6 +12136,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX7-LABEL: flat_workgroup_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11367,6 +12150,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11380,6 +12167,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11402,6 +12193,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11412,6 +12205,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11500,6 +12295,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11511,6 +12309,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11526,6 +12328,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11548,6 +12354,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11558,6 +12366,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11654,6 +12464,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 
s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11665,6 +12478,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11680,6 +12497,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11702,6 +12523,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11712,6 +12535,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 
s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11808,6 +12633,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11823,6 +12651,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11840,6 +12672,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11870,6 +12706,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11883,6 +12721,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11995,6 +12835,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12010,6 +12853,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12029,6 +12876,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12059,6 +12910,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12072,6 +12925,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12194,6 +13049,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12209,6 +13067,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], 
s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12228,6 +13090,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12258,6 +13124,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12271,6 +13139,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12393,6 +13263,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: 
s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -12418,6 +13291,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12443,6 +13320,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12493,6 +13374,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -12507,6 +13390,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; 
GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -12616,6 +13501,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -12641,6 +13529,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12668,6 +13560,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12718,6 +13614,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -12732,6 +13630,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -12849,6 +13749,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -12874,6 +13777,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12901,6 +13808,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; 
GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12951,6 +13862,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -12965,6 +13878,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13082,6 +13997,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13107,6 +14025,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; 
%entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13136,6 +14058,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13186,6 +14112,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13200,6 +14128,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13325,6 +14255,9 @@ entry: define amdgpu_kernel void 
@flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13350,6 +14283,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13379,6 +14316,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13429,6 +14370,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword 
s6, s[8:9], 0xc @@ -13443,6 +14386,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13568,6 +14513,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13593,6 +14541,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13620,6 +14572,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: 
s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13670,6 +14626,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13684,6 +14642,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13801,6 +14761,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13826,6 +14789,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; 
GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13853,6 +14820,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13903,6 +14874,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13917,6 +14890,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14034,6 +15009,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14059,6 +15037,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14088,6 +15070,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14138,6 +15124,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14152,6 +15140,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 
flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14277,6 +15267,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14302,6 +15295,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14331,6 +15328,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14381,6 +15382,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, 
s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14395,6 +15398,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14520,6 +15525,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14545,6 +15553,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14574,6 +15586,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: 
s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14624,6 +15640,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14638,6 +15656,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14763,6 +15783,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14788,6 +15811,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; 
GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14817,6 +15844,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14867,6 +15898,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14881,6 +15914,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15006,6 +16041,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: 
flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15031,6 +16069,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15060,6 +16102,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15110,6 +16156,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15124,6 +16172,8 @@ define amdgpu_kernel void 
@flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15249,6 +16299,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15274,6 +16327,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15303,6 +16360,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15353,6 
+16414,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15367,6 +16430,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15492,6 +16557,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15517,6 +16585,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ 
-15546,6 +16618,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15596,6 +16672,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15610,6 +16688,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15735,6 +16815,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ 
-15760,6 +16843,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15789,6 +16876,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15839,6 +16930,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15853,6 +16946,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; 
GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15978,6 +17073,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -16007,6 +17105,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16036,6 +17138,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16094,6 +17200,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; 
GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16111,6 +17219,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16247,6 +17357,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -16276,6 +17389,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16307,6 +17424,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 
s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16365,6 +17486,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16382,6 +17505,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16524,6 +17649,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -16553,6 +17681,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; 
%bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16584,6 +17716,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16642,6 +17778,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16659,6 +17797,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16803,6 +17943,9 @@ entry: define amdgpu_kernel 
void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -16832,6 +17975,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16865,6 +18012,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16923,6 +18074,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16940,6 +18093,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17092,6 +18247,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17121,6 +18279,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -17154,6 +18316,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; 
GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -17212,6 +18378,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17229,6 +18397,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17381,6 +18551,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17410,6 +18583,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: 
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -17441,6 +18618,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -17499,6 +18680,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17516,6 +18699,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17660,6 +18845,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 
flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17689,6 +18877,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -17720,6 +18912,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -17778,6 +18974,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17795,6 +18993,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: 
flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17937,6 +19137,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17966,6 +19169,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -17999,6 +19206,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18057,6 +19268,8 @@ define amdgpu_kernel void 
@flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18074,6 +19287,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18226,6 +19441,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18255,6 +19473,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18288,6 
+19510,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18346,6 +19572,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18363,6 +19591,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18515,6 +19745,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, 
s[6:7], 0x2 @@ -18544,6 +19777,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18577,6 +19814,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18635,6 +19876,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18652,6 +19895,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18804,6 +20049,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18833,6 +20081,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18866,6 +20118,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18924,6 +20180,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 
flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18941,6 +20199,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19093,6 +20353,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19122,6 +20385,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19155,6 +20422,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; 
GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19213,6 +20484,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19230,6 +20503,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19380,6 +20655,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19409,6 +20687,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; 
GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19442,6 +20724,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19500,6 +20786,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19517,6 +20805,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19669,6 +20959,9 @@ entry: define amdgpu_kernel 
void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19698,6 +20991,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19731,6 +21028,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19789,6 +21090,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19806,6 +21109,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19958,6 +21263,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19987,6 +21295,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -20020,6 +21332,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: 
s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -20078,6 +21394,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20095,6 +21413,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll index b9487f8e14c2b..8b600c835a160 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll @@ -41,6 +41,9 @@ define amdgpu_kernel void @global_agent_unordered_load( ; ; GFX7-LABEL: global_agent_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -222,6 +225,9 @@ define amdgpu_kernel void @global_agent_monotonic_load( ; ; GFX7-LABEL: global_agent_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; 
GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -404,6 +410,9 @@ define amdgpu_kernel void @global_agent_acquire_load( ; ; GFX7-LABEL: global_agent_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -602,6 +611,9 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; ; GFX7-LABEL: global_agent_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -809,6 +821,9 @@ define amdgpu_kernel void @global_agent_unordered_store( ; ; GFX7-LABEL: global_agent_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -962,6 +977,9 @@ define amdgpu_kernel void @global_agent_monotonic_store( ; ; GFX7-LABEL: global_agent_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1116,6 +1134,9 @@ define amdgpu_kernel void @global_agent_release_store( ; ; GFX7-LABEL: global_agent_release_store: 
; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1294,6 +1315,9 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; ; GFX7-LABEL: global_agent_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1470,6 +1494,9 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; ; GFX7-LABEL: global_agent_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1623,6 +1650,9 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; ; GFX7-LABEL: global_agent_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1802,6 +1832,9 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; ; GFX7-LABEL: global_agent_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1980,6 +2013,9 @@ define amdgpu_kernel void 
@global_agent_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_agent_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2185,6 +2221,9 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_agent_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2390,6 +2429,9 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_agent_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2587,6 +2629,9 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_agent_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2812,6 +2857,9 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_agent_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; 
GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -3038,6 +3086,9 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3256,6 +3307,9 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3500,6 +3554,9 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3743,6 +3800,9 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4013,6 +4073,9 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 
s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4282,6 +4345,9 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4527,6 +4593,9 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4773,6 +4842,9 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5043,6 +5115,9 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5313,6 +5388,9 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; ; 
GFX7-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5583,6 +5661,9 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5853,6 +5934,9 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6123,6 +6207,9 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6393,6 +6480,9 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], 
s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6663,6 +6753,9 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6933,6 +7026,9 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7182,6 +7278,9 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7447,6 +7546,9 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7721,6 +7823,9 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: 
s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8015,6 +8120,9 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8308,6 +8416,9 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8577,6 +8688,9 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8843,6 +8957,9 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ 
-9137,6 +9254,9 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9431,6 +9551,9 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9725,6 +9848,9 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -10019,6 +10145,9 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -10309,6 +10438,9 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, 
s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -10603,6 +10735,9 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -10897,6 +11032,9 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -11189,6 +11327,9 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load( ; ; GFX7-LABEL: global_agent_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11370,6 +11511,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load( ; ; GFX7-LABEL: global_agent_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11552,6 +11696,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load( ; ; 
GFX7-LABEL: global_agent_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11750,6 +11897,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11957,6 +12107,9 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store( ; ; GFX7-LABEL: global_agent_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12110,6 +12263,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store( ; ; GFX7-LABEL: global_agent_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12264,6 +12420,9 @@ define amdgpu_kernel void @global_agent_one_as_release_store( ; ; GFX7-LABEL: global_agent_one_as_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; 
GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12442,6 +12601,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12618,6 +12780,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; ; GFX7-LABEL: global_agent_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12771,6 +12936,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; ; GFX7-LABEL: global_agent_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12950,6 +13118,9 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; ; GFX7-LABEL: global_agent_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13128,6 +13299,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_agent_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; 
GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13333,6 +13507,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13538,6 +13715,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_agent_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13735,6 +13915,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13960,6 +14143,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14186,6 +14372,9 @@ define amdgpu_kernel void 
@global_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14404,6 +14593,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14648,6 +14840,9 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14891,6 +15086,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15161,6 +15359,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 
s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15430,6 +15631,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15675,6 +15879,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15921,6 +16128,9 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16191,6 +16401,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16461,6 +16674,9 @@ define 
amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16731,6 +16947,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17001,6 +17220,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17271,6 +17493,9 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17541,6 +17766,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 
s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17811,6 +18039,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -18081,6 +18312,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18330,6 +18564,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18596,6 +18833,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 
0x2 @@ -18890,6 +19130,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19183,6 +19426,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19452,6 +19698,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19718,6 +19967,9 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -20012,6 +20264,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; 
%entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -20306,6 +20561,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -20600,6 +20858,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -20894,6 +21155,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -21184,6 +21448,9 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: 
s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -21478,6 +21745,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -21772,6 +22042,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll index a6bd1b678f95e..16e55058e4fc8 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll @@ -36,6 +36,9 @@ define amdgpu_kernel void @global_nontemporal_load_0( ; ; GFX7-LABEL: global_nontemporal_load_0: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -227,6 +230,9 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; ; GFX7-LABEL: global_nontemporal_load_1: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: 
s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -474,6 +480,9 @@ define amdgpu_kernel void @global_nontemporal_store_0( ; ; GFX7-LABEL: global_nontemporal_store_0: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -658,6 +667,9 @@ define amdgpu_kernel void @global_nontemporal_store_1( ; ; GFX7-LABEL: global_nontemporal_store_1: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -891,6 +903,9 @@ define amdgpu_kernel void @global_nontemporal_volatile_load( ; ; GFX7-LABEL: global_nontemporal_volatile_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll index a5de6a92db1af..8042d38716107 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll @@ -41,6 +41,9 @@ define amdgpu_kernel void @global_singlethread_unordered_load( ; ; GFX7-LABEL: global_singlethread_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: 
s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -222,6 +225,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_load( ; ; GFX7-LABEL: global_singlethread_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -403,6 +409,9 @@ define amdgpu_kernel void @global_singlethread_acquire_load( ; ; GFX7-LABEL: global_singlethread_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -584,6 +593,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; ; GFX7-LABEL: global_singlethread_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -758,6 +770,9 @@ define amdgpu_kernel void @global_singlethread_unordered_store( ; ; GFX7-LABEL: global_singlethread_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -911,6 +926,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_store( ; ; GFX7-LABEL: global_singlethread_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 
flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1064,6 +1082,9 @@ define amdgpu_kernel void @global_singlethread_release_store( ; ; GFX7-LABEL: global_singlethread_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1217,6 +1238,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; ; GFX7-LABEL: global_singlethread_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1369,6 +1393,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw( ; ; GFX7-LABEL: global_singlethread_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1520,6 +1547,9 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; ; GFX7-LABEL: global_singlethread_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1671,6 +1701,9 @@ define amdgpu_kernel void 
@global_singlethread_release_atomicrmw( ; ; GFX7-LABEL: global_singlethread_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1822,6 +1855,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_singlethread_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1973,6 +2009,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_singlethread_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2126,6 +2165,9 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_singlethread_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2306,6 +2348,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_singlethread_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 
s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2486,6 +2531,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_singlethread_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2669,6 +2717,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -2885,6 +2936,9 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3101,6 +3155,9 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3317,6 +3374,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: 
global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3533,6 +3593,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3749,6 +3812,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3965,6 +4031,9 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4181,6 +4250,9 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 
s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4397,6 +4469,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4613,6 +4688,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4829,6 +4907,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5045,6 +5126,9 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5261,6 +5345,9 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: 
global_singlethread_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5477,6 +5564,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5693,6 +5783,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5912,6 +6005,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6160,6 +6256,9 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; 
GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6408,6 +6507,9 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6656,6 +6758,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6904,6 +7009,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7152,6 +7260,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7400,6 +7511,9 @@ define amdgpu_kernel void 
@global_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7648,6 +7762,9 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7896,6 +8013,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8144,6 +8264,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8392,6 +8515,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: 
s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8640,6 +8766,9 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8888,6 +9017,9 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9136,6 +9268,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9384,6 +9519,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ 
-9632,6 +9770,9 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load( ; ; GFX7-LABEL: global_singlethread_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9813,6 +9954,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load( ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9994,6 +10138,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load( ; ; GFX7-LABEL: global_singlethread_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10175,6 +10322,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10349,6 +10499,9 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store( ; ; GFX7-LABEL: global_singlethread_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 
s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10502,6 +10655,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store( ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10655,6 +10811,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; ; GFX7-LABEL: global_singlethread_one_as_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10808,6 +10967,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10960,6 +11122,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11111,6 +11276,9 @@ define amdgpu_kernel void 
@global_singlethread_one_as_acquire_atomicrmw( ; ; GFX7-LABEL: global_singlethread_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11262,6 +11430,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; ; GFX7-LABEL: global_singlethread_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11413,6 +11584,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11564,6 +11738,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11717,6 +11894,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; 
GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11897,6 +12077,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12077,6 +12260,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12260,6 +12446,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -12476,6 +12665,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ 
-12692,6 +12884,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -12908,6 +13103,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13124,6 +13322,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13340,6 +13541,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13556,6 +13760,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: 
global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13772,6 +13979,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13988,6 +14198,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14204,6 +14417,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14420,6 +14636,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; 
GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14636,6 +14855,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14852,6 +15074,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15068,6 +15293,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15284,6 +15512,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 
0x2 @@ -15503,6 +15734,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -15751,6 +15985,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; ; GFX7-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -15999,6 +16236,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; ; GFX7-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -16247,6 +16487,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -16495,6 +16738,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; ; GFX7-LABEL: 
global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -16743,6 +16989,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -16991,6 +17240,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17239,6 +17491,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17487,6 +17742,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: 
s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17735,6 +17993,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17983,6 +18244,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18231,6 +18495,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18479,6 +18746,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 
s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18727,6 +18997,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18975,6 +19248,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll index 69404247ccd6e..9c11781da56f2 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll @@ -41,6 +41,9 @@ define amdgpu_kernel void @global_system_unordered_load( ; ; GFX7-LABEL: global_system_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -222,6 +225,9 @@ define amdgpu_kernel void @global_system_monotonic_load( ; ; GFX7-LABEL: global_system_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: 
s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -404,6 +410,9 @@ define amdgpu_kernel void @global_system_acquire_load( ; ; GFX7-LABEL: global_system_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -604,6 +613,9 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; ; GFX7-LABEL: global_system_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -813,6 +825,9 @@ define amdgpu_kernel void @global_system_unordered_store( ; ; GFX7-LABEL: global_system_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -966,6 +981,9 @@ define amdgpu_kernel void @global_system_monotonic_store( ; ; GFX7-LABEL: global_system_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1120,6 +1138,9 @@ define amdgpu_kernel void @global_system_release_store( ; ; GFX7-LABEL: global_system_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: 
s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1302,6 +1323,9 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; ; GFX7-LABEL: global_system_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1482,6 +1506,9 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; ; GFX7-LABEL: global_system_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1635,6 +1662,9 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; ; GFX7-LABEL: global_system_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1816,6 +1846,9 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; ; GFX7-LABEL: global_system_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1998,6 +2031,9 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_system_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 
flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2209,6 +2245,9 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_system_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2420,6 +2459,9 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_system_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2619,6 +2661,9 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_system_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2850,6 +2895,9 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_system_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -3082,6 +3130,9 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( 
; ; GFX7-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3300,6 +3351,9 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3546,6 +3600,9 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3793,6 +3850,9 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4069,6 +4129,9 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: 
s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4344,6 +4407,9 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4591,6 +4657,9 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4839,6 +4908,9 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5115,6 +5187,9 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5391,6 +5466,9 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 
flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5667,6 +5745,9 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5943,6 +6024,9 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6192,6 +6276,9 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6460,6 +6547,9 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ 
-6760,6 +6850,9 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7059,6 +7152,9 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7330,6 +7426,9 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7598,6 +7697,9 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7898,6 +8000,9 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 
s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8198,6 +8303,9 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8498,6 +8606,9 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8798,6 +8909,9 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9094,6 +9208,9 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9394,6 +9511,9 @@ define amdgpu_kernel void 
@global_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9694,6 +9814,9 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9992,6 +10115,9 @@ define amdgpu_kernel void @global_system_one_as_unordered_load( ; ; GFX7-LABEL: global_system_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10173,6 +10299,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load( ; ; GFX7-LABEL: global_system_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10355,6 +10484,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_load( ; ; GFX7-LABEL: global_system_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: 
s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10555,6 +10687,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load( ; ; GFX7-LABEL: global_system_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10764,6 +10899,9 @@ define amdgpu_kernel void @global_system_one_as_unordered_store( ; ; GFX7-LABEL: global_system_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10917,6 +11055,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store( ; ; GFX7-LABEL: global_system_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11071,6 +11212,9 @@ define amdgpu_kernel void @global_system_one_as_release_store( ; ; GFX7-LABEL: global_system_one_as_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11253,6 +11397,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store( ; ; GFX7-LABEL: global_system_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: 
s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11433,6 +11580,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; ; GFX7-LABEL: global_system_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11586,6 +11736,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; ; GFX7-LABEL: global_system_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11767,6 +11920,9 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; ; GFX7-LABEL: global_system_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11949,6 +12105,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_system_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12160,6 +12319,9 @@ define 
amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_system_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12371,6 +12533,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12570,6 +12735,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12801,6 +12969,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13033,6 +13204,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: 
s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13251,6 +13425,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13497,6 +13674,9 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13744,6 +13924,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14020,6 +14203,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14295,6 +14481,9 @@ define 
amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14542,6 +14731,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14790,6 +14982,9 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15066,6 +15261,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15342,6 +15540,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: 
s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15618,6 +15819,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15894,6 +16098,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16170,6 +16377,9 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16446,6 +16656,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16722,6 
+16935,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16998,6 +17214,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17247,6 +17466,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17514,6 +17736,9 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17792,6 +18017,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; 
%entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18092,6 +18320,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18391,6 +18622,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18662,6 +18896,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18930,6 +19167,9 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; 
GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19230,6 +19470,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19530,6 +19773,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19830,6 +20076,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -20130,6 +20379,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -20426,6 +20678,9 @@ define amdgpu_kernel void 
@global_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -20726,6 +20981,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -21026,6 +21284,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll index 7dfd5e60c24f8..8a5c5dda9f79c 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll @@ -37,6 +37,9 @@ define amdgpu_kernel void @global_volatile_load_0( ; ; GFX7-LABEL: global_volatile_load_0: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: 
s_waitcnt lgkmcnt(0) @@ -184,6 +187,9 @@ define amdgpu_kernel void @global_volatile_load_1( ; ; GFX7-LABEL: global_volatile_load_1: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -372,6 +378,9 @@ define amdgpu_kernel void @global_volatile_store_0( ; ; GFX7-LABEL: global_volatile_store_0: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -527,6 +536,9 @@ define amdgpu_kernel void @global_volatile_store_1( ; ; GFX7-LABEL: global_volatile_store_1: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -718,6 +730,9 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load( ; ; GFX7-LABEL: global_volatile_workgroup_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -852,6 +867,9 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; ; GFX7-LABEL: global_volatile_workgroup_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 
0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll index 4b6c99282dc13..151ba07a0b531 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll @@ -41,6 +41,9 @@ define amdgpu_kernel void @global_wavefront_unordered_load( ; ; GFX7-LABEL: global_wavefront_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -222,6 +225,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_load( ; ; GFX7-LABEL: global_wavefront_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -403,6 +409,9 @@ define amdgpu_kernel void @global_wavefront_acquire_load( ; ; GFX7-LABEL: global_wavefront_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -584,6 +593,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; ; GFX7-LABEL: global_wavefront_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: 
s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -758,6 +770,9 @@ define amdgpu_kernel void @global_wavefront_unordered_store( ; ; GFX7-LABEL: global_wavefront_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -911,6 +926,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_store( ; ; GFX7-LABEL: global_wavefront_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1064,6 +1082,9 @@ define amdgpu_kernel void @global_wavefront_release_store( ; ; GFX7-LABEL: global_wavefront_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1217,6 +1238,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; ; GFX7-LABEL: global_wavefront_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1369,6 +1393,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw( ; ; GFX7-LABEL: global_wavefront_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 
flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1520,6 +1547,9 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; ; GFX7-LABEL: global_wavefront_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1671,6 +1701,9 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; ; GFX7-LABEL: global_wavefront_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1822,6 +1855,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_wavefront_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1973,6 +2009,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_wavefront_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2126,6 +2165,9 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_wavefront_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; 
GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2306,6 +2348,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_wavefront_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2486,6 +2531,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_wavefront_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2669,6 +2717,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -2885,6 +2936,9 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3101,6 
+3155,9 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3317,6 +3374,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3533,6 +3593,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3749,6 +3812,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3965,6 +4031,9 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, 
s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4181,6 +4250,9 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4397,6 +4469,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4613,6 +4688,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4829,6 +4907,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5045,6 +5126,9 @@ define amdgpu_kernel void 
@global_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5261,6 +5345,9 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5477,6 +5564,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5693,6 +5783,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5912,6 +6005,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 
flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6160,6 +6256,9 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6408,6 +6507,9 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6656,6 +6758,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6904,6 +7009,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7152,6 +7260,9 @@ define amdgpu_kernel void 
@global_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7400,6 +7511,9 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7648,6 +7762,9 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7896,6 +8013,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8144,6 +8264,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; 
GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8392,6 +8515,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8640,6 +8766,9 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8888,6 +9017,9 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9136,6 +9268,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9384,6 +9519,9 @@ define amdgpu_kernel void 
@global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9632,6 +9770,9 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load( ; ; GFX7-LABEL: global_wavefront_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9813,6 +9954,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load( ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9994,6 +10138,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load( ; ; GFX7-LABEL: global_wavefront_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10175,6 +10322,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; 
GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10349,6 +10499,9 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store( ; ; GFX7-LABEL: global_wavefront_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10502,6 +10655,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store( ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10655,6 +10811,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; ; GFX7-LABEL: global_wavefront_one_as_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10808,6 +10967,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10960,6 +11122,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( ; ; GFX7-LABEL: 
global_wavefront_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11111,6 +11276,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; ; GFX7-LABEL: global_wavefront_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11262,6 +11430,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; ; GFX7-LABEL: global_wavefront_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11413,6 +11584,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11564,6 +11738,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; 
GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11717,6 +11894,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11897,6 +12077,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12077,6 +12260,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12260,6 +12446,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -12476,6 +12665,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: 
global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -12692,6 +12884,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -12908,6 +13103,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13124,6 +13322,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13340,6 +13541,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: 
s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13556,6 +13760,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13772,6 +13979,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13988,6 +14198,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14204,6 +14417,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14420,6 +14636,9 @@ define 
amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14636,6 +14855,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14852,6 +15074,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15068,6 +15293,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15284,6 +15512,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 
flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15503,6 +15734,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -15751,6 +15985,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX7-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -15999,6 +16236,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; ; GFX7-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -16247,6 +16487,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; 
GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -16495,6 +16738,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -16743,6 +16989,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -16991,6 +17240,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17239,6 +17491,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17487,6 +17742,9 @@ define amdgpu_kernel void 
@global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17735,6 +17993,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17983,6 +18244,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18231,6 +18495,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18479,6 +18746,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; 
GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18727,6 +18997,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18975,6 +19248,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll index 46d65187cb1b2..69b0c7f93ab0e 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll @@ -41,6 +41,9 @@ define amdgpu_kernel void @global_workgroup_unordered_load( ; ; GFX7-LABEL: global_workgroup_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -222,6 +225,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_load( ; ; 
GFX7-LABEL: global_workgroup_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -403,6 +409,9 @@ define amdgpu_kernel void @global_workgroup_acquire_load( ; ; GFX7-LABEL: global_workgroup_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -590,6 +599,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; ; GFX7-LABEL: global_workgroup_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -780,6 +792,9 @@ define amdgpu_kernel void @global_workgroup_unordered_store( ; ; GFX7-LABEL: global_workgroup_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -933,6 +948,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_store( ; ; GFX7-LABEL: global_workgroup_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1087,6 
+1105,9 @@ define amdgpu_kernel void @global_workgroup_release_store( ; ; GFX7-LABEL: global_workgroup_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1258,6 +1279,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; ; GFX7-LABEL: global_workgroup_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1427,6 +1451,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( ; ; GFX7-LABEL: global_workgroup_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1578,6 +1605,9 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; ; GFX7-LABEL: global_workgroup_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1740,6 +1770,9 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; ; GFX7-LABEL: global_workgroup_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; 
GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1909,6 +1942,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_workgroup_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2088,6 +2124,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_workgroup_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2268,6 +2307,9 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_workgroup_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2454,6 +2496,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_workgroup_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2659,6 +2704,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_workgroup_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: 
s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2866,6 +2914,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3082,6 +3133,9 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3309,6 +3363,9 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3543,6 +3600,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3787,6 +3847,9 @@ define amdgpu_kernel 
void @global_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4030,6 +4093,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4256,6 +4322,9 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4483,6 +4552,9 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4727,6 +4799,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 
flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4971,6 +5046,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5215,6 +5293,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5459,6 +5540,9 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5703,6 +5787,9 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5947,6 +6034,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; ; 
GFX7-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6191,6 +6281,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6437,6 +6530,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6685,6 +6781,9 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6939,6 +7038,9 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; 
GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7205,6 +7307,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7478,6 +7583,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7750,6 +7858,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8005,6 +8116,9 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8259,6 +8373,9 @@ define amdgpu_kernel void 
@global_workgroup_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8532,6 +8649,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8805,6 +8925,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9078,6 +9201,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9351,6 +9477,9 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; 
GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9622,6 +9751,9 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9895,6 +10027,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -10168,6 +10303,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -10440,6 +10578,9 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load( ; ; GFX7-LABEL: global_workgroup_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10621,6 +10762,9 @@ define amdgpu_kernel void 
@global_workgroup_one_as_monotonic_load( ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10802,6 +10946,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load( ; ; GFX7-LABEL: global_workgroup_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10988,6 +11135,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11175,6 +11325,9 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store( ; ; GFX7-LABEL: global_workgroup_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11328,6 +11481,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store( ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: 
s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11481,6 +11637,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; ; GFX7-LABEL: global_workgroup_one_as_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11644,6 +11803,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11806,6 +11968,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11957,6 +12122,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; ; GFX7-LABEL: global_workgroup_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12118,6 +12286,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; ; GFX7-LABEL: global_workgroup_one_as_release_atomicrmw: ; GFX7: 
; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12279,6 +12450,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12450,6 +12624,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12623,6 +12800,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12808,6 +12988,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; 
GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13005,6 +13188,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13205,6 +13391,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13421,6 +13610,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13647,6 +13839,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13873,6 +14068,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: 
global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14109,6 +14307,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14345,6 +14546,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14571,6 +14775,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14797,6 +15004,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 
flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15033,6 +15243,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15269,6 +15482,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15505,6 +15721,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15741,6 +15960,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15977,6 +16199,9 @@ define 
amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16213,6 +16438,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16449,6 +16677,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16688,6 +16919,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -16936,6 +17170,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX7-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; 
GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17189,6 +17426,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; ; GFX7-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17447,6 +17687,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17712,6 +17955,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17977,6 +18223,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 
s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18232,6 +18481,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18485,6 +18737,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18750,6 +19005,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19015,6 +19273,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19280,6 +19541,9 @@ define amdgpu_kernel void 
@global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19545,6 +19809,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19808,6 +20075,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -20073,6 +20343,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -20338,6 +20611,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; 
GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll index 04b0f00fe77b5..78209ee34cad4 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll @@ -38,6 +38,9 @@ define amdgpu_kernel void @local_nontemporal_load_0( ; ; GFX7-LABEL: local_nontemporal_load_0: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -224,6 +227,9 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; ; GFX7-LABEL: local_nontemporal_load_1: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 2 @@ -830,6 +836,9 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; ; GFX7-LABEL: local_nontemporal_volatile_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll index 9e5f5fcffca9f..bc2508411ed6b 100644 --- 
a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll @@ -34,6 +34,9 @@ define amdgpu_kernel void @local_volatile_load_0( ; ; GFX7-LABEL: local_volatile_load_0: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -172,6 +175,9 @@ define amdgpu_kernel void @local_volatile_load_1( ; ; GFX7-LABEL: local_volatile_load_1: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 2 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll index fceee413f3f97..2aa4f021c259c 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll @@ -38,7 +38,10 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; ; GFX7-LABEL: private_nontemporal_load_0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_add_u32 s0, s0, s15 +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 @@ -53,7 +56,7 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; ; GFX10-WGP-LABEL: private_nontemporal_load_0: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: 
s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 @@ -67,7 +70,7 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; ; GFX10-CU-LABEL: private_nontemporal_load_0: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 @@ -107,7 +110,7 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_0: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 @@ -121,7 +124,7 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; ; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_0: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15 +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 @@ -232,7 +235,10 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; ; GFX7-LABEL: private_nontemporal_load_1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_add_u32 s0, s0, s15 +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 @@ -249,7 +255,7 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; ; GFX10-WGP-LABEL: private_nontemporal_load_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 
s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x0 @@ -265,7 +271,7 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; ; GFX10-CU-LABEL: private_nontemporal_load_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x0 @@ -309,7 +315,7 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_1: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x0 @@ -328,7 +334,7 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; ; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_1: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15 +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x0 @@ -470,7 +476,7 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; ; GFX7-LABEL: private_nontemporal_store_0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_add_u32 s0, s0, s15 +; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 @@ -484,7 +490,7 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; ; GFX10-WGP-LABEL: private_nontemporal_store_0: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: 
s_load_dword s4, s[8:9], 0x8 @@ -498,7 +504,7 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; ; GFX10-CU-LABEL: private_nontemporal_store_0: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 @@ -530,7 +536,7 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_0: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 @@ -544,7 +550,7 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; ; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_0: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15 +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 @@ -647,7 +653,7 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX7-LABEL: private_nontemporal_store_1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_add_u32 s0, s0, s15 +; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -663,7 +669,7 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX10-WGP-LABEL: private_nontemporal_store_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 @@ -678,7 +684,7 @@ define 
amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX10-CU-LABEL: private_nontemporal_store_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 @@ -713,7 +719,7 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_1: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 @@ -731,7 +737,7 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_1: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15 +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 @@ -874,7 +880,10 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; ; GFX7-LABEL: private_nontemporal_volatile_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_add_u32 s0, s0, s15 +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 @@ -889,7 +898,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; ; GFX10-WGP-LABEL: private_nontemporal_volatile_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword 
s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 @@ -903,7 +912,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; ; GFX10-CU-LABEL: private_nontemporal_volatile_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 @@ -943,7 +952,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_volatile_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 @@ -957,7 +966,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; ; GFX90A-TGSPLIT-LABEL: private_nontemporal_volatile_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15 +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll index f8fb7986938f2..df4193969f8a0 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll @@ -38,7 +38,10 @@ define amdgpu_kernel void @private_volatile_load_0( ; ; GFX7-LABEL: private_volatile_load_0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_add_u32 s0, s0, s15 +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: 
s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 @@ -53,7 +56,7 @@ define amdgpu_kernel void @private_volatile_load_0( ; ; GFX10-WGP-LABEL: private_volatile_load_0: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 @@ -67,7 +70,7 @@ define amdgpu_kernel void @private_volatile_load_0( ; ; GFX10-CU-LABEL: private_volatile_load_0: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 @@ -190,7 +193,10 @@ define amdgpu_kernel void @private_volatile_load_1( ; ; GFX7-LABEL: private_volatile_load_1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_add_u32 s0, s0, s15 +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 @@ -207,7 +213,7 @@ define amdgpu_kernel void @private_volatile_load_1( ; ; GFX10-WGP-LABEL: private_volatile_load_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x0 @@ -223,7 +229,7 @@ define amdgpu_kernel void @private_volatile_load_1( ; ; GFX10-CU-LABEL: private_volatile_load_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-CU-NEXT: s_load_dword 
s7, s[8:9], 0x0 @@ -365,7 +371,7 @@ define amdgpu_kernel void @private_volatile_store_0( ; ; GFX7-LABEL: private_volatile_store_0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_add_u32 s0, s0, s15 +; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 @@ -380,7 +386,7 @@ define amdgpu_kernel void @private_volatile_store_0( ; ; GFX10-WGP-LABEL: private_volatile_store_0: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 @@ -395,7 +401,7 @@ define amdgpu_kernel void @private_volatile_store_0( ; ; GFX10-CU-LABEL: private_volatile_store_0: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 @@ -515,7 +521,7 @@ define amdgpu_kernel void @private_volatile_store_1( ; ; GFX7-LABEL: private_volatile_store_1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_add_u32 s0, s0, s15 +; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -532,7 +538,7 @@ define amdgpu_kernel void @private_volatile_store_1( ; ; GFX10-WGP-LABEL: private_volatile_store_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 @@ -548,7 +554,7 @@ define amdgpu_kernel void @private_volatile_store_1( ; ; GFX10-CU-LABEL: private_volatile_store_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 +; 
GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll index aaf81e2fa4000..07072f6a36296 100644 --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -34,10 +34,13 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -56,10 +59,13 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -144,6 +150,9 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3 ; CI-LABEL: s_test_imin_sle_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: 
s_min_i32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -155,6 +164,9 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3 ; VI-LABEL: s_test_imin_sle_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -214,6 +226,9 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32 ; CI-LABEL: s_test_imin_sle_v1i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -225,6 +240,9 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32 ; VI-LABEL: s_test_imin_sle_v1i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -288,6 +306,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x4 ; CI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s3, s3, s7 ; CI-NEXT: s_min_i32 s2, s2, s6 @@ -306,6 +327,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10 ; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: 
s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s3, s3, s7 ; VI-NEXT: s_min_i32 s2, s2, s6 @@ -414,11 +438,14 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; CI-NEXT: s_load_dword s2, s[8:9], 0xa ; CI-NEXT: s_load_dword s3, s[8:9], 0x13 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_sext_i32_i8 s2, s2 ; CI-NEXT: s_sext_i32_i8 s3, s3 ; CI-NEXT: s_min_i32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_byte v[0:1], v2 @@ -429,11 +456,14 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; VI-NEXT: s_load_dword s2, s[8:9], 0x28 ; VI-NEXT: s_load_dword s3, s[8:9], 0x4c ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_sext_i32_i8 s2, s2 ; VI-NEXT: s_sext_i32_i8 s3, s3 ; VI-NEXT: s_min_i32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_byte v[0:1], v2 @@ -549,6 +579,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; CI-NEXT: s_load_dword s2, s[8:9], 0xa ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: s_load_dword s3, s[8:9], 0x13 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_ashr_i32 s4, s2, 24 ; CI-NEXT: s_sext_i32_i8 s5, s2 @@ -572,6 +604,7 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; CI-NEXT: s_and_b32 s3, s3, 0xffff ; CI-NEXT: s_or_b32 s2, s3, s2 ; CI-NEXT: 
v_mov_b32_e32 v0, s0 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_dword v[0:1], v2 @@ -582,6 +615,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; VI-NEXT: s_load_dword s2, s[8:9], 0x28 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_load_dword s3, s[8:9], 0x4c +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s4, s2, 24 ; VI-NEXT: s_bfe_i32 s5, s2, 0x80010 @@ -605,6 +640,7 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: s_or_b32 s2, s2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -757,6 +793,9 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; CI-LABEL: s_test_imin_sle_v2i16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_ashr_i32 s4, s2, 16 ; CI-NEXT: s_sext_i32_i16 s2, s2 @@ -776,6 +815,9 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; VI-LABEL: s_test_imin_sle_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s4, s2, 16 ; VI-NEXT: s_sext_i32_i16 s2, s2 @@ -857,6 +899,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; CI-NEXT: 
s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_ashr_i32 s6, s0, 16 ; CI-NEXT: s_ashr_i32 s7, s1, 16 @@ -887,6 +932,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s6, s1, 16 ; VI-NEXT: s_sext_i32_i16 s1, s1 @@ -983,10 +1031,13 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -1005,10 +1056,13 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -1122,10 +1176,13 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) 
%out, ptr addrsp ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -1144,10 +1201,13 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -1233,6 +1293,9 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3 ; CI-LABEL: s_test_imin_slt_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1244,6 +1307,9 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3 ; VI-LABEL: s_test_imin_slt_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, s3 ; VI-NEXT: 
v_mov_b32_e32 v0, s0 @@ -1305,6 +1371,9 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s1, s1, s3 ; CI-NEXT: s_min_i32 s0, s0, s2 @@ -1319,6 +1388,9 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s1, s1, s3 ; VI-NEXT: s_min_i32 s0, s0, s2 @@ -1391,6 +1463,9 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, 8 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1403,6 +1478,9 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, 8 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1468,6 +1546,9 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 
flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, 8 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1480,6 +1561,9 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, 8 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1557,10 +1641,13 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -1579,10 +1666,13 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -1686,12 +1776,15 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 
0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v6, 4, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v6 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_mov_b32_e32 v2, s5 ; CI-NEXT: v_add_i32_e32 v3, vcc, s4, v6 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc ; CI-NEXT: flat_load_dwordx3 v[0:2], v[0:1] ; CI-NEXT: flat_load_dwordx3 v[3:5], v[3:4] @@ -1710,12 +1803,15 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v6, 4, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v6 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc ; VI-NEXT: flat_load_dwordx3 v[0:2], v[0:1] ; VI-NEXT: flat_load_dwordx3 v[3:5], v[3:4] @@ -1838,12 +1934,15 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: flat_load_dwordx2 
v[2:3], v[2:3] @@ -1874,12 +1973,15 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] @@ -1976,6 +2078,9 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3 ; CI-LABEL: s_test_umin_ule_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_u32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1987,6 +2092,9 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3 ; VI-LABEL: s_test_umin_ule_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_u32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2059,10 +2167,13 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 ; 
CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -2081,10 +2192,13 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -2188,6 +2302,9 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s3 ; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0 @@ -2209,6 +2326,9 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v0 @@ -2294,6 +2414,9 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3 ; CI-LABEL: s_test_umin_ult_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, 
s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_u32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2305,6 +2428,9 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3 ; VI-LABEL: s_test_umin_ult_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_u32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2386,6 +2512,9 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ; CI-LABEL: v_test_umin_ult_i32_multi_use: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s4, s[4:5], 0x0 ; CI-NEXT: s_load_dword s5, s[6:7], 0x0 @@ -2407,6 +2536,9 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ; VI-LABEL: v_test_umin_ult_i32_multi_use: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[4:5], 0x0 ; VI-NEXT: s_load_dword s5, s[6:7], 0x0 @@ -2534,6 +2666,9 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; CI-LABEL: v_test_umin_ult_i16_multi_use: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 @@ -2556,6 +2691,9 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; VI-LABEL: 
v_test_umin_ult_i16_multi_use: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -2646,6 +2784,9 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32 ; CI-LABEL: s_test_umin_ult_v1i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_u32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2657,6 +2798,9 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32 ; VI-LABEL: s_test_umin_ult_v1i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_u32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2726,6 +2870,9 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; ; CI-LABEL: s_test_umin_ult_v8i32: ; CI: ; %bb.0: +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x8 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2757,6 +2904,9 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; ; VI-LABEL: s_test_umin_ult_v8i32: ; VI: ; %bb.0: +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x20 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2921,6 +3071,9 @@ 
define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x4 ; CI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s10, s0, 16 ; CI-NEXT: s_and_b32 s0, s0, 0xffff @@ -2967,6 +3120,9 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10 ; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s10, s3, 16 ; VI-NEXT: s_and_b32 s3, s3, 0xffff @@ -3088,11 +3244,14 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac ; CI-NEXT: s_load_dword s2, s[8:9], 0xa ; CI-NEXT: s_load_dword s3, s[8:9], 0x13 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0xffff ; CI-NEXT: s_and_b32 s3, s3, 0xffff ; CI-NEXT: s_min_u32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_dword v[0:1], v2 @@ -3103,11 +3262,14 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac ; VI-NEXT: s_load_dword s2, s[8:9], 0x28 ; VI-NEXT: s_load_dword s3, s[8:9], 0x4c ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: s_and_b32 s3, s3, 0xffff ; VI-NEXT: s_min_u32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_lshr_b32 
flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -3195,11 +3357,14 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace ; CI-NEXT: s_load_dword s2, s[8:9], 0xa ; CI-NEXT: s_load_dword s3, s[8:9], 0x13 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_sext_i32_i16 s2, s2 ; CI-NEXT: s_sext_i32_i16 s3, s3 ; CI-NEXT: s_min_i32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_dword v[0:1], v2 @@ -3210,11 +3375,14 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace ; VI-NEXT: s_load_dword s2, s[8:9], 0x28 ; VI-NEXT: s_load_dword s3, s[8:9], 0x4c ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_sext_i32_i16 s2, s2 ; VI-NEXT: s_sext_i32_i16 s3, s3 ; VI-NEXT: s_min_i32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -3309,6 +3477,9 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_sext_i32_i16 s3, s2 ; CI-NEXT: s_ashr_i32 s2, s2, 16 @@ -3323,6 +3494,9 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], 
s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_sext_i32_i16 s3, s2 ; VI-NEXT: s_ashr_i32 s2, s2, 16 @@ -3403,6 +3577,9 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -3421,6 +3598,9 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -3510,6 +3690,9 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -3528,6 +3711,9 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -3617,6 +3803,9 @@ define 
amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -3635,6 +3824,9 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -3724,6 +3916,9 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -3742,6 +3937,9 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -3855,9 +4053,12 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 
flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: flat_load_dword v4, v[0:1] @@ -3886,10 +4087,13 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -4005,9 +4209,12 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: flat_load_dword v4, v[0:1] @@ -4035,10 +4242,13 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_lshr_b32 
flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll index 337320b9eeea1..b1ce5a3423f20 100644 --- a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll @@ -180,6 +180,9 @@ define amdgpu_kernel void @v_pack_v2f16(ptr addrspace(1) %in0, ptr addrspace(1) ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -260,6 +263,9 @@ define amdgpu_kernel void @v_pack_v2f16_user(ptr addrspace(1) %in0, ptr addrspac ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -341,6 +347,9 @@ define amdgpu_kernel void @v_pack_v2f16_imm_lo(ptr addrspace(1) %in1) #0 { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -403,6 +412,9 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_lo(ptr addrspace(1) %in1) #0 ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -465,6 +477,9 @@ define amdgpu_kernel void @v_pack_v2f16_imm_hi(ptr addrspace(1) %in0) #0 { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -527,6 +542,9 @@ define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(ptr addrspace(1) %in0) ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -588,6 +606,9 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_hi(ptr addrspace(1) %in0) #0 ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll index bc1710686a087..5803821a1d2c0 100644 --- a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll @@ -176,6 +176,9 @@ define amdgpu_kernel void @v_pack_v2i16(ptr addrspace(1) %in0, ptr addrspace(1) ; GFX803: ; %bb.0: ; GFX803-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX803-NEXT: s_add_i32 s12, s12, s17 +; GFX803-NEXT: 
s_mov_b32 flat_scratch_lo, s13 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 ; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -254,6 +257,9 @@ define amdgpu_kernel void @v_pack_v2i16_user(ptr addrspace(1) %in0, ptr addrspac ; GFX803: ; %bb.0: ; GFX803-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX803-NEXT: s_add_i32 s12, s12, s17 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 ; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -333,6 +339,9 @@ define amdgpu_kernel void @v_pack_v2i16_imm_lo(ptr addrspace(1) %in1) #0 { ; GFX803: ; %bb.0: ; GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX803-NEXT: s_add_i32 s12, s12, s17 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 ; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -393,6 +402,9 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(ptr addrspace(1) %in1) #0 ; GFX803: ; %bb.0: ; GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX803-NEXT: s_add_i32 s12, s12, s17 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 ; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -454,6 +466,9 @@ define amdgpu_kernel void @v_pack_v2i16_imm_hi(ptr addrspace(1) %in0) #0 { ; GFX803: ; %bb.0: ; GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX803-NEXT: s_add_i32 s12, s12, s17 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: 
v_mov_b32_e32 v1, s1 ; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -514,6 +529,9 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_hi(ptr addrspace(1) %in0) #0 ; GFX803: ; %bb.0: ; GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX803-NEXT: s_add_i32 s12, s12, s17 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 ; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll index 2e9f09ad41813..7c9ecc892478c 100644 --- a/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals ; Check that no attributes are added to graphics functions -; RUN: opt -S -mtriple=amdgcn-amd-amdpal -amdgpu-annotate-kernel-features %s | FileCheck -check-prefixes=AKF_GCN %s ; RUN: opt -S -mtriple=amdgcn-amd-amdpal -passes=amdgpu-attributor %s | FileCheck -check-prefixes=ATTRIBUTOR_GCN %s ; Check that it doesn't crash @@ -12,12 +11,6 @@ target datalayout = "A5" define amdgpu_cs void @test_simple_indirect_call() { -; AKF_GCN-LABEL: define {{[^@]+}}@test_simple_indirect_call() { -; AKF_GCN-NEXT: [[PC:%.*]] = call i64 @llvm.amdgcn.s.getpc() -; AKF_GCN-NEXT: [[FUN:%.*]] = inttoptr i64 [[PC]] to ptr -; AKF_GCN-NEXT: call amdgpu_gfx void [[FUN]]() -; AKF_GCN-NEXT: ret void -; ; ATTRIBUTOR_GCN-LABEL: define {{[^@]+}}@test_simple_indirect_call ; ATTRIBUTOR_GCN-SAME: () #[[ATTR0:[0-9]+]] { ; ATTRIBUTOR_GCN-NEXT: [[PC:%.*]] = call i64 @llvm.amdgcn.s.getpc() @@ -68,7 +61,6 @@ declare i64 @llvm.amdgcn.s.getpc() #0 attributes #0 = { nounwind readnone speculatable willreturn } ;. 
-; AKF_GCN: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;. ; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "uniform-work-group-size"="false" } ; ATTRIBUTOR_GCN: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll index 1a34fa3bbbf4d..24677b60be6c2 100644 --- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll @@ -10,32 +10,33 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; REGALLOC-GFX908: bb.0 (%ir-block.0): ; REGALLOC-GFX908-NEXT: liveins: $sgpr4_sgpr5 ; REGALLOC-GFX908-NEXT: {{ $}} - ; REGALLOC-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %5:agpr_32 - ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6094858 /* regdef:VReg_128 */, def %6 - ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def %7 - ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %14:vreg_64, %6, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) + ; REGALLOC-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %6:agpr_32 + ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6094858 /* regdef:VReg_128 */, def %7 + ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def %8 + + ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64, %7, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) ; REGALLOC-GFX908-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = 
S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) ; REGALLOC-GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3 ; REGALLOC-GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; REGALLOC-GFX908-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec ; REGALLOC-GFX908-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec - ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef %16:vreg_64, %7, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1) - ; REGALLOC-GFX908-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]] - ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %18:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) + ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef %17:vreg_64, %8, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1) + ; REGALLOC-GFX908-NEXT: [[COPY3:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]] + ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %19:vreg_64, [[COPY3]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) ; REGALLOC-GFX908-NEXT: S_ENDPGM 0 ; ; PEI-GFX908-LABEL: name: partial_copy ; PEI-GFX908: bb.0 (%ir-block.0): - ; PEI-GFX908-NEXT: liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr7 + ; PEI-GFX908-NEXT: liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr9 ; PEI-GFX908-NEXT: {{ $}} - ; PEI-GFX908-NEXT: $sgpr8_sgpr9_sgpr10_sgpr11 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3 - ; PEI-GFX908-NEXT: $sgpr8 = S_ADD_U32 $sgpr8, $sgpr7, implicit-def $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 - ; PEI-GFX908-NEXT: $sgpr9 = S_ADDC_U32 
$sgpr9, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; PEI-GFX908-NEXT: $sgpr12_sgpr13_sgpr14_sgpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3 + ; PEI-GFX908-NEXT: $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15 + ; PEI-GFX908-NEXT: $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15 ; PEI-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef renamable $agpr0 ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6094858 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 ; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1 - ; PEI-GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5) + ; PEI-GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5) ; PEI-GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1 ; PEI-GFX908-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec ; PEI-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) @@ -44,7 +45,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; PEI-GFX908-NEXT: renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec ; PEI-GFX908-NEXT: renamable $vgpr1 = 
V_MOV_B32_e32 2, implicit $exec ; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec - ; PEI-GFX908-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5) + ; PEI-GFX908-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5) ; PEI-GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1 ; PEI-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1) ; PEI-GFX908-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec @@ -55,31 +56,31 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; REGALLOC-GFX90A: bb.0 (%ir-block.0): ; REGALLOC-GFX90A-NEXT: liveins: $sgpr4_sgpr5 ; REGALLOC-GFX90A-NEXT: {{ $}} - ; REGALLOC-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %5:agpr_32 - ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:VReg_128_Align2 */, def %6 - ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def %7 - ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %14:vreg_64_align2, %6, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) + ; REGALLOC-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %6:agpr_32 + ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:VReg_128_Align2 */, def %7 + ; 
REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def %8 + ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64_align2, %7, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) ; REGALLOC-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) ; REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3 ; REGALLOC-GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; REGALLOC-GFX90A-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec ; REGALLOC-GFX90A-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec - ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef %16:vreg_64_align2, %7, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1) - ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %18:vreg_64_align2, [[V_MFMA_I32_4X4X4I8_e64_]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) + ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef %17:vreg_64_align2, %8, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1) + ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %19:vreg_64_align2, [[V_MFMA_I32_4X4X4I8_e64_]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) ; REGALLOC-GFX90A-NEXT: S_ENDPGM 0 ; ; PEI-GFX90A-LABEL: name: partial_copy ; PEI-GFX90A: bb.0 (%ir-block.0): - ; PEI-GFX90A-NEXT: liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr7 + ; PEI-GFX90A-NEXT: liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr9 ; PEI-GFX90A-NEXT: 
{{ $}} - ; PEI-GFX90A-NEXT: $sgpr8_sgpr9_sgpr10_sgpr11 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3 - ; PEI-GFX90A-NEXT: $sgpr8 = S_ADD_U32 $sgpr8, $sgpr7, implicit-def $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 - ; PEI-GFX90A-NEXT: $sgpr9 = S_ADDC_U32 $sgpr9, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; PEI-GFX90A-NEXT: $sgpr12_sgpr13_sgpr14_sgpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3 + ; PEI-GFX90A-NEXT: $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15 + ; PEI-GFX90A-NEXT: $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15 ; PEI-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef renamable $agpr0 ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 ; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def renamable $vgpr0_vgpr1 - ; PEI-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5) + ; PEI-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5) ; PEI-GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1 ; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) ; PEI-GFX90A-NEXT: renamable 
$sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) @@ -87,7 +88,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; PEI-GFX90A-NEXT: renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec ; PEI-GFX90A-NEXT: renamable $vgpr1 = V_MOV_B32_e32 2, implicit $exec ; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec - ; PEI-GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5) + ; PEI-GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5) ; PEI-GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1 ; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1) ; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll index 00507c1eafd6e..c26f0926d86b2 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll @@ -19,16 +19,16 @@ define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg %out) #0 ; ; GFX90a-LABEL: preload_block_count_x: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx2 
s[8:9], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB0_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB0_0: ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: v_mov_b32_e32 v1, s10 +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %load = load i32, ptr addrspace(4) %imp_arg_ptr @@ -54,17 +54,16 @@ define amdgpu_kernel void @preload_unused_arg_block_count_x(ptr addrspace(1) inr ; ; GFX90a-LABEL: preload_unused_arg_block_count_x: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x10 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB1_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB1_0: ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s10 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: v_mov_b32_e32 v1, s12 +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %load = load i32, ptr addrspace(4) %imp_arg_ptr @@ -90,7 +89,7 @@ define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) inreg %o ; ; GFX90a-LABEL: no_free_sgprs_block_count_x: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[14:15], s[8:9], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB2_0 ; GFX90a-NEXT: .p2align 8 @@ -100,7 +99,7 @@ define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) inreg %o ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[12:13] +; GFX90a-NEXT: global_store_dword v0, v1, s[14:15] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %load = load i32, ptr addrspace(4) %imp_arg_ptr @@ -181,7 +180,7 @@ define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) inr ; ; GFX90a-LABEL: incorrect_type_i64_block_count_x: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB5_0 ; GFX90a-NEXT: .p2align 8 @@ -191,7 +190,7 @@ define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) inr ; GFX90a-NEXT: v_mov_b32_e32 v2, 0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %load = load i64, ptr addrspace(4) %imp_arg_ptr @@ -217,7 +216,7 @@ define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) inr ; ; GFX90a-LABEL: incorrect_type_i16_block_count_x: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB6_0 ; GFX90a-NEXT: .p2align 8 @@ -227,7 +226,7 @@ define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) inr ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_short v0, v1, s[6:7] +; GFX90a-NEXT: global_store_short v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %load = load i16, ptr addrspace(4) %imp_arg_ptr @@ -252,16 +251,15 @@ define 
amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) inreg %out) #0 ; ; GFX90a-LABEL: preload_block_count_y: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB7_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB7_0: ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s9 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4 @@ -289,7 +287,7 @@ define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out) ; ; GFX90a-LABEL: random_incorrect_offset: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB8_0 ; GFX90a-NEXT: .p2align 8 @@ -300,7 +298,7 @@ define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out) ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 2 @@ -327,17 +325,16 @@ define amdgpu_kernel void @preload_block_count_z(ptr addrspace(1) inreg %out) #0 ; ; GFX90a-LABEL: preload_block_count_z: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90a-NEXT: 
s_load_dword s12, s[4:5], 0x10 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB9_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB9_0: ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s10 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: v_mov_b32_e32 v1, s12 +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8 @@ -366,19 +363,18 @@ define amdgpu_kernel void @preload_block_count_x_imparg_align_ptr_i8(ptr addrspa ; ; GFX90a-LABEL: preload_block_count_x_imparg_align_ptr_i8: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x10 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB10_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB10_0: -; GFX90a-NEXT: s_and_b32 s0, s8, 0xff -; GFX90a-NEXT: s_add_i32 s0, s10, s0 +; GFX90a-NEXT: s_and_b32 s0, s10, 0xff +; GFX90a-NEXT: s_add_i32 s0, s12, s0 ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %load = load i32, ptr addrspace(4) %imp_arg_ptr @@ -408,19 +404,18 @@ define amdgpu_kernel void @preload_block_count_xyz(ptr addrspace(1) inreg %out) ; ; GFX90a-LABEL: preload_block_count_xyz: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x10 ; 
GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB11_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB11_0: ; GFX90a-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-NEXT: v_mov_b32_e32 v0, s8 -; GFX90a-NEXT: v_mov_b32_e32 v1, s9 -; GFX90a-NEXT: v_mov_b32_e32 v2, s10 -; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-NEXT: v_mov_b32_e32 v2, s12 +; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 0 @@ -454,17 +449,17 @@ define amdgpu_kernel void @preload_workgroup_size_x(ptr addrspace(1) inreg %out) ; ; GFX90a-LABEL: preload_workgroup_size_x: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB12_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB12_0: -; GFX90a-NEXT: s_and_b32 s0, s11, 0xffff +; GFX90a-NEXT: s_and_b32 s0, s13, 0xffff ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12 @@ -492,17 +487,17 @@ define amdgpu_kernel void @preload_workgroup_size_y(ptr addrspace(1) inreg %out) ; ; GFX90a-LABEL: preload_workgroup_size_y: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 ; GFX90a-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB13_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB13_0: -; GFX90a-NEXT: s_lshr_b32 s0, s11, 16 +; GFX90a-NEXT: s_lshr_b32 s0, s13, 16 ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 14 @@ -531,18 +526,18 @@ define amdgpu_kernel void @preload_workgroup_size_z(ptr addrspace(1) inreg %out) ; ; GFX90a-LABEL: preload_workgroup_size_z: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x18 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX90a-NEXT: s_load_dword s14, s[4:5], 0x18 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB14_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB14_0: -; GFX90a-NEXT: s_and_b32 s0, s12, 0xffff +; GFX90a-NEXT: s_and_b32 s0, s14, 0xffff ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16 @@ -575,22 +570,22 @@ define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) inreg %ou ; ; GFX90a-LABEL: preload_workgroup_size_xyz: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x18 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX90a-NEXT: s_load_dword 
s14, s[4:5], 0x18 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB15_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB15_0: -; GFX90a-NEXT: s_lshr_b32 s0, s11, 16 -; GFX90a-NEXT: s_and_b32 s1, s11, 0xffff -; GFX90a-NEXT: s_and_b32 s2, s12, 0xffff +; GFX90a-NEXT: s_lshr_b32 s0, s13, 16 +; GFX90a-NEXT: s_and_b32 s1, s13, 0xffff +; GFX90a-NEXT: s_and_b32 s2, s14, 0xffff ; GFX90a-NEXT: v_mov_b32_e32 v3, 0 ; GFX90a-NEXT: v_mov_b32_e32 v0, s1 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 ; GFX90a-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12 @@ -628,18 +623,18 @@ define amdgpu_kernel void @preload_remainder_x(ptr addrspace(1) inreg %out) #0 { ; ; GFX90a-LABEL: preload_remainder_x: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x18 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX90a-NEXT: s_load_dword s14, s[4:5], 0x18 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB16_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB16_0: -; GFX90a-NEXT: s_lshr_b32 s0, s12, 16 +; GFX90a-NEXT: s_lshr_b32 s0, s14, 16 ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18 @@ -668,18 +663,16 @@ define amdgpu_kernel void @preloadremainder_y(ptr addrspace(1) inreg %out) #0 { ; ; GFX90a-LABEL: preloadremainder_y: ; GFX90a: ; %bb.1: -; 
GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 +; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB17_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB17_0: -; GFX90a-NEXT: s_and_b32 s0, s13, 0xffff +; GFX90a-NEXT: s_and_b32 s0, s15, 0xffff ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 20 @@ -708,18 +701,16 @@ define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) inreg %out) #0 { ; ; GFX90a-LABEL: preloadremainder_z: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 +; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB18_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB18_0: -; GFX90a-NEXT: s_lshr_b32 s0, s13, 16 +; GFX90a-NEXT: s_lshr_b32 s0, s15, 16 ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22 @@ -752,22 +743,20 @@ define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) inreg %out) #0 ; ; GFX90a-LABEL: preloadremainder_xyz: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 +; 
GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB19_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB19_0: -; GFX90a-NEXT: s_lshr_b32 s0, s13, 16 -; GFX90a-NEXT: s_lshr_b32 s1, s12, 16 -; GFX90a-NEXT: s_and_b32 s2, s13, 0xffff +; GFX90a-NEXT: s_lshr_b32 s0, s15, 16 +; GFX90a-NEXT: s_lshr_b32 s1, s14, 16 +; GFX90a-NEXT: s_and_b32 s2, s15, 0xffff ; GFX90a-NEXT: v_mov_b32_e32 v3, 0 ; GFX90a-NEXT: v_mov_b32_e32 v0, s1 ; GFX90a-NEXT: v_mov_b32_e32 v1, s2 ; GFX90a-NEXT: v_mov_b32_e32 v2, s0 -; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18 @@ -803,7 +792,7 @@ define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inr ; ; GFX90a-LABEL: no_free_sgprs_preloadremainder_z: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[14:15], s[8:9], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB20_0 ; GFX90a-NEXT: .p2align 8 @@ -814,7 +803,7 @@ define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inr ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_lshr_b32 s0, s0, 16 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[12:13] +; GFX90a-NEXT: global_store_dword v0, v1, s[14:15] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22 @@ -844,10 +833,7 @@ define amdgpu_kernel void @preload_block_max_user_sgprs(ptr addrspace(1) inreg % ; ; GFX90a-LABEL: preload_block_max_user_sgprs: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dwordx2 s[12:13], 
s[4:5], 0x18 -; GFX90a-NEXT: s_load_dword s14, s[4:5], 0x20 +; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB21_0 ; GFX90a-NEXT: .p2align 8 @@ -857,7 +843,7 @@ define amdgpu_kernel void @preload_block_max_user_sgprs(ptr addrspace(1) inreg % ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %load = load i32, ptr addrspace(4) %imp_arg_ptr @@ -887,21 +873,23 @@ define amdgpu_kernel void @preload_block_count_z_workgroup_size_z_remainder_z(pt ; ; GFX90a-LABEL: preload_block_count_z_workgroup_size_z_remainder_z: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX90a-NEXT: s_load_dword s14, s[4:5], 0x18 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB22_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB22_0: -; GFX90a-NEXT: s_lshr_b32 s0, s13, 16 -; GFX90a-NEXT: s_and_b32 s1, s12, 0xffff +; GFX90a-NEXT: s_load_dword s0, s[4:5], 0x1c +; GFX90a-NEXT: s_and_b32 s1, s14, 0xffff ; GFX90a-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-NEXT: v_mov_b32_e32 v0, s12 ; GFX90a-NEXT: v_mov_b32_e32 v1, s1 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_lshr_b32 s0, s0, 16 ; GFX90a-NEXT: v_mov_b32_e32 v2, s0 -; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep0 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8 diff 
--git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll index fe6378435a42e..7ae0c11dca279 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll @@ -21,17 +21,17 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) inreg %out, i8 inreg %arg0) ; ; GFX90a-LABEL: ptr1_i8: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB0_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB0_0: -; GFX90a-NEXT: s_and_b32 s0, s8, 0xff +; GFX90a-NEXT: s_and_b32 s0, s10, 0xff ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm %ext = zext i8 %arg0 to i32 store i32 %ext, ptr addrspace(1) %out @@ -56,17 +56,17 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) inreg %out, i8 zero ; ; GFX90a-LABEL: ptr1_i8_zext_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB1_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB1_0: -; GFX90a-NEXT: s_and_b32 s0, s8, 0xff +; GFX90a-NEXT: s_and_b32 s0, s10, 0xff ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm %ext = zext i8 %arg0 to i32 store i32 %ext, ptr addrspace(1) %out, align 4 @@ -91,17 +91,17 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) inreg 
%out, i16 ; ; GFX90a-LABEL: ptr1_i16_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB2_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB2_0: -; GFX90a-NEXT: s_and_b32 s0, s8, 0xffff +; GFX90a-NEXT: s_and_b32 s0, s10, 0xffff ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm %ext = zext i16 %arg0 to i32 store i32 %ext, ptr addrspace(1) %out, align 4 @@ -125,16 +125,16 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) inreg %out, i32 ; ; GFX90a-LABEL: ptr1_i32_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB3_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB3_0: ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: v_mov_b32_e32 v1, s10 +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm store i32 %arg0, ptr addrspace(1) %out ret void @@ -160,18 +160,17 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 inreg %arg0, ptr addrspa ; ; GFX90a-LABEL: i32_ptr1_i32_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x10 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX90a-NEXT: s_branch .LBB4_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB4_0: -; GFX90a-NEXT: s_add_i32 s0, s6, s10 +; GFX90a-NEXT: s_add_i32 s0, s8, s12 ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] +; GFX90a-NEXT: global_store_dword v0, v1, s[10:11] ; GFX90a-NEXT: s_endpgm %add = add i32 %arg0, %arg1 store i32 %add, ptr addrspace(1) %out @@ -198,19 +197,19 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) inreg %out, ; ; GFX90a-LABEL: ptr1_i16_i16_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB5_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB5_0: -; GFX90a-NEXT: s_lshr_b32 s0, s8, 16 -; GFX90a-NEXT: s_and_b32 s1, s8, 0xffff +; GFX90a-NEXT: s_lshr_b32 s0, s10, 16 +; GFX90a-NEXT: s_and_b32 s1, s10, 0xffff ; GFX90a-NEXT: s_add_i32 s0, s1, s0 ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm %ext = zext i16 %arg0 to i32 %ext1 = zext i16 %arg1 to i32 @@ -236,16 +235,16 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) inreg %out, <2 ; ; GFX90a-LABEL: ptr1_v2i8_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB6_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB6_0: ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-NEXT: global_store_short 
v0, v1, s[6:7] +; GFX90a-NEXT: v_mov_b32_e32 v1, s10 +; GFX90a-NEXT: global_store_short v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm store <2 x i8> %in, ptr addrspace(1) %out ret void @@ -274,7 +273,7 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) inreg %out, ptr ad ; ; GFX90a-LABEL: byref_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB7_0 ; GFX90a-NEXT: .p2align 8 @@ -285,9 +284,9 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) inreg %out, ptr ad ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 ; GFX90a-NEXT: v_mov_b32_e32 v2, s1 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_waitcnt vmcnt(0) -; GFX90a-NEXT: global_store_dword v0, v2, s[6:7] +; GFX90a-NEXT: global_store_dword v0, v2, s[8:9] ; GFX90a-NEXT: s_waitcnt vmcnt(0) ; GFX90a-NEXT: s_endpgm %in = load i32, ptr addrspace(4) %in.byref @@ -320,7 +319,7 @@ define amdgpu_kernel void @byref_staggered_preload_arg(ptr addrspace(1) inreg %o ; ; GFX90a-LABEL: byref_staggered_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB8_0 ; GFX90a-NEXT: .p2align 8 @@ -331,9 +330,9 @@ define amdgpu_kernel void @byref_staggered_preload_arg(ptr addrspace(1) inreg %o ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 ; GFX90a-NEXT: v_mov_b32_e32 v2, s1 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_waitcnt vmcnt(0) -; GFX90a-NEXT: global_store_dword v0, v2, s[6:7] +; GFX90a-NEXT: global_store_dword v0, v2, s[8:9] ; GFX90a-NEXT: s_waitcnt vmcnt(0) ; GFX90a-NEXT: s_endpgm %in = load i32, ptr addrspace(4) %in.byref @@ -370,26 
+369,26 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture inreg %out, <8 x ; ; GFX90a-LABEL: v8i32_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB9_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB9_0: -; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; GFX90a-NEXT: s_load_dwordx8 s[12:19], s[4:5], 0x20 ; GFX90a-NEXT: v_mov_b32_e32 v4, 0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: v_mov_b32_e32 v0, s16 +; GFX90a-NEXT: v_mov_b32_e32 v1, s17 +; GFX90a-NEXT: v_mov_b32_e32 v2, s18 +; GFX90a-NEXT: v_mov_b32_e32 v3, s19 +; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] offset:16 +; GFX90a-NEXT: s_nop 0 ; GFX90a-NEXT: v_mov_b32_e32 v0, s12 ; GFX90a-NEXT: v_mov_b32_e32 v1, s13 ; GFX90a-NEXT: v_mov_b32_e32 v2, s14 ; GFX90a-NEXT: v_mov_b32_e32 v3, s15 -; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 -; GFX90a-NEXT: s_nop 0 -; GFX90a-NEXT: v_mov_b32_e32 v0, s8 -; GFX90a-NEXT: v_mov_b32_e32 v1, s9 -; GFX90a-NEXT: v_mov_b32_e32 v2, s10 -; GFX90a-NEXT: v_mov_b32_e32 v3, s11 -; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] ; GFX90a-NEXT: s_endpgm store <8 x i32> %in, ptr addrspace(1) %out, align 4 ret void @@ -414,18 +413,17 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture inreg %o ; ; GFX90a-LABEL: v3i16_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB10_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB10_0: ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s9 -; GFX90a-NEXT: global_store_short v0, v1, s[6:7] offset:4 -; GFX90a-NEXT: 
v_mov_b32_e32 v1, s8 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-NEXT: global_store_short v0, v1, s[8:9] offset:4 +; GFX90a-NEXT: v_mov_b32_e32 v1, s10 +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm store <3 x i16> %in, ptr addrspace(1) %out, align 4 ret void @@ -451,19 +449,17 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture inreg %o ; ; GFX90a-LABEL: v3i32_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 +; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB11_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB11_0: -; GFX90a-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-NEXT: v_mov_b32_e32 v1, s11 -; GFX90a-NEXT: v_mov_b32_e32 v2, s12 +; GFX90a-NEXT: v_mov_b32_e32 v0, s12 +; GFX90a-NEXT: v_mov_b32_e32 v1, s13 +; GFX90a-NEXT: v_mov_b32_e32 v2, s14 ; GFX90a-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] ; GFX90a-NEXT: s_endpgm store <3 x i32> %in, ptr addrspace(1) %out, align 4 ret void @@ -489,19 +485,17 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture inreg %o ; ; GFX90a-LABEL: v3f32_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 +; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB12_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB12_0: ; GFX90a-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-NEXT: v_mov_b32_e32 v1, s11 -; GFX90a-NEXT: v_mov_b32_e32 v2, s12 -; GFX90a-NEXT: 
global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-NEXT: v_mov_b32_e32 v0, s12 +; GFX90a-NEXT: v_mov_b32_e32 v1, s13 +; GFX90a-NEXT: v_mov_b32_e32 v2, s14 +; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] ; GFX90a-NEXT: s_endpgm store <3 x float> %in, ptr addrspace(1) %out, align 4 ret void @@ -533,25 +527,24 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture inreg %ou ; ; GFX90a-LABEL: v5i8_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB13_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB13_0: -; GFX90a-NEXT: s_lshr_b32 s1, s8, 24 +; GFX90a-NEXT: s_lshr_b32 s1, s10, 24 ; GFX90a-NEXT: s_lshl_b32 s1, s1, 8 -; GFX90a-NEXT: s_bfe_u32 s2, s8, 0x80010 +; GFX90a-NEXT: s_bfe_u32 s2, s10, 0x80010 ; GFX90a-NEXT: s_or_b32 s1, s2, s1 -; GFX90a-NEXT: s_and_b32 s0, s8, 0xffff +; GFX90a-NEXT: s_and_b32 s0, s10, 0xffff ; GFX90a-NEXT: s_lshl_b32 s1, s1, 16 ; GFX90a-NEXT: s_or_b32 s0, s0, s1 ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s9 -; GFX90a-NEXT: global_store_byte v0, v1, s[6:7] offset:4 +; GFX90a-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-NEXT: global_store_byte v0, v1, s[8:9] offset:4 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm store <5 x i8> %in, ptr addrspace(1) %out, align 4 ret void @@ -587,29 +580,29 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture inreg %out, <5 x ; ; GFX90a-LABEL: v5f64_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB14_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB14_0: ; GFX90a-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x60 -; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; GFX90a-NEXT: s_load_dwordx8 s[12:19], s[4:5], 0x40 ; GFX90a-NEXT: v_mov_b32_e32 v4, 0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; GFX90a-NEXT: v_mov_b32_e32 v0, s16 +; GFX90a-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9] offset:32 +; GFX90a-NEXT: v_mov_b32_e32 v1, s17 +; GFX90a-NEXT: v_mov_b32_e32 v2, s18 +; GFX90a-NEXT: v_mov_b32_e32 v3, s19 +; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] offset:16 +; GFX90a-NEXT: s_nop 0 ; GFX90a-NEXT: v_mov_b32_e32 v0, s12 -; GFX90a-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] offset:32 ; GFX90a-NEXT: v_mov_b32_e32 v1, s13 ; GFX90a-NEXT: v_mov_b32_e32 v2, s14 ; GFX90a-NEXT: v_mov_b32_e32 v3, s15 -; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 -; GFX90a-NEXT: s_nop 0 -; GFX90a-NEXT: v_mov_b32_e32 v0, s8 -; GFX90a-NEXT: v_mov_b32_e32 v1, s9 -; GFX90a-NEXT: v_mov_b32_e32 v2, s10 -; GFX90a-NEXT: v_mov_b32_e32 v3, s11 -; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] ; GFX90a-NEXT: s_endpgm store <5 x double> %in, ptr addrspace(1) %out, align 8 ret void @@ -647,31 +640,30 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) inreg %out, <8 x i8 ; ; GFX90a-LABEL: v8i8_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB15_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB15_0: -; GFX90a-NEXT: s_lshr_b32 s1, s9, 24 +; GFX90a-NEXT: s_lshr_b32 s1, s11, 24 ; GFX90a-NEXT: s_lshl_b32 s1, s1, 8 -; GFX90a-NEXT: s_bfe_u32 s2, s9, 0x80010 +; GFX90a-NEXT: s_bfe_u32 s2, s11, 0x80010 ; GFX90a-NEXT: s_or_b32 s1, s2, s1 -; GFX90a-NEXT: s_lshr_b32 s2, s8, 24 +; GFX90a-NEXT: s_lshr_b32 s2, 
s10, 24 ; GFX90a-NEXT: s_lshl_b32 s2, s2, 8 -; GFX90a-NEXT: s_bfe_u32 s3, s8, 0x80010 -; GFX90a-NEXT: s_and_b32 s0, s9, 0xffff +; GFX90a-NEXT: s_bfe_u32 s3, s10, 0x80010 +; GFX90a-NEXT: s_and_b32 s0, s11, 0xffff ; GFX90a-NEXT: s_lshl_b32 s1, s1, 16 ; GFX90a-NEXT: s_or_b32 s2, s3, s2 ; GFX90a-NEXT: s_or_b32 s0, s0, s1 -; GFX90a-NEXT: s_and_b32 s1, s8, 0xffff +; GFX90a-NEXT: s_and_b32 s1, s10, 0xffff ; GFX90a-NEXT: s_lshl_b32 s2, s2, 16 ; GFX90a-NEXT: s_or_b32 s1, s1, s2 ; GFX90a-NEXT: v_mov_b32_e32 v0, s1 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 ; GFX90a-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90a-NEXT: s_endpgm store <8 x i8> %in, ptr addrspace(1) %out ret void @@ -694,16 +686,15 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) inreg %out, i ; ; GFX90a-LABEL: i64_kernel_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB16_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB16_0: ; GFX90a-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] -; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1] +; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90a-NEXT: s_endpgm store i64 %a, ptr addrspace(1) %out, align 8 ret void @@ -726,16 +717,15 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) inreg %out, d ; ; GFX90a-LABEL: f64_kernel_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB17_0 ; GFX90a-NEXT: .p2align 
8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB17_0: ; GFX90a-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] -; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1] +; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90a-NEXT: s_endpgm store double %in, ptr addrspace(1) %out ret void @@ -758,16 +748,16 @@ define amdgpu_kernel void @half_kernel_preload_arg(ptr addrspace(1) inreg %out, ; ; GFX90a-LABEL: half_kernel_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB18_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB18_0: ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-NEXT: global_store_short v0, v1, s[6:7] +; GFX90a-NEXT: v_mov_b32_e32 v1, s10 +; GFX90a-NEXT: global_store_short v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm store half %in, ptr addrspace(1) %out ret void @@ -790,16 +780,16 @@ define amdgpu_kernel void @bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out ; ; GFX90a-LABEL: bfloat_kernel_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB19_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB19_0: ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-NEXT: global_store_short v0, v1, s[6:7] +; GFX90a-NEXT: v_mov_b32_e32 v1, s10 +; GFX90a-NEXT: global_store_short v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm store bfloat %in, ptr addrspace(1) %out ret void @@ -822,16 +812,16 @@ 
define amdgpu_kernel void @v2bfloat_kernel_preload_arg(ptr addrspace(1) inreg %o ; ; GFX90a-LABEL: v2bfloat_kernel_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB20_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB20_0: ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: v_mov_b32_e32 v1, s10 +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm store <2 x bfloat> %in, ptr addrspace(1) %out ret void @@ -856,18 +846,17 @@ define amdgpu_kernel void @v3bfloat_kernel_preload_arg(ptr addrspace(1) inreg %o ; ; GFX90a-LABEL: v3bfloat_kernel_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB21_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB21_0: ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s9 -; GFX90a-NEXT: global_store_short v0, v1, s[6:7] offset:4 -; GFX90a-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-NEXT: global_store_short v0, v1, s[8:9] offset:4 +; GFX90a-NEXT: v_mov_b32_e32 v1, s10 +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm store <3 x bfloat> %in, ptr addrspace(1) %out ret void @@ -893,19 +882,17 @@ define amdgpu_kernel void @v6bfloat_kernel_preload_arg(ptr addrspace(1) inreg %o ; ; GFX90a-LABEL: v6bfloat_kernel_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; 
GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 +; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB22_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB22_0: -; GFX90a-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-NEXT: v_mov_b32_e32 v1, s11 -; GFX90a-NEXT: v_mov_b32_e32 v2, s12 +; GFX90a-NEXT: v_mov_b32_e32 v0, s12 +; GFX90a-NEXT: v_mov_b32_e32 v1, s13 +; GFX90a-NEXT: v_mov_b32_e32 v2, s14 ; GFX90a-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] ; GFX90a-NEXT: s_endpgm store <6 x bfloat> %in, ptr addrspace(1) %out ret void @@ -934,24 +921,24 @@ define amdgpu_kernel void @half_v7bfloat_kernel_preload_arg(ptr addrspace(1) inr ; ; GFX90a-LABEL: half_v7bfloat_kernel_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB23_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB23_0: -; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20 +; GFX90a-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x20 ; GFX90a-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-NEXT: v_mov_b32_e32 v0, s8 -; GFX90a-NEXT: global_store_short v3, v0, s[6:7] -; GFX90a-NEXT: v_mov_b32_e32 v0, s13 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: global_store_short v3, v0, s[0:1] offset:12 -; GFX90a-NEXT: v_mov_b32_e32 v2, s12 ; GFX90a-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-NEXT: v_mov_b32_e32 v1, s11 -; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX90a-NEXT: global_store_short v3, v0, s[8:9] +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: v_mov_b32_e32 v0, s3 +; GFX90a-NEXT: global_store_short v3, v0, s[6:7] offset:12 +; GFX90a-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX90a-NEXT: v_mov_b32_e32 v1, s1 +; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX90a-NEXT: s_endpgm store half %in, ptr addrspace(1) %out store <7 x bfloat> %in2, ptr addrspace(1) %out2 @@ -976,17 +963,17 @@ define amdgpu_kernel void @i1_kernel_preload_arg(ptr addrspace(1) inreg %out, i1 ; ; GFX90a-LABEL: i1_kernel_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB24_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB24_0: -; GFX90a-NEXT: s_and_b32 s0, s8, 1 +; GFX90a-NEXT: s_and_b32 s0, s10, 1 ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_byte v0, v1, s[6:7] +; GFX90a-NEXT: global_store_byte v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm store i1 %in, ptr addrspace(1) %out ret void @@ -1013,20 +1000,18 @@ define amdgpu_kernel void @fp128_kernel_preload_arg(ptr addrspace(1) inreg %out, ; ; GFX90a-LABEL: fp128_kernel_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 +; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB25_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB25_0: ; GFX90a-NEXT: v_mov_b32_e32 v4, 0 -; GFX90a-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-NEXT: v_mov_b32_e32 v1, s11 -; GFX90a-NEXT: v_mov_b32_e32 v2, s12 -; GFX90a-NEXT: v_mov_b32_e32 v3, s13 -; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX90a-NEXT: v_mov_b32_e32 v0, s12 +; GFX90a-NEXT: v_mov_b32_e32 v1, s13 +; GFX90a-NEXT: v_mov_b32_e32 v2, s14 +; GFX90a-NEXT: v_mov_b32_e32 v3, s15 +; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], 
s[8:9] ; GFX90a-NEXT: s_endpgm store fp128 %in, ptr addrspace(1) %out ret void @@ -1059,26 +1044,25 @@ define amdgpu_kernel void @v7i8_kernel_preload_arg(ptr addrspace(1) inreg %out, ; ; GFX90a-LABEL: v7i8_kernel_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB26_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB26_0: -; GFX90a-NEXT: s_lshr_b32 s1, s8, 24 +; GFX90a-NEXT: s_lshr_b32 s1, s10, 24 ; GFX90a-NEXT: s_lshl_b32 s1, s1, 8 -; GFX90a-NEXT: s_bfe_u32 s2, s8, 0x80010 +; GFX90a-NEXT: s_bfe_u32 s2, s10, 0x80010 ; GFX90a-NEXT: s_or_b32 s1, s2, s1 -; GFX90a-NEXT: s_and_b32 s0, s8, 0xffff +; GFX90a-NEXT: s_and_b32 s0, s10, 0xffff ; GFX90a-NEXT: s_lshl_b32 s1, s1, 16 ; GFX90a-NEXT: s_or_b32 s0, s0, s1 ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s9 -; GFX90a-NEXT: global_store_byte_d16_hi v0, v1, s[6:7] offset:6 -; GFX90a-NEXT: global_store_short v0, v1, s[6:7] offset:4 +; GFX90a-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-NEXT: global_store_byte_d16_hi v0, v1, s[8:9] offset:6 +; GFX90a-NEXT: global_store_short v0, v1, s[8:9] offset:4 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm store <7 x i8> %in, ptr addrspace(1) %out ret void @@ -1106,21 +1090,19 @@ define amdgpu_kernel void @v7half_kernel_preload_arg(ptr addrspace(1) inreg %out ; ; GFX90a-LABEL: v7half_kernel_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 +; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB27_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; 
GFX90a-NEXT: .LBB27_0: ; GFX90a-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-NEXT: v_mov_b32_e32 v0, s13 -; GFX90a-NEXT: global_store_short v3, v0, s[6:7] offset:12 -; GFX90a-NEXT: v_mov_b32_e32 v2, s12 -; GFX90a-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-NEXT: v_mov_b32_e32 v1, s11 -; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-NEXT: v_mov_b32_e32 v0, s15 +; GFX90a-NEXT: global_store_short v3, v0, s[8:9] offset:12 +; GFX90a-NEXT: v_mov_b32_e32 v2, s14 +; GFX90a-NEXT: v_mov_b32_e32 v0, s12 +; GFX90a-NEXT: v_mov_b32_e32 v1, s13 +; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] ; GFX90a-NEXT: s_endpgm store <7 x half> %in, ptr addrspace(1) %out ret void @@ -1145,18 +1127,18 @@ define amdgpu_kernel void @i16_i32_kernel_preload_arg(ptr addrspace(1) inreg %ou ; ; GFX90a-LABEL: i16_i32_kernel_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB28_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB28_0: ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-NEXT: global_store_short v0, v1, s[6:7] -; GFX90a-NEXT: v_mov_b32_e32 v1, s9 -; GFX90a-NEXT: global_store_dword v0, v1, s[10:11] +; GFX90a-NEXT: v_mov_b32_e32 v1, s10 +; GFX90a-NEXT: global_store_short v0, v1, s[8:9] +; GFX90a-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-NEXT: global_store_dword v0, v1, s[12:13] ; GFX90a-NEXT: s_endpgm store i16 %in, ptr addrspace(1) %out store i32 %in2, ptr addrspace(1) %out2 @@ -1184,22 +1166,22 @@ define amdgpu_kernel void @i16_v3i32_kernel_preload_arg(ptr addrspace(1) inreg % ; ; GFX90a-LABEL: i16_v3i32_kernel_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx8 s[8:15], 
s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB29_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB29_0: -; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20 +; GFX90a-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX90a-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-NEXT: v_mov_b32_e32 v4, s8 -; GFX90a-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-NEXT: v_mov_b32_e32 v1, s11 -; GFX90a-NEXT: v_mov_b32_e32 v2, s12 -; GFX90a-NEXT: global_store_short v3, v4, s[6:7] +; GFX90a-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x20 +; GFX90a-NEXT: v_mov_b32_e32 v4, s10 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX90a-NEXT: v_mov_b32_e32 v0, s0 +; GFX90a-NEXT: v_mov_b32_e32 v1, s1 +; GFX90a-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-NEXT: global_store_short v3, v4, s[8:9] +; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5] ; GFX90a-NEXT: s_endpgm store i16 %in, ptr addrspace(1) %out store <3 x i32> %in2, ptr addrspace(1) %out2 @@ -1224,17 +1206,17 @@ define amdgpu_kernel void @i16_i16_kernel_preload_arg(ptr addrspace(1) inreg %ou ; ; GFX90a-LABEL: i16_i16_kernel_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB30_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB30_0: ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-NEXT: global_store_short v0, v1, s[6:7] -; GFX90a-NEXT: global_store_short_d16_hi v0, v1, s[10:11] +; GFX90a-NEXT: v_mov_b32_e32 v1, s10 +; GFX90a-NEXT: global_store_short v0, v1, s[8:9] +; GFX90a-NEXT: global_store_short_d16_hi v0, v1, s[12:13] ; GFX90a-NEXT: s_endpgm store i16 %in, ptr addrspace(1) %out store i16 %in2, ptr addrspace(1) %out2 @@ -1264,22 +1246,22 @@ define amdgpu_kernel 
void @i16_v2i8_kernel_preload_arg(ptr addrspace(1) inreg %o ; ; GFX90a-LABEL: i16_v2i8_kernel_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB31_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB31_0: -; GFX90a-NEXT: s_lshr_b32 s0, s8, 24 +; GFX90a-NEXT: s_lshr_b32 s0, s10, 24 ; GFX90a-NEXT: s_lshl_b32 s0, s0, 8 -; GFX90a-NEXT: s_bfe_u32 s1, s8, 0x80010 +; GFX90a-NEXT: s_bfe_u32 s1, s10, 0x80010 ; GFX90a-NEXT: s_or_b32 s0, s1, s0 ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-NEXT: global_store_short v0, v1, s[6:7] +; GFX90a-NEXT: v_mov_b32_e32 v1, s10 +; GFX90a-NEXT: global_store_short v0, v1, s[8:9] ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_short v0, v1, s[10:11] +; GFX90a-NEXT: global_store_short v0, v1, s[12:13] ; GFX90a-NEXT: s_endpgm store i16 %in, ptr addrspace(1) %out store <2 x i8> %in2, ptr addrspace(1) %out2 @@ -1308,7 +1290,7 @@ define amdgpu_kernel void @i32_ptr1_i32_staggered_preload_arg(i32 inreg %arg0, p ; ; GFX90a-LABEL: i32_ptr1_i32_staggered_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB32_0 ; GFX90a-NEXT: .p2align 8 @@ -1318,7 +1300,7 @@ define amdgpu_kernel void @i32_ptr1_i32_staggered_preload_arg(i32 inreg %arg0, p ; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_add_i32 s2, s6, s2 +; GFX90a-NEXT: s_add_i32 s2, s8, s2 ; GFX90a-NEXT: v_mov_b32_e32 v1, s2 ; GFX90a-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-NEXT: s_endpgm @@ -1345,17 +1327,16 @@ define amdgpu_kernel void 
@ptr1_i8_trailing_unused(ptr addrspace(1) inreg %out, ; ; GFX90a-LABEL: ptr1_i8_trailing_unused: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB33_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB33_0: -; GFX90a-NEXT: s_and_b32 s0, s8, 0xff +; GFX90a-NEXT: s_and_b32 s0, s10, 0xff ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm %ext = zext i8 %arg0 to i32 store i32 %ext, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll index 5474338514522..8f25e6519588b 100644 --- a/llvm/test/CodeGen/AMDGPU/sad.ll +++ b/llvm/test/CodeGen/AMDGPU/sad.ll @@ -6,6 +6,9 @@ define amdgpu_kernel void @v_sad_u32_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s2 @@ -33,9 +36,12 @@ define amdgpu_kernel void @v_sad_u32_constant_pat1(ptr addrspace(1) %out, i32 %a ; GCN-NEXT: s_load_dword s2, s[8:9], 0x2 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GCN-NEXT: v_mov_b32_e32 v0, 0x5a +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_sad_u32 v2, s2, v0, 20 ; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm @@ -57,6 +63,9 @@ define amdgpu_kernel void @v_sad_u32_pat2(ptr 
addrspace(1) %out, i32 %a, i32 %b, ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s2 @@ -79,12 +88,14 @@ define amdgpu_kernel void @v_sad_u32_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: v_sad_u32_multi_use_sub_pat1: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b64 s[18:19], s[2:3] -; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] +; GCN-NEXT: s_mov_b64 s[22:23], s[2:3] +; GCN-NEXT: s_mov_b64 s[20:21], s[0:1] ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GCN-NEXT: s_add_u32 s16, s16, s15 -; GCN-NEXT: s_addc_u32 s17, s17, 0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_add_u32 s20, s20, s17 +; GCN-NEXT: s_addc_u32 s21, s21, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_min_u32 s3, s0, s1 ; GCN-NEXT: s_max_u32 s0, s0, s1 @@ -92,8 +103,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: s_add_i32 s0, s0, s2 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen +; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: flat_store_dword v[0:1], v2 @@ -115,19 +127,22 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: v_sad_u32_multi_use_add_pat1: ; GCN: ; %bb.0: -; GCN-NEXT: 
s_mov_b64 s[18:19], s[2:3] -; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] +; GCN-NEXT: s_mov_b64 s[22:23], s[2:3] +; GCN-NEXT: s_mov_b64 s[20:21], s[0:1] ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GCN-NEXT: s_add_u32 s16, s16, s15 -; GCN-NEXT: s_addc_u32 s17, s17, 0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_add_u32 s20, s20, s17 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NEXT: v_mov_b32_e32 v3, s2 ; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_addc_u32 s21, s21, 0 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_sad_u32 v2, s0, v2, v3 -; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen +; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm @@ -147,21 +162,24 @@ define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(ptr addrspace(1) %out, i define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: v_sad_u32_multi_use_max_pat1: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b64 s[18:19], s[2:3] -; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] +; GCN-NEXT: s_mov_b64 s[22:23], s[2:3] +; GCN-NEXT: s_mov_b64 s[20:21], s[0:1] ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GCN-NEXT: s_add_u32 s16, s16, s15 -; GCN-NEXT: s_addc_u32 s17, s17, 0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_add_u32 s20, s20, s17 +; GCN-NEXT: s_addc_u32 s21, s21, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_max_u32 s3, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NEXT: v_sad_u32 v3, s0, v0, v1 -; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen +; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen ; 
GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: flat_store_dword v[0:1], v3 ; GCN-NEXT: s_endpgm @@ -182,21 +200,24 @@ define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: v_sad_u32_multi_use_min_pat1: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b64 s[18:19], s[2:3] -; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] +; GCN-NEXT: s_mov_b64 s[22:23], s[2:3] +; GCN-NEXT: s_mov_b64 s[20:21], s[0:1] ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GCN-NEXT: s_add_u32 s16, s16, s15 -; GCN-NEXT: s_addc_u32 s17, s17, 0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_add_u32 s20, s20, s17 +; GCN-NEXT: s_addc_u32 s21, s21, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_min_u32 s3, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NEXT: v_sad_u32 v3, s0, v0, v1 -; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen +; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: flat_store_dword v[0:1], v3 ; GCN-NEXT: s_endpgm @@ -218,21 +239,24 @@ define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: v_sad_u32_multi_use_sub_pat2: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b64 s[18:19], s[2:3] -; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] +; GCN-NEXT: s_mov_b64 s[22:23], s[2:3] +; GCN-NEXT: s_mov_b64 s[20:21], s[0:1] ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; 
GCN-NEXT: s_add_u32 s16, s16, s15 -; GCN-NEXT: s_addc_u32 s17, s17, 0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_add_u32 s20, s20, s17 +; GCN-NEXT: s_addc_u32 s21, s21, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s3, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NEXT: v_sad_u32 v3, s0, v0, v1 -; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen +; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: flat_store_dword v[0:1], v3 ; GCN-NEXT: s_endpgm @@ -251,12 +275,14 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: v_sad_u32_multi_use_select_pat2: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b64 s[18:19], s[2:3] -; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] +; GCN-NEXT: s_mov_b64 s[22:23], s[2:3] +; GCN-NEXT: s_mov_b64 s[20:21], s[0:1] ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GCN-NEXT: s_add_u32 s16, s16, s15 -; GCN-NEXT: s_addc_u32 s17, s17, 0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_add_u32 s20, s20, s17 +; GCN-NEXT: s_addc_u32 s21, s21, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_min_u32 s3, s0, s1 ; GCN-NEXT: s_max_u32 s0, s0, s1 @@ -264,8 +290,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: s_add_i32 s0, s0, s2 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen +; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen ; 
GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: flat_store_dword v[0:1], v2 @@ -285,6 +312,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out define amdgpu_kernel void @v_sad_u32_vector_pat1(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; GCN-LABEL: v_sad_u32_vector_pat1: ; GCN: ; %bb.0: +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x4 ; GCN-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0xc ; GCN-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 @@ -321,6 +351,9 @@ define amdgpu_kernel void @v_sad_u32_vector_pat1(ptr addrspace(1) %out, <4 x i32 define amdgpu_kernel void @v_sad_u32_vector_pat2(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; GCN-LABEL: v_sad_u32_vector_pat2: ; GCN: ; %bb.0: +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x4 ; GCN-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0xc ; GCN-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 @@ -358,6 +391,8 @@ define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16 ; GCN-NEXT: s_load_dword s4, s[8:9], 0x2 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2 ; GCN-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NEXT: s_lshr_b32 s0, s0, 16 @@ -365,6 +400,7 @@ define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16 ; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: v_sad_u32 v2, s4, v1, v0 ; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: flat_store_short v[0:1], v2 ; GCN-NEXT: s_endpgm @@ -384,6 +420,9 @@ define amdgpu_kernel void 
@v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16 define amdgpu_kernel void @v_sad_u32_i16_pat2(ptr addrspace(1) %out) { ; GCN-LABEL: v_sad_u32_i16_pat2: ; GCN: ; %bb.0: +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: flat_load_ushort v0, v[0:1] glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 @@ -416,6 +455,9 @@ define amdgpu_kernel void @v_sad_u32_i8_pat1(ptr addrspace(1) %out, i8 %a, i8 %b ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s2, s[8:9], 0x2 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s3, s2, 0xff ; GCN-NEXT: s_bfe_u32 s4, s2, 0x80008 @@ -443,6 +485,9 @@ define amdgpu_kernel void @v_sad_u32_i8_pat1(ptr addrspace(1) %out, i8 %a, i8 %b define amdgpu_kernel void @v_sad_u32_i8_pat2(ptr addrspace(1) %out) { ; GCN-LABEL: v_sad_u32_i8_pat2: ; GCN: ; %bb.0: +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: flat_load_ubyte v0, v[0:1] glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 @@ -475,6 +520,9 @@ define amdgpu_kernel void @s_sad_u32_i8_pat2(ptr addrspace(1) %out, i8 zeroext % ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s2, s[8:9], 0x2 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s3, s2, 0xff ; GCN-NEXT: s_bfe_u32 s4, s2, 0x80008 @@ -502,6 +550,9 @@ define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(ptr addrspace(1) % ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GCN-NEXT: 
s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_max_u32 s6, s0, s1 ; GCN-NEXT: s_cmp_le_u32 s0, s1 @@ -531,6 +582,9 @@ define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat2(ptr addrspace(1) % ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s3, s0, s3 ; GCN-NEXT: s_sub_i32 s6, s1, s0 diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll index 884ba3fc34dff..29448ab2d822e 100644 --- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll @@ -9,6 +9,8 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 { ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v5, s3 ; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 @@ -24,6 +26,8 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 { ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX906-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX906-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX906-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v5, s3 ; GFX906-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 @@ -39,6 +43,8 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 { ; GFX908: ; %bb.0: ; %entry ; GFX908-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX908-NEXT: 
v_lshlrev_b32_e32 v4, 4, v0 +; GFX908-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX908-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, s3 ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 @@ -55,6 +61,8 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 { ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v4 +; GFX90A-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, s3 ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 @@ -88,6 +96,8 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0 ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v5, s3 ; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 @@ -103,6 +113,8 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0 ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX906-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX906-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX906-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v5, s3 ; GFX906-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 @@ -118,6 +130,8 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0 ; GFX908: ; %bb.0: ; %entry ; GFX908-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX908-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX908-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, s3 ; 
GFX908-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 @@ -134,6 +148,8 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0 ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v4 +; GFX90A-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, s3 ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 diff --git a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll index 0ad10437299f4..90dfd5a21d107 100644 --- a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll @@ -20,179 +20,183 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: ; def s[2:3] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[4:7] -; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_load_dword s0, s[8:9], 0x8 ; CHECK-NEXT: v_writelane_b32 v22, s2, 0 ; CHECK-NEXT: v_writelane_b32 v22, s3, 1 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[48:51] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[4:11] +; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_writelane_b32 v22, s4, 2 ; CHECK-NEXT: v_writelane_b32 v22, s5, 3 ; CHECK-NEXT: v_writelane_b32 v22, s6, 4 -; CHECK-NEXT: s_load_dword s0, s[8:9], 0x8 ; CHECK-NEXT: v_writelane_b32 v22, s7, 5 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[4:11] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v22, s4, 6 -; CHECK-NEXT: v_writelane_b32 v22, s5, 7 -; CHECK-NEXT: v_writelane_b32 v22, s6, 8 -; CHECK-NEXT: v_writelane_b32 v22, s7, 9 -; CHECK-NEXT: v_writelane_b32 v22, s8, 10 -; CHECK-NEXT: v_writelane_b32 v22, s9, 11 -; CHECK-NEXT: v_writelane_b32 v22, s10, 12 -; CHECK-NEXT: 
v_writelane_b32 v22, s11, 13 +; CHECK-NEXT: v_writelane_b32 v22, s8, 6 +; CHECK-NEXT: v_writelane_b32 v22, s9, 7 +; CHECK-NEXT: v_writelane_b32 v22, s10, 8 +; CHECK-NEXT: v_writelane_b32 v22, s11, 9 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[4:19] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v22, s4, 14 -; CHECK-NEXT: v_writelane_b32 v22, s5, 15 -; CHECK-NEXT: v_writelane_b32 v22, s6, 16 -; CHECK-NEXT: v_writelane_b32 v22, s7, 17 -; CHECK-NEXT: v_writelane_b32 v22, s8, 18 -; CHECK-NEXT: v_writelane_b32 v22, s9, 19 -; CHECK-NEXT: v_writelane_b32 v22, s10, 20 -; CHECK-NEXT: v_writelane_b32 v22, s11, 21 -; CHECK-NEXT: v_writelane_b32 v22, s12, 22 -; CHECK-NEXT: v_writelane_b32 v22, s13, 23 -; CHECK-NEXT: v_writelane_b32 v22, s14, 24 -; CHECK-NEXT: v_writelane_b32 v22, s15, 25 -; CHECK-NEXT: v_writelane_b32 v22, s16, 26 -; CHECK-NEXT: v_writelane_b32 v22, s17, 27 -; CHECK-NEXT: v_writelane_b32 v22, s18, 28 -; CHECK-NEXT: v_writelane_b32 v22, s19, 29 +; CHECK-NEXT: v_writelane_b32 v22, s4, 10 +; CHECK-NEXT: v_writelane_b32 v22, s5, 11 +; CHECK-NEXT: v_writelane_b32 v22, s6, 12 +; CHECK-NEXT: v_writelane_b32 v22, s7, 13 +; CHECK-NEXT: v_writelane_b32 v22, s8, 14 +; CHECK-NEXT: v_writelane_b32 v22, s9, 15 +; CHECK-NEXT: v_writelane_b32 v22, s10, 16 +; CHECK-NEXT: v_writelane_b32 v22, s11, 17 +; CHECK-NEXT: v_writelane_b32 v22, s12, 18 +; CHECK-NEXT: v_writelane_b32 v22, s13, 19 +; CHECK-NEXT: v_writelane_b32 v22, s14, 20 +; CHECK-NEXT: v_writelane_b32 v22, s15, 21 +; CHECK-NEXT: v_writelane_b32 v22, s16, 22 +; CHECK-NEXT: v_writelane_b32 v22, s17, 23 +; CHECK-NEXT: v_writelane_b32 v22, s18, 24 +; CHECK-NEXT: v_writelane_b32 v22, s19, 25 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[42:43] +; CHECK-NEXT: ; def s[38:39] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[52:55] +; CHECK-NEXT: ; def s[44:47] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[4:11] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: 
v_writelane_b32 v22, s4, 30 -; CHECK-NEXT: v_writelane_b32 v22, s5, 31 -; CHECK-NEXT: v_writelane_b32 v22, s6, 32 -; CHECK-NEXT: v_writelane_b32 v22, s7, 33 -; CHECK-NEXT: v_writelane_b32 v22, s8, 34 -; CHECK-NEXT: v_writelane_b32 v22, s9, 35 -; CHECK-NEXT: v_writelane_b32 v22, s10, 36 -; CHECK-NEXT: v_writelane_b32 v22, s11, 37 +; CHECK-NEXT: v_writelane_b32 v22, s4, 26 +; CHECK-NEXT: v_writelane_b32 v22, s5, 27 +; CHECK-NEXT: v_writelane_b32 v22, s6, 28 +; CHECK-NEXT: v_writelane_b32 v22, s7, 29 +; CHECK-NEXT: v_writelane_b32 v22, s8, 30 +; CHECK-NEXT: v_writelane_b32 v22, s9, 31 +; CHECK-NEXT: v_writelane_b32 v22, s10, 32 +; CHECK-NEXT: v_writelane_b32 v22, s11, 33 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[16:31] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[40:41] +; CHECK-NEXT: ; def s[36:37] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[36:39] +; CHECK-NEXT: ; def s[40:43] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[44:51] +; CHECK-NEXT: ; def s[0:7] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v22, s0, 34 +; CHECK-NEXT: v_writelane_b32 v22, s1, 35 +; CHECK-NEXT: v_writelane_b32 v22, s2, 36 +; CHECK-NEXT: v_writelane_b32 v22, s3, 37 +; CHECK-NEXT: v_writelane_b32 v22, s4, 38 +; CHECK-NEXT: v_writelane_b32 v22, s5, 39 +; CHECK-NEXT: v_writelane_b32 v22, s6, 40 +; CHECK-NEXT: v_writelane_b32 v22, s7, 41 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v22, s0, 38 -; CHECK-NEXT: v_writelane_b32 v22, s1, 39 -; CHECK-NEXT: v_writelane_b32 v22, s2, 40 -; CHECK-NEXT: v_writelane_b32 v22, s3, 41 -; CHECK-NEXT: v_writelane_b32 v22, s4, 42 -; CHECK-NEXT: v_writelane_b32 v22, s5, 43 -; CHECK-NEXT: v_writelane_b32 v22, s6, 44 -; CHECK-NEXT: v_writelane_b32 v22, s7, 45 -; CHECK-NEXT: v_writelane_b32 v22, s8, 46 -; CHECK-NEXT: v_writelane_b32 v22, 
s9, 47 -; CHECK-NEXT: v_writelane_b32 v22, s10, 48 -; CHECK-NEXT: v_writelane_b32 v22, s11, 49 -; CHECK-NEXT: v_writelane_b32 v22, s12, 50 -; CHECK-NEXT: v_writelane_b32 v22, s13, 51 -; CHECK-NEXT: v_writelane_b32 v22, s14, 52 -; CHECK-NEXT: v_writelane_b32 v22, s15, 53 +; CHECK-NEXT: v_writelane_b32 v22, s0, 42 +; CHECK-NEXT: v_writelane_b32 v22, s1, 43 +; CHECK-NEXT: v_writelane_b32 v22, s2, 44 +; CHECK-NEXT: v_writelane_b32 v22, s3, 45 +; CHECK-NEXT: v_writelane_b32 v22, s4, 46 +; CHECK-NEXT: v_writelane_b32 v22, s5, 47 +; CHECK-NEXT: v_writelane_b32 v22, s6, 48 +; CHECK-NEXT: v_writelane_b32 v22, s7, 49 +; CHECK-NEXT: v_writelane_b32 v22, s8, 50 +; CHECK-NEXT: v_writelane_b32 v22, s9, 51 +; CHECK-NEXT: v_writelane_b32 v22, s10, 52 +; CHECK-NEXT: v_writelane_b32 v22, s11, 53 +; CHECK-NEXT: v_writelane_b32 v22, s12, 54 +; CHECK-NEXT: v_writelane_b32 v22, s13, 55 +; CHECK-NEXT: v_writelane_b32 v22, s14, 56 +; CHECK-NEXT: v_writelane_b32 v22, s15, 57 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[34:35] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v22, s0, 54 -; CHECK-NEXT: v_writelane_b32 v22, s1, 55 -; CHECK-NEXT: v_writelane_b32 v22, s2, 56 -; CHECK-NEXT: v_writelane_b32 v22, s3, 57 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[0:7] -; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_writelane_b32 v22, s0, 58 ; CHECK-NEXT: v_writelane_b32 v22, s1, 59 ; CHECK-NEXT: v_writelane_b32 v22, s2, 60 -; CHECK-NEXT: ; implicit-def: $vgpr23 : SGPR spill to VGPR lane ; CHECK-NEXT: v_writelane_b32 v22, s3, 61 -; CHECK-NEXT: v_writelane_b32 v22, s4, 62 -; CHECK-NEXT: v_writelane_b32 v23, s6, 0 -; CHECK-NEXT: v_writelane_b32 v22, s5, 63 -; CHECK-NEXT: v_writelane_b32 v23, s7, 1 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[0:7] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; implicit-def: $vgpr23 : SGPR spill to VGPR lane +; CHECK-NEXT: v_writelane_b32 v22, s0, 62 +; CHECK-NEXT: v_writelane_b32 v23, 
s2, 0 +; CHECK-NEXT: v_writelane_b32 v23, s3, 1 +; CHECK-NEXT: v_writelane_b32 v23, s4, 2 +; CHECK-NEXT: v_writelane_b32 v23, s5, 3 +; CHECK-NEXT: v_writelane_b32 v23, s6, 4 +; CHECK-NEXT: v_writelane_b32 v22, s1, 63 +; CHECK-NEXT: v_writelane_b32 v23, s7, 5 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s0, 2 -; CHECK-NEXT: v_writelane_b32 v23, s1, 3 -; CHECK-NEXT: v_writelane_b32 v23, s2, 4 -; CHECK-NEXT: v_writelane_b32 v23, s3, 5 -; CHECK-NEXT: v_writelane_b32 v23, s4, 6 -; CHECK-NEXT: v_writelane_b32 v23, s5, 7 -; CHECK-NEXT: v_writelane_b32 v23, s6, 8 -; CHECK-NEXT: v_writelane_b32 v23, s7, 9 -; CHECK-NEXT: v_writelane_b32 v23, s8, 10 -; CHECK-NEXT: v_writelane_b32 v23, s9, 11 -; CHECK-NEXT: v_writelane_b32 v23, s10, 12 -; CHECK-NEXT: v_writelane_b32 v23, s11, 13 -; CHECK-NEXT: v_writelane_b32 v23, s12, 14 -; CHECK-NEXT: v_writelane_b32 v23, s13, 15 -; CHECK-NEXT: v_writelane_b32 v23, s14, 16 -; CHECK-NEXT: v_writelane_b32 v23, s15, 17 +; CHECK-NEXT: v_writelane_b32 v23, s0, 6 +; CHECK-NEXT: v_writelane_b32 v23, s1, 7 +; CHECK-NEXT: v_writelane_b32 v23, s2, 8 +; CHECK-NEXT: v_writelane_b32 v23, s3, 9 +; CHECK-NEXT: v_writelane_b32 v23, s4, 10 +; CHECK-NEXT: v_writelane_b32 v23, s5, 11 +; CHECK-NEXT: v_writelane_b32 v23, s6, 12 +; CHECK-NEXT: v_writelane_b32 v23, s7, 13 +; CHECK-NEXT: v_writelane_b32 v23, s8, 14 +; CHECK-NEXT: v_writelane_b32 v23, s9, 15 +; CHECK-NEXT: v_writelane_b32 v23, s10, 16 +; CHECK-NEXT: v_writelane_b32 v23, s11, 17 +; CHECK-NEXT: v_writelane_b32 v23, s12, 18 +; CHECK-NEXT: v_writelane_b32 v23, s13, 19 +; CHECK-NEXT: v_writelane_b32 v23, s14, 20 +; CHECK-NEXT: v_writelane_b32 v23, s15, 21 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s0, 18 -; CHECK-NEXT: v_writelane_b32 v23, s1, 19 +; CHECK-NEXT: v_writelane_b32 v23, s0, 22 +; CHECK-NEXT: v_writelane_b32 v23, s1, 23 ; CHECK-NEXT: ;;#ASMSTART ; 
CHECK-NEXT: ; def s[0:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s0, 20 -; CHECK-NEXT: v_writelane_b32 v23, s1, 21 -; CHECK-NEXT: v_writelane_b32 v23, s2, 22 -; CHECK-NEXT: v_writelane_b32 v23, s3, 23 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[0:7] -; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_writelane_b32 v23, s0, 24 ; CHECK-NEXT: v_writelane_b32 v23, s1, 25 ; CHECK-NEXT: v_writelane_b32 v23, s2, 26 ; CHECK-NEXT: v_writelane_b32 v23, s3, 27 -; CHECK-NEXT: v_writelane_b32 v23, s4, 28 -; CHECK-NEXT: v_writelane_b32 v23, s5, 29 -; CHECK-NEXT: v_writelane_b32 v23, s6, 30 -; CHECK-NEXT: v_writelane_b32 v23, s7, 31 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[0:7] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v23, s0, 28 +; CHECK-NEXT: v_writelane_b32 v23, s1, 29 +; CHECK-NEXT: v_writelane_b32 v23, s2, 30 +; CHECK-NEXT: v_writelane_b32 v23, s3, 31 +; CHECK-NEXT: v_writelane_b32 v23, s4, 32 +; CHECK-NEXT: v_writelane_b32 v23, s5, 33 +; CHECK-NEXT: v_writelane_b32 v23, s6, 34 +; CHECK-NEXT: v_writelane_b32 v23, s7, 35 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s0, 32 -; CHECK-NEXT: v_writelane_b32 v23, s1, 33 -; CHECK-NEXT: v_writelane_b32 v23, s2, 34 -; CHECK-NEXT: v_writelane_b32 v23, s3, 35 -; CHECK-NEXT: v_writelane_b32 v23, s4, 36 -; CHECK-NEXT: v_writelane_b32 v23, s5, 37 -; CHECK-NEXT: v_writelane_b32 v23, s6, 38 -; CHECK-NEXT: v_writelane_b32 v23, s7, 39 -; CHECK-NEXT: v_writelane_b32 v23, s8, 40 -; CHECK-NEXT: v_writelane_b32 v23, s9, 41 -; CHECK-NEXT: v_writelane_b32 v23, s10, 42 -; CHECK-NEXT: v_writelane_b32 v23, s11, 43 -; CHECK-NEXT: v_writelane_b32 v23, s12, 44 -; CHECK-NEXT: v_writelane_b32 v23, s13, 45 -; CHECK-NEXT: v_writelane_b32 v23, s14, 46 -; CHECK-NEXT: v_writelane_b32 v23, s15, 47 +; CHECK-NEXT: v_writelane_b32 v23, s0, 36 +; CHECK-NEXT: v_writelane_b32 v23, s1, 37 +; CHECK-NEXT: v_writelane_b32 v23, s2, 38 +; CHECK-NEXT: v_writelane_b32 
v23, s3, 39 +; CHECK-NEXT: v_writelane_b32 v23, s4, 40 +; CHECK-NEXT: v_writelane_b32 v23, s5, 41 +; CHECK-NEXT: v_writelane_b32 v23, s6, 42 +; CHECK-NEXT: v_writelane_b32 v23, s7, 43 +; CHECK-NEXT: v_writelane_b32 v23, s8, 44 +; CHECK-NEXT: v_writelane_b32 v23, s9, 45 +; CHECK-NEXT: v_writelane_b32 v23, s10, 46 +; CHECK-NEXT: v_writelane_b32 v23, s11, 47 +; CHECK-NEXT: v_writelane_b32 v23, s12, 48 +; CHECK-NEXT: v_writelane_b32 v23, s13, 49 +; CHECK-NEXT: v_writelane_b32 v23, s14, 50 +; CHECK-NEXT: v_writelane_b32 v23, s15, 51 ; CHECK-NEXT: s_cbranch_scc0 .LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %ret ; CHECK-NEXT: s_endpgm @@ -206,166 +210,170 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: v_readlane_b32 s1, v22, 3 ; CHECK-NEXT: v_readlane_b32 s2, v22, 4 ; CHECK-NEXT: v_readlane_b32 s3, v22, 5 +; CHECK-NEXT: v_readlane_b32 s4, v22, 6 +; CHECK-NEXT: v_readlane_b32 s5, v22, 7 +; CHECK-NEXT: v_readlane_b32 s6, v22, 8 +; CHECK-NEXT: v_readlane_b32 s7, v22, 9 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:3] +; CHECK-NEXT: ; use s[48:51] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v22, 6 -; CHECK-NEXT: v_readlane_b32 s1, v22, 7 -; CHECK-NEXT: v_readlane_b32 s2, v22, 8 -; CHECK-NEXT: v_readlane_b32 s3, v22, 9 -; CHECK-NEXT: v_readlane_b32 s4, v22, 10 -; CHECK-NEXT: v_readlane_b32 s5, v22, 11 -; CHECK-NEXT: v_readlane_b32 s6, v22, 12 -; CHECK-NEXT: v_readlane_b32 s7, v22, 13 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v22, 14 -; CHECK-NEXT: v_readlane_b32 s1, v22, 15 -; CHECK-NEXT: v_readlane_b32 s2, v22, 16 -; CHECK-NEXT: v_readlane_b32 s3, v22, 17 -; CHECK-NEXT: v_readlane_b32 s4, v22, 18 -; CHECK-NEXT: v_readlane_b32 s5, v22, 19 -; CHECK-NEXT: v_readlane_b32 s6, v22, 20 -; CHECK-NEXT: v_readlane_b32 s7, v22, 21 -; CHECK-NEXT: v_readlane_b32 s8, v22, 22 -; CHECK-NEXT: v_readlane_b32 s9, v22, 23 -; CHECK-NEXT: v_readlane_b32 s10, v22, 24 -; CHECK-NEXT: 
v_readlane_b32 s11, v22, 25 -; CHECK-NEXT: v_readlane_b32 s12, v22, 26 -; CHECK-NEXT: v_readlane_b32 s13, v22, 27 -; CHECK-NEXT: v_readlane_b32 s14, v22, 28 -; CHECK-NEXT: v_readlane_b32 s15, v22, 29 +; CHECK-NEXT: v_readlane_b32 s0, v22, 10 +; CHECK-NEXT: v_readlane_b32 s1, v22, 11 +; CHECK-NEXT: v_readlane_b32 s2, v22, 12 +; CHECK-NEXT: v_readlane_b32 s3, v22, 13 +; CHECK-NEXT: v_readlane_b32 s4, v22, 14 +; CHECK-NEXT: v_readlane_b32 s5, v22, 15 +; CHECK-NEXT: v_readlane_b32 s6, v22, 16 +; CHECK-NEXT: v_readlane_b32 s7, v22, 17 +; CHECK-NEXT: v_readlane_b32 s8, v22, 18 +; CHECK-NEXT: v_readlane_b32 s9, v22, 19 +; CHECK-NEXT: v_readlane_b32 s10, v22, 20 +; CHECK-NEXT: v_readlane_b32 s11, v22, 21 +; CHECK-NEXT: v_readlane_b32 s12, v22, 22 +; CHECK-NEXT: v_readlane_b32 s13, v22, 23 +; CHECK-NEXT: v_readlane_b32 s14, v22, 24 +; CHECK-NEXT: v_readlane_b32 s15, v22, 25 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v22, 30 -; CHECK-NEXT: v_readlane_b32 s1, v22, 31 -; CHECK-NEXT: v_readlane_b32 s2, v22, 32 -; CHECK-NEXT: v_readlane_b32 s3, v22, 33 -; CHECK-NEXT: v_readlane_b32 s4, v22, 34 -; CHECK-NEXT: v_readlane_b32 s5, v22, 35 -; CHECK-NEXT: v_readlane_b32 s6, v22, 36 -; CHECK-NEXT: v_readlane_b32 s7, v22, 37 +; CHECK-NEXT: v_readlane_b32 s0, v22, 26 +; CHECK-NEXT: v_readlane_b32 s1, v22, 27 +; CHECK-NEXT: v_readlane_b32 s2, v22, 28 +; CHECK-NEXT: v_readlane_b32 s3, v22, 29 +; CHECK-NEXT: v_readlane_b32 s4, v22, 30 +; CHECK-NEXT: v_readlane_b32 s5, v22, 31 +; CHECK-NEXT: v_readlane_b32 s6, v22, 32 +; CHECK-NEXT: v_readlane_b32 s7, v22, 33 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[42:43] +; CHECK-NEXT: ; use s[38:39] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[52:55] +; CHECK-NEXT: ; use s[44:47] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v22, 38 -; CHECK-NEXT: v_readlane_b32 
s1, v22, 39 -; CHECK-NEXT: v_readlane_b32 s2, v22, 40 -; CHECK-NEXT: v_readlane_b32 s3, v22, 41 +; CHECK-NEXT: v_readlane_b32 s0, v22, 34 +; CHECK-NEXT: v_readlane_b32 s1, v22, 35 +; CHECK-NEXT: v_readlane_b32 s2, v22, 36 +; CHECK-NEXT: v_readlane_b32 s3, v22, 37 +; CHECK-NEXT: v_readlane_b32 s4, v22, 38 +; CHECK-NEXT: v_readlane_b32 s5, v22, 39 +; CHECK-NEXT: v_readlane_b32 s6, v22, 40 +; CHECK-NEXT: v_readlane_b32 s7, v22, 41 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[16:31] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[40:41] +; CHECK-NEXT: ; use s[36:37] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[36:39] +; CHECK-NEXT: ; use s[40:43] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[44:51] +; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s4, v22, 42 -; CHECK-NEXT: v_readlane_b32 s5, v22, 43 -; CHECK-NEXT: v_readlane_b32 s6, v22, 44 -; CHECK-NEXT: v_readlane_b32 s7, v22, 45 -; CHECK-NEXT: v_readlane_b32 s8, v22, 46 -; CHECK-NEXT: v_readlane_b32 s9, v22, 47 -; CHECK-NEXT: v_readlane_b32 s10, v22, 48 -; CHECK-NEXT: v_readlane_b32 s11, v22, 49 -; CHECK-NEXT: v_readlane_b32 s12, v22, 50 -; CHECK-NEXT: v_readlane_b32 s13, v22, 51 -; CHECK-NEXT: v_readlane_b32 s14, v22, 52 -; CHECK-NEXT: v_readlane_b32 s15, v22, 53 +; CHECK-NEXT: v_readlane_b32 s0, v22, 42 +; CHECK-NEXT: v_readlane_b32 s1, v22, 43 +; CHECK-NEXT: v_readlane_b32 s2, v22, 44 +; CHECK-NEXT: v_readlane_b32 s3, v22, 45 +; CHECK-NEXT: v_readlane_b32 s4, v22, 46 +; CHECK-NEXT: v_readlane_b32 s5, v22, 47 +; CHECK-NEXT: v_readlane_b32 s6, v22, 48 +; CHECK-NEXT: v_readlane_b32 s7, v22, 49 +; CHECK-NEXT: v_readlane_b32 s8, v22, 50 +; CHECK-NEXT: v_readlane_b32 s9, v22, 51 +; CHECK-NEXT: v_readlane_b32 s10, v22, 52 +; CHECK-NEXT: v_readlane_b32 s11, v22, 53 +; CHECK-NEXT: v_readlane_b32 s12, v22, 54 +; CHECK-NEXT: v_readlane_b32 s13, v22, 55 +; CHECK-NEXT: v_readlane_b32 s14, v22, 56 +; CHECK-NEXT: 
v_readlane_b32 s15, v22, 57 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v22, 54 -; CHECK-NEXT: v_readlane_b32 s1, v22, 55 -; CHECK-NEXT: v_readlane_b32 s2, v22, 56 -; CHECK-NEXT: v_readlane_b32 s3, v22, 57 +; CHECK-NEXT: v_readlane_b32 s0, v22, 58 +; CHECK-NEXT: v_readlane_b32 s1, v22, 59 +; CHECK-NEXT: v_readlane_b32 s2, v22, 60 +; CHECK-NEXT: v_readlane_b32 s3, v22, 61 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[34:35] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v22, 58 -; CHECK-NEXT: v_readlane_b32 s1, v22, 59 -; CHECK-NEXT: v_readlane_b32 s2, v22, 60 -; CHECK-NEXT: v_readlane_b32 s3, v22, 61 -; CHECK-NEXT: v_readlane_b32 s4, v22, 62 -; CHECK-NEXT: v_readlane_b32 s5, v22, 63 -; CHECK-NEXT: v_readlane_b32 s6, v23, 0 -; CHECK-NEXT: v_readlane_b32 s7, v23, 1 +; CHECK-NEXT: v_readlane_b32 s0, v22, 62 +; CHECK-NEXT: v_readlane_b32 s1, v22, 63 +; CHECK-NEXT: v_readlane_b32 s2, v23, 0 +; CHECK-NEXT: v_readlane_b32 s3, v23, 1 +; CHECK-NEXT: v_readlane_b32 s4, v23, 2 +; CHECK-NEXT: v_readlane_b32 s5, v23, 3 +; CHECK-NEXT: v_readlane_b32 s6, v23, 4 +; CHECK-NEXT: v_readlane_b32 s7, v23, 5 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 2 -; CHECK-NEXT: v_readlane_b32 s1, v23, 3 -; CHECK-NEXT: v_readlane_b32 s2, v23, 4 -; CHECK-NEXT: v_readlane_b32 s3, v23, 5 -; CHECK-NEXT: v_readlane_b32 s4, v23, 6 -; CHECK-NEXT: v_readlane_b32 s5, v23, 7 -; CHECK-NEXT: v_readlane_b32 s6, v23, 8 -; CHECK-NEXT: v_readlane_b32 s7, v23, 9 -; CHECK-NEXT: v_readlane_b32 s8, v23, 10 -; CHECK-NEXT: v_readlane_b32 s9, v23, 11 -; CHECK-NEXT: v_readlane_b32 s10, v23, 12 -; CHECK-NEXT: v_readlane_b32 s11, v23, 13 -; CHECK-NEXT: v_readlane_b32 s12, v23, 14 -; CHECK-NEXT: v_readlane_b32 s13, v23, 15 -; CHECK-NEXT: v_readlane_b32 s14, v23, 16 -; CHECK-NEXT: v_readlane_b32 
s15, v23, 17 +; CHECK-NEXT: v_readlane_b32 s0, v23, 6 +; CHECK-NEXT: v_readlane_b32 s1, v23, 7 +; CHECK-NEXT: v_readlane_b32 s2, v23, 8 +; CHECK-NEXT: v_readlane_b32 s3, v23, 9 +; CHECK-NEXT: v_readlane_b32 s4, v23, 10 +; CHECK-NEXT: v_readlane_b32 s5, v23, 11 +; CHECK-NEXT: v_readlane_b32 s6, v23, 12 +; CHECK-NEXT: v_readlane_b32 s7, v23, 13 +; CHECK-NEXT: v_readlane_b32 s8, v23, 14 +; CHECK-NEXT: v_readlane_b32 s9, v23, 15 +; CHECK-NEXT: v_readlane_b32 s10, v23, 16 +; CHECK-NEXT: v_readlane_b32 s11, v23, 17 +; CHECK-NEXT: v_readlane_b32 s12, v23, 18 +; CHECK-NEXT: v_readlane_b32 s13, v23, 19 +; CHECK-NEXT: v_readlane_b32 s14, v23, 20 +; CHECK-NEXT: v_readlane_b32 s15, v23, 21 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 18 -; CHECK-NEXT: v_readlane_b32 s1, v23, 19 +; CHECK-NEXT: v_readlane_b32 s0, v23, 22 +; CHECK-NEXT: v_readlane_b32 s1, v23, 23 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 20 -; CHECK-NEXT: v_readlane_b32 s1, v23, 21 -; CHECK-NEXT: v_readlane_b32 s2, v23, 22 -; CHECK-NEXT: v_readlane_b32 s3, v23, 23 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:3] -; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_readlane_b32 s0, v23, 24 ; CHECK-NEXT: v_readlane_b32 s1, v23, 25 ; CHECK-NEXT: v_readlane_b32 s2, v23, 26 ; CHECK-NEXT: v_readlane_b32 s3, v23, 27 -; CHECK-NEXT: v_readlane_b32 s4, v23, 28 -; CHECK-NEXT: v_readlane_b32 s5, v23, 29 -; CHECK-NEXT: v_readlane_b32 s6, v23, 30 -; CHECK-NEXT: v_readlane_b32 s7, v23, 31 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:3] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v23, 28 +; CHECK-NEXT: v_readlane_b32 s1, v23, 29 +; CHECK-NEXT: v_readlane_b32 s2, v23, 30 +; CHECK-NEXT: v_readlane_b32 s3, v23, 31 +; CHECK-NEXT: v_readlane_b32 s4, v23, 32 +; CHECK-NEXT: v_readlane_b32 s5, v23, 33 +; CHECK-NEXT: v_readlane_b32 s6, v23, 34 +; CHECK-NEXT: v_readlane_b32 s7, 
v23, 35 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 32 -; CHECK-NEXT: v_readlane_b32 s1, v23, 33 -; CHECK-NEXT: v_readlane_b32 s2, v23, 34 -; CHECK-NEXT: v_readlane_b32 s3, v23, 35 -; CHECK-NEXT: v_readlane_b32 s4, v23, 36 -; CHECK-NEXT: v_readlane_b32 s5, v23, 37 -; CHECK-NEXT: v_readlane_b32 s6, v23, 38 -; CHECK-NEXT: v_readlane_b32 s7, v23, 39 -; CHECK-NEXT: v_readlane_b32 s8, v23, 40 -; CHECK-NEXT: v_readlane_b32 s9, v23, 41 -; CHECK-NEXT: v_readlane_b32 s10, v23, 42 -; CHECK-NEXT: v_readlane_b32 s11, v23, 43 -; CHECK-NEXT: v_readlane_b32 s12, v23, 44 -; CHECK-NEXT: v_readlane_b32 s13, v23, 45 -; CHECK-NEXT: v_readlane_b32 s14, v23, 46 -; CHECK-NEXT: v_readlane_b32 s15, v23, 47 +; CHECK-NEXT: v_readlane_b32 s0, v23, 36 +; CHECK-NEXT: v_readlane_b32 s1, v23, 37 +; CHECK-NEXT: v_readlane_b32 s2, v23, 38 +; CHECK-NEXT: v_readlane_b32 s3, v23, 39 +; CHECK-NEXT: v_readlane_b32 s4, v23, 40 +; CHECK-NEXT: v_readlane_b32 s5, v23, 41 +; CHECK-NEXT: v_readlane_b32 s6, v23, 42 +; CHECK-NEXT: v_readlane_b32 s7, v23, 43 +; CHECK-NEXT: v_readlane_b32 s8, v23, 44 +; CHECK-NEXT: v_readlane_b32 s9, v23, 45 +; CHECK-NEXT: v_readlane_b32 s10, v23, 46 +; CHECK-NEXT: v_readlane_b32 s11, v23, 47 +; CHECK-NEXT: v_readlane_b32 s12, v23, 48 +; CHECK-NEXT: v_readlane_b32 s13, v23, 49 +; CHECK-NEXT: v_readlane_b32 s14, v23, 50 +; CHECK-NEXT: v_readlane_b32 s15, v23, 51 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:15] ; CHECK-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll index 455d22f2aa29c..cdfba3cf0db7f 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 %in) #1 { ; GCN-LABEL: partial_no_vgprs_last_sgpr_spill: ; GCN: ; %bb.0: -; GCN-NEXT: s_add_u32 s0, s0, s15 +; 
GCN-NEXT: s_add_u32 s0, s0, s17 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_load_dword s4, s[8:9], 0x2 ; GCN-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll index a423b6f831a9d..65a17ed67481c 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll @@ -182,8 +182,10 @@ define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-LABEL: s_shl_i128_ss: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: v_mov_b32_e32 v4, 0 -; GCN-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s5, s4, 64 ; GCN-NEXT: s_sub_i32 s12, 64, s4 @@ -203,6 +205,7 @@ define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm @@ -215,8 +218,10 @@ define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-LABEL: s_lshr_i128_ss: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: v_mov_b32_e32 v4, 0 -; GCN-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s5, s4, 64 ; GCN-NEXT: s_sub_i32 s12, 64, s4 @@ -236,6 +241,7 @@ define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: v_mov_b32_e32 v3, s5 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm @@ -248,8 +254,10 @@ define amdgpu_kernel void 
@s_ashr_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-LABEL: s_ashr_i128_ss: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: v_mov_b32_e32 v4, 0 -; GCN-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s5, 64, s4 ; GCN-NEXT: s_lshr_b64 s[6:7], s[0:1], s4 @@ -270,6 +278,7 @@ define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm @@ -430,6 +439,9 @@ define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_shl_v2i128ss: ; GCN: ; %bb.0: +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_load_dwordx16 s[0:15], s[8:9], 0x0 ; GCN-NEXT: v_mov_b32_e32 v6, 16 ; GCN-NEXT: v_mov_b32_e32 v4, 0 @@ -502,6 +514,9 @@ define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) { define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_lshr_v2i128_ss: ; GCN: ; %bb.0: +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_load_dwordx16 s[0:15], s[8:9], 0x0 ; GCN-NEXT: v_mov_b32_e32 v6, 16 ; GCN-NEXT: v_mov_b32_e32 v4, 0 @@ -574,6 +589,9 @@ define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_ashr_v2i128_ss: ; GCN: ; %bb.0: +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: 
s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_load_dwordx16 s[0:15], s[8:9], 0x0 ; GCN-NEXT: v_mov_b32_e32 v6, 16 ; GCN-NEXT: v_mov_b32_e32 v4, 0 diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll index 8531b2ad4e405..3c47e2504747d 100644 --- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll @@ -1,5 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-annotate-kernel-features %s | FileCheck -check-prefix=AKF_GCN %s ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck -check-prefix=ATTRIBUTOR_GCN %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s @@ -7,9 +6,6 @@ target datalayout = "A5" define internal void @indirect() { -; AKF_GCN-LABEL: define {{[^@]+}}@indirect() { -; AKF_GCN-NEXT: ret void -; ; ATTRIBUTOR_GCN-LABEL: define {{[^@]+}}@indirect ; ATTRIBUTOR_GCN-SAME: () #[[ATTR0:[0-9]+]] { ; ATTRIBUTOR_GCN-NEXT: ret void @@ -22,15 +18,6 @@ define internal void @indirect() { } define amdgpu_kernel void @test_simple_indirect_call() { -; AKF_GCN-LABEL: define {{[^@]+}}@test_simple_indirect_call -; AKF_GCN-SAME: () #[[ATTR0:[0-9]+]] { -; AKF_GCN-NEXT: [[FPTR:%.*]] = alloca ptr, align 8, addrspace(5) -; AKF_GCN-NEXT: [[FPTR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[FPTR]] to ptr -; AKF_GCN-NEXT: store ptr @indirect, ptr [[FPTR_CAST]], align 8 -; AKF_GCN-NEXT: [[FP:%.*]] = load ptr, ptr [[FPTR_CAST]], align 8 -; AKF_GCN-NEXT: call void [[FP]]() -; AKF_GCN-NEXT: ret void -; ; ATTRIBUTOR_GCN-LABEL: define {{[^@]+}}@test_simple_indirect_call ; ATTRIBUTOR_GCN-SAME: () #[[ATTR1:[0-9]+]] { ; ATTRIBUTOR_GCN-NEXT: [[FPTR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -79,12 +66,10 @@ define amdgpu_kernel void @test_simple_indirect_call() { !llvm.module.flags = 
!{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 500} ;. -; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-stack-objects" } ;. ; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. -; AKF_GCN: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} ;. ; ATTRIBUTOR_GCN: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll index 8129a7ac51df9..d71d0f78fe1c3 100644 --- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll @@ -9,6 +9,9 @@ define amdgpu_kernel void @sint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in) ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f64_i32_e32 v[0:1], s2 ; CI-NEXT: v_mov_b32_e32 v3, s1 @@ -20,6 +23,9 @@ define amdgpu_kernel void @sint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in) ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_i32_e32 v[0:1], s2 ; VI-NEXT: v_mov_b32_e32 v3, s1 @@ -38,11 +44,14 @@ define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CI-NEXT: v_mov_b32_e32 v0, 0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_eq_u32 s2, 0 ; CI-NEXT: s_cselect_b32 s2, 0xbff00000, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_mov_b32_e32 v1, s2 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -52,11 +61,14 @@ define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) { ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_add_i32 s12, s12, 
s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0xbff00000, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -72,6 +84,9 @@ define amdgpu_kernel void @sint_to_fp_i1_f64_load(ptr addrspace(1) %out, i1 %in) ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_bitcmp1_b32 s2, 0 ; CI-NEXT: s_cselect_b64 s[2:3], -1, 0 @@ -86,6 +101,9 @@ define amdgpu_kernel void @sint_to_fp_i1_f64_load(ptr addrspace(1) %out, i1 %in) ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s2, 0 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 @@ -104,6 +122,9 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i ; CI-LABEL: s_sint_to_fp_i64_to_f64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f64_i32_e32 v[0:1], s3 ; CI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2 @@ -117,6 +138,9 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i ; VI-LABEL: s_sint_to_fp_i64_to_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; 
VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_i32_e32 v[0:1], s3 ; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2 @@ -136,6 +160,9 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v0 @@ -155,6 +182,9 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -183,6 +213,9 @@ define amdgpu_kernel void @s_sint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_sext_i32_i8 s2, s2 ; CI-NEXT: v_cvt_f64_i32_e32 v[0:1], s2 @@ -195,6 +228,9 @@ define amdgpu_kernel void @s_sint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_sext_i32_i8 s2, s2 ; VI-NEXT: v_cvt_f64_i32_e32 v[0:1], s2 @@ -231,11 +267,14 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; 
CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CI-NEXT: v_mov_b32_e32 v0, 0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_eq_u32 s2, 0 ; CI-NEXT: s_cselect_b32 s2, 0xbff00000, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_mov_b32_e32 v1, s2 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -245,11 +284,14 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0xbff00000, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -282,11 +324,14 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CI-NEXT: v_mov_b32_e32 v0, 0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_eq_u32 s2, 0 ; CI-NEXT: s_cselect_b32 s2, 0xbff00000, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_mov_b32_e32 v1, s2 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -296,11 +341,14 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, 0 +; 
VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0xbff00000, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -352,11 +400,14 @@ define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CI-NEXT: v_mov_b32_e32 v0, 0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_eq_u32 s2, 0 ; CI-NEXT: s_cselect_b32 s2, 0, 0xbff00000 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_mov_b32_e32 v1, s2 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -366,11 +417,14 @@ define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0, 0xbff00000 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll index 5ae339454a0ba..bd255e88b9512 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll @@ -12,10 +12,10 @@ define amdgpu_kernel 
void @test_spill_av_class(<4 x i32> %arg) #0 { ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec ; GCN-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %13.sub0 + ; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %14.sub0 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]] - ; GCN-NEXT: GLOBAL_STORE_DWORDX4 undef %23:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) - ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3538953 /* reguse:VReg_64 */, %13 + ; GCN-NEXT: GLOBAL_STORE_DWORDX4 undef %24:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) + ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3538953 /* reguse:VReg_64 */, %14 ; GCN-NEXT: S_ENDPGM 0 %v0 = call i32 asm sideeffect "; def $0", "=v"() %tmp = insertelement <2 x i32> poison, i32 %v0, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll index f791135d45e9a..ef92cf3214e7f 100644 --- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -50,7 +50,10 @@ define void @local_store_i56(ptr addrspace(3) %ptr, i56 %arg) #0 { define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; HAWAII-LABEL: local_store_i55: ; HAWAII: ; %bb.0: +; HAWAII-NEXT: s_add_i32 s12, s12, s17 ; HAWAII-NEXT: s_or_b32 s0, s8, 14 +; HAWAII-NEXT: s_mov_b32 flat_scratch_lo, s13 +; HAWAII-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; HAWAII-NEXT: 
v_mov_b32_e32 v0, s0 ; HAWAII-NEXT: v_mov_b32_e32 v1, s9 ; HAWAII-NEXT: flat_load_ubyte v0, v[0:1] @@ -70,7 +73,10 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; ; FIJI-LABEL: local_store_i55: ; FIJI: ; %bb.0: +; FIJI-NEXT: s_add_i32 s12, s12, s17 ; FIJI-NEXT: s_or_b32 s0, s8, 14 +; FIJI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; FIJI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; FIJI-NEXT: v_mov_b32_e32 v0, s0 ; FIJI-NEXT: v_mov_b32_e32 v1, s9 ; FIJI-NEXT: flat_load_ubyte v0, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll index 19d633651fdd0..30accc846d2b6 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll @@ -14,7 +14,7 @@ define amdgpu_kernel void @kern() #0 { ; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000 ................ ; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000 ................ ; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000 ................ -; OBJ-NEXT: 0030 4000af00 88000000 01000000 00000000 @............... +; OBJ-NEXT: 0030 4000af00 8c000000 21000000 00000000 @.......!....... ; ELF: AMDGPU Metadata ; ELF: .sgpr_count: 9 diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll index 2097579e0c995..4f84b31f1877b 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll @@ -14,7 +14,7 @@ define amdgpu_kernel void @kern() #0 { ; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000 ................ ; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000 ................ ; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000 ................ -; OBJ-NEXT: 0030 0000af00 88000000 01000000 00000000 ................ +; OBJ-NEXT: 0030 0000af00 8c000000 21000000 00000000 ........!....... 
; ELF: AMDGPU Metadata ; ELF: .sgpr_count: 5 diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll index 775c62e73261a..644f434923368 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll @@ -14,7 +14,7 @@ define amdgpu_kernel void @kern() #0 { ; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000 ................ ; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000 ................ ; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000 ................ -; OBJ-NEXT: 0030 4000af00 88000000 01000000 00000000 @............... +; OBJ-NEXT: 0030 4000af00 8c000000 21000000 00000000 @.......!....... ; ELF: AMDGPU Metadata ; ELF: .sgpr_count: 9 diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll index b8f0d7617167e..69cc63eba6243 100644 --- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll +++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll @@ -23,11 +23,14 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) { ; HSA-TRAP-GFX803-LABEL: trap: ; HSA-TRAP-GFX803: ; %bb.0: ; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; HSA-TRAP-GFX803-NEXT: s_add_i32 s12, s12, s17 +; HSA-TRAP-GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 +; HSA-TRAP-GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v2, 1 -; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[6:7] ; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s2 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s3 +; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[6:7] ; HSA-TRAP-GFX803-NEXT: flat_store_dword v[0:1], v2 ; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0) ; HSA-TRAP-GFX803-NEXT: s_trap 2 @@ -121,6 +124,9 @@ define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %a ; HSA-TRAP-GFX803-LABEL: non_entry_trap: ; HSA-TRAP-GFX803: ; %bb.0: ; %entry ; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; 
HSA-TRAP-GFX803-NEXT: s_add_i32 s12, s12, s17 +; HSA-TRAP-GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 +; HSA-TRAP-GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s0 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s1 @@ -280,6 +286,9 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs ; HSA-TRAP-GFX803: ; %bb.0: ; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[6:7] ; HSA-TRAP-GFX803-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 +; HSA-TRAP-GFX803-NEXT: s_add_i32 s12, s12, s17 +; HSA-TRAP-GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 +; HSA-TRAP-GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s4 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s5 @@ -411,10 +420,13 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0) ; HSA-TRAP-GFX803-LABEL: debugtrap: ; HSA-TRAP-GFX803: ; %bb.0: ; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; HSA-TRAP-GFX803-NEXT: s_add_i32 s12, s12, s17 +; HSA-TRAP-GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 +; HSA-TRAP-GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v2, 1 -; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v3, 2 ; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s0 +; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v3, 2 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s1 ; HSA-TRAP-GFX803-NEXT: flat_store_dword v[0:1], v2 ; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll index 6e29536feb51b..660ff4677547a 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv.ll @@ -81,6 +81,9 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-LABEL: udiv_i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 
s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -252,6 +255,9 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GCN-LABEL: s_udiv_i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GCN-NEXT: s_sub_i32 s4, 0, s3 @@ -457,6 +463,9 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-LABEL: udiv_v2i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -810,6 +819,9 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-LABEL: udiv_v4i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s4, s2, 16 ; GCN-NEXT: s_addc_u32 s5, s3, 0 @@ -1135,6 +1147,9 @@ define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspac ; GCN-LABEL: udiv_i32_div_pow2: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1224,6 +1239,9 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp ; GCN-LABEL: 
udiv_i32_div_k_even: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1318,6 +1336,9 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa ; GCN-LABEL: udiv_i32_div_k_odd: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1430,6 +1451,9 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; GCN-LABEL: v_udiv_i8: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1570,6 +1594,9 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-LABEL: v_udiv_i16: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1726,6 +1753,9 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-LABEL: v_udiv_i23: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s4, s2, 4 ; GCN-NEXT: s_addc_u32 s5, s3, 0 @@ 
-1923,6 +1953,9 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-LABEL: v_udiv_i24: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s4, s2, 4 ; GCN-NEXT: s_addc_u32 s5, s3, 0 @@ -2105,6 +2138,9 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read ; GCN-LABEL: scalarize_mulhu_4xi32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 @@ -2218,6 +2254,9 @@ define amdgpu_kernel void @test_udiv2(i32 %p) { ; GCN-LABEL: test_udiv2: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s0, s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshr_b32 s0, s0, 1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 @@ -2281,6 +2320,9 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) { ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s0, s[8:9], 0x0 ; GCN-NEXT: v_mov_b32_e32 v0, 0xaaaaaaab +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 1, v0 @@ -2378,6 +2420,9 @@ define amdgpu_kernel void @fdiv_test_denormals(ptr addrspace(1) nocapture readon ; GCN-LABEL: fdiv_test_denormals: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_waitcnt 
lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll index 55cbc14a46706..97738a7944741 100644 --- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll @@ -9,6 +9,9 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v0 @@ -28,6 +31,9 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -54,6 +60,9 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i ; SI-LABEL: s_uint_to_fp_i64_to_f64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 ; SI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2 @@ -67,6 +76,9 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i ; VI-LABEL: s_uint_to_fp_i64_to_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_u32_e32 
v[0:1], s3 ; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2 @@ -86,6 +98,9 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2 ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4 ; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 ; SI-NEXT: v_cvt_f64_u32_e32 v[2:3], s1 @@ -103,6 +118,9 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2 ; VI-LABEL: s_uint_to_fp_v2i64_to_v2f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 ; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s1 @@ -128,6 +146,9 @@ define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4 ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x8 ; SI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 ; SI-NEXT: v_cvt_f64_u32_e32 v[4:5], s1 @@ -160,6 +181,9 @@ define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x20 ; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s7 ; VI-NEXT: v_cvt_f64_u32_e32 v[4:5], s5 @@ -196,6 +220,9 @@ define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %i ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[8:9], 0x2 ; SI-NEXT: 
s_load_dwordx2 s[0:1], s[8:9], 0x0 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; SI-NEXT: v_mov_b32_e32 v3, s1 @@ -207,6 +234,9 @@ define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %i ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; VI-NEXT: v_mov_b32_e32 v3, s1 @@ -222,6 +252,9 @@ define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(ptr addrspace(1) %out, <2 ; GCN-LABEL: s_uint_to_fp_v2i32_to_v2f64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f64_u32_e32 v[2:3], s3 ; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 @@ -239,6 +272,9 @@ define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4 ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4 ; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; SI-NEXT: v_cvt_f64_u32_e32 v[6:7], s3 @@ -259,6 +295,9 @@ define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 
; VI-NEXT: v_cvt_f64_u32_e32 v[6:7], s3 @@ -286,11 +325,14 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in) ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[8:9], 0x2 ; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s2, 0 ; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 ; SI-NEXT: v_mov_b32_e32 v3, s1 +; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: v_mov_b32_e32 v1, s2 ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -300,11 +342,14 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in) ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -320,6 +365,9 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 % ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[8:9], 0x2 ; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitcmp1_b32 s2, 0 ; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 @@ -334,6 +382,9 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 % ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: 
s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s2, 0 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 @@ -353,6 +404,9 @@ define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[8:9], 0x2 ; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_and_b32 s2, s2, 0xff ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 @@ -365,6 +419,9 @@ define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0xff ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 @@ -402,11 +459,14 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[8:9], 0x2 ; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s2, 0 ; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 ; SI-NEXT: v_mov_b32_e32 v3, s1 +; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: v_mov_b32_e32 v1, s2 ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -416,11 +476,14 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, 
s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -453,11 +516,14 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[8:9], 0x2 ; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s2, 0 ; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 ; SI-NEXT: v_mov_b32_e32 v3, s1 +; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: v_mov_b32_e32 v1, s2 ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -467,11 +533,14 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -505,11 +574,14 @@ define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[8:9], 0x2 ; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; SI-NEXT: s_waitcnt 
lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s2, 0 ; SI-NEXT: s_cselect_b32 s2, 0, 0x3ff00000 ; SI-NEXT: v_mov_b32_e32 v3, s1 +; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: v_mov_b32_e32 v1, s2 ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -519,11 +591,14 @@ define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0, 0x3ff00000 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll index 45ea6b62761cc..ab7e85fdff516 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll @@ -11,7 +11,7 @@ define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9() { ; CHECK-LABEL: __omp_offloading_16_dd2df_main_l9: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, v0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll index e99a06f497016..1bc25a1386074 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll @@ -95,7 +95,7 @@ ; Function Attrs: convergent nocallback nofree nounwind willreturn 
declare void @llvm.amdgcn.end.cf.i64(i64) #2 - attributes #0 = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } + attributes #0 = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } attributes #2 = { convergent nocallback nofree nounwind willreturn } attributes #3 = { convergent nocallback nofree nounwind willreturn memory(none) } diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll index 076d7c9cd8842..0515ffa094329 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll @@ -69,5 +69,5 @@ bb4: ret void } -attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #0 = { nounwind 
"amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } attributes #1 = { nounwind readnone } From 16980d5463c787a48ffb78fd9bbe3d9d32757f34 Mon Sep 17 00:00:00 2001 From: Matheus Izvekov Date: Tue, 15 Apr 2025 19:34:55 -0300 Subject: [PATCH 052/710] Revert "[Clang] Fix dependent local class instantiation bugs" (#135870) Reverts llvm/llvm-project#134038 This crashes clang as reported here: https://github.com/llvm/llvm-project/pull/134038#issuecomment-2807092646 --- clang/docs/ReleaseNotes.rst | 1 - clang/lib/Sema/SemaTemplateInstantiate.cpp | 3 + .../lib/Sema/SemaTemplateInstantiateDecl.cpp | 56 +--------------- .../CodeGenCXX/local-class-instantiation.cpp | 64 ------------------- 4 files changed, 4 insertions(+), 120 deletions(-) delete mode 100644 clang/test/CodeGenCXX/local-class-instantiation.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 166f26921cb71..ee69af5632f6e 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -457,7 +457,6 @@ Bug Fixes to C++ Support by template argument deduction. - Clang is now better at instantiating the function definition after its use inside of a constexpr lambda. (#GH125747) -- Fixed a local class member function instantiation bug inside dependent lambdas. (#GH59734), (#GH132208) - Clang no longer crashes when trying to unify the types of arrays with certain differences in qualifiers (this could happen during template argument deduction or when building a ternary operator). 
(#GH97005) diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index 0e81804f8c1e7..d2408a94ad0ab 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -4126,6 +4126,9 @@ Sema::InstantiateClassMembers(SourceLocation PointOfInstantiation, if (FunctionDecl *Pattern = Function->getInstantiatedFromMemberFunction()) { + if (Function->isIneligibleOrNotSelected()) + continue; + if (Function->getTrailingRequiresClause()) { ConstraintSatisfaction Satisfaction; if (CheckFunctionConstraints(Function, Satisfaction) || diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index bf5a882ba4f12..5c80077f294c6 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -5597,61 +5597,7 @@ void Sema::InstantiateFunctionDefinition(SourceLocation PointOfInstantiation, Function->setLocation(PatternDecl->getLocation()); Function->setInnerLocStart(PatternDecl->getInnerLocStart()); Function->setRangeEnd(PatternDecl->getEndLoc()); - // Let the instantiation use the Pattern's DeclarationNameLoc, due to the - // following awkwardness: - // - // 1. There are out-of-tree users of getNameInfo().getSourceRange(), who - // expect the source range of the instantiated declaration to be set to - // point to the definition. - // - // 2. That getNameInfo().getSourceRange() might return the TypeLocInfo's - // location it tracked. - // - // 3. Function might come from an (implicit) declaration, while the pattern - // comes from a definition. In these cases, we need the PatternDecl's source - // location. - // - // To that end, we need to more or less tweak the DeclarationNameLoc. However, - // we can't blindly copy the DeclarationNameLoc from the PatternDecl to the - // function, since it contains associated TypeLocs that should have already - // been transformed. 
So, we rebuild the TypeLoc for that purpose. Technically, - // we should create a new function declaration and assign everything we need, - // but InstantiateFunctionDefinition updates the declaration in place. - auto NameLocPointsToPattern = [&] { - DeclarationNameInfo PatternName = PatternDecl->getNameInfo(); - DeclarationNameLoc PatternNameLoc = PatternName.getInfo(); - switch (PatternName.getName().getNameKind()) { - case DeclarationName::CXXConstructorName: - case DeclarationName::CXXDestructorName: - case DeclarationName::CXXConversionFunctionName: - break; - default: - // Cases where DeclarationNameLoc doesn't matter, as it merely contains a - // source range. - return PatternNameLoc; - } - - TypeSourceInfo *TSI = Function->getNameInfo().getNamedTypeInfo(); - // TSI might be null if the function is named by a constructor template id. - // E.g. S() {} for class template S with a template parameter T. - if (!TSI) { - // We don't care about the DeclarationName of the instantiated function, - // but only the DeclarationNameLoc. So if the TypeLoc is absent, we do - // nothing. - return PatternNameLoc; - } - - QualType InstT = TSI->getType(); - // We want to use a TypeLoc that reflects the transformed type while - // preserving the source location from the pattern. 
- TypeLocBuilder TLB; - TLB.pushTrivial( - Context, InstT, - PatternNameLoc.getNamedTypeInfo()->getTypeLoc().getBeginLoc()); - return DeclarationNameLoc::makeNamedTypeLoc( - TLB.getTypeSourceInfo(Context, InstT)); - }; - Function->setDeclarationNameLoc(NameLocPointsToPattern()); + Function->setDeclarationNameLoc(PatternDecl->getNameInfo().getInfo()); EnterExpressionEvaluationContext EvalContext( *this, Sema::ExpressionEvaluationContext::PotentiallyEvaluated); diff --git a/clang/test/CodeGenCXX/local-class-instantiation.cpp b/clang/test/CodeGenCXX/local-class-instantiation.cpp deleted file mode 100644 index 34103a1ee55ef..0000000000000 --- a/clang/test/CodeGenCXX/local-class-instantiation.cpp +++ /dev/null @@ -1,64 +0,0 @@ -// RUN: %clang_cc1 -std=c++17 %s -emit-llvm -triple %itanium_abi_triple -o - | FileCheck %s - -namespace LambdaContainingLocalClasses { - -template -void GH59734() { - [&](auto param) { - struct Guard { - Guard() { - // Check that we're able to create DeclRefExpr to param at this point. 
- static_assert(__is_same(decltype(param), int), ""); - } - ~Guard() { - static_assert(__is_same(decltype(param), int), ""); - } - operator decltype(param)() { - return decltype(param)(); - } - }; - Guard guard; - param = guard; - }(42); -} - -// Guard::Guard(): -// CHECK-DAG: define {{.*}} @_ZZZN28LambdaContainingLocalClasses7GH59734IiEEvvENKUlT_E_clIiEEDaS1_EN5GuardC2Ev -// Guard::operator int(): -// CHECK-DAG: define {{.*}} @_ZZZN28LambdaContainingLocalClasses7GH59734IiEEvvENKUlT_E_clIiEEDaS1_EN5GuardcviEv -// Guard::~Guard(): -// CHECK-DAG: define {{.*}} @_ZZZN28LambdaContainingLocalClasses7GH59734IiEEvvENKUlT_E_clIiEEDaS1_EN5GuardD2Ev - -struct S {}; - -template -auto GH132208 = [](auto param) { - struct OnScopeExit { - OnScopeExit() { - static_assert(__is_same(decltype(param), S), ""); - } - ~OnScopeExit() { - static_assert(__is_same(decltype(param), S), ""); - } - operator decltype(param)() { - return decltype(param)(); - } - } pending; - - param = pending; -}; - -void bar() { - GH59734(); - - GH132208(S{}); -} - -// OnScopeExit::OnScopeExit(): -// CHECK-DAG: define {{.*}} @_ZZNK28LambdaContainingLocalClasses8GH132208IvEMUlT_E_clINS_1SEEEDaS2_EN11OnScopeExitC2Ev -// OnScopeExit::operator S(): -// CHECK-DAG: define {{.*}} @_ZZNK28LambdaContainingLocalClasses8GH132208IvEMUlT_E_clINS_1SEEEDaS2_EN11OnScopeExitcvS5_Ev -// OnScopeExit::~OnScopeExit(): -// CHECK-DAG: define {{.*}} @_ZZNK28LambdaContainingLocalClasses8GH132208IvEMUlT_E_clINS_1SEEEDaS2_EN11OnScopeExitD2Ev - -} // namespace LambdaContainingLocalClasses From af63e1b505453de3e6a281d1b72e62fa8d396b23 Mon Sep 17 00:00:00 2001 From: Erich Keane Date: Tue, 15 Apr 2025 15:52:04 -0700 Subject: [PATCH 053/710] [OpenACC][CIR] Implement 'self' lowering on compute constructs (#135851) This is our first attempt at lowering a clause that is an 'operand' in the OpenACC operand, so it does quite a bit of refactoring. 
My previous plans on how to emit the clauses was not viable, so we instead do 'create the op, then use the visitor to fill in the operands'. This resulted in the 'applyAttributes' function getting removed and a few other functions simplified. Additionally, it requires setting the insertion point a little to make sure we're inserting 'around' the operation correctly. Finally, since the OpenACC dialect only understands the MLIR types, we had to introduce a use of the unrealized-conversion-cast, which we'll probably getting good use out of in the future. --- clang/include/clang/AST/OpenACCClause.h | 5 + clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp | 216 +++++++++++--------- clang/test/CIR/CodeGenOpenACC/kernels.c | 30 ++- clang/test/CIR/CodeGenOpenACC/parallel.c | 30 ++- clang/test/CIR/CodeGenOpenACC/serial.c | 30 ++- 5 files changed, 205 insertions(+), 106 deletions(-) diff --git a/clang/include/clang/AST/OpenACCClause.h b/clang/include/clang/AST/OpenACCClause.h index 3687af76a559f..681567228cbb0 100644 --- a/clang/include/clang/AST/OpenACCClause.h +++ b/clang/include/clang/AST/OpenACCClause.h @@ -430,6 +430,11 @@ class OpenACCSelfClause final } bool isConditionExprClause() const { return HasConditionExpr.has_value(); } + bool isVarListClause() const { return !isConditionExprClause(); } + bool isEmptySelfClause() const { + return (isConditionExprClause() && !hasConditionExpr()) || + (!isConditionExprClause() && getVarList().empty()); + } bool hasConditionExpr() const { assert(HasConditionExpr.has_value() && diff --git a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp index 152f996ed0fed..3bcc6f908a841 100644 --- a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp @@ -32,46 +32,51 @@ constexpr bool isOneOfTypes = template constexpr bool isOneOfTypes = std::is_same_v; +template class OpenACCClauseCIREmitter final - : public OpenACCClauseVisitor { - CIRGenModule &cgm; + : public 
OpenACCClauseVisitor> { + OpTy &operation; + CIRGenFunction &cgf; + CIRGenBuilderTy &builder; + // This is necessary since a few of the clauses emit differently based on the // directive kind they are attached to. OpenACCDirectiveKind dirKind; + // TODO(cir): This source location should be able to go away once the NYI + // diagnostics are gone. SourceLocation dirLoc; - struct AttributeData { - // Value of the 'default' attribute, added on 'data' and 'compute'/etc - // constructs as a 'default-attr'. - std::optional defaultVal = std::nullopt; - // For directives that have their device type architectures listed in - // attributes (init/shutdown/etc), the list of architectures to be emitted. - llvm::SmallVector deviceTypeArchs{}; - } attrData; - void clauseNotImplemented(const OpenACCClause &c) { - cgm.errorNYI(c.getSourceRange(), "OpenACC Clause", c.getClauseKind()); + cgf.cgm.errorNYI(c.getSourceRange(), "OpenACC Clause", c.getClauseKind()); } public: - OpenACCClauseCIREmitter(CIRGenModule &cgm, OpenACCDirectiveKind dirKind, - SourceLocation dirLoc) - : cgm(cgm), dirKind(dirKind), dirLoc(dirLoc) {} + OpenACCClauseCIREmitter(OpTy &operation, CIRGenFunction &cgf, + CIRGenBuilderTy &builder, + OpenACCDirectiveKind dirKind, SourceLocation dirLoc) + : operation(operation), cgf(cgf), builder(builder), dirKind(dirKind), + dirLoc(dirLoc) {} void VisitClause(const OpenACCClause &clause) { clauseNotImplemented(clause); } void VisitDefaultClause(const OpenACCDefaultClause &clause) { - switch (clause.getDefaultClauseKind()) { - case OpenACCDefaultClauseKind::None: - attrData.defaultVal = ClauseDefaultValue::None; - break; - case OpenACCDefaultClauseKind::Present: - attrData.defaultVal = ClauseDefaultValue::Present; - break; - case OpenACCDefaultClauseKind::Invalid: - break; + // This type-trait checks if 'op'(the first arg) is one of the mlir::acc + // operations listed in the rest of the arguments. 
+ if constexpr (isOneOfTypes) { + switch (clause.getDefaultClauseKind()) { + case OpenACCDefaultClauseKind::None: + operation.setDefaultAttr(ClauseDefaultValue::None); + break; + case OpenACCDefaultClauseKind::Present: + operation.setDefaultAttr(ClauseDefaultValue::Present); + break; + case OpenACCDefaultClauseKind::Invalid: + break; + } + } else { + return clauseNotImplemented(clause); } } @@ -89,64 +94,70 @@ class OpenACCClauseCIREmitter final } void VisitDeviceTypeClause(const OpenACCDeviceTypeClause &clause) { + if constexpr (isOneOfTypes) { + llvm::SmallVector deviceTypes; + std::optional existingDeviceTypes = + operation.getDeviceTypes(); + + // Ensure we keep the existing ones, and in the correct 'new' order. + if (existingDeviceTypes) { + for (const mlir::Attribute &Attr : *existingDeviceTypes) + deviceTypes.push_back(mlir::acc::DeviceTypeAttr::get( + builder.getContext(), + cast(Attr).getValue())); + } - switch (dirKind) { - case OpenACCDirectiveKind::Init: - case OpenACCDirectiveKind::Set: - case OpenACCDirectiveKind::Shutdown: { - // Device type has a list that is either a 'star' (emitted as 'star'), - // or an identifer list, all of which get added for attributes. 
- - for (const DeviceTypeArgument &arg : clause.getArchitectures()) - attrData.deviceTypeArchs.push_back(decodeDeviceType(arg.first)); - break; - } - default: + for (const DeviceTypeArgument &arg : clause.getArchitectures()) { + deviceTypes.push_back(mlir::acc::DeviceTypeAttr::get( + builder.getContext(), decodeDeviceType(arg.first))); + } + operation.removeDeviceTypesAttr(); + operation.setDeviceTypesAttr( + mlir::ArrayAttr::get(builder.getContext(), deviceTypes)); + } else if constexpr (isOneOfTypes) { + assert(!operation.getDeviceTypeAttr() && "already have device-type?"); + assert(clause.getArchitectures().size() <= 1); + + if (!clause.getArchitectures().empty()) + operation.setDeviceType( + decodeDeviceType(clause.getArchitectures()[0].first)); + } else { return clauseNotImplemented(clause); } } - // Apply any of the clauses that resulted in an 'attribute'. - template - void applyAttributes(CIRGenBuilderTy &builder, Op &op) { - - if (attrData.defaultVal.has_value()) { - // FIXME: OpenACC: as we implement this for other directive kinds, we have - // to expand this list. - // This type-trait checks if 'op'(the first arg) is one of the mlir::acc - // operations listed in the rest of the arguments. - if constexpr (isOneOfTypes) - op.setDefaultAttr(*attrData.defaultVal); - else - cgm.errorNYI(dirLoc, "OpenACC 'default' clause lowering for ", dirKind); - } - - if (!attrData.deviceTypeArchs.empty()) { - // FIXME: OpenACC: as we implement this for other directive kinds, we have - // to expand this list, or more likely, have a 'noop' branch as most other - // uses of this apply to the operands instead. 
- // This type-trait checks if 'op'(the first arg) is one of the mlir::acc - if constexpr (isOneOfTypes) { - llvm::SmallVector deviceTypes; - for (mlir::acc::DeviceType DT : attrData.deviceTypeArchs) - deviceTypes.push_back( - mlir::acc::DeviceTypeAttr::get(builder.getContext(), DT)); - - op.setDeviceTypesAttr( - mlir::ArrayAttr::get(builder.getContext(), deviceTypes)); - } else if constexpr (isOneOfTypes) { - assert(attrData.deviceTypeArchs.size() <= 1 && - "Set can only have a single architecture"); - if (!attrData.deviceTypeArchs.empty()) - op.setDeviceType(attrData.deviceTypeArchs[0]); + void VisitSelfClause(const OpenACCSelfClause &clause) { + if constexpr (isOneOfTypes) { + if (clause.isEmptySelfClause()) { + operation.setSelfAttr(true); + } else if (clause.isConditionExprClause()) { + assert(clause.hasConditionExpr()); + mlir::Value condition = + cgf.evaluateExprAsBool(clause.getConditionExpr()); + + mlir::Location exprLoc = + cgf.cgm.getLoc(clause.getConditionExpr()->getBeginLoc()); + mlir::IntegerType targetType = mlir::IntegerType::get( + &cgf.getMLIRContext(), /*width=*/1, + mlir::IntegerType::SignednessSemantics::Signless); + auto conversionOp = builder.create( + exprLoc, targetType, condition); + operation.getSelfCondMutable().append(conversionOp.getResult(0)); } else { - cgm.errorNYI(dirLoc, "OpenACC 'device_type' clause lowering for ", - dirKind); + llvm_unreachable("var-list version of self shouldn't get here"); } + } else { + return clauseNotImplemented(clause); } } }; +template +auto makeClauseEmitter(OpTy &op, CIRGenFunction &cgf, CIRGenBuilderTy &builder, + OpenACCDirectiveKind dirKind, SourceLocation dirLoc) { + return OpenACCClauseCIREmitter(op, cgf, builder, dirKind, dirLoc); +} + } // namespace template @@ -158,24 +169,27 @@ mlir::LogicalResult CIRGenFunction::emitOpenACCOpAssociatedStmt( llvm::SmallVector retTy; llvm::SmallVector operands; - - // Clause-emitter must be here because it might modify operands. 
- OpenACCClauseCIREmitter clauseEmitter(getCIRGenModule(), dirKind, dirLoc); - clauseEmitter.VisitClauseList(clauses); - auto op = builder.create(start, retTy, operands); - // Apply the attributes derived from the clauses. - clauseEmitter.applyAttributes(builder, op); + { + mlir::OpBuilder::InsertionGuard guardCase(builder); + // Sets insertion point before the 'op', since every new expression needs to + // be before the operation. + builder.setInsertionPoint(op); + makeClauseEmitter(op, *this, builder, dirKind, dirLoc) + .VisitClauseList(clauses); + } - mlir::Block &block = op.getRegion().emplaceBlock(); - mlir::OpBuilder::InsertionGuard guardCase(builder); - builder.setInsertionPointToEnd(&block); + { + mlir::Block &block = op.getRegion().emplaceBlock(); + mlir::OpBuilder::InsertionGuard guardCase(builder); + builder.setInsertionPointToEnd(&block); - LexicalScope ls{*this, start, builder.getInsertionBlock()}; - res = emitStmt(associatedStmt, /*useCurrentScope=*/true); + LexicalScope ls{*this, start, builder.getInsertionBlock()}; + res = emitStmt(associatedStmt, /*useCurrentScope=*/true); - builder.create(end); + builder.create(end); + } return res; } @@ -187,14 +201,16 @@ mlir::LogicalResult CIRGenFunction::emitOpenACCOp( llvm::SmallVector retTy; llvm::SmallVector operands; - - // Clause-emitter must be here because it might modify operands. - OpenACCClauseCIREmitter clauseEmitter(getCIRGenModule(), dirKind, dirLoc); - clauseEmitter.VisitClauseList(clauses); - auto op = builder.create(start, retTy, operands); - // Apply the attributes derived from the clauses. - clauseEmitter.applyAttributes(builder, op); + + { + mlir::OpBuilder::InsertionGuard guardCase(builder); + // Sets insertion point before the 'op', since every new expression needs to + // be before the operation. 
+ builder.setInsertionPoint(op); + makeClauseEmitter(op, *this, builder, dirKind, dirLoc) + .VisitClauseList(clauses); + } return res; } @@ -254,46 +270,46 @@ mlir::LogicalResult CIRGenFunction::emitOpenACCShutdownConstruct( mlir::LogicalResult CIRGenFunction::emitOpenACCLoopConstruct(const OpenACCLoopConstruct &s) { - getCIRGenModule().errorNYI(s.getSourceRange(), "OpenACC Loop Construct"); + cgm.errorNYI(s.getSourceRange(), "OpenACC Loop Construct"); return mlir::failure(); } mlir::LogicalResult CIRGenFunction::emitOpenACCCombinedConstruct( const OpenACCCombinedConstruct &s) { - getCIRGenModule().errorNYI(s.getSourceRange(), "OpenACC Combined Construct"); + cgm.errorNYI(s.getSourceRange(), "OpenACC Combined Construct"); return mlir::failure(); } mlir::LogicalResult CIRGenFunction::emitOpenACCEnterDataConstruct( const OpenACCEnterDataConstruct &s) { - getCIRGenModule().errorNYI(s.getSourceRange(), "OpenACC EnterData Construct"); + cgm.errorNYI(s.getSourceRange(), "OpenACC EnterData Construct"); return mlir::failure(); } mlir::LogicalResult CIRGenFunction::emitOpenACCExitDataConstruct( const OpenACCExitDataConstruct &s) { - getCIRGenModule().errorNYI(s.getSourceRange(), "OpenACC ExitData Construct"); + cgm.errorNYI(s.getSourceRange(), "OpenACC ExitData Construct"); return mlir::failure(); } mlir::LogicalResult CIRGenFunction::emitOpenACCHostDataConstruct( const OpenACCHostDataConstruct &s) { - getCIRGenModule().errorNYI(s.getSourceRange(), "OpenACC HostData Construct"); + cgm.errorNYI(s.getSourceRange(), "OpenACC HostData Construct"); return mlir::failure(); } mlir::LogicalResult CIRGenFunction::emitOpenACCWaitConstruct(const OpenACCWaitConstruct &s) { - getCIRGenModule().errorNYI(s.getSourceRange(), "OpenACC Wait Construct"); + cgm.errorNYI(s.getSourceRange(), "OpenACC Wait Construct"); return mlir::failure(); } mlir::LogicalResult CIRGenFunction::emitOpenACCUpdateConstruct(const OpenACCUpdateConstruct &s) { - getCIRGenModule().errorNYI(s.getSourceRange(), 
"OpenACC Update Construct"); + cgm.errorNYI(s.getSourceRange(), "OpenACC Update Construct"); return mlir::failure(); } mlir::LogicalResult CIRGenFunction::emitOpenACCAtomicConstruct(const OpenACCAtomicConstruct &s) { - getCIRGenModule().errorNYI(s.getSourceRange(), "OpenACC Atomic Construct"); + cgm.errorNYI(s.getSourceRange(), "OpenACC Atomic Construct"); return mlir::failure(); } mlir::LogicalResult CIRGenFunction::emitOpenACCCacheConstruct(const OpenACCCacheConstruct &s) { - getCIRGenModule().errorNYI(s.getSourceRange(), "OpenACC Cache Construct"); + cgm.errorNYI(s.getSourceRange(), "OpenACC Cache Construct"); return mlir::failure(); } diff --git a/clang/test/CIR/CodeGenOpenACC/kernels.c b/clang/test/CIR/CodeGenOpenACC/kernels.c index 0c950fe3d0f9c..934daf9e8ecc0 100644 --- a/clang/test/CIR/CodeGenOpenACC/kernels.c +++ b/clang/test/CIR/CodeGenOpenACC/kernels.c @@ -1,7 +1,9 @@ // RUN: %clang_cc1 -fopenacc -emit-cir -fclangir %s -o - | FileCheck %s -void acc_kernels(void) { - // CHECK: cir.func @acc_kernels() { +void acc_kernels(int cond) { + // CHECK: cir.func @acc_kernels(%[[ARG:.*]]: !s32i{{.*}}) { + // CHECK-NEXT: %[[COND:.*]] = cir.alloca !s32i, !cir.ptr, ["cond", init] + // CHECK-NEXT: cir.store %[[ARG]], %[[COND]] : !s32i, !cir.ptr #pragma acc kernels {} @@ -38,5 +40,29 @@ void acc_kernels(void) { // CHECK-NEXT: acc.terminator // CHECK-NEXT:} +#pragma acc kernels self + {} + // CHECK-NEXT: acc.kernels { + // CHECK-NEXT: acc.terminator + // CHECK-NEXT: } attributes {selfAttr} + +#pragma acc kernels self(cond) + {} + // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr, !s32i + // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast(int_to_bool, %[[COND_LOAD]] : !s32i), !cir.bool + // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[BOOL_CAST]] : !cir.bool to i1 + // CHECK-NEXT: acc.kernels self(%[[CONV_CAST]]) { + // CHECK-NEXT: acc.terminator + // CHECK-NEXT: } loc + +#pragma acc kernels self(0) + {} + // CHECK-NEXT: 
%[[ZERO_LITERAL:.*]] = cir.const #cir.int<0> : !s32i + // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast(int_to_bool, %[[ZERO_LITERAL]] : !s32i), !cir.bool + // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[BOOL_CAST]] : !cir.bool to i1 + // CHECK-NEXT: acc.kernels self(%[[CONV_CAST]]) { + // CHECK-NEXT: acc.terminator + // CHECK-NEXT: } loc + // CHECK-NEXT: cir.return } diff --git a/clang/test/CIR/CodeGenOpenACC/parallel.c b/clang/test/CIR/CodeGenOpenACC/parallel.c index e18270435460c..c7a4bda6faa74 100644 --- a/clang/test/CIR/CodeGenOpenACC/parallel.c +++ b/clang/test/CIR/CodeGenOpenACC/parallel.c @@ -1,7 +1,9 @@ // RUN: %clang_cc1 -fopenacc -emit-cir -fclangir %s -o - | FileCheck %s -void acc_parallel(void) { - // CHECK: cir.func @acc_parallel() { +void acc_parallel(int cond) { + // CHECK: cir.func @acc_parallel(%[[ARG:.*]]: !s32i{{.*}}) { + // CHECK-NEXT: %[[COND:.*]] = cir.alloca !s32i, !cir.ptr, ["cond", init] + // CHECK-NEXT: cir.store %[[ARG]], %[[COND]] : !s32i, !cir.ptr #pragma acc parallel {} // CHECK-NEXT: acc.parallel { @@ -37,5 +39,29 @@ void acc_parallel(void) { // CHECK-NEXT: acc.yield // CHECK-NEXT:} +#pragma acc parallel self + {} + // CHECK-NEXT: acc.parallel { + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } attributes {selfAttr} + +#pragma acc parallel self(cond) + {} + // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr, !s32i + // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast(int_to_bool, %[[COND_LOAD]] : !s32i), !cir.bool + // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[BOOL_CAST]] : !cir.bool to i1 + // CHECK-NEXT: acc.parallel self(%[[CONV_CAST]]) { + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc parallel self(0) + {} + // CHECK-NEXT: %[[ZERO_LITERAL:.*]] = cir.const #cir.int<0> : !s32i + // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast(int_to_bool, %[[ZERO_LITERAL]] : !s32i), !cir.bool + // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[BOOL_CAST]] : 
!cir.bool to i1 + // CHECK-NEXT: acc.parallel self(%[[CONV_CAST]]) { + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + // CHECK-NEXT: cir.return } diff --git a/clang/test/CIR/CodeGenOpenACC/serial.c b/clang/test/CIR/CodeGenOpenACC/serial.c index 72a0995549da3..38a38ad6c9514 100644 --- a/clang/test/CIR/CodeGenOpenACC/serial.c +++ b/clang/test/CIR/CodeGenOpenACC/serial.c @@ -1,7 +1,9 @@ // RUN: %clang_cc1 -fopenacc -emit-cir -fclangir %s -o - | FileCheck %s -void acc_serial(void) { - // CHECK: cir.func @acc_serial() { +void acc_serial(int cond) { + // CHECK: cir.func @acc_serial(%[[ARG:.*]]: !s32i{{.*}}) { + // CHECK-NEXT: %[[COND:.*]] = cir.alloca !s32i, !cir.ptr, ["cond", init] + // CHECK-NEXT: cir.store %[[ARG]], %[[COND]] : !s32i, !cir.ptr #pragma acc serial {} @@ -38,5 +40,29 @@ void acc_serial(void) { // CHECK-NEXT: acc.yield // CHECK-NEXT:} +#pragma acc serial self + {} + // CHECK-NEXT: acc.serial { + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } attributes {selfAttr} + +#pragma acc serial self(cond) + {} + // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr, !s32i + // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast(int_to_bool, %[[COND_LOAD]] : !s32i), !cir.bool + // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[BOOL_CAST]] : !cir.bool to i1 + // CHECK-NEXT: acc.serial self(%[[CONV_CAST]]) { + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc serial self(0) + {} + // CHECK-NEXT: %[[ZERO_LITERAL:.*]] = cir.const #cir.int<0> : !s32i + // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast(int_to_bool, %[[ZERO_LITERAL]] : !s32i), !cir.bool + // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[BOOL_CAST]] : !cir.bool to i1 + // CHECK-NEXT: acc.serial self(%[[CONV_CAST]]) { + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + // CHECK-NEXT: cir.return } From e4d951d2e42a9124bd87275a864804c4b84b62e3 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Tue, 15 Apr 2025 16:03:03 -0700 Subject: [PATCH 054/710] 
LowerTypeTests: Fix quadratic complexity. Currently we have quadratic complexity in LowerTypeTests because ScopedSaveAliaseesAndUsed loops over all aliases for each disjoint set, and the number of aliases and number of disjoint sets is roughly proportional to the program size. Fix that by moving ScopedSaveAliaseesAndUsed to LowerTypeTestsModule::lower() so that we do this only once. Reviewers: fmayer, vitalybuka Reviewed By: vitalybuka Pull Request: https://github.com/llvm/llvm-project/pull/135875 --- llvm/lib/Transforms/IPO/LowerTypeTests.cpp | 164 ++++++++++----------- 1 file changed, 81 insertions(+), 83 deletions(-) diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp index 7cf7d74acfcfa..8e9f24dfc31fa 100644 --- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp @@ -1669,61 +1669,55 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative( lowerTypeTestCalls(TypeIds, JumpTable, GlobalLayout); - { - ScopedSaveAliaseesAndUsed S(M); + // Build aliases pointing to offsets into the jump table, and replace + // references to the original functions with references to the aliases. + for (unsigned I = 0; I != Functions.size(); ++I) { + Function *F = cast(Functions[I]->getGlobal()); + bool IsJumpTableCanonical = Functions[I]->isJumpTableCanonical(); - // Build aliases pointing to offsets into the jump table, and replace - // references to the original functions with references to the aliases. - for (unsigned I = 0; I != Functions.size(); ++I) { - Function *F = cast(Functions[I]->getGlobal()); - bool IsJumpTableCanonical = Functions[I]->isJumpTableCanonical(); - - Constant *CombinedGlobalElemPtr = ConstantExpr::getInBoundsGetElementPtr( - JumpTableType, JumpTable, - ArrayRef{ConstantInt::get(IntPtrTy, 0), - ConstantInt::get(IntPtrTy, I)}); - - const bool IsExported = Functions[I]->isExported(); - if (!IsJumpTableCanonical) { - GlobalValue::LinkageTypes LT = IsExported - ? 
GlobalValue::ExternalLinkage - : GlobalValue::InternalLinkage; - GlobalAlias *JtAlias = GlobalAlias::create(F->getValueType(), 0, LT, - F->getName() + ".cfi_jt", - CombinedGlobalElemPtr, &M); - if (IsExported) - JtAlias->setVisibility(GlobalValue::HiddenVisibility); - else - appendToUsed(M, {JtAlias}); - } + Constant *CombinedGlobalElemPtr = ConstantExpr::getInBoundsGetElementPtr( + JumpTableType, JumpTable, + ArrayRef{ConstantInt::get(IntPtrTy, 0), + ConstantInt::get(IntPtrTy, I)}); + + const bool IsExported = Functions[I]->isExported(); + if (!IsJumpTableCanonical) { + GlobalValue::LinkageTypes LT = IsExported ? GlobalValue::ExternalLinkage + : GlobalValue::InternalLinkage; + GlobalAlias *JtAlias = GlobalAlias::create(F->getValueType(), 0, LT, + F->getName() + ".cfi_jt", + CombinedGlobalElemPtr, &M); + if (IsExported) + JtAlias->setVisibility(GlobalValue::HiddenVisibility); + else + appendToUsed(M, {JtAlias}); + } - if (IsExported) { - if (IsJumpTableCanonical) - ExportSummary->cfiFunctionDefs().emplace(F->getName()); - else - ExportSummary->cfiFunctionDecls().emplace(F->getName()); - } + if (IsExported) { + if (IsJumpTableCanonical) + ExportSummary->cfiFunctionDefs().emplace(F->getName()); + else + ExportSummary->cfiFunctionDecls().emplace(F->getName()); + } - if (!IsJumpTableCanonical) { - if (F->hasExternalWeakLinkage()) - replaceWeakDeclarationWithJumpTablePtr(F, CombinedGlobalElemPtr, - IsJumpTableCanonical); - else - replaceCfiUses(F, CombinedGlobalElemPtr, IsJumpTableCanonical); - } else { - assert(F->getType()->getAddressSpace() == 0); - - GlobalAlias *FAlias = - GlobalAlias::create(F->getValueType(), 0, F->getLinkage(), "", - CombinedGlobalElemPtr, &M); - FAlias->setVisibility(F->getVisibility()); - FAlias->takeName(F); - if (FAlias->hasName()) - F->setName(FAlias->getName() + ".cfi"); - replaceCfiUses(F, FAlias, IsJumpTableCanonical); - if (!F->hasLocalLinkage()) - F->setVisibility(GlobalVariable::HiddenVisibility); - } + if (!IsJumpTableCanonical) { + 
if (F->hasExternalWeakLinkage()) + replaceWeakDeclarationWithJumpTablePtr(F, CombinedGlobalElemPtr, + IsJumpTableCanonical); + else + replaceCfiUses(F, CombinedGlobalElemPtr, IsJumpTableCanonical); + } else { + assert(F->getType()->getAddressSpace() == 0); + + GlobalAlias *FAlias = GlobalAlias::create( + F->getValueType(), 0, F->getLinkage(), "", CombinedGlobalElemPtr, &M); + FAlias->setVisibility(F->getVisibility()); + FAlias->takeName(F); + if (FAlias->hasName()) + F->setName(FAlias->getName() + ".cfi"); + replaceCfiUses(F, FAlias, IsJumpTableCanonical); + if (!F->hasLocalLinkage()) + F->setVisibility(GlobalVariable::HiddenVisibility); } } @@ -2339,39 +2333,43 @@ bool LowerTypeTestsModule::lower() { if (GlobalClasses.empty()) return false; - // For each disjoint set we found... - for (const auto &C : GlobalClasses) { - if (!C->isLeader()) - continue; - - ++NumTypeIdDisjointSets; - // Build the list of type identifiers in this disjoint set. - std::vector TypeIds; - std::vector Globals; - std::vector ICallBranchFunnels; - for (auto M : GlobalClasses.members(*C)) { - if (isa(M)) - TypeIds.push_back(cast(M)); - else if (isa(M)) - Globals.push_back(cast(M)); - else - ICallBranchFunnels.push_back(cast(M)); - } - - // Order type identifiers by unique ID for determinism. This ordering is - // stable as there is a one-to-one mapping between metadata and unique IDs. - llvm::sort(TypeIds, [&](Metadata *M1, Metadata *M2) { - return TypeIdInfo[M1].UniqueId < TypeIdInfo[M2].UniqueId; - }); + { + ScopedSaveAliaseesAndUsed S(M); + // For each disjoint set we found... + for (const auto &C : GlobalClasses) { + if (!C->isLeader()) + continue; - // Same for the branch funnels. - llvm::sort(ICallBranchFunnels, - [&](ICallBranchFunnel *F1, ICallBranchFunnel *F2) { - return F1->UniqueId < F2->UniqueId; - }); + ++NumTypeIdDisjointSets; + // Build the list of type identifiers in this disjoint set. 
+ std::vector TypeIds; + std::vector Globals; + std::vector ICallBranchFunnels; + for (auto M : GlobalClasses.members(*C)) { + if (isa(M)) + TypeIds.push_back(cast(M)); + else if (isa(M)) + Globals.push_back(cast(M)); + else + ICallBranchFunnels.push_back(cast(M)); + } - // Build bitsets for this disjoint set. - buildBitSetsFromDisjointSet(TypeIds, Globals, ICallBranchFunnels); + // Order type identifiers by unique ID for determinism. This ordering is + // stable as there is a one-to-one mapping between metadata and unique + // IDs. + llvm::sort(TypeIds, [&](Metadata *M1, Metadata *M2) { + return TypeIdInfo[M1].UniqueId < TypeIdInfo[M2].UniqueId; + }); + + // Same for the branch funnels. + llvm::sort(ICallBranchFunnels, + [&](ICallBranchFunnel *F1, ICallBranchFunnel *F2) { + return F1->UniqueId < F2->UniqueId; + }); + + // Build bitsets for this disjoint set. + buildBitSetsFromDisjointSet(TypeIds, Globals, ICallBranchFunnels); + } } allocateByteArrays(); From 3428cc94c893f9a09728c707baf018b9cdfaf243 Mon Sep 17 00:00:00 2001 From: Eugene Epshteyn Date: Tue, 15 Apr 2025 19:04:59 -0400 Subject: [PATCH 055/710] [flang] Implement external routine usage of hostnm() (#134900) Previously, `hostnm` extended intrinsic was implemented as proper intrinsic. Since then we found out that some applications use `hostnm` as external routine via `external hostnm`. This prevents `hostnm` from being recognized as an intrinsic. This PR implements `hostnm` as external routine. 
--- flang-rt/lib/runtime/extensions.cpp | 32 ++++++++++++++++++++++++ flang/include/flang/Runtime/extensions.h | 2 +- 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/flang-rt/lib/runtime/extensions.cpp b/flang-rt/lib/runtime/extensions.cpp index 6b553ff97e5ab..e70dff3997233 100644 --- a/flang-rt/lib/runtime/extensions.cpp +++ b/flang-rt/lib/runtime/extensions.cpp @@ -264,6 +264,38 @@ int RTNAME(Chdir)(const char *name) { #endif } +int FORTRAN_PROCEDURE_NAME(hostnm)(char *hn, int length) { + std::int32_t status{0}; + + if (!hn || length < 0) { + return EINVAL; + } + +#ifdef _WIN32 + DWORD dwSize{static_cast(length)}; + + // Note: Winsock has gethostname(), but use Win32 API GetComputerNameEx(), + // in order to avoid adding dependency on Winsock. + if (!GetComputerNameExA(ComputerNameDnsHostname, hn, &dwSize)) { + status = GetLastError(); + } +#else + if (gethostname(hn, length) < 0) { + status = errno; + } +#endif + + if (status == 0) { + // Find zero terminator and fill the string from the + // zero terminator to the end with spaces + char *str_end{hn + length}; + char *str_zero{std::find(hn, str_end, '\0')}; + std::fill(str_zero, str_end, ' '); + } + + return status; +} + int FORTRAN_PROCEDURE_NAME(ierrno)() { return errno; } void FORTRAN_PROCEDURE_NAME(qsort)(int *array, int *len, int *isize, diff --git a/flang/include/flang/Runtime/extensions.h b/flang/include/flang/Runtime/extensions.h index db2245875e85a..06ae7f35d9b5b 100644 --- a/flang/include/flang/Runtime/extensions.h +++ b/flang/include/flang/Runtime/extensions.h @@ -60,7 +60,7 @@ uid_t RTNAME(GetUID)(); void FORTRAN_PROCEDURE_NAME(getlog)(char *name, std::int64_t length); // GNU extension subroutine HOSTNM(C) -void FORTRAN_PROCEDURE_NAME(hostnm)(char *name, std::int64_t length); +int FORTRAN_PROCEDURE_NAME(hostnm)(char *hn, int length); std::intptr_t RTNAME(Malloc)(std::size_t size); From 58c3fba7063eaca926931a412c329e9ac4deefd6 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Tue, 
15 Apr 2025 16:20:45 -0700 Subject: [PATCH 056/710] Revert "LowerTypeTests: Fix quadratic complexity." This reverts commit e4d951d2e42a9124bd87275a864804c4b84b62e3. Need to investigate some test failures. --- llvm/lib/Transforms/IPO/LowerTypeTests.cpp | 164 +++++++++++---------- 1 file changed, 83 insertions(+), 81 deletions(-) diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp index 8e9f24dfc31fa..7cf7d74acfcfa 100644 --- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp @@ -1669,55 +1669,61 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative( lowerTypeTestCalls(TypeIds, JumpTable, GlobalLayout); - // Build aliases pointing to offsets into the jump table, and replace - // references to the original functions with references to the aliases. - for (unsigned I = 0; I != Functions.size(); ++I) { - Function *F = cast(Functions[I]->getGlobal()); - bool IsJumpTableCanonical = Functions[I]->isJumpTableCanonical(); + { + ScopedSaveAliaseesAndUsed S(M); - Constant *CombinedGlobalElemPtr = ConstantExpr::getInBoundsGetElementPtr( - JumpTableType, JumpTable, - ArrayRef{ConstantInt::get(IntPtrTy, 0), - ConstantInt::get(IntPtrTy, I)}); - - const bool IsExported = Functions[I]->isExported(); - if (!IsJumpTableCanonical) { - GlobalValue::LinkageTypes LT = IsExported ? GlobalValue::ExternalLinkage - : GlobalValue::InternalLinkage; - GlobalAlias *JtAlias = GlobalAlias::create(F->getValueType(), 0, LT, - F->getName() + ".cfi_jt", - CombinedGlobalElemPtr, &M); - if (IsExported) - JtAlias->setVisibility(GlobalValue::HiddenVisibility); - else - appendToUsed(M, {JtAlias}); - } + // Build aliases pointing to offsets into the jump table, and replace + // references to the original functions with references to the aliases. 
+ for (unsigned I = 0; I != Functions.size(); ++I) { + Function *F = cast(Functions[I]->getGlobal()); + bool IsJumpTableCanonical = Functions[I]->isJumpTableCanonical(); + + Constant *CombinedGlobalElemPtr = ConstantExpr::getInBoundsGetElementPtr( + JumpTableType, JumpTable, + ArrayRef{ConstantInt::get(IntPtrTy, 0), + ConstantInt::get(IntPtrTy, I)}); + + const bool IsExported = Functions[I]->isExported(); + if (!IsJumpTableCanonical) { + GlobalValue::LinkageTypes LT = IsExported + ? GlobalValue::ExternalLinkage + : GlobalValue::InternalLinkage; + GlobalAlias *JtAlias = GlobalAlias::create(F->getValueType(), 0, LT, + F->getName() + ".cfi_jt", + CombinedGlobalElemPtr, &M); + if (IsExported) + JtAlias->setVisibility(GlobalValue::HiddenVisibility); + else + appendToUsed(M, {JtAlias}); + } - if (IsExported) { - if (IsJumpTableCanonical) - ExportSummary->cfiFunctionDefs().emplace(F->getName()); - else - ExportSummary->cfiFunctionDecls().emplace(F->getName()); - } + if (IsExported) { + if (IsJumpTableCanonical) + ExportSummary->cfiFunctionDefs().emplace(F->getName()); + else + ExportSummary->cfiFunctionDecls().emplace(F->getName()); + } - if (!IsJumpTableCanonical) { - if (F->hasExternalWeakLinkage()) - replaceWeakDeclarationWithJumpTablePtr(F, CombinedGlobalElemPtr, - IsJumpTableCanonical); - else - replaceCfiUses(F, CombinedGlobalElemPtr, IsJumpTableCanonical); - } else { - assert(F->getType()->getAddressSpace() == 0); - - GlobalAlias *FAlias = GlobalAlias::create( - F->getValueType(), 0, F->getLinkage(), "", CombinedGlobalElemPtr, &M); - FAlias->setVisibility(F->getVisibility()); - FAlias->takeName(F); - if (FAlias->hasName()) - F->setName(FAlias->getName() + ".cfi"); - replaceCfiUses(F, FAlias, IsJumpTableCanonical); - if (!F->hasLocalLinkage()) - F->setVisibility(GlobalVariable::HiddenVisibility); + if (!IsJumpTableCanonical) { + if (F->hasExternalWeakLinkage()) + replaceWeakDeclarationWithJumpTablePtr(F, CombinedGlobalElemPtr, + IsJumpTableCanonical); + else + 
replaceCfiUses(F, CombinedGlobalElemPtr, IsJumpTableCanonical); + } else { + assert(F->getType()->getAddressSpace() == 0); + + GlobalAlias *FAlias = + GlobalAlias::create(F->getValueType(), 0, F->getLinkage(), "", + CombinedGlobalElemPtr, &M); + FAlias->setVisibility(F->getVisibility()); + FAlias->takeName(F); + if (FAlias->hasName()) + F->setName(FAlias->getName() + ".cfi"); + replaceCfiUses(F, FAlias, IsJumpTableCanonical); + if (!F->hasLocalLinkage()) + F->setVisibility(GlobalVariable::HiddenVisibility); + } } } @@ -2333,43 +2339,39 @@ bool LowerTypeTestsModule::lower() { if (GlobalClasses.empty()) return false; - { - ScopedSaveAliaseesAndUsed S(M); - // For each disjoint set we found... - for (const auto &C : GlobalClasses) { - if (!C->isLeader()) - continue; - - ++NumTypeIdDisjointSets; - // Build the list of type identifiers in this disjoint set. - std::vector TypeIds; - std::vector Globals; - std::vector ICallBranchFunnels; - for (auto M : GlobalClasses.members(*C)) { - if (isa(M)) - TypeIds.push_back(cast(M)); - else if (isa(M)) - Globals.push_back(cast(M)); - else - ICallBranchFunnels.push_back(cast(M)); - } + // For each disjoint set we found... + for (const auto &C : GlobalClasses) { + if (!C->isLeader()) + continue; - // Order type identifiers by unique ID for determinism. This ordering is - // stable as there is a one-to-one mapping between metadata and unique - // IDs. - llvm::sort(TypeIds, [&](Metadata *M1, Metadata *M2) { - return TypeIdInfo[M1].UniqueId < TypeIdInfo[M2].UniqueId; - }); - - // Same for the branch funnels. - llvm::sort(ICallBranchFunnels, - [&](ICallBranchFunnel *F1, ICallBranchFunnel *F2) { - return F1->UniqueId < F2->UniqueId; - }); - - // Build bitsets for this disjoint set. - buildBitSetsFromDisjointSet(TypeIds, Globals, ICallBranchFunnels); + ++NumTypeIdDisjointSets; + // Build the list of type identifiers in this disjoint set. 
+ std::vector TypeIds; + std::vector Globals; + std::vector ICallBranchFunnels; + for (auto M : GlobalClasses.members(*C)) { + if (isa(M)) + TypeIds.push_back(cast(M)); + else if (isa(M)) + Globals.push_back(cast(M)); + else + ICallBranchFunnels.push_back(cast(M)); } + + // Order type identifiers by unique ID for determinism. This ordering is + // stable as there is a one-to-one mapping between metadata and unique IDs. + llvm::sort(TypeIds, [&](Metadata *M1, Metadata *M2) { + return TypeIdInfo[M1].UniqueId < TypeIdInfo[M2].UniqueId; + }); + + // Same for the branch funnels. + llvm::sort(ICallBranchFunnels, + [&](ICallBranchFunnel *F1, ICallBranchFunnel *F2) { + return F1->UniqueId < F2->UniqueId; + }); + + // Build bitsets for this disjoint set. + buildBitSetsFromDisjointSet(TypeIds, Globals, ICallBranchFunnels); } allocateByteArrays(); From 860d0383db80ea881e957a5628d04e9d725b919d Mon Sep 17 00:00:00 2001 From: Haowei Date: Tue, 15 Apr 2025 16:23:53 -0700 Subject: [PATCH 057/710] [Fuchsia] Not building llvm-mt when LIBXML2 is not enabled. (#135877) This patch prevents including the llvm-mt to `LLVM_TOOLCHAIN_TOOLS` in the Fuchsia toolchain when LIBXML2 is not explicitly enabled. 
--- clang/cmake/caches/Fuchsia-stage2.cmake | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/clang/cmake/caches/Fuchsia-stage2.cmake b/clang/cmake/caches/Fuchsia-stage2.cmake index 99890b8246ad7..e10855f5ef31b 100644 --- a/clang/cmake/caches/Fuchsia-stage2.cmake +++ b/clang/cmake/caches/Fuchsia-stage2.cmake @@ -459,7 +459,6 @@ set(LLVM_TOOLCHAIN_TOOLS llvm-libtool-darwin llvm-lipo llvm-ml - llvm-mt llvm-nm llvm-objcopy llvm-objdump @@ -481,6 +480,10 @@ set(LLVM_TOOLCHAIN_TOOLS scan-build-py CACHE STRING "") +if (LLVM_ENABLE_LIBXML2) + list(APPEND LLVM_TOOLCHAIN_TOOLS llvm-mt) +endif() + set(LLVM_Toolchain_DISTRIBUTION_COMPONENTS bolt clang From 6ad922b75a41911e0e394d5d367bee1240ad509f Mon Sep 17 00:00:00 2001 From: erichkeane Date: Tue, 15 Apr 2025 16:14:49 -0700 Subject: [PATCH 058/710] [OpenACC][CIR] Implement lowering for 'if' on compute constructs This is the same for these as the 'self' was, except it doesn't support the 'empty' variant, so we have to just generate the condition. This patch does that, and extracts the 'condition' emission to a separate function since the two share it. --- clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp | 40 +++++++++++++----- clang/test/CIR/CodeGenOpenACC/kernels.c | 46 ++++++++++++++++++++- clang/test/CIR/CodeGenOpenACC/parallel.c | 44 +++++++++++++++++++- clang/test/CIR/CodeGenOpenACC/serial.c | 44 +++++++++++++++++++- 4 files changed, 159 insertions(+), 15 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp index 3bcc6f908a841..c14ff9a16841d 100644 --- a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp @@ -50,6 +50,21 @@ class OpenACCClauseCIREmitter final cgf.cgm.errorNYI(c.getSourceRange(), "OpenACC Clause", c.getClauseKind()); } + // 'condition' as an OpenACC grammar production is used for 'if' and (some + // variants of) 'self'. 
It needs to be emitted as a signless-1-bit value, so + // this function emits the expression, then sets the unrealized conversion + // cast correctly, and returns the completed value. + mlir::Value createCondition(const Expr *condExpr) { + mlir::Value condition = cgf.evaluateExprAsBool(condExpr); + mlir::Location exprLoc = cgf.cgm.getLoc(condExpr->getBeginLoc()); + mlir::IntegerType targetType = mlir::IntegerType::get( + &cgf.getMLIRContext(), /*width=*/1, + mlir::IntegerType::SignednessSemantics::Signless); + auto conversionOp = builder.create( + exprLoc, targetType, condition); + return conversionOp.getResult(0); + } + public: OpenACCClauseCIREmitter(OpTy &operation, CIRGenFunction &cgf, CIRGenBuilderTy &builder, @@ -132,17 +147,8 @@ class OpenACCClauseCIREmitter final operation.setSelfAttr(true); } else if (clause.isConditionExprClause()) { assert(clause.hasConditionExpr()); - mlir::Value condition = - cgf.evaluateExprAsBool(clause.getConditionExpr()); - - mlir::Location exprLoc = - cgf.cgm.getLoc(clause.getConditionExpr()->getBeginLoc()); - mlir::IntegerType targetType = mlir::IntegerType::get( - &cgf.getMLIRContext(), /*width=*/1, - mlir::IntegerType::SignednessSemantics::Signless); - auto conversionOp = builder.create( - exprLoc, targetType, condition); - operation.getSelfCondMutable().append(conversionOp.getResult(0)); + operation.getSelfCondMutable().append( + createCondition(clause.getConditionExpr())); } else { llvm_unreachable("var-list version of self shouldn't get here"); } @@ -150,6 +156,18 @@ class OpenACCClauseCIREmitter final return clauseNotImplemented(clause); } } + + void VisitIfClause(const OpenACCIfClause &clause) { + if constexpr (isOneOfTypes) { + operation.getIfCondMutable().append( + createCondition(clause.getConditionExpr())); + } else { + // 'if' applies to most of the constructs, but hold off on lowering them + // until we can write tests/know what we're doing with codegen to make + // sure we get it right. 
+ return clauseNotImplemented(clause); + } + } }; template diff --git a/clang/test/CIR/CodeGenOpenACC/kernels.c b/clang/test/CIR/CodeGenOpenACC/kernels.c index 934daf9e8ecc0..ca5bfebcb4ff3 100644 --- a/clang/test/CIR/CodeGenOpenACC/kernels.c +++ b/clang/test/CIR/CodeGenOpenACC/kernels.c @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fopenacc -emit-cir -fclangir %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir %s -o - | FileCheck %s void acc_kernels(int cond) { // CHECK: cir.func @acc_kernels(%[[ARG:.*]]: !s32i{{.*}}) { @@ -63,6 +63,48 @@ void acc_kernels(int cond) { // CHECK-NEXT: acc.kernels self(%[[CONV_CAST]]) { // CHECK-NEXT: acc.terminator // CHECK-NEXT: } loc - + +#pragma acc kernels if(cond) + {} + // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr, !s32i + // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast(int_to_bool, %[[COND_LOAD]] : !s32i), !cir.bool + // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[BOOL_CAST]] : !cir.bool to i1 + // CHECK-NEXT: acc.kernels if(%[[CONV_CAST]]) { + // CHECK-NEXT: acc.terminator + // CHECK-NEXT: } loc + +#pragma acc kernels if(1) + {} + // CHECK-NEXT: %[[ONE_LITERAL:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast(int_to_bool, %[[ONE_LITERAL]] : !s32i), !cir.bool + // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[BOOL_CAST]] : !cir.bool to i1 + // CHECK-NEXT: acc.kernels if(%[[CONV_CAST]]) { + // CHECK-NEXT: acc.terminator + // CHECK-NEXT: } loc + +#pragma acc kernels if(cond == 1) + {} + // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr, !s32i + // CHECK-NEXT: %[[ONE_LITERAL:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[EQ_RES:.*]] = cir.cmp(eq, %[[COND_LOAD]], %[[ONE_LITERAL]]) : !s32i, !cir.bool + // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[EQ_RES]] : !cir.bool to i1 + // CHECK-NEXT: acc.kernels if(%[[CONV_CAST]]) { + // 
CHECK-NEXT: acc.terminator + // CHECK-NEXT: } loc + +#pragma acc kernels if(cond == 1) self(cond == 2) + {} + // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr, !s32i + // CHECK-NEXT: %[[ONE_LITERAL:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[EQ_RES_IF:.*]] = cir.cmp(eq, %[[COND_LOAD]], %[[ONE_LITERAL]]) : !s32i, !cir.bool + // CHECK-NEXT: %[[CONV_CAST_IF:.*]] = builtin.unrealized_conversion_cast %[[EQ_RES_IF]] : !cir.bool to i1 + // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr, !s32i + // CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i + // CHECK-NEXT: %[[EQ_RES_SELF:.*]] = cir.cmp(eq, %[[COND_LOAD]], %[[TWO_LITERAL]]) : !s32i, !cir.bool + // CHECK-NEXT: %[[CONV_CAST_SELF:.*]] = builtin.unrealized_conversion_cast %[[EQ_RES_SELF]] : !cir.bool to i1 + // CHECK-NEXT: acc.kernels self(%[[CONV_CAST_SELF]]) if(%[[CONV_CAST_IF]]) { + // CHECK-NEXT: acc.terminator + // CHECK-NEXT: } loc + // CHECK-NEXT: cir.return } diff --git a/clang/test/CIR/CodeGenOpenACC/parallel.c b/clang/test/CIR/CodeGenOpenACC/parallel.c index c7a4bda6faa74..3fb0b987409db 100644 --- a/clang/test/CIR/CodeGenOpenACC/parallel.c +++ b/clang/test/CIR/CodeGenOpenACC/parallel.c @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fopenacc -emit-cir -fclangir %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir %s -o - | FileCheck %s void acc_parallel(int cond) { // CHECK: cir.func @acc_parallel(%[[ARG:.*]]: !s32i{{.*}}) { @@ -63,5 +63,47 @@ void acc_parallel(int cond) { // CHECK-NEXT: acc.yield // CHECK-NEXT: } loc +#pragma acc parallel if(cond) + {} + // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr, !s32i + // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast(int_to_bool, %[[COND_LOAD]] : !s32i), !cir.bool + // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[BOOL_CAST]] : !cir.bool to i1 + // CHECK-NEXT: acc.parallel if(%[[CONV_CAST]]) { + // CHECK-NEXT: acc.yield + // 
CHECK-NEXT: } loc + +#pragma acc parallel if(1) + {} + // CHECK-NEXT: %[[ONE_LITERAL:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast(int_to_bool, %[[ONE_LITERAL]] : !s32i), !cir.bool + // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[BOOL_CAST]] : !cir.bool to i1 + // CHECK-NEXT: acc.parallel if(%[[CONV_CAST]]) { + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc parallel if(cond == 1) + {} + // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr, !s32i + // CHECK-NEXT: %[[ONE_LITERAL:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[EQ_RES:.*]] = cir.cmp(eq, %[[COND_LOAD]], %[[ONE_LITERAL]]) : !s32i, !cir.bool + // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[EQ_RES]] : !cir.bool to i1 + // CHECK-NEXT: acc.parallel if(%[[CONV_CAST]]) { + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc parallel if(cond == 1) self(cond == 2) + {} + // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr, !s32i + // CHECK-NEXT: %[[ONE_LITERAL:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[EQ_RES_IF:.*]] = cir.cmp(eq, %[[COND_LOAD]], %[[ONE_LITERAL]]) : !s32i, !cir.bool + // CHECK-NEXT: %[[CONV_CAST_IF:.*]] = builtin.unrealized_conversion_cast %[[EQ_RES_IF]] : !cir.bool to i1 + // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr, !s32i + // CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i + // CHECK-NEXT: %[[EQ_RES_SELF:.*]] = cir.cmp(eq, %[[COND_LOAD]], %[[TWO_LITERAL]]) : !s32i, !cir.bool + // CHECK-NEXT: %[[CONV_CAST_SELF:.*]] = builtin.unrealized_conversion_cast %[[EQ_RES_SELF]] : !cir.bool to i1 + // CHECK-NEXT: acc.parallel self(%[[CONV_CAST_SELF]]) if(%[[CONV_CAST_IF]]) { + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + // CHECK-NEXT: cir.return } diff --git a/clang/test/CIR/CodeGenOpenACC/serial.c b/clang/test/CIR/CodeGenOpenACC/serial.c index 38a38ad6c9514..b72f44a2ea473 100644 --- 
a/clang/test/CIR/CodeGenOpenACC/serial.c +++ b/clang/test/CIR/CodeGenOpenACC/serial.c @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fopenacc -emit-cir -fclangir %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir %s -o - | FileCheck %s void acc_serial(int cond) { // CHECK: cir.func @acc_serial(%[[ARG:.*]]: !s32i{{.*}}) { @@ -64,5 +64,47 @@ void acc_serial(int cond) { // CHECK-NEXT: acc.yield // CHECK-NEXT: } loc +#pragma acc serial if(cond) + {} + // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr, !s32i + // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast(int_to_bool, %[[COND_LOAD]] : !s32i), !cir.bool + // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[BOOL_CAST]] : !cir.bool to i1 + // CHECK-NEXT: acc.serial if(%[[CONV_CAST]]) { + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc serial if(1) + {} + // CHECK-NEXT: %[[ONE_LITERAL:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast(int_to_bool, %[[ONE_LITERAL]] : !s32i), !cir.bool + // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[BOOL_CAST]] : !cir.bool to i1 + // CHECK-NEXT: acc.serial if(%[[CONV_CAST]]) { + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc serial if(cond == 1) + {} + // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr, !s32i + // CHECK-NEXT: %[[ONE_LITERAL:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[EQ_RES:.*]] = cir.cmp(eq, %[[COND_LOAD]], %[[ONE_LITERAL]]) : !s32i, !cir.bool + // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[EQ_RES]] : !cir.bool to i1 + // CHECK-NEXT: acc.serial if(%[[CONV_CAST]]) { + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc serial if(cond == 1) self(cond == 2) + {} + // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr, !s32i + // CHECK-NEXT: %[[ONE_LITERAL:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[EQ_RES_IF:.*]] = 
cir.cmp(eq, %[[COND_LOAD]], %[[ONE_LITERAL]]) : !s32i, !cir.bool + // CHECK-NEXT: %[[CONV_CAST_IF:.*]] = builtin.unrealized_conversion_cast %[[EQ_RES_IF]] : !cir.bool to i1 + // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr, !s32i + // CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i + // CHECK-NEXT: %[[EQ_RES_SELF:.*]] = cir.cmp(eq, %[[COND_LOAD]], %[[TWO_LITERAL]]) : !s32i, !cir.bool + // CHECK-NEXT: %[[CONV_CAST_SELF:.*]] = builtin.unrealized_conversion_cast %[[EQ_RES_SELF]] : !cir.bool to i1 + // CHECK-NEXT: acc.serial self(%[[CONV_CAST_SELF]]) if(%[[CONV_CAST_IF]]) { + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + // CHECK-NEXT: cir.return } From 0b8f817aab7a242e0bfb519cb07c8979ffadef36 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 15 Apr 2025 16:59:05 -0700 Subject: [PATCH 059/710] [BOLT] Fix conditional compilation of hugify.cpp (#135880) Fix builds after #117158: do not build hugify.cpp on Apple platforms. --- bolt/runtime/hugify.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bolt/runtime/hugify.cpp b/bolt/runtime/hugify.cpp index 67d5fa26007d2..672b04247dfa4 100644 --- a/bolt/runtime/hugify.cpp +++ b/bolt/runtime/hugify.cpp @@ -6,8 +6,8 @@ // //===---------------------------------------------------------------------===// -#if defined(__x86_64__) || \ - (defined(__aarch64__) || defined(__arm64__)) && !defined(__APPLE__) +#if (defined(__x86_64__) || defined(__aarch64__) || defined(__arm64__)) && \ + !defined(__APPLE__) #include "common.h" From bd4d3519c708d70ed8c827a27b63f13b0229ef00 Mon Sep 17 00:00:00 2001 From: Camsyn Date: Wed, 16 Apr 2025 08:15:57 +0800 Subject: [PATCH 060/710] [ASan] Prevent ASan/LSan deadlock by preloading modules before error reporting (#131756) ### Description This PR resolves a deadlock between AddressSanitizer (ASan) and LeakSanitizer (LSan) that occurs when both sanitizers attempt to acquire locks in conflicting orders across threads. 
The fix ensures safe lock acquisition ordering by preloading module information before error reporting. --- ### Issue Details **Reproducer** ```cpp // Thread 1: ASan error path int arr[1] = {0}; std::thread t([&]() { arr[1] = 1; // Triggers ASan OOB error }); // Thread 2: LSan check path __lsan_do_leak_check(); ``` **Lock Order Conflict**: - Thread 1 (ASan error reporting): 1. Acquires ASan thread registry lock (B) 1. Attempts to acquire libdl lock (A) via `dl_iterate_phdr` - Thread 2 (LSan leak check): 1. Acquires libdl lock (A) via `dl_iterate_phdr` 1. Attempts to acquire ASan thread registry lock (B) This creates a circular wait condition (A -> B -> A) meeting all four Coffman deadlock criteria. --- ### Fix Strategy The root cause lies in ASan's error reporting path needing `dl_iterate_phdr` (requiring lock A) while already holding its thread registry lock (B). The solution: 1. **Preload Modules Early**: Force module list initialization _before_ acquiring ASan's thread lock 2. **Avoid Nested Locking**: Ensure symbolization (via dl_iterate_phdr) completes before error reporting locks Key code change: ```cpp // Before acquiring ASan's thread registry lock: Symbolizer::GetOrInit()->GetRefreshedListOfModules(); ``` This guarantees module information is cached before lock acquisition, eliminating the need for `dl_iterate_phdr` calls during error reporting. --- ### Testing Added **asan_lsan_deadlock.cpp** test case: - Reproduces deadlock reliably without fix **under idle system conditions** - Uses watchdog thread to detect hangs - Verifies ASan error reports correctly without deadlock **Note**: Due to the inherent non-determinism of thread scheduling and lock acquisition timing, this test may not reliably reproduce the deadlock on busy systems (e.g., during parallel `ninja check-asan` runs). 
--- ### Impact - Fixes rare but severe deadlocks in mixed ASan+LSan environments - Maintains thread safety guarantees for both sanitizers - No user-visible behavior changes except deadlock elimination --- ### Relevant Buggy Code - Code in ASan's asan_report.cpp ```cpp explicit ScopedInErrorReport(bool fatal = false) : halt_on_error_(fatal || flags()->halt_on_error) { // Acquire lock B asanThreadRegistry().Lock(); } ~ScopedInErrorReport() { ... // Try to acquire lock A under holding lock B via the following path // #4 0x000071a353d83e93 in __GI___dl_iterate_phdr ( // callback=0x5d1a07a39580 <__sanitizer::dl_iterate_phdr_cb(dl_phdr_info*, unsigned long, void*)>, // data=0x6da3510fd3f0) at ./elf/dl-iteratephdr.c:39 // #5 0x00005d1a07a39574 in __sanitizer::ListOfModules::init (this=0x71a353ebc080) // at llvm-project/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp:784 // #6 0x00005d1a07a429e3 in __sanitizer::Symbolizer::RefreshModules (this=0x71a353ebc058) // at llvm-project/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_libcdep.cpp:188 // #7 __sanitizer::Symbolizer::FindModuleForAddress (this=this@entry=0x71a353ebc058, // address=address@entry=102366378805727) // at llvm-project/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_libcdep.cpp:214 // #8 0x00005d1a07a4291b in __sanitizer::Symbolizer::SymbolizePC (this=0x71a353ebc058, addr=102366378805727) // at llvm-project/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_libcdep.cpp:88 // #9 0x00005d1a07a40df7 in __sanitizer::(anonymous namespace)::StackTraceTextPrinter::ProcessAddressFrames ( // this=this@entry=0x6da3510fd520, pc=102366378805727) // at llvm-project/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace_libcdep.cpp:37 // #10 0x00005d1a07a40d27 in __sanitizer::StackTrace::PrintTo (this=this@entry=0x6da3510fd5e8, // output=output@entry=0x6da3510fd588) // at llvm-project/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace_libcdep.cpp:110 // #11 0x00005d1a07a410a1 in 
__sanitizer::StackTrace::Print (this=0x6da3510fd5e8) // at llvm-project/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace_libcdep.cpp:133 // #12 0x00005d1a0798758d in __asan::ErrorGeneric::Print ( // this=0x5d1a07aa4e08 <__asan::ScopedInErrorReport::current_error_+8>) // at llvm-project/compiler-rt/lib/asan/asan_errors.cpp:617 current_error_.Print(); ... } ``` - Code in LSan's lsan_common_linux.cpp ```cpp void LockStuffAndStopTheWorld(StopTheWorldCallback callback, CheckForLeaksParam *argument) { // Acquire lock A dl_iterate_phdr(LockStuffAndStopTheWorldCallback, ¶m); } static int LockStuffAndStopTheWorldCallback(struct dl_phdr_info *info, size_t size, void *data) { // Try to acquire lock B under holding lock A via the following path // #3 0x000055555562b34a in __sanitizer::ThreadRegistry::Lock (this=) // at llvm-project/compiler-rt/lib/asan/../sanitizer_common/sanitizer_thread_registry.h:99 // #4 __lsan::LockThreads () at llvm-project/compiler-rt/lib/asan/asan_thread.cpp:484 // #5 0x0000555555652629 in __lsan::ScopedStopTheWorldLock::ScopedStopTheWorldLock (this=) // at llvm-project/compiler-rt/lib/lsan/lsan_common.h:164 // #6 __lsan::LockStuffAndStopTheWorldCallback (info=, size=, data=0x0, // data@entry=0x7fffffffd158) at llvm-project/compiler-rt/lib/lsan/lsan_common_linux.cpp:120 ScopedStopTheWorldLock lock; DoStopTheWorldParam *param = reinterpret_cast(data); StopTheWorld(param->callback, param->argument); return 1; } ``` --- compiler-rt/lib/asan/asan_report.cpp | 25 +++++++ .../asan/TestCases/asan_lsan_deadlock.cpp | 72 +++++++++++++++++++ 2 files changed, 97 insertions(+) create mode 100644 compiler-rt/test/asan/TestCases/asan_lsan_deadlock.cpp diff --git a/compiler-rt/lib/asan/asan_report.cpp b/compiler-rt/lib/asan/asan_report.cpp index 6302866805f37..e515f20548c00 100644 --- a/compiler-rt/lib/asan/asan_report.cpp +++ b/compiler-rt/lib/asan/asan_report.cpp @@ -126,6 +126,31 @@ class ScopedInErrorReport { public: explicit ScopedInErrorReport(bool fatal = 
false) : halt_on_error_(fatal || flags()->halt_on_error) { + // Deadlock Prevention Between ASan and LSan + // + // Background: + // - The `dl_iterate_phdr` function requires holding libdl's internal lock + // (Lock A). + // - LSan acquires the ASan thread registry lock (Lock B) *after* calling + // `dl_iterate_phdr`. + // + // Problem Scenario: + // When ASan attempts to call `dl_iterate_phdr` while holding Lock B (e.g., + // during error reporting via `ErrorDescription::Print`), a circular lock + // dependency may occur: + // 1. Thread 1: Holds Lock B → Requests Lock A (via dl_iterate_phdr) + // 2. Thread 2: Holds Lock A → Requests Lock B (via LSan operations) + // + // Solution: + // Proactively load all required modules before acquiring Lock B. + // This ensures: + // 1. Any `dl_iterate_phdr` calls during module loading complete before + // locking. + // 2. Subsequent error reporting avoids nested lock acquisition patterns. + // 3. Eliminates the lock order inversion risk between libdl and ASan's + // thread registry. + Symbolizer::GetOrInit()->GetRefreshedListOfModules(); + // Make sure the registry and sanitizer report mutexes are locked while // we're printing an error report. // We can lock them only here to avoid self-deadlock in case of diff --git a/compiler-rt/test/asan/TestCases/asan_lsan_deadlock.cpp b/compiler-rt/test/asan/TestCases/asan_lsan_deadlock.cpp new file mode 100644 index 0000000000000..4e1a2415ad013 --- /dev/null +++ b/compiler-rt/test/asan/TestCases/asan_lsan_deadlock.cpp @@ -0,0 +1,72 @@ +// Test for potential deadlock in LeakSanitizer+AddressSanitizer. +// REQUIRES: leak-detection +// +// RUN: %clangxx_asan -O0 %s -o %t +// RUN: %env_asan_opts=detect_leaks=1 not %run %t 2>&1 | FileCheck %s + +/* + * Purpose: Verify deadlock prevention between ASan error reporting and LSan leak checking. + * + * Test Design: + * 1. 
Creates contention scenario between: + * - ASan's error reporting (requires lock B -> lock A ordering) + * - LSan's leak check (requires lock A -> lock B ordering) + * 2. Thread timing: + * - Main thread: Holds 'in' mutex -> Triggers LSan check (lock A then B) + * - Worker thread: Triggers ASan OOB error (lock B then A via symbolization) + * + * Deadlock Condition (if unfixed): + * Circular lock dependency forms when: + * [Main Thread] LSan: lock A -> requests lock B + * [Worker Thread] ASan: lock B -> requests lock A + * + * Success Criteria: + * With proper lock ordering enforcement, watchdog should NOT trigger - test exits normally. + * If deadlock occurs, watchdog terminates via _exit(1) after 10s timeout. + */ + +#include +#include +#include +#include +#include + +void Watchdog() { + // Safety mechanism: Turn infinite deadlock into finite test failure + usleep(10000000); + // CHECK-NOT: Timeout! Deadlock detected. + puts("Timeout! Deadlock detected."); + fflush(stdout); + _exit(1); +} + +int main(int argc, char **argv) { + int arr[1] = {0}; + std::mutex in; + in.lock(); + + std::thread w(Watchdog); + w.detach(); + + std::thread t([&]() { + in.unlock(); + /* + * Provoke ASan error: ASan's error reporting acquires: + * 1. ASan's thread registry lock (B) during the reporting + * 2. dl_iterate_phdr lock (A) during symbolization + */ + // CHECK: SUMMARY: AddressSanitizer: stack-buffer-overflow + arr[argc] = 1; // Deliberate OOB access + }); + + in.lock(); + /* + * Critical section: LSan's check acquires: + * 1. dl_iterate_phdr lock (A) + * 2. ASan's thread registry lock (B) + * before Stop The World. 
+ */ + __lsan_do_leak_check(); + t.join(); + return 0; +} From 77f0708b9d4feee8b8a67a5f571be741be4e26af Mon Sep 17 00:00:00 2001 From: Vinay Deshmukh <32487576+vinay-deshmukh@users.noreply.github.com> Date: Tue, 15 Apr 2025 20:24:07 -0400 Subject: [PATCH 061/710] [libc]: Remove `-Wglobal-constructors` for libc tests (#131485) * Relates to: https://github.com/llvm/llvm-project/issues/119281 * Removes `-Wglobal-constructors` as per: https://github.com/llvm/llvm-project/pull/131485#pullrequestreview-2728020622 --- libc/cmake/modules/LLVMLibCTestRules.cmake | 1 - 1 file changed, 1 deletion(-) diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake index 63a8e9ecda002..a28e15fc5e394 100644 --- a/libc/cmake/modules/LLVMLibCTestRules.cmake +++ b/libc/cmake/modules/LLVMLibCTestRules.cmake @@ -57,7 +57,6 @@ function(_get_common_test_compile_options output_var c_test flags) list(APPEND compile_options "-Wnewline-eof") list(APPEND compile_options "-Wnonportable-system-include-path") list(APPEND compile_options "-Wthread-safety") - # list(APPEND compile_options "-Wglobal-constructors") endif() endif() set(${output_var} ${compile_options} PARENT_SCOPE) From 6c6ab2a270b799f1397926c9064fa30fe2be1d96 Mon Sep 17 00:00:00 2001 From: YunQiang Su Date: Wed, 16 Apr 2025 09:09:13 +0800 Subject: [PATCH 062/710] AArch64: Set FMAXIMUMNUM and FMINIMUMNUM as Promote if not fullfp16 (#135708) Since Promote will emit FP_EXTEND, the result of it will never be sNaN, so we don't need worry about duplicated of FCANONICALIZE in expandFMINIMUMNUM_FMAXIMUMNUM. 
--- .../Target/AArch64/AArch64ISelLowering.cpp | 2 + .../AArch64/fp-maximumnum-minimumnum.ll | 1751 ++++++++++++++--- 2 files changed, 1480 insertions(+), 273 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 830ec6886e6bc..bea8087750d6e 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -795,6 +795,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, ISD::FMAXNUM, ISD::FMINIMUM, ISD::FMAXIMUM, + ISD::FMINIMUMNUM, + ISD::FMAXIMUMNUM, ISD::FCANONICALIZE, ISD::STRICT_FADD, ISD::STRICT_FSUB, diff --git a/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll b/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll index bb3f9a3e52a16..c6b8e41f9bdfd 100644 --- a/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll +++ b/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc --mtriple=aarch64 --mattr=+fullfp16 < %s | FileCheck %s --check-prefix=AARCH64 +; RUN: llc --mtriple=aarch64 --mattr=+fullfp16 < %s | FileCheck %s --check-prefixes=AARCH64,FULLFP16 +; RUN: llc --mtriple=aarch64 < %s | FileCheck %s --check-prefixes=AARCH64,NOFULLFP16 ;;;;;;;;;;;;;;;; max_f64 define double @max_nnan_f64(double %a, double %b) { @@ -142,96 +143,397 @@ entry: ;;;;;;;;;;;;;;;;;; max_f16 define half @max_nnan_f16(half %a, half %b) { -; AARCH64-LABEL: max_nnan_f16: -; AARCH64: // %bb.0: // %entry -; AARCH64-NEXT: fmaxnm h0, h0, h1 -; AARCH64-NEXT: ret +; FULLFP16-LABEL: max_nnan_f16: +; FULLFP16: // %bb.0: // %entry +; FULLFP16-NEXT: fmaxnm h0, h0, h1 +; FULLFP16-NEXT: ret +; +; NOFULLFP16-LABEL: max_nnan_f16: +; NOFULLFP16: // %bb.0: // %entry +; NOFULLFP16-NEXT: fcvt s1, h1 +; NOFULLFP16-NEXT: fcvt s0, h0 +; NOFULLFP16-NEXT: fmaxnm s0, s0, s1 +; NOFULLFP16-NEXT: fcvt h0, s0 +; NOFULLFP16-NEXT: ret entry: %c = call nnan 
half @llvm.maximumnum.f16(half %a, half %b) ret half %c } define <2 x half> @max_nnan_v2f16(<2 x half> %a, <2 x half> %b) { -; AARCH64-LABEL: max_nnan_v2f16: -; AARCH64: // %bb.0: // %entry -; AARCH64-NEXT: fmaxnm v0.4h, v0.4h, v1.4h -; AARCH64-NEXT: ret +; FULLFP16-LABEL: max_nnan_v2f16: +; FULLFP16: // %bb.0: // %entry +; FULLFP16-NEXT: fmaxnm v0.4h, v0.4h, v1.4h +; FULLFP16-NEXT: ret +; +; NOFULLFP16-LABEL: max_nnan_v2f16: +; NOFULLFP16: // %bb.0: // %entry +; NOFULLFP16-NEXT: // kill: def $d1 killed $d1 def $q1 +; NOFULLFP16-NEXT: // kill: def $d0 killed $d0 def $q0 +; NOFULLFP16-NEXT: mov h2, v1.h[1] +; NOFULLFP16-NEXT: mov h3, v0.h[1] +; NOFULLFP16-NEXT: mov h4, v1.h[2] +; NOFULLFP16-NEXT: mov h5, v0.h[2] +; NOFULLFP16-NEXT: fcvt s6, h1 +; NOFULLFP16-NEXT: fcvt s7, h0 +; NOFULLFP16-NEXT: mov h1, v1.h[3] +; NOFULLFP16-NEXT: fcvt s2, h2 +; NOFULLFP16-NEXT: fcvt s3, h3 +; NOFULLFP16-NEXT: fcvt s1, h1 +; NOFULLFP16-NEXT: fmaxnm s2, s3, s2 +; NOFULLFP16-NEXT: fcvt s3, h4 +; NOFULLFP16-NEXT: fcvt s4, h5 +; NOFULLFP16-NEXT: fmaxnm s5, s7, s6 +; NOFULLFP16-NEXT: mov h6, v0.h[3] +; NOFULLFP16-NEXT: fmaxnm s3, s4, s3 +; NOFULLFP16-NEXT: fcvt h2, s2 +; NOFULLFP16-NEXT: fcvt h0, s5 +; NOFULLFP16-NEXT: fcvt s4, h6 +; NOFULLFP16-NEXT: mov v0.h[1], v2.h[0] +; NOFULLFP16-NEXT: fcvt h2, s3 +; NOFULLFP16-NEXT: fmaxnm s1, s4, s1 +; NOFULLFP16-NEXT: mov v0.h[2], v2.h[0] +; NOFULLFP16-NEXT: fcvt h1, s1 +; NOFULLFP16-NEXT: mov v0.h[3], v1.h[0] +; NOFULLFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOFULLFP16-NEXT: ret entry: %c = call nnan <2 x half> @llvm.maximumnum.v2f16(<2 x half> %a, <2 x half> %b) ret <2 x half> %c } define <4 x half> @max_nnan_v4f16(<4 x half> %a, <4 x half> %b) { -; AARCH64-LABEL: max_nnan_v4f16: -; AARCH64: // %bb.0: // %entry -; AARCH64-NEXT: fmaxnm v0.4h, v0.4h, v1.4h -; AARCH64-NEXT: ret +; FULLFP16-LABEL: max_nnan_v4f16: +; FULLFP16: // %bb.0: // %entry +; FULLFP16-NEXT: fmaxnm v0.4h, v0.4h, v1.4h +; FULLFP16-NEXT: ret +; +; NOFULLFP16-LABEL: 
max_nnan_v4f16: +; NOFULLFP16: // %bb.0: // %entry +; NOFULLFP16-NEXT: // kill: def $d1 killed $d1 def $q1 +; NOFULLFP16-NEXT: // kill: def $d0 killed $d0 def $q0 +; NOFULLFP16-NEXT: mov h2, v1.h[1] +; NOFULLFP16-NEXT: mov h3, v0.h[1] +; NOFULLFP16-NEXT: mov h4, v1.h[2] +; NOFULLFP16-NEXT: mov h5, v0.h[2] +; NOFULLFP16-NEXT: fcvt s6, h1 +; NOFULLFP16-NEXT: fcvt s7, h0 +; NOFULLFP16-NEXT: mov h1, v1.h[3] +; NOFULLFP16-NEXT: fcvt s2, h2 +; NOFULLFP16-NEXT: fcvt s3, h3 +; NOFULLFP16-NEXT: fcvt s1, h1 +; NOFULLFP16-NEXT: fmaxnm s2, s3, s2 +; NOFULLFP16-NEXT: fcvt s3, h4 +; NOFULLFP16-NEXT: fcvt s4, h5 +; NOFULLFP16-NEXT: fmaxnm s5, s7, s6 +; NOFULLFP16-NEXT: mov h6, v0.h[3] +; NOFULLFP16-NEXT: fmaxnm s3, s4, s3 +; NOFULLFP16-NEXT: fcvt h2, s2 +; NOFULLFP16-NEXT: fcvt h0, s5 +; NOFULLFP16-NEXT: fcvt s4, h6 +; NOFULLFP16-NEXT: mov v0.h[1], v2.h[0] +; NOFULLFP16-NEXT: fcvt h2, s3 +; NOFULLFP16-NEXT: fmaxnm s1, s4, s1 +; NOFULLFP16-NEXT: mov v0.h[2], v2.h[0] +; NOFULLFP16-NEXT: fcvt h1, s1 +; NOFULLFP16-NEXT: mov v0.h[3], v1.h[0] +; NOFULLFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOFULLFP16-NEXT: ret entry: %c = call nnan <4 x half> @llvm.maximumnum.v4f16(<4 x half> %a, <4 x half> %b) ret <4 x half> %c } define <8 x half> @max_nnan_v8f16(<8 x half> %a, <8 x half> %b) { -; AARCH64-LABEL: max_nnan_v8f16: -; AARCH64: // %bb.0: // %entry -; AARCH64-NEXT: fmaxnm v0.8h, v0.8h, v1.8h -; AARCH64-NEXT: ret +; FULLFP16-LABEL: max_nnan_v8f16: +; FULLFP16: // %bb.0: // %entry +; FULLFP16-NEXT: fmaxnm v0.8h, v0.8h, v1.8h +; FULLFP16-NEXT: ret +; +; NOFULLFP16-LABEL: max_nnan_v8f16: +; NOFULLFP16: // %bb.0: // %entry +; NOFULLFP16-NEXT: mov h2, v1.h[1] +; NOFULLFP16-NEXT: mov h3, v0.h[1] +; NOFULLFP16-NEXT: fcvt s4, h1 +; NOFULLFP16-NEXT: fcvt s5, h0 +; NOFULLFP16-NEXT: mov h6, v1.h[2] +; NOFULLFP16-NEXT: mov h7, v0.h[2] +; NOFULLFP16-NEXT: mov h16, v1.h[3] +; NOFULLFP16-NEXT: fcvt s2, h2 +; NOFULLFP16-NEXT: fcvt s3, h3 +; NOFULLFP16-NEXT: fmaxnm s4, s5, s4 +; 
NOFULLFP16-NEXT: mov h5, v0.h[3] +; NOFULLFP16-NEXT: fcvt s6, h6 +; NOFULLFP16-NEXT: fcvt s7, h7 +; NOFULLFP16-NEXT: fcvt s16, h16 +; NOFULLFP16-NEXT: fmaxnm s3, s3, s2 +; NOFULLFP16-NEXT: fcvt s5, h5 +; NOFULLFP16-NEXT: fcvt h2, s4 +; NOFULLFP16-NEXT: fmaxnm s4, s7, s6 +; NOFULLFP16-NEXT: mov h6, v1.h[4] +; NOFULLFP16-NEXT: mov h7, v0.h[4] +; NOFULLFP16-NEXT: fcvt h3, s3 +; NOFULLFP16-NEXT: fmaxnm s5, s5, s16 +; NOFULLFP16-NEXT: mov h16, v0.h[5] +; NOFULLFP16-NEXT: fcvt h4, s4 +; NOFULLFP16-NEXT: mov v2.h[1], v3.h[0] +; NOFULLFP16-NEXT: fcvt s3, h6 +; NOFULLFP16-NEXT: fcvt s6, h7 +; NOFULLFP16-NEXT: mov h7, v1.h[5] +; NOFULLFP16-NEXT: fcvt h5, s5 +; NOFULLFP16-NEXT: fcvt s16, h16 +; NOFULLFP16-NEXT: mov v2.h[2], v4.h[0] +; NOFULLFP16-NEXT: mov h4, v1.h[6] +; NOFULLFP16-NEXT: fmaxnm s3, s6, s3 +; NOFULLFP16-NEXT: mov h6, v0.h[6] +; NOFULLFP16-NEXT: fcvt s7, h7 +; NOFULLFP16-NEXT: mov h1, v1.h[7] +; NOFULLFP16-NEXT: mov h0, v0.h[7] +; NOFULLFP16-NEXT: mov v2.h[3], v5.h[0] +; NOFULLFP16-NEXT: fcvt s4, h4 +; NOFULLFP16-NEXT: fcvt h3, s3 +; NOFULLFP16-NEXT: fcvt s5, h6 +; NOFULLFP16-NEXT: fmaxnm s6, s16, s7 +; NOFULLFP16-NEXT: fcvt s1, h1 +; NOFULLFP16-NEXT: fcvt s0, h0 +; NOFULLFP16-NEXT: mov v2.h[4], v3.h[0] +; NOFULLFP16-NEXT: fmaxnm s4, s5, s4 +; NOFULLFP16-NEXT: fcvt h3, s6 +; NOFULLFP16-NEXT: fmaxnm s0, s0, s1 +; NOFULLFP16-NEXT: mov v2.h[5], v3.h[0] +; NOFULLFP16-NEXT: fcvt h3, s4 +; NOFULLFP16-NEXT: fcvt h0, s0 +; NOFULLFP16-NEXT: mov v2.h[6], v3.h[0] +; NOFULLFP16-NEXT: mov v2.h[7], v0.h[0] +; NOFULLFP16-NEXT: mov v0.16b, v2.16b +; NOFULLFP16-NEXT: ret entry: %c = call nnan <8 x half> @llvm.maximumnum.v8f16(<8 x half> %a, <8 x half> %b) ret <8 x half> %c } define <9 x half> @max_nnan_v9f16(<9 x half> %a, <9 x half> %b) { -; AARCH64-LABEL: max_nnan_v9f16: -; AARCH64: // %bb.0: // %entry -; AARCH64-NEXT: // kill: def $h0 killed $h0 def $q0 -; AARCH64-NEXT: // kill: def $h1 killed $h1 def $q1 -; AARCH64-NEXT: // kill: def $h2 killed $h2 def $q2 -; AARCH64-NEXT: 
add x9, sp, #16 -; AARCH64-NEXT: // kill: def $h3 killed $h3 def $q3 -; AARCH64-NEXT: // kill: def $h4 killed $h4 def $q4 -; AARCH64-NEXT: // kill: def $h5 killed $h5 def $q5 -; AARCH64-NEXT: // kill: def $h6 killed $h6 def $q6 -; AARCH64-NEXT: // kill: def $h7 killed $h7 def $q7 -; AARCH64-NEXT: mov v0.h[1], v1.h[0] -; AARCH64-NEXT: ldr h1, [sp, #8] -; AARCH64-NEXT: ld1 { v1.h }[1], [x9] -; AARCH64-NEXT: add x9, sp, #24 -; AARCH64-NEXT: mov v0.h[2], v2.h[0] -; AARCH64-NEXT: ldr h2, [sp, #72] -; AARCH64-NEXT: ld1 { v1.h }[2], [x9] -; AARCH64-NEXT: add x9, sp, #32 -; AARCH64-NEXT: mov v0.h[3], v3.h[0] -; AARCH64-NEXT: ld1 { v1.h }[3], [x9] -; AARCH64-NEXT: add x9, sp, #40 -; AARCH64-NEXT: ldr h3, [sp] -; AARCH64-NEXT: ld1 { v1.h }[4], [x9] -; AARCH64-NEXT: add x9, sp, #48 -; AARCH64-NEXT: fmaxnm v2.8h, v3.8h, v2.8h -; AARCH64-NEXT: mov v0.h[4], v4.h[0] -; AARCH64-NEXT: ld1 { v1.h }[5], [x9] -; AARCH64-NEXT: add x9, sp, #56 -; AARCH64-NEXT: str h2, [x8, #16] -; AARCH64-NEXT: mov v0.h[5], v5.h[0] -; AARCH64-NEXT: ld1 { v1.h }[6], [x9] -; AARCH64-NEXT: add x9, sp, #64 -; AARCH64-NEXT: mov v0.h[6], v6.h[0] -; AARCH64-NEXT: ld1 { v1.h }[7], [x9] -; AARCH64-NEXT: mov v0.h[7], v7.h[0] -; AARCH64-NEXT: fmaxnm v0.8h, v0.8h, v1.8h -; AARCH64-NEXT: str q0, [x8] -; AARCH64-NEXT: ret +; FULLFP16-LABEL: max_nnan_v9f16: +; FULLFP16: // %bb.0: // %entry +; FULLFP16-NEXT: // kill: def $h0 killed $h0 def $q0 +; FULLFP16-NEXT: // kill: def $h1 killed $h1 def $q1 +; FULLFP16-NEXT: // kill: def $h2 killed $h2 def $q2 +; FULLFP16-NEXT: add x9, sp, #16 +; FULLFP16-NEXT: // kill: def $h3 killed $h3 def $q3 +; FULLFP16-NEXT: // kill: def $h4 killed $h4 def $q4 +; FULLFP16-NEXT: // kill: def $h5 killed $h5 def $q5 +; FULLFP16-NEXT: // kill: def $h6 killed $h6 def $q6 +; FULLFP16-NEXT: // kill: def $h7 killed $h7 def $q7 +; FULLFP16-NEXT: mov v0.h[1], v1.h[0] +; FULLFP16-NEXT: ldr h1, [sp, #8] +; FULLFP16-NEXT: ld1 { v1.h }[1], [x9] +; FULLFP16-NEXT: add x9, sp, #24 +; FULLFP16-NEXT: mov 
v0.h[2], v2.h[0] +; FULLFP16-NEXT: ldr h2, [sp, #72] +; FULLFP16-NEXT: ld1 { v1.h }[2], [x9] +; FULLFP16-NEXT: add x9, sp, #32 +; FULLFP16-NEXT: mov v0.h[3], v3.h[0] +; FULLFP16-NEXT: ld1 { v1.h }[3], [x9] +; FULLFP16-NEXT: add x9, sp, #40 +; FULLFP16-NEXT: ldr h3, [sp] +; FULLFP16-NEXT: ld1 { v1.h }[4], [x9] +; FULLFP16-NEXT: add x9, sp, #48 +; FULLFP16-NEXT: fmaxnm v2.8h, v3.8h, v2.8h +; FULLFP16-NEXT: mov v0.h[4], v4.h[0] +; FULLFP16-NEXT: ld1 { v1.h }[5], [x9] +; FULLFP16-NEXT: add x9, sp, #56 +; FULLFP16-NEXT: str h2, [x8, #16] +; FULLFP16-NEXT: mov v0.h[5], v5.h[0] +; FULLFP16-NEXT: ld1 { v1.h }[6], [x9] +; FULLFP16-NEXT: add x9, sp, #64 +; FULLFP16-NEXT: mov v0.h[6], v6.h[0] +; FULLFP16-NEXT: ld1 { v1.h }[7], [x9] +; FULLFP16-NEXT: mov v0.h[7], v7.h[0] +; FULLFP16-NEXT: fmaxnm v0.8h, v0.8h, v1.8h +; FULLFP16-NEXT: str q0, [x8] +; FULLFP16-NEXT: ret +; +; NOFULLFP16-LABEL: max_nnan_v9f16: +; NOFULLFP16: // %bb.0: // %entry +; NOFULLFP16-NEXT: ldr h16, [sp, #16] +; NOFULLFP16-NEXT: ldr h17, [sp, #8] +; NOFULLFP16-NEXT: fcvt s1, h1 +; NOFULLFP16-NEXT: fcvt s0, h0 +; NOFULLFP16-NEXT: ldr h18, [sp, #24] +; NOFULLFP16-NEXT: fcvt s2, h2 +; NOFULLFP16-NEXT: fcvt s16, h16 +; NOFULLFP16-NEXT: fcvt s17, h17 +; NOFULLFP16-NEXT: fcvt s3, h3 +; NOFULLFP16-NEXT: fcvt s18, h18 +; NOFULLFP16-NEXT: fcvt s4, h4 +; NOFULLFP16-NEXT: fcvt s5, h5 +; NOFULLFP16-NEXT: fmaxnm s1, s1, s16 +; NOFULLFP16-NEXT: fmaxnm s0, s0, s17 +; NOFULLFP16-NEXT: ldr h16, [sp, #32] +; NOFULLFP16-NEXT: fmaxnm s2, s2, s18 +; NOFULLFP16-NEXT: ldr h17, [sp, #40] +; NOFULLFP16-NEXT: fcvt s16, h16 +; NOFULLFP16-NEXT: fcvt s17, h17 +; NOFULLFP16-NEXT: fcvt h1, s1 +; NOFULLFP16-NEXT: fcvt h0, s0 +; NOFULLFP16-NEXT: fcvt h2, s2 +; NOFULLFP16-NEXT: fmaxnm s3, s3, s16 +; NOFULLFP16-NEXT: fmaxnm s4, s4, s17 +; NOFULLFP16-NEXT: mov v0.h[1], v1.h[0] +; NOFULLFP16-NEXT: ldr h1, [sp, #48] +; NOFULLFP16-NEXT: fcvt s1, h1 +; NOFULLFP16-NEXT: fcvt h3, s3 +; NOFULLFP16-NEXT: fcvt h4, s4 +; NOFULLFP16-NEXT: mov v0.h[2], 
v2.h[0] +; NOFULLFP16-NEXT: ldr h2, [sp, #56] +; NOFULLFP16-NEXT: fmaxnm s1, s5, s1 +; NOFULLFP16-NEXT: fcvt s2, h2 +; NOFULLFP16-NEXT: ldr h5, [sp, #64] +; NOFULLFP16-NEXT: mov v0.h[3], v3.h[0] +; NOFULLFP16-NEXT: fcvt s3, h6 +; NOFULLFP16-NEXT: ldr h6, [sp, #72] +; NOFULLFP16-NEXT: fcvt h1, s1 +; NOFULLFP16-NEXT: fcvt s6, h6 +; NOFULLFP16-NEXT: mov v0.h[4], v4.h[0] +; NOFULLFP16-NEXT: fmaxnm s2, s3, s2 +; NOFULLFP16-NEXT: fcvt s3, h5 +; NOFULLFP16-NEXT: fcvt s4, h7 +; NOFULLFP16-NEXT: ldr h5, [sp] +; NOFULLFP16-NEXT: fcvt s5, h5 +; NOFULLFP16-NEXT: mov v0.h[5], v1.h[0] +; NOFULLFP16-NEXT: fcvt h1, s2 +; NOFULLFP16-NEXT: fmaxnm s2, s4, s3 +; NOFULLFP16-NEXT: fmaxnm s3, s5, s6 +; NOFULLFP16-NEXT: mov v0.h[6], v1.h[0] +; NOFULLFP16-NEXT: fcvt h1, s2 +; NOFULLFP16-NEXT: fcvt h2, s3 +; NOFULLFP16-NEXT: mov v0.h[7], v1.h[0] +; NOFULLFP16-NEXT: str h2, [x8, #16] +; NOFULLFP16-NEXT: str q0, [x8] +; NOFULLFP16-NEXT: ret entry: %c = call nnan <9 x half> @llvm.maximumnum.v9f16(<9 x half> %a, <9 x half> %b) ret <9 x half> %c } define <16 x half> @max_nnan_v16f16(<16 x half> %a, <16 x half> %b) { -; AARCH64-LABEL: max_nnan_v16f16: -; AARCH64: // %bb.0: // %entry -; AARCH64-NEXT: fmaxnm v1.8h, v1.8h, v3.8h -; AARCH64-NEXT: fmaxnm v0.8h, v0.8h, v2.8h -; AARCH64-NEXT: ret +; FULLFP16-LABEL: max_nnan_v16f16: +; FULLFP16: // %bb.0: // %entry +; FULLFP16-NEXT: fmaxnm v1.8h, v1.8h, v3.8h +; FULLFP16-NEXT: fmaxnm v0.8h, v0.8h, v2.8h +; FULLFP16-NEXT: ret +; +; NOFULLFP16-LABEL: max_nnan_v16f16: +; NOFULLFP16: // %bb.0: // %entry +; NOFULLFP16-NEXT: mov h6, v2.h[1] +; NOFULLFP16-NEXT: mov h7, v0.h[1] +; NOFULLFP16-NEXT: fcvt s4, h2 +; NOFULLFP16-NEXT: fcvt s5, h0 +; NOFULLFP16-NEXT: mov h16, v3.h[1] +; NOFULLFP16-NEXT: mov h17, v1.h[1] +; NOFULLFP16-NEXT: mov h18, v2.h[2] +; NOFULLFP16-NEXT: mov h19, v0.h[2] +; NOFULLFP16-NEXT: fcvt s20, h3 +; NOFULLFP16-NEXT: fcvt s21, h1 +; NOFULLFP16-NEXT: mov h22, v3.h[2] +; NOFULLFP16-NEXT: mov h23, v1.h[2] +; NOFULLFP16-NEXT: fcvt s6, h6 +; 
NOFULLFP16-NEXT: fcvt s7, h7 +; NOFULLFP16-NEXT: mov h24, v0.h[6] +; NOFULLFP16-NEXT: fmaxnm s4, s5, s4 +; NOFULLFP16-NEXT: fcvt s5, h16 +; NOFULLFP16-NEXT: fcvt s16, h17 +; NOFULLFP16-NEXT: fcvt s17, h18 +; NOFULLFP16-NEXT: fcvt s18, h19 +; NOFULLFP16-NEXT: mov h19, v0.h[3] +; NOFULLFP16-NEXT: fmaxnm s20, s21, s20 +; NOFULLFP16-NEXT: fcvt s21, h22 +; NOFULLFP16-NEXT: mov h22, v3.h[3] +; NOFULLFP16-NEXT: fmaxnm s6, s7, s6 +; NOFULLFP16-NEXT: mov h7, v2.h[3] +; NOFULLFP16-NEXT: mov h25, v1.h[6] +; NOFULLFP16-NEXT: fcvt h4, s4 +; NOFULLFP16-NEXT: fmaxnm s5, s16, s5 +; NOFULLFP16-NEXT: fcvt s16, h23 +; NOFULLFP16-NEXT: mov h23, v1.h[3] +; NOFULLFP16-NEXT: fmaxnm s17, s18, s17 +; NOFULLFP16-NEXT: fcvt s18, h19 +; NOFULLFP16-NEXT: fcvt h6, s6 +; NOFULLFP16-NEXT: fcvt s7, h7 +; NOFULLFP16-NEXT: fcvt h19, s5 +; NOFULLFP16-NEXT: fcvt h5, s20 +; NOFULLFP16-NEXT: fmaxnm s16, s16, s21 +; NOFULLFP16-NEXT: fcvt s20, h23 +; NOFULLFP16-NEXT: fcvt h17, s17 +; NOFULLFP16-NEXT: mov h21, v2.h[4] +; NOFULLFP16-NEXT: mov h23, v1.h[4] +; NOFULLFP16-NEXT: mov v4.h[1], v6.h[0] +; NOFULLFP16-NEXT: fcvt s6, h22 +; NOFULLFP16-NEXT: mov h22, v0.h[4] +; NOFULLFP16-NEXT: fmaxnm s7, s18, s7 +; NOFULLFP16-NEXT: mov h18, v3.h[4] +; NOFULLFP16-NEXT: mov v5.h[1], v19.h[0] +; NOFULLFP16-NEXT: fcvt h16, s16 +; NOFULLFP16-NEXT: fmaxnm s6, s20, s6 +; NOFULLFP16-NEXT: mov v4.h[2], v17.h[0] +; NOFULLFP16-NEXT: fcvt s17, h21 +; NOFULLFP16-NEXT: fcvt s19, h22 +; NOFULLFP16-NEXT: fcvt h7, s7 +; NOFULLFP16-NEXT: fcvt s18, h18 +; NOFULLFP16-NEXT: fcvt s20, h23 +; NOFULLFP16-NEXT: mov h21, v2.h[5] +; NOFULLFP16-NEXT: mov h22, v0.h[5] +; NOFULLFP16-NEXT: mov v5.h[2], v16.h[0] +; NOFULLFP16-NEXT: mov h16, v3.h[5] +; NOFULLFP16-NEXT: mov h23, v1.h[5] +; NOFULLFP16-NEXT: fcvt h6, s6 +; NOFULLFP16-NEXT: mov h0, v0.h[7] +; NOFULLFP16-NEXT: mov h1, v1.h[7] +; NOFULLFP16-NEXT: fmaxnm s17, s19, s17 +; NOFULLFP16-NEXT: mov h19, v2.h[6] +; NOFULLFP16-NEXT: mov v4.h[3], v7.h[0] +; NOFULLFP16-NEXT: fmaxnm s18, s20, s18 +; 
NOFULLFP16-NEXT: mov h20, v3.h[6] +; NOFULLFP16-NEXT: fcvt s7, h21 +; NOFULLFP16-NEXT: fcvt s21, h22 +; NOFULLFP16-NEXT: fcvt s22, h24 +; NOFULLFP16-NEXT: mov h2, v2.h[7] +; NOFULLFP16-NEXT: mov v5.h[3], v6.h[0] +; NOFULLFP16-NEXT: fcvt s6, h16 +; NOFULLFP16-NEXT: fcvt s16, h23 +; NOFULLFP16-NEXT: fcvt h17, s17 +; NOFULLFP16-NEXT: fcvt s19, h19 +; NOFULLFP16-NEXT: fcvt s23, h25 +; NOFULLFP16-NEXT: fcvt h18, s18 +; NOFULLFP16-NEXT: fcvt s20, h20 +; NOFULLFP16-NEXT: mov h3, v3.h[7] +; NOFULLFP16-NEXT: fmaxnm s7, s21, s7 +; NOFULLFP16-NEXT: fcvt s2, h2 +; NOFULLFP16-NEXT: fcvt s0, h0 +; NOFULLFP16-NEXT: fmaxnm s6, s16, s6 +; NOFULLFP16-NEXT: fcvt s1, h1 +; NOFULLFP16-NEXT: mov v4.h[4], v17.h[0] +; NOFULLFP16-NEXT: fmaxnm s16, s22, s19 +; NOFULLFP16-NEXT: mov v5.h[4], v18.h[0] +; NOFULLFP16-NEXT: fmaxnm s17, s23, s20 +; NOFULLFP16-NEXT: fcvt s3, h3 +; NOFULLFP16-NEXT: fcvt h7, s7 +; NOFULLFP16-NEXT: fmaxnm s0, s0, s2 +; NOFULLFP16-NEXT: fcvt h6, s6 +; NOFULLFP16-NEXT: fcvt h2, s16 +; NOFULLFP16-NEXT: fmaxnm s1, s1, s3 +; NOFULLFP16-NEXT: mov v4.h[5], v7.h[0] +; NOFULLFP16-NEXT: fcvt h0, s0 +; NOFULLFP16-NEXT: mov v5.h[5], v6.h[0] +; NOFULLFP16-NEXT: fcvt h6, s17 +; NOFULLFP16-NEXT: fcvt h1, s1 +; NOFULLFP16-NEXT: mov v4.h[6], v2.h[0] +; NOFULLFP16-NEXT: mov v5.h[6], v6.h[0] +; NOFULLFP16-NEXT: mov v4.h[7], v0.h[0] +; NOFULLFP16-NEXT: mov v5.h[7], v1.h[0] +; NOFULLFP16-NEXT: mov v0.16b, v4.16b +; NOFULLFP16-NEXT: mov v1.16b, v5.16b +; NOFULLFP16-NEXT: ret entry: %c = call nnan <16 x half> @llvm.maximumnum.v16f16(<16 x half> %a, <16 x half> %b) ret <16 x half> %c @@ -378,96 +680,397 @@ entry: ;;;;;;;;;;;;;;;;;; min_f16 define half @min_nnan_f16(half %a, half %b) { -; AARCH64-LABEL: min_nnan_f16: -; AARCH64: // %bb.0: // %entry -; AARCH64-NEXT: fminnm h0, h0, h1 -; AARCH64-NEXT: ret +; FULLFP16-LABEL: min_nnan_f16: +; FULLFP16: // %bb.0: // %entry +; FULLFP16-NEXT: fminnm h0, h0, h1 +; FULLFP16-NEXT: ret +; +; NOFULLFP16-LABEL: min_nnan_f16: +; NOFULLFP16: // %bb.0: // 
%entry +; NOFULLFP16-NEXT: fcvt s1, h1 +; NOFULLFP16-NEXT: fcvt s0, h0 +; NOFULLFP16-NEXT: fminnm s0, s0, s1 +; NOFULLFP16-NEXT: fcvt h0, s0 +; NOFULLFP16-NEXT: ret entry: %c = call nnan half @llvm.minimumnum.f16(half %a, half %b) ret half %c } define <2 x half> @min_nnan_v2f16(<2 x half> %a, <2 x half> %b) { -; AARCH64-LABEL: min_nnan_v2f16: -; AARCH64: // %bb.0: // %entry -; AARCH64-NEXT: fminnm v0.4h, v0.4h, v1.4h -; AARCH64-NEXT: ret +; FULLFP16-LABEL: min_nnan_v2f16: +; FULLFP16: // %bb.0: // %entry +; FULLFP16-NEXT: fminnm v0.4h, v0.4h, v1.4h +; FULLFP16-NEXT: ret +; +; NOFULLFP16-LABEL: min_nnan_v2f16: +; NOFULLFP16: // %bb.0: // %entry +; NOFULLFP16-NEXT: // kill: def $d1 killed $d1 def $q1 +; NOFULLFP16-NEXT: // kill: def $d0 killed $d0 def $q0 +; NOFULLFP16-NEXT: mov h2, v1.h[1] +; NOFULLFP16-NEXT: mov h3, v0.h[1] +; NOFULLFP16-NEXT: mov h4, v1.h[2] +; NOFULLFP16-NEXT: mov h5, v0.h[2] +; NOFULLFP16-NEXT: fcvt s6, h1 +; NOFULLFP16-NEXT: fcvt s7, h0 +; NOFULLFP16-NEXT: mov h1, v1.h[3] +; NOFULLFP16-NEXT: fcvt s2, h2 +; NOFULLFP16-NEXT: fcvt s3, h3 +; NOFULLFP16-NEXT: fcvt s1, h1 +; NOFULLFP16-NEXT: fminnm s2, s3, s2 +; NOFULLFP16-NEXT: fcvt s3, h4 +; NOFULLFP16-NEXT: fcvt s4, h5 +; NOFULLFP16-NEXT: fminnm s5, s7, s6 +; NOFULLFP16-NEXT: mov h6, v0.h[3] +; NOFULLFP16-NEXT: fminnm s3, s4, s3 +; NOFULLFP16-NEXT: fcvt h2, s2 +; NOFULLFP16-NEXT: fcvt h0, s5 +; NOFULLFP16-NEXT: fcvt s4, h6 +; NOFULLFP16-NEXT: mov v0.h[1], v2.h[0] +; NOFULLFP16-NEXT: fcvt h2, s3 +; NOFULLFP16-NEXT: fminnm s1, s4, s1 +; NOFULLFP16-NEXT: mov v0.h[2], v2.h[0] +; NOFULLFP16-NEXT: fcvt h1, s1 +; NOFULLFP16-NEXT: mov v0.h[3], v1.h[0] +; NOFULLFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOFULLFP16-NEXT: ret entry: %c = call nnan <2 x half> @llvm.minimumnum.v2f16(<2 x half> %a, <2 x half> %b) ret <2 x half> %c } define <4 x half> @min_nnan_v4f16(<4 x half> %a, <4 x half> %b) { -; AARCH64-LABEL: min_nnan_v4f16: -; AARCH64: // %bb.0: // %entry -; AARCH64-NEXT: fminnm v0.4h, v0.4h, 
v1.4h -; AARCH64-NEXT: ret +; FULLFP16-LABEL: min_nnan_v4f16: +; FULLFP16: // %bb.0: // %entry +; FULLFP16-NEXT: fminnm v0.4h, v0.4h, v1.4h +; FULLFP16-NEXT: ret +; +; NOFULLFP16-LABEL: min_nnan_v4f16: +; NOFULLFP16: // %bb.0: // %entry +; NOFULLFP16-NEXT: // kill: def $d1 killed $d1 def $q1 +; NOFULLFP16-NEXT: // kill: def $d0 killed $d0 def $q0 +; NOFULLFP16-NEXT: mov h2, v1.h[1] +; NOFULLFP16-NEXT: mov h3, v0.h[1] +; NOFULLFP16-NEXT: mov h4, v1.h[2] +; NOFULLFP16-NEXT: mov h5, v0.h[2] +; NOFULLFP16-NEXT: fcvt s6, h1 +; NOFULLFP16-NEXT: fcvt s7, h0 +; NOFULLFP16-NEXT: mov h1, v1.h[3] +; NOFULLFP16-NEXT: fcvt s2, h2 +; NOFULLFP16-NEXT: fcvt s3, h3 +; NOFULLFP16-NEXT: fcvt s1, h1 +; NOFULLFP16-NEXT: fminnm s2, s3, s2 +; NOFULLFP16-NEXT: fcvt s3, h4 +; NOFULLFP16-NEXT: fcvt s4, h5 +; NOFULLFP16-NEXT: fminnm s5, s7, s6 +; NOFULLFP16-NEXT: mov h6, v0.h[3] +; NOFULLFP16-NEXT: fminnm s3, s4, s3 +; NOFULLFP16-NEXT: fcvt h2, s2 +; NOFULLFP16-NEXT: fcvt h0, s5 +; NOFULLFP16-NEXT: fcvt s4, h6 +; NOFULLFP16-NEXT: mov v0.h[1], v2.h[0] +; NOFULLFP16-NEXT: fcvt h2, s3 +; NOFULLFP16-NEXT: fminnm s1, s4, s1 +; NOFULLFP16-NEXT: mov v0.h[2], v2.h[0] +; NOFULLFP16-NEXT: fcvt h1, s1 +; NOFULLFP16-NEXT: mov v0.h[3], v1.h[0] +; NOFULLFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOFULLFP16-NEXT: ret entry: %c = call nnan <4 x half> @llvm.minimumnum.v4f16(<4 x half> %a, <4 x half> %b) ret <4 x half> %c } define <8 x half> @min_nnan_v8f16(<8 x half> %a, <8 x half> %b) { -; AARCH64-LABEL: min_nnan_v8f16: -; AARCH64: // %bb.0: // %entry -; AARCH64-NEXT: fminnm v0.8h, v0.8h, v1.8h -; AARCH64-NEXT: ret +; FULLFP16-LABEL: min_nnan_v8f16: +; FULLFP16: // %bb.0: // %entry +; FULLFP16-NEXT: fminnm v0.8h, v0.8h, v1.8h +; FULLFP16-NEXT: ret +; +; NOFULLFP16-LABEL: min_nnan_v8f16: +; NOFULLFP16: // %bb.0: // %entry +; NOFULLFP16-NEXT: mov h2, v1.h[1] +; NOFULLFP16-NEXT: mov h3, v0.h[1] +; NOFULLFP16-NEXT: fcvt s4, h1 +; NOFULLFP16-NEXT: fcvt s5, h0 +; NOFULLFP16-NEXT: mov h6, v1.h[2] +; 
NOFULLFP16-NEXT: mov h7, v0.h[2] +; NOFULLFP16-NEXT: mov h16, v1.h[3] +; NOFULLFP16-NEXT: fcvt s2, h2 +; NOFULLFP16-NEXT: fcvt s3, h3 +; NOFULLFP16-NEXT: fminnm s4, s5, s4 +; NOFULLFP16-NEXT: mov h5, v0.h[3] +; NOFULLFP16-NEXT: fcvt s6, h6 +; NOFULLFP16-NEXT: fcvt s7, h7 +; NOFULLFP16-NEXT: fcvt s16, h16 +; NOFULLFP16-NEXT: fminnm s3, s3, s2 +; NOFULLFP16-NEXT: fcvt s5, h5 +; NOFULLFP16-NEXT: fcvt h2, s4 +; NOFULLFP16-NEXT: fminnm s4, s7, s6 +; NOFULLFP16-NEXT: mov h6, v1.h[4] +; NOFULLFP16-NEXT: mov h7, v0.h[4] +; NOFULLFP16-NEXT: fcvt h3, s3 +; NOFULLFP16-NEXT: fminnm s5, s5, s16 +; NOFULLFP16-NEXT: mov h16, v0.h[5] +; NOFULLFP16-NEXT: fcvt h4, s4 +; NOFULLFP16-NEXT: mov v2.h[1], v3.h[0] +; NOFULLFP16-NEXT: fcvt s3, h6 +; NOFULLFP16-NEXT: fcvt s6, h7 +; NOFULLFP16-NEXT: mov h7, v1.h[5] +; NOFULLFP16-NEXT: fcvt h5, s5 +; NOFULLFP16-NEXT: fcvt s16, h16 +; NOFULLFP16-NEXT: mov v2.h[2], v4.h[0] +; NOFULLFP16-NEXT: mov h4, v1.h[6] +; NOFULLFP16-NEXT: fminnm s3, s6, s3 +; NOFULLFP16-NEXT: mov h6, v0.h[6] +; NOFULLFP16-NEXT: fcvt s7, h7 +; NOFULLFP16-NEXT: mov h1, v1.h[7] +; NOFULLFP16-NEXT: mov h0, v0.h[7] +; NOFULLFP16-NEXT: mov v2.h[3], v5.h[0] +; NOFULLFP16-NEXT: fcvt s4, h4 +; NOFULLFP16-NEXT: fcvt h3, s3 +; NOFULLFP16-NEXT: fcvt s5, h6 +; NOFULLFP16-NEXT: fminnm s6, s16, s7 +; NOFULLFP16-NEXT: fcvt s1, h1 +; NOFULLFP16-NEXT: fcvt s0, h0 +; NOFULLFP16-NEXT: mov v2.h[4], v3.h[0] +; NOFULLFP16-NEXT: fminnm s4, s5, s4 +; NOFULLFP16-NEXT: fcvt h3, s6 +; NOFULLFP16-NEXT: fminnm s0, s0, s1 +; NOFULLFP16-NEXT: mov v2.h[5], v3.h[0] +; NOFULLFP16-NEXT: fcvt h3, s4 +; NOFULLFP16-NEXT: fcvt h0, s0 +; NOFULLFP16-NEXT: mov v2.h[6], v3.h[0] +; NOFULLFP16-NEXT: mov v2.h[7], v0.h[0] +; NOFULLFP16-NEXT: mov v0.16b, v2.16b +; NOFULLFP16-NEXT: ret entry: %c = call nnan <8 x half> @llvm.minimumnum.v8f16(<8 x half> %a, <8 x half> %b) ret <8 x half> %c } define <9 x half> @min_nnan_v9f16(<9 x half> %a, <9 x half> %b) { -; AARCH64-LABEL: min_nnan_v9f16: -; AARCH64: // %bb.0: // %entry -; 
AARCH64-NEXT: // kill: def $h0 killed $h0 def $q0 -; AARCH64-NEXT: // kill: def $h1 killed $h1 def $q1 -; AARCH64-NEXT: // kill: def $h2 killed $h2 def $q2 -; AARCH64-NEXT: add x9, sp, #16 -; AARCH64-NEXT: // kill: def $h3 killed $h3 def $q3 -; AARCH64-NEXT: // kill: def $h4 killed $h4 def $q4 -; AARCH64-NEXT: // kill: def $h5 killed $h5 def $q5 -; AARCH64-NEXT: // kill: def $h6 killed $h6 def $q6 -; AARCH64-NEXT: // kill: def $h7 killed $h7 def $q7 -; AARCH64-NEXT: mov v0.h[1], v1.h[0] -; AARCH64-NEXT: ldr h1, [sp, #8] -; AARCH64-NEXT: ld1 { v1.h }[1], [x9] -; AARCH64-NEXT: add x9, sp, #24 -; AARCH64-NEXT: mov v0.h[2], v2.h[0] -; AARCH64-NEXT: ldr h2, [sp, #72] -; AARCH64-NEXT: ld1 { v1.h }[2], [x9] -; AARCH64-NEXT: add x9, sp, #32 -; AARCH64-NEXT: mov v0.h[3], v3.h[0] -; AARCH64-NEXT: ld1 { v1.h }[3], [x9] -; AARCH64-NEXT: add x9, sp, #40 -; AARCH64-NEXT: ldr h3, [sp] -; AARCH64-NEXT: ld1 { v1.h }[4], [x9] -; AARCH64-NEXT: add x9, sp, #48 -; AARCH64-NEXT: fminnm v2.8h, v3.8h, v2.8h -; AARCH64-NEXT: mov v0.h[4], v4.h[0] -; AARCH64-NEXT: ld1 { v1.h }[5], [x9] -; AARCH64-NEXT: add x9, sp, #56 -; AARCH64-NEXT: str h2, [x8, #16] -; AARCH64-NEXT: mov v0.h[5], v5.h[0] -; AARCH64-NEXT: ld1 { v1.h }[6], [x9] -; AARCH64-NEXT: add x9, sp, #64 -; AARCH64-NEXT: mov v0.h[6], v6.h[0] -; AARCH64-NEXT: ld1 { v1.h }[7], [x9] -; AARCH64-NEXT: mov v0.h[7], v7.h[0] -; AARCH64-NEXT: fminnm v0.8h, v0.8h, v1.8h -; AARCH64-NEXT: str q0, [x8] -; AARCH64-NEXT: ret +; FULLFP16-LABEL: min_nnan_v9f16: +; FULLFP16: // %bb.0: // %entry +; FULLFP16-NEXT: // kill: def $h0 killed $h0 def $q0 +; FULLFP16-NEXT: // kill: def $h1 killed $h1 def $q1 +; FULLFP16-NEXT: // kill: def $h2 killed $h2 def $q2 +; FULLFP16-NEXT: add x9, sp, #16 +; FULLFP16-NEXT: // kill: def $h3 killed $h3 def $q3 +; FULLFP16-NEXT: // kill: def $h4 killed $h4 def $q4 +; FULLFP16-NEXT: // kill: def $h5 killed $h5 def $q5 +; FULLFP16-NEXT: // kill: def $h6 killed $h6 def $q6 +; FULLFP16-NEXT: // kill: def $h7 killed $h7 def $q7 
+; FULLFP16-NEXT: mov v0.h[1], v1.h[0] +; FULLFP16-NEXT: ldr h1, [sp, #8] +; FULLFP16-NEXT: ld1 { v1.h }[1], [x9] +; FULLFP16-NEXT: add x9, sp, #24 +; FULLFP16-NEXT: mov v0.h[2], v2.h[0] +; FULLFP16-NEXT: ldr h2, [sp, #72] +; FULLFP16-NEXT: ld1 { v1.h }[2], [x9] +; FULLFP16-NEXT: add x9, sp, #32 +; FULLFP16-NEXT: mov v0.h[3], v3.h[0] +; FULLFP16-NEXT: ld1 { v1.h }[3], [x9] +; FULLFP16-NEXT: add x9, sp, #40 +; FULLFP16-NEXT: ldr h3, [sp] +; FULLFP16-NEXT: ld1 { v1.h }[4], [x9] +; FULLFP16-NEXT: add x9, sp, #48 +; FULLFP16-NEXT: fminnm v2.8h, v3.8h, v2.8h +; FULLFP16-NEXT: mov v0.h[4], v4.h[0] +; FULLFP16-NEXT: ld1 { v1.h }[5], [x9] +; FULLFP16-NEXT: add x9, sp, #56 +; FULLFP16-NEXT: str h2, [x8, #16] +; FULLFP16-NEXT: mov v0.h[5], v5.h[0] +; FULLFP16-NEXT: ld1 { v1.h }[6], [x9] +; FULLFP16-NEXT: add x9, sp, #64 +; FULLFP16-NEXT: mov v0.h[6], v6.h[0] +; FULLFP16-NEXT: ld1 { v1.h }[7], [x9] +; FULLFP16-NEXT: mov v0.h[7], v7.h[0] +; FULLFP16-NEXT: fminnm v0.8h, v0.8h, v1.8h +; FULLFP16-NEXT: str q0, [x8] +; FULLFP16-NEXT: ret +; +; NOFULLFP16-LABEL: min_nnan_v9f16: +; NOFULLFP16: // %bb.0: // %entry +; NOFULLFP16-NEXT: ldr h16, [sp, #16] +; NOFULLFP16-NEXT: ldr h17, [sp, #8] +; NOFULLFP16-NEXT: fcvt s1, h1 +; NOFULLFP16-NEXT: fcvt s0, h0 +; NOFULLFP16-NEXT: ldr h18, [sp, #24] +; NOFULLFP16-NEXT: fcvt s2, h2 +; NOFULLFP16-NEXT: fcvt s16, h16 +; NOFULLFP16-NEXT: fcvt s17, h17 +; NOFULLFP16-NEXT: fcvt s3, h3 +; NOFULLFP16-NEXT: fcvt s18, h18 +; NOFULLFP16-NEXT: fcvt s4, h4 +; NOFULLFP16-NEXT: fcvt s5, h5 +; NOFULLFP16-NEXT: fminnm s1, s1, s16 +; NOFULLFP16-NEXT: fminnm s0, s0, s17 +; NOFULLFP16-NEXT: ldr h16, [sp, #32] +; NOFULLFP16-NEXT: fminnm s2, s2, s18 +; NOFULLFP16-NEXT: ldr h17, [sp, #40] +; NOFULLFP16-NEXT: fcvt s16, h16 +; NOFULLFP16-NEXT: fcvt s17, h17 +; NOFULLFP16-NEXT: fcvt h1, s1 +; NOFULLFP16-NEXT: fcvt h0, s0 +; NOFULLFP16-NEXT: fcvt h2, s2 +; NOFULLFP16-NEXT: fminnm s3, s3, s16 +; NOFULLFP16-NEXT: fminnm s4, s4, s17 +; NOFULLFP16-NEXT: mov v0.h[1], 
v1.h[0] +; NOFULLFP16-NEXT: ldr h1, [sp, #48] +; NOFULLFP16-NEXT: fcvt s1, h1 +; NOFULLFP16-NEXT: fcvt h3, s3 +; NOFULLFP16-NEXT: fcvt h4, s4 +; NOFULLFP16-NEXT: mov v0.h[2], v2.h[0] +; NOFULLFP16-NEXT: ldr h2, [sp, #56] +; NOFULLFP16-NEXT: fminnm s1, s5, s1 +; NOFULLFP16-NEXT: fcvt s2, h2 +; NOFULLFP16-NEXT: ldr h5, [sp, #64] +; NOFULLFP16-NEXT: mov v0.h[3], v3.h[0] +; NOFULLFP16-NEXT: fcvt s3, h6 +; NOFULLFP16-NEXT: ldr h6, [sp, #72] +; NOFULLFP16-NEXT: fcvt h1, s1 +; NOFULLFP16-NEXT: fcvt s6, h6 +; NOFULLFP16-NEXT: mov v0.h[4], v4.h[0] +; NOFULLFP16-NEXT: fminnm s2, s3, s2 +; NOFULLFP16-NEXT: fcvt s3, h5 +; NOFULLFP16-NEXT: fcvt s4, h7 +; NOFULLFP16-NEXT: ldr h5, [sp] +; NOFULLFP16-NEXT: fcvt s5, h5 +; NOFULLFP16-NEXT: mov v0.h[5], v1.h[0] +; NOFULLFP16-NEXT: fcvt h1, s2 +; NOFULLFP16-NEXT: fminnm s2, s4, s3 +; NOFULLFP16-NEXT: fminnm s3, s5, s6 +; NOFULLFP16-NEXT: mov v0.h[6], v1.h[0] +; NOFULLFP16-NEXT: fcvt h1, s2 +; NOFULLFP16-NEXT: fcvt h2, s3 +; NOFULLFP16-NEXT: mov v0.h[7], v1.h[0] +; NOFULLFP16-NEXT: str h2, [x8, #16] +; NOFULLFP16-NEXT: str q0, [x8] +; NOFULLFP16-NEXT: ret entry: %c = call nnan <9 x half> @llvm.minimumnum.v9f16(<9 x half> %a, <9 x half> %b) ret <9 x half> %c } define <16 x half> @min_nnan_v16f16(<16 x half> %a, <16 x half> %b) { -; AARCH64-LABEL: min_nnan_v16f16: -; AARCH64: // %bb.0: // %entry -; AARCH64-NEXT: fminnm v1.8h, v1.8h, v3.8h -; AARCH64-NEXT: fminnm v0.8h, v0.8h, v2.8h -; AARCH64-NEXT: ret +; FULLFP16-LABEL: min_nnan_v16f16: +; FULLFP16: // %bb.0: // %entry +; FULLFP16-NEXT: fminnm v1.8h, v1.8h, v3.8h +; FULLFP16-NEXT: fminnm v0.8h, v0.8h, v2.8h +; FULLFP16-NEXT: ret +; +; NOFULLFP16-LABEL: min_nnan_v16f16: +; NOFULLFP16: // %bb.0: // %entry +; NOFULLFP16-NEXT: mov h6, v2.h[1] +; NOFULLFP16-NEXT: mov h7, v0.h[1] +; NOFULLFP16-NEXT: fcvt s4, h2 +; NOFULLFP16-NEXT: fcvt s5, h0 +; NOFULLFP16-NEXT: mov h16, v3.h[1] +; NOFULLFP16-NEXT: mov h17, v1.h[1] +; NOFULLFP16-NEXT: mov h18, v2.h[2] +; NOFULLFP16-NEXT: mov h19, v0.h[2] +; 
NOFULLFP16-NEXT: fcvt s20, h3 +; NOFULLFP16-NEXT: fcvt s21, h1 +; NOFULLFP16-NEXT: mov h22, v3.h[2] +; NOFULLFP16-NEXT: mov h23, v1.h[2] +; NOFULLFP16-NEXT: fcvt s6, h6 +; NOFULLFP16-NEXT: fcvt s7, h7 +; NOFULLFP16-NEXT: mov h24, v0.h[6] +; NOFULLFP16-NEXT: fminnm s4, s5, s4 +; NOFULLFP16-NEXT: fcvt s5, h16 +; NOFULLFP16-NEXT: fcvt s16, h17 +; NOFULLFP16-NEXT: fcvt s17, h18 +; NOFULLFP16-NEXT: fcvt s18, h19 +; NOFULLFP16-NEXT: mov h19, v0.h[3] +; NOFULLFP16-NEXT: fminnm s20, s21, s20 +; NOFULLFP16-NEXT: fcvt s21, h22 +; NOFULLFP16-NEXT: mov h22, v3.h[3] +; NOFULLFP16-NEXT: fminnm s6, s7, s6 +; NOFULLFP16-NEXT: mov h7, v2.h[3] +; NOFULLFP16-NEXT: mov h25, v1.h[6] +; NOFULLFP16-NEXT: fcvt h4, s4 +; NOFULLFP16-NEXT: fminnm s5, s16, s5 +; NOFULLFP16-NEXT: fcvt s16, h23 +; NOFULLFP16-NEXT: mov h23, v1.h[3] +; NOFULLFP16-NEXT: fminnm s17, s18, s17 +; NOFULLFP16-NEXT: fcvt s18, h19 +; NOFULLFP16-NEXT: fcvt h6, s6 +; NOFULLFP16-NEXT: fcvt s7, h7 +; NOFULLFP16-NEXT: fcvt h19, s5 +; NOFULLFP16-NEXT: fcvt h5, s20 +; NOFULLFP16-NEXT: fminnm s16, s16, s21 +; NOFULLFP16-NEXT: fcvt s20, h23 +; NOFULLFP16-NEXT: fcvt h17, s17 +; NOFULLFP16-NEXT: mov h21, v2.h[4] +; NOFULLFP16-NEXT: mov h23, v1.h[4] +; NOFULLFP16-NEXT: mov v4.h[1], v6.h[0] +; NOFULLFP16-NEXT: fcvt s6, h22 +; NOFULLFP16-NEXT: mov h22, v0.h[4] +; NOFULLFP16-NEXT: fminnm s7, s18, s7 +; NOFULLFP16-NEXT: mov h18, v3.h[4] +; NOFULLFP16-NEXT: mov v5.h[1], v19.h[0] +; NOFULLFP16-NEXT: fcvt h16, s16 +; NOFULLFP16-NEXT: fminnm s6, s20, s6 +; NOFULLFP16-NEXT: mov v4.h[2], v17.h[0] +; NOFULLFP16-NEXT: fcvt s17, h21 +; NOFULLFP16-NEXT: fcvt s19, h22 +; NOFULLFP16-NEXT: fcvt h7, s7 +; NOFULLFP16-NEXT: fcvt s18, h18 +; NOFULLFP16-NEXT: fcvt s20, h23 +; NOFULLFP16-NEXT: mov h21, v2.h[5] +; NOFULLFP16-NEXT: mov h22, v0.h[5] +; NOFULLFP16-NEXT: mov v5.h[2], v16.h[0] +; NOFULLFP16-NEXT: mov h16, v3.h[5] +; NOFULLFP16-NEXT: mov h23, v1.h[5] +; NOFULLFP16-NEXT: fcvt h6, s6 +; NOFULLFP16-NEXT: mov h0, v0.h[7] +; NOFULLFP16-NEXT: mov h1, 
v1.h[7] +; NOFULLFP16-NEXT: fminnm s17, s19, s17 +; NOFULLFP16-NEXT: mov h19, v2.h[6] +; NOFULLFP16-NEXT: mov v4.h[3], v7.h[0] +; NOFULLFP16-NEXT: fminnm s18, s20, s18 +; NOFULLFP16-NEXT: mov h20, v3.h[6] +; NOFULLFP16-NEXT: fcvt s7, h21 +; NOFULLFP16-NEXT: fcvt s21, h22 +; NOFULLFP16-NEXT: fcvt s22, h24 +; NOFULLFP16-NEXT: mov h2, v2.h[7] +; NOFULLFP16-NEXT: mov v5.h[3], v6.h[0] +; NOFULLFP16-NEXT: fcvt s6, h16 +; NOFULLFP16-NEXT: fcvt s16, h23 +; NOFULLFP16-NEXT: fcvt h17, s17 +; NOFULLFP16-NEXT: fcvt s19, h19 +; NOFULLFP16-NEXT: fcvt s23, h25 +; NOFULLFP16-NEXT: fcvt h18, s18 +; NOFULLFP16-NEXT: fcvt s20, h20 +; NOFULLFP16-NEXT: mov h3, v3.h[7] +; NOFULLFP16-NEXT: fminnm s7, s21, s7 +; NOFULLFP16-NEXT: fcvt s2, h2 +; NOFULLFP16-NEXT: fcvt s0, h0 +; NOFULLFP16-NEXT: fminnm s6, s16, s6 +; NOFULLFP16-NEXT: fcvt s1, h1 +; NOFULLFP16-NEXT: mov v4.h[4], v17.h[0] +; NOFULLFP16-NEXT: fminnm s16, s22, s19 +; NOFULLFP16-NEXT: mov v5.h[4], v18.h[0] +; NOFULLFP16-NEXT: fminnm s17, s23, s20 +; NOFULLFP16-NEXT: fcvt s3, h3 +; NOFULLFP16-NEXT: fcvt h7, s7 +; NOFULLFP16-NEXT: fminnm s0, s0, s2 +; NOFULLFP16-NEXT: fcvt h6, s6 +; NOFULLFP16-NEXT: fcvt h2, s16 +; NOFULLFP16-NEXT: fminnm s1, s1, s3 +; NOFULLFP16-NEXT: mov v4.h[5], v7.h[0] +; NOFULLFP16-NEXT: fcvt h0, s0 +; NOFULLFP16-NEXT: mov v5.h[5], v6.h[0] +; NOFULLFP16-NEXT: fcvt h6, s17 +; NOFULLFP16-NEXT: fcvt h1, s1 +; NOFULLFP16-NEXT: mov v4.h[6], v2.h[0] +; NOFULLFP16-NEXT: mov v5.h[6], v6.h[0] +; NOFULLFP16-NEXT: mov v4.h[7], v0.h[0] +; NOFULLFP16-NEXT: mov v5.h[7], v1.h[0] +; NOFULLFP16-NEXT: mov v0.16b, v4.16b +; NOFULLFP16-NEXT: mov v1.16b, v5.16b +; NOFULLFP16-NEXT: ret entry: %c = call nnan <16 x half> @llvm.minimumnum.v16f16(<16 x half> %a, <16 x half> %b) ret <16 x half> %c @@ -642,112 +1245,413 @@ entry: ;;;;;;;;;;;;;;;;;; max_f16 define half @max_f16(half %a, half %b) { -; AARCH64-LABEL: max_f16: -; AARCH64: // %bb.0: // %entry -; AARCH64-NEXT: fminnm h1, h1, h1 -; AARCH64-NEXT: fminnm h0, h0, h0 -; 
AARCH64-NEXT: fmaxnm h0, h0, h1 -; AARCH64-NEXT: ret +; FULLFP16-LABEL: max_f16: +; FULLFP16: // %bb.0: // %entry +; FULLFP16-NEXT: fminnm h1, h1, h1 +; FULLFP16-NEXT: fminnm h0, h0, h0 +; FULLFP16-NEXT: fmaxnm h0, h0, h1 +; FULLFP16-NEXT: ret +; +; NOFULLFP16-LABEL: max_f16: +; NOFULLFP16: // %bb.0: // %entry +; NOFULLFP16-NEXT: fcvt s1, h1 +; NOFULLFP16-NEXT: fcvt s0, h0 +; NOFULLFP16-NEXT: fmaxnm s0, s0, s1 +; NOFULLFP16-NEXT: fcvt h0, s0 +; NOFULLFP16-NEXT: ret entry: %c = call half @llvm.maximumnum.f16(half %a, half %b) ret half %c } define <2 x half> @max_v2f16(<2 x half> %a, <2 x half> %b) { -; AARCH64-LABEL: max_v2f16: -; AARCH64: // %bb.0: // %entry -; AARCH64-NEXT: fminnm v1.4h, v1.4h, v1.4h -; AARCH64-NEXT: fminnm v0.4h, v0.4h, v0.4h -; AARCH64-NEXT: fmaxnm v0.4h, v0.4h, v1.4h -; AARCH64-NEXT: ret +; FULLFP16-LABEL: max_v2f16: +; FULLFP16: // %bb.0: // %entry +; FULLFP16-NEXT: fminnm v1.4h, v1.4h, v1.4h +; FULLFP16-NEXT: fminnm v0.4h, v0.4h, v0.4h +; FULLFP16-NEXT: fmaxnm v0.4h, v0.4h, v1.4h +; FULLFP16-NEXT: ret +; +; NOFULLFP16-LABEL: max_v2f16: +; NOFULLFP16: // %bb.0: // %entry +; NOFULLFP16-NEXT: // kill: def $d1 killed $d1 def $q1 +; NOFULLFP16-NEXT: // kill: def $d0 killed $d0 def $q0 +; NOFULLFP16-NEXT: mov h2, v1.h[1] +; NOFULLFP16-NEXT: mov h3, v0.h[1] +; NOFULLFP16-NEXT: mov h4, v1.h[2] +; NOFULLFP16-NEXT: mov h5, v0.h[2] +; NOFULLFP16-NEXT: fcvt s6, h1 +; NOFULLFP16-NEXT: fcvt s7, h0 +; NOFULLFP16-NEXT: mov h1, v1.h[3] +; NOFULLFP16-NEXT: fcvt s2, h2 +; NOFULLFP16-NEXT: fcvt s3, h3 +; NOFULLFP16-NEXT: fcvt s1, h1 +; NOFULLFP16-NEXT: fmaxnm s2, s3, s2 +; NOFULLFP16-NEXT: fcvt s3, h4 +; NOFULLFP16-NEXT: fcvt s4, h5 +; NOFULLFP16-NEXT: fmaxnm s5, s7, s6 +; NOFULLFP16-NEXT: mov h6, v0.h[3] +; NOFULLFP16-NEXT: fmaxnm s3, s4, s3 +; NOFULLFP16-NEXT: fcvt h2, s2 +; NOFULLFP16-NEXT: fcvt h0, s5 +; NOFULLFP16-NEXT: fcvt s4, h6 +; NOFULLFP16-NEXT: mov v0.h[1], v2.h[0] +; NOFULLFP16-NEXT: fcvt h2, s3 +; NOFULLFP16-NEXT: fmaxnm s1, s4, s1 +; 
NOFULLFP16-NEXT: mov v0.h[2], v2.h[0] +; NOFULLFP16-NEXT: fcvt h1, s1 +; NOFULLFP16-NEXT: mov v0.h[3], v1.h[0] +; NOFULLFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOFULLFP16-NEXT: ret entry: %c = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> %a, <2 x half> %b) ret <2 x half> %c } define <4 x half> @max_v4f16(<4 x half> %a, <4 x half> %b) { -; AARCH64-LABEL: max_v4f16: -; AARCH64: // %bb.0: // %entry -; AARCH64-NEXT: fminnm v1.4h, v1.4h, v1.4h -; AARCH64-NEXT: fminnm v0.4h, v0.4h, v0.4h -; AARCH64-NEXT: fmaxnm v0.4h, v0.4h, v1.4h -; AARCH64-NEXT: ret +; FULLFP16-LABEL: max_v4f16: +; FULLFP16: // %bb.0: // %entry +; FULLFP16-NEXT: fminnm v1.4h, v1.4h, v1.4h +; FULLFP16-NEXT: fminnm v0.4h, v0.4h, v0.4h +; FULLFP16-NEXT: fmaxnm v0.4h, v0.4h, v1.4h +; FULLFP16-NEXT: ret +; +; NOFULLFP16-LABEL: max_v4f16: +; NOFULLFP16: // %bb.0: // %entry +; NOFULLFP16-NEXT: // kill: def $d1 killed $d1 def $q1 +; NOFULLFP16-NEXT: // kill: def $d0 killed $d0 def $q0 +; NOFULLFP16-NEXT: mov h2, v1.h[1] +; NOFULLFP16-NEXT: mov h3, v0.h[1] +; NOFULLFP16-NEXT: mov h4, v1.h[2] +; NOFULLFP16-NEXT: mov h5, v0.h[2] +; NOFULLFP16-NEXT: fcvt s6, h1 +; NOFULLFP16-NEXT: fcvt s7, h0 +; NOFULLFP16-NEXT: mov h1, v1.h[3] +; NOFULLFP16-NEXT: fcvt s2, h2 +; NOFULLFP16-NEXT: fcvt s3, h3 +; NOFULLFP16-NEXT: fcvt s1, h1 +; NOFULLFP16-NEXT: fmaxnm s2, s3, s2 +; NOFULLFP16-NEXT: fcvt s3, h4 +; NOFULLFP16-NEXT: fcvt s4, h5 +; NOFULLFP16-NEXT: fmaxnm s5, s7, s6 +; NOFULLFP16-NEXT: mov h6, v0.h[3] +; NOFULLFP16-NEXT: fmaxnm s3, s4, s3 +; NOFULLFP16-NEXT: fcvt h2, s2 +; NOFULLFP16-NEXT: fcvt h0, s5 +; NOFULLFP16-NEXT: fcvt s4, h6 +; NOFULLFP16-NEXT: mov v0.h[1], v2.h[0] +; NOFULLFP16-NEXT: fcvt h2, s3 +; NOFULLFP16-NEXT: fmaxnm s1, s4, s1 +; NOFULLFP16-NEXT: mov v0.h[2], v2.h[0] +; NOFULLFP16-NEXT: fcvt h1, s1 +; NOFULLFP16-NEXT: mov v0.h[3], v1.h[0] +; NOFULLFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOFULLFP16-NEXT: ret entry: %c = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> %a, <4 
x half> %b) ret <4 x half> %c } define <8 x half> @max_v8f16(<8 x half> %a, <8 x half> %b) { -; AARCH64-LABEL: max_v8f16: -; AARCH64: // %bb.0: // %entry -; AARCH64-NEXT: fminnm v1.8h, v1.8h, v1.8h -; AARCH64-NEXT: fminnm v0.8h, v0.8h, v0.8h -; AARCH64-NEXT: fmaxnm v0.8h, v0.8h, v1.8h -; AARCH64-NEXT: ret +; FULLFP16-LABEL: max_v8f16: +; FULLFP16: // %bb.0: // %entry +; FULLFP16-NEXT: fminnm v1.8h, v1.8h, v1.8h +; FULLFP16-NEXT: fminnm v0.8h, v0.8h, v0.8h +; FULLFP16-NEXT: fmaxnm v0.8h, v0.8h, v1.8h +; FULLFP16-NEXT: ret +; +; NOFULLFP16-LABEL: max_v8f16: +; NOFULLFP16: // %bb.0: // %entry +; NOFULLFP16-NEXT: mov h2, v1.h[1] +; NOFULLFP16-NEXT: mov h3, v0.h[1] +; NOFULLFP16-NEXT: fcvt s4, h1 +; NOFULLFP16-NEXT: fcvt s5, h0 +; NOFULLFP16-NEXT: mov h6, v1.h[2] +; NOFULLFP16-NEXT: mov h7, v0.h[2] +; NOFULLFP16-NEXT: mov h16, v1.h[3] +; NOFULLFP16-NEXT: fcvt s2, h2 +; NOFULLFP16-NEXT: fcvt s3, h3 +; NOFULLFP16-NEXT: fmaxnm s4, s5, s4 +; NOFULLFP16-NEXT: mov h5, v0.h[3] +; NOFULLFP16-NEXT: fcvt s6, h6 +; NOFULLFP16-NEXT: fcvt s7, h7 +; NOFULLFP16-NEXT: fcvt s16, h16 +; NOFULLFP16-NEXT: fmaxnm s3, s3, s2 +; NOFULLFP16-NEXT: fcvt s5, h5 +; NOFULLFP16-NEXT: fcvt h2, s4 +; NOFULLFP16-NEXT: fmaxnm s4, s7, s6 +; NOFULLFP16-NEXT: mov h6, v1.h[4] +; NOFULLFP16-NEXT: mov h7, v0.h[4] +; NOFULLFP16-NEXT: fcvt h3, s3 +; NOFULLFP16-NEXT: fmaxnm s5, s5, s16 +; NOFULLFP16-NEXT: mov h16, v0.h[5] +; NOFULLFP16-NEXT: fcvt h4, s4 +; NOFULLFP16-NEXT: mov v2.h[1], v3.h[0] +; NOFULLFP16-NEXT: fcvt s3, h6 +; NOFULLFP16-NEXT: fcvt s6, h7 +; NOFULLFP16-NEXT: mov h7, v1.h[5] +; NOFULLFP16-NEXT: fcvt h5, s5 +; NOFULLFP16-NEXT: fcvt s16, h16 +; NOFULLFP16-NEXT: mov v2.h[2], v4.h[0] +; NOFULLFP16-NEXT: mov h4, v1.h[6] +; NOFULLFP16-NEXT: fmaxnm s3, s6, s3 +; NOFULLFP16-NEXT: mov h6, v0.h[6] +; NOFULLFP16-NEXT: fcvt s7, h7 +; NOFULLFP16-NEXT: mov h1, v1.h[7] +; NOFULLFP16-NEXT: mov h0, v0.h[7] +; NOFULLFP16-NEXT: mov v2.h[3], v5.h[0] +; NOFULLFP16-NEXT: fcvt s4, h4 +; NOFULLFP16-NEXT: fcvt h3, s3 +; 
NOFULLFP16-NEXT: fcvt s5, h6 +; NOFULLFP16-NEXT: fmaxnm s6, s16, s7 +; NOFULLFP16-NEXT: fcvt s1, h1 +; NOFULLFP16-NEXT: fcvt s0, h0 +; NOFULLFP16-NEXT: mov v2.h[4], v3.h[0] +; NOFULLFP16-NEXT: fmaxnm s4, s5, s4 +; NOFULLFP16-NEXT: fcvt h3, s6 +; NOFULLFP16-NEXT: fmaxnm s0, s0, s1 +; NOFULLFP16-NEXT: mov v2.h[5], v3.h[0] +; NOFULLFP16-NEXT: fcvt h3, s4 +; NOFULLFP16-NEXT: fcvt h0, s0 +; NOFULLFP16-NEXT: mov v2.h[6], v3.h[0] +; NOFULLFP16-NEXT: mov v2.h[7], v0.h[0] +; NOFULLFP16-NEXT: mov v0.16b, v2.16b +; NOFULLFP16-NEXT: ret entry: %c = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> %a, <8 x half> %b) ret <8 x half> %c } define <9 x half> @max_v9f16(<9 x half> %a, <9 x half> %b) { -; AARCH64-LABEL: max_v9f16: -; AARCH64: // %bb.0: // %entry -; AARCH64-NEXT: // kill: def $h0 killed $h0 def $q0 -; AARCH64-NEXT: // kill: def $h1 killed $h1 def $q1 -; AARCH64-NEXT: // kill: def $h2 killed $h2 def $q2 -; AARCH64-NEXT: add x9, sp, #16 -; AARCH64-NEXT: // kill: def $h3 killed $h3 def $q3 -; AARCH64-NEXT: // kill: def $h4 killed $h4 def $q4 -; AARCH64-NEXT: // kill: def $h5 killed $h5 def $q5 -; AARCH64-NEXT: // kill: def $h6 killed $h6 def $q6 -; AARCH64-NEXT: // kill: def $h7 killed $h7 def $q7 -; AARCH64-NEXT: mov v0.h[1], v1.h[0] -; AARCH64-NEXT: ldr h1, [sp, #8] -; AARCH64-NEXT: ld1 { v1.h }[1], [x9] -; AARCH64-NEXT: add x9, sp, #24 -; AARCH64-NEXT: mov v0.h[2], v2.h[0] -; AARCH64-NEXT: ldr h2, [sp] -; AARCH64-NEXT: ld1 { v1.h }[2], [x9] -; AARCH64-NEXT: add x9, sp, #32 -; AARCH64-NEXT: fminnm v2.8h, v2.8h, v2.8h -; AARCH64-NEXT: mov v0.h[3], v3.h[0] -; AARCH64-NEXT: ld1 { v1.h }[3], [x9] -; AARCH64-NEXT: add x9, sp, #40 -; AARCH64-NEXT: ldr h3, [sp, #72] -; AARCH64-NEXT: ld1 { v1.h }[4], [x9] -; AARCH64-NEXT: add x9, sp, #48 -; AARCH64-NEXT: fminnm v3.8h, v3.8h, v3.8h -; AARCH64-NEXT: mov v0.h[4], v4.h[0] -; AARCH64-NEXT: ld1 { v1.h }[5], [x9] -; AARCH64-NEXT: add x9, sp, #56 -; AARCH64-NEXT: fmaxnm v2.8h, v2.8h, v3.8h -; AARCH64-NEXT: mov v0.h[5], v5.h[0] -; 
AARCH64-NEXT: ld1 { v1.h }[6], [x9] -; AARCH64-NEXT: add x9, sp, #64 -; AARCH64-NEXT: str h2, [x8, #16] -; AARCH64-NEXT: mov v0.h[6], v6.h[0] -; AARCH64-NEXT: ld1 { v1.h }[7], [x9] -; AARCH64-NEXT: fminnm v1.8h, v1.8h, v1.8h -; AARCH64-NEXT: mov v0.h[7], v7.h[0] -; AARCH64-NEXT: fminnm v0.8h, v0.8h, v0.8h -; AARCH64-NEXT: fmaxnm v0.8h, v0.8h, v1.8h -; AARCH64-NEXT: str q0, [x8] -; AARCH64-NEXT: ret +; FULLFP16-LABEL: max_v9f16: +; FULLFP16: // %bb.0: // %entry +; FULLFP16-NEXT: // kill: def $h0 killed $h0 def $q0 +; FULLFP16-NEXT: // kill: def $h1 killed $h1 def $q1 +; FULLFP16-NEXT: // kill: def $h2 killed $h2 def $q2 +; FULLFP16-NEXT: add x9, sp, #16 +; FULLFP16-NEXT: // kill: def $h3 killed $h3 def $q3 +; FULLFP16-NEXT: // kill: def $h4 killed $h4 def $q4 +; FULLFP16-NEXT: // kill: def $h5 killed $h5 def $q5 +; FULLFP16-NEXT: // kill: def $h6 killed $h6 def $q6 +; FULLFP16-NEXT: // kill: def $h7 killed $h7 def $q7 +; FULLFP16-NEXT: mov v0.h[1], v1.h[0] +; FULLFP16-NEXT: ldr h1, [sp, #8] +; FULLFP16-NEXT: ld1 { v1.h }[1], [x9] +; FULLFP16-NEXT: add x9, sp, #24 +; FULLFP16-NEXT: mov v0.h[2], v2.h[0] +; FULLFP16-NEXT: ldr h2, [sp] +; FULLFP16-NEXT: ld1 { v1.h }[2], [x9] +; FULLFP16-NEXT: add x9, sp, #32 +; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v2.8h +; FULLFP16-NEXT: mov v0.h[3], v3.h[0] +; FULLFP16-NEXT: ld1 { v1.h }[3], [x9] +; FULLFP16-NEXT: add x9, sp, #40 +; FULLFP16-NEXT: ldr h3, [sp, #72] +; FULLFP16-NEXT: ld1 { v1.h }[4], [x9] +; FULLFP16-NEXT: add x9, sp, #48 +; FULLFP16-NEXT: fminnm v3.8h, v3.8h, v3.8h +; FULLFP16-NEXT: mov v0.h[4], v4.h[0] +; FULLFP16-NEXT: ld1 { v1.h }[5], [x9] +; FULLFP16-NEXT: add x9, sp, #56 +; FULLFP16-NEXT: fmaxnm v2.8h, v2.8h, v3.8h +; FULLFP16-NEXT: mov v0.h[5], v5.h[0] +; FULLFP16-NEXT: ld1 { v1.h }[6], [x9] +; FULLFP16-NEXT: add x9, sp, #64 +; FULLFP16-NEXT: str h2, [x8, #16] +; FULLFP16-NEXT: mov v0.h[6], v6.h[0] +; FULLFP16-NEXT: ld1 { v1.h }[7], [x9] +; FULLFP16-NEXT: fminnm v1.8h, v1.8h, v1.8h +; FULLFP16-NEXT: mov v0.h[7], 
v7.h[0] +; FULLFP16-NEXT: fminnm v0.8h, v0.8h, v0.8h +; FULLFP16-NEXT: fmaxnm v0.8h, v0.8h, v1.8h +; FULLFP16-NEXT: str q0, [x8] +; FULLFP16-NEXT: ret +; +; NOFULLFP16-LABEL: max_v9f16: +; NOFULLFP16: // %bb.0: // %entry +; NOFULLFP16-NEXT: ldr h16, [sp, #16] +; NOFULLFP16-NEXT: ldr h17, [sp, #8] +; NOFULLFP16-NEXT: fcvt s1, h1 +; NOFULLFP16-NEXT: fcvt s0, h0 +; NOFULLFP16-NEXT: ldr h18, [sp, #24] +; NOFULLFP16-NEXT: fcvt s2, h2 +; NOFULLFP16-NEXT: fcvt s16, h16 +; NOFULLFP16-NEXT: fcvt s17, h17 +; NOFULLFP16-NEXT: fcvt s3, h3 +; NOFULLFP16-NEXT: fcvt s18, h18 +; NOFULLFP16-NEXT: fcvt s4, h4 +; NOFULLFP16-NEXT: fcvt s5, h5 +; NOFULLFP16-NEXT: fmaxnm s1, s1, s16 +; NOFULLFP16-NEXT: fmaxnm s0, s0, s17 +; NOFULLFP16-NEXT: ldr h16, [sp, #32] +; NOFULLFP16-NEXT: fmaxnm s2, s2, s18 +; NOFULLFP16-NEXT: ldr h17, [sp, #40] +; NOFULLFP16-NEXT: fcvt s16, h16 +; NOFULLFP16-NEXT: fcvt s17, h17 +; NOFULLFP16-NEXT: fcvt h1, s1 +; NOFULLFP16-NEXT: fcvt h0, s0 +; NOFULLFP16-NEXT: fcvt h2, s2 +; NOFULLFP16-NEXT: fmaxnm s3, s3, s16 +; NOFULLFP16-NEXT: fmaxnm s4, s4, s17 +; NOFULLFP16-NEXT: mov v0.h[1], v1.h[0] +; NOFULLFP16-NEXT: ldr h1, [sp, #48] +; NOFULLFP16-NEXT: fcvt s1, h1 +; NOFULLFP16-NEXT: fcvt h3, s3 +; NOFULLFP16-NEXT: fcvt h4, s4 +; NOFULLFP16-NEXT: mov v0.h[2], v2.h[0] +; NOFULLFP16-NEXT: ldr h2, [sp, #56] +; NOFULLFP16-NEXT: fmaxnm s1, s5, s1 +; NOFULLFP16-NEXT: fcvt s2, h2 +; NOFULLFP16-NEXT: ldr h5, [sp, #64] +; NOFULLFP16-NEXT: mov v0.h[3], v3.h[0] +; NOFULLFP16-NEXT: fcvt s3, h6 +; NOFULLFP16-NEXT: ldr h6, [sp, #72] +; NOFULLFP16-NEXT: fcvt h1, s1 +; NOFULLFP16-NEXT: fcvt s6, h6 +; NOFULLFP16-NEXT: mov v0.h[4], v4.h[0] +; NOFULLFP16-NEXT: fmaxnm s2, s3, s2 +; NOFULLFP16-NEXT: fcvt s3, h5 +; NOFULLFP16-NEXT: fcvt s4, h7 +; NOFULLFP16-NEXT: ldr h5, [sp] +; NOFULLFP16-NEXT: fcvt s5, h5 +; NOFULLFP16-NEXT: mov v0.h[5], v1.h[0] +; NOFULLFP16-NEXT: fcvt h1, s2 +; NOFULLFP16-NEXT: fmaxnm s2, s4, s3 +; NOFULLFP16-NEXT: fmaxnm s3, s5, s6 +; NOFULLFP16-NEXT: mov v0.h[6], 
v1.h[0] +; NOFULLFP16-NEXT: fcvt h1, s2 +; NOFULLFP16-NEXT: fcvt h2, s3 +; NOFULLFP16-NEXT: mov v0.h[7], v1.h[0] +; NOFULLFP16-NEXT: str h2, [x8, #16] +; NOFULLFP16-NEXT: str q0, [x8] +; NOFULLFP16-NEXT: ret entry: %c = call <9 x half> @llvm.maximumnum.v9f16(<9 x half> %a, <9 x half> %b) ret <9 x half> %c } define <16 x half> @max_v16f16(<16 x half> %a, <16 x half> %b) { -; AARCH64-LABEL: max_v16f16: -; AARCH64: // %bb.0: // %entry -; AARCH64-NEXT: fminnm v2.8h, v2.8h, v2.8h -; AARCH64-NEXT: fminnm v0.8h, v0.8h, v0.8h -; AARCH64-NEXT: fminnm v3.8h, v3.8h, v3.8h -; AARCH64-NEXT: fminnm v1.8h, v1.8h, v1.8h -; AARCH64-NEXT: fmaxnm v0.8h, v0.8h, v2.8h -; AARCH64-NEXT: fmaxnm v1.8h, v1.8h, v3.8h -; AARCH64-NEXT: ret +; FULLFP16-LABEL: max_v16f16: +; FULLFP16: // %bb.0: // %entry +; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v2.8h +; FULLFP16-NEXT: fminnm v0.8h, v0.8h, v0.8h +; FULLFP16-NEXT: fminnm v3.8h, v3.8h, v3.8h +; FULLFP16-NEXT: fminnm v1.8h, v1.8h, v1.8h +; FULLFP16-NEXT: fmaxnm v0.8h, v0.8h, v2.8h +; FULLFP16-NEXT: fmaxnm v1.8h, v1.8h, v3.8h +; FULLFP16-NEXT: ret +; +; NOFULLFP16-LABEL: max_v16f16: +; NOFULLFP16: // %bb.0: // %entry +; NOFULLFP16-NEXT: mov h6, v2.h[1] +; NOFULLFP16-NEXT: mov h7, v0.h[1] +; NOFULLFP16-NEXT: fcvt s4, h2 +; NOFULLFP16-NEXT: fcvt s5, h0 +; NOFULLFP16-NEXT: mov h16, v3.h[1] +; NOFULLFP16-NEXT: mov h17, v1.h[1] +; NOFULLFP16-NEXT: mov h18, v2.h[2] +; NOFULLFP16-NEXT: mov h19, v0.h[2] +; NOFULLFP16-NEXT: fcvt s20, h3 +; NOFULLFP16-NEXT: fcvt s21, h1 +; NOFULLFP16-NEXT: mov h22, v3.h[2] +; NOFULLFP16-NEXT: mov h23, v1.h[2] +; NOFULLFP16-NEXT: fcvt s6, h6 +; NOFULLFP16-NEXT: fcvt s7, h7 +; NOFULLFP16-NEXT: mov h24, v0.h[6] +; NOFULLFP16-NEXT: fmaxnm s4, s5, s4 +; NOFULLFP16-NEXT: fcvt s5, h16 +; NOFULLFP16-NEXT: fcvt s16, h17 +; NOFULLFP16-NEXT: fcvt s17, h18 +; NOFULLFP16-NEXT: fcvt s18, h19 +; NOFULLFP16-NEXT: mov h19, v0.h[3] +; NOFULLFP16-NEXT: fmaxnm s20, s21, s20 +; NOFULLFP16-NEXT: fcvt s21, h22 +; NOFULLFP16-NEXT: mov h22, v3.h[3] +; 
NOFULLFP16-NEXT: fmaxnm s6, s7, s6 +; NOFULLFP16-NEXT: mov h7, v2.h[3] +; NOFULLFP16-NEXT: mov h25, v1.h[6] +; NOFULLFP16-NEXT: fcvt h4, s4 +; NOFULLFP16-NEXT: fmaxnm s5, s16, s5 +; NOFULLFP16-NEXT: fcvt s16, h23 +; NOFULLFP16-NEXT: mov h23, v1.h[3] +; NOFULLFP16-NEXT: fmaxnm s17, s18, s17 +; NOFULLFP16-NEXT: fcvt s18, h19 +; NOFULLFP16-NEXT: fcvt h6, s6 +; NOFULLFP16-NEXT: fcvt s7, h7 +; NOFULLFP16-NEXT: fcvt h19, s5 +; NOFULLFP16-NEXT: fcvt h5, s20 +; NOFULLFP16-NEXT: fmaxnm s16, s16, s21 +; NOFULLFP16-NEXT: fcvt s20, h23 +; NOFULLFP16-NEXT: fcvt h17, s17 +; NOFULLFP16-NEXT: mov h21, v2.h[4] +; NOFULLFP16-NEXT: mov h23, v1.h[4] +; NOFULLFP16-NEXT: mov v4.h[1], v6.h[0] +; NOFULLFP16-NEXT: fcvt s6, h22 +; NOFULLFP16-NEXT: mov h22, v0.h[4] +; NOFULLFP16-NEXT: fmaxnm s7, s18, s7 +; NOFULLFP16-NEXT: mov h18, v3.h[4] +; NOFULLFP16-NEXT: mov v5.h[1], v19.h[0] +; NOFULLFP16-NEXT: fcvt h16, s16 +; NOFULLFP16-NEXT: fmaxnm s6, s20, s6 +; NOFULLFP16-NEXT: mov v4.h[2], v17.h[0] +; NOFULLFP16-NEXT: fcvt s17, h21 +; NOFULLFP16-NEXT: fcvt s19, h22 +; NOFULLFP16-NEXT: fcvt h7, s7 +; NOFULLFP16-NEXT: fcvt s18, h18 +; NOFULLFP16-NEXT: fcvt s20, h23 +; NOFULLFP16-NEXT: mov h21, v2.h[5] +; NOFULLFP16-NEXT: mov h22, v0.h[5] +; NOFULLFP16-NEXT: mov v5.h[2], v16.h[0] +; NOFULLFP16-NEXT: mov h16, v3.h[5] +; NOFULLFP16-NEXT: mov h23, v1.h[5] +; NOFULLFP16-NEXT: fcvt h6, s6 +; NOFULLFP16-NEXT: mov h0, v0.h[7] +; NOFULLFP16-NEXT: mov h1, v1.h[7] +; NOFULLFP16-NEXT: fmaxnm s17, s19, s17 +; NOFULLFP16-NEXT: mov h19, v2.h[6] +; NOFULLFP16-NEXT: mov v4.h[3], v7.h[0] +; NOFULLFP16-NEXT: fmaxnm s18, s20, s18 +; NOFULLFP16-NEXT: mov h20, v3.h[6] +; NOFULLFP16-NEXT: fcvt s7, h21 +; NOFULLFP16-NEXT: fcvt s21, h22 +; NOFULLFP16-NEXT: fcvt s22, h24 +; NOFULLFP16-NEXT: mov h2, v2.h[7] +; NOFULLFP16-NEXT: mov v5.h[3], v6.h[0] +; NOFULLFP16-NEXT: fcvt s6, h16 +; NOFULLFP16-NEXT: fcvt s16, h23 +; NOFULLFP16-NEXT: fcvt h17, s17 +; NOFULLFP16-NEXT: fcvt s19, h19 +; NOFULLFP16-NEXT: fcvt s23, h25 +; 
NOFULLFP16-NEXT: fcvt h18, s18 +; NOFULLFP16-NEXT: fcvt s20, h20 +; NOFULLFP16-NEXT: mov h3, v3.h[7] +; NOFULLFP16-NEXT: fmaxnm s7, s21, s7 +; NOFULLFP16-NEXT: fcvt s2, h2 +; NOFULLFP16-NEXT: fcvt s0, h0 +; NOFULLFP16-NEXT: fmaxnm s6, s16, s6 +; NOFULLFP16-NEXT: fcvt s1, h1 +; NOFULLFP16-NEXT: mov v4.h[4], v17.h[0] +; NOFULLFP16-NEXT: fmaxnm s16, s22, s19 +; NOFULLFP16-NEXT: mov v5.h[4], v18.h[0] +; NOFULLFP16-NEXT: fmaxnm s17, s23, s20 +; NOFULLFP16-NEXT: fcvt s3, h3 +; NOFULLFP16-NEXT: fcvt h7, s7 +; NOFULLFP16-NEXT: fmaxnm s0, s0, s2 +; NOFULLFP16-NEXT: fcvt h6, s6 +; NOFULLFP16-NEXT: fcvt h2, s16 +; NOFULLFP16-NEXT: fmaxnm s1, s1, s3 +; NOFULLFP16-NEXT: mov v4.h[5], v7.h[0] +; NOFULLFP16-NEXT: fcvt h0, s0 +; NOFULLFP16-NEXT: mov v5.h[5], v6.h[0] +; NOFULLFP16-NEXT: fcvt h6, s17 +; NOFULLFP16-NEXT: fcvt h1, s1 +; NOFULLFP16-NEXT: mov v4.h[6], v2.h[0] +; NOFULLFP16-NEXT: mov v5.h[6], v6.h[0] +; NOFULLFP16-NEXT: mov v4.h[7], v0.h[0] +; NOFULLFP16-NEXT: mov v5.h[7], v1.h[0] +; NOFULLFP16-NEXT: mov v0.16b, v4.16b +; NOFULLFP16-NEXT: mov v1.16b, v5.16b +; NOFULLFP16-NEXT: ret entry: %c = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> %a, <16 x half> %b) ret <16 x half> %c @@ -922,112 +1826,413 @@ entry: ;;;;;;;;;;;;;;;;;; min_f16 define half @min_f16(half %a, half %b) { -; AARCH64-LABEL: min_f16: -; AARCH64: // %bb.0: // %entry -; AARCH64-NEXT: fminnm h1, h1, h1 -; AARCH64-NEXT: fminnm h0, h0, h0 -; AARCH64-NEXT: fminnm h0, h0, h1 -; AARCH64-NEXT: ret +; FULLFP16-LABEL: min_f16: +; FULLFP16: // %bb.0: // %entry +; FULLFP16-NEXT: fminnm h1, h1, h1 +; FULLFP16-NEXT: fminnm h0, h0, h0 +; FULLFP16-NEXT: fminnm h0, h0, h1 +; FULLFP16-NEXT: ret +; +; NOFULLFP16-LABEL: min_f16: +; NOFULLFP16: // %bb.0: // %entry +; NOFULLFP16-NEXT: fcvt s1, h1 +; NOFULLFP16-NEXT: fcvt s0, h0 +; NOFULLFP16-NEXT: fminnm s0, s0, s1 +; NOFULLFP16-NEXT: fcvt h0, s0 +; NOFULLFP16-NEXT: ret entry: %c = call half @llvm.minimumnum.f16(half %a, half %b) ret half %c } define <2 x half> 
@min_v2f16(<2 x half> %a, <2 x half> %b) { -; AARCH64-LABEL: min_v2f16: -; AARCH64: // %bb.0: // %entry -; AARCH64-NEXT: fminnm v1.4h, v1.4h, v1.4h -; AARCH64-NEXT: fminnm v0.4h, v0.4h, v0.4h -; AARCH64-NEXT: fminnm v0.4h, v0.4h, v1.4h -; AARCH64-NEXT: ret +; FULLFP16-LABEL: min_v2f16: +; FULLFP16: // %bb.0: // %entry +; FULLFP16-NEXT: fminnm v1.4h, v1.4h, v1.4h +; FULLFP16-NEXT: fminnm v0.4h, v0.4h, v0.4h +; FULLFP16-NEXT: fminnm v0.4h, v0.4h, v1.4h +; FULLFP16-NEXT: ret +; +; NOFULLFP16-LABEL: min_v2f16: +; NOFULLFP16: // %bb.0: // %entry +; NOFULLFP16-NEXT: // kill: def $d1 killed $d1 def $q1 +; NOFULLFP16-NEXT: // kill: def $d0 killed $d0 def $q0 +; NOFULLFP16-NEXT: mov h2, v1.h[1] +; NOFULLFP16-NEXT: mov h3, v0.h[1] +; NOFULLFP16-NEXT: mov h4, v1.h[2] +; NOFULLFP16-NEXT: mov h5, v0.h[2] +; NOFULLFP16-NEXT: fcvt s6, h1 +; NOFULLFP16-NEXT: fcvt s7, h0 +; NOFULLFP16-NEXT: mov h1, v1.h[3] +; NOFULLFP16-NEXT: fcvt s2, h2 +; NOFULLFP16-NEXT: fcvt s3, h3 +; NOFULLFP16-NEXT: fcvt s1, h1 +; NOFULLFP16-NEXT: fminnm s2, s3, s2 +; NOFULLFP16-NEXT: fcvt s3, h4 +; NOFULLFP16-NEXT: fcvt s4, h5 +; NOFULLFP16-NEXT: fminnm s5, s7, s6 +; NOFULLFP16-NEXT: mov h6, v0.h[3] +; NOFULLFP16-NEXT: fminnm s3, s4, s3 +; NOFULLFP16-NEXT: fcvt h2, s2 +; NOFULLFP16-NEXT: fcvt h0, s5 +; NOFULLFP16-NEXT: fcvt s4, h6 +; NOFULLFP16-NEXT: mov v0.h[1], v2.h[0] +; NOFULLFP16-NEXT: fcvt h2, s3 +; NOFULLFP16-NEXT: fminnm s1, s4, s1 +; NOFULLFP16-NEXT: mov v0.h[2], v2.h[0] +; NOFULLFP16-NEXT: fcvt h1, s1 +; NOFULLFP16-NEXT: mov v0.h[3], v1.h[0] +; NOFULLFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOFULLFP16-NEXT: ret entry: %c = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> %a, <2 x half> %b) ret <2 x half> %c } define <4 x half> @min_v4f16(<4 x half> %a, <4 x half> %b) { -; AARCH64-LABEL: min_v4f16: -; AARCH64: // %bb.0: // %entry -; AARCH64-NEXT: fminnm v1.4h, v1.4h, v1.4h -; AARCH64-NEXT: fminnm v0.4h, v0.4h, v0.4h -; AARCH64-NEXT: fminnm v0.4h, v0.4h, v1.4h -; AARCH64-NEXT: ret +; 
FULLFP16-LABEL: min_v4f16: +; FULLFP16: // %bb.0: // %entry +; FULLFP16-NEXT: fminnm v1.4h, v1.4h, v1.4h +; FULLFP16-NEXT: fminnm v0.4h, v0.4h, v0.4h +; FULLFP16-NEXT: fminnm v0.4h, v0.4h, v1.4h +; FULLFP16-NEXT: ret +; +; NOFULLFP16-LABEL: min_v4f16: +; NOFULLFP16: // %bb.0: // %entry +; NOFULLFP16-NEXT: // kill: def $d1 killed $d1 def $q1 +; NOFULLFP16-NEXT: // kill: def $d0 killed $d0 def $q0 +; NOFULLFP16-NEXT: mov h2, v1.h[1] +; NOFULLFP16-NEXT: mov h3, v0.h[1] +; NOFULLFP16-NEXT: mov h4, v1.h[2] +; NOFULLFP16-NEXT: mov h5, v0.h[2] +; NOFULLFP16-NEXT: fcvt s6, h1 +; NOFULLFP16-NEXT: fcvt s7, h0 +; NOFULLFP16-NEXT: mov h1, v1.h[3] +; NOFULLFP16-NEXT: fcvt s2, h2 +; NOFULLFP16-NEXT: fcvt s3, h3 +; NOFULLFP16-NEXT: fcvt s1, h1 +; NOFULLFP16-NEXT: fminnm s2, s3, s2 +; NOFULLFP16-NEXT: fcvt s3, h4 +; NOFULLFP16-NEXT: fcvt s4, h5 +; NOFULLFP16-NEXT: fminnm s5, s7, s6 +; NOFULLFP16-NEXT: mov h6, v0.h[3] +; NOFULLFP16-NEXT: fminnm s3, s4, s3 +; NOFULLFP16-NEXT: fcvt h2, s2 +; NOFULLFP16-NEXT: fcvt h0, s5 +; NOFULLFP16-NEXT: fcvt s4, h6 +; NOFULLFP16-NEXT: mov v0.h[1], v2.h[0] +; NOFULLFP16-NEXT: fcvt h2, s3 +; NOFULLFP16-NEXT: fminnm s1, s4, s1 +; NOFULLFP16-NEXT: mov v0.h[2], v2.h[0] +; NOFULLFP16-NEXT: fcvt h1, s1 +; NOFULLFP16-NEXT: mov v0.h[3], v1.h[0] +; NOFULLFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOFULLFP16-NEXT: ret entry: %c = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> %a, <4 x half> %b) ret <4 x half> %c } define <8 x half> @min_v8f16(<8 x half> %a, <8 x half> %b) { -; AARCH64-LABEL: min_v8f16: -; AARCH64: // %bb.0: // %entry -; AARCH64-NEXT: fminnm v1.8h, v1.8h, v1.8h -; AARCH64-NEXT: fminnm v0.8h, v0.8h, v0.8h -; AARCH64-NEXT: fminnm v0.8h, v0.8h, v1.8h -; AARCH64-NEXT: ret +; FULLFP16-LABEL: min_v8f16: +; FULLFP16: // %bb.0: // %entry +; FULLFP16-NEXT: fminnm v1.8h, v1.8h, v1.8h +; FULLFP16-NEXT: fminnm v0.8h, v0.8h, v0.8h +; FULLFP16-NEXT: fminnm v0.8h, v0.8h, v1.8h +; FULLFP16-NEXT: ret +; +; NOFULLFP16-LABEL: min_v8f16: +; 
NOFULLFP16: // %bb.0: // %entry +; NOFULLFP16-NEXT: mov h2, v1.h[1] +; NOFULLFP16-NEXT: mov h3, v0.h[1] +; NOFULLFP16-NEXT: fcvt s4, h1 +; NOFULLFP16-NEXT: fcvt s5, h0 +; NOFULLFP16-NEXT: mov h6, v1.h[2] +; NOFULLFP16-NEXT: mov h7, v0.h[2] +; NOFULLFP16-NEXT: mov h16, v1.h[3] +; NOFULLFP16-NEXT: fcvt s2, h2 +; NOFULLFP16-NEXT: fcvt s3, h3 +; NOFULLFP16-NEXT: fminnm s4, s5, s4 +; NOFULLFP16-NEXT: mov h5, v0.h[3] +; NOFULLFP16-NEXT: fcvt s6, h6 +; NOFULLFP16-NEXT: fcvt s7, h7 +; NOFULLFP16-NEXT: fcvt s16, h16 +; NOFULLFP16-NEXT: fminnm s3, s3, s2 +; NOFULLFP16-NEXT: fcvt s5, h5 +; NOFULLFP16-NEXT: fcvt h2, s4 +; NOFULLFP16-NEXT: fminnm s4, s7, s6 +; NOFULLFP16-NEXT: mov h6, v1.h[4] +; NOFULLFP16-NEXT: mov h7, v0.h[4] +; NOFULLFP16-NEXT: fcvt h3, s3 +; NOFULLFP16-NEXT: fminnm s5, s5, s16 +; NOFULLFP16-NEXT: mov h16, v0.h[5] +; NOFULLFP16-NEXT: fcvt h4, s4 +; NOFULLFP16-NEXT: mov v2.h[1], v3.h[0] +; NOFULLFP16-NEXT: fcvt s3, h6 +; NOFULLFP16-NEXT: fcvt s6, h7 +; NOFULLFP16-NEXT: mov h7, v1.h[5] +; NOFULLFP16-NEXT: fcvt h5, s5 +; NOFULLFP16-NEXT: fcvt s16, h16 +; NOFULLFP16-NEXT: mov v2.h[2], v4.h[0] +; NOFULLFP16-NEXT: mov h4, v1.h[6] +; NOFULLFP16-NEXT: fminnm s3, s6, s3 +; NOFULLFP16-NEXT: mov h6, v0.h[6] +; NOFULLFP16-NEXT: fcvt s7, h7 +; NOFULLFP16-NEXT: mov h1, v1.h[7] +; NOFULLFP16-NEXT: mov h0, v0.h[7] +; NOFULLFP16-NEXT: mov v2.h[3], v5.h[0] +; NOFULLFP16-NEXT: fcvt s4, h4 +; NOFULLFP16-NEXT: fcvt h3, s3 +; NOFULLFP16-NEXT: fcvt s5, h6 +; NOFULLFP16-NEXT: fminnm s6, s16, s7 +; NOFULLFP16-NEXT: fcvt s1, h1 +; NOFULLFP16-NEXT: fcvt s0, h0 +; NOFULLFP16-NEXT: mov v2.h[4], v3.h[0] +; NOFULLFP16-NEXT: fminnm s4, s5, s4 +; NOFULLFP16-NEXT: fcvt h3, s6 +; NOFULLFP16-NEXT: fminnm s0, s0, s1 +; NOFULLFP16-NEXT: mov v2.h[5], v3.h[0] +; NOFULLFP16-NEXT: fcvt h3, s4 +; NOFULLFP16-NEXT: fcvt h0, s0 +; NOFULLFP16-NEXT: mov v2.h[6], v3.h[0] +; NOFULLFP16-NEXT: mov v2.h[7], v0.h[0] +; NOFULLFP16-NEXT: mov v0.16b, v2.16b +; NOFULLFP16-NEXT: ret entry: %c = call <8 x half> 
@llvm.minimumnum.v8f16(<8 x half> %a, <8 x half> %b) ret <8 x half> %c } define <9 x half> @min_v9f16(<9 x half> %a, <9 x half> %b) { -; AARCH64-LABEL: min_v9f16: -; AARCH64: // %bb.0: // %entry -; AARCH64-NEXT: // kill: def $h0 killed $h0 def $q0 -; AARCH64-NEXT: // kill: def $h1 killed $h1 def $q1 -; AARCH64-NEXT: // kill: def $h2 killed $h2 def $q2 -; AARCH64-NEXT: add x9, sp, #16 -; AARCH64-NEXT: // kill: def $h3 killed $h3 def $q3 -; AARCH64-NEXT: // kill: def $h4 killed $h4 def $q4 -; AARCH64-NEXT: // kill: def $h5 killed $h5 def $q5 -; AARCH64-NEXT: // kill: def $h6 killed $h6 def $q6 -; AARCH64-NEXT: // kill: def $h7 killed $h7 def $q7 -; AARCH64-NEXT: mov v0.h[1], v1.h[0] -; AARCH64-NEXT: ldr h1, [sp, #8] -; AARCH64-NEXT: ld1 { v1.h }[1], [x9] -; AARCH64-NEXT: add x9, sp, #24 -; AARCH64-NEXT: mov v0.h[2], v2.h[0] -; AARCH64-NEXT: ldr h2, [sp] -; AARCH64-NEXT: ld1 { v1.h }[2], [x9] -; AARCH64-NEXT: add x9, sp, #32 -; AARCH64-NEXT: fminnm v2.8h, v2.8h, v2.8h -; AARCH64-NEXT: mov v0.h[3], v3.h[0] -; AARCH64-NEXT: ld1 { v1.h }[3], [x9] -; AARCH64-NEXT: add x9, sp, #40 -; AARCH64-NEXT: ldr h3, [sp, #72] -; AARCH64-NEXT: ld1 { v1.h }[4], [x9] -; AARCH64-NEXT: add x9, sp, #48 -; AARCH64-NEXT: fminnm v3.8h, v3.8h, v3.8h -; AARCH64-NEXT: mov v0.h[4], v4.h[0] -; AARCH64-NEXT: ld1 { v1.h }[5], [x9] -; AARCH64-NEXT: add x9, sp, #56 -; AARCH64-NEXT: fminnm v2.8h, v2.8h, v3.8h -; AARCH64-NEXT: mov v0.h[5], v5.h[0] -; AARCH64-NEXT: ld1 { v1.h }[6], [x9] -; AARCH64-NEXT: add x9, sp, #64 -; AARCH64-NEXT: str h2, [x8, #16] -; AARCH64-NEXT: mov v0.h[6], v6.h[0] -; AARCH64-NEXT: ld1 { v1.h }[7], [x9] -; AARCH64-NEXT: fminnm v1.8h, v1.8h, v1.8h -; AARCH64-NEXT: mov v0.h[7], v7.h[0] -; AARCH64-NEXT: fminnm v0.8h, v0.8h, v0.8h -; AARCH64-NEXT: fminnm v0.8h, v0.8h, v1.8h -; AARCH64-NEXT: str q0, [x8] -; AARCH64-NEXT: ret +; FULLFP16-LABEL: min_v9f16: +; FULLFP16: // %bb.0: // %entry +; FULLFP16-NEXT: // kill: def $h0 killed $h0 def $q0 +; FULLFP16-NEXT: // kill: def $h1 killed 
$h1 def $q1 +; FULLFP16-NEXT: // kill: def $h2 killed $h2 def $q2 +; FULLFP16-NEXT: add x9, sp, #16 +; FULLFP16-NEXT: // kill: def $h3 killed $h3 def $q3 +; FULLFP16-NEXT: // kill: def $h4 killed $h4 def $q4 +; FULLFP16-NEXT: // kill: def $h5 killed $h5 def $q5 +; FULLFP16-NEXT: // kill: def $h6 killed $h6 def $q6 +; FULLFP16-NEXT: // kill: def $h7 killed $h7 def $q7 +; FULLFP16-NEXT: mov v0.h[1], v1.h[0] +; FULLFP16-NEXT: ldr h1, [sp, #8] +; FULLFP16-NEXT: ld1 { v1.h }[1], [x9] +; FULLFP16-NEXT: add x9, sp, #24 +; FULLFP16-NEXT: mov v0.h[2], v2.h[0] +; FULLFP16-NEXT: ldr h2, [sp] +; FULLFP16-NEXT: ld1 { v1.h }[2], [x9] +; FULLFP16-NEXT: add x9, sp, #32 +; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v2.8h +; FULLFP16-NEXT: mov v0.h[3], v3.h[0] +; FULLFP16-NEXT: ld1 { v1.h }[3], [x9] +; FULLFP16-NEXT: add x9, sp, #40 +; FULLFP16-NEXT: ldr h3, [sp, #72] +; FULLFP16-NEXT: ld1 { v1.h }[4], [x9] +; FULLFP16-NEXT: add x9, sp, #48 +; FULLFP16-NEXT: fminnm v3.8h, v3.8h, v3.8h +; FULLFP16-NEXT: mov v0.h[4], v4.h[0] +; FULLFP16-NEXT: ld1 { v1.h }[5], [x9] +; FULLFP16-NEXT: add x9, sp, #56 +; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v3.8h +; FULLFP16-NEXT: mov v0.h[5], v5.h[0] +; FULLFP16-NEXT: ld1 { v1.h }[6], [x9] +; FULLFP16-NEXT: add x9, sp, #64 +; FULLFP16-NEXT: str h2, [x8, #16] +; FULLFP16-NEXT: mov v0.h[6], v6.h[0] +; FULLFP16-NEXT: ld1 { v1.h }[7], [x9] +; FULLFP16-NEXT: fminnm v1.8h, v1.8h, v1.8h +; FULLFP16-NEXT: mov v0.h[7], v7.h[0] +; FULLFP16-NEXT: fminnm v0.8h, v0.8h, v0.8h +; FULLFP16-NEXT: fminnm v0.8h, v0.8h, v1.8h +; FULLFP16-NEXT: str q0, [x8] +; FULLFP16-NEXT: ret +; +; NOFULLFP16-LABEL: min_v9f16: +; NOFULLFP16: // %bb.0: // %entry +; NOFULLFP16-NEXT: ldr h16, [sp, #16] +; NOFULLFP16-NEXT: ldr h17, [sp, #8] +; NOFULLFP16-NEXT: fcvt s1, h1 +; NOFULLFP16-NEXT: fcvt s0, h0 +; NOFULLFP16-NEXT: ldr h18, [sp, #24] +; NOFULLFP16-NEXT: fcvt s2, h2 +; NOFULLFP16-NEXT: fcvt s16, h16 +; NOFULLFP16-NEXT: fcvt s17, h17 +; NOFULLFP16-NEXT: fcvt s3, h3 +; NOFULLFP16-NEXT: fcvt s18, 
h18 +; NOFULLFP16-NEXT: fcvt s4, h4 +; NOFULLFP16-NEXT: fcvt s5, h5 +; NOFULLFP16-NEXT: fminnm s1, s1, s16 +; NOFULLFP16-NEXT: fminnm s0, s0, s17 +; NOFULLFP16-NEXT: ldr h16, [sp, #32] +; NOFULLFP16-NEXT: fminnm s2, s2, s18 +; NOFULLFP16-NEXT: ldr h17, [sp, #40] +; NOFULLFP16-NEXT: fcvt s16, h16 +; NOFULLFP16-NEXT: fcvt s17, h17 +; NOFULLFP16-NEXT: fcvt h1, s1 +; NOFULLFP16-NEXT: fcvt h0, s0 +; NOFULLFP16-NEXT: fcvt h2, s2 +; NOFULLFP16-NEXT: fminnm s3, s3, s16 +; NOFULLFP16-NEXT: fminnm s4, s4, s17 +; NOFULLFP16-NEXT: mov v0.h[1], v1.h[0] +; NOFULLFP16-NEXT: ldr h1, [sp, #48] +; NOFULLFP16-NEXT: fcvt s1, h1 +; NOFULLFP16-NEXT: fcvt h3, s3 +; NOFULLFP16-NEXT: fcvt h4, s4 +; NOFULLFP16-NEXT: mov v0.h[2], v2.h[0] +; NOFULLFP16-NEXT: ldr h2, [sp, #56] +; NOFULLFP16-NEXT: fminnm s1, s5, s1 +; NOFULLFP16-NEXT: fcvt s2, h2 +; NOFULLFP16-NEXT: ldr h5, [sp, #64] +; NOFULLFP16-NEXT: mov v0.h[3], v3.h[0] +; NOFULLFP16-NEXT: fcvt s3, h6 +; NOFULLFP16-NEXT: ldr h6, [sp, #72] +; NOFULLFP16-NEXT: fcvt h1, s1 +; NOFULLFP16-NEXT: fcvt s6, h6 +; NOFULLFP16-NEXT: mov v0.h[4], v4.h[0] +; NOFULLFP16-NEXT: fminnm s2, s3, s2 +; NOFULLFP16-NEXT: fcvt s3, h5 +; NOFULLFP16-NEXT: fcvt s4, h7 +; NOFULLFP16-NEXT: ldr h5, [sp] +; NOFULLFP16-NEXT: fcvt s5, h5 +; NOFULLFP16-NEXT: mov v0.h[5], v1.h[0] +; NOFULLFP16-NEXT: fcvt h1, s2 +; NOFULLFP16-NEXT: fminnm s2, s4, s3 +; NOFULLFP16-NEXT: fminnm s3, s5, s6 +; NOFULLFP16-NEXT: mov v0.h[6], v1.h[0] +; NOFULLFP16-NEXT: fcvt h1, s2 +; NOFULLFP16-NEXT: fcvt h2, s3 +; NOFULLFP16-NEXT: mov v0.h[7], v1.h[0] +; NOFULLFP16-NEXT: str h2, [x8, #16] +; NOFULLFP16-NEXT: str q0, [x8] +; NOFULLFP16-NEXT: ret entry: %c = call <9 x half> @llvm.minimumnum.v9f16(<9 x half> %a, <9 x half> %b) ret <9 x half> %c } define <16 x half> @min_v16f16(<16 x half> %a, <16 x half> %b) { -; AARCH64-LABEL: min_v16f16: -; AARCH64: // %bb.0: // %entry -; AARCH64-NEXT: fminnm v2.8h, v2.8h, v2.8h -; AARCH64-NEXT: fminnm v0.8h, v0.8h, v0.8h -; AARCH64-NEXT: fminnm v3.8h, v3.8h, v3.8h 
-; AARCH64-NEXT: fminnm v1.8h, v1.8h, v1.8h -; AARCH64-NEXT: fminnm v0.8h, v0.8h, v2.8h -; AARCH64-NEXT: fminnm v1.8h, v1.8h, v3.8h -; AARCH64-NEXT: ret +; FULLFP16-LABEL: min_v16f16: +; FULLFP16: // %bb.0: // %entry +; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v2.8h +; FULLFP16-NEXT: fminnm v0.8h, v0.8h, v0.8h +; FULLFP16-NEXT: fminnm v3.8h, v3.8h, v3.8h +; FULLFP16-NEXT: fminnm v1.8h, v1.8h, v1.8h +; FULLFP16-NEXT: fminnm v0.8h, v0.8h, v2.8h +; FULLFP16-NEXT: fminnm v1.8h, v1.8h, v3.8h +; FULLFP16-NEXT: ret +; +; NOFULLFP16-LABEL: min_v16f16: +; NOFULLFP16: // %bb.0: // %entry +; NOFULLFP16-NEXT: mov h6, v2.h[1] +; NOFULLFP16-NEXT: mov h7, v0.h[1] +; NOFULLFP16-NEXT: fcvt s4, h2 +; NOFULLFP16-NEXT: fcvt s5, h0 +; NOFULLFP16-NEXT: mov h16, v3.h[1] +; NOFULLFP16-NEXT: mov h17, v1.h[1] +; NOFULLFP16-NEXT: mov h18, v2.h[2] +; NOFULLFP16-NEXT: mov h19, v0.h[2] +; NOFULLFP16-NEXT: fcvt s20, h3 +; NOFULLFP16-NEXT: fcvt s21, h1 +; NOFULLFP16-NEXT: mov h22, v3.h[2] +; NOFULLFP16-NEXT: mov h23, v1.h[2] +; NOFULLFP16-NEXT: fcvt s6, h6 +; NOFULLFP16-NEXT: fcvt s7, h7 +; NOFULLFP16-NEXT: mov h24, v0.h[6] +; NOFULLFP16-NEXT: fminnm s4, s5, s4 +; NOFULLFP16-NEXT: fcvt s5, h16 +; NOFULLFP16-NEXT: fcvt s16, h17 +; NOFULLFP16-NEXT: fcvt s17, h18 +; NOFULLFP16-NEXT: fcvt s18, h19 +; NOFULLFP16-NEXT: mov h19, v0.h[3] +; NOFULLFP16-NEXT: fminnm s20, s21, s20 +; NOFULLFP16-NEXT: fcvt s21, h22 +; NOFULLFP16-NEXT: mov h22, v3.h[3] +; NOFULLFP16-NEXT: fminnm s6, s7, s6 +; NOFULLFP16-NEXT: mov h7, v2.h[3] +; NOFULLFP16-NEXT: mov h25, v1.h[6] +; NOFULLFP16-NEXT: fcvt h4, s4 +; NOFULLFP16-NEXT: fminnm s5, s16, s5 +; NOFULLFP16-NEXT: fcvt s16, h23 +; NOFULLFP16-NEXT: mov h23, v1.h[3] +; NOFULLFP16-NEXT: fminnm s17, s18, s17 +; NOFULLFP16-NEXT: fcvt s18, h19 +; NOFULLFP16-NEXT: fcvt h6, s6 +; NOFULLFP16-NEXT: fcvt s7, h7 +; NOFULLFP16-NEXT: fcvt h19, s5 +; NOFULLFP16-NEXT: fcvt h5, s20 +; NOFULLFP16-NEXT: fminnm s16, s16, s21 +; NOFULLFP16-NEXT: fcvt s20, h23 +; NOFULLFP16-NEXT: fcvt h17, s17 +; 
NOFULLFP16-NEXT: mov h21, v2.h[4] +; NOFULLFP16-NEXT: mov h23, v1.h[4] +; NOFULLFP16-NEXT: mov v4.h[1], v6.h[0] +; NOFULLFP16-NEXT: fcvt s6, h22 +; NOFULLFP16-NEXT: mov h22, v0.h[4] +; NOFULLFP16-NEXT: fminnm s7, s18, s7 +; NOFULLFP16-NEXT: mov h18, v3.h[4] +; NOFULLFP16-NEXT: mov v5.h[1], v19.h[0] +; NOFULLFP16-NEXT: fcvt h16, s16 +; NOFULLFP16-NEXT: fminnm s6, s20, s6 +; NOFULLFP16-NEXT: mov v4.h[2], v17.h[0] +; NOFULLFP16-NEXT: fcvt s17, h21 +; NOFULLFP16-NEXT: fcvt s19, h22 +; NOFULLFP16-NEXT: fcvt h7, s7 +; NOFULLFP16-NEXT: fcvt s18, h18 +; NOFULLFP16-NEXT: fcvt s20, h23 +; NOFULLFP16-NEXT: mov h21, v2.h[5] +; NOFULLFP16-NEXT: mov h22, v0.h[5] +; NOFULLFP16-NEXT: mov v5.h[2], v16.h[0] +; NOFULLFP16-NEXT: mov h16, v3.h[5] +; NOFULLFP16-NEXT: mov h23, v1.h[5] +; NOFULLFP16-NEXT: fcvt h6, s6 +; NOFULLFP16-NEXT: mov h0, v0.h[7] +; NOFULLFP16-NEXT: mov h1, v1.h[7] +; NOFULLFP16-NEXT: fminnm s17, s19, s17 +; NOFULLFP16-NEXT: mov h19, v2.h[6] +; NOFULLFP16-NEXT: mov v4.h[3], v7.h[0] +; NOFULLFP16-NEXT: fminnm s18, s20, s18 +; NOFULLFP16-NEXT: mov h20, v3.h[6] +; NOFULLFP16-NEXT: fcvt s7, h21 +; NOFULLFP16-NEXT: fcvt s21, h22 +; NOFULLFP16-NEXT: fcvt s22, h24 +; NOFULLFP16-NEXT: mov h2, v2.h[7] +; NOFULLFP16-NEXT: mov v5.h[3], v6.h[0] +; NOFULLFP16-NEXT: fcvt s6, h16 +; NOFULLFP16-NEXT: fcvt s16, h23 +; NOFULLFP16-NEXT: fcvt h17, s17 +; NOFULLFP16-NEXT: fcvt s19, h19 +; NOFULLFP16-NEXT: fcvt s23, h25 +; NOFULLFP16-NEXT: fcvt h18, s18 +; NOFULLFP16-NEXT: fcvt s20, h20 +; NOFULLFP16-NEXT: mov h3, v3.h[7] +; NOFULLFP16-NEXT: fminnm s7, s21, s7 +; NOFULLFP16-NEXT: fcvt s2, h2 +; NOFULLFP16-NEXT: fcvt s0, h0 +; NOFULLFP16-NEXT: fminnm s6, s16, s6 +; NOFULLFP16-NEXT: fcvt s1, h1 +; NOFULLFP16-NEXT: mov v4.h[4], v17.h[0] +; NOFULLFP16-NEXT: fminnm s16, s22, s19 +; NOFULLFP16-NEXT: mov v5.h[4], v18.h[0] +; NOFULLFP16-NEXT: fminnm s17, s23, s20 +; NOFULLFP16-NEXT: fcvt s3, h3 +; NOFULLFP16-NEXT: fcvt h7, s7 +; NOFULLFP16-NEXT: fminnm s0, s0, s2 +; NOFULLFP16-NEXT: fcvt h6, s6 
+; NOFULLFP16-NEXT: fcvt h2, s16 +; NOFULLFP16-NEXT: fminnm s1, s1, s3 +; NOFULLFP16-NEXT: mov v4.h[5], v7.h[0] +; NOFULLFP16-NEXT: fcvt h0, s0 +; NOFULLFP16-NEXT: mov v5.h[5], v6.h[0] +; NOFULLFP16-NEXT: fcvt h6, s17 +; NOFULLFP16-NEXT: fcvt h1, s1 +; NOFULLFP16-NEXT: mov v4.h[6], v2.h[0] +; NOFULLFP16-NEXT: mov v5.h[6], v6.h[0] +; NOFULLFP16-NEXT: mov v4.h[7], v0.h[0] +; NOFULLFP16-NEXT: mov v5.h[7], v1.h[0] +; NOFULLFP16-NEXT: mov v0.16b, v4.16b +; NOFULLFP16-NEXT: mov v1.16b, v5.16b +; NOFULLFP16-NEXT: ret entry: %c = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> %a, <16 x half> %b) ret <16 x half> %c From 4c97c5131f9ca32ce644a0be6e3586077ee03aa6 Mon Sep 17 00:00:00 2001 From: Sudharsan Veeravalli Date: Wed, 16 Apr 2025 06:48:56 +0530 Subject: [PATCH 063/710] [RISCV] Add ISel patterns for Xqcilia instructions (#135724) This patch adds instruction selection patterns for generating the long immediate arithmetic instructions. We prefer generating instructions that have a 26 bit immediate to a 32 bit immediate given that both are of the same size but the former might be easier to register allocate for. Base RISC-V arithmetic instructions will be preferred, when applicable. 
--- llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td | 30 ++++++ llvm/test/CodeGen/RISCV/xqcilia.ll | 108 ++++++++++++++++++++ 2 files changed, 138 insertions(+) create mode 100644 llvm/test/CodeGen/RISCV/xqcilia.ll diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index 8eaa5e394a91c..2458bda80b1d6 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -118,6 +118,12 @@ def simm20_li : RISCVOp { def simm26 : RISCVSImmLeafOp<26>; +def simm26_nosimm12 : ImmLeaf(Imm) && !isInt<12>(Imm);}]>; + +def simm32_nosimm26 : ImmLeaf(Imm) && !isInt<26>(Imm);}]>; + class BareSImmNAsmOperand : ImmAsmOperand<"BareS", width, ""> { let PredicateMethod = "isBareSimmN<" # width # ">"; @@ -1223,5 +1229,29 @@ def PseudoQC_E_SW : PseudoStore<"qc.e.sw">; // Code Gen Patterns //===----------------------------------------------------------------------===// +/// Generic pattern classes + +class PatGprNoX0Simm26NoSimm12 + : Pat<(i32 (OpNode (i32 GPRNoX0:$rs1), simm26_nosimm12:$imm)), + (Inst GPRNoX0:$rs1, simm26_nosimm12:$imm)>; + +class PatGprNoX0Simm32NoSimm26 + : Pat<(i32 (OpNode (i32 GPRNoX0:$rs1), simm32_nosimm26:$imm)), + (Inst GPRNoX0:$rs1, simm32_nosimm26:$imm)>; + +/// Simple arithmetic operations + +let Predicates = [HasVendorXqcilia, IsRV32] in { +def : PatGprNoX0Simm32NoSimm26; +def : PatGprNoX0Simm32NoSimm26; +def : PatGprNoX0Simm32NoSimm26; +def : PatGprNoX0Simm32NoSimm26; + +def : PatGprNoX0Simm26NoSimm12; +def : PatGprNoX0Simm26NoSimm12; +def : PatGprNoX0Simm26NoSimm12; +def : PatGprNoX0Simm26NoSimm12; +} // Predicates = [HasVendorXqcilia, IsRV32] + let Predicates = [HasVendorXqciint, IsRV32] in def : Pat<(riscv_mileaveret_glue), (QC_C_MILEAVERET)>; diff --git a/llvm/test/CodeGen/RISCV/xqcilia.ll b/llvm/test/CodeGen/RISCV/xqcilia.ll new file mode 100644 index 0000000000000..0f14044d62dc8 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/xqcilia.ll @@ -0,0 +1,108 @@ +; NOTE: 
Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; Test that we are able to generate the Xqcilia instructions +; RUN: llc < %s -mtriple=riscv32 | FileCheck %s -check-prefix=RV32I +; RUN: llc < %s -mtriple=riscv32 -mattr=+experimental-xqcilia | FileCheck %s -check-prefix=RV32XQCILIA + +define i32 @add(i32 %a, i32 %b) { +; RV32I-LABEL: add: +; RV32I: # %bb.0: +; RV32I-NEXT: lui a2, 65536 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: lui a2, 573 +; RV32I-NEXT: addi a2, a2, -1330 +; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: and a0, a1, a0 +; RV32I-NEXT: addi a0, a0, 13 +; RV32I-NEXT: ret +; +; RV32XQCILIA-LABEL: add: +; RV32XQCILIA: # %bb.0: +; RV32XQCILIA-NEXT: qc.e.addi a1, a1, 2345678 +; RV32XQCILIA-NEXT: qc.e.addai a0, 268435456 +; RV32XQCILIA-NEXT: and a0, a0, a1 +; RV32XQCILIA-NEXT: addi a0, a0, 13 +; RV32XQCILIA-NEXT: ret + %addai = add i32 %a, 268435456 + %add = add i32 %b, 2345678 + %and = and i32 %add, %addai + %res = add i32 %and, 13 + ret i32 %res +} + +define i32 @and(i32 %a, i32 %b) { +; RV32I-LABEL: and: +; RV32I: # %bb.0: +; RV32I-NEXT: lui a2, 65536 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: lui a2, 573 +; RV32I-NEXT: addi a2, a2, -1330 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: srl a0, a1, a0 +; RV32I-NEXT: andi a0, a0, 10 +; RV32I-NEXT: ret +; +; RV32XQCILIA-LABEL: and: +; RV32XQCILIA: # %bb.0: +; RV32XQCILIA-NEXT: qc.e.andi a1, a1, 2345678 +; RV32XQCILIA-NEXT: qc.e.andai a0, 268435456 +; RV32XQCILIA-NEXT: srl a0, a1, a0 +; RV32XQCILIA-NEXT: andi a0, a0, 10 +; RV32XQCILIA-NEXT: ret + %andai = and i32 %a, 268435456 + %and = and i32 %b, 2345678 + %srl = lshr i32 %and, %andai + %res = and i32 %srl, 10 + ret i32 %res +} + +define i32 @or(i32 %a, i32 %b) { +; RV32I-LABEL: or: +; RV32I: # %bb.0: +; RV32I-NEXT: lui a2, 65536 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: lui a2, 573 +; RV32I-NEXT: addi a2, a2, -1330 +; RV32I-NEXT: or a1, a1, a2 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: ori a0, a0, 13 +; 
RV32I-NEXT: ret +; +; RV32XQCILIA-LABEL: or: +; RV32XQCILIA: # %bb.0: +; RV32XQCILIA-NEXT: qc.e.ori a1, a1, 2345678 +; RV32XQCILIA-NEXT: qc.e.orai a0, 268435456 +; RV32XQCILIA-NEXT: add a0, a0, a1 +; RV32XQCILIA-NEXT: ori a0, a0, 13 +; RV32XQCILIA-NEXT: ret + %orai = or i32 %a, 268435456 + %or = or i32 %b, 2345678 + %add = add i32 %or, %orai + %res = or i32 %add, 13 + ret i32 %res +} + +define i32 @xor(i32 %a, i32 %b) { +; RV32I-LABEL: xor: +; RV32I: # %bb.0: +; RV32I-NEXT: lui a2, 65536 +; RV32I-NEXT: xor a0, a0, a2 +; RV32I-NEXT: lui a2, 573 +; RV32I-NEXT: addi a2, a2, -1330 +; RV32I-NEXT: xor a1, a1, a2 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: xori a0, a0, 13 +; RV32I-NEXT: ret +; +; RV32XQCILIA-LABEL: xor: +; RV32XQCILIA: # %bb.0: +; RV32XQCILIA-NEXT: qc.e.xori a1, a1, 2345678 +; RV32XQCILIA-NEXT: qc.e.xorai a0, 268435456 +; RV32XQCILIA-NEXT: add a0, a0, a1 +; RV32XQCILIA-NEXT: xori a0, a0, 13 +; RV32XQCILIA-NEXT: ret + %xorai = xor i32 %a, 268435456 + %xor = xor i32 %b, 2345678 + %add = add i32 %xor, %xorai + %res = xor i32 %add, 13 + ret i32 %res +} From 0ce8ad68e44aaf50d1e2aa304fa8a1127e311e1d Mon Sep 17 00:00:00 2001 From: Koakuma Date: Wed, 16 Apr 2025 08:33:42 +0700 Subject: [PATCH 064/710] [SPARC] Use fzero/fzeros to materialize FP zeros when we have VIS Reviewers: rorth, brad0, s-barannikov Reviewed By: s-barannikov Pull Request: https://github.com/llvm/llvm-project/pull/135712 --- llvm/lib/Target/Sparc/SparcISelLowering.cpp | 6 + llvm/lib/Target/Sparc/SparcISelLowering.h | 3 + llvm/lib/Target/Sparc/SparcInstrVIS.td | 17 ++- llvm/test/CodeGen/SPARC/float-constants.ll | 115 ++++++++++++++++++++ 4 files changed, 139 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp index 85b8750d40f46..bce8ddbd47586 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp +++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp @@ -3560,6 +3560,12 @@ bool 
SparcTargetLowering::useLoadStackGuardNode(const Module &M) const { return true; } +bool SparcTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, + bool ForCodeSize) const { + return Subtarget->isVIS() && (VT == MVT::f32 || VT == MVT::f64) && + Imm.isZero(); +} + // Override to disable global variable loading on Linux. void SparcTargetLowering::insertSSPDeclarations(Module &M) const { if (!Subtarget->isTargetLinux()) diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.h b/llvm/lib/Target/Sparc/SparcISelLowering.h index 1bee5f4cfe84d..c09e465f5d05e 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.h +++ b/llvm/lib/Target/Sparc/SparcISelLowering.h @@ -207,6 +207,9 @@ namespace llvm { return VT != MVT::f128; } + bool isFPImmLegal(const APFloat &Imm, EVT VT, + bool ForCodeSize) const override; + bool shouldInsertFencesForAtomic(const Instruction *I) const override { // FIXME: We insert fences for each atomics and generate // sub-optimal code for PSO/TSO. (Approximately nobody uses any diff --git a/llvm/lib/Target/Sparc/SparcInstrVIS.td b/llvm/lib/Target/Sparc/SparcInstrVIS.td index 8ce8f37f34040..7be45fe9faf3f 100644 --- a/llvm/lib/Target/Sparc/SparcInstrVIS.td +++ b/llvm/lib/Target/Sparc/SparcInstrVIS.td @@ -45,10 +45,10 @@ class VISInst2 opfval, string OpcStr, RegisterClass RC = DFPRegs> !strconcat(OpcStr, " $rs2, $rd")>; // For VIS Instructions with only rd operand. -let Constraints = "$rd = $f", rs1 = 0, rs2 = 0 in +let rs1 = 0, rs2 = 0 in class VISInstD opfval, string OpcStr, RegisterClass RC = DFPRegs> : VISInstFormat; // VIS 1 Instructions @@ -277,3 +277,16 @@ def UMULXHI : VISInst<0b000010110, "umulxhi", I64Regs>; def XMULX : VISInst<0b100010101, "xmulx", I64Regs>; def XMULXHI : VISInst<0b100010110, "xmulxhi", I64Regs>; } // Predicates = [IsVIS3] + +// FP immediate patterns. +def fpimm0 : FPImmLeaf; +def fpnegimm0 : FPImmLeaf; + +// VIS instruction patterns. +let Predicates = [HasVIS] in { +// Zero immediate. 
+def : Pat<(f64 fpimm0), (FZERO)>; +def : Pat<(f32 fpimm0), (FZEROS)>; +def : Pat<(f64 fpnegimm0), (FNEGD (FZERO))>; +def : Pat<(f32 fpnegimm0), (FNEGS (FZEROS))>; +} // Predicates = [HasVIS] diff --git a/llvm/test/CodeGen/SPARC/float-constants.ll b/llvm/test/CodeGen/SPARC/float-constants.ll index b04ec68ed3d7e..440c75bfca9f9 100644 --- a/llvm/test/CodeGen/SPARC/float-constants.ll +++ b/llvm/test/CodeGen/SPARC/float-constants.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc < %s -mtriple=sparc | FileCheck %s ; RUN: llc < %s -mtriple=sparcel | FileCheck %s --check-prefix=CHECK-LE +; RUN: llc < %s -mtriple=sparcv9 -mattr=+vis | FileCheck %s --check-prefix=CHECK-VIS ;; Bitcast should not do a runtime conversion, but rather emit a ;; constant into integer registers directly. @@ -17,6 +18,12 @@ define <2 x i32> @bitcast() nounwind { ; CHECK-LE-NEXT: sethi 1049856, %o1 ; CHECK-LE-NEXT: retl ; CHECK-LE-NEXT: mov %g0, %o0 +; +; CHECK-VIS-LABEL: bitcast: +; CHECK-VIS: ! %bb.0: +; CHECK-VIS-NEXT: sethi 1049856, %o0 +; CHECK-VIS-NEXT: retl +; CHECK-VIS-NEXT: mov %g0, %o1 %1 = bitcast double 5.0 to <2 x i32> ret <2 x i32> %1 } @@ -43,6 +50,17 @@ define void @test_call() nounwind { ; CHECK-LE-NEXT: mov %g0, %o0 ; CHECK-LE-NEXT: ret ; CHECK-LE-NEXT: restore +; +; CHECK-VIS-LABEL: test_call: +; CHECK-VIS: ! %bb.0: +; CHECK-VIS-NEXT: save %sp, -176, %sp +; CHECK-VIS-NEXT: sethi %h44(.LCPI1_0), %i0 +; CHECK-VIS-NEXT: add %i0, %m44(.LCPI1_0), %i0 +; CHECK-VIS-NEXT: sllx %i0, 12, %i0 +; CHECK-VIS-NEXT: call a +; CHECK-VIS-NEXT: ldd [%i0+%l44(.LCPI1_0)], %f0 +; CHECK-VIS-NEXT: ret +; CHECK-VIS-NEXT: restore call void @a(double 5.0) ret void } @@ -75,6 +93,103 @@ define double @test_intrins_call() nounwind { ; CHECK-LE-NEXT: mov %o1, %o3 ; CHECK-LE-NEXT: ret ; CHECK-LE-NEXT: restore +; +; CHECK-VIS-LABEL: test_intrins_call: +; CHECK-VIS: ! 
%bb.0: +; CHECK-VIS-NEXT: save %sp, -176, %sp +; CHECK-VIS-NEXT: sethi %h44(.LCPI2_0), %i0 +; CHECK-VIS-NEXT: add %i0, %m44(.LCPI2_0), %i0 +; CHECK-VIS-NEXT: sllx %i0, 12, %i0 +; CHECK-VIS-NEXT: ldd [%i0+%l44(.LCPI2_0)], %f0 +; CHECK-VIS-NEXT: fmovd %f0, %f2 +; CHECK-VIS-NEXT: call pow +; CHECK-VIS-NEXT: nop +; CHECK-VIS-NEXT: ret +; CHECK-VIS-NEXT: restore %1 = call double @llvm.pow.f64(double 2.0, double 2.0) ret double %1 } + +;; When we have VIS, f32/f64 zero constant should be materialized from fzero/fzeros. + +define double @pos_zero_double() nounwind { +; CHECK-LABEL: pos_zero_double: +; CHECK: ! %bb.0: +; CHECK-NEXT: sethi %hi(.LCPI3_0), %o0 +; CHECK-NEXT: retl +; CHECK-NEXT: ldd [%o0+%lo(.LCPI3_0)], %f0 +; +; CHECK-LE-LABEL: pos_zero_double: +; CHECK-LE: ! %bb.0: +; CHECK-LE-NEXT: sethi %hi(.LCPI3_0), %o0 +; CHECK-LE-NEXT: retl +; CHECK-LE-NEXT: ldd [%o0+%lo(.LCPI3_0)], %f0 +; +; CHECK-VIS-LABEL: pos_zero_double: +; CHECK-VIS: ! %bb.0: +; CHECK-VIS-NEXT: retl +; CHECK-VIS-NEXT: fzero %f0 + ret double +0.0 +} + +define double @neg_zero_double() nounwind { +; CHECK-LABEL: neg_zero_double: +; CHECK: ! %bb.0: +; CHECK-NEXT: sethi %hi(.LCPI4_0), %o0 +; CHECK-NEXT: retl +; CHECK-NEXT: ldd [%o0+%lo(.LCPI4_0)], %f0 +; +; CHECK-LE-LABEL: neg_zero_double: +; CHECK-LE: ! %bb.0: +; CHECK-LE-NEXT: sethi %hi(.LCPI4_0), %o0 +; CHECK-LE-NEXT: retl +; CHECK-LE-NEXT: ldd [%o0+%lo(.LCPI4_0)], %f0 +; +; CHECK-VIS-LABEL: neg_zero_double: +; CHECK-VIS: ! %bb.0: +; CHECK-VIS-NEXT: fzero %f0 +; CHECK-VIS-NEXT: retl +; CHECK-VIS-NEXT: fnegd %f0, %f0 + ret double -0.0 +} + +define float @pos_zero_float() nounwind { +; CHECK-LABEL: pos_zero_float: +; CHECK: ! %bb.0: +; CHECK-NEXT: sethi %hi(.LCPI5_0), %o0 +; CHECK-NEXT: retl +; CHECK-NEXT: ld [%o0+%lo(.LCPI5_0)], %f0 +; +; CHECK-LE-LABEL: pos_zero_float: +; CHECK-LE: ! 
%bb.0: +; CHECK-LE-NEXT: sethi %hi(.LCPI5_0), %o0 +; CHECK-LE-NEXT: retl +; CHECK-LE-NEXT: ld [%o0+%lo(.LCPI5_0)], %f0 +; +; CHECK-VIS-LABEL: pos_zero_float: +; CHECK-VIS: ! %bb.0: +; CHECK-VIS-NEXT: retl +; CHECK-VIS-NEXT: fzeros %f0 + ret float +0.0 +} + +define float @neg_zero_float() nounwind { +; CHECK-LABEL: neg_zero_float: +; CHECK: ! %bb.0: +; CHECK-NEXT: sethi %hi(.LCPI6_0), %o0 +; CHECK-NEXT: retl +; CHECK-NEXT: ld [%o0+%lo(.LCPI6_0)], %f0 +; +; CHECK-LE-LABEL: neg_zero_float: +; CHECK-LE: ! %bb.0: +; CHECK-LE-NEXT: sethi %hi(.LCPI6_0), %o0 +; CHECK-LE-NEXT: retl +; CHECK-LE-NEXT: ld [%o0+%lo(.LCPI6_0)], %f0 +; +; CHECK-VIS-LABEL: neg_zero_float: +; CHECK-VIS: ! %bb.0: +; CHECK-VIS-NEXT: fzeros %f0 +; CHECK-VIS-NEXT: retl +; CHECK-VIS-NEXT: fnegs %f0, %f0 + ret float -0.0 +} From f3de63c64998eb46db9cf26aca9ebcc5453f6f44 Mon Sep 17 00:00:00 2001 From: Koakuma Date: Wed, 16 Apr 2025 08:36:47 +0700 Subject: [PATCH 065/710] [SPARC] Use addxccc to do multiword addition when we have VIS3 Reviewers: brad0, s-barannikov, rorth Reviewed By: s-barannikov Pull Request: https://github.com/llvm/llvm-project/pull/135713 --- llvm/lib/Target/Sparc/SparcISelLowering.cpp | 5 + llvm/lib/Target/Sparc/SparcInstr64Bit.td | 2 + llvm/lib/Target/Sparc/SparcInstrVIS.td | 5 + llvm/test/CodeGen/SPARC/2011-01-11-CC.ll | 118 ++++++++++++++++++++ 4 files changed, 130 insertions(+) diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp index bce8ddbd47586..098e5f22834f4 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp +++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp @@ -1737,6 +1737,11 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SUBC, MVT::i32, Legal); setOperationAction(ISD::SUBE, MVT::i32, Legal); + if (Subtarget->isVIS3()) { + setOperationAction(ISD::ADDC, MVT::i64, Legal); + setOperationAction(ISD::ADDE, MVT::i64, Legal); + } + if (Subtarget->is64Bit()) { 
setOperationAction(ISD::BITCAST, MVT::f64, Expand); setOperationAction(ISD::BITCAST, MVT::i64, Expand); diff --git a/llvm/lib/Target/Sparc/SparcInstr64Bit.td b/llvm/lib/Target/Sparc/SparcInstr64Bit.td index 56fab2f26a19e..000612534e89d 100644 --- a/llvm/lib/Target/Sparc/SparcInstr64Bit.td +++ b/llvm/lib/Target/Sparc/SparcInstr64Bit.td @@ -157,9 +157,11 @@ def : Pat<(and i64:$lhs, (not i64:$rhs)), (ANDNrr $lhs, $rhs)>; def : Pat<(or i64:$lhs, (not i64:$rhs)), (ORNrr $lhs, $rhs)>; def : Pat<(not (xor i64:$lhs, i64:$rhs)), (XNORrr $lhs, $rhs)>; +def : Pat<(addc i64:$lhs, i64:$rhs), (ADDCCrr $lhs, $rhs)>, Requires<[HasVIS3]>; def : Pat<(add i64:$lhs, i64:$rhs), (ADDrr $lhs, $rhs)>; def : Pat<(sub i64:$lhs, i64:$rhs), (SUBrr $lhs, $rhs)>; +def : Pat<(addc i64:$lhs, (i64 simm13:$rhs)), (ADDCCri $lhs, imm:$rhs)>, Requires<[HasVIS3]>; def : Pat<(add i64:$lhs, (i64 simm13:$rhs)), (ADDri $lhs, imm:$rhs)>; def : Pat<(sub i64:$lhs, (i64 simm13:$rhs)), (SUBri $lhs, imm:$rhs)>; diff --git a/llvm/lib/Target/Sparc/SparcInstrVIS.td b/llvm/lib/Target/Sparc/SparcInstrVIS.td index 7be45fe9faf3f..ee24d8a54fe8e 100644 --- a/llvm/lib/Target/Sparc/SparcInstrVIS.td +++ b/llvm/lib/Target/Sparc/SparcInstrVIS.td @@ -290,3 +290,8 @@ def : Pat<(f32 fpimm0), (FZEROS)>; def : Pat<(f64 fpnegimm0), (FNEGD (FZERO))>; def : Pat<(f32 fpnegimm0), (FNEGS (FZEROS))>; } // Predicates = [HasVIS] + +// VIS3 instruction patterns. 
+let Predicates = [HasVIS3] in { +def : Pat<(i64 (adde i64:$lhs, i64:$rhs)), (ADDXCCC $lhs, $rhs)>; +} // Predicates = [HasVIS3] diff --git a/llvm/test/CodeGen/SPARC/2011-01-11-CC.ll b/llvm/test/CodeGen/SPARC/2011-01-11-CC.ll index 1560bc687b7dd..e05c47bfee766 100644 --- a/llvm/test/CodeGen/SPARC/2011-01-11-CC.ll +++ b/llvm/test/CodeGen/SPARC/2011-01-11-CC.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=sparc %s -o - | FileCheck %s -check-prefix=V8 ; RUN: llc -mtriple=sparc -mattr=v9 %s -o - | FileCheck %s -check-prefix=V9 ; RUN: llc -mtriple=sparc64-unknown-linux %s -o - | FileCheck %s -check-prefix=SPARC64 +; RUN: llc -mtriple=sparc64-unknown-linux -mattr=vis3 %s -o - | FileCheck %s -check-prefix=SPARC64-VIS3 define i32 @test_addx(i64 %a, i64 %b, i64 %c) nounwind { ; V8-LABEL: test_addx: @@ -60,6 +61,15 @@ define i32 @test_addx(i64 %a, i64 %b, i64 %c) nounwind { ; SPARC64-NEXT: movgu %xcc, 1, %o3 ; SPARC64-NEXT: retl ; SPARC64-NEXT: srl %o3, 0, %o0 +; +; SPARC64-VIS3-LABEL: test_addx: +; SPARC64-VIS3: ! %bb.0: ! %entry +; SPARC64-VIS3-NEXT: mov %g0, %o3 +; SPARC64-VIS3-NEXT: add %o0, %o1, %o0 +; SPARC64-VIS3-NEXT: cmp %o0, %o2 +; SPARC64-VIS3-NEXT: movgu %xcc, 1, %o3 +; SPARC64-VIS3-NEXT: retl +; SPARC64-VIS3-NEXT: srl %o3, 0, %o0 entry: %0 = add i64 %a, %b %1 = icmp ugt i64 %0, %c @@ -92,6 +102,13 @@ define i32 @test_select_int_icc(i32 %a, i32 %b, i32 %c) nounwind { ; SPARC64-NEXT: move %icc, %o1, %o2 ; SPARC64-NEXT: retl ; SPARC64-NEXT: mov %o2, %o0 +; +; SPARC64-VIS3-LABEL: test_select_int_icc: +; SPARC64-VIS3: ! %bb.0: ! %entry +; SPARC64-VIS3-NEXT: cmp %o0, 0 +; SPARC64-VIS3-NEXT: move %icc, %o1, %o2 +; SPARC64-VIS3-NEXT: retl +; SPARC64-VIS3-NEXT: mov %o2, %o0 entry: %0 = icmp eq i32 %a, 0 %1 = select i1 %0, i32 %b, i32 %c @@ -133,6 +150,13 @@ define float @test_select_fp_icc(i32 %a, float %f1, float %f2) nounwind { ; SPARC64-NEXT: cmp %o0, 0 ; SPARC64-NEXT: retl ; SPARC64-NEXT: fmovse %icc, %f3, %f0 +; +; SPARC64-VIS3-LABEL: test_select_fp_icc: +; SPARC64-VIS3: ! 
%bb.0: ! %entry +; SPARC64-VIS3-NEXT: fmovs %f5, %f0 +; SPARC64-VIS3-NEXT: cmp %o0, 0 +; SPARC64-VIS3-NEXT: retl +; SPARC64-VIS3-NEXT: fmovse %icc, %f3, %f0 entry: %0 = icmp eq i32 %a, 0 %1 = select i1 %0, float %f1, float %f2 @@ -182,6 +206,13 @@ define double @test_select_dfp_icc(i32 %a, double %f1, double %f2) nounwind { ; SPARC64-NEXT: cmp %o0, 0 ; SPARC64-NEXT: retl ; SPARC64-NEXT: fmovde %icc, %f2, %f0 +; +; SPARC64-VIS3-LABEL: test_select_dfp_icc: +; SPARC64-VIS3: ! %bb.0: ! %entry +; SPARC64-VIS3-NEXT: fmovd %f4, %f0 +; SPARC64-VIS3-NEXT: cmp %o0, 0 +; SPARC64-VIS3-NEXT: retl +; SPARC64-VIS3-NEXT: fmovde %icc, %f2, %f0 entry: %0 = icmp eq i32 %a, 0 %1 = select i1 %0, double %f1, double %f2 @@ -229,6 +260,17 @@ define i32 @test_select_int_fcc(float %f, i32 %a, i32 %b) nounwind { ; SPARC64-NEXT: fcmps %fcc0, %f1, %f0 ; SPARC64-NEXT: retl ; SPARC64-NEXT: movne %fcc0, %o1, %o0 +; +; SPARC64-VIS3-LABEL: test_select_int_fcc: +; SPARC64-VIS3: ! %bb.0: ! %entry +; SPARC64-VIS3-NEXT: sethi %h44(.LCPI4_0), %o0 +; SPARC64-VIS3-NEXT: add %o0, %m44(.LCPI4_0), %o0 +; SPARC64-VIS3-NEXT: sllx %o0, 12, %o0 +; SPARC64-VIS3-NEXT: ld [%o0+%l44(.LCPI4_0)], %f0 +; SPARC64-VIS3-NEXT: mov %o2, %o0 +; SPARC64-VIS3-NEXT: fcmps %fcc0, %f1, %f0 +; SPARC64-VIS3-NEXT: retl +; SPARC64-VIS3-NEXT: movne %fcc0, %o1, %o0 entry: %0 = fcmp une float %f, 0.000000e+00 %a.b = select i1 %0, i32 %a, i32 %b @@ -284,6 +326,17 @@ define float @test_select_fp_fcc(float %f, float %f1, float %f2) nounwind { ; SPARC64-NEXT: fcmps %fcc0, %f1, %f2 ; SPARC64-NEXT: retl ; SPARC64-NEXT: fmovsne %fcc0, %f3, %f0 +; +; SPARC64-VIS3-LABEL: test_select_fp_fcc: +; SPARC64-VIS3: ! %bb.0: ! 
%entry +; SPARC64-VIS3-NEXT: sethi %h44(.LCPI5_0), %o0 +; SPARC64-VIS3-NEXT: add %o0, %m44(.LCPI5_0), %o0 +; SPARC64-VIS3-NEXT: sllx %o0, 12, %o0 +; SPARC64-VIS3-NEXT: ld [%o0+%l44(.LCPI5_0)], %f2 +; SPARC64-VIS3-NEXT: fmovs %f5, %f0 +; SPARC64-VIS3-NEXT: fcmps %fcc0, %f1, %f2 +; SPARC64-VIS3-NEXT: retl +; SPARC64-VIS3-NEXT: fmovsne %fcc0, %f3, %f0 entry: %0 = fcmp une float %f, 0.000000e+00 %1 = select i1 %0, float %f1, float %f2 @@ -352,6 +405,18 @@ define double @test_select_dfp_fcc(double %f, double %f1, double %f2) nounwind { ; SPARC64-NEXT: fmovd %f4, %f0 ; SPARC64-NEXT: retl ; SPARC64-NEXT: nop +; +; SPARC64-VIS3-LABEL: test_select_dfp_fcc: +; SPARC64-VIS3: ! %bb.0: ! %entry +; SPARC64-VIS3-NEXT: sethi %h44(.LCPI6_0), %o0 +; SPARC64-VIS3-NEXT: add %o0, %m44(.LCPI6_0), %o0 +; SPARC64-VIS3-NEXT: sllx %o0, 12, %o0 +; SPARC64-VIS3-NEXT: ldd [%o0+%l44(.LCPI6_0)], %f6 +; SPARC64-VIS3-NEXT: fcmpd %fcc0, %f0, %f6 +; SPARC64-VIS3-NEXT: fmovdne %fcc0, %f2, %f4 +; SPARC64-VIS3-NEXT: fmovd %f4, %f0 +; SPARC64-VIS3-NEXT: retl +; SPARC64-VIS3-NEXT: nop entry: %0 = fcmp une double %f, 0.000000e+00 %1 = select i1 %0, double %f1, double %f2 @@ -453,6 +518,31 @@ define i32 @test_float_cc(double %a, double %b, i32 %c, i32 %d) nounwind { ; SPARC64-NEXT: ! %bb.4: ! %exit.0 ; SPARC64-NEXT: retl ; SPARC64-NEXT: mov %g0, %o0 +; +; SPARC64-VIS3-LABEL: test_float_cc: +; SPARC64-VIS3: ! %bb.0: ! %entry +; SPARC64-VIS3-NEXT: sethi %h44(.LCPI7_0), %o0 +; SPARC64-VIS3-NEXT: add %o0, %m44(.LCPI7_0), %o0 +; SPARC64-VIS3-NEXT: sllx %o0, 12, %o0 +; SPARC64-VIS3-NEXT: ldd [%o0+%l44(.LCPI7_0)], %f4 +; SPARC64-VIS3-NEXT: fcmpd %fcc0, %f0, %f4 +; SPARC64-VIS3-NEXT: fbuge %fcc0, .LBB7_3 +; SPARC64-VIS3-NEXT: nop +; SPARC64-VIS3-NEXT: ! %bb.1: ! %loop.2 +; SPARC64-VIS3-NEXT: fcmpd %fcc0, %f2, %f4 +; SPARC64-VIS3-NEXT: fbule %fcc0, .LBB7_3 +; SPARC64-VIS3-NEXT: nop +; SPARC64-VIS3-NEXT: ! %bb.2: ! %exit.1 +; SPARC64-VIS3-NEXT: retl +; SPARC64-VIS3-NEXT: mov 1, %o0 +; SPARC64-VIS3-NEXT: .LBB7_3: ! 
%loop +; SPARC64-VIS3-NEXT: ! =>This Inner Loop Header: Depth=1 +; SPARC64-VIS3-NEXT: cmp %o2, 10 +; SPARC64-VIS3-NEXT: be %icc, .LBB7_3 +; SPARC64-VIS3-NEXT: nop +; SPARC64-VIS3-NEXT: ! %bb.4: ! %exit.0 +; SPARC64-VIS3-NEXT: retl +; SPARC64-VIS3-NEXT: mov %g0, %o0 entry: %0 = fcmp uge double %a, 0.000000e+00 br i1 %0, label %loop, label %loop.2 @@ -558,6 +648,34 @@ define void @test_adde_sube(ptr %a, ptr %b, ptr %sum, ptr %diff) nounwind { ; SPARC64-NEXT: stx %i0, [%i3] ; SPARC64-NEXT: ret ; SPARC64-NEXT: restore +; +; SPARC64-VIS3-LABEL: test_adde_sube: +; SPARC64-VIS3: .register %g2, #scratch +; SPARC64-VIS3-NEXT: ! %bb.0: ! %entry +; SPARC64-VIS3-NEXT: save %sp, -128, %sp +; SPARC64-VIS3-NEXT: ldx [%i0+8], %i4 +; SPARC64-VIS3-NEXT: ldx [%i0], %i5 +; SPARC64-VIS3-NEXT: ldx [%i1+8], %g2 +; SPARC64-VIS3-NEXT: ldx [%i1], %i1 +; SPARC64-VIS3-NEXT: addcc %i4, %g2, %g2 +; SPARC64-VIS3-NEXT: addxccc %i5, %i1, %i1 +; SPARC64-VIS3-NEXT: stx %i1, [%i2] +; SPARC64-VIS3-NEXT: stx %g2, [%i2+8] +; SPARC64-VIS3-NEXT: !APP +; SPARC64-VIS3-NEXT: !NO_APP +; SPARC64-VIS3-NEXT: ldx [%i0+8], %i1 +; SPARC64-VIS3-NEXT: mov %g0, %i2 +; SPARC64-VIS3-NEXT: ldx [%i0], %i0 +; SPARC64-VIS3-NEXT: cmp %i4, %i1 +; SPARC64-VIS3-NEXT: movcs %xcc, 1, %i2 +; SPARC64-VIS3-NEXT: srl %i2, 0, %i2 +; SPARC64-VIS3-NEXT: sub %i5, %i0, %i0 +; SPARC64-VIS3-NEXT: sub %i0, %i2, %i0 +; SPARC64-VIS3-NEXT: sub %i4, %i1, %i1 +; SPARC64-VIS3-NEXT: stx %i1, [%i3+8] +; SPARC64-VIS3-NEXT: stx %i0, [%i3] +; SPARC64-VIS3-NEXT: ret +; SPARC64-VIS3-NEXT: restore entry: %0 = bitcast ptr %a to ptr %1 = bitcast ptr %b to ptr From e4f2191f568db718ed67defa664f83f763e7e74a Mon Sep 17 00:00:00 2001 From: Koakuma Date: Wed, 16 Apr 2025 08:38:29 +0700 Subject: [PATCH 066/710] [SPARC] Use umulxhi to do extending 64x64->128 multiply when we have VIS3 Reviewers: s-barannikov, rorth, brad0 Reviewed By: s-barannikov Pull Request: https://github.com/llvm/llvm-project/pull/135714 --- llvm/lib/Target/Sparc/SparcISelLowering.cpp | 6 +- 
llvm/lib/Target/Sparc/SparcInstrVIS.td | 10 ++++ llvm/test/CodeGen/SPARC/multiply-extension.ll | 59 +++++++++++++++++++ .../SPARC/smulo-128-legalisation-lowering.ll | 44 ++++++++++++++ .../SPARC/umulo-128-legalisation-lowering.ll | 33 +++++++++++ 5 files changed, 150 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/SPARC/multiply-extension.ll diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp index 098e5f22834f4..0ad261135651f 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp +++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp @@ -1854,8 +1854,10 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM, if (Subtarget->is64Bit()) { setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); - setOperationAction(ISD::MULHU, MVT::i64, Expand); - setOperationAction(ISD::MULHS, MVT::i64, Expand); + setOperationAction(ISD::MULHU, MVT::i64, + Subtarget->isVIS3() ? Legal : Expand); + setOperationAction(ISD::MULHS, MVT::i64, + Subtarget->isVIS3() ? Legal : Expand); setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand); setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand); diff --git a/llvm/lib/Target/Sparc/SparcInstrVIS.td b/llvm/lib/Target/Sparc/SparcInstrVIS.td index ee24d8a54fe8e..d9fe3b49821e5 100644 --- a/llvm/lib/Target/Sparc/SparcInstrVIS.td +++ b/llvm/lib/Target/Sparc/SparcInstrVIS.td @@ -294,4 +294,14 @@ def : Pat<(f32 fpnegimm0), (FNEGS (FZEROS))>; // VIS3 instruction patterns. let Predicates = [HasVIS3] in { def : Pat<(i64 (adde i64:$lhs, i64:$rhs)), (ADDXCCC $lhs, $rhs)>; + +def : Pat<(i64 (mulhu i64:$lhs, i64:$rhs)), (UMULXHI $lhs, $rhs)>; +// Signed "MULXHI". +// Based on the formula presented in OSA2011 §7.140, but with bitops to select +// the values to be added. +// TODO: This expansion should probably be moved to DAG legalization phase. 
+def : Pat<(i64 (mulhs i64:$lhs, i64:$rhs)), + (SUBrr (UMULXHI $lhs, $rhs), + (ADDrr (ANDrr (SRAXri $lhs, 63), $rhs), + (ANDrr (SRAXri $rhs, 63), $lhs)))>; } // Predicates = [HasVIS3] diff --git a/llvm/test/CodeGen/SPARC/multiply-extension.ll b/llvm/test/CodeGen/SPARC/multiply-extension.ll new file mode 100644 index 0000000000000..4d752ff101ca2 --- /dev/null +++ b/llvm/test/CodeGen/SPARC/multiply-extension.ll @@ -0,0 +1,59 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=sparcv9 | FileCheck %s -check-prefix=V9 +; RUN: llc < %s -mtriple=sparcv9 -mattr=+vis3 | FileCheck %s -check-prefix=VIS3 + +define i128 @signed_multiply_extend(i64 %0, i64 %1) nounwind { +; V9-LABEL: signed_multiply_extend: +; V9: ! %bb.0: +; V9-NEXT: save %sp, -176, %sp +; V9-NEXT: srax %i0, 63, %o2 +; V9-NEXT: srax %i1, 63, %o0 +; V9-NEXT: mov %i1, %o1 +; V9-NEXT: call __multi3 +; V9-NEXT: mov %i0, %o3 +; V9-NEXT: mov %o0, %i0 +; V9-NEXT: ret +; V9-NEXT: restore %g0, %o1, %o1 +; +; VIS3-LABEL: signed_multiply_extend: +; VIS3: ! %bb.0: +; VIS3-NEXT: srax %o0, 63, %o2 +; VIS3-NEXT: and %o2, %o1, %o2 +; VIS3-NEXT: srax %o1, 63, %o3 +; VIS3-NEXT: and %o3, %o0, %o3 +; VIS3-NEXT: add %o3, %o2, %o2 +; VIS3-NEXT: umulxhi %o1, %o0, %o3 +; VIS3-NEXT: sub %o3, %o2, %o2 +; VIS3-NEXT: mulx %o1, %o0, %o1 +; VIS3-NEXT: retl +; VIS3-NEXT: mov %o2, %o0 + %3 = sext i64 %0 to i128 + %4 = sext i64 %1 to i128 + %5 = mul nsw i128 %4, %3 + ret i128 %5 +} + +define i128 @unsigned_multiply_extend(i64 %0, i64 %1) nounwind { +; V9-LABEL: unsigned_multiply_extend: +; V9: ! %bb.0: +; V9-NEXT: save %sp, -176, %sp +; V9-NEXT: mov %g0, %o0 +; V9-NEXT: mov %i1, %o1 +; V9-NEXT: mov %g0, %o2 +; V9-NEXT: call __multi3 +; V9-NEXT: mov %i0, %o3 +; V9-NEXT: mov %o0, %i0 +; V9-NEXT: ret +; V9-NEXT: restore %g0, %o1, %o1 +; +; VIS3-LABEL: unsigned_multiply_extend: +; VIS3: ! 
%bb.0: +; VIS3-NEXT: umulxhi %o1, %o0, %o2 +; VIS3-NEXT: mulx %o1, %o0, %o1 +; VIS3-NEXT: retl +; VIS3-NEXT: mov %o2, %o0 + %3 = zext i64 %0 to i128 + %4 = zext i64 %1 to i128 + %5 = mul nuw i128 %4, %3 + ret i128 %5 +} diff --git a/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll index 07e4c408a3ff0..1e5ab7922de08 100644 --- a/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=sparc-unknown-linux-gnu | FileCheck %s --check-prefixes=SPARC ; RUN: llc < %s -mtriple=sparc64-unknown-linux-gnu | FileCheck %s --check-prefixes=SPARC64 +; RUN: llc < %s -mtriple=sparc64-unknown-linux-gnu -mattr=vis3 | FileCheck %s --check-prefixes=SPARC64-VIS3 define { i128, i8 } @muloti_test(i128 %l, i128 %r) nounwind { ; SPARC-LABEL: muloti_test: @@ -213,6 +214,49 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) nounwind { ; SPARC64-NEXT: srl %i3, 0, %i2 ; SPARC64-NEXT: ret ; SPARC64-NEXT: restore +; +; SPARC64-VIS3-LABEL: muloti_test: +; SPARC64-VIS3: .register %g2, #scratch +; SPARC64-VIS3-NEXT: .register %g3, #scratch +; SPARC64-VIS3-NEXT: ! %bb.0: ! 
%start +; SPARC64-VIS3-NEXT: save %sp, -128, %sp +; SPARC64-VIS3-NEXT: mov %g0, %i5 +; SPARC64-VIS3-NEXT: umulxhi %i0, %i3, %i4 +; SPARC64-VIS3-NEXT: srax %i0, 63, %g2 +; SPARC64-VIS3-NEXT: mulx %g2, %i3, %g3 +; SPARC64-VIS3-NEXT: add %i4, %g3, %i4 +; SPARC64-VIS3-NEXT: umulxhi %i1, %i3, %g3 +; SPARC64-VIS3-NEXT: mulx %i0, %i3, %g4 +; SPARC64-VIS3-NEXT: addcc %g4, %g3, %g3 +; SPARC64-VIS3-NEXT: addxccc %i4, %g0, %g4 +; SPARC64-VIS3-NEXT: umulxhi %i1, %i2, %i4 +; SPARC64-VIS3-NEXT: srax %i2, 63, %g5 +; SPARC64-VIS3-NEXT: mulx %i1, %g5, %l0 +; SPARC64-VIS3-NEXT: add %i4, %l0, %l0 +; SPARC64-VIS3-NEXT: mulx %i1, %i2, %i4 +; SPARC64-VIS3-NEXT: addcc %i4, %g3, %i4 +; SPARC64-VIS3-NEXT: addxccc %l0, %g0, %g3 +; SPARC64-VIS3-NEXT: srax %g3, 63, %l0 +; SPARC64-VIS3-NEXT: addcc %g4, %g3, %g3 +; SPARC64-VIS3-NEXT: srax %g4, 63, %g4 +; SPARC64-VIS3-NEXT: addxccc %g4, %l0, %g4 +; SPARC64-VIS3-NEXT: and %g5, %i0, %g5 +; SPARC64-VIS3-NEXT: and %g2, %i2, %g2 +; SPARC64-VIS3-NEXT: add %g2, %g5, %g2 +; SPARC64-VIS3-NEXT: umulxhi %i0, %i2, %g5 +; SPARC64-VIS3-NEXT: sub %g5, %g2, %g2 +; SPARC64-VIS3-NEXT: mulx %i0, %i2, %i0 +; SPARC64-VIS3-NEXT: addcc %i0, %g3, %i0 +; SPARC64-VIS3-NEXT: addxccc %g2, %g4, %i2 +; SPARC64-VIS3-NEXT: srax %i4, 63, %g2 +; SPARC64-VIS3-NEXT: xor %i2, %g2, %i2 +; SPARC64-VIS3-NEXT: xor %i0, %g2, %i0 +; SPARC64-VIS3-NEXT: or %i0, %i2, %i0 +; SPARC64-VIS3-NEXT: movrnz %i0, 1, %i5 +; SPARC64-VIS3-NEXT: mulx %i1, %i3, %i1 +; SPARC64-VIS3-NEXT: srl %i5, 0, %i2 +; SPARC64-VIS3-NEXT: ret +; SPARC64-VIS3-NEXT: restore %g0, %i4, %o0 start: %0 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %l, i128 %r) %1 = extractvalue { i128, i1 } %0, 0 diff --git a/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll index f3835790210a0..6d197c88bfecd 100644 --- a/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll @@ -1,6 +1,7 @@ 
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=sparc-unknown-linux-gnu | FileCheck %s --check-prefixes=SPARC ; RUN: llc < %s -mtriple=sparc64-unknown-linux-gnu | FileCheck %s --check-prefixes=SPARC64 +; RUN: llc < %s -mtriple=sparc64-unknown-linux-gnu -mattr=vis3 | FileCheck %s --check-prefixes=SPARC64-VIS3 define { i128, i8 } @muloti_test(i128 %l, i128 %r) nounwind { ; SPARC-LABEL: muloti_test: @@ -199,6 +200,38 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) nounwind { ; SPARC64-NEXT: srl %i1, 0, %i2 ; SPARC64-NEXT: ret ; SPARC64-NEXT: restore %g0, %o1, %o1 +; +; SPARC64-VIS3-LABEL: muloti_test: +; SPARC64-VIS3: .register %g2, #scratch +; SPARC64-VIS3-NEXT: .register %g3, #scratch +; SPARC64-VIS3-NEXT: ! %bb.0: ! %start +; SPARC64-VIS3-NEXT: save %sp, -128, %sp +; SPARC64-VIS3-NEXT: mov %g0, %i5 +; SPARC64-VIS3-NEXT: mov %g0, %g2 +; SPARC64-VIS3-NEXT: mov %g0, %g3 +; SPARC64-VIS3-NEXT: mov %g0, %g4 +; SPARC64-VIS3-NEXT: mov %g0, %g5 +; SPARC64-VIS3-NEXT: mulx %i2, %i1, %i4 +; SPARC64-VIS3-NEXT: mulx %i0, %i3, %l0 +; SPARC64-VIS3-NEXT: add %l0, %i4, %i4 +; SPARC64-VIS3-NEXT: umulxhi %i1, %i3, %l0 +; SPARC64-VIS3-NEXT: add %l0, %i4, %i4 +; SPARC64-VIS3-NEXT: cmp %i4, %l0 +; SPARC64-VIS3-NEXT: movrnz %i2, 1, %g2 +; SPARC64-VIS3-NEXT: movrnz %i0, 1, %g3 +; SPARC64-VIS3-NEXT: and %g3, %g2, %g2 +; SPARC64-VIS3-NEXT: umulxhi %i0, %i3, %i0 +; SPARC64-VIS3-NEXT: movrnz %i0, 1, %g4 +; SPARC64-VIS3-NEXT: movcs %xcc, 1, %i5 +; SPARC64-VIS3-NEXT: or %g2, %g4, %i0 +; SPARC64-VIS3-NEXT: umulxhi %i2, %i1, %i2 +; SPARC64-VIS3-NEXT: movrnz %i2, 1, %g5 +; SPARC64-VIS3-NEXT: or %i0, %g5, %i0 +; SPARC64-VIS3-NEXT: or %i0, %i5, %i0 +; SPARC64-VIS3-NEXT: mulx %i1, %i3, %i1 +; SPARC64-VIS3-NEXT: srl %i0, 0, %i2 +; SPARC64-VIS3-NEXT: ret +; SPARC64-VIS3-NEXT: restore %g0, %i4, %o0 start: %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) %1 = extractvalue { i128, i1 } %0, 0 From 
0439a4eca78fa1e3aa45b49ff349c3da4fb02f48 Mon Sep 17 00:00:00 2001 From: Jim Lin Date: Wed, 16 Apr 2025 10:16:31 +0800 Subject: [PATCH 067/710] [RISCV] Add new CondCode COND_CV_BEQIMM/COND_CV_BNEIMM for CV immediate branch (#135771) If there is another branch instruction also with immediate operand, but it is used to specify which bit to be tested is set or clear. We only check whether operand2 is immediate or not here. There are no way to distinguish between them. So add new CondCode COND_CV_BEQIMM/COND_CV_BNEIMM that we can know what kinds of immediate branch instruction are matched in Select_* Pseudo. --- .../RISCV/GISel/RISCVInstructionSelector.cpp | 2 +- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 2 +- llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 42 +++++++++---------- llvm/lib/Target/RISCV/RISCVInstrInfo.h | 6 ++- llvm/lib/Target/RISCV/RISCVInstrInfoXCV.td | 9 +++- 5 files changed, 33 insertions(+), 28 deletions(-) diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp index 18ce5407f816c..f83c2b6da8923 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp @@ -789,7 +789,7 @@ bool RISCVInstructionSelector::select(MachineInstr &MI) { RISCVCC::CondCode CC; getOperandsForBranch(MI.getOperand(0).getReg(), CC, LHS, RHS, *MRI); - auto Bcc = MIB.buildInstr(RISCVCC::getBrCond(STI, CC), {}, {LHS, RHS}) + auto Bcc = MIB.buildInstr(RISCVCC::getBrCond(CC), {}, {LHS, RHS}) .addMBB(MI.getOperand(1).getMBB()); MI.eraseFromParent(); return constrainSelectedInstRegOperands(*Bcc, TII, TRI, RBI); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index f24752b8721f5..beea13d5f8f3e 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -20645,7 +20645,7 @@ static MachineBasicBlock *emitSelectPseudo(MachineInstr &MI, // 
Insert appropriate branch. if (MI.getOperand(2).isImm()) - BuildMI(HeadMBB, DL, TII.getBrCond(CC, MI.getOperand(2).isImm())) + BuildMI(HeadMBB, DL, TII.getBrCond(CC)) .addReg(LHS) .addImm(MI.getOperand(2).getImm()) .addMBB(TailMBB); diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index f8a35533ba952..5d661a3438b1c 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -974,10 +974,6 @@ static RISCVCC::CondCode getCondFromBranchOpc(unsigned Opc) { switch (Opc) { default: return RISCVCC::COND_INVALID; - case RISCV::CV_BEQIMM: - return RISCVCC::COND_EQ; - case RISCV::CV_BNEIMM: - return RISCVCC::COND_NE; case RISCV::BEQ: return RISCVCC::COND_EQ; case RISCV::BNE: @@ -990,6 +986,10 @@ static RISCVCC::CondCode getCondFromBranchOpc(unsigned Opc) { return RISCVCC::COND_LTU; case RISCV::BGEU: return RISCVCC::COND_GEU; + case RISCV::CV_BEQIMM: + return RISCVCC::COND_CV_BEQIMM; + case RISCV::CV_BNEIMM: + return RISCVCC::COND_CV_BNEIMM; } } @@ -1027,23 +1027,14 @@ static void parseCondBranch(MachineInstr &LastInst, MachineBasicBlock *&Target, Cond.push_back(LastInst.getOperand(1)); } -unsigned RISCVCC::getBrCond(const RISCVSubtarget &STI, RISCVCC::CondCode CC, - bool Imm) { +unsigned RISCVCC::getBrCond(RISCVCC::CondCode CC) { switch (CC) { default: llvm_unreachable("Unknown condition code!"); case RISCVCC::COND_EQ: - if (!Imm) - return RISCV::BEQ; - if (STI.hasVendorXCVbi()) - return RISCV::CV_BEQIMM; - llvm_unreachable("Unknown branch immediate!"); + return RISCV::BEQ; case RISCVCC::COND_NE: - if (!Imm) - return RISCV::BNE; - if (STI.hasVendorXCVbi()) - return RISCV::CV_BNEIMM; - llvm_unreachable("Unknown branch immediate!"); + return RISCV::BNE; case RISCVCC::COND_LT: return RISCV::BLT; case RISCVCC::COND_GE: @@ -1052,12 +1043,15 @@ unsigned RISCVCC::getBrCond(const RISCVSubtarget &STI, RISCVCC::CondCode CC, return RISCV::BLTU; case RISCVCC::COND_GEU: return RISCV::BGEU; + case 
RISCVCC::COND_CV_BEQIMM: + return RISCV::CV_BEQIMM; + case RISCVCC::COND_CV_BNEIMM: + return RISCV::CV_BNEIMM; } } -const MCInstrDesc &RISCVInstrInfo::getBrCond(RISCVCC::CondCode CC, - bool Imm) const { - return get(RISCVCC::getBrCond(STI, CC, Imm)); +const MCInstrDesc &RISCVInstrInfo::getBrCond(RISCVCC::CondCode CC) const { + return get(RISCVCC::getBrCond(CC)); } RISCVCC::CondCode RISCVCC::getOppositeBranchCondition(RISCVCC::CondCode CC) { @@ -1076,6 +1070,10 @@ RISCVCC::CondCode RISCVCC::getOppositeBranchCondition(RISCVCC::CondCode CC) { return RISCVCC::COND_GEU; case RISCVCC::COND_GEU: return RISCVCC::COND_LTU; + case RISCVCC::COND_CV_BEQIMM: + return RISCVCC::COND_CV_BNEIMM; + case RISCVCC::COND_CV_BNEIMM: + return RISCVCC::COND_CV_BEQIMM; } } @@ -1206,10 +1204,8 @@ unsigned RISCVInstrInfo::insertBranch( // Either a one or two-way conditional branch. auto CC = static_cast(Cond[0].getImm()); - MachineInstr &CondMI = *BuildMI(&MBB, DL, getBrCond(CC, Cond[2].isImm())) - .add(Cond[1]) - .add(Cond[2]) - .addMBB(TBB); + MachineInstr &CondMI = + *BuildMI(&MBB, DL, getBrCond(CC)).add(Cond[1]).add(Cond[2]).addMBB(TBB); if (BytesAdded) *BytesAdded += getInstSizeInBytes(CondMI); diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h index bf0d6b2c59a45..67e457d64f6e3 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h @@ -41,11 +41,13 @@ enum CondCode { COND_GE, COND_LTU, COND_GEU, + COND_CV_BEQIMM, + COND_CV_BNEIMM, COND_INVALID }; CondCode getOppositeBranchCondition(CondCode); -unsigned getBrCond(const RISCVSubtarget &STI, CondCode CC, bool Imm = false); +unsigned getBrCond(CondCode CC); } // end of namespace RISCVCC @@ -65,7 +67,7 @@ class RISCVInstrInfo : public RISCVGenInstrInfo { explicit RISCVInstrInfo(RISCVSubtarget &STI); MCInst getNop() const override; - const MCInstrDesc &getBrCond(RISCVCC::CondCode CC, bool Imm = false) const; + const MCInstrDesc &getBrCond(RISCVCC::CondCode CC) 
const; Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXCV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXCV.td index b5df7b54fc9f1..5ce08d64b141d 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXCV.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXCV.td @@ -791,6 +791,13 @@ let Predicates = [HasVendorXCValu, IsRV32], AddedComplexity = 1 in { // Patterns for immediate branching operations //===----------------------------------------------------------------------===// +def IntCCtoRISCVCCCV : SDNodeXForm(N->getOperand(2))->get(); + assert(CC == ISD::SETEQ || CC == ISD::SETNE); + RISCVCC::CondCode BrCC = CC == ISD::SETEQ ? RISCVCC::COND_CV_BEQIMM : RISCVCC::COND_CV_BNEIMM; + return CurDAG->getTargetConstant(BrCC, SDLoc(N), Subtarget->getXLenVT()); +}]>; + let Predicates = [HasVendorXCVbi, IsRV32], AddedComplexity = 2 in { def : Pat<(riscv_brcc GPR:$rs1, simm5:$imm5, SETEQ, bb:$imm12), (CV_BEQIMM GPR:$rs1, simm5:$imm5, bare_simm13_lsb0:$imm12)>; @@ -807,7 +814,7 @@ let Predicates = [HasVendorXCVbi, IsRV32], AddedComplexity = 2 in { : Pat<(riscv_selectcc_frag:$cc (i32 GPR:$lhs), simm5:$Constant, Cond, (i32 GPR:$truev), GPR:$falsev), (Select_GPR_Using_CC_Imm GPR:$lhs, simm5:$Constant, - (IntCCtoRISCVCC $cc), GPR:$truev, GPR:$falsev)>; + (IntCCtoRISCVCCCV $cc), GPR:$truev, GPR:$falsev)>; def : Selectbi; def : Selectbi; From 5e9650ec2deb2f2bb6d5ad28e83bb6cd3c4189e4 Mon Sep 17 00:00:00 2001 From: Koakuma Date: Wed, 16 Apr 2025 09:27:17 +0700 Subject: [PATCH 068/710] Revert "[SPARC] Use umulxhi to do extending 64x64->128 multiply when we have VIS3" (#135897) This change breaks multiply tests on SPARC. 
https://lab.llvm.org/buildbot/#/builders/108/builds/11691/steps/6/logs/FAIL__LLVM__multiply-extension_ll Reverts llvm/llvm-project#135714 --- llvm/lib/Target/Sparc/SparcISelLowering.cpp | 6 +- llvm/lib/Target/Sparc/SparcInstrVIS.td | 10 ---- llvm/test/CodeGen/SPARC/multiply-extension.ll | 59 ------------------- .../SPARC/smulo-128-legalisation-lowering.ll | 44 -------------- .../SPARC/umulo-128-legalisation-lowering.ll | 33 ----------- 5 files changed, 2 insertions(+), 150 deletions(-) delete mode 100644 llvm/test/CodeGen/SPARC/multiply-extension.ll diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp index 0ad261135651f..098e5f22834f4 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp +++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp @@ -1854,10 +1854,8 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM, if (Subtarget->is64Bit()) { setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); - setOperationAction(ISD::MULHU, MVT::i64, - Subtarget->isVIS3() ? Legal : Expand); - setOperationAction(ISD::MULHS, MVT::i64, - Subtarget->isVIS3() ? Legal : Expand); + setOperationAction(ISD::MULHU, MVT::i64, Expand); + setOperationAction(ISD::MULHS, MVT::i64, Expand); setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand); setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand); diff --git a/llvm/lib/Target/Sparc/SparcInstrVIS.td b/llvm/lib/Target/Sparc/SparcInstrVIS.td index d9fe3b49821e5..ee24d8a54fe8e 100644 --- a/llvm/lib/Target/Sparc/SparcInstrVIS.td +++ b/llvm/lib/Target/Sparc/SparcInstrVIS.td @@ -294,14 +294,4 @@ def : Pat<(f32 fpnegimm0), (FNEGS (FZEROS))>; // VIS3 instruction patterns. let Predicates = [HasVIS3] in { def : Pat<(i64 (adde i64:$lhs, i64:$rhs)), (ADDXCCC $lhs, $rhs)>; - -def : Pat<(i64 (mulhu i64:$lhs, i64:$rhs)), (UMULXHI $lhs, $rhs)>; -// Signed "MULXHI". 
-// Based on the formula presented in OSA2011 §7.140, but with bitops to select -// the values to be added. -// TODO: This expansion should probably be moved to DAG legalization phase. -def : Pat<(i64 (mulhs i64:$lhs, i64:$rhs)), - (SUBrr (UMULXHI $lhs, $rhs), - (ADDrr (ANDrr (SRAXri $lhs, 63), $rhs), - (ANDrr (SRAXri $rhs, 63), $lhs)))>; } // Predicates = [HasVIS3] diff --git a/llvm/test/CodeGen/SPARC/multiply-extension.ll b/llvm/test/CodeGen/SPARC/multiply-extension.ll deleted file mode 100644 index 4d752ff101ca2..0000000000000 --- a/llvm/test/CodeGen/SPARC/multiply-extension.ll +++ /dev/null @@ -1,59 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mtriple=sparcv9 | FileCheck %s -check-prefix=V9 -; RUN: llc < %s -mtriple=sparcv9 -mattr=+vis3 | FileCheck %s -check-prefix=VIS3 - -define i128 @signed_multiply_extend(i64 %0, i64 %1) nounwind { -; V9-LABEL: signed_multiply_extend: -; V9: ! %bb.0: -; V9-NEXT: save %sp, -176, %sp -; V9-NEXT: srax %i0, 63, %o2 -; V9-NEXT: srax %i1, 63, %o0 -; V9-NEXT: mov %i1, %o1 -; V9-NEXT: call __multi3 -; V9-NEXT: mov %i0, %o3 -; V9-NEXT: mov %o0, %i0 -; V9-NEXT: ret -; V9-NEXT: restore %g0, %o1, %o1 -; -; VIS3-LABEL: signed_multiply_extend: -; VIS3: ! %bb.0: -; VIS3-NEXT: srax %o0, 63, %o2 -; VIS3-NEXT: and %o2, %o1, %o2 -; VIS3-NEXT: srax %o1, 63, %o3 -; VIS3-NEXT: and %o3, %o0, %o3 -; VIS3-NEXT: add %o3, %o2, %o2 -; VIS3-NEXT: umulxhi %o1, %o0, %o3 -; VIS3-NEXT: sub %o3, %o2, %o2 -; VIS3-NEXT: mulx %o1, %o0, %o1 -; VIS3-NEXT: retl -; VIS3-NEXT: mov %o2, %o0 - %3 = sext i64 %0 to i128 - %4 = sext i64 %1 to i128 - %5 = mul nsw i128 %4, %3 - ret i128 %5 -} - -define i128 @unsigned_multiply_extend(i64 %0, i64 %1) nounwind { -; V9-LABEL: unsigned_multiply_extend: -; V9: ! 
%bb.0: -; V9-NEXT: save %sp, -176, %sp -; V9-NEXT: mov %g0, %o0 -; V9-NEXT: mov %i1, %o1 -; V9-NEXT: mov %g0, %o2 -; V9-NEXT: call __multi3 -; V9-NEXT: mov %i0, %o3 -; V9-NEXT: mov %o0, %i0 -; V9-NEXT: ret -; V9-NEXT: restore %g0, %o1, %o1 -; -; VIS3-LABEL: unsigned_multiply_extend: -; VIS3: ! %bb.0: -; VIS3-NEXT: umulxhi %o1, %o0, %o2 -; VIS3-NEXT: mulx %o1, %o0, %o1 -; VIS3-NEXT: retl -; VIS3-NEXT: mov %o2, %o0 - %3 = zext i64 %0 to i128 - %4 = zext i64 %1 to i128 - %5 = mul nuw i128 %4, %3 - ret i128 %5 -} diff --git a/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll index 1e5ab7922de08..07e4c408a3ff0 100644 --- a/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll @@ -1,7 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=sparc-unknown-linux-gnu | FileCheck %s --check-prefixes=SPARC ; RUN: llc < %s -mtriple=sparc64-unknown-linux-gnu | FileCheck %s --check-prefixes=SPARC64 -; RUN: llc < %s -mtriple=sparc64-unknown-linux-gnu -mattr=vis3 | FileCheck %s --check-prefixes=SPARC64-VIS3 define { i128, i8 } @muloti_test(i128 %l, i128 %r) nounwind { ; SPARC-LABEL: muloti_test: @@ -214,49 +213,6 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) nounwind { ; SPARC64-NEXT: srl %i3, 0, %i2 ; SPARC64-NEXT: ret ; SPARC64-NEXT: restore -; -; SPARC64-VIS3-LABEL: muloti_test: -; SPARC64-VIS3: .register %g2, #scratch -; SPARC64-VIS3-NEXT: .register %g3, #scratch -; SPARC64-VIS3-NEXT: ! %bb.0: ! 
%start -; SPARC64-VIS3-NEXT: save %sp, -128, %sp -; SPARC64-VIS3-NEXT: mov %g0, %i5 -; SPARC64-VIS3-NEXT: umulxhi %i0, %i3, %i4 -; SPARC64-VIS3-NEXT: srax %i0, 63, %g2 -; SPARC64-VIS3-NEXT: mulx %g2, %i3, %g3 -; SPARC64-VIS3-NEXT: add %i4, %g3, %i4 -; SPARC64-VIS3-NEXT: umulxhi %i1, %i3, %g3 -; SPARC64-VIS3-NEXT: mulx %i0, %i3, %g4 -; SPARC64-VIS3-NEXT: addcc %g4, %g3, %g3 -; SPARC64-VIS3-NEXT: addxccc %i4, %g0, %g4 -; SPARC64-VIS3-NEXT: umulxhi %i1, %i2, %i4 -; SPARC64-VIS3-NEXT: srax %i2, 63, %g5 -; SPARC64-VIS3-NEXT: mulx %i1, %g5, %l0 -; SPARC64-VIS3-NEXT: add %i4, %l0, %l0 -; SPARC64-VIS3-NEXT: mulx %i1, %i2, %i4 -; SPARC64-VIS3-NEXT: addcc %i4, %g3, %i4 -; SPARC64-VIS3-NEXT: addxccc %l0, %g0, %g3 -; SPARC64-VIS3-NEXT: srax %g3, 63, %l0 -; SPARC64-VIS3-NEXT: addcc %g4, %g3, %g3 -; SPARC64-VIS3-NEXT: srax %g4, 63, %g4 -; SPARC64-VIS3-NEXT: addxccc %g4, %l0, %g4 -; SPARC64-VIS3-NEXT: and %g5, %i0, %g5 -; SPARC64-VIS3-NEXT: and %g2, %i2, %g2 -; SPARC64-VIS3-NEXT: add %g2, %g5, %g2 -; SPARC64-VIS3-NEXT: umulxhi %i0, %i2, %g5 -; SPARC64-VIS3-NEXT: sub %g5, %g2, %g2 -; SPARC64-VIS3-NEXT: mulx %i0, %i2, %i0 -; SPARC64-VIS3-NEXT: addcc %i0, %g3, %i0 -; SPARC64-VIS3-NEXT: addxccc %g2, %g4, %i2 -; SPARC64-VIS3-NEXT: srax %i4, 63, %g2 -; SPARC64-VIS3-NEXT: xor %i2, %g2, %i2 -; SPARC64-VIS3-NEXT: xor %i0, %g2, %i0 -; SPARC64-VIS3-NEXT: or %i0, %i2, %i0 -; SPARC64-VIS3-NEXT: movrnz %i0, 1, %i5 -; SPARC64-VIS3-NEXT: mulx %i1, %i3, %i1 -; SPARC64-VIS3-NEXT: srl %i5, 0, %i2 -; SPARC64-VIS3-NEXT: ret -; SPARC64-VIS3-NEXT: restore %g0, %i4, %o0 start: %0 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %l, i128 %r) %1 = extractvalue { i128, i1 } %0, 0 diff --git a/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll index 6d197c88bfecd..f3835790210a0 100644 --- a/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll @@ -1,7 +1,6 @@ 
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=sparc-unknown-linux-gnu | FileCheck %s --check-prefixes=SPARC ; RUN: llc < %s -mtriple=sparc64-unknown-linux-gnu | FileCheck %s --check-prefixes=SPARC64 -; RUN: llc < %s -mtriple=sparc64-unknown-linux-gnu -mattr=vis3 | FileCheck %s --check-prefixes=SPARC64-VIS3 define { i128, i8 } @muloti_test(i128 %l, i128 %r) nounwind { ; SPARC-LABEL: muloti_test: @@ -200,38 +199,6 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) nounwind { ; SPARC64-NEXT: srl %i1, 0, %i2 ; SPARC64-NEXT: ret ; SPARC64-NEXT: restore %g0, %o1, %o1 -; -; SPARC64-VIS3-LABEL: muloti_test: -; SPARC64-VIS3: .register %g2, #scratch -; SPARC64-VIS3-NEXT: .register %g3, #scratch -; SPARC64-VIS3-NEXT: ! %bb.0: ! %start -; SPARC64-VIS3-NEXT: save %sp, -128, %sp -; SPARC64-VIS3-NEXT: mov %g0, %i5 -; SPARC64-VIS3-NEXT: mov %g0, %g2 -; SPARC64-VIS3-NEXT: mov %g0, %g3 -; SPARC64-VIS3-NEXT: mov %g0, %g4 -; SPARC64-VIS3-NEXT: mov %g0, %g5 -; SPARC64-VIS3-NEXT: mulx %i2, %i1, %i4 -; SPARC64-VIS3-NEXT: mulx %i0, %i3, %l0 -; SPARC64-VIS3-NEXT: add %l0, %i4, %i4 -; SPARC64-VIS3-NEXT: umulxhi %i1, %i3, %l0 -; SPARC64-VIS3-NEXT: add %l0, %i4, %i4 -; SPARC64-VIS3-NEXT: cmp %i4, %l0 -; SPARC64-VIS3-NEXT: movrnz %i2, 1, %g2 -; SPARC64-VIS3-NEXT: movrnz %i0, 1, %g3 -; SPARC64-VIS3-NEXT: and %g3, %g2, %g2 -; SPARC64-VIS3-NEXT: umulxhi %i0, %i3, %i0 -; SPARC64-VIS3-NEXT: movrnz %i0, 1, %g4 -; SPARC64-VIS3-NEXT: movcs %xcc, 1, %i5 -; SPARC64-VIS3-NEXT: or %g2, %g4, %i0 -; SPARC64-VIS3-NEXT: umulxhi %i2, %i1, %i2 -; SPARC64-VIS3-NEXT: movrnz %i2, 1, %g5 -; SPARC64-VIS3-NEXT: or %i0, %g5, %i0 -; SPARC64-VIS3-NEXT: or %i0, %i5, %i0 -; SPARC64-VIS3-NEXT: mulx %i1, %i3, %i1 -; SPARC64-VIS3-NEXT: srl %i0, 0, %i2 -; SPARC64-VIS3-NEXT: ret -; SPARC64-VIS3-NEXT: restore %g0, %i4, %o0 start: %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) %1 = extractvalue { i128, i1 } %0, 0 From 
bed03ae36600f83f214c41af333f47fe8ead9ede Mon Sep 17 00:00:00 2001 From: Feng Zou Date: Wed, 16 Apr 2025 10:47:05 +0800 Subject: [PATCH 069/710] [X86] Fix the issue of creating index reg negations (#135632) The 8 and 16 bit LEA instruction support was added by PR #122102, and we have to update creating index register negations accordingly. The issue is exposed with APX NDD instructions. --- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 19 ++++- .../CodeGen/X86/apx/ndd-neg-addr-index.ll | 71 +++++++++++++++++++ 2 files changed, 88 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/X86/apx/ndd-neg-addr-index.ll diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index d322e70fc0c20..01118beb9cf5e 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -275,8 +275,23 @@ namespace { #define GET_ND_IF_ENABLED(OPC) (Subtarget->hasNDD() ? OPC##_ND : OPC) // Negate the index if needed. if (AM.NegateIndex) { - unsigned NegOpc = VT == MVT::i64 ? 
GET_ND_IF_ENABLED(X86::NEG64r) - : GET_ND_IF_ENABLED(X86::NEG32r); + unsigned NegOpc; + switch (VT.SimpleTy) { + default: + llvm_unreachable("Unsupported VT!"); + case MVT::i64: + NegOpc = GET_ND_IF_ENABLED(X86::NEG64r); + break; + case MVT::i32: + NegOpc = GET_ND_IF_ENABLED(X86::NEG32r); + break; + case MVT::i16: + NegOpc = GET_ND_IF_ENABLED(X86::NEG16r); + break; + case MVT::i8: + NegOpc = GET_ND_IF_ENABLED(X86::NEG8r); + break; + } SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32, AM.IndexReg), 0); AM.IndexReg = Neg; diff --git a/llvm/test/CodeGen/X86/apx/ndd-neg-addr-index.ll b/llvm/test/CodeGen/X86/apx/ndd-neg-addr-index.ll new file mode 100644 index 0000000000000..6679b5f58e8c1 --- /dev/null +++ b/llvm/test/CodeGen/X86/apx/ndd-neg-addr-index.ll @@ -0,0 +1,71 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding -o - | FileCheck %s --check-prefix=NDD + + +define void @neg_8bit_1(i1 %cmp) { +; NDD-LABEL: neg_8bit_1: +; NDD: # %bb.0: # %entry +; NDD-NEXT: andb $1, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0xe7,0x01] +; NDD-NEXT: movzbl 0, %ecx # encoding: [0x0f,0xb6,0x0c,0x25,0x00,0x00,0x00,0x00] +; NDD-NEXT: negb %al, %al # encoding: [0x62,0xf4,0x7c,0x18,0xf6,0xd8] +; NDD-NEXT: leab 2(%rcx,%rax), %al # encoding: [0x66,0x8d,0x44,0x01,0x02] +; NDD-NEXT: movb %al, 0 # encoding: [0x88,0x04,0x25,0x00,0x00,0x00,0x00] +; NDD-NEXT: retq # encoding: [0xc3] +entry: + %cond = select i1 %cmp, i8 1, i8 2 + %0 = load i8, ptr null, align 4 + %add = add i8 %cond, %0 + store i8 %add, ptr null, align 4 + ret void +} + +define void @neg_8bit_2(i8 %int8) { +; NDD-LABEL: neg_8bit_2: +; NDD: # %bb.0: # %entry +; NDD-NEXT: # kill: def $edi killed $edi def $rdi +; NDD-NEXT: addb %dil, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x00,0xff] +; NDD-NEXT: negb %al, %al # encoding: [0x62,0xf4,0x7c,0x18,0xf6,0xd8] +; 
NDD-NEXT: leab 1(%rdi,%rax), %al # encoding: [0x66,0x8d,0x44,0x07,0x01] +; NDD-NEXT: mulb %dil # encoding: [0x40,0xf6,0xe7] +; NDD-NEXT: testb %al, %al # encoding: [0x84,0xc0] +; NDD-NEXT: retq # encoding: [0xc3] +entry: + %0 = shl i8 %int8, 1 + %sub = sub i8 %int8, %0 + %add = add i8 %sub, 1 + %div = mul i8 %add, %int8 + %cmp = icmp slt i8 %div, 0 + br i1 %cmp, label %label2, label %label1 + +label1: ; preds = %entry + ret void + +label2: ; preds = %entry + ret void +} + +define i32 @neg_16bit(i16 %0) { +; NDD-LABEL: neg_16bit: +; NDD: # %bb.0: # %entry +; NDD-NEXT: # kill: def $edi killed $edi def $rdi +; NDD-NEXT: incw %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0xff,0xc7] +; NDD-NEXT: addw $256, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x81,0xc7,0x00,0x01] +; NDD-NEXT: # imm = 0x100 +; NDD-NEXT: testw %ax, %ax # encoding: [0x66,0x85,0xc0] +; NDD-NEXT: cmovsl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x48,0xc1] +; NDD-NEXT: andw $-256, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x25,0x00,0xff] +; NDD-NEXT: negw %ax, %ax # encoding: [0x62,0xf4,0x7d,0x18,0xf7,0xd8] +; NDD-NEXT: leaw 1(%rdi,%rax), %ax # encoding: [0x66,0x8d,0x44,0x07,0x01] +; NDD-NEXT: movzwl %ax, %eax # encoding: [0x0f,0xb7,0xc0] +; NDD-NEXT: movq %rax, 0 # encoding: [0x48,0x89,0x04,0x25,0x00,0x00,0x00,0x00] +; NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; NDD-NEXT: retq # encoding: [0xc3] +entry: + %add = add i16 %0, 1 + %rem = srem i16 %add, 256 + %1 = zext i16 %rem to i19 + %2 = sext i19 %1 to i64 + %3 = getelementptr i8, ptr null, i64 %2 + store ptr %3, ptr null, align 4 + ret i32 0 +} From e676866368a84c88aad90e138268e00a1c56a230 Mon Sep 17 00:00:00 2001 From: yingopq <115543042+yingopq@users.noreply.github.com> Date: Wed, 16 Apr 2025 10:56:06 +0800 Subject: [PATCH 070/710] [Mips] Fix clang crashes when compiling a variadic function while targeting mips3 (#130558) issue reason: Because mips3 has the feature 'FeatureGP64Bit', when target mips3 process function 
`writeVarArgRegs`, the result of `getGPRSizeInBytes` is 8 and the result of `GetVarArgRegs` is `Mips::A0, Mips::A1, Mips::A2, Mips::A3`. This would generate `gpr64 = COPY $a1` which should be `gpr64 = COPY $a1_64`. Also when process `CC_Mips_FixedArg`, mips would CCDelegateTo `CC_MipsO32_FP`. In fact, it should CCDelegateTo `CC_MipsN`. Fix #98716. --- .../Target/Mips/MCTargetDesc/MipsABIInfo.cpp | 10 ++-- .../Target/Mips/MCTargetDesc/MipsABIInfo.h | 2 +- llvm/lib/Target/Mips/MipsCallLowering.cpp | 3 +- llvm/lib/Target/Mips/MipsCallingConv.td | 2 +- llvm/lib/Target/Mips/MipsISelLowering.cpp | 3 +- llvm/test/CodeGen/Mips/vararg.ll | 54 +++++++++++++++++++ 6 files changed, 66 insertions(+), 8 deletions(-) create mode 100644 llvm/test/CodeGen/Mips/vararg.ll diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp index 08cbba952ccc8..1be29cf3c94b9 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp @@ -37,9 +37,13 @@ ArrayRef MipsABIInfo::GetByValArgRegs() const { llvm_unreachable("Unhandled ABI"); } -ArrayRef MipsABIInfo::GetVarArgRegs() const { - if (IsO32()) - return ArrayRef(O32IntRegs); +ArrayRef MipsABIInfo::getVarArgRegs(bool isGP64bit) const { + if (IsO32()) { + if (isGP64bit) + return ArrayRef(Mips64IntRegs); + else + return ArrayRef(O32IntRegs); + } if (IsN32() || IsN64()) return ArrayRef(Mips64IntRegs); llvm_unreachable("Unhandled ABI"); diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h index 41f80771142de..44b023c7c3ef6 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h @@ -46,7 +46,7 @@ class MipsABIInfo { ArrayRef GetByValArgRegs() const; /// The registers to use for the variable argument list. 
- ArrayRef GetVarArgRegs() const; + ArrayRef getVarArgRegs(bool isGP64bit) const; /// Obtain the size of the area allocated by the callee for arguments. /// CallingConv::FastCall affects the value for O32. diff --git a/llvm/lib/Target/Mips/MipsCallLowering.cpp b/llvm/lib/Target/Mips/MipsCallLowering.cpp index b856290211277..01c9d0b38323e 100644 --- a/llvm/lib/Target/Mips/MipsCallLowering.cpp +++ b/llvm/lib/Target/Mips/MipsCallLowering.cpp @@ -406,7 +406,8 @@ bool MipsCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, return false; if (F.isVarArg()) { - ArrayRef ArgRegs = ABI.GetVarArgRegs(); + ArrayRef ArgRegs = + ABI.getVarArgRegs(MF.getSubtarget().isGP64bit()); unsigned Idx = CCInfo.getFirstUnallocated(ArgRegs); int VaArgOffset; diff --git a/llvm/lib/Target/Mips/MipsCallingConv.td b/llvm/lib/Target/Mips/MipsCallingConv.td index 25384a3fe8de3..3c60114f507b9 100644 --- a/llvm/lib/Target/Mips/MipsCallingConv.td +++ b/llvm/lib/Target/Mips/MipsCallingConv.td @@ -339,7 +339,7 @@ def CC_Mips_FixedArg : CallingConv<[ CCIfCC<"CallingConv::Fast", CCDelegateTo>, - CCIfSubtarget<"isABI_O32()", CCDelegateTo>, + CCIfSubtarget<"isABI_O32()", CCIfSubtargetNot<"isGP64bit()", CCDelegateTo>>, CCDelegateTo ]>; diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp index fa8a4704730cf..55fc636d3c781 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.cpp +++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp @@ -3400,7 +3400,6 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, ABI.IsN64() ? 
Mips::SP_64 : Mips::SP, getPointerTy(DAG.getDataLayout())); - std::deque> RegsToPass; SmallVector MemOpChains; @@ -4654,7 +4653,7 @@ void MipsTargetLowering::writeVarArgRegs(std::vector &OutChains, SDValue Chain, const SDLoc &DL, SelectionDAG &DAG, CCState &State) const { - ArrayRef ArgRegs = ABI.GetVarArgRegs(); + ArrayRef ArgRegs = ABI.getVarArgRegs(Subtarget.isGP64bit()); unsigned Idx = State.getFirstUnallocated(ArgRegs); unsigned RegSizeInBytes = Subtarget.getGPRSizeInBytes(); MVT RegTy = MVT::getIntegerVT(RegSizeInBytes * 8); diff --git a/llvm/test/CodeGen/Mips/vararg.ll b/llvm/test/CodeGen/Mips/vararg.ll new file mode 100644 index 0000000000000..ed4a805af0c0c --- /dev/null +++ b/llvm/test/CodeGen/Mips/vararg.ll @@ -0,0 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=mipsel-linux-gnu -mcpu=mips3 -target-abi o32 < %s | FileCheck %s -check-prefixes=MIPS3-O32 +; RUN: llc -mtriple=mipsel-linux-gnu -mcpu=mips3 -target-abi n32 < %s | FileCheck %s -check-prefixes=MIPS3-N32 +; RUN: llc -mtriple=mipsel-linux-gnu -mcpu=mips3 -target-abi n64 < %s | FileCheck %s -check-prefixes=MIPS3-N64 + +define void @func(ptr nocapture %x, ...) 
nounwind { +; MIPS3-O32-LABEL: func: +; MIPS32-O32: # %bb.0: # %entry +; MIPS32-O32-NEXT: addiu $sp, $sp, -48 +; MIPS32-O32-NEXT: sd $11, 56($sp) +; MIPS32-O32-NEXT: sd $10, 48($sp) +; MIPS32-O32-NEXT: sd $9, 40($sp) +; MIPS32-O32-NEXT: sd $8, 32($sp) +; MIPS32-O32-NEXT: sd $7, 24($sp) +; MIPS32-O32-NEXT: sd $6, 16($sp) +; MIPS32-O32-NEXT: sd $5, 8($sp) +; MIPS32-O32-NEXT: sw $4, 4($sp) +; MIPS32-O32-NEXT: jr $ra +; MIPS32-O32-NEXT: addiu $sp, $sp, 48 +; +; MIPS3-N32-LABEL: func: +; MIPS32-N32: # %bb.0: # %entry +; MIPS32-N32-NEXT: addiu $sp, $sp, -64 +; MIPS32-N32-NEXT: sd $11, 56($sp) +; MIPS32-N32-NEXT: sd $10, 48($sp) +; MIPS32-N32-NEXT: sd $9, 40($sp) +; MIPS32-N32-NEXT: sd $8, 32($sp) +; MIPS32-N32-NEXT: sd $7, 24($sp) +; MIPS32-N32-NEXT: sd $6, 16($sp) +; MIPS32-N32-NEXT: sd $5, 8($sp) +; MIPS32-N32-NEXT: sw $4, 4($sp) +; MIPS32-N32-NEXT: jr $ra +; MIPS32-N32-NEXT: addiu $sp, $sp, 64 +; +; MIPS3-N64-LABEL: func: +; MIPS32-N64: # %bb.0: # %entry +; MIPS32-N64-NEXT: addiu $sp, $sp, -64 +; MIPS32-N64-NEXT: sdl $4, 7($sp) +; MIPS32-N64-NEXT: sd $11, 56($sp) +; MIPS32-N64-NEXT: sd $10, 48($sp) +; MIPS32-N64-NEXT: sd $9, 40($sp) +; MIPS32-N64-NEXT: sd $8, 32($sp) +; MIPS32-N64-NEXT: sd $7, 24($sp) +; MIPS32-N64-NEXT: sd $6, 16($sp) +; MIPS32-N64-NEXT: sd $5, 8($sp) +; MIPS32-N64-NEXT: sw $4, 4($sp) +; MIPS32-N64-NEXT: jr $ra +; MIPS32-N64-NEXT: addiu $sp, $sp, 64 + +entry: + %x.addr = alloca ptr, align 4 + store ptr %x, ptr %x.addr, align 4 + ret void +} From 517605c20e6014543e91d45524a17c443aa11bd4 Mon Sep 17 00:00:00 2001 From: Ryosuke Niwa Date: Tue, 15 Apr 2025 20:00:51 -0700 Subject: [PATCH 071/710] [alpha.webkit.UnretainedCallArgsChecker] Add the support for RetainPtrArc (#135532) WebKit uses #define to rename RetainPtr to RetainPtrArc so add the support for it. 
--- .../Checkers/WebKit/PtrTypesSemantics.cpp | 15 +++++++++------ .../Checkers/WebKit/RetainPtrCtorAdoptChecker.cpp | 8 ++++---- .../Analysis/Checkers/WebKit/objc-mock-types.h | 5 +++++ .../Checkers/WebKit/unretained-call-args-arc.mm | 11 +++++++++++ .../Checkers/WebKit/unretained-call-args.mm | 11 +++++++++++ 5 files changed, 40 insertions(+), 10 deletions(-) diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp index 134afcd124526..811888e119449 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp @@ -119,7 +119,9 @@ bool isRefType(const std::string &Name) { Name == "RefPtr" || Name == "RefPtrAllowingPartiallyDestroyed"; } -bool isRetainPtr(const std::string &Name) { return Name == "RetainPtr"; } +bool isRetainPtr(const std::string &Name) { + return Name == "RetainPtr" || Name == "RetainPtrArc"; +} bool isCheckedPtr(const std::string &Name) { return Name == "CheckedPtr" || Name == "CheckedRef"; @@ -157,7 +159,8 @@ bool isCtorOfCheckedPtr(const clang::FunctionDecl *F) { bool isCtorOfRetainPtr(const clang::FunctionDecl *F) { const std::string &FunctionName = safeGetName(F); return FunctionName == "RetainPtr" || FunctionName == "adoptNS" || - FunctionName == "adoptCF" || FunctionName == "retainPtr"; + FunctionName == "adoptCF" || FunctionName == "retainPtr" || + FunctionName == "RetainPtrArc" || FunctionName == "adoptNSArc"; } bool isCtorOfSafePtr(const clang::FunctionDecl *F) { @@ -190,7 +193,7 @@ bool isRefOrCheckedPtrType(const clang::QualType T) { } bool isRetainPtrType(const clang::QualType T) { - return isPtrOfType(T, [](auto Name) { return Name == "RetainPtr"; }); + return isPtrOfType(T, [](auto Name) { return isRetainPtr(Name); }); } bool isOwnerPtrType(const clang::QualType T) { @@ -374,7 +377,7 @@ std::optional isGetterOfSafePtr(const CXXMethodDecl *M) { method == "impl")) return 
true; - if (className == "RetainPtr" && method == "get") + if (isRetainPtr(className) && method == "get") return true; // Ref -> T conversion @@ -395,7 +398,7 @@ std::optional isGetterOfSafePtr(const CXXMethodDecl *M) { } } - if (className == "RetainPtr") { + if (isRetainPtr(className)) { if (auto *maybeRefToRawOperator = dyn_cast(M)) { auto QT = maybeRefToRawOperator->getConversionType(); auto *T = QT.getTypePtrOrNull(); @@ -429,7 +432,7 @@ bool isCheckedPtr(const CXXRecordDecl *R) { bool isRetainPtr(const CXXRecordDecl *R) { assert(R); if (auto *TmplR = R->getTemplateInstantiationPattern()) - return safeGetName(TmplR) == "RetainPtr"; + return isRetainPtr(safeGetName(TmplR)); return false; } diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/RetainPtrCtorAdoptChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RetainPtrCtorAdoptChecker.cpp index d372c5d1ba626..d3eee11311d91 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/RetainPtrCtorAdoptChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RetainPtrCtorAdoptChecker.cpp @@ -71,7 +71,7 @@ class RetainPtrCtorAdoptChecker } bool TraverseClassTemplateDecl(ClassTemplateDecl *CTD) { - if (safeGetName(CTD) == "RetainPtr") + if (isRetainPtr(safeGetName(CTD))) return true; // Skip the contents of RetainPtr. return Base::TraverseClassTemplateDecl(CTD); } @@ -193,7 +193,7 @@ class RetainPtrCtorAdoptChecker if (!Cls) return; - if (safeGetName(Cls) != "RetainPtr" || !CE->getNumArgs()) + if (!isRetainPtr(safeGetName(Cls)) || !CE->getNumArgs()) return; // Ignore RetainPtr construction inside adoptNS, adoptCF, and retainPtr. 
@@ -322,12 +322,12 @@ class RetainPtrCtorAdoptChecker if (auto *CD = dyn_cast(MD)) { auto QT = CD->getConversionType().getCanonicalType(); auto *ResultType = QT.getTypePtrOrNull(); - if (safeGetName(Cls) == "RetainPtr" && ResultType && + if (isRetainPtr(safeGetName(Cls)) && ResultType && (ResultType->isPointerType() || ResultType->isReferenceType() || ResultType->isObjCObjectPointerType())) return IsOwnedResult::NotOwned; } - if (safeGetName(MD) == "leakRef" && safeGetName(Cls) == "RetainPtr") + if (safeGetName(MD) == "leakRef" && isRetainPtr(safeGetName(Cls))) return IsOwnedResult::Owned; } } diff --git a/clang/test/Analysis/Checkers/WebKit/objc-mock-types.h b/clang/test/Analysis/Checkers/WebKit/objc-mock-types.h index 51de81ac0f033..a4332df682060 100644 --- a/clang/test/Analysis/Checkers/WebKit/objc-mock-types.h +++ b/clang/test/Analysis/Checkers/WebKit/objc-mock-types.h @@ -17,6 +17,7 @@ typedef const struct CF_BRIDGED_TYPE(NSString) __CFString * CFStringRef; typedef const struct CF_BRIDGED_TYPE(NSArray) __CFArray * CFArrayRef; typedef struct CF_BRIDGED_MUTABLE_TYPE(NSMutableArray) __CFArray * CFMutableArrayRef; typedef struct CF_BRIDGED_MUTABLE_TYPE(CFRunLoopRef) __CFRunLoop * CFRunLoopRef; +typedef struct CF_BRIDGED_TYPE(id) CGImage *CGImageRef; #define NS_RETURNS_RETAINED __attribute__((ns_returns_retained)) #define CF_CONSUMED __attribute__((cf_consumed)) @@ -150,6 +151,10 @@ namespace WTF { void WTFCrash(void); +#if __has_feature(objc_arc) +#define RetainPtr RetainPtrArc +#endif + template class RetainPtr; template RetainPtr adoptNS(T*); template RetainPtr adoptCF(T); diff --git a/clang/test/Analysis/Checkers/WebKit/unretained-call-args-arc.mm b/clang/test/Analysis/Checkers/WebKit/unretained-call-args-arc.mm index f1f4d912663aa..4207c1836079f 100644 --- a/clang/test/Analysis/Checkers/WebKit/unretained-call-args-arc.mm +++ b/clang/test/Analysis/Checkers/WebKit/unretained-call-args-arc.mm @@ -5,6 +5,8 @@ SomeObj *provide(); CFMutableArrayRef provide_cf(); 
void someFunction(); +CGImageRef provideImage(); +NSString *stringForImage(CGImageRef); namespace raw_ptr { @@ -36,4 +38,13 @@ - (SomeObj *)getSomeObj { - (void)doWorkOnSomeObj { [[self getSomeObj] doWork]; } + +- (CGImageRef)createImage { + return provideImage(); +} + +- (NSString *)convertImage { + RetainPtr image = [self createImage]; + return stringForImage(image.get()); +} @end diff --git a/clang/test/Analysis/Checkers/WebKit/unretained-call-args.mm b/clang/test/Analysis/Checkers/WebKit/unretained-call-args.mm index 0667e4964f1a8..eb36b49313d42 100644 --- a/clang/test/Analysis/Checkers/WebKit/unretained-call-args.mm +++ b/clang/test/Analysis/Checkers/WebKit/unretained-call-args.mm @@ -9,6 +9,9 @@ CFMutableArrayRef provide_cf(); void consume_cf(CFMutableArrayRef); +CGImageRef provideImage(); +NSString *stringForImage(CGImageRef); + void some_function(); namespace simple { @@ -440,4 +443,12 @@ - (void)doWorkOnSomeObj { [[self getSomeObj] doWork]; } +- (CGImageRef)createImage { + return provideImage(); +} + +- (NSString *)convertImage { + RetainPtr image = [self createImage]; + return stringForImage(image.get()); +} @end From 71d091699f956c89135bc165165e815ab7876359 Mon Sep 17 00:00:00 2001 From: Alexey Samsonov Date: Tue, 15 Apr 2025 20:12:17 -0700 Subject: [PATCH 072/710] [libc][bazel] Fold "libc_function_deps" into "deps" for libc_tests. (#135835) libc_function_deps and deps are now identical, as we no longer need or have special treatment for libc_function targets. Merge these attributes passed to the libc_test macro, and fix all relevant libc_test macro invocations. This change is a no-op. This concludes cleanup started in 9b13d345303d819bb83de7ebbeb826d704add0bc. 
--- .../libc/test/libc_test_rules.bzl | 7 +- .../libc/test/src/complex/BUILD.bazel | 12 +-- .../libc/test/src/fenv/BUILD.bazel | 52 +++------ .../libc/test/src/inttypes/BUILD.bazel | 6 +- .../test/src/math/libc_math_test_rules.bzl | 3 +- .../libc/test/src/stdbit/BUILD.bazel | 11 +- .../libc/test/src/stdio/BUILD.bazel | 46 ++++---- .../libc/test/src/stdlib/BUILD.bazel | 100 +++++++++++------- .../libc/test/src/string/BUILD.bazel | 48 ++++----- .../libc/test/src/strings/BUILD.bazel | 12 +-- .../libc/test/src/sys/epoll/BUILD.bazel | 50 ++++----- .../libc/test/src/sys/socket/BUILD.bazel | 42 +++----- .../libc/test/src/unistd/BUILD.bazel | 100 +++++++----------- 13 files changed, 208 insertions(+), 281 deletions(-) diff --git a/utils/bazel/llvm-project-overlay/libc/test/libc_test_rules.bzl b/utils/bazel/llvm-project-overlay/libc/test/libc_test_rules.bzl index 7e798429ef19b..123e05727aeff 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/libc_test_rules.bzl +++ b/utils/bazel/llvm-project-overlay/libc/test/libc_test_rules.bzl @@ -15,14 +15,13 @@ When performing tests we make sure to always use the internal version. load("//libc:libc_build_rules.bzl", "libc_common_copts") load("//libc:libc_configure_options.bzl", "LIBC_CONFIGURE_OPTIONS") -def libc_test(name, libc_function_deps = [], copts = [], deps = [], local_defines = [], **kwargs): +def libc_test(name, copts = [], deps = [], local_defines = [], **kwargs): """Add target for a libc test. Args: name: Test target name - libc_function_deps: List of libc_function targets used by this test. copts: The list of options to add to the C++ compilation command. - deps: The list of other libraries to be linked in to the test target. + deps: The list of libc functions and libraries to be linked in. local_defines: The list of target local_defines if any. **kwargs: Attributes relevant for a cc_test. 
""" @@ -37,7 +36,7 @@ def libc_test(name, libc_function_deps = [], copts = [], deps = [], local_define "//libc:func_free", "//libc:func_malloc", "//libc:func_realloc", - ] + libc_function_deps + deps, + ] + deps, copts = copts + libc_common_copts(), linkstatic = 1, **kwargs diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/complex/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/complex/BUILD.bazel index d0965bb2ee147..cc3a8d8b4b96a 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/complex/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/test/src/complex/BUILD.bazel @@ -7,8 +7,7 @@ load("//libc/test:libc_test_rules.bzl", "libc_test") "CImagTest.h", func_name + "_test.cpp", ], - libc_function_deps = ["//libc:func_name".replace("func_name", func_name)], - deps = [ + deps = ["//libc:func_name".replace("func_name", func_name)] + [ "//libc:hdr_math_macros", "//libc/test/UnitTest:fp_test_helpers", ], @@ -29,8 +28,7 @@ load("//libc/test:libc_test_rules.bzl", "libc_test") "ConjTest.h", func_name + "_test.cpp", ], - libc_function_deps = ["//libc:func_name".replace("func_name", func_name)], - deps = [ + deps = ["//libc:func_name".replace("func_name", func_name)] + [ "//libc:hdr_math_macros", "//libc/test/UnitTest:fp_test_helpers", ], @@ -51,8 +49,7 @@ load("//libc/test:libc_test_rules.bzl", "libc_test") "CprojTest.h", func_name + "_test.cpp", ], - libc_function_deps = ["//libc:func_name".replace("func_name", func_name)], - deps = [ + deps = ["//libc:func_name".replace("func_name", func_name)] + [ "//libc:hdr_math_macros", "//libc/test/UnitTest:fp_test_helpers", ] + (["//libc/utils/MPCWrapper:mpc_wrapper"] if func_name == "cprojf" else []), @@ -73,8 +70,7 @@ load("//libc/test:libc_test_rules.bzl", "libc_test") "CRealTest.h", func_name + "_test.cpp", ], - libc_function_deps = ["//libc:func_name".replace("func_name", func_name)], - deps = [ + deps = ["//libc:func_name".replace("func_name", func_name)] + [ "//libc:hdr_math_macros", 
"//libc/test/UnitTest:fp_test_helpers", ], diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/fenv/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/fenv/BUILD.bazel index c6ae534b0f640..1af1a984db92a 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/fenv/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/test/src/fenv/BUILD.bazel @@ -16,14 +16,12 @@ libc_test( "exception_status_test.cpp", "excepts.h", ], - libc_function_deps = [ + deps = [ + "//libc:__support_fputil_fenv_impl", "//libc:feclearexcept", "//libc:feraiseexcept", "//libc:fesetexcept", "//libc:fetestexcept", - ], - deps = [ - "//libc:__support_fputil_fenv_impl", "//libc:hdr_fenv_macros", "//libc/test/UnitTest:fp_test_helpers", ], @@ -35,11 +33,9 @@ libc_test( "excepts.h", "rounding_mode_test.cpp", ], - libc_function_deps = [ + deps = [ "//libc:fegetround", "//libc:fesetround", - ], - deps = [ "//libc:hdr_fenv_macros", "//libc/test/UnitTest:fp_test_helpers", ], @@ -51,16 +47,14 @@ libc_test( "enabled_exceptions_test.cpp", "excepts.h", ], - libc_function_deps = [ - "//libc:feclearexcept", - "//libc:feraiseexcept", - "//libc:fetestexcept", - ], tags = ["nosan"], deps = [ "//libc:__support_common", "//libc:__support_fputil_fenv_impl", "//libc:__support_macros_properties_architectures", + "//libc:feclearexcept", + "//libc:feraiseexcept", + "//libc:fetestexcept", "//libc:hdr_fenv_macros", "//libc/test/UnitTest:fp_test_helpers", ], @@ -72,14 +66,12 @@ libc_test( "excepts.h", "feholdexcept_test.cpp", ], - libc_function_deps = [ - "//libc:feholdexcept", - ], tags = ["nosan"], deps = [ "//libc:__support_common", "//libc:__support_fputil_fenv_impl", "//libc:__support_macros_properties_architectures", + "//libc:feholdexcept", "//libc:hdr_fenv_macros", "//libc:types_fenv_t", "//libc/test/UnitTest:fp_test_helpers", @@ -92,13 +84,11 @@ libc_test( "exception_flags_test.cpp", "excepts.h", ], - libc_function_deps = [ + deps = [ + "//libc:__support_fputil_fenv_impl", 
"//libc:fegetexceptflag", "//libc:fesetexceptflag", "//libc:fetestexceptflag", - ], - deps = [ - "//libc:__support_fputil_fenv_impl", "//libc:hdr_fenv_macros", "//libc:types_fexcept_t", "//libc/test/UnitTest:fp_test_helpers", @@ -111,11 +101,9 @@ libc_test( "excepts.h", "feclearexcept_test.cpp", ], - libc_function_deps = [ - "//libc:feclearexcept", - ], deps = [ "//libc:__support_fputil_fenv_impl", + "//libc:feclearexcept", "//libc:hdr_fenv_macros", "//libc/test/UnitTest:fp_test_helpers", ], @@ -127,14 +115,12 @@ libc_test( "excepts.h", "feenableexcept_test.cpp", ], - libc_function_deps = [ - "//libc:fedisableexcept", - "//libc:feenableexcept", - "//libc:fegetexcept", - ], deps = [ "//libc:__support_common", "//libc:__support_macros_properties_architectures", + "//libc:fedisableexcept", + "//libc:feenableexcept", + "//libc:fegetexcept", "//libc:hdr_fenv_macros", "//libc/test/UnitTest:fp_test_helpers", ], @@ -146,11 +132,9 @@ libc_test( "excepts.h", "feupdateenv_test.cpp", ], - libc_function_deps = [ - "//libc:feupdateenv", - ], deps = [ "//libc:__support_fputil_fenv_impl", + "//libc:feupdateenv", "//libc:hdr_fenv_macros", "//libc:types_fenv_t", "//libc/test/UnitTest:fp_test_helpers", @@ -163,15 +147,13 @@ libc_test( "excepts.h", "getenv_and_setenv_test.cpp", ], - libc_function_deps = [ + deps = [ + "//libc:__support_fputil_fenv_impl", + "//libc:__support_macros_properties_os", "//libc:fegetenv", "//libc:fegetround", "//libc:fesetenv", "//libc:fesetround", - ], - deps = [ - "//libc:__support_fputil_fenv_impl", - "//libc:__support_macros_properties_os", "//libc:hdr_fenv_macros", "//libc:types_fenv_t", "//libc/test/UnitTest:fp_test_helpers", diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/inttypes/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/inttypes/BUILD.bazel index bda7245d1f677..3dd4ab379efe0 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/inttypes/BUILD.bazel +++ 
b/utils/bazel/llvm-project-overlay/libc/test/src/inttypes/BUILD.bazel @@ -13,7 +13,7 @@ licenses(["notice"]) libc_test( name = "imaxabs_test", srcs = ["imaxabs_test.cpp"], - libc_function_deps = [ + deps = [ "//libc:imaxabs", ], ) @@ -21,10 +21,8 @@ libc_test( libc_test( name = "imaxdiv_test", srcs = ["imaxdiv_test.cpp"], - libc_function_deps = [ - "//libc:imaxdiv", - ], deps = [ + "//libc:imaxdiv", "//libc/test/src/stdlib:div_test_helper", ], ) diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/math/libc_math_test_rules.bzl b/utils/bazel/llvm-project-overlay/libc/test/src/math/libc_math_test_rules.bzl index 16845ab66dfd4..d2297d51383cc 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/math/libc_math_test_rules.bzl +++ b/utils/bazel/llvm-project-overlay/libc/test/src/math/libc_math_test_rules.bzl @@ -23,8 +23,7 @@ def math_test(name, hdrs = [], deps = [], **kwargs): libc_test( name = test_name, srcs = [test_name + ".cpp"] + hdrs, - libc_function_deps = ["//libc:func_name".replace("func_name", name)], - deps = [ + deps = ["//libc:func_name".replace("func_name", name)] + [ "//libc:__support_cpp_algorithm", "//libc:__support_cpp_bit", "//libc:__support_cpp_limits", diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/stdbit/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/stdbit/BUILD.bazel index b9d153947dc7d..119d7d5337750 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/stdbit/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/test/src/stdbit/BUILD.bazel @@ -39,11 +39,12 @@ bit_prefix_list = [ libc_test( name = bit_prefix + bit_suffix + "_test", srcs = [bit_prefix + bit_suffix + "_test.cpp"], - libc_function_deps = ["//libc:func_name".replace( - "func_name", - bit_prefix + bit_suffix, - )], - deps = ["//libc:__support_cpp_limits"], + deps = [ + "//libc:func_name".replace( + "func_name", + bit_prefix + bit_suffix, + ), + ] + ["//libc:__support_cpp_limits"], ) for bit_prefix in bit_prefix_list for bit_suffix in 
bit_suffix_list diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/stdio/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/stdio/BUILD.bazel index c3865ea07ea91..484d3e5e0a24e 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/stdio/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/test/src/stdio/BUILD.bazel @@ -13,8 +13,6 @@ licenses(["notice"]) libc_test( name = "printf_parser_test", srcs = ["printf_core/parser_test.cpp"], - libc_function_deps = [ - ], deps = [ "//libc:__support_arg_list", "//libc:__support_cpp_bit", @@ -28,8 +26,6 @@ libc_test( libc_test( name = "printf_writer_test", srcs = ["printf_core/writer_test.cpp"], - libc_function_deps = [ - ], deps = [ "//libc:__support_arg_list", "//libc:__support_cpp_string_view", @@ -42,8 +38,6 @@ libc_test( libc_test( name = "printf_converter_test", srcs = ["printf_core/converter_test.cpp"], - libc_function_deps = [ - ], deps = [ "//libc:__support_arg_list", "//libc:__support_cpp_string_view", @@ -56,11 +50,9 @@ libc_test( libc_test( name = "sprintf_test", srcs = ["sprintf_test.cpp"], - libc_function_deps = [ - "//libc:sprintf", - ], deps = [ "//libc:__support_fputil_fp_bits", + "//libc:sprintf", "//libc/test/UnitTest:fp_test_helpers", ], ) @@ -68,7 +60,7 @@ libc_test( libc_test( name = "snprintf_test", srcs = ["snprintf_test.cpp"], - libc_function_deps = [ + deps = [ "//libc:snprintf", ], ) @@ -76,7 +68,7 @@ libc_test( libc_test( name = "printf_test", srcs = ["printf_test.cpp"], - libc_function_deps = [ + deps = [ "//libc:printf", ], ) @@ -84,7 +76,7 @@ libc_test( libc_test( name = "fprintf_test", srcs = ["fprintf_test.cpp"], - libc_function_deps = [ + deps = [ "//libc:fprintf", ], ) @@ -92,7 +84,7 @@ libc_test( libc_test( name = "vsprintf_test", srcs = ["vsprintf_test.cpp"], - libc_function_deps = [ + deps = [ "//libc:vsprintf", ], ) @@ -100,7 +92,7 @@ libc_test( libc_test( name = "vsnprintf_test", srcs = ["vsnprintf_test.cpp"], - libc_function_deps = [ + deps = [ 
"//libc:vsnprintf", ], ) @@ -108,7 +100,7 @@ libc_test( libc_test( name = "vprintf_test", srcs = ["vprintf_test.cpp"], - libc_function_deps = [ + deps = [ "//libc:vprintf", ], ) @@ -116,7 +108,7 @@ libc_test( libc_test( name = "vfprintf_test", srcs = ["vfprintf_test.cpp"], - libc_function_deps = [ + deps = [ "//libc:vfprintf", ], ) @@ -124,25 +116,23 @@ libc_test( libc_test( name = "remove_test", srcs = ["remove_test.cpp"], - libc_function_deps = [ - "//libc:remove", - "//libc:open", - "//libc:mkdirat", + deps = [ "//libc:access", "//libc:close", + "//libc:mkdirat", + "//libc:open", + "//libc:remove", ], ) libc_test( name = "sscanf_test", srcs = ["sscanf_test.cpp"], - libc_function_deps = [ - "//libc:sscanf", - ], deps = [ "//libc:__support_cpp_limits", "//libc:__support_fputil_fp_bits", "//libc:hdr_stdio_macros", + "//libc:sscanf", "//libc/test/UnitTest:fp_test_helpers", ], ) @@ -150,16 +140,16 @@ libc_test( libc_test( name = "fscanf_test", srcs = ["fscanf_test.cpp"], - libc_function_deps = [ + deps = [ + "//libc:__support_cpp_string_view", "//libc:fscanf", ], - deps = ["//libc:__support_cpp_string_view"], ) libc_test( name = "vsscanf_test", srcs = ["vsscanf_test.cpp"], - libc_function_deps = [ + deps = [ "//libc:vsscanf", ], ) @@ -167,8 +157,8 @@ libc_test( libc_test( name = "vfscanf_test", srcs = ["vfscanf_test.cpp"], - libc_function_deps = [ + deps = [ + "//libc:__support_cpp_string_view", "//libc:vfscanf", ], - deps = ["//libc:__support_cpp_string_view"], ) diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel index 5f43ec7c7a109..40f672d8099f1 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel @@ -13,19 +13,19 @@ licenses(["notice"]) libc_test( name = "abs_test", srcs = ["abs_test.cpp"], - libc_function_deps = ["//libc:abs"], + deps = ["//libc:abs"], ) libc_test( name = 
"labs_test", srcs = ["labs_test.cpp"], - libc_function_deps = ["//libc:labs"], + deps = ["//libc:labs"], ) libc_test( name = "llabs_test", srcs = ["llabs_test.cpp"], - libc_function_deps = ["//libc:llabs"], + deps = ["//libc:llabs"], ) libc_test_library( @@ -37,9 +37,9 @@ libc_test_library( libc_test( name = "div_test", srcs = ["div_test.cpp"], - libc_function_deps = ["//libc:div"], deps = [ ":div_test_helper", + "//libc:div", "//libc:types_div_t", ], ) @@ -47,9 +47,9 @@ libc_test( libc_test( name = "ldiv_test", srcs = ["ldiv_test.cpp"], - libc_function_deps = ["//libc:ldiv"], deps = [ ":div_test_helper", + "//libc:ldiv", "//libc:types_ldiv_t", ], ) @@ -57,9 +57,9 @@ libc_test( libc_test( name = "lldiv_test", srcs = ["lldiv_test.cpp"], - libc_function_deps = ["//libc:lldiv"], deps = [ ":div_test_helper", + "//libc:lldiv", "//libc:types_lldiv_t", ], ) @@ -77,36 +77,46 @@ libc_test_library( libc_test( name = "atoi_test", srcs = ["atoi_test.cpp"], - libc_function_deps = ["//libc:atoi"], - deps = [":atoi_test_helper"], + deps = [ + ":atoi_test_helper", + "//libc:atoi", + ], ) libc_test( name = "atol_test", srcs = ["atol_test.cpp"], - libc_function_deps = ["//libc:atol"], - deps = [":atoi_test_helper"], + deps = [ + ":atoi_test_helper", + "//libc:atol", + ], ) libc_test( name = "atoll_test", srcs = ["atoll_test.cpp"], - libc_function_deps = ["//libc:atoll"], - deps = [":atoi_test_helper"], + deps = [ + ":atoi_test_helper", + "//libc:atoll", + ], ) libc_test( name = "atof_test", srcs = ["atof_test.cpp"], - libc_function_deps = ["//libc:atof"], - deps = ["//libc:__support_fputil_fp_bits"], + deps = [ + "//libc:__support_fputil_fp_bits", + "//libc:atof", + ], ) libc_test( name = "bsearch_test", srcs = ["bsearch_test.cpp"], - libc_function_deps = ["//libc:bsearch"], - deps = ["//libc:types_size_t"], + deps = [ + "//libc:bsearch", + "//libc:types_size_t", + ], ) libc_test_library( @@ -121,9 +131,9 @@ libc_test_library( libc_test( name = "quick_sort_test", srcs = 
["quick_sort_test.cpp"], - libc_function_deps = ["//libc:qsort"], deps = [ ":qsort_test_helper", + "//libc:qsort", "//libc:qsort_util", "//libc:types_size_t", ], @@ -132,9 +142,9 @@ libc_test( libc_test( name = "heap_sort_test", srcs = ["heap_sort_test.cpp"], - libc_function_deps = ["//libc:qsort"], deps = [ ":qsort_test_helper", + "//libc:qsort", "//libc:qsort_util", "//libc:types_size_t", ], @@ -143,8 +153,10 @@ libc_test( libc_test( name = "qsort_r_test", srcs = ["qsort_r_test.cpp"], - libc_function_deps = ["//libc:qsort_r"], - deps = ["//libc:types_size_t"], + deps = [ + "//libc:qsort_r", + "//libc:types_size_t", + ], ) libc_test_library( @@ -160,22 +172,28 @@ libc_test_library( libc_test( name = "strfromf_test", srcs = ["strfromf_test.cpp"], - libc_function_deps = ["//libc:strfromf"], - deps = [":strfrom_test_helper"], + deps = [ + ":strfrom_test_helper", + "//libc:strfromf", + ], ) libc_test( name = "strfromd_test", srcs = ["strfromd_test.cpp"], - libc_function_deps = ["//libc:strfromd"], - deps = [":strfrom_test_helper"], + deps = [ + ":strfrom_test_helper", + "//libc:strfromd", + ], ) libc_test( name = "strfroml_test", srcs = ["strfroml_test.cpp"], - libc_function_deps = ["//libc:strfroml"], - deps = [":strfrom_test_helper"], + deps = [ + ":strfrom_test_helper", + "//libc:strfroml", + ], ) libc_test_library( @@ -194,37 +212,45 @@ libc_test_library( libc_test( name = "strtol_test", srcs = ["strtol_test.cpp"], - libc_function_deps = ["//libc:strtol"], - deps = [":strtol_test_helper"], + deps = [ + ":strtol_test_helper", + "//libc:strtol", + ], ) libc_test( name = "strtoll_test", srcs = ["strtoll_test.cpp"], - libc_function_deps = ["//libc:strtoll"], - deps = [":strtol_test_helper"], + deps = [ + ":strtol_test_helper", + "//libc:strtoll", + ], ) libc_test( name = "strtoul_test", srcs = ["strtoul_test.cpp"], - libc_function_deps = ["//libc:strtoul"], - deps = [":strtol_test_helper"], + deps = [ + ":strtol_test_helper", + "//libc:strtoul", + ], ) libc_test( name 
= "strtoull_test", srcs = ["strtoull_test.cpp"], - libc_function_deps = ["//libc:strtoull"], - deps = [":strtol_test_helper"], + deps = [ + ":strtol_test_helper", + "//libc:strtoull", + ], ) libc_test( name = "strtof_test", srcs = ["strtof_test.cpp"], - libc_function_deps = ["//libc:strtof"], deps = [ "//libc:__support_fputil_fp_bits", + "//libc:strtof", "//libc/test/UnitTest:fp_test_helpers", ], ) @@ -232,9 +258,9 @@ libc_test( libc_test( name = "strtod_test", srcs = ["strtod_test.cpp"], - libc_function_deps = ["//libc:strtod"], deps = [ "//libc:__support_fputil_fp_bits", + "//libc:strtod", "//libc/test/UnitTest:fp_test_helpers", ], ) @@ -242,9 +268,9 @@ libc_test( libc_test( name = "strtold_test", srcs = ["strtold_test.cpp"], - libc_function_deps = ["//libc:strtold"], deps = [ "//libc:__support_fputil_fp_bits", "//libc:__support_uint128", + "//libc:strtold", ], ) diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/string/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/string/BUILD.bazel index 7274819e6758c..7555c9f35e3a0 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/string/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/test/src/string/BUILD.bazel @@ -13,7 +13,7 @@ licenses(["notice"]) libc_test( name = "strlen_test", srcs = ["strlen_test.cpp"], - libc_function_deps = [ + deps = [ "//libc:strlen", ], ) @@ -21,7 +21,7 @@ libc_test( libc_test( name = "strcpy_test", srcs = ["strcpy_test.cpp"], - libc_function_deps = [ + deps = [ "//libc:strcpy", ], ) @@ -29,7 +29,7 @@ libc_test( libc_test( name = "strcmp_test", srcs = ["strcmp_test.cpp"], - libc_function_deps = [ + deps = [ "//libc:strcmp", ], ) @@ -37,7 +37,7 @@ libc_test( libc_test( name = "memchr_test", srcs = ["memchr_test.cpp"], - libc_function_deps = [ + deps = [ "//libc:memchr", ], ) @@ -51,16 +51,16 @@ libc_test_library( libc_test( name = "strchr_test", srcs = ["strchr_test.cpp"], - libc_function_deps = [ + deps = [ + ":strchr_test_helper", "//libc:strchr", ], - 
deps = [":strchr_test_helper"], ) libc_test( name = "strstr_test", srcs = ["strstr_test.cpp"], - libc_function_deps = [ + deps = [ "//libc:strstr", ], ) @@ -68,7 +68,7 @@ libc_test( libc_test( name = "strnlen_test", srcs = ["strnlen_test.cpp"], - libc_function_deps = [ + deps = [ "//libc:strnlen", ], ) @@ -76,7 +76,7 @@ libc_test( libc_test( name = "memrchr_test", srcs = ["memrchr_test.cpp"], - libc_function_deps = [ + deps = [ "//libc:memrchr", ], ) @@ -84,16 +84,16 @@ libc_test( libc_test( name = "strrchr_test", srcs = ["strrchr_test.cpp"], - libc_function_deps = [ + deps = [ + ":strchr_test_helper", "//libc:strrchr", ], - deps = [":strchr_test_helper"], ) libc_test( name = "strcspn_test", srcs = ["strcspn_test.cpp"], - libc_function_deps = [ + deps = [ "//libc:strcspn", ], ) @@ -101,7 +101,7 @@ libc_test( libc_test( name = "strspn_test", srcs = ["strspn_test.cpp"], - libc_function_deps = [ + deps = [ "//libc:strspn", ], ) @@ -109,7 +109,7 @@ libc_test( libc_test( name = "strtok_test", srcs = ["strtok_test.cpp"], - libc_function_deps = [ + deps = [ "//libc:strtok", ], ) @@ -138,20 +138,18 @@ libc_test_library( libc_test( name = "memcpy_test", srcs = ["memcpy_test.cpp"], - libc_function_deps = [ - "//libc:memcpy", - ], deps = [ ":memory_check_utils", ":protected_pages", "//libc:__support_macros_properties_os", + "//libc:memcpy", ], ) libc_test( name = "mempcpy_test", srcs = ["mempcpy_test.cpp"], - libc_function_deps = [ + deps = [ "//libc:mempcpy", ], ) @@ -159,26 +157,22 @@ libc_test( libc_test( name = "memset_test", srcs = ["memset_test.cpp"], - libc_function_deps = [ - "//libc:memset", - ], deps = [ ":memory_check_utils", ":protected_pages", "//libc:__support_macros_properties_os", + "//libc:memset", ], ) libc_test( name = "memmove_test", srcs = ["memmove_test.cpp"], - libc_function_deps = [ - "//libc:memcmp", - "//libc:memmove", - ], deps = [ ":memory_check_utils", "//libc:__support_cpp_span", + "//libc:memcmp", + "//libc:memmove", 
"//libc/test/UnitTest:memory_matcher", ], ) @@ -186,11 +180,9 @@ libc_test( libc_test( name = "memcmp_test", srcs = ["memcmp_test.cpp"], - libc_function_deps = [ - "//libc:memcmp", - ], deps = [ ":memory_check_utils", + "//libc:memcmp", "//libc/test/UnitTest:test_logger", ], ) diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/strings/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/strings/BUILD.bazel index bcf4b4201e043..2e6f5644eec71 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/strings/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/test/src/strings/BUILD.bazel @@ -13,11 +13,9 @@ licenses(["notice"]) libc_test( name = "bcopy_test", srcs = ["bcopy_test.cpp"], - libc_function_deps = [ - "//libc:bcopy", - ], deps = [ "//libc:__support_cpp_span", + "//libc:bcopy", "//libc/test/UnitTest:memory_matcher", "//libc/test/src/string:memory_check_utils", ], @@ -26,10 +24,8 @@ libc_test( libc_test( name = "bcmp_test", srcs = ["bcmp_test.cpp"], - libc_function_deps = [ - "//libc:bcmp", - ], deps = [ + "//libc:bcmp", "//libc/test/UnitTest:test_logger", "//libc/test/src/string:memory_check_utils", ], @@ -38,10 +34,8 @@ libc_test( libc_test( name = "bzero_test", srcs = ["bzero_test.cpp"], - libc_function_deps = [ - "//libc:bzero", - ], deps = [ + "//libc:bzero", "//libc/test/src/string:memory_check_utils", ], ) diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/sys/epoll/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/sys/epoll/BUILD.bazel index 63ddebdadbdc9..e391703075aa7 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/sys/epoll/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/test/src/sys/epoll/BUILD.bazel @@ -13,11 +13,9 @@ licenses(["notice"]) libc_test( name = "epoll_create_test", srcs = ["linux/epoll_create_test.cpp"], - libc_function_deps = [ - "//libc:epoll_create", - "//libc:close", - ], deps = [ + "//libc:close", + "//libc:epoll_create", "//libc/test/UnitTest:errno_test_helpers", ], ) 
@@ -25,11 +23,9 @@ libc_test( libc_test( name = "epoll_create1_test", srcs = ["linux/epoll_create1_test.cpp"], - libc_function_deps = [ - "//libc:epoll_create1", - "//libc:close", - ], deps = [ + "//libc:close", + "//libc:epoll_create1", "//libc:hdr_sys_epoll_macros", "//libc/test/UnitTest:errno_test_helpers", ], @@ -38,14 +34,12 @@ libc_test( libc_test( name = "epoll_ctl_test", srcs = ["linux/epoll_ctl_test.cpp"], - libc_function_deps = [ + deps = [ + "//libc:close", "//libc:epoll_create1", "//libc:epoll_ctl", - "//libc:pipe", - "//libc:close", - ], - deps = [ "//libc:hdr_sys_epoll_macros", + "//libc:pipe", "//libc:types_struct_epoll_event", "//libc/test/UnitTest:errno_test_helpers", ], @@ -54,15 +48,13 @@ libc_test( libc_test( name = "epoll_wait_test", srcs = ["linux/epoll_wait_test.cpp"], - libc_function_deps = [ - "//libc:epoll_wait", + deps = [ + "//libc:close", "//libc:epoll_create1", "//libc:epoll_ctl", - "//libc:pipe", - "//libc:close", - ], - deps = [ + "//libc:epoll_wait", "//libc:hdr_sys_epoll_macros", + "//libc:pipe", "//libc:types_struct_epoll_event", "//libc/test/UnitTest:errno_test_helpers", ], @@ -71,15 +63,13 @@ libc_test( libc_test( name = "epoll_pwait_test", srcs = ["linux/epoll_pwait_test.cpp"], - libc_function_deps = [ - "//libc:epoll_pwait", + deps = [ + "//libc:close", "//libc:epoll_create1", "//libc:epoll_ctl", - "//libc:pipe", - "//libc:close", - ], - deps = [ + "//libc:epoll_pwait", "//libc:hdr_sys_epoll_macros", + "//libc:pipe", "//libc:types_struct_epoll_event", "//libc/test/UnitTest:errno_test_helpers", ], @@ -88,15 +78,13 @@ libc_test( libc_test( name = "epoll_pwait2_test", srcs = ["linux/epoll_pwait2_test.cpp"], - libc_function_deps = [ - "//libc:epoll_pwait2", + deps = [ + "//libc:close", "//libc:epoll_create1", "//libc:epoll_ctl", - "//libc:pipe", - "//libc:close", - ], - deps = [ + "//libc:epoll_pwait2", "//libc:hdr_sys_epoll_macros", + "//libc:pipe", "//libc:types_struct_epoll_event", "//libc:types_struct_timespec", 
"//libc/test/UnitTest:errno_test_helpers", diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/sys/socket/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/sys/socket/BUILD.bazel index ac7f48d0aeebb..743bf2a4743b7 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/sys/socket/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/test/src/sys/socket/BUILD.bazel @@ -13,11 +13,9 @@ licenses(["notice"]) libc_test( name = "socket_test", srcs = ["linux/socket_test.cpp"], - libc_function_deps = [ - "//libc:socket", - "//libc:close", - ], deps = [ + "//libc:close", + "//libc:socket", "//libc/test/UnitTest:errno_test_helpers", ], ) @@ -25,11 +23,9 @@ libc_test( libc_test( name = "socketpair_test", srcs = ["linux/socketpair_test.cpp"], - libc_function_deps = [ - "//libc:socketpair", - "//libc:close", - ], deps = [ + "//libc:close", + "//libc:socketpair", "//libc/test/UnitTest:errno_test_helpers", ], ) @@ -37,13 +33,11 @@ libc_test( libc_test( name = "send_recv_test", srcs = ["linux/send_recv_test.cpp"], - libc_function_deps = [ - "//libc:socketpair", - "//libc:send", - "//libc:recv", - "//libc:close", - ], deps = [ + "//libc:close", + "//libc:recv", + "//libc:send", + "//libc:socketpair", "//libc/test/UnitTest:errno_test_helpers", ], ) @@ -51,13 +45,11 @@ libc_test( libc_test( name = "sendto_recvfrom_test", srcs = ["linux/sendto_recvfrom_test.cpp"], - libc_function_deps = [ - "//libc:socketpair", - "//libc:sendto", - "//libc:recvfrom", - "//libc:close", - ], deps = [ + "//libc:close", + "//libc:recvfrom", + "//libc:sendto", + "//libc:socketpair", "//libc/test/UnitTest:errno_test_helpers", ], ) @@ -65,13 +57,11 @@ libc_test( libc_test( name = "sendmsg_recvmsg_test", srcs = ["linux/sendmsg_recvmsg_test.cpp"], - libc_function_deps = [ - "//libc:socketpair", - "//libc:sendmsg", - "//libc:recvmsg", - "//libc:close", - ], deps = [ + "//libc:close", + "//libc:recvmsg", + "//libc:sendmsg", + "//libc:socketpair", "//libc/test/UnitTest:errno_test_helpers", 
], ) diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/unistd/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/unistd/BUILD.bazel index 62641889f9a72..661e0a6ff5dfe 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/unistd/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/test/src/unistd/BUILD.bazel @@ -13,13 +13,11 @@ licenses(["notice"]) libc_test( name = "access_test", srcs = ["access_test.cpp"], - libc_function_deps = [ - "//libc:open", + deps = [ "//libc:access", "//libc:close", + "//libc:open", "//libc:unlink", - ], - deps = [ "//libc/test/UnitTest:errno_test_helpers", ], ) @@ -27,15 +25,13 @@ libc_test( libc_test( name = "dup_test", srcs = ["dup_test.cpp"], - libc_function_deps = [ - "//libc:open", + deps = [ "//libc:close", "//libc:dup", + "//libc:open", "//libc:read", "//libc:unlink", "//libc:write", - ], - deps = [ "//libc/test/UnitTest:errno_test_helpers", ], ) @@ -43,15 +39,13 @@ libc_test( libc_test( name = "dup2_test", srcs = ["dup2_test.cpp"], - libc_function_deps = [ - "//libc:open", + deps = [ "//libc:close", "//libc:dup2", + "//libc:open", "//libc:read", "//libc:unlink", "//libc:write", - ], - deps = [ "//libc/test/UnitTest:errno_test_helpers", ], ) @@ -59,15 +53,13 @@ libc_test( libc_test( name = "dup3_test", srcs = ["dup3_test.cpp"], - libc_function_deps = [ - "//libc:open", + deps = [ "//libc:close", "//libc:dup3", + "//libc:open", "//libc:read", "//libc:unlink", "//libc:write", - ], - deps = [ "//libc/test/UnitTest:errno_test_helpers", ], ) @@ -75,16 +67,14 @@ libc_test( libc_test( name = "ftruncate_test", srcs = ["ftruncate_test.cpp"], - libc_function_deps = [ - "//libc:open", + deps = [ + "//libc:__support_cpp_string_view", "//libc:close", - "//libc:read", "//libc:ftruncate", + "//libc:open", + "//libc:read", "//libc:unlink", "//libc:write", - ], - deps = [ - "//libc:__support_cpp_string_view", "//libc/test/UnitTest:errno_test_helpers", ], ) @@ -92,16 +82,14 @@ libc_test( libc_test( name = 
"pread_pwrite_test", srcs = ["pread_pwrite_test.cpp"], - libc_function_deps = [ - "//libc:open", + deps = [ "//libc:close", "//libc:fsync", + "//libc:open", "//libc:pread", "//libc:pwrite", "//libc:unlink", "//libc:write", - ], - deps = [ "//libc/test/UnitTest:errno_test_helpers", ], ) @@ -109,15 +97,13 @@ libc_test( libc_test( name = "read_write_test", srcs = ["read_write_test.cpp"], - libc_function_deps = [ - "//libc:open", + deps = [ "//libc:close", "//libc:fsync", + "//libc:open", "//libc:read", - "//libc:write", "//libc:remove", - ], - deps = [ + "//libc:write", "//libc/test/UnitTest:errno_test_helpers", ], ) @@ -125,13 +111,11 @@ libc_test( libc_test( name = "link_test", srcs = ["link_test.cpp"], - libc_function_deps = [ - "//libc:open", + deps = [ "//libc:close", "//libc:link", + "//libc:open", "//libc:unlink", - ], - deps = [ "//libc/test/UnitTest:errno_test_helpers", ], ) @@ -139,24 +123,20 @@ libc_test( libc_test( name = "swab_test", srcs = ["swab_test.cpp"], - libc_function_deps = [ - "//libc:swab", - ], deps = [ "//libc:string_utils", + "//libc:swab", ], ) libc_test( name = "symlink_test", srcs = ["symlink_test.cpp"], - libc_function_deps = [ - "//libc:open", + deps = [ "//libc:close", + "//libc:open", "//libc:symlink", "//libc:unlink", - ], - deps = [ "//libc/test/UnitTest:errno_test_helpers", ], ) @@ -164,16 +144,14 @@ libc_test( libc_test( name = "truncate_test", srcs = ["truncate_test.cpp"], - libc_function_deps = [ - "//libc:open", + deps = [ + "//libc:__support_cpp_string_view", "//libc:close", + "//libc:open", "//libc:read", "//libc:truncate", "//libc:unlink", "//libc:write", - ], - deps = [ - "//libc:__support_cpp_string_view", "//libc/test/UnitTest:errno_test_helpers", ], ) @@ -181,12 +159,10 @@ libc_test( libc_test( name = "unlink_test", srcs = ["unlink_test.cpp"], - libc_function_deps = [ - "//libc:open", + deps = [ "//libc:close", + "//libc:open", "//libc:unlink", - ], - deps = [ "//libc/test/UnitTest:errno_test_helpers", ], ) @@ -194,7 
+170,7 @@ libc_test( libc_test( name = "getppid_test", srcs = ["getppid_test.cpp"], - libc_function_deps = [ + deps = [ "//libc:getppid", ], ) @@ -202,7 +178,7 @@ libc_test( libc_test( name = "getuid_test", srcs = ["getuid_test.cpp"], - libc_function_deps = [ + deps = [ "//libc:getuid", ], ) @@ -210,12 +186,10 @@ libc_test( libc_test( name = "isatty_test", srcs = ["isatty_test.cpp"], - libc_function_deps = [ + deps = [ + "//libc:close", "//libc:isatty", "//libc:open", - "//libc:close", - ], - deps = [ "//libc/test/UnitTest:errno_test_helpers", ], ) @@ -223,7 +197,7 @@ libc_test( libc_test( name = "geteuid_test", srcs = ["geteuid_test.cpp"], - libc_function_deps = [ + deps = [ "//libc:geteuid", ], ) @@ -236,7 +210,7 @@ libc_test( # libc_test( # name = "syscall_test", # srcs = ["syscall_test.cpp"], -# libc_function_deps = [ +# deps = [ # "//libc:syscall", # ], # ) @@ -246,7 +220,7 @@ libc_test( # libc_test( # name = "sysconf_test", # srcs = ["sysconf_test.cpp"], -# libc_function_deps = [ +# deps = [ # "//libc:sysconf", # ], # ) @@ -256,12 +230,10 @@ libc_test( # libc_test( # name = "getopt_test", # srcs = ["getopt_test.cpp"], -# libc_function_deps = [ +# deps = [ +# "//libc:__support_cpp_array", # "//libc:getopt", # "//libc:fopencookie", # "//libc:fflush", # ], -# deps = [ -# "//libc:__support_cpp_array", -# ], # ) From a024d13f84dbe7b3d1eee555ddc82cdd1af814e0 Mon Sep 17 00:00:00 2001 From: tangaac Date: Wed, 16 Apr 2025 11:33:29 +0800 Subject: [PATCH 073/710] [LoongArch] make ABDS/ABDU legal for lsx/lasx (#134190) --- .../LoongArch/LoongArchISelLowering.cpp | 4 + .../LoongArch/LoongArchLASXInstrInfo.td | 4 + .../Target/LoongArch/LoongArchLSXInstrInfo.td | 4 + .../LoongArch/lasx/ir-instruction/absd.ll | 151 +++++------------- .../LoongArch/lsx/ir-instruction/absd.ll | 151 +++++------------- 5 files changed, 86 insertions(+), 228 deletions(-) diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 
002d88cbeeba3..e5ccbe897d19c 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -272,6 +272,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, {ISD::SETNE, ISD::SETGE, ISD::SETGT, ISD::SETUGE, ISD::SETUGT}, VT, Expand); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); + setOperationAction(ISD::ABDS, VT, Legal); + setOperationAction(ISD::ABDU, VT, Legal); } for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32}) setOperationAction(ISD::BITREVERSE, VT, Custom); @@ -336,6 +338,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, {ISD::SETNE, ISD::SETGE, ISD::SETGT, ISD::SETUGE, ISD::SETUGT}, VT, Expand); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); + setOperationAction(ISD::ABDS, VT, Legal); + setOperationAction(ISD::ABDU, VT, Legal); } for (MVT VT : {MVT::v32i8, MVT::v16i16, MVT::v8i32}) setOperationAction(ISD::BITREVERSE, VT, Custom); diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index d6d532cddb594..aaa4b94b6e994 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -1840,6 +1840,10 @@ def : Pat<(vt (concat_vectors LSX128:$vd, LSX128:$vj)), (XVPERMI_Q (SUBREG_TO_REG (i64 0), LSX128:$vd, sub_128), (SUBREG_TO_REG (i64 0), LSX128:$vj, sub_128), 2)>; +// XVABSD_{B/H/W/D}[U] +defm : PatXrXr; +defm : PatXrXrU; + } // Predicates = [HasExtLASX] /// Intrinsic pattern diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td index b0d880749bf92..e7327ce7461f7 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td @@ -2002,6 +2002,10 @@ def : Pat<(f32 f32imm_vldi:$in), def : Pat<(f64 f64imm_vldi:$in), (f64 (EXTRACT_SUBREG (VLDI (to_f64imm_vldi f64imm_vldi:$in)), sub_64))>; 
+// VABSD_{B/H/W/D}[U] +defm : PatVrVr; +defm : PatVrVrU; + } // Predicates = [HasExtLSX] /// Intrinsic pattern diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/absd.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/absd.ll index bd5b16f5147a2..c5df9f8420837 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/absd.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/absd.ll @@ -1,7 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=loongarch64 -mattr=+lasx < %s | FileCheck %s -;; TODO: Currently LoongArch generates sub-optimal code for these cases ;; 1. trunc(abs(sub(sext(a),sext(b)))) -> abds(a,b) or abdu(a,b) ;; 2. abs(sub_nsw(x, y)) -> abds(a,b) ;; 3. sub(smax(a,b),smin(a,b)) -> abds(a,b) or abdu(a,b) @@ -9,16 +8,12 @@ ;; 5. sub(select(icmp(a,b),a,b),select(icmp(a,b),b,a)) -> abds(a,b) or abdu(a,b) ;; ;; abds / abdu can be lowered to xvabsd.{b/h/w/d} / xvabsd.{b/h/w/d}u instruction. -;; -;; Later patch will address it. 
;; trunc(abs(sub(sext(a),sext(b)))) -> abds(a,b) define <32 x i8> @xvabsd_b(<32 x i8> %a, <32 x i8> %b) { ; CHECK-LABEL: xvabsd_b: ; CHECK: # %bb.0: -; CHECK-NEXT: xvmin.b $xr2, $xr0, $xr1 -; CHECK-NEXT: xvmax.b $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsub.b $xr0, $xr0, $xr2 +; CHECK-NEXT: xvabsd.b $xr0, $xr0, $xr1 ; CHECK-NEXT: ret %a.sext = sext <32 x i8> %a to <32 x i16> %b.sext = sext <32 x i8> %b to <32 x i16> @@ -31,9 +26,7 @@ define <32 x i8> @xvabsd_b(<32 x i8> %a, <32 x i8> %b) { define <16 x i16> @xvabsd_h(<16 x i16> %a, <16 x i16> %b) { ; CHECK-LABEL: xvabsd_h: ; CHECK: # %bb.0: -; CHECK-NEXT: xvmin.h $xr2, $xr0, $xr1 -; CHECK-NEXT: xvmax.h $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsub.h $xr0, $xr0, $xr2 +; CHECK-NEXT: xvabsd.h $xr0, $xr0, $xr1 ; CHECK-NEXT: ret %a.sext = sext <16 x i16> %a to <16 x i32> %b.sext = sext <16 x i16> %b to <16 x i32> @@ -46,9 +39,7 @@ define <16 x i16> @xvabsd_h(<16 x i16> %a, <16 x i16> %b) { define <8 x i32> @xvabsd_w(<8 x i32> %a, <8 x i32> %b) { ; CHECK-LABEL: xvabsd_w: ; CHECK: # %bb.0: -; CHECK-NEXT: xvmin.w $xr2, $xr0, $xr1 -; CHECK-NEXT: xvmax.w $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsub.w $xr0, $xr0, $xr2 +; CHECK-NEXT: xvabsd.w $xr0, $xr0, $xr1 ; CHECK-NEXT: ret %a.sext = sext <8 x i32> %a to <8 x i64> %b.sext = sext <8 x i32> %b to <8 x i64> @@ -61,9 +52,7 @@ define <8 x i32> @xvabsd_w(<8 x i32> %a, <8 x i32> %b) { define <4 x i64> @xvabsd_d(<4 x i64> %a, <4 x i64> %b) { ; CHECK-LABEL: xvabsd_d: ; CHECK: # %bb.0: -; CHECK-NEXT: xvmin.d $xr2, $xr0, $xr1 -; CHECK-NEXT: xvmax.d $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsub.d $xr0, $xr0, $xr2 +; CHECK-NEXT: xvabsd.d $xr0, $xr0, $xr1 ; CHECK-NEXT: ret %a.sext = sext <4 x i64> %a to <4 x i128> %b.sext = sext <4 x i64> %b to <4 x i128> @@ -76,9 +65,7 @@ define <4 x i64> @xvabsd_d(<4 x i64> %a, <4 x i64> %b) { define <32 x i8> @xvabsd_bu(<32 x i8> %a, <32 x i8> %b) { ; CHECK-LABEL: xvabsd_bu: ; CHECK: # %bb.0: -; CHECK-NEXT: xvmin.bu $xr2, $xr0, $xr1 -; CHECK-NEXT: xvmax.bu $xr0, $xr0, $xr1 -; 
CHECK-NEXT: xvsub.b $xr0, $xr0, $xr2 +; CHECK-NEXT: xvabsd.bu $xr0, $xr0, $xr1 ; CHECK-NEXT: ret %a.zext = zext <32 x i8> %a to <32 x i16> %b.zext = zext <32 x i8> %b to <32 x i16> @@ -91,9 +78,7 @@ define <32 x i8> @xvabsd_bu(<32 x i8> %a, <32 x i8> %b) { define <16 x i16> @xvabsd_hu(<16 x i16> %a, <16 x i16> %b) { ; CHECK-LABEL: xvabsd_hu: ; CHECK: # %bb.0: -; CHECK-NEXT: xvmin.hu $xr2, $xr0, $xr1 -; CHECK-NEXT: xvmax.hu $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsub.h $xr0, $xr0, $xr2 +; CHECK-NEXT: xvabsd.hu $xr0, $xr0, $xr1 ; CHECK-NEXT: ret %a.zext = zext <16 x i16> %a to <16 x i32> %b.zext = zext <16 x i16> %b to <16 x i32> @@ -106,9 +91,7 @@ define <16 x i16> @xvabsd_hu(<16 x i16> %a, <16 x i16> %b) { define <8 x i32> @xvabsd_wu(<8 x i32> %a, <8 x i32> %b) { ; CHECK-LABEL: xvabsd_wu: ; CHECK: # %bb.0: -; CHECK-NEXT: xvmin.wu $xr2, $xr0, $xr1 -; CHECK-NEXT: xvmax.wu $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsub.w $xr0, $xr0, $xr2 +; CHECK-NEXT: xvabsd.wu $xr0, $xr0, $xr1 ; CHECK-NEXT: ret %a.zext = zext <8 x i32> %a to <8 x i64> %b.zext = zext <8 x i32> %b to <8 x i64> @@ -121,9 +104,7 @@ define <8 x i32> @xvabsd_wu(<8 x i32> %a, <8 x i32> %b) { define <4 x i64> @xvabsd_du(<4 x i64> %a, <4 x i64> %b) { ; CHECK-LABEL: xvabsd_du: ; CHECK: # %bb.0: -; CHECK-NEXT: xvmin.du $xr2, $xr0, $xr1 -; CHECK-NEXT: xvmax.du $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsub.d $xr0, $xr0, $xr2 +; CHECK-NEXT: xvabsd.du $xr0, $xr0, $xr1 ; CHECK-NEXT: ret %a.zext = zext <4 x i64> %a to <4 x i128> %b.zext = zext <4 x i64> %b to <4 x i128> @@ -137,9 +118,7 @@ define <4 x i64> @xvabsd_du(<4 x i64> %a, <4 x i64> %b) { define <32 x i8> @xvabsd_b_nsw(<32 x i8> %a, <32 x i8> %b) { ; CHECK-LABEL: xvabsd_b_nsw: ; CHECK: # %bb.0: -; CHECK-NEXT: xvsub.b $xr0, $xr0, $xr1 -; CHECK-NEXT: xvneg.b $xr1, $xr0 -; CHECK-NEXT: xvmax.b $xr0, $xr0, $xr1 +; CHECK-NEXT: xvabsd.b $xr0, $xr0, $xr1 ; CHECK-NEXT: ret %sub = sub nsw <32 x i8> %a, %b %abs = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %sub, i1 true) @@ -149,9 +128,7 @@ 
define <32 x i8> @xvabsd_b_nsw(<32 x i8> %a, <32 x i8> %b) { define <16 x i16> @xvabsd_h_nsw(<16 x i16> %a, <16 x i16> %b) { ; CHECK-LABEL: xvabsd_h_nsw: ; CHECK: # %bb.0: -; CHECK-NEXT: xvsub.h $xr0, $xr0, $xr1 -; CHECK-NEXT: xvneg.h $xr1, $xr0 -; CHECK-NEXT: xvmax.h $xr0, $xr0, $xr1 +; CHECK-NEXT: xvabsd.h $xr0, $xr0, $xr1 ; CHECK-NEXT: ret %sub = sub nsw <16 x i16> %a, %b %abs = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %sub, i1 true) @@ -161,9 +138,7 @@ define <16 x i16> @xvabsd_h_nsw(<16 x i16> %a, <16 x i16> %b) { define <8 x i32> @xvabsd_w_nsw(<8 x i32> %a, <8 x i32> %b) { ; CHECK-LABEL: xvabsd_w_nsw: ; CHECK: # %bb.0: -; CHECK-NEXT: xvsub.w $xr0, $xr0, $xr1 -; CHECK-NEXT: xvneg.w $xr1, $xr0 -; CHECK-NEXT: xvmax.w $xr0, $xr0, $xr1 +; CHECK-NEXT: xvabsd.w $xr0, $xr0, $xr1 ; CHECK-NEXT: ret %sub = sub nsw <8 x i32> %a, %b %abs = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %sub, i1 true) @@ -173,9 +148,7 @@ define <8 x i32> @xvabsd_w_nsw(<8 x i32> %a, <8 x i32> %b) { define <4 x i64> @xvabsd_d_nsw(<4 x i64> %a, <4 x i64> %b) { ; CHECK-LABEL: xvabsd_d_nsw: ; CHECK: # %bb.0: -; CHECK-NEXT: xvsub.d $xr0, $xr0, $xr1 -; CHECK-NEXT: xvneg.d $xr1, $xr0 -; CHECK-NEXT: xvmax.d $xr0, $xr0, $xr1 +; CHECK-NEXT: xvabsd.d $xr0, $xr0, $xr1 ; CHECK-NEXT: ret %sub = sub nsw <4 x i64> %a, %b %abs = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %sub, i1 true) @@ -186,9 +159,7 @@ define <4 x i64> @xvabsd_d_nsw(<4 x i64> %a, <4 x i64> %b) { define <32 x i8> @maxmin_b(<32 x i8> %0, <32 x i8> %1) { ; CHECK-LABEL: maxmin_b: ; CHECK: # %bb.0: -; CHECK-NEXT: xvmin.b $xr2, $xr0, $xr1 -; CHECK-NEXT: xvmax.b $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsub.b $xr0, $xr0, $xr2 +; CHECK-NEXT: xvabsd.b $xr0, $xr0, $xr1 ; CHECK-NEXT: ret %a = tail call <32 x i8> @llvm.smax.v32i8(<32 x i8> %0, <32 x i8> %1) %b = tail call <32 x i8> @llvm.smin.v32i8(<32 x i8> %0, <32 x i8> %1) @@ -199,9 +170,7 @@ define <32 x i8> @maxmin_b(<32 x i8> %0, <32 x i8> %1) { define <16 x i16> @maxmin_h(<16 x i16> %0, <16 x i16> %1) { ; 
CHECK-LABEL: maxmin_h: ; CHECK: # %bb.0: -; CHECK-NEXT: xvmin.h $xr2, $xr0, $xr1 -; CHECK-NEXT: xvmax.h $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsub.h $xr0, $xr0, $xr2 +; CHECK-NEXT: xvabsd.h $xr0, $xr0, $xr1 ; CHECK-NEXT: ret %a = tail call <16 x i16> @llvm.smax.v16i16(<16 x i16> %0, <16 x i16> %1) %b = tail call <16 x i16> @llvm.smin.v16i16(<16 x i16> %0, <16 x i16> %1) @@ -212,9 +181,7 @@ define <16 x i16> @maxmin_h(<16 x i16> %0, <16 x i16> %1) { define <8 x i32> @maxmin_w(<8 x i32> %0, <8 x i32> %1) { ; CHECK-LABEL: maxmin_w: ; CHECK: # %bb.0: -; CHECK-NEXT: xvmin.w $xr2, $xr0, $xr1 -; CHECK-NEXT: xvmax.w $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsub.w $xr0, $xr0, $xr2 +; CHECK-NEXT: xvabsd.w $xr0, $xr0, $xr1 ; CHECK-NEXT: ret %a = tail call <8 x i32> @llvm.smax.v8i32(<8 x i32> %0, <8 x i32> %1) %b = tail call <8 x i32> @llvm.smin.v8i32(<8 x i32> %0, <8 x i32> %1) @@ -225,9 +192,7 @@ define <8 x i32> @maxmin_w(<8 x i32> %0, <8 x i32> %1) { define <4 x i64> @maxmin_d(<4 x i64> %0, <4 x i64> %1) { ; CHECK-LABEL: maxmin_d: ; CHECK: # %bb.0: -; CHECK-NEXT: xvmin.d $xr2, $xr0, $xr1 -; CHECK-NEXT: xvmax.d $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsub.d $xr0, $xr0, $xr2 +; CHECK-NEXT: xvabsd.d $xr0, $xr0, $xr1 ; CHECK-NEXT: ret %a = tail call <4 x i64> @llvm.smax.v4i64(<4 x i64> %0, <4 x i64> %1) %b = tail call <4 x i64> @llvm.smin.v4i64(<4 x i64> %0, <4 x i64> %1) @@ -238,9 +203,7 @@ define <4 x i64> @maxmin_d(<4 x i64> %0, <4 x i64> %1) { define <32 x i8> @maxmin_bu(<32 x i8> %0, <32 x i8> %1) { ; CHECK-LABEL: maxmin_bu: ; CHECK: # %bb.0: -; CHECK-NEXT: xvmin.bu $xr2, $xr0, $xr1 -; CHECK-NEXT: xvmax.bu $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsub.b $xr0, $xr0, $xr2 +; CHECK-NEXT: xvabsd.bu $xr0, $xr0, $xr1 ; CHECK-NEXT: ret %a = tail call <32 x i8> @llvm.umax.v32i8(<32 x i8> %0, <32 x i8> %1) %b = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> %0, <32 x i8> %1) @@ -251,9 +214,7 @@ define <32 x i8> @maxmin_bu(<32 x i8> %0, <32 x i8> %1) { define <16 x i16> @maxmin_hu(<16 x i16> %0, <16 x i16> 
%1) { ; CHECK-LABEL: maxmin_hu: ; CHECK: # %bb.0: -; CHECK-NEXT: xvmin.hu $xr2, $xr0, $xr1 -; CHECK-NEXT: xvmax.hu $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsub.h $xr0, $xr0, $xr2 +; CHECK-NEXT: xvabsd.hu $xr0, $xr0, $xr1 ; CHECK-NEXT: ret %a = tail call <16 x i16> @llvm.umax.v16i16(<16 x i16> %0, <16 x i16> %1) %b = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> %0, <16 x i16> %1) @@ -264,9 +225,7 @@ define <16 x i16> @maxmin_hu(<16 x i16> %0, <16 x i16> %1) { define <8 x i32> @maxmin_wu(<8 x i32> %0, <8 x i32> %1) { ; CHECK-LABEL: maxmin_wu: ; CHECK: # %bb.0: -; CHECK-NEXT: xvmin.wu $xr2, $xr0, $xr1 -; CHECK-NEXT: xvmax.wu $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsub.w $xr0, $xr0, $xr2 +; CHECK-NEXT: xvabsd.wu $xr0, $xr0, $xr1 ; CHECK-NEXT: ret %a = tail call <8 x i32> @llvm.umax.v8i32(<8 x i32> %0, <8 x i32> %1) %b = tail call <8 x i32> @llvm.umin.v8i32(<8 x i32> %0, <8 x i32> %1) @@ -277,9 +236,7 @@ define <8 x i32> @maxmin_wu(<8 x i32> %0, <8 x i32> %1) { define <4 x i64> @maxmin_du(<4 x i64> %0, <4 x i64> %1) { ; CHECK-LABEL: maxmin_du: ; CHECK: # %bb.0: -; CHECK-NEXT: xvmin.du $xr2, $xr0, $xr1 -; CHECK-NEXT: xvmax.du $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsub.d $xr0, $xr0, $xr2 +; CHECK-NEXT: xvabsd.du $xr0, $xr0, $xr1 ; CHECK-NEXT: ret %a = tail call <4 x i64> @llvm.umax.v4i64(<4 x i64> %0, <4 x i64> %1) %b = tail call <4 x i64> @llvm.umin.v4i64(<4 x i64> %0, <4 x i64> %1) @@ -290,9 +247,7 @@ define <4 x i64> @maxmin_du(<4 x i64> %0, <4 x i64> %1) { define <32 x i8> @maxmin_bu_com1(<32 x i8> %0, <32 x i8> %1) { ; CHECK-LABEL: maxmin_bu_com1: ; CHECK: # %bb.0: -; CHECK-NEXT: xvmin.bu $xr2, $xr0, $xr1 -; CHECK-NEXT: xvmax.bu $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsub.b $xr0, $xr0, $xr2 +; CHECK-NEXT: xvabsd.bu $xr0, $xr0, $xr1 ; CHECK-NEXT: ret %a = tail call <32 x i8> @llvm.umax.v32i8(<32 x i8> %0, <32 x i8> %1) %b = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> %1, <32 x i8> %0) @@ -304,9 +259,7 @@ define <32 x i8> @maxmin_bu_com1(<32 x i8> %0, <32 x i8> %1) { define <32 x i8> 
@xvabsd_b_cmp(<32 x i8> %a, <32 x i8> %b) nounwind { ; CHECK-LABEL: xvabsd_b_cmp: ; CHECK: # %bb.0: -; CHECK-NEXT: xvmin.b $xr2, $xr0, $xr1 -; CHECK-NEXT: xvmax.b $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsub.b $xr0, $xr0, $xr2 +; CHECK-NEXT: xvabsd.b $xr0, $xr0, $xr1 ; CHECK-NEXT: ret %cmp = icmp slt <32 x i8> %a, %b %ab = sub <32 x i8> %a, %b @@ -318,9 +271,7 @@ define <32 x i8> @xvabsd_b_cmp(<32 x i8> %a, <32 x i8> %b) nounwind { define <16 x i16> @xvabsd_h_cmp(<16 x i16> %a, <16 x i16> %b) nounwind { ; CHECK-LABEL: xvabsd_h_cmp: ; CHECK: # %bb.0: -; CHECK-NEXT: xvmin.h $xr2, $xr0, $xr1 -; CHECK-NEXT: xvmax.h $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsub.h $xr0, $xr0, $xr2 +; CHECK-NEXT: xvabsd.h $xr0, $xr0, $xr1 ; CHECK-NEXT: ret %cmp = icmp slt <16 x i16> %a, %b %ab = sub <16 x i16> %a, %b @@ -332,9 +283,7 @@ define <16 x i16> @xvabsd_h_cmp(<16 x i16> %a, <16 x i16> %b) nounwind { define <8 x i32> @xvabsd_w_cmp(<8 x i32> %a, <8 x i32> %b) nounwind { ; CHECK-LABEL: xvabsd_w_cmp: ; CHECK: # %bb.0: -; CHECK-NEXT: xvmin.w $xr2, $xr0, $xr1 -; CHECK-NEXT: xvmax.w $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsub.w $xr0, $xr0, $xr2 +; CHECK-NEXT: xvabsd.w $xr0, $xr0, $xr1 ; CHECK-NEXT: ret %cmp = icmp slt <8 x i32> %a, %b %ab = sub <8 x i32> %a, %b @@ -346,9 +295,7 @@ define <8 x i32> @xvabsd_w_cmp(<8 x i32> %a, <8 x i32> %b) nounwind { define <4 x i64> @xvabsd_d_cmp(<4 x i64> %a, <4 x i64> %b) nounwind { ; CHECK-LABEL: xvabsd_d_cmp: ; CHECK: # %bb.0: -; CHECK-NEXT: xvmin.d $xr2, $xr0, $xr1 -; CHECK-NEXT: xvmax.d $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsub.d $xr0, $xr0, $xr2 +; CHECK-NEXT: xvabsd.d $xr0, $xr0, $xr1 ; CHECK-NEXT: ret %cmp = icmp slt <4 x i64> %a, %b %ab = sub <4 x i64> %a, %b @@ -360,9 +307,7 @@ define <4 x i64> @xvabsd_d_cmp(<4 x i64> %a, <4 x i64> %b) nounwind { define <32 x i8> @xvabsd_bu_cmp(<32 x i8> %a, <32 x i8> %b) nounwind { ; CHECK-LABEL: xvabsd_bu_cmp: ; CHECK: # %bb.0: -; CHECK-NEXT: xvmin.bu $xr2, $xr0, $xr1 -; CHECK-NEXT: xvmax.bu $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsub.b 
$xr0, $xr0, $xr2 +; CHECK-NEXT: xvabsd.bu $xr0, $xr0, $xr1 ; CHECK-NEXT: ret %cmp = icmp ult <32 x i8> %a, %b %ab = sub <32 x i8> %a, %b @@ -374,9 +319,7 @@ define <32 x i8> @xvabsd_bu_cmp(<32 x i8> %a, <32 x i8> %b) nounwind { define <16 x i16> @xvabsd_hu_cmp(<16 x i16> %a, <16 x i16> %b) nounwind { ; CHECK-LABEL: xvabsd_hu_cmp: ; CHECK: # %bb.0: -; CHECK-NEXT: xvmin.hu $xr2, $xr0, $xr1 -; CHECK-NEXT: xvmax.hu $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsub.h $xr0, $xr0, $xr2 +; CHECK-NEXT: xvabsd.hu $xr0, $xr0, $xr1 ; CHECK-NEXT: ret %cmp = icmp ult <16 x i16> %a, %b %ab = sub <16 x i16> %a, %b @@ -388,9 +331,7 @@ define <16 x i16> @xvabsd_hu_cmp(<16 x i16> %a, <16 x i16> %b) nounwind { define <8 x i32> @xvabsd_wu_cmp(<8 x i32> %a, <8 x i32> %b) nounwind { ; CHECK-LABEL: xvabsd_wu_cmp: ; CHECK: # %bb.0: -; CHECK-NEXT: xvmin.wu $xr2, $xr0, $xr1 -; CHECK-NEXT: xvmax.wu $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsub.w $xr0, $xr0, $xr2 +; CHECK-NEXT: xvabsd.wu $xr0, $xr0, $xr1 ; CHECK-NEXT: ret %cmp = icmp ult <8 x i32> %a, %b %ab = sub <8 x i32> %a, %b @@ -402,9 +343,7 @@ define <8 x i32> @xvabsd_wu_cmp(<8 x i32> %a, <8 x i32> %b) nounwind { define <4 x i64> @xvabsd_du_cmp(<4 x i64> %a, <4 x i64> %b) nounwind { ; CHECK-LABEL: xvabsd_du_cmp: ; CHECK: # %bb.0: -; CHECK-NEXT: xvmin.du $xr2, $xr0, $xr1 -; CHECK-NEXT: xvmax.du $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsub.d $xr0, $xr0, $xr2 +; CHECK-NEXT: xvabsd.du $xr0, $xr0, $xr1 ; CHECK-NEXT: ret %cmp = icmp ult <4 x i64> %a, %b %ab = sub <4 x i64> %a, %b @@ -417,9 +356,7 @@ define <4 x i64> @xvabsd_du_cmp(<4 x i64> %a, <4 x i64> %b) nounwind { define <32 x i8> @xvabsd_b_select(<32 x i8> %a, <32 x i8> %b) nounwind { ; CHECK-LABEL: xvabsd_b_select: ; CHECK: # %bb.0: -; CHECK-NEXT: xvmin.b $xr2, $xr0, $xr1 -; CHECK-NEXT: xvmax.b $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsub.b $xr0, $xr0, $xr2 +; CHECK-NEXT: xvabsd.b $xr0, $xr0, $xr1 ; CHECK-NEXT: ret %cmp = icmp slt <32 x i8> %a, %b %ab = select <32 x i1> %cmp, <32 x i8> %a, <32 x i8> %b @@ -431,9 +368,7 
@@ define <32 x i8> @xvabsd_b_select(<32 x i8> %a, <32 x i8> %b) nounwind { define <16 x i16> @xvabsd_h_select(<16 x i16> %a, <16 x i16> %b) nounwind { ; CHECK-LABEL: xvabsd_h_select: ; CHECK: # %bb.0: -; CHECK-NEXT: xvmin.h $xr2, $xr0, $xr1 -; CHECK-NEXT: xvmax.h $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsub.h $xr0, $xr0, $xr2 +; CHECK-NEXT: xvabsd.h $xr0, $xr0, $xr1 ; CHECK-NEXT: ret %cmp = icmp sle <16 x i16> %a, %b %ab = select <16 x i1> %cmp, <16 x i16> %a, <16 x i16> %b @@ -445,9 +380,7 @@ define <16 x i16> @xvabsd_h_select(<16 x i16> %a, <16 x i16> %b) nounwind { define <8 x i32> @xvabsd_w_select(<8 x i32> %a, <8 x i32> %b) nounwind { ; CHECK-LABEL: xvabsd_w_select: ; CHECK: # %bb.0: -; CHECK-NEXT: xvmin.w $xr2, $xr0, $xr1 -; CHECK-NEXT: xvmax.w $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsub.w $xr0, $xr0, $xr2 +; CHECK-NEXT: xvabsd.w $xr0, $xr0, $xr1 ; CHECK-NEXT: ret %cmp = icmp sgt <8 x i32> %a, %b %ab = select <8 x i1> %cmp, <8 x i32> %a, <8 x i32> %b @@ -459,9 +392,7 @@ define <8 x i32> @xvabsd_w_select(<8 x i32> %a, <8 x i32> %b) nounwind { define <4 x i64> @xvabsd_d_select(<4 x i64> %a, <4 x i64> %b) nounwind { ; CHECK-LABEL: xvabsd_d_select: ; CHECK: # %bb.0: -; CHECK-NEXT: xvmin.d $xr2, $xr0, $xr1 -; CHECK-NEXT: xvmax.d $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsub.d $xr0, $xr0, $xr2 +; CHECK-NEXT: xvabsd.d $xr0, $xr0, $xr1 ; CHECK-NEXT: ret %cmp = icmp sge <4 x i64> %a, %b %ab = select <4 x i1> %cmp, <4 x i64> %a, <4 x i64> %b @@ -473,9 +404,7 @@ define <4 x i64> @xvabsd_d_select(<4 x i64> %a, <4 x i64> %b) nounwind { define <32 x i8> @xvabsd_bu_select(<32 x i8> %a, <32 x i8> %b) nounwind { ; CHECK-LABEL: xvabsd_bu_select: ; CHECK: # %bb.0: -; CHECK-NEXT: xvmin.bu $xr2, $xr0, $xr1 -; CHECK-NEXT: xvmax.bu $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsub.b $xr0, $xr0, $xr2 +; CHECK-NEXT: xvabsd.bu $xr0, $xr0, $xr1 ; CHECK-NEXT: ret %cmp = icmp ult <32 x i8> %a, %b %ab = select <32 x i1> %cmp, <32 x i8> %a, <32 x i8> %b @@ -487,9 +416,7 @@ define <32 x i8> @xvabsd_bu_select(<32 x i8> %a, 
<32 x i8> %b) nounwind { define <16 x i16> @xvabsd_hu_select(<16 x i16> %a, <16 x i16> %b) nounwind { ; CHECK-LABEL: xvabsd_hu_select: ; CHECK: # %bb.0: -; CHECK-NEXT: xvmin.hu $xr2, $xr0, $xr1 -; CHECK-NEXT: xvmax.hu $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsub.h $xr0, $xr0, $xr2 +; CHECK-NEXT: xvabsd.hu $xr0, $xr0, $xr1 ; CHECK-NEXT: ret %cmp = icmp ule <16 x i16> %a, %b %ab = select <16 x i1> %cmp, <16 x i16> %a, <16 x i16> %b @@ -501,9 +428,7 @@ define <16 x i16> @xvabsd_hu_select(<16 x i16> %a, <16 x i16> %b) nounwind { define <8 x i32> @xvabsd_wu_select(<8 x i32> %a, <8 x i32> %b) nounwind { ; CHECK-LABEL: xvabsd_wu_select: ; CHECK: # %bb.0: -; CHECK-NEXT: xvmin.wu $xr2, $xr0, $xr1 -; CHECK-NEXT: xvmax.wu $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsub.w $xr0, $xr0, $xr2 +; CHECK-NEXT: xvabsd.wu $xr0, $xr0, $xr1 ; CHECK-NEXT: ret %cmp = icmp ugt <8 x i32> %a, %b %ab = select <8 x i1> %cmp, <8 x i32> %a, <8 x i32> %b @@ -515,9 +440,7 @@ define <8 x i32> @xvabsd_wu_select(<8 x i32> %a, <8 x i32> %b) nounwind { define <4 x i64> @xvabsd_du_select(<4 x i64> %a, <4 x i64> %b) nounwind { ; CHECK-LABEL: xvabsd_du_select: ; CHECK: # %bb.0: -; CHECK-NEXT: xvmin.du $xr2, $xr0, $xr1 -; CHECK-NEXT: xvmax.du $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsub.d $xr0, $xr0, $xr2 +; CHECK-NEXT: xvabsd.du $xr0, $xr0, $xr1 ; CHECK-NEXT: ret %cmp = icmp uge <4 x i64> %a, %b %ab = select <4 x i1> %cmp, <4 x i64> %a, <4 x i64> %b diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/absd.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/absd.ll index 2cbd74204d5d6..f77a31b600761 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/absd.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/absd.ll @@ -1,7 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=loongarch64 -mattr=+lsx < %s | FileCheck %s -;; TODO: Currently LoongArch generates sub-optimal code for five cases ;; 1. trunc(abs(sub(sext(a),sext(b)))) -> abds(a,b) or abdu(a,b) ;; 2. 
abs(sub_nsw(x, y)) -> abds(a,b) ;; 3. sub(smax(a,b),smin(a,b)) -> abds(a,b) or abdu(a,b) @@ -9,16 +8,12 @@ ;; 5. sub(select(icmp(a,b),a,b),select(icmp(a,b),b,a)) -> abds(a,b) or abdu(a,b) ;; ;; abds / abdu can be lowered to vabsd.{b/h/w/d} / vabsd.{b/h/w/d}u instruction -;; -;; Later patch will address it. ;; trunc(abs(sub(sext(a),sext(b)))) -> abds(a,b) define <16 x i8> @vabsd_b(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: vabsd_b: ; CHECK: # %bb.0: -; CHECK-NEXT: vmin.b $vr2, $vr0, $vr1 -; CHECK-NEXT: vmax.b $vr0, $vr0, $vr1 -; CHECK-NEXT: vsub.b $vr0, $vr0, $vr2 +; CHECK-NEXT: vabsd.b $vr0, $vr0, $vr1 ; CHECK-NEXT: ret %a.sext = sext <16 x i8> %a to <16 x i16> %b.sext = sext <16 x i8> %b to <16 x i16> @@ -31,9 +26,7 @@ define <16 x i8> @vabsd_b(<16 x i8> %a, <16 x i8> %b) { define <8 x i16> @vabsd_h(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: vabsd_h: ; CHECK: # %bb.0: -; CHECK-NEXT: vmin.h $vr2, $vr0, $vr1 -; CHECK-NEXT: vmax.h $vr0, $vr0, $vr1 -; CHECK-NEXT: vsub.h $vr0, $vr0, $vr2 +; CHECK-NEXT: vabsd.h $vr0, $vr0, $vr1 ; CHECK-NEXT: ret %a.sext = sext <8 x i16> %a to <8 x i32> %b.sext = sext <8 x i16> %b to <8 x i32> @@ -46,9 +39,7 @@ define <8 x i16> @vabsd_h(<8 x i16> %a, <8 x i16> %b) { define <4 x i32> @vabsd_w(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: vabsd_w: ; CHECK: # %bb.0: -; CHECK-NEXT: vmin.w $vr2, $vr0, $vr1 -; CHECK-NEXT: vmax.w $vr0, $vr0, $vr1 -; CHECK-NEXT: vsub.w $vr0, $vr0, $vr2 +; CHECK-NEXT: vabsd.w $vr0, $vr0, $vr1 ; CHECK-NEXT: ret %a.sext = sext <4 x i32> %a to <4 x i64> %b.sext = sext <4 x i32> %b to <4 x i64> @@ -61,9 +52,7 @@ define <4 x i32> @vabsd_w(<4 x i32> %a, <4 x i32> %b) { define <2 x i64> @vabsd_d(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: vabsd_d: ; CHECK: # %bb.0: -; CHECK-NEXT: vmin.d $vr2, $vr0, $vr1 -; CHECK-NEXT: vmax.d $vr0, $vr0, $vr1 -; CHECK-NEXT: vsub.d $vr0, $vr0, $vr2 +; CHECK-NEXT: vabsd.d $vr0, $vr0, $vr1 ; CHECK-NEXT: ret %a.sext = sext <2 x i64> %a to <2 x i128> %b.sext = sext <2 x i64> %b to <2 x 
i128> @@ -76,9 +65,7 @@ define <2 x i64> @vabsd_d(<2 x i64> %a, <2 x i64> %b) { define <16 x i8> @vabsd_bu(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: vabsd_bu: ; CHECK: # %bb.0: -; CHECK-NEXT: vmin.bu $vr2, $vr0, $vr1 -; CHECK-NEXT: vmax.bu $vr0, $vr0, $vr1 -; CHECK-NEXT: vsub.b $vr0, $vr0, $vr2 +; CHECK-NEXT: vabsd.bu $vr0, $vr0, $vr1 ; CHECK-NEXT: ret %a.zext = zext <16 x i8> %a to <16 x i16> %b.zext = zext <16 x i8> %b to <16 x i16> @@ -91,9 +78,7 @@ define <16 x i8> @vabsd_bu(<16 x i8> %a, <16 x i8> %b) { define <8 x i16> @vabsd_hu(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: vabsd_hu: ; CHECK: # %bb.0: -; CHECK-NEXT: vmin.hu $vr2, $vr0, $vr1 -; CHECK-NEXT: vmax.hu $vr0, $vr0, $vr1 -; CHECK-NEXT: vsub.h $vr0, $vr0, $vr2 +; CHECK-NEXT: vabsd.hu $vr0, $vr0, $vr1 ; CHECK-NEXT: ret %a.zext = zext <8 x i16> %a to <8 x i32> %b.zext = zext <8 x i16> %b to <8 x i32> @@ -106,9 +91,7 @@ define <8 x i16> @vabsd_hu(<8 x i16> %a, <8 x i16> %b) { define <4 x i32> @vabsd_wu(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: vabsd_wu: ; CHECK: # %bb.0: -; CHECK-NEXT: vmin.wu $vr2, $vr0, $vr1 -; CHECK-NEXT: vmax.wu $vr0, $vr0, $vr1 -; CHECK-NEXT: vsub.w $vr0, $vr0, $vr2 +; CHECK-NEXT: vabsd.wu $vr0, $vr0, $vr1 ; CHECK-NEXT: ret %a.zext = zext <4 x i32> %a to <4 x i64> %b.zext = zext <4 x i32> %b to <4 x i64> @@ -121,9 +104,7 @@ define <4 x i32> @vabsd_wu(<4 x i32> %a, <4 x i32> %b) { define <2 x i64> @vabsd_du(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: vabsd_du: ; CHECK: # %bb.0: -; CHECK-NEXT: vmin.du $vr2, $vr0, $vr1 -; CHECK-NEXT: vmax.du $vr0, $vr0, $vr1 -; CHECK-NEXT: vsub.d $vr0, $vr0, $vr2 +; CHECK-NEXT: vabsd.du $vr0, $vr0, $vr1 ; CHECK-NEXT: ret %a.zext = zext <2 x i64> %a to <2 x i128> %b.zext = zext <2 x i64> %b to <2 x i128> @@ -137,9 +118,7 @@ define <2 x i64> @vabsd_du(<2 x i64> %a, <2 x i64> %b) { define <16 x i8> @vabsd_b_nsw(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: vabsd_b_nsw: ; CHECK: # %bb.0: -; CHECK-NEXT: vsub.b $vr0, $vr0, $vr1 -; CHECK-NEXT: vneg.b 
$vr1, $vr0 -; CHECK-NEXT: vmax.b $vr0, $vr0, $vr1 +; CHECK-NEXT: vabsd.b $vr0, $vr0, $vr1 ; CHECK-NEXT: ret %sub = sub nsw <16 x i8> %a, %b %abs = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %sub, i1 true) @@ -149,9 +128,7 @@ define <16 x i8> @vabsd_b_nsw(<16 x i8> %a, <16 x i8> %b) { define <8 x i16> @vabsd_h_nsw(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: vabsd_h_nsw: ; CHECK: # %bb.0: -; CHECK-NEXT: vsub.h $vr0, $vr0, $vr1 -; CHECK-NEXT: vneg.h $vr1, $vr0 -; CHECK-NEXT: vmax.h $vr0, $vr0, $vr1 +; CHECK-NEXT: vabsd.h $vr0, $vr0, $vr1 ; CHECK-NEXT: ret %sub = sub nsw <8 x i16> %a, %b %abs = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %sub, i1 true) @@ -161,9 +138,7 @@ define <8 x i16> @vabsd_h_nsw(<8 x i16> %a, <8 x i16> %b) { define <4 x i32> @vabsd_w_nsw(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: vabsd_w_nsw: ; CHECK: # %bb.0: -; CHECK-NEXT: vsub.w $vr0, $vr0, $vr1 -; CHECK-NEXT: vneg.w $vr1, $vr0 -; CHECK-NEXT: vmax.w $vr0, $vr0, $vr1 +; CHECK-NEXT: vabsd.w $vr0, $vr0, $vr1 ; CHECK-NEXT: ret %sub = sub nsw <4 x i32> %a, %b %abs = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %sub, i1 true) @@ -173,9 +148,7 @@ define <4 x i32> @vabsd_w_nsw(<4 x i32> %a, <4 x i32> %b) { define <2 x i64> @vabsd_d_nsw(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: vabsd_d_nsw: ; CHECK: # %bb.0: -; CHECK-NEXT: vsub.d $vr0, $vr0, $vr1 -; CHECK-NEXT: vneg.d $vr1, $vr0 -; CHECK-NEXT: vmax.d $vr0, $vr0, $vr1 +; CHECK-NEXT: vabsd.d $vr0, $vr0, $vr1 ; CHECK-NEXT: ret %sub = sub nsw <2 x i64> %a, %b %abs = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %sub, i1 true) @@ -186,9 +159,7 @@ define <2 x i64> @vabsd_d_nsw(<2 x i64> %a, <2 x i64> %b) { define <16 x i8> @maxmin_b(<16 x i8> %0, <16 x i8> %1) { ; CHECK-LABEL: maxmin_b: ; CHECK: # %bb.0: -; CHECK-NEXT: vmin.b $vr2, $vr0, $vr1 -; CHECK-NEXT: vmax.b $vr0, $vr0, $vr1 -; CHECK-NEXT: vsub.b $vr0, $vr0, $vr2 +; CHECK-NEXT: vabsd.b $vr0, $vr0, $vr1 ; CHECK-NEXT: ret %a = tail call <16 x i8> @llvm.smax.v16i8(<16 x i8> %0, <16 x i8> %1) %b = tail call <16 x 
i8> @llvm.smin.v16i8(<16 x i8> %0, <16 x i8> %1) @@ -199,9 +170,7 @@ define <16 x i8> @maxmin_b(<16 x i8> %0, <16 x i8> %1) { define <8 x i16> @maxmin_h(<8 x i16> %0, <8 x i16> %1) { ; CHECK-LABEL: maxmin_h: ; CHECK: # %bb.0: -; CHECK-NEXT: vmin.h $vr2, $vr0, $vr1 -; CHECK-NEXT: vmax.h $vr0, $vr0, $vr1 -; CHECK-NEXT: vsub.h $vr0, $vr0, $vr2 +; CHECK-NEXT: vabsd.h $vr0, $vr0, $vr1 ; CHECK-NEXT: ret %a = tail call <8 x i16> @llvm.smax.v8i16(<8 x i16> %0, <8 x i16> %1) %b = tail call <8 x i16> @llvm.smin.v8i16(<8 x i16> %0, <8 x i16> %1) @@ -212,9 +181,7 @@ define <8 x i16> @maxmin_h(<8 x i16> %0, <8 x i16> %1) { define <4 x i32> @maxmin_w(<4 x i32> %0, <4 x i32> %1) { ; CHECK-LABEL: maxmin_w: ; CHECK: # %bb.0: -; CHECK-NEXT: vmin.w $vr2, $vr0, $vr1 -; CHECK-NEXT: vmax.w $vr0, $vr0, $vr1 -; CHECK-NEXT: vsub.w $vr0, $vr0, $vr2 +; CHECK-NEXT: vabsd.w $vr0, $vr0, $vr1 ; CHECK-NEXT: ret %a = tail call <4 x i32> @llvm.smax.v4i32(<4 x i32> %0, <4 x i32> %1) %b = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %0, <4 x i32> %1) @@ -225,9 +192,7 @@ define <4 x i32> @maxmin_w(<4 x i32> %0, <4 x i32> %1) { define <2 x i64> @maxmin_d(<2 x i64> %0, <2 x i64> %1) { ; CHECK-LABEL: maxmin_d: ; CHECK: # %bb.0: -; CHECK-NEXT: vmin.d $vr2, $vr0, $vr1 -; CHECK-NEXT: vmax.d $vr0, $vr0, $vr1 -; CHECK-NEXT: vsub.d $vr0, $vr0, $vr2 +; CHECK-NEXT: vabsd.d $vr0, $vr0, $vr1 ; CHECK-NEXT: ret %a = tail call <2 x i64> @llvm.smax.v2i64(<2 x i64> %0, <2 x i64> %1) %b = tail call <2 x i64> @llvm.smin.v2i64(<2 x i64> %0, <2 x i64> %1) @@ -238,9 +203,7 @@ define <2 x i64> @maxmin_d(<2 x i64> %0, <2 x i64> %1) { define <16 x i8> @maxmin_bu(<16 x i8> %0, <16 x i8> %1) { ; CHECK-LABEL: maxmin_bu: ; CHECK: # %bb.0: -; CHECK-NEXT: vmin.bu $vr2, $vr0, $vr1 -; CHECK-NEXT: vmax.bu $vr0, $vr0, $vr1 -; CHECK-NEXT: vsub.b $vr0, $vr0, $vr2 +; CHECK-NEXT: vabsd.bu $vr0, $vr0, $vr1 ; CHECK-NEXT: ret %a = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> %0, <16 x i8> %1) %b = tail call <16 x i8> @llvm.umin.v16i8(<16 x 
i8> %0, <16 x i8> %1) @@ -251,9 +214,7 @@ define <16 x i8> @maxmin_bu(<16 x i8> %0, <16 x i8> %1) { define <8 x i16> @maxmin_hu(<8 x i16> %0, <8 x i16> %1) { ; CHECK-LABEL: maxmin_hu: ; CHECK: # %bb.0: -; CHECK-NEXT: vmin.hu $vr2, $vr0, $vr1 -; CHECK-NEXT: vmax.hu $vr0, $vr0, $vr1 -; CHECK-NEXT: vsub.h $vr0, $vr0, $vr2 +; CHECK-NEXT: vabsd.hu $vr0, $vr0, $vr1 ; CHECK-NEXT: ret %a = tail call <8 x i16> @llvm.umax.v8i16(<8 x i16> %0, <8 x i16> %1) %b = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %0, <8 x i16> %1) @@ -264,9 +225,7 @@ define <8 x i16> @maxmin_hu(<8 x i16> %0, <8 x i16> %1) { define <4 x i32> @maxmin_wu(<4 x i32> %0, <4 x i32> %1) { ; CHECK-LABEL: maxmin_wu: ; CHECK: # %bb.0: -; CHECK-NEXT: vmin.wu $vr2, $vr0, $vr1 -; CHECK-NEXT: vmax.wu $vr0, $vr0, $vr1 -; CHECK-NEXT: vsub.w $vr0, $vr0, $vr2 +; CHECK-NEXT: vabsd.wu $vr0, $vr0, $vr1 ; CHECK-NEXT: ret %a = tail call <4 x i32> @llvm.umax.v4i32(<4 x i32> %0, <4 x i32> %1) %b = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %0, <4 x i32> %1) @@ -277,9 +236,7 @@ define <4 x i32> @maxmin_wu(<4 x i32> %0, <4 x i32> %1) { define <2 x i64> @maxmin_du(<2 x i64> %0, <2 x i64> %1) { ; CHECK-LABEL: maxmin_du: ; CHECK: # %bb.0: -; CHECK-NEXT: vmin.du $vr2, $vr0, $vr1 -; CHECK-NEXT: vmax.du $vr0, $vr0, $vr1 -; CHECK-NEXT: vsub.d $vr0, $vr0, $vr2 +; CHECK-NEXT: vabsd.du $vr0, $vr0, $vr1 ; CHECK-NEXT: ret %a = tail call <2 x i64> @llvm.umax.v2i64(<2 x i64> %0, <2 x i64> %1) %b = tail call <2 x i64> @llvm.umin.v2i64(<2 x i64> %0, <2 x i64> %1) @@ -290,9 +247,7 @@ define <2 x i64> @maxmin_du(<2 x i64> %0, <2 x i64> %1) { define <16 x i8> @maxmin_bu_com1(<16 x i8> %0, <16 x i8> %1) { ; CHECK-LABEL: maxmin_bu_com1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmin.bu $vr2, $vr0, $vr1 -; CHECK-NEXT: vmax.bu $vr0, $vr0, $vr1 -; CHECK-NEXT: vsub.b $vr0, $vr0, $vr2 +; CHECK-NEXT: vabsd.bu $vr0, $vr0, $vr1 ; CHECK-NEXT: ret %a = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> %0, <16 x i8> %1) %b = tail call <16 x i8> @llvm.umin.v16i8(<16 
x i8> %1, <16 x i8> %0) @@ -304,9 +259,7 @@ define <16 x i8> @maxmin_bu_com1(<16 x i8> %0, <16 x i8> %1) { define <16 x i8> @vabsd_b_cmp(<16 x i8> %a, <16 x i8> %b) nounwind { ; CHECK-LABEL: vabsd_b_cmp: ; CHECK: # %bb.0: -; CHECK-NEXT: vmin.b $vr2, $vr0, $vr1 -; CHECK-NEXT: vmax.b $vr0, $vr0, $vr1 -; CHECK-NEXT: vsub.b $vr0, $vr0, $vr2 +; CHECK-NEXT: vabsd.b $vr0, $vr0, $vr1 ; CHECK-NEXT: ret %cmp = icmp slt <16 x i8> %a, %b %ab = sub <16 x i8> %a, %b @@ -318,9 +271,7 @@ define <16 x i8> @vabsd_b_cmp(<16 x i8> %a, <16 x i8> %b) nounwind { define <8 x i16> @vabsd_h_cmp(<8 x i16> %a, <8 x i16> %b) nounwind { ; CHECK-LABEL: vabsd_h_cmp: ; CHECK: # %bb.0: -; CHECK-NEXT: vmin.h $vr2, $vr0, $vr1 -; CHECK-NEXT: vmax.h $vr0, $vr0, $vr1 -; CHECK-NEXT: vsub.h $vr0, $vr0, $vr2 +; CHECK-NEXT: vabsd.h $vr0, $vr0, $vr1 ; CHECK-NEXT: ret %cmp = icmp slt <8 x i16> %a, %b %ab = sub <8 x i16> %a, %b @@ -332,9 +283,7 @@ define <8 x i16> @vabsd_h_cmp(<8 x i16> %a, <8 x i16> %b) nounwind { define <4 x i32> @vabsd_w_cmp(<4 x i32> %a, <4 x i32> %b) nounwind { ; CHECK-LABEL: vabsd_w_cmp: ; CHECK: # %bb.0: -; CHECK-NEXT: vmin.w $vr2, $vr0, $vr1 -; CHECK-NEXT: vmax.w $vr0, $vr0, $vr1 -; CHECK-NEXT: vsub.w $vr0, $vr0, $vr2 +; CHECK-NEXT: vabsd.w $vr0, $vr0, $vr1 ; CHECK-NEXT: ret %cmp = icmp slt <4 x i32> %a, %b %ab = sub <4 x i32> %a, %b @@ -346,9 +295,7 @@ define <4 x i32> @vabsd_w_cmp(<4 x i32> %a, <4 x i32> %b) nounwind { define <2 x i64> @vabsd_d_cmp(<2 x i64> %a, <2 x i64> %b) nounwind { ; CHECK-LABEL: vabsd_d_cmp: ; CHECK: # %bb.0: -; CHECK-NEXT: vmin.d $vr2, $vr0, $vr1 -; CHECK-NEXT: vmax.d $vr0, $vr0, $vr1 -; CHECK-NEXT: vsub.d $vr0, $vr0, $vr2 +; CHECK-NEXT: vabsd.d $vr0, $vr0, $vr1 ; CHECK-NEXT: ret %cmp = icmp slt <2 x i64> %a, %b %ab = sub <2 x i64> %a, %b @@ -360,9 +307,7 @@ define <2 x i64> @vabsd_d_cmp(<2 x i64> %a, <2 x i64> %b) nounwind { define <16 x i8> @vabsd_bu_cmp(<16 x i8> %a, <16 x i8> %b) nounwind { ; CHECK-LABEL: vabsd_bu_cmp: ; CHECK: # %bb.0: -; CHECK-NEXT: 
vmin.bu $vr2, $vr0, $vr1 -; CHECK-NEXT: vmax.bu $vr0, $vr0, $vr1 -; CHECK-NEXT: vsub.b $vr0, $vr0, $vr2 +; CHECK-NEXT: vabsd.bu $vr0, $vr0, $vr1 ; CHECK-NEXT: ret %cmp = icmp ult <16 x i8> %a, %b %ab = sub <16 x i8> %a, %b @@ -374,9 +319,7 @@ define <16 x i8> @vabsd_bu_cmp(<16 x i8> %a, <16 x i8> %b) nounwind { define <8 x i16> @vabsd_hu_cmp(<8 x i16> %a, <8 x i16> %b) nounwind { ; CHECK-LABEL: vabsd_hu_cmp: ; CHECK: # %bb.0: -; CHECK-NEXT: vmin.hu $vr2, $vr0, $vr1 -; CHECK-NEXT: vmax.hu $vr0, $vr0, $vr1 -; CHECK-NEXT: vsub.h $vr0, $vr0, $vr2 +; CHECK-NEXT: vabsd.hu $vr0, $vr0, $vr1 ; CHECK-NEXT: ret %cmp = icmp ult <8 x i16> %a, %b %ab = sub <8 x i16> %a, %b @@ -388,9 +331,7 @@ define <8 x i16> @vabsd_hu_cmp(<8 x i16> %a, <8 x i16> %b) nounwind { define <4 x i32> @vabsd_wu_cmp(<4 x i32> %a, <4 x i32> %b) nounwind { ; CHECK-LABEL: vabsd_wu_cmp: ; CHECK: # %bb.0: -; CHECK-NEXT: vmin.wu $vr2, $vr0, $vr1 -; CHECK-NEXT: vmax.wu $vr0, $vr0, $vr1 -; CHECK-NEXT: vsub.w $vr0, $vr0, $vr2 +; CHECK-NEXT: vabsd.wu $vr0, $vr0, $vr1 ; CHECK-NEXT: ret %cmp = icmp ult <4 x i32> %a, %b %ab = sub <4 x i32> %a, %b @@ -402,9 +343,7 @@ define <4 x i32> @vabsd_wu_cmp(<4 x i32> %a, <4 x i32> %b) nounwind { define <2 x i64> @vabsd_du_cmp(<2 x i64> %a, <2 x i64> %b) nounwind { ; CHECK-LABEL: vabsd_du_cmp: ; CHECK: # %bb.0: -; CHECK-NEXT: vmin.du $vr2, $vr0, $vr1 -; CHECK-NEXT: vmax.du $vr0, $vr0, $vr1 -; CHECK-NEXT: vsub.d $vr0, $vr0, $vr2 +; CHECK-NEXT: vabsd.du $vr0, $vr0, $vr1 ; CHECK-NEXT: ret %cmp = icmp ult <2 x i64> %a, %b %ab = sub <2 x i64> %a, %b @@ -417,9 +356,7 @@ define <2 x i64> @vabsd_du_cmp(<2 x i64> %a, <2 x i64> %b) nounwind { define <16 x i8> @vabsd_b_select(<16 x i8> %a, <16 x i8> %b) nounwind { ; CHECK-LABEL: vabsd_b_select: ; CHECK: # %bb.0: -; CHECK-NEXT: vmin.b $vr2, $vr0, $vr1 -; CHECK-NEXT: vmax.b $vr0, $vr0, $vr1 -; CHECK-NEXT: vsub.b $vr0, $vr0, $vr2 +; CHECK-NEXT: vabsd.b $vr0, $vr0, $vr1 ; CHECK-NEXT: ret %cmp = icmp slt <16 x i8> %a, %b %ab = select <16 x i1> 
%cmp, <16 x i8> %a, <16 x i8> %b @@ -431,9 +368,7 @@ define <16 x i8> @vabsd_b_select(<16 x i8> %a, <16 x i8> %b) nounwind { define <8 x i16> @vabsd_h_select(<8 x i16> %a, <8 x i16> %b) nounwind { ; CHECK-LABEL: vabsd_h_select: ; CHECK: # %bb.0: -; CHECK-NEXT: vmin.h $vr2, $vr0, $vr1 -; CHECK-NEXT: vmax.h $vr0, $vr0, $vr1 -; CHECK-NEXT: vsub.h $vr0, $vr0, $vr2 +; CHECK-NEXT: vabsd.h $vr0, $vr0, $vr1 ; CHECK-NEXT: ret %cmp = icmp sle <8 x i16> %a, %b %ab = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b @@ -445,9 +380,7 @@ define <8 x i16> @vabsd_h_select(<8 x i16> %a, <8 x i16> %b) nounwind { define <4 x i32> @vabsd_w_select(<4 x i32> %a, <4 x i32> %b) nounwind { ; CHECK-LABEL: vabsd_w_select: ; CHECK: # %bb.0: -; CHECK-NEXT: vmin.w $vr2, $vr0, $vr1 -; CHECK-NEXT: vmax.w $vr0, $vr0, $vr1 -; CHECK-NEXT: vsub.w $vr0, $vr0, $vr2 +; CHECK-NEXT: vabsd.w $vr0, $vr0, $vr1 ; CHECK-NEXT: ret %cmp = icmp sgt <4 x i32> %a, %b %ab = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b @@ -459,9 +392,7 @@ define <4 x i32> @vabsd_w_select(<4 x i32> %a, <4 x i32> %b) nounwind { define <2 x i64> @vabsd_d_select(<2 x i64> %a, <2 x i64> %b) nounwind { ; CHECK-LABEL: vabsd_d_select: ; CHECK: # %bb.0: -; CHECK-NEXT: vmin.d $vr2, $vr0, $vr1 -; CHECK-NEXT: vmax.d $vr0, $vr0, $vr1 -; CHECK-NEXT: vsub.d $vr0, $vr0, $vr2 +; CHECK-NEXT: vabsd.d $vr0, $vr0, $vr1 ; CHECK-NEXT: ret %cmp = icmp sge <2 x i64> %a, %b %ab = select <2 x i1> %cmp, <2 x i64> %a, <2 x i64> %b @@ -473,9 +404,7 @@ define <2 x i64> @vabsd_d_select(<2 x i64> %a, <2 x i64> %b) nounwind { define <16 x i8> @vabsd_bu_select(<16 x i8> %a, <16 x i8> %b) nounwind { ; CHECK-LABEL: vabsd_bu_select: ; CHECK: # %bb.0: -; CHECK-NEXT: vmin.bu $vr2, $vr0, $vr1 -; CHECK-NEXT: vmax.bu $vr0, $vr0, $vr1 -; CHECK-NEXT: vsub.b $vr0, $vr0, $vr2 +; CHECK-NEXT: vabsd.bu $vr0, $vr0, $vr1 ; CHECK-NEXT: ret %cmp = icmp ult <16 x i8> %a, %b %ab = select <16 x i1> %cmp, <16 x i8> %a, <16 x i8> %b @@ -487,9 +416,7 @@ define <16 x i8> 
@vabsd_bu_select(<16 x i8> %a, <16 x i8> %b) nounwind { define <8 x i16> @vabsd_hu_select(<8 x i16> %a, <8 x i16> %b) nounwind { ; CHECK-LABEL: vabsd_hu_select: ; CHECK: # %bb.0: -; CHECK-NEXT: vmin.hu $vr2, $vr0, $vr1 -; CHECK-NEXT: vmax.hu $vr0, $vr0, $vr1 -; CHECK-NEXT: vsub.h $vr0, $vr0, $vr2 +; CHECK-NEXT: vabsd.hu $vr0, $vr0, $vr1 ; CHECK-NEXT: ret %cmp = icmp ule <8 x i16> %a, %b %ab = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b @@ -501,9 +428,7 @@ define <8 x i16> @vabsd_hu_select(<8 x i16> %a, <8 x i16> %b) nounwind { define <4 x i32> @vabsd_wu_select(<4 x i32> %a, <4 x i32> %b) nounwind { ; CHECK-LABEL: vabsd_wu_select: ; CHECK: # %bb.0: -; CHECK-NEXT: vmin.wu $vr2, $vr0, $vr1 -; CHECK-NEXT: vmax.wu $vr0, $vr0, $vr1 -; CHECK-NEXT: vsub.w $vr0, $vr0, $vr2 +; CHECK-NEXT: vabsd.wu $vr0, $vr0, $vr1 ; CHECK-NEXT: ret %cmp = icmp ugt <4 x i32> %a, %b %ab = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b @@ -515,9 +440,7 @@ define <4 x i32> @vabsd_wu_select(<4 x i32> %a, <4 x i32> %b) nounwind { define <2 x i64> @vabsd_du_select(<2 x i64> %a, <2 x i64> %b) nounwind { ; CHECK-LABEL: vabsd_du_select: ; CHECK: # %bb.0: -; CHECK-NEXT: vmin.du $vr2, $vr0, $vr1 -; CHECK-NEXT: vmax.du $vr0, $vr0, $vr1 -; CHECK-NEXT: vsub.d $vr0, $vr0, $vr2 +; CHECK-NEXT: vabsd.du $vr0, $vr0, $vr1 ; CHECK-NEXT: ret %cmp = icmp uge <2 x i64> %a, %b %ab = select <2 x i1> %cmp, <2 x i64> %a, <2 x i64> %b From b07c88563febdb62b82daad0480d7b6131bc54d4 Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Tue, 15 Apr 2025 23:44:33 -0400 Subject: [PATCH 074/710] [Support] Add format object for interleaved ranges (#135517) Add two new format functions for printing ranges: `interleaved` and `interleaved_array`. This is meant to improve the ergonomics of printing ranges. Before this patch, we have to either use `llvm::interleave` or write a for loop by hand. 
For example: Before: ```c++ ArrayRef types = ...; ArrayRef values = ...; LLVM_DEBUG({ llvm::dbgs() << "Types: "; llvm::interleave_comma(llvm::dbgs(), types); llvm::dbgs() << "\n"; llvm::dbgs() << "Values: ["; llvm::interleave_comma(llvm::dbgs(), values); llvm::dbgs() << "]\n"; }): ``` After: ```c++ ArrayRef types = ...; ArrayRef values = ...; LLVM_DEBUG(llvm::dbgs() << "Types: " << interleaved(types) << "\n"); LLVM_DEBUG(llvm::dbgs() << "Values: " << interleaved_array(values) << "\n"); ``` The separator and the prefix/suffix strings are customizable. --- llvm/include/llvm/Support/InterleavedRange.h | 99 +++++++++++++++++++ llvm/unittests/Support/CMakeLists.txt | 3 +- .../Support/InterleavedRangeTest.cpp | 70 +++++++++++++ 3 files changed, 171 insertions(+), 1 deletion(-) create mode 100644 llvm/include/llvm/Support/InterleavedRange.h create mode 100644 llvm/unittests/Support/InterleavedRangeTest.cpp diff --git a/llvm/include/llvm/Support/InterleavedRange.h b/llvm/include/llvm/Support/InterleavedRange.h new file mode 100644 index 0000000000000..4e70028504806 --- /dev/null +++ b/llvm/include/llvm/Support/InterleavedRange.h @@ -0,0 +1,99 @@ +//===- InterleavedRange.h - Output stream formatting for ranges -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Implements format objects for printing ranges to output streams. 
+// For example: +// ```c++ +// ArrayRef Types = ...; +// OS << "Types: " << interleaved(Types); // ==> "Types: i32, f16, i8" +// ArrayRef Values = ...; +// OS << "Values: " << interleaved_array(Values); // ==> "Values: [1, 2, 3]" +// ``` +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_INTERLEAVED_RANGE_H +#define LLVM_SUPPORT_INTERLEAVED_RANGE_H + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { + +/// Format object class for interleaved ranges. Supports specifying the +/// separator and, optionally, the prefix and suffix to be printed surrounding +/// the range. +/// Uses the operator '<<' of the range element type for printing. The range +/// type itself does not have to have an '<<' operator defined. +template class InterleavedRange { + const Range &TheRange; + StringRef Separator; + StringRef Prefix; + StringRef Suffix; + +public: + InterleavedRange(const Range &R, StringRef Separator, StringRef Prefix, + StringRef Suffix) + : TheRange(R), Separator(Separator), Prefix(Prefix), Suffix(Suffix) {} + + friend raw_ostream &operator<<(raw_ostream &OS, + const InterleavedRange &Interleaved) { + if (!Interleaved.Prefix.empty()) + OS << Interleaved.Prefix; + llvm::interleave(Interleaved.TheRange, OS, Interleaved.Separator); + if (!Interleaved.Suffix.empty()) + OS << Interleaved.Suffix; + return OS; + } + + std::string str() const { + std::string Result; + raw_string_ostream Stream(Result); + Stream << *this; + Stream.flush(); + return Result; + } + + operator std::string() const { return str(); } +}; + +/// Output range `R` as a sequence of interleaved elements. Requires the range +/// element type to be printable using `raw_ostream& operator<<`. The +/// `Separator` and `Prefix` / `Suffix` can be customized. 
Examples: +/// ```c++ +/// SmallVector Vals = {1, 2, 3}; +/// OS << interleaved(Vals); // ==> "1, 2, 3" +/// OS << interleaved(Vals, ";"); // ==> "1;2;3" +/// OS << interleaved(Vals, " ", "{", "}"); // ==> "{1 2 3}" +/// ``` +template +InterleavedRange interleaved(const Range &R, StringRef Separator = ", ", + StringRef Prefix = "", + StringRef Suffix = "") { + return {R, Separator, Prefix, Suffix}; +} + +/// Output range `R` as an array of interleaved elements. Requires the range +/// element type to be printable using `raw_ostream& operator<<`. The +/// `Separator` can be customized. Examples: +/// ```c++ +/// SmallVector Vals = {1, 2, 3}; +/// OS << interleaved_array(Vals); // ==> "[1, 2, 3]" +/// OS << interleaved_array(Vals, ";"); // ==> "[1;2;3]" +/// OS << interleaved_array(Vals, " "); // ==> "[1 2 3]" +/// ``` +template +InterleavedRange interleaved_array(const Range &R, + StringRef Separator = ", ") { + return {R, Separator, "[", "]"}; +} + +} // end namespace llvm + +#endif // LLVM_SUPPORT_INTERLEAVED_RANGE_H diff --git a/llvm/unittests/Support/CMakeLists.txt b/llvm/unittests/Support/CMakeLists.txt index 6c4e7cb689b20..4a12a928af119 100644 --- a/llvm/unittests/Support/CMakeLists.txt +++ b/llvm/unittests/Support/CMakeLists.txt @@ -49,6 +49,7 @@ add_llvm_unittest(SupportTests HashBuilderTest.cpp IndexedAccessorTest.cpp InstructionCostTest.cpp + InterleavedRangeTest.cpp JSONTest.cpp KnownBitsTest.cpp LEB128Test.cpp @@ -61,7 +62,7 @@ add_llvm_unittest(SupportTests MemoryBufferRefTest.cpp MemoryBufferTest.cpp MemoryTest.cpp - MustacheTest.cpp + MustacheTest.cpp ModRefTest.cpp NativeFormatTests.cpp OptimizedStructLayoutTest.cpp diff --git a/llvm/unittests/Support/InterleavedRangeTest.cpp b/llvm/unittests/Support/InterleavedRangeTest.cpp new file mode 100644 index 0000000000000..8640b81fe8ad8 --- /dev/null +++ b/llvm/unittests/Support/InterleavedRangeTest.cpp @@ -0,0 +1,70 @@ +//===- InterleavedRangeTest.cpp - Unit tests for interleaved format -------===// +// 
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/InterleavedRange.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/raw_ostream.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +using namespace llvm; + +namespace { + +TEST(InterleavedRangeTest, VectorInt) { + SmallVector V = {0, 1, 2, 3}; + + // First, make sure that the raw print API works as expected. + std::string Buff; + raw_string_ostream OS(Buff); + OS << interleaved(V); + EXPECT_EQ("0, 1, 2, 3", Buff); + Buff.clear(); + OS << interleaved_array(V); + EXPECT_EQ("[0, 1, 2, 3]", Buff); + + // In the rest of the tests, use `.str()` for convenience. + EXPECT_EQ("0, 1, 2, 3", interleaved(V).str()); + EXPECT_EQ("{{0,1,2,3}}", interleaved(V, ",", "{{", "}}").str()); + EXPECT_EQ("[0, 1, 2, 3]", interleaved_array(V).str()); + EXPECT_EQ("[0;1;2;3]", interleaved_array(V, ";").str()); + EXPECT_EQ("0;1;2;3", interleaved(V, ";").str()); +} + +TEST(InterleavedRangeTest, VectorIntEmpty) { + SmallVector V = {}; + EXPECT_EQ("", interleaved(V).str()); + EXPECT_EQ("{{}}", interleaved(V, ",", "{{", "}}").str()); + EXPECT_EQ("[]", interleaved_array(V).str()); + EXPECT_EQ("", interleaved(V, ";").str()); +} + +TEST(InterleavedRangeTest, VectorIntOneElem) { + SmallVector V = {42}; + EXPECT_EQ("42", interleaved(V).str()); + EXPECT_EQ("{{42}}", interleaved(V, ",", "{{", "}}").str()); + EXPECT_EQ("[42]", interleaved_array(V).str()); + EXPECT_EQ("42", interleaved(V, ";").str()); +} + +struct CustomPrint { + int N; + friend raw_ostream &operator<<(raw_ostream &OS, const CustomPrint &CP) { + OS << "$$" << CP.N << "##"; + return OS; + } +}; + +TEST(InterleavedRangeTest, CustomPrint) { + CustomPrint V[] = {{3}, {4}, {5}}; + EXPECT_EQ("$$3##, $$4##, $$5##", 
interleaved(V).str()); + EXPECT_EQ("{{$$3##;$$4##;$$5##}}", interleaved(V, ";", "{{", "}}").str()); + EXPECT_EQ("[$$3##, $$4##, $$5##]", interleaved_array(V).str()); +} + +} // namespace From 04b87e15e40f8857e29ade8321b8b67691545a50 Mon Sep 17 00:00:00 2001 From: Kareem Ergawy Date: Wed, 16 Apr 2025 06:14:38 +0200 Subject: [PATCH 075/710] [flang][fir] Lower `do concurrent` loop nests to `fir.do_concurrent` (#132904) Adds support for lowering `do concurrent` nests from PFT to the new `fir.do_concurrent` MLIR op as well as its special terminator `fir.do_concurrent.loop` which models the actual loop nest. To that end, this PR emits the allocations for the iteration variables within the block of the `fir.do_concurrent` op and creates a region for the `fir.do_concurrent.loop` op that accepts arguments equal in number to the number of the input `do concurrent` iteration ranges. For example, given the following input: ```fortran do concurrent(i=1:10, j=11:20) end do ``` the changes in this PR emit the following MLIR: ```mlir fir.do_concurrent { %22 = fir.alloca i32 {bindc_name = "i"} %23:2 = hlfir.declare %22 {uniq_name = "_QFsub1Ei"} : (!fir.ref) -> (!fir.ref, !fir.ref) %24 = fir.alloca i32 {bindc_name = "j"} %25:2 = hlfir.declare %24 {uniq_name = "_QFsub1Ej"} : (!fir.ref) -> (!fir.ref, !fir.ref) fir.do_concurrent.loop (%arg1, %arg2) = (%18, %20) to (%19, %21) step (%c1, %c1_0) { %26 = fir.convert %arg1 : (index) -> i32 fir.store %26 to %23#0 : !fir.ref %27 = fir.convert %arg2 : (index) -> i32 fir.store %27 to %25#0 : !fir.ref } } ``` --- flang/lib/Lower/Bridge.cpp | 228 +++++++++++------- flang/lib/Optimizer/Builder/FIRBuilder.cpp | 3 + flang/test/Lower/do_concurrent.f90 | 39 ++- .../do_concurrent_local_default_init.f90 | 4 +- flang/test/Lower/loops.f90 | 37 +-- flang/test/Lower/loops3.f90 | 4 +- flang/test/Lower/nsw.f90 | 5 +- .../Transforms/DoConcurrent/basic_host.f90 | 3 + .../DoConcurrent/locally_destroyed_temp.f90 | 3 + .../DoConcurrent/loop_nest_test.f90 | 3 + 
.../multiple_iteration_ranges.f90 | 3 + .../DoConcurrent/non_const_bounds.f90 | 3 + .../DoConcurrent/not_perfectly_nested.f90 | 3 + 13 files changed, 208 insertions(+), 130 deletions(-) diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index b4d1197822a43..625dd116fe726 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -94,10 +94,11 @@ struct IncrementLoopInfo { template explicit IncrementLoopInfo(Fortran::semantics::Symbol &sym, const T &lower, const T &upper, const std::optional &step, - bool isUnordered = false) + bool isConcurrent = false) : loopVariableSym{&sym}, lowerExpr{Fortran::semantics::GetExpr(lower)}, upperExpr{Fortran::semantics::GetExpr(upper)}, - stepExpr{Fortran::semantics::GetExpr(step)}, isUnordered{isUnordered} {} + stepExpr{Fortran::semantics::GetExpr(step)}, + isConcurrent{isConcurrent} {} IncrementLoopInfo(IncrementLoopInfo &&) = default; IncrementLoopInfo &operator=(IncrementLoopInfo &&x) = default; @@ -120,7 +121,7 @@ struct IncrementLoopInfo { const Fortran::lower::SomeExpr *upperExpr; const Fortran::lower::SomeExpr *stepExpr; const Fortran::lower::SomeExpr *maskExpr = nullptr; - bool isUnordered; // do concurrent, forall + bool isConcurrent; llvm::SmallVector localSymList; llvm::SmallVector localInitSymList; llvm::SmallVector< @@ -130,7 +131,7 @@ struct IncrementLoopInfo { mlir::Value loopVariable = nullptr; // Data members for structured loops. - fir::DoLoopOp doLoop = nullptr; + mlir::Operation *loopOp = nullptr; // Data members for unstructured loops. bool hasRealControl = false; @@ -1980,7 +1981,7 @@ class FirConverter : public Fortran::lower::AbstractConverter { llvm_unreachable("illegal reduction operator"); } - /// Collect DO CONCURRENT or FORALL loop control information. + /// Collect DO CONCURRENT loop control information. 
IncrementLoopNestInfo getConcurrentControl( const Fortran::parser::ConcurrentHeader &header, const std::list &localityList = {}) { @@ -2291,8 +2292,14 @@ class FirConverter : public Fortran::lower::AbstractConverter { mlir::LLVM::LoopAnnotationAttr la = mlir::LLVM::LoopAnnotationAttr::get( builder->getContext(), {}, /*vectorize=*/va, {}, /*unroll*/ ua, /*unroll_and_jam*/ uja, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}); - if (has_attrs) - info.doLoop.setLoopAnnotationAttr(la); + if (has_attrs) { + if (auto loopOp = mlir::dyn_cast(info.loopOp)) + loopOp.setLoopAnnotationAttr(la); + + if (auto doConcurrentOp = + mlir::dyn_cast(info.loopOp)) + doConcurrentOp.setLoopAnnotationAttr(la); + } } /// Generate FIR to begin a structured or unstructured increment loop nest. @@ -2301,96 +2308,77 @@ class FirConverter : public Fortran::lower::AbstractConverter { llvm::SmallVectorImpl &dirs) { assert(!incrementLoopNestInfo.empty() && "empty loop nest"); mlir::Location loc = toLocation(); - mlir::Operation *boundsAndStepIP = nullptr; mlir::arith::IntegerOverflowFlags iofBackup{}; + llvm::SmallVector nestLBs; + llvm::SmallVector nestUBs; + llvm::SmallVector nestSts; + llvm::SmallVector nestReduceOperands; + llvm::SmallVector nestReduceAttrs; + bool genDoConcurrent = false; + for (IncrementLoopInfo &info : incrementLoopNestInfo) { - mlir::Value lowerValue; - mlir::Value upperValue; - mlir::Value stepValue; + genDoConcurrent = info.isStructured() && info.isConcurrent; - { - mlir::OpBuilder::InsertionGuard guard(*builder); + if (!genDoConcurrent) + info.loopVariable = genLoopVariableAddress(loc, *info.loopVariableSym, + info.isConcurrent); - // Set the IP before the first loop in the nest so that all nest bounds - // and step values are created outside the nest. 
- if (boundsAndStepIP) - builder->setInsertionPointAfter(boundsAndStepIP); + if (!getLoweringOptions().getIntegerWrapAround()) { + iofBackup = builder->getIntegerOverflowFlags(); + builder->setIntegerOverflowFlags( + mlir::arith::IntegerOverflowFlags::nsw); + } - info.loopVariable = genLoopVariableAddress(loc, *info.loopVariableSym, - info.isUnordered); - if (!getLoweringOptions().getIntegerWrapAround()) { - iofBackup = builder->getIntegerOverflowFlags(); - builder->setIntegerOverflowFlags( - mlir::arith::IntegerOverflowFlags::nsw); - } - lowerValue = genControlValue(info.lowerExpr, info); - upperValue = genControlValue(info.upperExpr, info); - bool isConst = true; - stepValue = genControlValue(info.stepExpr, info, - info.isStructured() ? nullptr : &isConst); - if (!getLoweringOptions().getIntegerWrapAround()) - builder->setIntegerOverflowFlags(iofBackup); - boundsAndStepIP = stepValue.getDefiningOp(); - - // Use a temp variable for unstructured loops with non-const step. - if (!isConst) { - info.stepVariable = - builder->createTemporary(loc, stepValue.getType()); - boundsAndStepIP = - builder->create(loc, stepValue, info.stepVariable); + nestLBs.push_back(genControlValue(info.lowerExpr, info)); + nestUBs.push_back(genControlValue(info.upperExpr, info)); + bool isConst = true; + nestSts.push_back(genControlValue( + info.stepExpr, info, info.isStructured() ? nullptr : &isConst)); + + if (!getLoweringOptions().getIntegerWrapAround()) + builder->setIntegerOverflowFlags(iofBackup); + + // Use a temp variable for unstructured loops with non-const step. 
+ if (!isConst) { + mlir::Value stepValue = nestSts.back(); + info.stepVariable = builder->createTemporary(loc, stepValue.getType()); + builder->create(loc, stepValue, info.stepVariable); + } + + if (genDoConcurrent && nestReduceOperands.empty()) { + // Create DO CONCURRENT reduce operands and attributes + for (const auto &reduceSym : info.reduceSymList) { + const fir::ReduceOperationEnum reduceOperation = reduceSym.first; + const Fortran::semantics::Symbol *sym = reduceSym.second; + fir::ExtendedValue exv = getSymbolExtendedValue(*sym, nullptr); + nestReduceOperands.push_back(fir::getBase(exv)); + auto reduceAttr = + fir::ReduceAttr::get(builder->getContext(), reduceOperation); + nestReduceAttrs.push_back(reduceAttr); } } + } + for (auto [info, lowerValue, upperValue, stepValue] : + llvm::zip_equal(incrementLoopNestInfo, nestLBs, nestUBs, nestSts)) { // Structured loop - generate fir.do_loop. if (info.isStructured()) { + if (genDoConcurrent) + continue; + + // The loop variable is a doLoop op argument. mlir::Type loopVarType = info.getLoopVariableType(); - mlir::Value loopValue; - if (info.isUnordered) { - llvm::SmallVector reduceOperands; - llvm::SmallVector reduceAttrs; - // Create DO CONCURRENT reduce operands and attributes - for (const auto &reduceSym : info.reduceSymList) { - const fir::ReduceOperationEnum reduce_operation = reduceSym.first; - const Fortran::semantics::Symbol *sym = reduceSym.second; - fir::ExtendedValue exv = getSymbolExtendedValue(*sym, nullptr); - reduceOperands.push_back(fir::getBase(exv)); - auto reduce_attr = - fir::ReduceAttr::get(builder->getContext(), reduce_operation); - reduceAttrs.push_back(reduce_attr); - } - // The loop variable value is explicitly updated. 
- info.doLoop = builder->create( - loc, lowerValue, upperValue, stepValue, /*unordered=*/true, - /*finalCountValue=*/false, /*iterArgs=*/std::nullopt, - llvm::ArrayRef(reduceOperands), reduceAttrs); - builder->setInsertionPointToStart(info.doLoop.getBody()); - loopValue = builder->createConvert(loc, loopVarType, - info.doLoop.getInductionVar()); - } else { - // The loop variable is a doLoop op argument. - info.doLoop = builder->create( - loc, lowerValue, upperValue, stepValue, /*unordered=*/false, - /*finalCountValue=*/true, - builder->createConvert(loc, loopVarType, lowerValue)); - builder->setInsertionPointToStart(info.doLoop.getBody()); - loopValue = info.doLoop.getRegionIterArgs()[0]; - } + auto loopOp = builder->create( + loc, lowerValue, upperValue, stepValue, /*unordered=*/false, + /*finalCountValue=*/true, + builder->createConvert(loc, loopVarType, lowerValue)); + info.loopOp = loopOp; + builder->setInsertionPointToStart(loopOp.getBody()); + mlir::Value loopValue = loopOp.getRegionIterArgs()[0]; + // Update the loop variable value in case it has non-index references. 
builder->create(loc, loopValue, info.loopVariable); - if (info.maskExpr) { - Fortran::lower::StatementContext stmtCtx; - mlir::Value maskCond = createFIRExpr(loc, info.maskExpr, stmtCtx); - stmtCtx.finalizeAndReset(); - mlir::Value maskCondCast = - builder->createConvert(loc, builder->getI1Type(), maskCond); - auto ifOp = builder->create(loc, maskCondCast, - /*withElseRegion=*/false); - builder->setInsertionPointToStart(&ifOp.getThenRegion().front()); - } - if (info.hasLocalitySpecs()) - handleLocalitySpecs(info); - addLoopAnnotationAttr(info, dirs); continue; } @@ -2454,6 +2442,60 @@ class FirConverter : public Fortran::lower::AbstractConverter { builder->restoreInsertionPoint(insertPt); } } + + if (genDoConcurrent) { + auto loopWrapperOp = builder->create(loc); + builder->setInsertionPointToStart( + builder->createBlock(&loopWrapperOp.getRegion())); + + for (IncrementLoopInfo &info : llvm::reverse(incrementLoopNestInfo)) { + info.loopVariable = genLoopVariableAddress(loc, *info.loopVariableSym, + info.isConcurrent); + } + + builder->setInsertionPointToEnd(loopWrapperOp.getBody()); + auto loopOp = builder->create( + loc, nestLBs, nestUBs, nestSts, nestReduceOperands, + nestReduceAttrs.empty() + ? 
nullptr + : mlir::ArrayAttr::get(builder->getContext(), nestReduceAttrs), + nullptr); + + llvm::SmallVector loopBlockArgTypes( + incrementLoopNestInfo.size(), builder->getIndexType()); + llvm::SmallVector loopBlockArgLocs( + incrementLoopNestInfo.size(), loc); + mlir::Region &loopRegion = loopOp.getRegion(); + mlir::Block *loopBlock = builder->createBlock( + &loopRegion, loopRegion.begin(), loopBlockArgTypes, loopBlockArgLocs); + builder->setInsertionPointToStart(loopBlock); + + for (auto [info, blockArg] : + llvm::zip_equal(incrementLoopNestInfo, loopBlock->getArguments())) { + info.loopOp = loopOp; + mlir::Value loopValue = + builder->createConvert(loc, info.getLoopVariableType(), blockArg); + builder->create(loc, loopValue, info.loopVariable); + + if (info.maskExpr) { + Fortran::lower::StatementContext stmtCtx; + mlir::Value maskCond = createFIRExpr(loc, info.maskExpr, stmtCtx); + stmtCtx.finalizeAndReset(); + mlir::Value maskCondCast = + builder->createConvert(loc, builder->getI1Type(), maskCond); + auto ifOp = builder->create(loc, maskCondCast, + /*withElseRegion=*/false); + builder->setInsertionPointToStart(&ifOp.getThenRegion().front()); + } + } + + IncrementLoopInfo &innermostInfo = incrementLoopNestInfo.back(); + + if (innermostInfo.hasLocalitySpecs()) + handleLocalitySpecs(innermostInfo); + + addLoopAnnotationAttr(innermostInfo, dirs); + } } /// Generate FIR to end a structured or unstructured increment loop nest. @@ -2470,29 +2512,31 @@ class FirConverter : public Fortran::lower::AbstractConverter { it != rend; ++it) { IncrementLoopInfo &info = *it; if (info.isStructured()) { - // End fir.do_loop. - if (info.isUnordered) { - builder->setInsertionPointAfter(info.doLoop); + // End fir.do_concurent.loop. + if (info.isConcurrent) { + builder->setInsertionPointAfter(info.loopOp->getParentOp()); continue; } + + // End fir.do_loop. // Decrement tripVariable. 
- builder->setInsertionPointToEnd(info.doLoop.getBody()); + auto doLoopOp = mlir::cast(info.loopOp); + builder->setInsertionPointToEnd(doLoopOp.getBody()); llvm::SmallVector results; results.push_back(builder->create( - loc, info.doLoop.getInductionVar(), info.doLoop.getStep(), - iofAttr)); + loc, doLoopOp.getInductionVar(), doLoopOp.getStep(), iofAttr)); // Step loopVariable to help optimizations such as vectorization. // Induction variable elimination will clean up as necessary. mlir::Value step = builder->createConvert( - loc, info.getLoopVariableType(), info.doLoop.getStep()); + loc, info.getLoopVariableType(), doLoopOp.getStep()); mlir::Value loopVar = builder->create(loc, info.loopVariable); results.push_back( builder->create(loc, loopVar, step, iofAttr)); builder->create(loc, results); - builder->setInsertionPointAfter(info.doLoop); + builder->setInsertionPointAfter(doLoopOp); // The loop control variable may be used after the loop. - builder->create(loc, info.doLoop.getResult(1), + builder->create(loc, doLoopOp.getResult(1), info.loopVariable); continue; } diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp index 3cf9b5ae72d9e..d35367d7657cf 100644 --- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp +++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp @@ -280,6 +280,9 @@ mlir::Block *fir::FirOpBuilder::getAllocaBlock() { if (auto cufKernelOp = getRegion().getParentOfType()) return &cufKernelOp.getRegion().front(); + if (auto doConcurentOp = getRegion().getParentOfType()) + return doConcurentOp.getBody(); + return getEntryBlock(); } diff --git a/flang/test/Lower/do_concurrent.f90 b/flang/test/Lower/do_concurrent.f90 index ef93d2d6b035b..cc113f59c35e3 100644 --- a/flang/test/Lower/do_concurrent.f90 +++ b/flang/test/Lower/do_concurrent.f90 @@ -14,6 +14,9 @@ subroutine sub1(n) implicit none integer :: n, m, i, j, k integer, dimension(n) :: a +!CHECK: %[[N_DECL:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{.*}} 
{uniq_name = "_QFsub1En"} +!CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFsub1Ea"} + !CHECK: %[[LB1:.*]] = arith.constant 1 : i32 !CHECK: %[[LB1_CVT:.*]] = fir.convert %[[LB1]] : (i32) -> index !CHECK: %[[UB1:.*]] = fir.load %{{.*}}#0 : !fir.ref @@ -29,10 +32,30 @@ subroutine sub1(n) !CHECK: %[[UB3:.*]] = arith.constant 10 : i32 !CHECK: %[[UB3_CVT:.*]] = fir.convert %[[UB3]] : (i32) -> index -!CHECK: fir.do_loop %{{.*}} = %[[LB1_CVT]] to %[[UB1_CVT]] step %{{.*}} unordered -!CHECK: fir.do_loop %{{.*}} = %[[LB2_CVT]] to %[[UB2_CVT]] step %{{.*}} unordered -!CHECK: fir.do_loop %{{.*}} = %[[LB3_CVT]] to %[[UB3_CVT]] step %{{.*}} unordered +!CHECK: fir.do_concurrent +!CHECK: %[[I:.*]] = fir.alloca i32 {bindc_name = "i"} +!CHECK: %[[I_DECL:.*]]:2 = hlfir.declare %[[I]] +!CHECK: %[[J:.*]] = fir.alloca i32 {bindc_name = "j"} +!CHECK: %[[J_DECL:.*]]:2 = hlfir.declare %[[J]] +!CHECK: %[[K:.*]] = fir.alloca i32 {bindc_name = "k"} +!CHECK: %[[K_DECL:.*]]:2 = hlfir.declare %[[K]] + +!CHECK: fir.do_concurrent.loop (%[[I_IV:.*]], %[[J_IV:.*]], %[[K_IV:.*]]) = +!CHECK-SAME: (%[[LB1_CVT]], %[[LB2_CVT]], %[[LB3_CVT]]) to +!CHECK-SAME: (%[[UB1_CVT]], %[[UB2_CVT]], %[[UB3_CVT]]) step +!CHECK-SAME: (%{{.*}}, %{{.*}}, %{{.*}}) { +!CHECK: %[[I_IV_CVT:.*]] = fir.convert %[[I_IV]] : (index) -> i32 +!CHECK: fir.store %[[I_IV_CVT]] to %[[I_DECL]]#0 : !fir.ref +!CHECK: %[[J_IV_CVT:.*]] = fir.convert %[[J_IV]] : (index) -> i32 +!CHECK: fir.store %[[J_IV_CVT]] to %[[J_DECL]]#0 : !fir.ref +!CHECK: %[[K_IV_CVT:.*]] = fir.convert %[[K_IV]] : (index) -> i32 +!CHECK: fir.store %[[K_IV_CVT]] to %[[K_DECL]]#0 : !fir.ref +!CHECK: %[[N_VAL:.*]] = fir.load %[[N_DECL]]#0 : !fir.ref +!CHECK: %[[I_VAL:.*]] = fir.load %[[I_DECL]]#0 : !fir.ref +!CHECK: %[[I_VAL_CVT:.*]] = fir.convert %[[I_VAL]] : (i32) -> i64 +!CHECK: %[[A_ELEM:.*]] = hlfir.designate %[[A_DECL]]#0 (%[[I_VAL_CVT]]) +!CHECK: hlfir.assign %[[N_VAL]] to %[[A_ELEM]] : i32, !fir.ref do concurrent(i=1:n, j=1:bar(n*m, 
n/m), k=5:10) a(i) = n end do @@ -45,14 +68,17 @@ subroutine sub2(n) integer, dimension(n) :: a !CHECK: %[[LB1:.*]] = arith.constant 1 : i32 !CHECK: %[[LB1_CVT:.*]] = fir.convert %[[LB1]] : (i32) -> index -!CHECK: %[[UB1:.*]] = fir.load %5#0 : !fir.ref +!CHECK: %[[UB1:.*]] = fir.load %{{.*}}#0 : !fir.ref !CHECK: %[[UB1_CVT:.*]] = fir.convert %[[UB1]] : (i32) -> index -!CHECK: fir.do_loop %{{.*}} = %[[LB1_CVT]] to %[[UB1_CVT]] step %{{.*}} unordered +!CHECK: fir.do_concurrent +!CHECK: fir.do_concurrent.loop (%{{.*}}) = (%[[LB1_CVT]]) to (%[[UB1_CVT]]) step (%{{.*}}) + !CHECK: %[[LB2:.*]] = arith.constant 1 : i32 !CHECK: %[[LB2_CVT:.*]] = fir.convert %[[LB2]] : (i32) -> index !CHECK: %[[UB2:.*]] = fir.call @_QPbar(%{{.*}}, %{{.*}}) proc_attrs fastmath : (!fir.ref, !fir.ref) -> i32 !CHECK: %[[UB2_CVT:.*]] = fir.convert %[[UB2]] : (i32) -> index -!CHECK: fir.do_loop %{{.*}} = %[[LB2_CVT]] to %[[UB2_CVT]] step %{{.*}} unordered +!CHECK: fir.do_concurrent +!CHECK: fir.do_concurrent.loop (%{{.*}}) = (%[[LB2_CVT]]) to (%[[UB2_CVT]]) step (%{{.*}}) do concurrent(i=1:n) do concurrent(j=1:bar(n*m, n/m)) a(i) = n @@ -60,7 +86,6 @@ subroutine sub2(n) end do end subroutine - !CHECK-LABEL: unstructured subroutine unstructured(inner_step) integer(4) :: i, j, inner_step diff --git a/flang/test/Lower/do_concurrent_local_default_init.f90 b/flang/test/Lower/do_concurrent_local_default_init.f90 index 7652e4fcd0402..207704ac1a990 100644 --- a/flang/test/Lower/do_concurrent_local_default_init.f90 +++ b/flang/test/Lower/do_concurrent_local_default_init.f90 @@ -29,7 +29,7 @@ subroutine test_default_init() ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>>>> {fir.bindc_name = "p"}) { ! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_0]] : !fir.ref>>>> ! CHECK: %[[VAL_7:.*]] = fir.box_elesize %[[VAL_6]] : (!fir.box>>>) -> index -! CHECK: fir.do_loop +! CHECK: fir.do_concurrent.loop ! CHECK: %[[VAL_16:.*]] = fir.alloca !fir.box>>> {bindc_name = "p", pinned, uniq_name = "_QFtest_ptrEp"} ! 
CHECK: %[[VAL_17:.*]] = fir.zero_bits !fir.ptr>> ! CHECK: %[[VAL_18:.*]] = arith.constant 0 : index @@ -43,7 +43,7 @@ subroutine test_default_init() ! CHECK: } ! CHECK-LABEL: func.func @_QPtest_default_init( -! CHECK: fir.do_loop +! CHECK: fir.do_concurrent.loop ! CHECK: %[[VAL_26:.*]] = fir.alloca !fir.type<_QFtest_default_initTt{i:i32}> {bindc_name = "a", pinned, uniq_name = "_QFtest_default_initEa"} ! CHECK: %[[VAL_27:.*]] = fir.embox %[[VAL_26]] : (!fir.ref>) -> !fir.box> ! CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_27]] : (!fir.box>) -> !fir.box diff --git a/flang/test/Lower/loops.f90 b/flang/test/Lower/loops.f90 index ea65ba3e4d66d..60df27a591dc3 100644 --- a/flang/test/Lower/loops.f90 +++ b/flang/test/Lower/loops.f90 @@ -2,15 +2,6 @@ ! CHECK-LABEL: loop_test subroutine loop_test - ! CHECK: %[[VAL_2:.*]] = fir.alloca i16 {bindc_name = "i"} - ! CHECK: %[[VAL_3:.*]] = fir.alloca i16 {bindc_name = "i"} - ! CHECK: %[[VAL_4:.*]] = fir.alloca i16 {bindc_name = "i"} - ! CHECK: %[[VAL_5:.*]] = fir.alloca i8 {bindc_name = "k"} - ! CHECK: %[[VAL_6:.*]] = fir.alloca i8 {bindc_name = "j"} - ! CHECK: %[[VAL_7:.*]] = fir.alloca i8 {bindc_name = "i"} - ! CHECK: %[[VAL_8:.*]] = fir.alloca i32 {bindc_name = "k"} - ! CHECK: %[[VAL_9:.*]] = fir.alloca i32 {bindc_name = "j"} - ! CHECK: %[[VAL_10:.*]] = fir.alloca i32 {bindc_name = "i"} ! CHECK: %[[VAL_11:.*]] = fir.alloca !fir.array<5x5x5xi32> {bindc_name = "a", uniq_name = "_QFloop_testEa"} ! CHECK: %[[VAL_12:.*]] = fir.alloca i32 {bindc_name = "asum", uniq_name = "_QFloop_testEasum"} ! CHECK: %[[VAL_13:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFloop_testEi"} @@ -25,7 +16,7 @@ subroutine loop_test j = 200 k = 300 - ! CHECK-COUNT-3: fir.do_loop {{.*}} unordered + ! CHECK: fir.do_concurrent.loop (%{{.*}}, %{{.*}}, %{{.*}}) = {{.*}} do concurrent (i=1:5, j=1:5, k=1:5) ! shared(a) ! CHECK: fir.coordinate_of a(i,j,k) = 0 @@ -33,7 +24,7 @@ subroutine loop_test ! 
CHECK: fir.call @_FortranAioBeginExternalListOutput print*, 'A:', i, j, k - ! CHECK-COUNT-3: fir.do_loop {{.*}} unordered + ! CHECK: fir.do_concurrent.loop (%{{.*}}, %{{.*}}, %{{.*}}) = {{.*}} ! CHECK: fir.if do concurrent (integer(1)::i=1:5, j=1:5, k=1:5, i.ne.j .and. k.ne.3) shared(a) ! CHECK-COUNT-2: fir.coordinate_of @@ -53,7 +44,7 @@ subroutine loop_test ! CHECK: fir.call @_FortranAioBeginExternalListOutput print*, 'B:', i, j, k, '-', asum - ! CHECK: fir.do_loop {{.*}} unordered + ! CHECK: fir.do_concurrent.loop (%{{.*}}) = {{.*}} ! CHECK-COUNT-2: fir.if do concurrent (integer(2)::i=1:5, i.ne.3) if (i.eq.2 .or. i.eq.4) goto 5 ! fir.if @@ -62,7 +53,7 @@ subroutine loop_test 5 continue enddo - ! CHECK: fir.do_loop {{.*}} unordered + ! CHECK: fir.do_concurrent.loop (%{{.*}}) = {{.*}} ! CHECK-COUNT-2: fir.if do concurrent (integer(2)::i=1:5, i.ne.3) if (i.eq.2 .or. i.eq.4) then ! fir.if @@ -93,10 +84,6 @@ end subroutine loop_test ! CHECK-LABEL: c.func @_QPlis subroutine lis(n) - ! CHECK-DAG: fir.alloca i32 {bindc_name = "m"} - ! CHECK-DAG: fir.alloca i32 {bindc_name = "j"} - ! CHECK-DAG: fir.alloca i32 {bindc_name = "i"} - ! CHECK-DAG: fir.alloca i8 {bindc_name = "i"} ! CHECK-DAG: fir.alloca i32 {bindc_name = "j", uniq_name = "_QFlisEj"} ! CHECK-DAG: fir.alloca i32 {bindc_name = "k", uniq_name = "_QFlisEk"} ! CHECK-DAG: fir.alloca !fir.box>> {bindc_name = "p", uniq_name = "_QFlisEp"} @@ -117,8 +104,8 @@ subroutine lis(n) ! CHECK: } r = 0 - ! CHECK: fir.do_loop %arg1 = %{{.*}} to %{{.*}} step %{{.*}} unordered { - ! CHECK: fir.do_loop %arg2 = %{{.*}} to %{{.*}} step %c1{{.*}} iter_args(%arg3 = %{{.*}}) -> (index, i32) { + ! CHECK: fir.do_concurrent { + ! CHECK: fir.do_concurrent.loop (%{{.*}}) = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) { ! CHECK: } ! CHECK: } do concurrent (integer(kind=1)::i=n:1:-1) @@ -128,16 +115,18 @@ subroutine lis(n) enddo enddo - ! CHECK: fir.do_loop %arg1 = %{{.*}} to %{{.*}} step %c1{{.*}} unordered { - ! 
CHECK: fir.do_loop %arg2 = %{{.*}} to %{{.*}} step %c1{{.*}} unordered { + ! CHECK: fir.do_concurrent.loop (%{{.*}}, %{{.*}}) = (%{{.*}}, %{{.*}}) to (%{{.*}}, %{{.*}}) step (%{{.*}}, %{{.*}}) { ! CHECK: fir.if %{{.*}} { ! CHECK: %[[V_95:[0-9]+]] = fir.alloca !fir.array, %{{.*}}, %{{.*}} {bindc_name = "t", pinned, uniq_name = "_QFlisEt"} ! CHECK: %[[V_96:[0-9]+]] = fir.alloca !fir.box>> {bindc_name = "p", pinned, uniq_name = "_QFlisEp"} ! CHECK: fir.store %{{.*}} to %[[V_96]] : !fir.ref>>> ! CHECK: fir.do_loop %arg3 = %{{.*}} to %{{.*}} step %c1{{.*}} iter_args(%arg4 = %{{.*}}) -> (index, i32) { - ! CHECK: fir.do_loop %arg5 = %{{.*}} to %{{.*}} step %c1{{.*}} unordered { - ! CHECK: fir.load %[[V_96]] : !fir.ref>>> - ! CHECK: fir.convert %[[V_95]] : (!fir.ref>) -> !fir.ref> + ! CHECK: fir.do_concurrent { + ! CHECK: fir.alloca i32 {bindc_name = "m"} + ! CHECK: fir.do_concurrent.loop (%{{.*}}) = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) { + ! CHECK: fir.load %[[V_96]] : !fir.ref>>> + ! CHECK: fir.convert %[[V_95]] : (!fir.ref>) -> !fir.ref> + ! CHECK: } ! CHECK: } ! CHECK: } ! CHECK: fir.convert %[[V_95]] : (!fir.ref>) -> !fir.ref> diff --git a/flang/test/Lower/loops3.f90 b/flang/test/Lower/loops3.f90 index 78f39e1013082..84db1972cca16 100644 --- a/flang/test/Lower/loops3.f90 +++ b/flang/test/Lower/loops3.f90 @@ -12,9 +12,7 @@ subroutine loop_test ! CHECK: %[[VAL_0:.*]] = fir.alloca f32 {bindc_name = "m", uniq_name = "_QFloop_testEm"} ! CHECK: %[[VAL_1:.*]] = fir.address_of(@_QFloop_testEsum) : !fir.ref - ! CHECK: fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} unordered reduce(#fir.reduce_attr -> %[[VAL_1:.*]] : !fir.ref, #fir.reduce_attr -> %[[VAL_0:.*]] : !fir.ref) { - ! CHECK: fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} unordered reduce(#fir.reduce_attr -> %[[VAL_1:.*]] : !fir.ref, #fir.reduce_attr -> %[[VAL_0:.*]] : !fir.ref) { - ! 
CHECK: fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} unordered reduce(#fir.reduce_attr -> %[[VAL_1:.*]] : !fir.ref, #fir.reduce_attr -> %[[VAL_0:.*]] : !fir.ref) { + ! CHECK: fir.do_concurrent.loop ({{.*}}) = ({{.*}}) to ({{.*}}) step ({{.*}}) reduce(#fir.reduce_attr -> %[[VAL_1:.*]] : !fir.ref, #fir.reduce_attr -> %[[VAL_0:.*]] : !fir.ref) { do concurrent (i=1:5, j=1:5, k=1:5) local(tmp) reduce(+:sum) reduce(max:m) tmp = i + j + k sum = tmp + sum diff --git a/flang/test/Lower/nsw.f90 b/flang/test/Lower/nsw.f90 index 4ee9e5da829e6..2ec1efb2af42a 100644 --- a/flang/test/Lower/nsw.f90 +++ b/flang/test/Lower/nsw.f90 @@ -139,7 +139,6 @@ subroutine loop_params3(a,lb,ub,st) ! CHECK-LABEL: func.func @_QPloop_params3( ! CHECK: %[[VAL_4:.*]] = arith.constant 2 : i32 ! CHECK: %[[VAL_5:.*]] = arith.constant 1 : i32 -! CHECK: %[[VAL_9:.*]] = fir.declare %{{.*}}i"} : (!fir.ref) -> !fir.ref ! CHECK: %[[VAL_11:.*]] = fir.declare %{{.*}}lb"} : (!fir.ref, !fir.dscope) -> !fir.ref ! CHECK: %[[VAL_12:.*]] = fir.declare %{{.*}}ub"} : (!fir.ref, !fir.dscope) -> !fir.ref ! CHECK: %[[VAL_14:.*]] = fir.declare %{{.*}}i"} : (!fir.ref) -> !fir.ref @@ -153,4 +152,6 @@ subroutine loop_params3(a,lb,ub,st) ! CHECK: %[[VAL_31:.*]] = fir.load %[[VAL_15]] : !fir.ref ! CHECK: %[[VAL_32:.*]] = arith.muli %[[VAL_31]], %[[VAL_4]] overflow : i32 ! CHECK: %[[VAL_33:.*]] = fir.convert %[[VAL_32]] : (i32) -> index -! CHECK: fir.do_loop %[[VAL_34:.*]] = %[[VAL_28]] to %[[VAL_30]] step %[[VAL_33]] unordered { +! CHECK: fir.do_concurrent { +! CHECK: %[[VAL_9:.*]] = fir.declare %{{.*}}i"} : (!fir.ref) -> !fir.ref +! CHECK: fir.do_concurrent.loop (%[[VAL_34:.*]]) = (%[[VAL_28]]) to (%[[VAL_30]]) step (%[[VAL_33]]) { diff --git a/flang/test/Transforms/DoConcurrent/basic_host.f90 b/flang/test/Transforms/DoConcurrent/basic_host.f90 index 12f63031cbaee..b84d4481ac766 100644 --- a/flang/test/Transforms/DoConcurrent/basic_host.f90 +++ b/flang/test/Transforms/DoConcurrent/basic_host.f90 @@ -1,3 +1,6 @@ +! 
Fails until we update the pass to use the `fir.do_concurrent` op. +! XFAIL: * + ! Tests mapping of a basic `do concurrent` loop to `!$omp parallel do`. ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host %s -o - \ diff --git a/flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90 b/flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90 index f82696669eca6..4e13c0919589a 100644 --- a/flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90 +++ b/flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90 @@ -1,3 +1,6 @@ +! Fails until we update the pass to use the `fir.do_concurrent` op. +! XFAIL: * + ! Tests that "loop-local values" are properly handled by localizing them to the ! body of the loop nest. See `collectLoopLocalValues` and `localizeLoopLocalValue` ! for a definition of "loop-local values" and how they are handled. diff --git a/flang/test/Transforms/DoConcurrent/loop_nest_test.f90 b/flang/test/Transforms/DoConcurrent/loop_nest_test.f90 index 32bed61fe69e4..adc4a488d1ec9 100644 --- a/flang/test/Transforms/DoConcurrent/loop_nest_test.f90 +++ b/flang/test/Transforms/DoConcurrent/loop_nest_test.f90 @@ -1,3 +1,6 @@ +! Fails until we update the pass to use the `fir.do_concurrent` op. +! XFAIL: * + ! Tests loop-nest detection algorithm for do-concurrent mapping. ! REQUIRES: asserts diff --git a/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 b/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 index d0210726de83e..26800678d381c 100644 --- a/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 +++ b/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 @@ -1,3 +1,6 @@ +! Fails until we update the pass to use the `fir.do_concurrent` op. +! XFAIL: * + ! Tests mapping of a `do concurrent` loop with multiple iteration ranges. ! 
RUN: split-file %s %t diff --git a/flang/test/Transforms/DoConcurrent/non_const_bounds.f90 b/flang/test/Transforms/DoConcurrent/non_const_bounds.f90 index cd1bd4f98a3f5..23a3aae976c07 100644 --- a/flang/test/Transforms/DoConcurrent/non_const_bounds.f90 +++ b/flang/test/Transforms/DoConcurrent/non_const_bounds.f90 @@ -1,3 +1,6 @@ +! Fails until we update the pass to use the `fir.do_concurrent` op. +! XFAIL: * + ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host %s -o - \ ! RUN: | FileCheck %s diff --git a/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90 b/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90 index 184fdfe00d397..d1c02101318ab 100644 --- a/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90 +++ b/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90 @@ -1,3 +1,6 @@ +! Fails until we update the pass to use the `fir.do_concurrent` op. +! XFAIL: * + ! Tests that if `do concurrent` is not perfectly nested in its parent loop, that ! we skip converting the not-perfectly nested `do concurrent` loop. From 3264a50fe2b61e79572d1623d0cceb2fe88da533 Mon Sep 17 00:00:00 2001 From: Srinivasa Ravi Date: Wed, 16 Apr 2025 10:03:21 +0530 Subject: [PATCH 076/710] [clang][NVPTX] Add builtins and intrinsics for conversions of new FP types (#134345) This change: - Adds NVVM intrinsics and clang builtins for the cvt instruction variants of types (FP6) `.e2m3x2`, `.e3m2x2`, and (FP8) `.ue8m0x2` introduced in PTX 8.6 for `sm_100a`, `sm_101a`, and `sm_120a`. - Adds clang builtins for cvt instruction variant of type tf32. Tests are added in `NVPTX/convert-sm100a.ll` and `clang/test/CodeGen/builtins-nvptx.c` and verified through ptxas 12.8.0. 
PTX Spec Reference: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt --- clang/include/clang/Basic/BuiltinsNVPTX.td | 31 +++ clang/test/CodeGen/builtins-nvptx.c | 150 ++++++++++- llvm/include/llvm/IR/IntrinsicsNVVM.td | 31 +++ llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 35 +++ llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 56 ++++ llvm/test/CodeGen/NVPTX/convert-sm100a.ll | 290 +++++++++++++++++++++ 6 files changed, 589 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/NVPTX/convert-sm100a.ll diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.td b/clang/include/clang/Basic/BuiltinsNVPTX.td index 61e48b31c244b..bdbdfa2cea6c6 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.td +++ b/clang/include/clang/Basic/BuiltinsNVPTX.td @@ -580,6 +580,15 @@ def __nvvm_f2bf16_rz : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX70>; def __nvvm_f2bf16_rz_relu : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX70>; def __nvvm_f2tf32_rna : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_80, PTX70>; +def __nvvm_f2tf32_rna_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_89, PTX81>; +def __nvvm_f2tf32_rn : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, PTX78>; +def __nvvm_f2tf32_rn_relu : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, PTX78>; +def __nvvm_f2tf32_rn_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_100, PTX86>; +def __nvvm_f2tf32_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_100, PTX86>; +def __nvvm_f2tf32_rz : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, PTX78>; +def __nvvm_f2tf32_rz_relu : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, PTX78>; +def __nvvm_f2tf32_rz_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_100, PTX86>; +def __nvvm_f2tf32_rz_relu_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_100, PTX86>; def __nvvm_ff_to_e4m3x2_rn : NVPTXBuiltinSMAndPTX<"short(float, float)", SM_89, PTX81>; def __nvvm_ff_to_e4m3x2_rn_relu : NVPTXBuiltinSMAndPTX<"short(float, 
float)", SM_89, PTX81>; @@ -596,6 +605,28 @@ def __nvvm_e4m3x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(sh def __nvvm_e5m2x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM_89, PTX81>; def __nvvm_e5m2x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM_89, PTX81>; +def __nvvm_ff_to_e2m3x2_rn_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; +def __nvvm_ff_to_e2m3x2_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; +def __nvvm_ff_to_e3m2x2_rn_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; +def __nvvm_ff_to_e3m2x2_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; + +def __nvvm_e2m3x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; +def __nvvm_e2m3x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; +def __nvvm_e3m2x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; +def __nvvm_e3m2x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; + +def __nvvm_ff_to_ue8m0x2_rz : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; +def __nvvm_ff_to_ue8m0x2_rz_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; +def __nvvm_ff_to_ue8m0x2_rp : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; +def __nvvm_ff_to_ue8m0x2_rp_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; + +def __nvvm_bf16x2_to_ue8m0x2_rz : NVPTXBuiltinSMAndPTX<"short(_Vector<2, __bf16>)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; +def __nvvm_bf16x2_to_ue8m0x2_rz_satfinite : 
NVPTXBuiltinSMAndPTX<"short(_Vector<2, __bf16>)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; +def __nvvm_bf16x2_to_ue8m0x2_rp : NVPTXBuiltinSMAndPTX<"short(_Vector<2, __bf16>)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; +def __nvvm_bf16x2_to_ue8m0x2_rp_satfinite : NVPTXBuiltinSMAndPTX<"short(_Vector<2, __bf16>)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; + +def __nvvm_ue8m0x2_to_bf16x2 : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; + // FNS let Attributes = [NoThrow] in { def __nvvm_fns : NVPTXBuiltinPTX<"unsigned int(unsigned int, unsigned int, int)", PTX60>; diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c index e2c159aac903f..7404ce01c535c 100644 --- a/clang/test/CodeGen/builtins-nvptx.c +++ b/clang/test/CodeGen/builtins-nvptx.c @@ -25,14 +25,29 @@ // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \ // RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP64 %s -// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_89 -target-feature +ptx81 \ +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_89 -target-feature +ptx81 -DPTX=81\ // RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_90 -target-feature +ptx78 -DPTX=78 \ +// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX78_SM90 %s +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_100 -target-feature +ptx86 -DPTX=86 \ +// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck 
-check-prefix=CHECK -check-prefix=CHECK_PTX86_SM100 %s +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_100a -target-feature +ptx86 -DPTX=86 \ +// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM100a %s +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_101a -target-feature +ptx86 -DPTX=86 \ +// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM101a %s +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_120a -target-feature +ptx86 -DPTX=86 \ +// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM120a %s // ### The last run to check with the highest SM and PTX version available // ### to make sure target builtins are still accepted. 
-// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_100a -target-feature +ptx87 \ +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_120a -target-feature +ptx87 -DPTX=87 \ // RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \ -// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM120a %s #define __device__ __attribute__((device)) #define __global__ __attribute__((global)) @@ -995,7 +1010,7 @@ __device__ void nvvm_cvt_sm80() { // CHECK-LABEL: nvvm_cvt_sm89 __device__ void nvvm_cvt_sm89() { -#if __CUDA_ARCH__ >= 890 +#if (PTX >= 81) && (__CUDA_ARCH__ >= 890) // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn(float 1.000000e+00, float 1.000000e+00) __nvvm_ff_to_e4m3x2_rn(1.0f, 1.0f); // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn.relu(float 1.000000e+00, float 1.000000e+00) @@ -1022,6 +1037,133 @@ __device__ void nvvm_cvt_sm89() { __nvvm_e5m2x2_to_f16x2_rn(0x4c4c); // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn.relu(i16 19532) __nvvm_e5m2x2_to_f16x2_rn_relu(0x4c4c); + + // CHECK_PTX81_SM89: call i32 @llvm.nvvm.f2tf32.rna.satfinite(float 1.000000e+00) + __nvvm_f2tf32_rna_satfinite(1.0f); +#endif + // CHECK: ret void +} + +// CHECK-LABEL: nvvm_cvt_sm90 +__device__ void nvvm_cvt_sm90() { +#if (PTX >= 78) && (__CUDA_ARCH__ >= 900) + // CHECK_PTX78_SM90: call i32 @llvm.nvvm.f2tf32.rn(float 1.000000e+00) + __nvvm_f2tf32_rn(1.0f); + // CHECK_PTX78_SM90: call i32 @llvm.nvvm.f2tf32.rn.relu(float 1.000000e+00) + __nvvm_f2tf32_rn_relu(1.0f); + // CHECK_PTX78_SM90: call i32 @llvm.nvvm.f2tf32.rz(float 1.000000e+00) + __nvvm_f2tf32_rz(1.0f); + // CHECK_PTX78_SM90: call i32 @llvm.nvvm.f2tf32.rz.relu(float 1.000000e+00) + __nvvm_f2tf32_rz_relu(1.0f); +#endif + // CHECK: ret void +} + +// CHECK-LABEL: nvvm_cvt_sm100 +__device__ void nvvm_cvt_sm100() { +#if (PTX >= 86) && 
(__CUDA_ARCH__ >= 1000) + // CHECK_PTX86_SM100: call i32 @llvm.nvvm.f2tf32.rn.satfinite(float 1.000000e+00) + __nvvm_f2tf32_rn_satfinite(1.0f); + // CHECK_PTX86_SM100: call i32 @llvm.nvvm.f2tf32.rn.relu.satfinite(float 1.000000e+00) + __nvvm_f2tf32_rn_relu_satfinite(1.0f); + // CHECK_PTX86_SM100: call i32 @llvm.nvvm.f2tf32.rz.satfinite(float 1.000000e+00) + __nvvm_f2tf32_rz_satfinite(1.0f); + // CHECK_PTX86_SM100: call i32 @llvm.nvvm.f2tf32.rz.relu.satfinite(float 1.000000e+00) + __nvvm_f2tf32_rz_relu_satfinite(1.0f); +#endif + // CHECK: ret void +} + +// CHECK-LABEL: nvvm_cvt_sm100a_sm101a_sm120a +__device__ void nvvm_cvt_sm100a_sm101a_sm120a() { +#if (PTX >= 86) && \ + (__CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL || \ + __CUDA_ARCH_FEAT_SM120_ALL) + // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn.satfinite(float 1.000000e+00, float 1.000000e+00) + // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn.satfinite(float 1.000000e+00, float 1.000000e+00) + // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn.satfinite(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff_to_e2m3x2_rn_satfinite(1.0f, 1.0f); + + // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn.relu.satfinite(float 1.000000e+00, float 1.000000e+00) + // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn.relu.satfinite(float 1.000000e+00, float 1.000000e+00) + // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn.relu.satfinite(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff_to_e2m3x2_rn_relu_satfinite(1.0f, 1.0f); + + // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn.satfinite(float 1.000000e+00, float 1.000000e+00) + // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn.satfinite(float 1.000000e+00, float 1.000000e+00) + // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn.satfinite(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff_to_e3m2x2_rn_satfinite(1.0f, 1.0f); + + // CHECK_PTX86_SM100a: call i16 
@llvm.nvvm.ff.to.e3m2x2.rn.relu.satfinite(float 1.000000e+00, float 1.000000e+00) + // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn.relu.satfinite(float 1.000000e+00, float 1.000000e+00) + // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn.relu.satfinite(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff_to_e3m2x2_rn_relu_satfinite(1.0f, 1.0f); + + // CHECK_PTX86_SM100a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn(i16 19532) + // CHECK_PTX86_SM101a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn(i16 19532) + // CHECK_PTX86_SM120a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn(i16 19532) + __nvvm_e2m3x2_to_f16x2_rn(0x4C4C); + + // CHECK_PTX86_SM100a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn.relu(i16 18504) + // CHECK_PTX86_SM101a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn.relu(i16 18504) + // CHECK_PTX86_SM120a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn.relu(i16 18504) + __nvvm_e2m3x2_to_f16x2_rn_relu(0x4848); + + // CHECK_PTX86_SM100a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn(i16 18504) + // CHECK_PTX86_SM101a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn(i16 18504) + // CHECK_PTX86_SM120a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn(i16 18504) + __nvvm_e3m2x2_to_f16x2_rn(0x4848); + + // CHECK_PTX86_SM100a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn.relu(i16 19532) + // CHECK_PTX86_SM101a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn.relu(i16 19532) + // CHECK_PTX86_SM120a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn.relu(i16 19532) + __nvvm_e3m2x2_to_f16x2_rn_relu(0x4C4C); + + // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float 1.000000e+00, float 1.000000e+00) + // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float 1.000000e+00, float 1.000000e+00) + // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff_to_ue8m0x2_rz(1.0f, 1.0f); + + // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz.satfinite(float 1.000000e+00, float 
1.000000e+00) + // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz.satfinite(float 1.000000e+00, float 1.000000e+00) + // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz.satfinite(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff_to_ue8m0x2_rz_satfinite(1.0f, 1.0f); + + // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp(float 1.000000e+00, float 1.000000e+00) + // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp(float 1.000000e+00, float 1.000000e+00) + // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff_to_ue8m0x2_rp(1.0f, 1.0f); + + // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp.satfinite(float 1.000000e+00, float 1.000000e+00) + // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp.satfinite(float 1.000000e+00, float 1.000000e+00) + // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp.satfinite(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff_to_ue8m0x2_rp_satfinite(1.0f, 1.0f); + + // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz(<2 x bfloat> splat (bfloat 0xR3DCD) + // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz(<2 x bfloat> splat (bfloat 0xR3DCD) + // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz(<2 x bfloat> splat (bfloat 0xR3DCD) + __nvvm_bf16x2_to_ue8m0x2_rz({(__bf16)0.1f, (__bf16)0.1f}); + + // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD) + // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD) + // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD) + __nvvm_bf16x2_to_ue8m0x2_rz_satfinite({(__bf16)0.1f, (__bf16)0.1f}); + + // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp(<2 x bfloat> splat (bfloat 0xR3DCD) + // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp(<2 x bfloat> splat (bfloat 
0xR3DCD) + // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp(<2 x bfloat> splat (bfloat 0xR3DCD) + __nvvm_bf16x2_to_ue8m0x2_rp({(__bf16)0.1f, (__bf16)0.1f}); + + // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD) + // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD) + // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD) + __nvvm_bf16x2_to_ue8m0x2_rp_satfinite({(__bf16)0.1f, (__bf16)0.1f}); + + // CHECK_PTX86_SM100a: call <2 x bfloat> @llvm.nvvm.ue8m0x2.to.bf16x2(i16 19532) + // CHECK_PTX86_SM101a: call <2 x bfloat> @llvm.nvvm.ue8m0x2.to.bf16x2(i16 19532) + // CHECK_PTX86_SM120a: call <2 x bfloat> @llvm.nvvm.ue8m0x2.to.bf16x2(i16 19532) + __nvvm_ue8m0x2_to_bf16x2(0x4C4C); + #endif // CHECK: ret void } diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 4aeb1d8a2779e..7af051d22f9b7 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -1628,6 +1628,37 @@ let TargetPrefix = "nvvm" in { Intrinsic<[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrNoCallback]>; def int_nvvm_e5m2x2_to_f16x2_rn_relu : ClangBuiltin<"__nvvm_e5m2x2_to_f16x2_rn_relu">, Intrinsic<[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrNoCallback]>; + + class CVT_FF_TO_I16 : ClangBuiltin, + DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>; + + class CVT_I16_TO_F16X2 + : ClangBuiltin, + DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrNoCallback]>; + + class CVT_BF16X2_TO_I16 : ClangBuiltin, + DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_v2bf16_ty], [IntrNoMem, IntrNoCallback]>; + + // FP6 conversions. 
+ foreach type = ["e2m3x2", "e3m2x2"] in { + foreach relu = ["", "_relu"] in { + defvar suffix = !strconcat("_rn", relu); + def int_nvvm_ff_to_ # type # suffix # _satfinite : CVT_FF_TO_I16; + def int_nvvm_ # type # _to_f16x2 # suffix : CVT_I16_TO_F16X2; + } + } + + // UE8M0x2 conversions. + foreach rmode = ["_rz", "_rp"] in { + foreach satmode = ["", "_satfinite"] in { + defvar suffix = !strconcat(rmode, satmode); + def int_nvvm_ff_to_ue8m0x2 # suffix : CVT_FF_TO_I16<"ue8m0x2", suffix>; + def int_nvvm_bf16x2_to_ue8m0x2 # suffix : CVT_BF16X2_TO_I16<"ue8m0x2", suffix>; + } + } + + def int_nvvm_ue8m0x2_to_bf16x2 : ClangBuiltin<"__nvvm_ue8m0x2_to_bf16x2">, + Intrinsic<[llvm_v2bf16_ty], [llvm_i16_ty], [IntrNoMem, IntrNoCallback]>; // FNS diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index ca15783654381..46549d850cf0a 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -703,6 +703,41 @@ let hasSideEffects = false in { defm CVT_to_tf32_rz_satf : CVT_TO_TF32<"rz.satfinite", [hasPTX<86>, hasSM<100>]>; defm CVT_to_tf32_rn_relu_satf : CVT_TO_TF32<"rn.relu.satfinite", [hasPTX<86>, hasSM<100>]>; defm CVT_to_tf32_rz_relu_satf : CVT_TO_TF32<"rz.relu.satfinite", [hasPTX<86>, hasSM<100>]>; + + // FP6 conversions. + foreach type = ["e2m3x2", "e3m2x2"] in { + def CVT_ # type # _f32_sf : NVPTXInst<(outs Int16Regs:$dst), + (ins Float32Regs:$src1, + Float32Regs:$src2, CvtMode:$mode), + "cvt${mode:base}.satfinite${mode:relu}." + # type # ".f32 \t$dst, $src1, $src2;", []>; + def CVT_f16x2_ # type : NVPTXInst<(outs Int32Regs:$dst), + (ins Int16Regs:$src, CvtMode:$mode), + "cvt${mode:base}${mode:relu}.f16x2." + # type # " \t$dst, $src;", []>; + } + + // UE8M0x2 conversions. 
+ class CVT_f32_to_ue8m0x2 : + NVPTXInst<(outs Int16Regs:$dst), + (ins Float32Regs:$src1, Float32Regs:$src2, CvtMode:$mode), + "cvt${mode:base}" # sat # ".ue8m0x2.f32 \t$dst, $src1, $src2;", []>; + + class CVT_bf16x2_to_ue8m0x2 : + NVPTXInst<(outs Int16Regs:$dst), + (ins Int32Regs:$src, CvtMode:$mode), + "cvt${mode:base}" # sat # ".ue8m0x2.bf16x2 \t$dst, $src;", []>; + + def CVT_ue8m0x2_f32 : CVT_f32_to_ue8m0x2; + def CVT_ue8m0x2_f32_sf : CVT_f32_to_ue8m0x2<".satfinite">; + def CVT_ue8m0x2_bf16x2 : CVT_bf16x2_to_ue8m0x2; + def CVT_ue8m0x2_bf16x2_sf : CVT_bf16x2_to_ue8m0x2<".satfinite">; + + def CVT_bf16x2_ue8m0x2 : + NVPTXInst<(outs Int32Regs:$dst), + (ins Int16Regs:$src), + "cvt.rn.bf16x2.ue8m0x2 \t$dst, $src;", []>; + } def fpround_oneuse : OneUse1; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 8528ff702f236..b5d3c9a05ec34 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1944,6 +1944,62 @@ def : Pat<(int_nvvm_e5m2x2_to_f16x2_rn Int16Regs:$a), def : Pat<(int_nvvm_e5m2x2_to_f16x2_rn_relu Int16Regs:$a), (CVT_f16x2_e5m2x2 $a, CvtRN_RELU)>; +def : Pat<(int_nvvm_ff_to_e2m3x2_rn_satfinite f32:$a, f32:$b), + (CVT_e2m3x2_f32_sf $a, $b, CvtRN)>, + Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; +def : Pat<(int_nvvm_ff_to_e2m3x2_rn_relu_satfinite f32:$a, f32:$b), + (CVT_e2m3x2_f32_sf $a, $b, CvtRN_RELU)>, + Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; +def : Pat<(int_nvvm_ff_to_e3m2x2_rn_satfinite f32:$a, f32:$b), + (CVT_e3m2x2_f32_sf $a, $b, CvtRN)>, + Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; +def : Pat<(int_nvvm_ff_to_e3m2x2_rn_relu_satfinite f32:$a, f32:$b), + (CVT_e3m2x2_f32_sf $a, $b, CvtRN_RELU)>, + Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; + +def : Pat<(int_nvvm_e2m3x2_to_f16x2_rn i16:$a), + (CVT_f16x2_e2m3x2 $a, CvtRN)>, + Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; +def : 
Pat<(int_nvvm_e2m3x2_to_f16x2_rn_relu i16:$a), + (CVT_f16x2_e2m3x2 $a, CvtRN_RELU)>, + Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; +def : Pat<(int_nvvm_e3m2x2_to_f16x2_rn i16:$a), + (CVT_f16x2_e3m2x2 $a, CvtRN)>, + Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; +def : Pat<(int_nvvm_e3m2x2_to_f16x2_rn_relu i16:$a), + (CVT_f16x2_e3m2x2 $a, CvtRN_RELU)>, + Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; + +def : Pat<(int_nvvm_ff_to_ue8m0x2_rz f32:$a, f32:$b), + (CVT_ue8m0x2_f32 $a, $b, CvtRZ)>, + Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; +def : Pat<(int_nvvm_ff_to_ue8m0x2_rz_satfinite f32:$a, f32:$b), + (CVT_ue8m0x2_f32_sf $a, $b, CvtRZ)>, + Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; +def : Pat<(int_nvvm_ff_to_ue8m0x2_rp f32:$a, f32:$b), + (CVT_ue8m0x2_f32 $a, $b, CvtRP)>, + Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; +def : Pat<(int_nvvm_ff_to_ue8m0x2_rp_satfinite f32:$a, f32:$b), + (CVT_ue8m0x2_f32_sf $a, $b, CvtRP)>, + Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; + +def : Pat<(int_nvvm_bf16x2_to_ue8m0x2_rz Int32Regs:$a), + (CVT_ue8m0x2_bf16x2 $a, CvtRZ)>, + Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; +def : Pat<(int_nvvm_bf16x2_to_ue8m0x2_rz_satfinite Int32Regs:$a), + (CVT_ue8m0x2_bf16x2_sf $a, CvtRZ)>, + Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; +def : Pat<(int_nvvm_bf16x2_to_ue8m0x2_rp Int32Regs:$a), + (CVT_ue8m0x2_bf16x2 $a, CvtRP)>, + Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; +def : Pat<(int_nvvm_bf16x2_to_ue8m0x2_rp_satfinite Int32Regs:$a), + (CVT_ue8m0x2_bf16x2_sf $a, CvtRP)>, + Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; + +def : Pat<(int_nvvm_ue8m0x2_to_bf16x2 i16:$a), + (CVT_bf16x2_ue8m0x2 $a)>, + Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; + // // FNS // diff --git a/llvm/test/CodeGen/NVPTX/convert-sm100a.ll b/llvm/test/CodeGen/NVPTX/convert-sm100a.ll new file mode 100644 index 
0000000000000..f0dd5f084026b --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/convert-sm100a.ll @@ -0,0 +1,290 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_101a -mattr=+ptx86 | FileCheck %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 | FileCheck %s +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_101a -mattr=+ptx86 | %ptxas-verify -arch=sm_101a %} +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 | %ptxas-verify -arch=sm_120a %} + +define i16 @cvt_rn_sf_e2m3x2_f32(float %f1, float %f2) { +; CHECK-LABEL: cvt_rn_sf_e2m3x2_f32( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_sf_e2m3x2_f32_param_0]; +; CHECK-NEXT: ld.param.f32 %f2, [cvt_rn_sf_e2m3x2_f32_param_1]; +; CHECK-NEXT: cvt.rn.satfinite.e2m3x2.f32 %rs1, %f1, %f2; +; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call i16 @llvm.nvvm.ff.to.e2m3x2.rn.satfinite(float %f1, float %f2) + ret i16 %val +} + +define i16 @cvt_rn_relu_sf_e2m3x2_f32(float %f1, float %f2) { +; CHECK-LABEL: cvt_rn_relu_sf_e2m3x2_f32( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_relu_sf_e2m3x2_f32_param_0]; +; CHECK-NEXT: ld.param.f32 %f2, [cvt_rn_relu_sf_e2m3x2_f32_param_1]; +; CHECK-NEXT: cvt.rn.satfinite.relu.e2m3x2.f32 %rs1, %f1, %f2; +; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = 
call i16 @llvm.nvvm.ff.to.e2m3x2.rn.relu.satfinite(float %f1, float %f2) + ret i16 %val +} + +define i16 @cvt_rn_sf_e3m2x2_f32(float %f1, float %f2) { +; CHECK-LABEL: cvt_rn_sf_e3m2x2_f32( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_sf_e3m2x2_f32_param_0]; +; CHECK-NEXT: ld.param.f32 %f2, [cvt_rn_sf_e3m2x2_f32_param_1]; +; CHECK-NEXT: cvt.rn.satfinite.e3m2x2.f32 %rs1, %f1, %f2; +; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call i16 @llvm.nvvm.ff.to.e3m2x2.rn.satfinite(float %f1, float %f2) + ret i16 %val +} + +define i16 @cvt_rn_relu_sf_e3m2x2_f32(float %f1, float %f2) { +; CHECK-LABEL: cvt_rn_relu_sf_e3m2x2_f32( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_relu_sf_e3m2x2_f32_param_0]; +; CHECK-NEXT: ld.param.f32 %f2, [cvt_rn_relu_sf_e3m2x2_f32_param_1]; +; CHECK-NEXT: cvt.rn.satfinite.relu.e3m2x2.f32 %rs1, %f1, %f2; +; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call i16 @llvm.nvvm.ff.to.e3m2x2.rn.relu.satfinite(float %f1, float %f2) + ret i16 %val +} + +define <2 x half> @cvt_rn_f16x2_e2m3x2(i16 %in) { +; CHECK-LABEL: cvt_rn_f16x2_e2m3x2( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u16 %rs1, [cvt_rn_f16x2_e2m3x2_param_0]; +; CHECK-NEXT: cvt.rn.f16x2.e2m3x2 %r1, %rs1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn(i16 %in) + ret <2 x half> %val +} + +define <2 x half> @cvt_rn_relu_f16x2_e2m3x2_relu(i16 %in) { +; CHECK-LABEL: cvt_rn_relu_f16x2_e2m3x2_relu( +; CHECK: { 
+; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u16 %rs1, [cvt_rn_relu_f16x2_e2m3x2_relu_param_0]; +; CHECK-NEXT: cvt.rn.relu.f16x2.e2m3x2 %r1, %rs1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn.relu(i16 %in) + ret <2 x half> %val +} + +define <2 x half> @cvt_rn_f16x2_e3m2x2(i16 %in) { +; CHECK-LABEL: cvt_rn_f16x2_e3m2x2( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u16 %rs1, [cvt_rn_f16x2_e3m2x2_param_0]; +; CHECK-NEXT: cvt.rn.f16x2.e3m2x2 %r1, %rs1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn(i16 %in) + ret <2 x half> %val +} + +define <2 x half> @cvt_rn_relu_f16x2_e3m2x2(i16 %in) { +; CHECK-LABEL: cvt_rn_relu_f16x2_e3m2x2( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u16 %rs1, [cvt_rn_relu_f16x2_e3m2x2_param_0]; +; CHECK-NEXT: cvt.rn.relu.f16x2.e3m2x2 %r1, %rs1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn.relu(i16 %in) + ret <2 x half> %val +} + +define i16 @cvt_rz_ue8m0x2_f32(float %f1, float %f2) { +; CHECK-LABEL: cvt_rz_ue8m0x2_f32( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [cvt_rz_ue8m0x2_f32_param_0]; +; CHECK-NEXT: ld.param.f32 %f2, [cvt_rz_ue8m0x2_f32_param_1]; +; CHECK-NEXT: cvt.rz.ue8m0x2.f32 %rs1, %f1, %f2; +; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float %f1, float %f2) + ret i16 %val +} + +define 
i16 @cvt_rz_sf_ue8m0x2_f32(float %f1, float %f2) { +; CHECK-LABEL: cvt_rz_sf_ue8m0x2_f32( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [cvt_rz_sf_ue8m0x2_f32_param_0]; +; CHECK-NEXT: ld.param.f32 %f2, [cvt_rz_sf_ue8m0x2_f32_param_1]; +; CHECK-NEXT: cvt.rz.satfinite.ue8m0x2.f32 %rs1, %f1, %f2; +; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call i16 @llvm.nvvm.ff.to.ue8m0x2.rz.satfinite(float %f1, float %f2) + ret i16 %val +} + +define i16 @cvt_rp_ue8m0x2_f32(float %f1, float %f2) { +; CHECK-LABEL: cvt_rp_ue8m0x2_f32( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [cvt_rp_ue8m0x2_f32_param_0]; +; CHECK-NEXT: ld.param.f32 %f2, [cvt_rp_ue8m0x2_f32_param_1]; +; CHECK-NEXT: cvt.rp.ue8m0x2.f32 %rs1, %f1, %f2; +; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call i16 @llvm.nvvm.ff.to.ue8m0x2.rp(float %f1, float %f2) + ret i16 %val +} + +define i16 @cvt_rp_sf_ue8m0x2_f32(float %f1, float %f2) { +; CHECK-LABEL: cvt_rp_sf_ue8m0x2_f32( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [cvt_rp_sf_ue8m0x2_f32_param_0]; +; CHECK-NEXT: ld.param.f32 %f2, [cvt_rp_sf_ue8m0x2_f32_param_1]; +; CHECK-NEXT: cvt.rp.satfinite.ue8m0x2.f32 %rs1, %f1, %f2; +; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call i16 @llvm.nvvm.ff.to.ue8m0x2.rp.satfinite(float %f1, float %f2) + ret i16 %val +} + +define i16 @cvt_rz_ue8m0x2_bf16x2(<2 x bfloat> %in) { +; CHECK-LABEL: cvt_rz_ue8m0x2_bf16x2( +; CHECK: 
{ +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_ue8m0x2_bf16x2_param_0]; +; CHECK-NEXT: cvt.rz.ue8m0x2.bf16x2 %rs1, %r1; +; CHECK-NEXT: cvt.u32.u16 %r2, %rs1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; + %val = call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz(<2 x bfloat> %in) + ret i16 %val +} + +define i16 @cvt_rz_sf_ue8m0x2_bf16x2(<2 x bfloat> %in) { +; CHECK-LABEL: cvt_rz_sf_ue8m0x2_bf16x2( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_sf_ue8m0x2_bf16x2_param_0]; +; CHECK-NEXT: cvt.rz.satfinite.ue8m0x2.bf16x2 %rs1, %r1; +; CHECK-NEXT: cvt.u32.u16 %r2, %rs1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; + %val = call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz.satfinite(<2 x bfloat> %in) + ret i16 %val +} + +define i16 @cvt_rp_ue8m0x2_bf16x2(<2 x bfloat> %in) { +; CHECK-LABEL: cvt_rp_ue8m0x2_bf16x2( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rp_ue8m0x2_bf16x2_param_0]; +; CHECK-NEXT: cvt.rp.ue8m0x2.bf16x2 %rs1, %r1; +; CHECK-NEXT: cvt.u32.u16 %r2, %rs1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; + %val = call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp(<2 x bfloat> %in) + ret i16 %val +} + +define i16 @cvt_rp_sf_ue8m0x2_bf16x2(<2 x bfloat> %in) { +; CHECK-LABEL: cvt_rp_sf_ue8m0x2_bf16x2( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rp_sf_ue8m0x2_bf16x2_param_0]; +; CHECK-NEXT: cvt.rp.satfinite.ue8m0x2.bf16x2 %rs1, %r1; +; CHECK-NEXT: cvt.u32.u16 %r2, %rs1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; + %val = call i16 
@llvm.nvvm.bf16x2.to.ue8m0x2.rp.satfinite(<2 x bfloat> %in) + ret i16 %val +} + +define <2 x bfloat> @cvt_bf16x2_ue8m0x2(i16 %in) { +; CHECK-LABEL: cvt_bf16x2_ue8m0x2( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u16 %rs1, [cvt_bf16x2_ue8m0x2_param_0]; +; CHECK-NEXT: cvt.rn.bf16x2.ue8m0x2 %r1, %rs1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call <2 x bfloat> @llvm.nvvm.ue8m0x2.to.bf16x2(i16 %in) + ret <2 x bfloat> %val +} From 2a024046217a1acae4806328ac77bd88648c2bab Mon Sep 17 00:00:00 2001 From: Matheus Izvekov Date: Wed, 16 Apr 2025 02:09:32 -0300 Subject: [PATCH 077/710] [clang] fix a crash in error recovery in expressions resolving to templates (#135893) We were using AssumedTemplate incorrectly for error recovery. Fixes #135621 --- clang/docs/ReleaseNotes.rst | 3 ++- clang/lib/AST/Type.cpp | 3 ++- clang/lib/Sema/SemaExpr.cpp | 12 ++++++++---- clang/test/SemaTemplate/recovery-crash.cpp | 11 +++++++++++ 4 files changed, 23 insertions(+), 6 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index ee69af5632f6e..84ad253c1ec4f 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -402,6 +402,7 @@ Bug Fixes in This Version when using the ``INTn_C`` macros. (#GH85995) - Fixed an assertion failure in the expansion of builtin macros like ``__has_embed()`` with line breaks before the closing paren. (#GH133574) +- Fixed a crash in error recovery for expressions resolving to templates. (#GH135621) - Clang no longer accepts invalid integer constants which are too large to fit into any (standard or extended) integer type when the constant is unevaluated. Merely forming the token is sufficient to render the program invalid. 
Code @@ -544,7 +545,7 @@ Arm and AArch64 Support ^^^^^^^^^^^^^^^^^^^^^^^ - For ARM targets, cc1as now considers the FPU's features for the selected CPU or Architecture. - The ``+nosimd`` attribute is now fully supported for ARM. Previously, this had no effect when being used with - ARM targets, however this will now disable NEON instructions being generated. The ``simd`` option is + ARM targets, however this will now disable NEON instructions being generated. The ``simd`` option is also now printed when the ``--print-supported-extensions`` option is used. - Support for __ptrauth type qualifier has been added. diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index 62e48062cf241..53620003c9655 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -4401,7 +4401,8 @@ TemplateSpecializationType::TemplateSpecializationType( T.getKind() == TemplateName::SubstTemplateTemplateParmPack || T.getKind() == TemplateName::UsingTemplate || T.getKind() == TemplateName::QualifiedTemplate || - T.getKind() == TemplateName::DeducedTemplate) && + T.getKind() == TemplateName::DeducedTemplate || + T.getKind() == TemplateName::AssumedTemplate) && "Unexpected template name for TemplateSpecializationType"); auto *TemplateArgs = diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 3ac7d61546ceb..c65b4eadf9c67 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -21111,11 +21111,15 @@ ExprResult Sema::CheckPlaceholderExpr(Expr *E) { const bool IsTypeAliasTemplateDecl = isa(Temp); NestedNameSpecifier *NNS = ULE->getQualifierLoc().getNestedNameSpecifier(); - TemplateName TN(dyn_cast(Temp)); - if (TN.isNull()) + // FIXME: AssumedTemplate is not very appropriate for error recovery here, + // as it models only the unqualified-id case, where this case can clearly be + // qualified. Thus we can't just qualify an assumed template. 
+ TemplateName TN; + if (auto *TD = dyn_cast(Temp)) + TN = Context.getQualifiedTemplateName(NNS, ULE->hasTemplateKeyword(), + TemplateName(TD)); + else TN = Context.getAssumedTemplateName(NameInfo.getName()); - TN = Context.getQualifiedTemplateName(NNS, - /*TemplateKeyword=*/true, TN); Diag(NameInfo.getLoc(), diag::err_template_kw_refers_to_type_template) << TN << ULE->getSourceRange() << IsTypeAliasTemplateDecl; diff --git a/clang/test/SemaTemplate/recovery-crash.cpp b/clang/test/SemaTemplate/recovery-crash.cpp index ac8053da101ab..9b106f1f21fc5 100644 --- a/clang/test/SemaTemplate/recovery-crash.cpp +++ b/clang/test/SemaTemplate/recovery-crash.cpp @@ -67,3 +67,14 @@ namespace test1 { // expected-note@#defined-here {{defined here}} void NonTemplateClass::UndeclaredMethod() {} } + +namespace GH135621 { + template struct S {}; + // expected-note@-1 {{class template declared here}} + template void f() { + S::template S; + // expected-error@-1 {{'S' is expected to be a non-type template, but instantiated to a class template}} + } + template void f(); + // expected-note@-1 {{requested here}} +} // namespace GH135621 From 3de88fe40fd0dc5f52ba0bc7ebbaf16e938d6670 Mon Sep 17 00:00:00 2001 From: Justin Bogner Date: Tue, 15 Apr 2025 22:33:27 -0700 Subject: [PATCH 078/710] [DirectX] Implement the DXILCBufferAccess pass (#134571) This introduces a pass that walks accesses to globals in cbuffers and replaces them with accesses via the cbuffer handle itself. The logic to interpret the cbuffer metadata is kept in `lib/Frontend/HLSL` so that it can be reused by other consumers of that metadata. Fixes #124630. 
--- llvm/include/llvm/Frontend/HLSL/CBuffer.h | 64 ++++++ llvm/lib/Frontend/HLSL/CBuffer.cpp | 71 ++++++ llvm/lib/Frontend/HLSL/CMakeLists.txt | 1 + llvm/lib/Target/DirectX/CMakeLists.txt | 1 + llvm/lib/Target/DirectX/DXILCBufferAccess.cpp | 210 ++++++++++++++++++ llvm/lib/Target/DirectX/DXILCBufferAccess.h | 28 +++ llvm/lib/Target/DirectX/DirectX.h | 6 + .../Target/DirectX/DirectXPassRegistry.def | 1 + .../Target/DirectX/DirectXTargetMachine.cpp | 3 + .../DirectX/CBufferAccess/array-typedgep.ll | 32 +++ .../CodeGen/DirectX/CBufferAccess/arrays.ll | 124 +++++++++++ .../CodeGen/DirectX/CBufferAccess/float.ll | 25 +++ .../DirectX/CBufferAccess/gep-ce-two-uses.ll | 36 +++ .../CodeGen/DirectX/CBufferAccess/scalars.ll | 100 +++++++++ .../CodeGen/DirectX/CBufferAccess/vectors.ll | 119 ++++++++++ llvm/test/CodeGen/DirectX/llc-pipeline.ll | 1 + 16 files changed, 822 insertions(+) create mode 100644 llvm/include/llvm/Frontend/HLSL/CBuffer.h create mode 100644 llvm/lib/Frontend/HLSL/CBuffer.cpp create mode 100644 llvm/lib/Target/DirectX/DXILCBufferAccess.cpp create mode 100644 llvm/lib/Target/DirectX/DXILCBufferAccess.h create mode 100644 llvm/test/CodeGen/DirectX/CBufferAccess/array-typedgep.ll create mode 100644 llvm/test/CodeGen/DirectX/CBufferAccess/arrays.ll create mode 100644 llvm/test/CodeGen/DirectX/CBufferAccess/float.ll create mode 100644 llvm/test/CodeGen/DirectX/CBufferAccess/gep-ce-two-uses.ll create mode 100644 llvm/test/CodeGen/DirectX/CBufferAccess/scalars.ll create mode 100644 llvm/test/CodeGen/DirectX/CBufferAccess/vectors.ll diff --git a/llvm/include/llvm/Frontend/HLSL/CBuffer.h b/llvm/include/llvm/Frontend/HLSL/CBuffer.h new file mode 100644 index 0000000000000..694a7fa854576 --- /dev/null +++ b/llvm/include/llvm/Frontend/HLSL/CBuffer.h @@ -0,0 +1,64 @@ +//===- CBuffer.h - HLSL constant buffer handling ----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file This file contains utilities to work with constant buffers in HLSL. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_FRONTEND_HLSL_CBUFFER_H +#define LLVM_FRONTEND_HLSL_CBUFFER_H + +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include + +namespace llvm { +class Module; +class GlobalVariable; +class NamedMDNode; + +namespace hlsl { + +struct CBufferMember { + GlobalVariable *GV; + size_t Offset; + + CBufferMember(GlobalVariable *GV, size_t Offset) : GV(GV), Offset(Offset) {} +}; + +struct CBufferMapping { + GlobalVariable *Handle; + SmallVector Members; + + CBufferMapping(GlobalVariable *Handle) : Handle(Handle) {} +}; + +class CBufferMetadata { + NamedMDNode *MD; + SmallVector Mappings; + + CBufferMetadata(NamedMDNode *MD) : MD(MD) {} + +public: + static std::optional get(Module &M); + + using iterator = SmallVector::iterator; + iterator begin() { return Mappings.begin(); } + iterator end() { return Mappings.end(); } + + void eraseFromModule(); +}; + +APInt translateCBufArrayOffset(const DataLayout &DL, APInt Offset, + ArrayType *Ty); + +} // namespace hlsl +} // namespace llvm + +#endif // LLVM_FRONTEND_HLSL_CBUFFER_H diff --git a/llvm/lib/Frontend/HLSL/CBuffer.cpp b/llvm/lib/Frontend/HLSL/CBuffer.cpp new file mode 100644 index 0000000000000..37c0d912e09ee --- /dev/null +++ b/llvm/lib/Frontend/HLSL/CBuffer.cpp @@ -0,0 +1,71 @@ +//===- CBuffer.cpp - HLSL constant buffer handling ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Frontend/HLSL/CBuffer.h" +#include "llvm/Frontend/HLSL/HLSLResource.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" + +using namespace llvm; +using namespace llvm::hlsl; + +static size_t getMemberOffset(GlobalVariable *Handle, size_t Index) { + auto *HandleTy = cast(Handle->getValueType()); + assert(HandleTy->getName().ends_with(".CBuffer") && "Not a cbuffer type"); + assert(HandleTy->getNumTypeParameters() == 1 && "Expected layout type"); + + auto *LayoutTy = cast(HandleTy->getTypeParameter(0)); + assert(LayoutTy->getName().ends_with(".Layout") && "Not a layout type"); + + // Skip the "size" parameter. + size_t ParamIndex = Index + 1; + assert(LayoutTy->getNumIntParameters() > ParamIndex && + "Not enough parameters"); + + return LayoutTy->getIntParameter(ParamIndex); +} + +std::optional CBufferMetadata::get(Module &M) { + NamedMDNode *CBufMD = M.getNamedMetadata("hlsl.cbs"); + if (!CBufMD) + return std::nullopt; + + std::optional Result({CBufMD}); + + for (const MDNode *MD : CBufMD->operands()) { + assert(MD->getNumOperands() && "Invalid cbuffer metadata"); + + auto *Handle = cast( + cast(MD->getOperand(0))->getValue()); + CBufferMapping &Mapping = Result->Mappings.emplace_back(Handle); + + for (int I = 1, E = MD->getNumOperands(); I < E; ++I) { + Metadata *OpMD = MD->getOperand(I); + // Some members may be null if they've been optimized out. 
+ if (!OpMD) + continue; + auto *V = cast(cast(OpMD)->getValue()); + Mapping.Members.emplace_back(V, getMemberOffset(Handle, I - 1)); + } + } + + return Result; +} + +void CBufferMetadata::eraseFromModule() { + // Remove the cbs named metadata + MD->eraseFromParent(); +} + +APInt hlsl::translateCBufArrayOffset(const DataLayout &DL, APInt Offset, + ArrayType *Ty) { + int64_t TypeSize = DL.getTypeSizeInBits(Ty->getElementType()) / 8; + int64_t RoundUp = alignTo(TypeSize, Align(CBufferRowSizeInBytes)); + return Offset.udiv(TypeSize) * RoundUp; +} diff --git a/llvm/lib/Frontend/HLSL/CMakeLists.txt b/llvm/lib/Frontend/HLSL/CMakeLists.txt index eda6cb8e69a49..07a0c845ceef6 100644 --- a/llvm/lib/Frontend/HLSL/CMakeLists.txt +++ b/llvm/lib/Frontend/HLSL/CMakeLists.txt @@ -1,4 +1,5 @@ add_llvm_component_library(LLVMFrontendHLSL + CBuffer.cpp HLSLResource.cpp ADDITIONAL_HEADER_DIRS diff --git a/llvm/lib/Target/DirectX/CMakeLists.txt b/llvm/lib/Target/DirectX/CMakeLists.txt index 13f8adbe4f132..c55028bc75dd6 100644 --- a/llvm/lib/Target/DirectX/CMakeLists.txt +++ b/llvm/lib/Target/DirectX/CMakeLists.txt @@ -20,6 +20,7 @@ add_llvm_target(DirectXCodeGen DirectXTargetMachine.cpp DirectXTargetTransformInfo.cpp DXContainerGlobals.cpp + DXILCBufferAccess.cpp DXILDataScalarization.cpp DXILFinalizeLinkage.cpp DXILFlattenArrays.cpp diff --git a/llvm/lib/Target/DirectX/DXILCBufferAccess.cpp b/llvm/lib/Target/DirectX/DXILCBufferAccess.cpp new file mode 100644 index 0000000000000..7559f61b4cfb9 --- /dev/null +++ b/llvm/lib/Target/DirectX/DXILCBufferAccess.cpp @@ -0,0 +1,210 @@ +//===- DXILCBufferAccess.cpp - Translate CBuffer Loads --------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "DXILCBufferAccess.h" +#include "DirectX.h" +#include "llvm/Frontend/HLSL/CBuffer.h" +#include "llvm/Frontend/HLSL/HLSLResource.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicsDirectX.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/Utils/Local.h" + +#define DEBUG_TYPE "dxil-cbuffer-access" +using namespace llvm; + +namespace { +/// Helper for building a `load.cbufferrow` intrinsic given a simple type. +struct CBufferRowIntrin { + Intrinsic::ID IID; + Type *RetTy; + unsigned int EltSize; + unsigned int NumElts; + + CBufferRowIntrin(const DataLayout &DL, Type *Ty) { + assert(Ty == Ty->getScalarType() && "Expected scalar type"); + + switch (DL.getTypeSizeInBits(Ty)) { + case 16: + IID = Intrinsic::dx_resource_load_cbufferrow_8; + RetTy = StructType::get(Ty, Ty, Ty, Ty, Ty, Ty, Ty, Ty); + EltSize = 2; + NumElts = 8; + break; + case 32: + IID = Intrinsic::dx_resource_load_cbufferrow_4; + RetTy = StructType::get(Ty, Ty, Ty, Ty); + EltSize = 4; + NumElts = 4; + break; + case 64: + IID = Intrinsic::dx_resource_load_cbufferrow_2; + RetTy = StructType::get(Ty, Ty); + EltSize = 8; + NumElts = 2; + break; + default: + llvm_unreachable("Only 16, 32, and 64 bit types supported"); + } + } +}; +} // namespace + +static size_t getOffsetForCBufferGEP(GEPOperator *GEP, GlobalVariable *Global, + const DataLayout &DL) { + // Since we should always have a constant offset, we should only ever have a + // single GEP of indirection from the Global. 
+ assert(GEP->getPointerOperand() == Global && + "Indirect access to resource handle"); + + APInt ConstantOffset(DL.getIndexTypeSizeInBits(GEP->getType()), 0); + bool Success = GEP->accumulateConstantOffset(DL, ConstantOffset); + (void)Success; + assert(Success && "Offsets into cbuffer globals must be constant"); + + if (auto *ATy = dyn_cast(Global->getValueType())) + ConstantOffset = hlsl::translateCBufArrayOffset(DL, ConstantOffset, ATy); + + return ConstantOffset.getZExtValue(); +} + +/// Replace access via cbuffer global with a load from the cbuffer handle +/// itself. +static void replaceAccess(LoadInst *LI, GlobalVariable *Global, + GlobalVariable *HandleGV, size_t BaseOffset, + SmallVectorImpl &DeadInsts) { + const DataLayout &DL = HandleGV->getDataLayout(); + + size_t Offset = BaseOffset; + if (auto *GEP = dyn_cast(LI->getPointerOperand())) + Offset += getOffsetForCBufferGEP(GEP, Global, DL); + else if (LI->getPointerOperand() != Global) + llvm_unreachable("Load instruction doesn't reference cbuffer global"); + + IRBuilder<> Builder(LI); + auto *Handle = Builder.CreateLoad(HandleGV->getValueType(), HandleGV, + HandleGV->getName()); + + Type *Ty = LI->getType(); + CBufferRowIntrin Intrin(DL, Ty->getScalarType()); + // The cbuffer consists of some number of 16-byte rows. + unsigned int CurrentRow = Offset / hlsl::CBufferRowSizeInBytes; + unsigned int CurrentIndex = + (Offset % hlsl::CBufferRowSizeInBytes) / Intrin.EltSize; + + auto *CBufLoad = Builder.CreateIntrinsic( + Intrin.RetTy, Intrin.IID, + {Handle, ConstantInt::get(Builder.getInt32Ty(), CurrentRow)}, nullptr, + LI->getName()); + auto *Elt = + Builder.CreateExtractValue(CBufLoad, {CurrentIndex++}, LI->getName()); + + Value *Result = nullptr; + unsigned int Remaining = + ((DL.getTypeSizeInBits(Ty) / 8) / Intrin.EltSize) - 1; + if (Remaining == 0) { + // We only have a single element, so we're done. + Result = Elt; + + // However, if we loaded a <1 x T>, then we need to adjust the type here. 
+ if (auto *VT = dyn_cast(LI->getType())) { + assert(VT->getNumElements() == 1 && "Can't have multiple elements here"); + Result = Builder.CreateInsertElement(PoisonValue::get(VT), Result, + Builder.getInt32(0)); + } + } else { + // Walk each element and extract it, wrapping to new rows as needed. + SmallVector Extracts{Elt}; + while (Remaining--) { + CurrentIndex %= Intrin.NumElts; + + if (CurrentIndex == 0) + CBufLoad = Builder.CreateIntrinsic( + Intrin.RetTy, Intrin.IID, + {Handle, ConstantInt::get(Builder.getInt32Ty(), ++CurrentRow)}, + nullptr, LI->getName()); + + Extracts.push_back(Builder.CreateExtractValue(CBufLoad, {CurrentIndex++}, + LI->getName())); + } + + // Finally, we build up the original loaded value. + Result = PoisonValue::get(Ty); + for (int I = 0, E = Extracts.size(); I < E; ++I) + Result = + Builder.CreateInsertElement(Result, Extracts[I], Builder.getInt32(I)); + } + + LI->replaceAllUsesWith(Result); + DeadInsts.push_back(LI); +} + +static void replaceAccessesWithHandle(GlobalVariable *Global, + GlobalVariable *HandleGV, + size_t BaseOffset) { + SmallVector DeadInsts; + + SmallVector ToProcess{Global->users()}; + while (!ToProcess.empty()) { + User *Cur = ToProcess.pop_back_val(); + + // If we have a load instruction, replace the access. + if (auto *LI = dyn_cast(Cur)) { + replaceAccess(LI, Global, HandleGV, BaseOffset, DeadInsts); + continue; + } + + // Otherwise, walk users looking for a load... 
+ ToProcess.append(Cur->user_begin(), Cur->user_end()); + } + RecursivelyDeleteTriviallyDeadInstructions(DeadInsts); +} + +static bool replaceCBufferAccesses(Module &M) { + std::optional CBufMD = hlsl::CBufferMetadata::get(M); + if (!CBufMD) + return false; + + for (const hlsl::CBufferMapping &Mapping : *CBufMD) + for (const hlsl::CBufferMember &Member : Mapping.Members) { + replaceAccessesWithHandle(Member.GV, Mapping.Handle, Member.Offset); + Member.GV->removeFromParent(); + } + + CBufMD->eraseFromModule(); + return true; +} + +PreservedAnalyses DXILCBufferAccess::run(Module &M, ModuleAnalysisManager &AM) { + PreservedAnalyses PA; + bool Changed = replaceCBufferAccesses(M); + + if (!Changed) + return PreservedAnalyses::all(); + return PA; +} + +namespace { +class DXILCBufferAccessLegacy : public ModulePass { +public: + bool runOnModule(Module &M) override { return replaceCBufferAccesses(M); } + StringRef getPassName() const override { return "DXIL CBuffer Access"; } + DXILCBufferAccessLegacy() : ModulePass(ID) {} + + static char ID; // Pass identification. +}; +char DXILCBufferAccessLegacy::ID = 0; +} // end anonymous namespace + +INITIALIZE_PASS(DXILCBufferAccessLegacy, DEBUG_TYPE, "DXIL CBuffer Access", + false, false) + +ModulePass *llvm::createDXILCBufferAccessLegacyPass() { + return new DXILCBufferAccessLegacy(); +} diff --git a/llvm/lib/Target/DirectX/DXILCBufferAccess.h b/llvm/lib/Target/DirectX/DXILCBufferAccess.h new file mode 100644 index 0000000000000..6c1cde164004e --- /dev/null +++ b/llvm/lib/Target/DirectX/DXILCBufferAccess.h @@ -0,0 +1,28 @@ +//===- DXILCBufferAccess.h - Translate CBuffer Loads ------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// \file Pass for replacing loads from cbuffers in the cbuffer address space to +// cbuffer load intrinsics. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_DIRECTX_DXILCBUFFERACCESS_H +#define LLVM_LIB_TARGET_DIRECTX_DXILCBUFFERACCESS_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { + +class DXILCBufferAccess : public PassInfoMixin { +public: + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_DIRECTX_DXILCBUFFERACCESS_H diff --git a/llvm/lib/Target/DirectX/DirectX.h b/llvm/lib/Target/DirectX/DirectX.h index 96a8a08c875f8..c0eb221d12203 100644 --- a/llvm/lib/Target/DirectX/DirectX.h +++ b/llvm/lib/Target/DirectX/DirectX.h @@ -35,6 +35,12 @@ void initializeDXILIntrinsicExpansionLegacyPass(PassRegistry &); /// Pass to expand intrinsic operations that lack DXIL opCodes ModulePass *createDXILIntrinsicExpansionLegacyPass(); +/// Initializer for DXIL CBuffer Access Pass +void initializeDXILCBufferAccessLegacyPass(PassRegistry &); + +/// Pass to translate loads in the cbuffer address space to intrinsics +ModulePass *createDXILCBufferAccessLegacyPass(); + /// Initializer for DXIL Data Scalarization Pass void initializeDXILDataScalarizationLegacyPass(PassRegistry &); diff --git a/llvm/lib/Target/DirectX/DirectXPassRegistry.def b/llvm/lib/Target/DirectX/DirectXPassRegistry.def index 87d91ead1896f..37093f16680a9 100644 --- a/llvm/lib/Target/DirectX/DirectXPassRegistry.def +++ b/llvm/lib/Target/DirectX/DirectXPassRegistry.def @@ -23,6 +23,7 @@ MODULE_ANALYSIS("dxil-root-signature-analysis", dxil::RootSignatureAnalysis()) #ifndef MODULE_PASS #define MODULE_PASS(NAME, CREATE_PASS) #endif +MODULE_PASS("dxil-cbuffer-access", DXILCBufferAccess()) MODULE_PASS("dxil-data-scalarization", 
DXILDataScalarization()) MODULE_PASS("dxil-flatten-arrays", DXILFlattenArrays()) MODULE_PASS("dxil-intrinsic-expansion", DXILIntrinsicExpansion()) diff --git a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp index 747e4b3eb9411..41f6f37a41f9d 100644 --- a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp +++ b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "DirectXTargetMachine.h" +#include "DXILCBufferAccess.h" #include "DXILDataScalarization.h" #include "DXILFlattenArrays.h" #include "DXILIntrinsicExpansion.h" @@ -65,6 +66,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXTarget() { initializeRootSignatureAnalysisWrapperPass(*PR); initializeDXILFinalizeLinkageLegacyPass(*PR); initializeDXILPrettyPrinterLegacyPass(*PR); + initializeDXILCBufferAccessLegacyPass(*PR); } class DXILTargetObjectFile : public TargetLoweringObjectFile { @@ -96,6 +98,7 @@ class DirectXPassConfig : public TargetPassConfig { void addCodeGenPrepare() override { addPass(createDXILFinalizeLinkageLegacyPass()); addPass(createDXILIntrinsicExpansionLegacyPass()); + addPass(createDXILCBufferAccessLegacyPass()); addPass(createDXILDataScalarizationLegacyPass()); addPass(createDXILFlattenArraysLegacyPass()); addPass(createDXILResourceAccessLegacyPass()); diff --git a/llvm/test/CodeGen/DirectX/CBufferAccess/array-typedgep.ll b/llvm/test/CodeGen/DirectX/CBufferAccess/array-typedgep.ll new file mode 100644 index 0000000000000..dbd01b323aa2a --- /dev/null +++ b/llvm/test/CodeGen/DirectX/CBufferAccess/array-typedgep.ll @@ -0,0 +1,32 @@ +; RUN: opt -S -dxil-cbuffer-access -mtriple=dxil--shadermodel6.3-library %s | FileCheck %s + +; cbuffer CB : register(b0) { +; float a1[3]; +; } +%__cblayout_CB = type <{ [3 x float] }> + +@CB.cb = global target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 36, 0)) poison +; CHECK: @CB.cb = +; 
CHECK-NOT: external {{.*}} addrspace(2) global +@a1 = external addrspace(2) global [3 x float], align 4 + +; CHECK: define void @f +define void @f(ptr %dst) { +entry: + %CB.cb_h = call target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 36, 0)) @llvm.dx.resource.handlefrombinding.tdx.CBuffer_tdx.Layout_s___cblayout_CBs_36_0tt(i32 0, i32 0, i32 1, i32 0, i1 false) + store target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 36, 0)) %CB.cb_h, ptr @CB.cb, align 4 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb + ; CHECK: [[LOAD:%.*]] = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 1) + ; CHECK: [[X:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 0 + ; CHECK: store float [[X]], ptr %dst + %a1 = load float, ptr addrspace(2) getelementptr inbounds ([3 x float], ptr addrspace(2) @a1, i32 0, i32 1), align 4 + store float %a1, ptr %dst, align 32 + + ret void +} + +; CHECK-NOT: !hlsl.cbs = +!hlsl.cbs = !{!0} + +!0 = !{ptr @CB.cb, ptr addrspace(2) @a1} diff --git a/llvm/test/CodeGen/DirectX/CBufferAccess/arrays.ll b/llvm/test/CodeGen/DirectX/CBufferAccess/arrays.ll new file mode 100644 index 0000000000000..42d7943953b84 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/CBufferAccess/arrays.ll @@ -0,0 +1,124 @@ +; RUN: opt -S -dxil-cbuffer-access -mtriple=dxil--shadermodel6.3-library %s | FileCheck %s + +; cbuffer CB : register(b0) { +; float a1[3]; +; double3 a2[2]; +; float16_t a3[2][2]; +; uint64_t a4[3]; +; int4 a5[2][3][4]; +; uint16_t a6[1]; +; int64_t a7[2]; +; bool a8[4]; +; } +%__cblayout_CB = type <{ [3 x float], [2 x <3 x double>], [2 x [2 x half]], [3 x i64], [2 x [3 x [4 x <4 x i32>]]], [1 x i16], [2 x i64], [4 x i32] }> + +@CB.cb = local_unnamed_addr global target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 708, 0, 48, 112, 176, 224, 608, 624, 656)) poison +; CHECK: @CB.cb = +; CHECK-NOT: external {{.*}} addrspace(2) global +@a1 = external 
local_unnamed_addr addrspace(2) global [3 x float], align 4 +@a2 = external local_unnamed_addr addrspace(2) global [2 x <3 x double>], align 32 +@a3 = external local_unnamed_addr addrspace(2) global [2 x [2 x half]], align 2 +@a4 = external local_unnamed_addr addrspace(2) global [3 x i64], align 8 +@a5 = external local_unnamed_addr addrspace(2) global [2 x [3 x [4 x <4 x i32>]]], align 16 +@a6 = external local_unnamed_addr addrspace(2) global [1 x i16], align 2 +@a7 = external local_unnamed_addr addrspace(2) global [2 x i64], align 8 +@a8 = external local_unnamed_addr addrspace(2) global [4 x i32], align 4 + +; CHECK: define void @f +define void @f(ptr %dst) { +entry: + %CB.cb_h.i.i = tail call target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 708, 0, 48, 112, 176, 224, 608, 624, 656)) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, i1 false) + store target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 708, 0, 48, 112, 176, 224, 608, 624, 656)) %CB.cb_h.i.i, ptr @CB.cb, align 4 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb + ; CHECK: [[LOAD:%.*]] = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 1) + ; CHECK: [[X:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 0 + ; CHECK: store float [[X]], ptr %dst + %a1 = load float, ptr addrspace(2) getelementptr inbounds nuw (i8, ptr addrspace(2) @a1, i32 4), align 4 + store float %a1, ptr %dst, align 32 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb + ; CHECK: [[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 5) + ; CHECK: [[X:%.*]] = extractvalue { double, double } [[LOAD]], 0 + ; CHECK: [[Y:%.*]] = extractvalue { double, double } [[LOAD]], 1 + ; CHECK: [[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 6) + ; CHECK: 
[[Z:%.*]] = extractvalue { double, double } [[LOAD]], 0 + ; CHECK: [[VEC0:%.*]] = insertelement <3 x double> poison, double [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <3 x double> [[VEC0]], double [[Y]], i32 1 + ; CHECK: [[VEC2:%.*]] = insertelement <3 x double> [[VEC1]], double [[Z]], i32 2 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 8 + ; CHECK: store <3 x double> [[VEC2]], ptr [[PTR]] + %a2 = load <3 x double>, ptr addrspace(2) getelementptr inbounds nuw (i8, ptr addrspace(2) @a2, i32 32), align 8 + %a2.i = getelementptr inbounds nuw i8, ptr %dst, i32 8 + store <3 x double> %a2, ptr %a2.i, align 32 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb + ; CHECK: [[LOAD:%.*]] = call { half, half, half, half, half, half, half, half } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 8) + ; CHECK: [[X:%.*]] = extractvalue { half, half, half, half, half, half, half, half } [[LOAD]], 0 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 32 + ; CHECK: store half [[X]], ptr [[PTR]] + %a3 = load half, ptr addrspace(2) getelementptr inbounds nuw (i8, ptr addrspace(2) @a3, i32 6), align 2 + %a3.i = getelementptr inbounds nuw i8, ptr %dst, i32 32 + store half %a3, ptr %a3.i, align 2 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb + ; CHECK: [[LOAD:%.*]] = call { i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 12) + ; CHECK: [[X:%.*]] = extractvalue { i64, i64 } [[LOAD]], 0 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 40 + ; CHECK: store i64 [[X]], ptr [[PTR]] + %a4 = load i64, ptr addrspace(2) getelementptr inbounds nuw (i8, ptr addrspace(2) @a4, i32 8), align 8 + %a4.i = getelementptr inbounds nuw i8, ptr %dst, i32 40 + store i64 %a4, ptr %a4.i, align 8 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb + ; CHECK: [[LOAD:%.*]] = call { i32, i32, 
i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 26) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0 + ; CHECK: [[Y:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1 + ; CHECK: [[Z:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 2 + ; CHECK: [[A:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 3 + ; CHECK: [[VEC0:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <4 x i32> [[VEC0]], i32 [[Y]], i32 1 + ; CHECK: [[VEC2:%.*]] = insertelement <4 x i32> [[VEC1]], i32 [[Z]], i32 2 + ; CHECK: [[VEC3:%.*]] = insertelement <4 x i32> [[VEC2]], i32 [[A]], i32 3 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 48 + ; CHECK: store <4 x i32> [[VEC3]], ptr [[PTR]] + %a5 = load <4 x i32>, ptr addrspace(2) getelementptr inbounds nuw (i8, ptr addrspace(2) @a5, i32 272), align 4 + %a5.i = getelementptr inbounds nuw i8, ptr %dst, i32 48 + store <4 x i32> %a5, ptr %a5.i, align 4 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb + ; CHECK: [[LOAD:%.*]] = call { i16, i16, i16, i16, i16, i16, i16, i16 } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 38) + ; CHECK: [[X:%.*]] = extractvalue { i16, i16, i16, i16, i16, i16, i16, i16 } [[LOAD]], 0 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 64 + ; CHECK: store i16 [[X]], ptr [[PTR]] + %a6 = load i16, ptr addrspace(2) @a6, align 2 + %a6.i = getelementptr inbounds nuw i8, ptr %dst, i32 64 + store i16 %a6, ptr %a6.i, align 2 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb + ; CHECK: [[LOAD:%.*]] = call { i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 40) + ; CHECK: [[X:%.*]] = extractvalue { i64, i64 } [[LOAD]], 0 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 72 + ; CHECK: store i64 [[X]], ptr [[PTR]] + %a7 = load 
i64, ptr addrspace(2) getelementptr inbounds nuw (i8, ptr addrspace(2) @a7, i32 8), align 8 + %a7.i = getelementptr inbounds nuw i8, ptr %dst, i32 72 + store i64 %a7, ptr %a7.i, align 8 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 42) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 80 + ; CHECK: store i32 [[X]], ptr [[PTR]] + %a8 = load i32, ptr addrspace(2) getelementptr inbounds nuw (i8, ptr addrspace(2) @a8, i32 4), align 4, !range !1, !noundef !2 + %a8.i = getelementptr inbounds nuw i8, ptr %dst, i32 80 + store i32 %a8, ptr %a8.i, align 4 + + ret void +} + +; CHECK-NOT: !hlsl.cbs = +!hlsl.cbs = !{!0} + +!0 = !{ptr @CB.cb, ptr addrspace(2) @a1, ptr addrspace(2) @a2, ptr addrspace(2) @a3, ptr addrspace(2) @a4, ptr addrspace(2) @a5, ptr addrspace(2) @a6, ptr addrspace(2) @a7, ptr addrspace(2) @a8} +!1 = !{i32 0, i32 2} +!2 = !{} diff --git a/llvm/test/CodeGen/DirectX/CBufferAccess/float.ll b/llvm/test/CodeGen/DirectX/CBufferAccess/float.ll new file mode 100644 index 0000000000000..d7272b449166d --- /dev/null +++ b/llvm/test/CodeGen/DirectX/CBufferAccess/float.ll @@ -0,0 +1,25 @@ +; RUN: opt -S -dxil-cbuffer-access -mtriple=dxil--shadermodel6.3-library %s | FileCheck %s + +%__cblayout_CB = type <{ float }> + +@CB.cb = local_unnamed_addr global target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 4, 0)) poison +; CHECK: @CB.cb = +; CHECK-NOT: external {{.*}} addrspace(2) global +@x = external local_unnamed_addr addrspace(2) global float, align 4 + +; CHECK: define void @f +define void @f(ptr %dst) { +entry: + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb + ; CHECK: [[LOAD:%.*]] = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", 
{{.*}})) [[CB]], i32 0) + ; CHECK: [[X:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 0 + ; CHECK: store float [[X]], ptr %dst + %x = load float, ptr addrspace(2) @x, align 4 + store float %x, ptr %dst, align 4 + ret void +} + +; CHECK-NOT: !hlsl.cbs = +!hlsl.cbs = !{!0} + +!0 = !{ptr @CB.cb, ptr addrspace(2) @x} diff --git a/llvm/test/CodeGen/DirectX/CBufferAccess/gep-ce-two-uses.ll b/llvm/test/CodeGen/DirectX/CBufferAccess/gep-ce-two-uses.ll new file mode 100644 index 0000000000000..abe087dbe6100 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/CBufferAccess/gep-ce-two-uses.ll @@ -0,0 +1,36 @@ +; RUN: opt -S -dxil-cbuffer-access -mtriple=dxil--shadermodel6.3-library %s | FileCheck %s + +; cbuffer CB : register(b0) { +; float a1[3]; +; } +%__cblayout_CB = type <{ [3 x float] }> + +@CB.cb = local_unnamed_addr global target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 36, 0)) poison +; CHECK: @CB.cb = +; CHECK-NOT: external {{.*}} addrspace(2) global +@a1 = external local_unnamed_addr addrspace(2) global [3 x float], align 4 + +; CHECK: define void @f +define void @f(ptr %dst) { +entry: + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb + ; CHECK: [[LOAD:%.*]] = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 1) + ; CHECK: [[X:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 0 + ; CHECK: store float [[X]], ptr %dst + %a1 = load float, ptr addrspace(2) getelementptr inbounds nuw (i8, ptr addrspace(2) @a1, i32 4), align 4 + store float %a1, ptr %dst, align 32 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb + ; CHECK: [[LOAD:%.*]] = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 1) + ; CHECK: [[X:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 0 + ; CHECK: store float [[X]], ptr %dst + %a2 = load float, ptr addrspace(2) getelementptr 
inbounds nuw (i8, ptr addrspace(2) @a1, i32 4), align 4 + store float %a2, ptr %dst, align 32 + + ret void +} + +; CHECK-NOT: !hlsl.cbs = +!hlsl.cbs = !{!0} + +!0 = !{ptr @CB.cb, ptr addrspace(2) @a1} diff --git a/llvm/test/CodeGen/DirectX/CBufferAccess/scalars.ll b/llvm/test/CodeGen/DirectX/CBufferAccess/scalars.ll new file mode 100644 index 0000000000000..125d6b66c0107 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/CBufferAccess/scalars.ll @@ -0,0 +1,100 @@ +; RUN: opt -S -dxil-cbuffer-access -mtriple=dxil--shadermodel6.3-library %s | FileCheck %s + +; cbuffer CB { +; float a1; // offset 0, size 4 +; int a2; // offset 4, size 4 +; bool a3; // offset 8, size 4 +; float16_t a4; // offset 12, size 2 +; uint16_t a5; // offset 14, size 2 +; double a6; // offset 16, size 8 +; int64_t a7; // offset 24, size 8 +; } +%__cblayout_CB = type <{ float, i32, i32, half, i16, double, i64 }> + +@CB.cb = local_unnamed_addr global target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 32, 0, 4, 8, 12, 14, 16, 24)) poison +; CHECK: @CB.cb = +; CHECK-NOT: external {{.*}} addrspace(2) global +@a1 = external local_unnamed_addr addrspace(2) global float, align 4 +@a2 = external local_unnamed_addr addrspace(2) global i32, align 4 +@a3 = external local_unnamed_addr addrspace(2) global i32, align 4 +@a4 = external local_unnamed_addr addrspace(2) global half, align 2 +@a5 = external local_unnamed_addr addrspace(2) global i16, align 2 +@a6 = external local_unnamed_addr addrspace(2) global double, align 8 +@a7 = external local_unnamed_addr addrspace(2) global i64, align 8 + +; CHECK: define void @f +define void @f(ptr %dst) { +entry: + %CB.cb_h.i.i = tail call target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 32, 0, 4, 8, 12, 14, 16, 24)) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, i1 false) + store target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 32, 0, 4, 8, 12, 14, 16, 24)) %CB.cb_h.i.i, ptr @CB.cb, align 4 + + ; CHECK: [[CB:%.*]] = load 
target("dx.CBuffer", {{.*}})), ptr @CB.cb + ; CHECK: [[LOAD:%.*]] = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 0) + ; CHECK: [[A1:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 0 + ; CHECK: store float [[A1]], ptr %dst + %a1 = load float, ptr addrspace(2) @a1, align 4 + store float %a1, ptr %dst, align 8 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 0) + ; CHECK: [[A2:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 4 + ; CHECK: store i32 [[A2]], ptr [[PTR]] + %a2 = load i32, ptr addrspace(2) @a2, align 4 + %a2.i = getelementptr inbounds nuw i8, ptr %dst, i32 4 + store i32 %a2, ptr %a2.i, align 4 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 0) + ; CHECK: [[A3:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 2 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 8 + ; CHECK: store i32 [[A3]], ptr [[PTR]] + %a3 = load i32, ptr addrspace(2) @a3, align 4, !range !1, !noundef !2 + %a3.i = getelementptr inbounds nuw i8, ptr %dst, i32 8 + store i32 %a3, ptr %a3.i, align 8 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb + ; CHECK: [[LOAD:%.*]] = call { half, half, half, half, half, half, half, half } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 0) + ; CHECK: [[A4:%.*]] = extractvalue { half, half, half, half, half, half, half, half } [[LOAD]], 6 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 12 + ; CHECK: store half [[A4]], ptr [[PTR]] + %a4 = load half, ptr addrspace(2) 
@a4, align 2 + %a4.i = getelementptr inbounds nuw i8, ptr %dst, i32 12 + store half %a4, ptr %a4.i, align 4 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb + ; CHECK: [[LOAD:%.*]] = call { i16, i16, i16, i16, i16, i16, i16, i16 } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 0) + ; CHECK: [[A5:%.*]] = extractvalue { i16, i16, i16, i16, i16, i16, i16, i16 } [[LOAD]], 7 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 14 + ; CHECK: store i16 [[A5]], ptr [[PTR]] + %a5 = load i16, ptr addrspace(2) @a5, align 2 + %a5.i = getelementptr inbounds nuw i8, ptr %dst, i32 14 + store i16 %a5, ptr %a5.i, align 2 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb + ; CHECK: [[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 1) + ; CHECK: [[A6:%.*]] = extractvalue { double, double } [[LOAD]], 0 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 16 + ; CHECK: store double [[A6]], ptr [[PTR]] + %a6 = load double, ptr addrspace(2) @a6, align 8 + %a6.i = getelementptr inbounds nuw i8, ptr %dst, i32 16 + store double %a6, ptr %a6.i, align 8 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb + ; CHECK: [[LOAD:%.*]] = call { i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 1) + ; CHECK: [[A7:%.*]] = extractvalue { i64, i64 } [[LOAD]], 1 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 24 + ; CHECK: store i64 [[A7]], ptr [[PTR]] + %a7 = load i64, ptr addrspace(2) @a7, align 8 + %a7.i = getelementptr inbounds nuw i8, ptr %dst, i32 24 + store i64 %a7, ptr %a7.i, align 8 + + ret void +} + +; CHECK-NOT: !hlsl.cbs = +!hlsl.cbs = !{!0} + +!0 = !{ptr @CB.cb, ptr addrspace(2) @a1, ptr addrspace(2) @a2, ptr addrspace(2) @a3, ptr addrspace(2) @a4, ptr addrspace(2) @a5, ptr addrspace(2) @a6, ptr addrspace(2) @a7} 
+!1 = !{i32 0, i32 2} +!2 = !{} diff --git a/llvm/test/CodeGen/DirectX/CBufferAccess/vectors.ll b/llvm/test/CodeGen/DirectX/CBufferAccess/vectors.ll new file mode 100644 index 0000000000000..6addf7482ac37 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/CBufferAccess/vectors.ll @@ -0,0 +1,119 @@ +; RUN: opt -S -dxil-cbuffer-access -mtriple=dxil--shadermodel6.3-library %s | FileCheck %s + +; cbuffer CB { +; float3 a1; // offset 0, size 12 (+4) +; double3 a2; // offset 16, size 24 +; float16_t2 a3; // offset 40, size 4 (+4) +; uint64_t3 a4; // offset 48, size 24 (+8) +; int4 a5; // offset 80, size 16 +; uint16_t3 a6; // offset 96, size 6 (+10) +; }; +%__cblayout_CB = type <{ <3 x float>, <3 x double>, <2 x half>, <3 x i64>, <4 x i32>, <3 x i16> }> + +@CB.cb = local_unnamed_addr global target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 102, 0, 16, 40, 48, 80, 96)) poison +; CHECK: @CB.cb = +; CHECK-NOT: external {{.*}} addrspace(2) global +@a1 = external local_unnamed_addr addrspace(2) global <3 x float>, align 16 +@a2 = external local_unnamed_addr addrspace(2) global <3 x double>, align 32 +@a3 = external local_unnamed_addr addrspace(2) global <2 x half>, align 4 +@a4 = external local_unnamed_addr addrspace(2) global <3 x i64>, align 32 +@a5 = external local_unnamed_addr addrspace(2) global <4 x i32>, align 16 +@a6 = external local_unnamed_addr addrspace(2) global <3 x i16>, align 8 + +; CHECK: define void @f +define void @f(ptr %dst) { +entry: + %CB.cb_h.i.i = tail call target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 102, 0, 16, 40, 48, 80, 96)) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, i1 false) + store target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 102, 0, 16, 40, 48, 80, 96)) %CB.cb_h.i.i, ptr @CB.cb, align 4 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb + ; CHECK: [[LOAD:%.*]] = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", {{.*}})) 
[[CB]], i32 0) + ; CHECK: [[X:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 0 + ; CHECK: [[Y:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 1 + ; CHECK: [[Z:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 2 + ; CHECK: [[VEC0:%.*]] = insertelement <3 x float> poison, float [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <3 x float> [[VEC0]], float [[Y]], i32 1 + ; CHECK: [[VEC2:%.*]] = insertelement <3 x float> [[VEC1]], float [[Z]], i32 2 + ; CHECK: store <3 x float> [[VEC2]], ptr %dst + %a1 = load <3 x float>, ptr addrspace(2) @a1, align 16 + store <3 x float> %a1, ptr %dst, align 4 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb + ; CHECK: [[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 1) + ; CHECK: [[X:%.*]] = extractvalue { double, double } [[LOAD]], 0 + ; CHECK: [[Y:%.*]] = extractvalue { double, double } [[LOAD]], 1 + ; CHECK: [[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 2) + ; CHECK: [[Z:%.*]] = extractvalue { double, double } [[LOAD]], 0 + ; CHECK: [[VEC0:%.*]] = insertelement <3 x double> poison, double [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <3 x double> [[VEC0]], double [[Y]], i32 1 + ; CHECK: [[VEC2:%.*]] = insertelement <3 x double> [[VEC1]], double [[Z]], i32 2 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 16 + ; CHECK: store <3 x double> [[VEC2]], ptr [[PTR]] + %a2 = load <3 x double>, ptr addrspace(2) @a2, align 32 + %a2.i = getelementptr inbounds nuw i8, ptr %dst, i32 16 + store <3 x double> %a2, ptr %a2.i, align 8 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb + ; CHECK: [[LOAD:%.*]] = call { half, half, half, half, half, half, half, half } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 2) + ; CHECK: [[X:%.*]] = 
extractvalue { half, half, half, half, half, half, half, half } [[LOAD]], 4 + ; CHECK: [[Y:%.*]] = extractvalue { half, half, half, half, half, half, half, half } [[LOAD]], 5 + ; CHECK: [[VEC0:%.*]] = insertelement <2 x half> poison, half [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <2 x half> [[VEC0]], half [[Y]], i32 1 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 40 + ; CHECK: store <2 x half> [[VEC1]], ptr [[PTR]] + %a3 = load <2 x half>, ptr addrspace(2) @a3, align 4 + %a3.i = getelementptr inbounds nuw i8, ptr %dst, i32 40 + store <2 x half> %a3, ptr %a3.i, align 2 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb + ; CHECK: [[LOAD:%.*]] = call { i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 3) + ; CHECK: [[X:%.*]] = extractvalue { i64, i64 } [[LOAD]], 0 + ; CHECK: [[Y:%.*]] = extractvalue { i64, i64 } [[LOAD]], 1 + ; CHECK: [[LOAD:%.*]] = call { i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 4) + ; CHECK: [[Z:%.*]] = extractvalue { i64, i64 } [[LOAD]], 0 + ; CHECK: [[VEC0:%.*]] = insertelement <3 x i64> poison, i64 [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <3 x i64> [[VEC0]], i64 [[Y]], i32 1 + ; CHECK: [[VEC2:%.*]] = insertelement <3 x i64> [[VEC1]], i64 [[Z]], i32 2 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 48 + ; CHECK: store <3 x i64> [[VEC2]], ptr [[PTR]] + %a4 = load <3 x i64>, ptr addrspace(2) @a4, align 32 + %a4.i = getelementptr inbounds nuw i8, ptr %dst, i32 48 + store <3 x i64> %a4, ptr %a4.i, align 8 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 5) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0 + ; CHECK: [[Y:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1 + ; 
CHECK: [[Z:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 2 + ; CHECK: [[A:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 3 + ; CHECK: [[VEC0:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <4 x i32> [[VEC0]], i32 [[Y]], i32 1 + ; CHECK: [[VEC2:%.*]] = insertelement <4 x i32> [[VEC1]], i32 [[Z]], i32 2 + ; CHECK: [[VEC3:%.*]] = insertelement <4 x i32> [[VEC2]], i32 [[A]], i32 3 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 72 + ; CHECK: store <4 x i32> [[VEC3]], ptr [[PTR]] + %a5 = load <4 x i32>, ptr addrspace(2) @a5, align 16 + %a5.i = getelementptr inbounds nuw i8, ptr %dst, i32 72 + store <4 x i32> %a5, ptr %a5.i, align 4 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb + ; CHECK: [[LOAD:%.*]] = call { i16, i16, i16, i16, i16, i16, i16, i16 } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 6) + ; CHECK: [[X:%.*]] = extractvalue { i16, i16, i16, i16, i16, i16, i16, i16 } [[LOAD]], 0 + ; CHECK: [[Y:%.*]] = extractvalue { i16, i16, i16, i16, i16, i16, i16, i16 } [[LOAD]], 1 + ; CHECK: [[Z:%.*]] = extractvalue { i16, i16, i16, i16, i16, i16, i16, i16 } [[LOAD]], 2 + ; CHECK: [[VEC0:%.*]] = insertelement <3 x i16> poison, i16 [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <3 x i16> [[VEC0]], i16 [[Y]], i32 1 + ; CHECK: [[VEC2:%.*]] = insertelement <3 x i16> [[VEC1]], i16 [[Z]], i32 2 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 88 + ; CHECK: store <3 x i16> [[VEC2]], ptr [[PTR]] + %a6 = load <3 x i16>, ptr addrspace(2) @a6, align 8 + %a6.i = getelementptr inbounds nuw i8, ptr %dst, i32 88 + store <3 x i16> %a6, ptr %a6.i, align 2 + + ret void +} + +; CHECK-NOT: !hlsl.cbs = +!hlsl.cbs = !{!0} + +!0 = !{ptr @CB.cb, ptr addrspace(2) @a1, ptr addrspace(2) @a2, ptr addrspace(2) @a3, ptr addrspace(2) @a4, ptr addrspace(2) @a5, ptr addrspace(2) @a6} diff --git 
a/llvm/test/CodeGen/DirectX/llc-pipeline.ll b/llvm/test/CodeGen/DirectX/llc-pipeline.ll index ee70cec534bc5..b1bd9f16f4efa 100644 --- a/llvm/test/CodeGen/DirectX/llc-pipeline.ll +++ b/llvm/test/CodeGen/DirectX/llc-pipeline.ll @@ -15,6 +15,7 @@ ; CHECK-NEXT: ModulePass Manager ; CHECK-NEXT: DXIL Finalize Linkage ; CHECK-NEXT: DXIL Intrinsic Expansion +; CHECK-NEXT: DXIL CBuffer Access ; CHECK-NEXT: DXIL Data Scalarization ; CHECK-NEXT: DXIL Array Flattener ; CHECK-NEXT: FunctionPass Manager From 40460a5cf76c973a783fb2f5229e1076398df96e Mon Sep 17 00:00:00 2001 From: Sudharsan Veeravalli Date: Wed, 16 Apr 2025 11:19:13 +0530 Subject: [PATCH 079/710] [RISCV] Add basic ISel patterns for Xqcilo instructions (#135901) This patch adds basic instruction selection patterns for generating the 48 bit load/store instructions that are a part of the Qualcomm uC Xqcilo vendor extension. --- llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td | 27 ++++ llvm/test/CodeGen/RISCV/xqcilo.ll | 143 ++++++++++++++++++++ 2 files changed, 170 insertions(+) create mode 100644 llvm/test/CodeGen/RISCV/xqcilo.ll diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index 2458bda80b1d6..6736b0f1d0328 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -159,6 +159,11 @@ def bare_simm32_lsb0 : Operand { let OperandType = "OPERAND_PCREL"; } +def AddLike: PatFrags<(ops node:$A, node:$B), + [(add node:$A, node:$B), (or node:$A, node:$B)], [{ + return CurDAG->isBaseWithConstantOffset(SDValue(N, 0)); +}]>; + //===----------------------------------------------------------------------===// // Instruction Formats //===----------------------------------------------------------------------===// @@ -1239,6 +1244,14 @@ class PatGprNoX0Simm32NoSimm26 : Pat<(i32 (OpNode (i32 GPRNoX0:$rs1), simm32_nosimm26:$imm)), (Inst GPRNoX0:$rs1, simm32_nosimm26:$imm)>; +class QC48LdPat + : Pat<(i32 (LoadOp (AddLike (i32 
GPR:$rs1), simm26_nosimm12:$imm26))), + (Inst GPR:$rs1, simm26_nosimm12:$imm26)>; + +class QC48StPat + : Pat<(StoreOp (i32 GPR:$rs2), (AddLike (i32 GPR:$rs1), simm26_nosimm12:$imm26)), + (Inst GPR:$rs2, GPR:$rs1, simm26_nosimm12:$imm26)>; + /// Simple arithmetic operations let Predicates = [HasVendorXqcilia, IsRV32] in { @@ -1253,5 +1266,19 @@ def : PatGprNoX0Simm26NoSimm12; def : PatGprNoX0Simm26NoSimm12; } // Predicates = [HasVendorXqcilia, IsRV32] +let Predicates = [HasVendorXqcilo, IsRV32], AddedComplexity = 2 in { + def : QC48LdPat; + def : QC48LdPat; // Prefer unsigned due to no c.lb in Zcb. + def : QC48LdPat; + def : QC48LdPat; + def : QC48LdPat; + def : QC48LdPat; + def : QC48LdPat; + + def : QC48StPat; + def : QC48StPat; + def : QC48StPat; +} // Predicates = [HasVendorXqcilo, IsRV32], AddedComplexity = 2 + let Predicates = [HasVendorXqciint, IsRV32] in def : Pat<(riscv_mileaveret_glue), (QC_C_MILEAVERET)>; diff --git a/llvm/test/CodeGen/RISCV/xqcilo.ll b/llvm/test/CodeGen/RISCV/xqcilo.ll new file mode 100644 index 0000000000000..fb06f21b3ab98 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/xqcilo.ll @@ -0,0 +1,143 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefixes=RV32I +; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcilo -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefixes=RV32IXQCILO + +define i32 @lb_ri(i8* %a) { +; RV32I-LABEL: lb_ri: +; RV32I: # %bb.0: +; RV32I-NEXT: lui a1, 2 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: lb a0, 1808(a0) +; RV32I-NEXT: ret +; +; RV32IXQCILO-LABEL: lb_ri: +; RV32IXQCILO: # %bb.0: +; RV32IXQCILO-NEXT: qc.e.lb a0, 10000(a0) +; RV32IXQCILO-NEXT: ret + %1 = getelementptr i8, i8* %a, i32 10000 + %2 = load i8, i8* %1 + %3 = sext i8 %2 to i32 + ret i32 %3 +} + +define i32 @lbu_ri(i8* %a) { +; RV32I-LABEL: lbu_ri: +; RV32I: # %bb.0: +; RV32I-NEXT: lui a1, 1048574 +; 
RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: lbu a0, 192(a0) +; RV32I-NEXT: ret +; +; RV32IXQCILO-LABEL: lbu_ri: +; RV32IXQCILO: # %bb.0: +; RV32IXQCILO-NEXT: qc.e.lbu a0, -8000(a0) +; RV32IXQCILO-NEXT: ret + %1 = getelementptr i8, i8* %a, i32 -8000 + %2 = load i8, i8* %1 + %3 = zext i8 %2 to i32 + ret i32 %3 +} + +define i32 @lh_ri(i16* %a) { +; RV32I-LABEL: lh_ri: +; RV32I: # %bb.0: +; RV32I-NEXT: lui a1, 11 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: lhu a0, -612(a0) +; RV32I-NEXT: ret +; +; RV32IXQCILO-LABEL: lh_ri: +; RV32IXQCILO: # %bb.0: +; RV32IXQCILO-NEXT: qc.e.lhu a0, 44444(a0) +; RV32IXQCILO-NEXT: ret + %1 = getelementptr i16, i16* %a, i32 22222 + %2 = load i16, i16* %1 + %3 = zext i16 %2 to i32 + ret i32 %3 +} + +define i32 @lhu_ri(i16* %a) { +; RV32I-LABEL: lhu_ri: +; RV32I: # %bb.0: +; RV32I-NEXT: lui a1, 1048570 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: lhu a0, 120(a0) +; RV32I-NEXT: ret +; +; RV32IXQCILO-LABEL: lhu_ri: +; RV32IXQCILO: # %bb.0: +; RV32IXQCILO-NEXT: qc.e.lhu a0, -24456(a0) +; RV32IXQCILO-NEXT: ret + %1 = getelementptr i16, i16* %a, i32 -12228 + %2 = load i16, i16* %1 + %3 = zext i16 %2 to i32 + ret i32 %3 +} + +define i32 @lw_ri(i32* %a) { +; RV32I-LABEL: lw_ri: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a0, a0, 2047 +; RV32I-NEXT: lw a0, 1953(a0) +; RV32I-NEXT: ret +; +; RV32IXQCILO-LABEL: lw_ri: +; RV32IXQCILO: # %bb.0: +; RV32IXQCILO-NEXT: qc.e.lw a0, 4000(a0) +; RV32IXQCILO-NEXT: ret + %1 = getelementptr i32, i32* %a, i32 1000 + %2 = load i32, i32* %1 + ret i32 %2 +} + +define void @sb_ri(i8* %a, i8 %b) { +; RV32I-LABEL: sb_ri: +; RV32I: # %bb.0: +; RV32I-NEXT: lui a2, 2 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: sb a1, 1808(a0) +; RV32I-NEXT: ret +; +; RV32IXQCILO-LABEL: sb_ri: +; RV32IXQCILO: # %bb.0: +; RV32IXQCILO-NEXT: qc.e.sb a1, 10000(a0) +; RV32IXQCILO-NEXT: ret + %1 = getelementptr i8, i8* %a, i32 10000 + store i8 %b, i8* %1 + ret void +} + +define void @sh_ri(i16* %a, i16 %b) { +; RV32I-LABEL: sh_ri: +; RV32I: # %bb.0: +; 
RV32I-NEXT: lui a2, 11 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: sh a1, -612(a0) +; RV32I-NEXT: ret +; +; RV32IXQCILO-LABEL: sh_ri: +; RV32IXQCILO: # %bb.0: +; RV32IXQCILO-NEXT: qc.e.sh a1, 44444(a0) +; RV32IXQCILO-NEXT: ret + %1 = getelementptr i16, i16* %a, i32 22222 + store i16 %b, i16* %1 + ret void +} + +define void @sw_ri(i32* %a, i32 %b) { +; RV32I-LABEL: sw_ri: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a0, a0, 2047 +; RV32I-NEXT: sw a1, 1953(a0) +; RV32I-NEXT: ret +; +; RV32IXQCILO-LABEL: sw_ri: +; RV32IXQCILO: # %bb.0: +; RV32IXQCILO-NEXT: qc.e.sw a1, 4000(a0) +; RV32IXQCILO-NEXT: ret + %1 = getelementptr i32, i32* %a, i32 1000 + store i32 %b, i32* %1 + ret void +} From 123b0e2a1e9de7465be8fb337a80d5d8984f93ae Mon Sep 17 00:00:00 2001 From: Vikram Hegde <115221833+vikramRH@users.noreply.github.com> Date: Wed, 16 Apr 2025 11:28:28 +0530 Subject: [PATCH 080/710] Reapply "[AMDGPU][GlobalISel] Properly handle lane op lowering for larger vector types (#132358)" (#135758) reapply https://github.com/llvm/llvm-project/pull/132358, tests updated. 
--- .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 12 +- .../CodeGen/AMDGPU/llvm.amdgcn.permlane.ll | 1012 +++++++++++++++++ .../CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll | 625 +++++++++- .../AMDGPU/llvm.amdgcn.readfirstlane.ll | 169 ++- .../CodeGen/AMDGPU/llvm.amdgcn.readlane.ll | 168 +++ .../CodeGen/AMDGPU/llvm.amdgcn.writelane.ll | 798 +++++++++++++ 6 files changed, 2750 insertions(+), 34 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 275d0193452a5..5fcbf810abcbd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -5580,6 +5580,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper, return false; LLT PartialResTy = LLT::scalar(SplitSize); + bool NeedsBitcast = false; if (Ty.isVector()) { LLT EltTy = Ty.getElementType(); unsigned EltSize = EltTy.getSizeInBits(); @@ -5588,8 +5589,10 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper, } else if (EltSize == 16 || EltSize == 32) { unsigned NElem = SplitSize / EltSize; PartialResTy = Ty.changeElementCount(ElementCount::getFixed(NElem)); + } else { + // Handle all other cases via S32/S64 pieces + NeedsBitcast = true; } - // Handle all other cases via S32/S64 pieces; } SmallVector PartialRes; @@ -5615,7 +5618,12 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper, PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy)); } - B.buildMergeLikeInstr(DstReg, PartialRes); + if (NeedsBitcast) + B.buildBitcast(DstReg, B.buildMergeLikeInstr( + LLT::scalar(Ty.getSizeInBits()), PartialRes)); + else + B.buildMergeLikeInstr(DstReg, PartialRes); + MI.eraseFromParent(); return true; } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll index db557ff23c085..693e0ebd0280c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll @@ -9398,3 +9398,1015 @@ define void @v_permlanex16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %sr store <8 x i16> %v, ptr addrspace(1) %out ret void } + +define void @v_permlane16_v2i64(ptr addrspace(1) %out, <2 x i64> %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_v2i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v6 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v7 +; GFX10-SDAG-NEXT: v_permlane16_b32 v5, v5, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v4, v4, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v3, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_permlane16_v2i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v6 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v7 +; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v3, v3, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v4, v4, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v5, v5, s4, s5 +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_permlane16_v2i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: 
v_permlane16_v2i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: v_permlane16_v2i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: v_permlane16_v2i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + 
%v = call <2 x i64> @llvm.amdgcn.permlane16.v2i64(<2 x i64> %src0, <2 x i64> %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store <2 x i64> %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlane16_v3i64(ptr addrspace(1) %out, <3 x i64> %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_v3i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v8 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v9 +; GFX10-SDAG-NEXT: v_permlane16_b32 v7, v7, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v6, v6, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v5, v5, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v4, v4, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v3, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_permlane16_v3i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v8 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v9 +; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v3, v3, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v4, v4, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v5, v5, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v6, v6, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v7, v7, s4, s5 +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-GISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_permlane16_v3i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v8 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v9 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX11-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_permlane16_v3i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v8 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v9 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: v_permlane16_v3i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v8 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v9 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX12-SDAG-NEXT: 
v_permlane16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: v_permlane16_v3i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v8 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v9 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <3 x i64> @llvm.amdgcn.permlane16.v3i64(<3 x i64> %src0, <3 x i64> %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store <3 x i64> %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlane16_v4f64(ptr addrspace(1) %out, <4 x double> %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_v4f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v10 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v11 +; GFX10-SDAG-NEXT: v_permlane16_b32 v9, v9, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v8, v8, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v7, v7, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v6, v6, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v5, v5, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v4, v4, s4, s5 +; 
GFX10-SDAG-NEXT: v_permlane16_b32 v3, v3, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_permlane16_v4f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v10 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v11 +; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v3, v3, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v4, v4, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v5, v5, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v6, v6, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v7, v7, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v8, v8, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v9, v9, s4, s5 +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_permlane16_v4f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v10 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v11 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v9, v9, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v8, v8, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX11-GISEL-LABEL: v_permlane16_v4f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v10 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v11 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v8, v8, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v9, v9, s0, s1 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: v_permlane16_v4f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v10 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v11 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: v_permlane16_b32 v9, v9, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v8, v8, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: v_permlane16_v4f64: +; GFX12-GISEL: ; %bb.0: +; 
GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v10 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v11 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v8, v8, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v9, v9, s0, s1 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <4 x double> @llvm.amdgcn.permlane16.v4f64(<4 x double> %src0, <4 x double> %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store <4 x double> %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlane16_v8f64(ptr addrspace(1) %out, <8 x double> %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_v8f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v18 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v19 +; GFX10-SDAG-NEXT: v_permlane16_b32 v17, v17, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v16, v16, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v15, v15, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v14, v14, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v13, v13, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v12, v12, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v11, v11, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v10, v10, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v9, v9, s4, s5 +; GFX10-SDAG-NEXT: 
v_permlane16_b32 v8, v8, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v7, v7, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v6, v6, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v5, v5, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v4, v4, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v3, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:48 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:32 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_permlane16_v8f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v18 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v19 +; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v3, v3, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v4, v4, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v5, v5, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v6, v6, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v7, v7, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v8, v8, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v9, v9, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v10, v10, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v11, v11, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v12, v12, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v13, v13, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v14, v14, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v15, v15, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v16, v16, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v17, v17, s4, s5 +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:32 +; GFX10-GISEL-NEXT: 
global_store_dwordx4 v[0:1], v[14:17], off offset:48 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_permlane16_v8f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v18 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v19 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v17, v17, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v16, v16, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v15, v15, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v14, v14, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v13, v13, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v12, v12, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v11, v11, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v10, v10, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v9, v9, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v8, v8, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: s_clause 0x3 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_permlane16_v8f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v18 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v19 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 
+; GFX11-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v8, v8, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v9, v9, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v10, v10, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v11, v11, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v12, v12, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v13, v13, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v14, v14, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v15, v15, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v16, v16, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v17, v17, s0, s1 +; GFX11-GISEL-NEXT: s_clause 0x3 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: v_permlane16_v8f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v18 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v19 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: v_permlane16_b32 v17, v17, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v16, v16, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v15, v15, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v14, v14, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v13, v13, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v12, v12, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v11, v11, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v10, v10, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v9, v9, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v8, v8, s0, s1 +; 
GFX12-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: s_clause 0x3 +; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: v_permlane16_v8f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v18 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v19 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v8, v8, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v9, v9, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v10, v10, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v11, v11, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v12, v12, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v13, v13, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v14, v14, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v15, v15, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v16, v16, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v17, v17, s0, s1 +; GFX12-GISEL-NEXT: s_clause 0x3 +; GFX12-GISEL-NEXT: global_store_b128 
v[0:1], v[2:5], off +; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <8 x double> @llvm.amdgcn.permlane16.v8f64(<8 x double> %src0, <8 x double> %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store <8 x double> %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlanex16_v2i64(ptr addrspace(1) %out, <2 x i64> %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_v2i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v6 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v7 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v5, v5, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v4, v4, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v3, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_permlanex16_v2i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v6 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v7 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v3, v3, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v4, v4, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v5, v5, s4, s5 +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_permlanex16_v2i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; 
GFX11-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_permlanex16_v2i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: v_permlanex16_v2i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: v_permlanex16_v2i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: 
v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <2 x i64> @llvm.amdgcn.permlanex16.v2i64(<2 x i64> %src0, <2 x i64> %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store <2 x i64> %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlanex16_v3i64(ptr addrspace(1) %out, <3 x i64> %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_v3i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v8 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v9 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v7, v7, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v6, v6, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v5, v5, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v4, v4, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v3, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_permlanex16_v3i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v8 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v9 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v3, v3, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v4, v4, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v5, v5, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v6, v6, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v7, v7, s4, s5 +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-GISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 +; 
GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_permlanex16_v3i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v8 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v9 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_permlanex16_v3i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v8 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v9 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: v_permlanex16_v3i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v8 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, 
v9 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: v_permlanex16_v3i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v8 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v9 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <3 x i64> @llvm.amdgcn.permlanex16.v3i64(<3 x i64> %src0, <3 x i64> %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store <3 x i64> %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlanex16_v4f64(ptr addrspace(1) %out, <4 x double> %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_v4f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v10 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 
s5, v11 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v9, v9, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v8, v8, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v7, v7, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v6, v6, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v5, v5, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v4, v4, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v3, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_permlanex16_v4f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v10 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v11 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v3, v3, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v4, v4, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v5, v5, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v6, v6, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v7, v7, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v8, v8, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v9, v9, s4, s5 +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_permlanex16_v4f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v10 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v11 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v9, v9, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v8, v8, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; 
GFX11-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_permlanex16_v4f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v10 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v11 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v8, v8, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v9, v9, s0, s1 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: v_permlanex16_v4f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v10 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v11 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: v_permlanex16_b32 v9, v9, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v8, v8, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; 
GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: v_permlanex16_v4f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v10 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v11 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v8, v8, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v9, v9, s0, s1 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <4 x double> @llvm.amdgcn.permlanex16.v4f64(<4 x double> %src0, <4 x double> %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store <4 x double> %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlanex16_v8f64(ptr addrspace(1) %out, <8 x double> %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_v8f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v18 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v19 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v17, v17, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v16, v16, s4, s5 +; 
GFX10-SDAG-NEXT: v_permlanex16_b32 v15, v15, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v14, v14, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v13, v13, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v12, v12, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v11, v11, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v10, v10, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v9, v9, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v8, v8, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v7, v7, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v6, v6, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v5, v5, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v4, v4, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v3, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:48 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:32 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_permlanex16_v8f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v18 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v19 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v3, v3, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v4, v4, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v5, v5, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v6, v6, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v7, v7, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v8, v8, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v9, v9, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v10, v10, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v11, v11, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v12, v12, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v13, v13, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v14, 
v14, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v15, v15, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v16, v16, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v17, v17, s4, s5 +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:32 +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:48 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_permlanex16_v8f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v18 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v19 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v17, v17, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v16, v16, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v15, v15, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v14, v14, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v13, v13, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v12, v12, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v11, v11, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v10, v10, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v9, v9, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v8, v8, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: s_clause 0x3 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX11-GISEL-LABEL: v_permlanex16_v8f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v18 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v19 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v8, v8, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v9, v9, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v10, v10, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v11, v11, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v12, v12, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v13, v13, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v14, v14, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v15, v15, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v16, v16, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v17, v17, s0, s1 +; GFX11-GISEL-NEXT: s_clause 0x3 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: v_permlanex16_v8f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v18 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v19 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: v_permlanex16_b32 v17, v17, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 
v16, v16, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v15, v15, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v14, v14, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v13, v13, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v12, v12, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v11, v11, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v10, v10, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v9, v9, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v8, v8, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: s_clause 0x3 +; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: v_permlanex16_v8f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v18 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v19 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v8, v8, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v9, v9, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v10, 
v10, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v11, v11, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v12, v12, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v13, v13, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v14, v14, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v15, v15, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v16, v16, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v17, v17, s0, s1 +; GFX12-GISEL-NEXT: s_clause 0x3 +; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <8 x double> @llvm.amdgcn.permlanex16.v8f64(<8 x double> %src0, <8 x double> %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store <8 x double> %v, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll index f23f9595446eb..6698d360aff4c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-SDAG %s +; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-GISEL %s declare i32 @llvm.amdgcn.permlane64(i32) declare i32 @llvm.amdgcn.workitem.id.x() -define amdgpu_kernel void @test_s(ptr addrspace(1) %out, i32 %src0) { +define amdgpu_kernel 
void @test_s_i32(ptr addrspace(1) %out, i32 %src0) { ; GFX11-LABEL: test_s: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 @@ -17,12 +17,93 @@ define amdgpu_kernel void @test_s(ptr addrspace(1) %out, i32 %src0) { ; GFX11-NEXT: v_permlane64_b32 v0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane64(i32 %src0) +; GFX11-SDAG-LABEL: test_s_i32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 +; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_s_i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlane64.i32(i32 %src0) store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @test_i(ptr addrspace(1) %out) { +define amdgpu_kernel void @test_s_i64(ptr addrspace(1) %out, i64 %src0) { +; GFX11-SDAG-LABEL: test_s_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v0 +; GFX11-SDAG-NEXT: v_permlane64_b32 
v0, v2 +; GFX11-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_s_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 +; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_endpgm + %v = call i64 @llvm.amdgcn.permlane64.i64(i64 %src0) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_s_f64(ptr addrspace(1) %out, double %src0) { +; GFX11-SDAG-LABEL: test_s_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v0 +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v2 +; GFX11-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_s_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 +; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_endpgm + %v = call double @llvm.amdgcn.permlane64.f64(double %src0) + store double %v, ptr addrspace(1) %out + ret void +} + +define 
amdgpu_kernel void @test_i_i32(ptr addrspace(1) %out) { ; GFX11-LABEL: test_i: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 @@ -32,12 +113,115 @@ define amdgpu_kernel void @test_i(ptr addrspace(1) %out) { ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane64(i32 99) +; GFX11-SDAG-LABEL: test_i_i32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0x63 :: v_dual_mov_b32 v1, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_i_i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, 0x63 :: v_dual_mov_b32 v1, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlane64.i32(i32 99) store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @test_v(ptr addrspace(1) %out, i32 %src0) #1 { +define amdgpu_kernel void @test_i_f32(ptr addrspace(1) %out) { +; GFX11-SDAG-LABEL: test_i_f32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0x449a5000 :: v_dual_mov_b32 v1, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_i_f32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, 0x449a5000 :: v_dual_mov_b32 
v1, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_endpgm + %v = call float @llvm.amdgcn.permlane64.f32(float 1234.5) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_i_i64(ptr addrspace(1) %out) { +; GFX11-SDAG-LABEL: test_i_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x63 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v2 +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_i_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x63 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 +; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v2 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_endpgm + %v = call i64 @llvm.amdgcn.permlane64.i64(i64 99) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_i_f64(ptr addrspace(1) %out) { +; GFX11-SDAG-LABEL: test_i_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x40934a00 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v0 +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v2 +; GFX11-SDAG-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_i_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x40934a00 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v2 +; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v1 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_endpgm + %v = call double @llvm.amdgcn.permlane64.f64(double 1234.5) + store double %v, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_kernel void @test_v_i32(ptr addrspace(1) %out, i32 %src0) #1 { ; GFX11-LABEL: test_v: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 @@ -47,11 +231,430 @@ define amdgpu_kernel void @test_v(ptr addrspace(1) %out, i32 %src0) #1 { ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm +; GFX11-SDAG-LABEL: test_v_i32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_v_i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %v = call i32 @llvm.amdgcn.permlane64(i32 %tidx) + %v = 
call i32 @llvm.amdgcn.permlane64.i32(i32 %tidx) store i32 %v, ptr addrspace(1) %out ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX11-GISEL: {{.*}} -; GFX11-SDAG: {{.*}} + +define amdgpu_kernel void @test_v_f32(ptr addrspace(1) %out, float %src0) #1 { +; GFX11-SDAG-LABEL: test_v_f32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_v_f32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %v = call float @llvm.amdgcn.permlane64.f32(float %tidx_f32) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_v_i64(ptr addrspace(1) %out, i64 %src0) #1 { +; GFX11-SDAG-LABEL: test_v_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v2 +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_v_i64: +; GFX11-GISEL: ; %bb.0: +; 
GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 +; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v2 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_i64 = zext i32 %tidx to i64 + %v = call i64 @llvm.amdgcn.permlane64.i64(i64 %tidx_i64) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_v_f64(ptr addrspace(1) %out, double %src0) #1 { +; GFX11-SDAG-LABEL: test_v_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_v_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v1 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() 
+ %tidx_f32 = bitcast i32 %tidx to float + %tidx_f64 = fpext float %tidx_f32 to double + %v = call double @llvm.amdgcn.permlane64.f64(double %tidx_f64) + store double %v, ptr addrspace(1) %out + ret void +} + +define void @test_half(ptr addrspace(1) %out, half %src0) { +; GFX11-SDAG-LABEL: test_half: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 +; GFX11-SDAG-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_half: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 +; GFX11-GISEL-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call half @llvm.amdgcn.permlane64.f16(half %src0) + store half %v, ptr addrspace(1) %out + ret void +} + +define void @test_bfloat(ptr addrspace(1) %out, bfloat %src0) { +; GFX11-SDAG-LABEL: test_bfloat: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 +; GFX11-SDAG-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_bfloat: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 +; GFX11-GISEL-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call bfloat @llvm.amdgcn.permlane64.bf16(bfloat %src0) + store bfloat %v, ptr addrspace(1) %out + ret void +} + +define void @test_i16(ptr addrspace(1) %out, i16 %src0) { +; GFX11-SDAG-LABEL: test_i16: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 +; GFX11-SDAG-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_i16: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 +; GFX11-GISEL-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call i16 @llvm.amdgcn.permlane64.i16(i16 %src0) + store i16 %v, ptr addrspace(1) %out + ret void +} + +define void @test_v2f16(ptr addrspace(1) %out, <2 x half> %src0) { +; GFX11-SDAG-LABEL: test_v2f16: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 +; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_v2f16: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 +; GFX11-GISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <2 x half> @llvm.amdgcn.permlane64.v2f16(<2 x half> %src0) + store <2 x half> %v, ptr addrspace(1) %out + ret void +} + +define void @test_v2f32(ptr addrspace(1) %out, <2 x float> %src0) { +; GFX11-SDAG-LABEL: test_v2f32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3 +; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 +; GFX11-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_v2f32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 +; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3 +; GFX11-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <2 x float> @llvm.amdgcn.permlane64.v2f32(<2 x float> %src0) + store <2 x float> %v, ptr addrspace(1) %out + ret void +} + +define void @test_v7i32(ptr addrspace(1) %out, <7 x i32> %src0) { +; GFX11-SDAG-LABEL: test_v7i32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_permlane64_b32 v8, v8 +; GFX11-SDAG-NEXT: v_permlane64_b32 v7, v7 +; GFX11-SDAG-NEXT: v_permlane64_b32 v6, v6 +; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5 +; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v4 +; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3 +; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_v7i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 +; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3 +; GFX11-GISEL-NEXT: v_permlane64_b32 v4, v4 +; GFX11-GISEL-NEXT: v_permlane64_b32 v5, v5 +; GFX11-GISEL-NEXT: v_permlane64_b32 v6, v6 +; GFX11-GISEL-NEXT: v_permlane64_b32 v7, v7 +; GFX11-GISEL-NEXT: v_permlane64_b32 v8, v8 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <7 x i32> @llvm.amdgcn.permlane64.v7i32(<7 x i32> %src0) + store <7 x i32> %v, ptr addrspace(1) %out + ret void +} + +define void @test_v8i16(ptr addrspace(1) %out, <8 x i16> %src0) { +; GFX11-SDAG-LABEL: test_v8i16: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5 +; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v4 +; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3 +; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_v8i16: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 +; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3 +; GFX11-GISEL-NEXT: 
v_permlane64_b32 v4, v4 +; GFX11-GISEL-NEXT: v_permlane64_b32 v5, v5 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <8 x i16> @llvm.amdgcn.permlane64.v8i16(<8 x i16> %src0) + store <8 x i16> %v, ptr addrspace(1) %out + ret void +} + +define void @test_v2i64(ptr addrspace(1) %out, <2 x i64> %src0) { +; GFX11-SDAG-LABEL: test_v2i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5 +; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v4 +; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3 +; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_v2i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 +; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3 +; GFX11-GISEL-NEXT: v_permlane64_b32 v4, v4 +; GFX11-GISEL-NEXT: v_permlane64_b32 v5, v5 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <2 x i64> @llvm.amdgcn.permlane64.v2i64(<2 x i64> %src0) + store <2 x i64> %v, ptr addrspace(1) %out + ret void +} + +define void @test_v3i64(ptr addrspace(1) %out, <3 x i64> %src0) { +; GFX11-SDAG-LABEL: test_v3i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_permlane64_b32 v7, v7 +; GFX11-SDAG-NEXT: v_permlane64_b32 v6, v6 +; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5 +; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v4 +; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3 +; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_v3i64: +; GFX11-GISEL: ; %bb.0: 
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 +; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3 +; GFX11-GISEL-NEXT: v_permlane64_b32 v4, v4 +; GFX11-GISEL-NEXT: v_permlane64_b32 v5, v5 +; GFX11-GISEL-NEXT: v_permlane64_b32 v6, v6 +; GFX11-GISEL-NEXT: v_permlane64_b32 v7, v7 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <3 x i64> @llvm.amdgcn.permlane64.v3i64(<3 x i64> %src0) + store <3 x i64> %v, ptr addrspace(1) %out + ret void +} + +define void @test_v4f64(ptr addrspace(1) %out, <4 x double> %src0) { +; GFX11-SDAG-LABEL: test_v4f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_permlane64_b32 v9, v9 +; GFX11-SDAG-NEXT: v_permlane64_b32 v8, v8 +; GFX11-SDAG-NEXT: v_permlane64_b32 v7, v7 +; GFX11-SDAG-NEXT: v_permlane64_b32 v6, v6 +; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5 +; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v4 +; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3 +; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_v4f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 +; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3 +; GFX11-GISEL-NEXT: v_permlane64_b32 v4, v4 +; GFX11-GISEL-NEXT: v_permlane64_b32 v5, v5 +; GFX11-GISEL-NEXT: v_permlane64_b32 v6, v6 +; GFX11-GISEL-NEXT: v_permlane64_b32 v7, v7 +; GFX11-GISEL-NEXT: v_permlane64_b32 v8, v8 +; GFX11-GISEL-NEXT: v_permlane64_b32 v9, v9 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: 
global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <4 x double> @llvm.amdgcn.permlane64.v4f64(<4 x double> %src0) + store <4 x double> %v, ptr addrspace(1) %out + ret void +} + +define void @test_v8f64(ptr addrspace(1) %out, <8 x double> %src0) { +; GFX11-SDAG-LABEL: test_v8f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_permlane64_b32 v17, v17 +; GFX11-SDAG-NEXT: v_permlane64_b32 v16, v16 +; GFX11-SDAG-NEXT: v_permlane64_b32 v15, v15 +; GFX11-SDAG-NEXT: v_permlane64_b32 v14, v14 +; GFX11-SDAG-NEXT: v_permlane64_b32 v13, v13 +; GFX11-SDAG-NEXT: v_permlane64_b32 v12, v12 +; GFX11-SDAG-NEXT: v_permlane64_b32 v11, v11 +; GFX11-SDAG-NEXT: v_permlane64_b32 v10, v10 +; GFX11-SDAG-NEXT: v_permlane64_b32 v9, v9 +; GFX11-SDAG-NEXT: v_permlane64_b32 v8, v8 +; GFX11-SDAG-NEXT: v_permlane64_b32 v7, v7 +; GFX11-SDAG-NEXT: v_permlane64_b32 v6, v6 +; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5 +; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v4 +; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3 +; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 +; GFX11-SDAG-NEXT: s_clause 0x3 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_v8f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 +; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3 +; GFX11-GISEL-NEXT: v_permlane64_b32 v4, v4 +; GFX11-GISEL-NEXT: v_permlane64_b32 v5, v5 +; GFX11-GISEL-NEXT: v_permlane64_b32 v6, v6 +; GFX11-GISEL-NEXT: v_permlane64_b32 v7, v7 +; GFX11-GISEL-NEXT: v_permlane64_b32 v8, v8 +; GFX11-GISEL-NEXT: v_permlane64_b32 v9, v9 +; GFX11-GISEL-NEXT: v_permlane64_b32 
v10, v10 +; GFX11-GISEL-NEXT: v_permlane64_b32 v11, v11 +; GFX11-GISEL-NEXT: v_permlane64_b32 v12, v12 +; GFX11-GISEL-NEXT: v_permlane64_b32 v13, v13 +; GFX11-GISEL-NEXT: v_permlane64_b32 v14, v14 +; GFX11-GISEL-NEXT: v_permlane64_b32 v15, v15 +; GFX11-GISEL-NEXT: v_permlane64_b32 v16, v16 +; GFX11-GISEL-NEXT: v_permlane64_b32 v17, v17 +; GFX11-GISEL-NEXT: s_clause 0x3 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <8 x double> @llvm.amdgcn.permlane64.v8f64(<8 x double> %src0) + store <8 x double> %v, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll index cc9e34be209b4..c0afc0a443955 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll @@ -163,30 +163,157 @@ define void @test_readfirstlane_i64(ptr addrspace(1) %out, i64 %src) { ret void } -; FIXME: Broken -; define void @test_readfirstlane_v2i64(ptr addrspace(1) %out, <2 x i64> %src) { -; %x = call <2 x i64> @llvm.amdgcn.readfirstlane.v2i64(<2 x i64> %src) -; call void asm sideeffect "; use $0", "s"(<2 x i64> %x) -; ret void -; } +define void @test_readfirstlane_v2i64(ptr addrspace(1) %out, <2 x i64> %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_v2i64: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:7] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; 
CHECK-GISEL-LABEL: test_readfirstlane_v2i64: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[4:7] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call <2 x i64> @llvm.amdgcn.readfirstlane.v2i64(<2 x i64> %src) + call void asm sideeffect "; use $0", "s"(<2 x i64> %x) + ret void +} -; define void @test_readfirstlane_v3i64(ptr addrspace(1) %out, <3 x i64> %src) { -; %x = call <3 x i64> @llvm.amdgcn.readfirstlane.v3i64(<3 x i64> %src) -; call void asm sideeffect "; use $0", "s"(<3 x i64> %x) -; ret void -; } +define void @test_readfirstlane_v3i64(ptr addrspace(1) %out, <3 x i64> %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_v3i64: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:9] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readfirstlane_v3i64: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[4:9] +; CHECK-GISEL-NEXT: ;;#ASMEND +; 
CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call <3 x i64> @llvm.amdgcn.readfirstlane.v3i64(<3 x i64> %src) + call void asm sideeffect "; use $0", "s"(<3 x i64> %x) + ret void +} -; define void @test_readfirstlane_v4i64(ptr addrspace(1) %out, <4 x i64> %src) { -; %x = call <4 x i64> @llvm.amdgcn.readfirstlane.v4i64(<4 x i64> %src) -; call void asm sideeffect "; use $0", "s"(<4 x i64> %x) -; ret void -; } +define void @test_readfirstlane_v4i64(ptr addrspace(1) %out, <4 x i64> %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_v4i64: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s11, v9 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s10, v8 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:11] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readfirstlane_v4i64: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v8 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s11, v9 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[4:11] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call <4 x i64> @llvm.amdgcn.readfirstlane.v4i64(<4 x i64> %src) + call void asm sideeffect "; use $0", "s"(<4 x i64> %x) + ret void +} -; define void @test_readfirstlane_v8i64(ptr addrspace(1) 
%out, <8 x i64> %src) { -; %x = call <8 x i64> @llvm.amdgcn.readfirstlane.v8i64(<8 x i64> %src) -; call void asm sideeffect "; use $0", "s"(<8 x i64> %x) -; ret void -; } +define void @test_readfirstlane_v8i64(ptr addrspace(1) %out, <8 x i64> %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_v8i64: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s19, v17 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s18, v16 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s17, v15 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s16, v14 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s15, v13 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s14, v12 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s13, v11 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s12, v10 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s11, v9 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s10, v8 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:19] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readfirstlane_v8i64: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v8 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s11, v9 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s12, v10 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s13, v11 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s14, v12 +; CHECK-GISEL-NEXT: 
v_readfirstlane_b32 s15, v13 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s16, v14 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s17, v15 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s18, v16 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s19, v17 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[4:19] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call <8 x i64> @llvm.amdgcn.readfirstlane.v8i64(<8 x i64> %src) + call void asm sideeffect "; use $0", "s"(<8 x i64> %x) + ret void +} define void @test_readfirstlane_f64(ptr addrspace(1) %out, double %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_f64: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll index f2b0959cc706e..42aab1878efa6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll @@ -972,6 +972,174 @@ define void @test_readlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %src ret void } +define void @test_readlane_v2i64(ptr addrspace(1) %out, <2 x i64> %src, i32 %src1) { +; CHECK-SDAG-LABEL: test_readlane_v2i64: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v6 +; CHECK-SDAG-NEXT: s_nop 3 +; CHECK-SDAG-NEXT: v_readlane_b32 s7, v5, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:7] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readlane_v2i64: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v6 +; CHECK-GISEL-NEXT: s_nop 3 +; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s7 +; CHECK-GISEL-NEXT: v_readlane_b32 s5, v3, s7 +; CHECK-GISEL-NEXT: v_readlane_b32 s6, v4, s7 +; CHECK-GISEL-NEXT: 
v_readlane_b32 s7, v5, s7 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[4:7] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call <2 x i64> @llvm.amdgcn.readlane.v2i64(<2 x i64> %src, i32 %src1) + call void asm sideeffect "; use $0", "s"(<2 x i64> %x) + ret void +} + +define void @test_readlane_v3i64(ptr addrspace(1) %out, <3 x i64> %src, i32 %src1) { +; CHECK-SDAG-LABEL: test_readlane_v3i64: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v8 +; CHECK-SDAG-NEXT: s_nop 3 +; CHECK-SDAG-NEXT: v_readlane_b32 s9, v7, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s8, v6, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s7, v5, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:9] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readlane_v3i64: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v8 +; CHECK-GISEL-NEXT: s_nop 3 +; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s9 +; CHECK-GISEL-NEXT: v_readlane_b32 s5, v3, s9 +; CHECK-GISEL-NEXT: v_readlane_b32 s6, v4, s9 +; CHECK-GISEL-NEXT: v_readlane_b32 s7, v5, s9 +; CHECK-GISEL-NEXT: v_readlane_b32 s8, v6, s9 +; CHECK-GISEL-NEXT: v_readlane_b32 s9, v7, s9 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[4:9] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call <3 x i64> @llvm.amdgcn.readlane.v3i64(<3 x i64> %src, i32 %src1) + call void asm sideeffect "; use $0", "s"(<3 x i64> %x) + ret void +} + +define void @test_readlane_v4f64(ptr addrspace(1) %out, <4 x double> %src, i32 %src1) { +; CHECK-SDAG-LABEL: test_readlane_v4f64: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v10 +; CHECK-SDAG-NEXT: s_nop 3 +; CHECK-SDAG-NEXT: v_readlane_b32 s11, v9, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s10, v8, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s9, v7, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s8, v6, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s7, v5, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:11] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readlane_v4f64: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s11, v10 +; CHECK-GISEL-NEXT: s_nop 3 +; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s11 +; CHECK-GISEL-NEXT: v_readlane_b32 s5, v3, s11 +; CHECK-GISEL-NEXT: v_readlane_b32 s6, v4, s11 +; CHECK-GISEL-NEXT: v_readlane_b32 s7, v5, s11 +; CHECK-GISEL-NEXT: v_readlane_b32 s8, v6, s11 +; CHECK-GISEL-NEXT: v_readlane_b32 s9, v7, s11 +; CHECK-GISEL-NEXT: v_readlane_b32 s10, v8, s11 +; CHECK-GISEL-NEXT: v_readlane_b32 s11, v9, s11 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[4:11] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call <4 x double> @llvm.amdgcn.readlane.v4f64(<4 x double> %src, i32 %src1) + call void asm sideeffect "; use $0", "s"(<4 x double> %x) + ret void +} + +define void @test_readlane_v8f64(ptr addrspace(1) %out, <8 x double> %src, i32 %src1) { +; CHECK-SDAG-LABEL: test_readlane_v8f64: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v18 +; CHECK-SDAG-NEXT: s_nop 3 +; CHECK-SDAG-NEXT: v_readlane_b32 s19, v17, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s18, v16, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s17, v15, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s16, v14, s4 +; CHECK-SDAG-NEXT: 
v_readlane_b32 s15, v13, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s14, v12, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s13, v11, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s12, v10, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s11, v9, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s10, v8, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s9, v7, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s8, v6, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s7, v5, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:19] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readlane_v8f64: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s19, v18 +; CHECK-GISEL-NEXT: s_nop 3 +; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s19 +; CHECK-GISEL-NEXT: v_readlane_b32 s5, v3, s19 +; CHECK-GISEL-NEXT: v_readlane_b32 s6, v4, s19 +; CHECK-GISEL-NEXT: v_readlane_b32 s7, v5, s19 +; CHECK-GISEL-NEXT: v_readlane_b32 s8, v6, s19 +; CHECK-GISEL-NEXT: v_readlane_b32 s9, v7, s19 +; CHECK-GISEL-NEXT: v_readlane_b32 s10, v8, s19 +; CHECK-GISEL-NEXT: v_readlane_b32 s11, v9, s19 +; CHECK-GISEL-NEXT: v_readlane_b32 s12, v10, s19 +; CHECK-GISEL-NEXT: v_readlane_b32 s13, v11, s19 +; CHECK-GISEL-NEXT: v_readlane_b32 s14, v12, s19 +; CHECK-GISEL-NEXT: v_readlane_b32 s15, v13, s19 +; CHECK-GISEL-NEXT: v_readlane_b32 s16, v14, s19 +; CHECK-GISEL-NEXT: v_readlane_b32 s17, v15, s19 +; CHECK-GISEL-NEXT: v_readlane_b32 s18, v16, s19 +; CHECK-GISEL-NEXT: v_readlane_b32 s19, v17, s19 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[4:19] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call <8 x double> @llvm.amdgcn.readlane.v4f64(<8 x double> %src, i32 %src1) + call void asm sideeffect "; use $0", "s"(<8 x double> %x) + ret void +} + declare i32 
@llvm.amdgcn.workitem.id.x() #2 attributes #0 = { nounwind readnone convergent } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll index 4ac2cc98970b5..8cf7497fca640 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll @@ -2801,6 +2801,804 @@ define void @test_writelane_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %sr ret void } +define void @test_writelane_v2i64(ptr addrspace(1) %out, <2 x i64> %src, i32 %src1) { +; GFX802-SDAG-LABEL: test_writelane_v2i64: +; GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-SDAG-NEXT: flat_load_dwordx4 v[7:10], v[0:1] +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v6 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v5 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v4 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s8, v2 +; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: v_writelane_b32 v10, s5, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v9, s6, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v8, s7, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v7, s8, m0 +; GFX802-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[7:10] +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-SDAG-LABEL: test_writelane_v2i64: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-SDAG-NEXT: global_load_dwordx4 v[7:10], v[0:1], off +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v5 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v6 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s6, v4 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s7, v3 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s8, v2 +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1010-SDAG-NEXT: v_writelane_b32 v10, s4, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v9, s6, s5 +; GFX1010-SDAG-NEXT: 
v_writelane_b32 v8, s7, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v7, s8, s5 +; GFX1010-SDAG-NEXT: global_store_dwordx4 v[0:1], v[7:10], off +; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: test_writelane_v2i64: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: global_load_b128 v[7:10], v[0:1], off +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v5 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v6 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v4 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: v_writelane_b32 v10, s0, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v9, s2, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v8, s3, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v7, s4, s1 +; GFX1100-SDAG-NEXT: global_store_b128 v[0:1], v[7:10], off +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX802-GISEL-LABEL: test_writelane_v2i64: +; GFX802-GISEL: ; %bb.0: +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-GISEL-NEXT: flat_load_dwordx4 v[7:10], v[0:1] +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v6 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s7, v4 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s8, v5 +; GFX802-GISEL-NEXT: s_mov_b32 m0, s5 +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX802-GISEL-NEXT: v_writelane_b32 v7, s4, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v8, s6, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v9, s7, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v10, s8, m0 +; GFX802-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[7:10] +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-GISEL-LABEL: test_writelane_v2i64: +; GFX1010-GISEL: ; %bb.0: +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-GISEL-NEXT: 
global_load_dwordx4 v[7:10], v[0:1], off +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v6 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s7, v4 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s8, v5 +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-GISEL-NEXT: v_writelane_b32 v7, s4, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v8, s6, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v9, s7, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v10, s8, s5 +; GFX1010-GISEL-NEXT: global_store_dwordx4 v[0:1], v[7:10], off +; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: test_writelane_v2i64: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: global_load_b128 v[7:10], v[0:1], off +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v6 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s4, v5 +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1100-GISEL-NEXT: v_writelane_b32 v7, s0, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v8, s2, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v9, s3, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v10, s4, s1 +; GFX1100-GISEL-NEXT: global_store_b128 v[0:1], v[7:10], off +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] + %oldval = load <2 x i64>, ptr addrspace(1) %out + %writelane = call <2 x i64> @llvm.amdgcn.writelane.v2i64(<2 x i64> %src, i32 %src1, <2 x i64> %oldval) + store <2 x i64> %writelane, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_writelane_v3i64(ptr addrspace(1) %out, <3 x i64> %src, i32 %src1) { +; GFX802-SDAG-LABEL: test_writelane_v3i64: +; GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-SDAG-NEXT: v_add_u32_e32 v13, vcc, 16, v0 +; GFX802-SDAG-NEXT: flat_load_dwordx4 v[9:12], v[0:1] 
+; GFX802-SDAG-NEXT: v_addc_u32_e32 v14, vcc, 0, v1, vcc +; GFX802-SDAG-NEXT: flat_load_dwordx2 v[15:16], v[13:14] +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v8 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v5 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s8, v4 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s9, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s10, v2 +; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v7 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v6 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX802-SDAG-NEXT: v_writelane_b32 v12, s7, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v11, s8, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v10, s9, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v9, s10, m0 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: v_writelane_b32 v16, s5, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v15, s6, m0 +; GFX802-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[9:12] +; GFX802-SDAG-NEXT: flat_store_dwordx2 v[13:14], v[15:16] +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-SDAG-LABEL: test_writelane_v3i64: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-SDAG-NEXT: s_clause 0x1 +; GFX1010-SDAG-NEXT: global_load_dwordx2 v[13:14], v[0:1], off offset:16 +; GFX1010-SDAG-NEXT: global_load_dwordx4 v[9:12], v[0:1], off +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v8 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s7, v5 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s8, v4 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s9, v3 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s10, v2 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v7 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s6, v6 +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX1010-SDAG-NEXT: v_writelane_b32 v14, s4, s5 +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1010-SDAG-NEXT: v_writelane_b32 v12, s7, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v11, s8, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v10, s9, s5 +; 
GFX1010-SDAG-NEXT: v_writelane_b32 v9, s10, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v13, s6, s5 +; GFX1010-SDAG-NEXT: global_store_dwordx4 v[0:1], v[9:12], off +; GFX1010-SDAG-NEXT: global_store_dwordx2 v[0:1], v[13:14], off offset:16 +; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: test_writelane_v3i64: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_clause 0x1 +; GFX1100-SDAG-NEXT: global_load_b64 v[13:14], v[0:1], off offset:16 +; GFX1100-SDAG-NEXT: global_load_b128 v[9:12], v[0:1], off +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v8 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v5 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s4, v4 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s6, v2 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v7 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v6 +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX1100-SDAG-NEXT: v_writelane_b32 v14, s0, s1 +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: v_writelane_b32 v12, s3, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v11, s4, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v10, s5, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v9, s6, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v13, s2, s1 +; GFX1100-SDAG-NEXT: s_clause 0x1 +; GFX1100-SDAG-NEXT: global_store_b128 v[0:1], v[9:12], off +; GFX1100-SDAG-NEXT: global_store_b64 v[0:1], v[13:14], off offset:16 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX802-GISEL-LABEL: test_writelane_v3i64: +; GFX802-GISEL: ; %bb.0: +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-GISEL-NEXT: v_add_u32_e32 v17, vcc, 16, v0 +; GFX802-GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v1, vcc +; GFX802-GISEL-NEXT: flat_load_dwordx4 v[9:12], v[0:1] +; GFX802-GISEL-NEXT: flat_load_dwordx4 v[13:16], v[17:18] +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v8 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX802-GISEL-NEXT: 
v_readfirstlane_b32 s6, v3 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s7, v4 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s8, v5 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s9, v6 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s10, v7 +; GFX802-GISEL-NEXT: s_mov_b32 m0, s5 +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(1) +; GFX802-GISEL-NEXT: v_writelane_b32 v9, s4, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v10, s6, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v11, s7, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v12, s8, m0 +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX802-GISEL-NEXT: v_writelane_b32 v13, s9, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v14, s10, m0 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, v13 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, v14 +; GFX802-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[9:12] +; GFX802-GISEL-NEXT: flat_store_dwordx2 v[17:18], v[2:3] +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-GISEL-LABEL: test_writelane_v3i64: +; GFX1010-GISEL: ; %bb.0: +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-GISEL-NEXT: s_clause 0x1 +; GFX1010-GISEL-NEXT: global_load_dwordx4 v[9:12], v[0:1], off +; GFX1010-GISEL-NEXT: global_load_dwordx4 v[13:16], v[0:1], off offset:16 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v8 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s7, v6 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s8, v7 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s9, v4 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s10, v5 +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(1) +; GFX1010-GISEL-NEXT: v_writelane_b32 v9, s4, s5 +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-GISEL-NEXT: v_writelane_b32 v13, s7, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v14, s8, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v10, s6, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v11, s9, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v12, s10, s5 +; 
GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, v13 +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v3, v14 +; GFX1010-GISEL-NEXT: global_store_dwordx4 v[0:1], v[9:12], off +; GFX1010-GISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off offset:16 +; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: test_writelane_v3i64: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_clause 0x1 +; GFX1100-GISEL-NEXT: global_load_b128 v[9:12], v[0:1], off +; GFX1100-GISEL-NEXT: global_load_b128 v[13:16], v[0:1], off offset:16 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v8 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s3, v6 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s4, v7 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s5, v4 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s6, v5 +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(1) +; GFX1100-GISEL-NEXT: v_writelane_b32 v9, s0, s1 +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1100-GISEL-NEXT: v_writelane_b32 v13, s3, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v14, s4, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v10, s2, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v11, s5, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v12, s6, s1 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, v13 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v3, v14 +; GFX1100-GISEL-NEXT: s_clause 0x1 +; GFX1100-GISEL-NEXT: global_store_b128 v[0:1], v[9:12], off +; GFX1100-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off offset:16 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] + %oldval = load <3 x i64>, ptr addrspace(1) %out + %writelane = call <3 x i64> @llvm.amdgcn.writelane.v2i64(<3 x i64> %src, i32 %src1, <3 x i64> %oldval) + store <3 x i64> %writelane, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_writelane_v4f64(ptr addrspace(1) %out, <4 x double> %src, i32 %src1) { +; GFX802-SDAG-LABEL: test_writelane_v4f64: +; 
GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-SDAG-NEXT: v_add_u32_e32 v19, vcc, 16, v0 +; GFX802-SDAG-NEXT: flat_load_dwordx4 v[11:14], v[0:1] +; GFX802-SDAG-NEXT: v_addc_u32_e32 v20, vcc, 0, v1, vcc +; GFX802-SDAG-NEXT: flat_load_dwordx4 v[15:18], v[19:20] +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v10 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s9, v5 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s10, v4 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s11, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s12, v2 +; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v9 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v8 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v7 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s8, v6 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX802-SDAG-NEXT: v_writelane_b32 v14, s9, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v13, s10, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v12, s11, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v11, s12, m0 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: v_writelane_b32 v18, s5, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v17, s6, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v16, s7, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v15, s8, m0 +; GFX802-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[11:14] +; GFX802-SDAG-NEXT: flat_store_dwordx4 v[19:20], v[15:18] +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-SDAG-LABEL: test_writelane_v4f64: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-SDAG-NEXT: s_clause 0x1 +; GFX1010-SDAG-NEXT: global_load_dwordx4 v[11:14], v[0:1], off offset:16 +; GFX1010-SDAG-NEXT: global_load_dwordx4 v[15:18], v[0:1], off +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v10 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s9, v5 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s10, v4 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s11, v3 +; GFX1010-SDAG-NEXT: 
v_readfirstlane_b32 s12, v2 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v9 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s6, v8 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s7, v7 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s8, v6 +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX1010-SDAG-NEXT: v_writelane_b32 v14, s4, s5 +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1010-SDAG-NEXT: v_writelane_b32 v18, s9, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v17, s10, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v16, s11, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v15, s12, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v13, s6, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v12, s7, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v11, s8, s5 +; GFX1010-SDAG-NEXT: global_store_dwordx4 v[0:1], v[15:18], off +; GFX1010-SDAG-NEXT: global_store_dwordx4 v[0:1], v[11:14], off offset:16 +; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: test_writelane_v4f64: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_clause 0x1 +; GFX1100-SDAG-NEXT: global_load_b128 v[11:14], v[0:1], off offset:16 +; GFX1100-SDAG-NEXT: global_load_b128 v[15:18], v[0:1], off +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s5, v5 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s6, v4 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s7, v3 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s8, v2 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v9 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v8 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v7 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s4, v6 +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX1100-SDAG-NEXT: v_writelane_b32 v14, s0, s1 +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: v_writelane_b32 v18, s5, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v17, s6, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v16, s7, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v15, s8, s1 +; GFX1100-SDAG-NEXT: 
v_writelane_b32 v13, s2, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v12, s3, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v11, s4, s1 +; GFX1100-SDAG-NEXT: s_clause 0x1 +; GFX1100-SDAG-NEXT: global_store_b128 v[0:1], v[15:18], off +; GFX1100-SDAG-NEXT: global_store_b128 v[0:1], v[11:14], off offset:16 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX802-GISEL-LABEL: test_writelane_v4f64: +; GFX802-GISEL: ; %bb.0: +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-GISEL-NEXT: v_add_u32_e32 v19, vcc, 16, v0 +; GFX802-GISEL-NEXT: flat_load_dwordx4 v[11:14], v[0:1] +; GFX802-GISEL-NEXT: v_addc_u32_e32 v20, vcc, 0, v1, vcc +; GFX802-GISEL-NEXT: flat_load_dwordx4 v[15:18], v[19:20] +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v10 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s7, v4 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s8, v5 +; GFX802-GISEL-NEXT: s_mov_b32 m0, s5 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s9, v6 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s10, v7 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s11, v8 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s12, v9 +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(1) +; GFX802-GISEL-NEXT: v_writelane_b32 v11, s4, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v12, s6, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v13, s7, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v14, s8, m0 +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX802-GISEL-NEXT: v_writelane_b32 v15, s9, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v16, s10, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v17, s11, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v18, s12, m0 +; GFX802-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[11:14] +; GFX802-GISEL-NEXT: flat_store_dwordx4 v[19:20], v[15:18] +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-GISEL-LABEL: test_writelane_v4f64: +; GFX1010-GISEL: ; %bb.0: +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX1010-GISEL-NEXT: s_clause 0x1 +; GFX1010-GISEL-NEXT: global_load_dwordx4 v[11:14], v[0:1], off +; GFX1010-GISEL-NEXT: global_load_dwordx4 v[15:18], v[0:1], off offset:16 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v10 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s7, v4 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s8, v5 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s9, v6 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s10, v7 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s11, v8 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s12, v9 +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(1) +; GFX1010-GISEL-NEXT: v_writelane_b32 v11, s4, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v12, s6, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v13, s7, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v14, s8, s5 +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-GISEL-NEXT: v_writelane_b32 v15, s9, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v16, s10, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v17, s11, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v18, s12, s5 +; GFX1010-GISEL-NEXT: global_store_dwordx4 v[0:1], v[11:14], off +; GFX1010-GISEL-NEXT: global_store_dwordx4 v[0:1], v[15:18], off offset:16 +; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: test_writelane_v4f64: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_clause 0x1 +; GFX1100-GISEL-NEXT: global_load_b128 v[11:14], v[0:1], off +; GFX1100-GISEL-NEXT: global_load_b128 v[15:18], v[0:1], off offset:16 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s4, v5 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s5, v6 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s6, v7 +; 
GFX1100-GISEL-NEXT: v_readfirstlane_b32 s7, v8 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s8, v9 +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(1) +; GFX1100-GISEL-NEXT: v_writelane_b32 v11, s0, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v12, s2, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v13, s3, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v14, s4, s1 +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1100-GISEL-NEXT: v_writelane_b32 v15, s5, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v16, s6, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v17, s7, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v18, s8, s1 +; GFX1100-GISEL-NEXT: s_clause 0x1 +; GFX1100-GISEL-NEXT: global_store_b128 v[0:1], v[11:14], off +; GFX1100-GISEL-NEXT: global_store_b128 v[0:1], v[15:18], off offset:16 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] + %oldval = load <4 x double>, ptr addrspace(1) %out + %writelane = call <4 x double> @llvm.amdgcn.writelane.v4f64(<4 x double> %src, i32 %src1, <4 x double> %oldval) + store <4 x double> %writelane, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_writelane_v8f64(ptr addrspace(1) %out, <8 x double> %src, i32 %src1) { +; GFX802-SDAG-LABEL: test_writelane_v8f64: +; GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v18 +; GFX802-SDAG-NEXT: flat_load_dwordx4 v[18:21], v[0:1] +; GFX802-SDAG-NEXT: v_add_u32_e32 v22, vcc, 16, v0 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v5 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v4 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s8, v2 +; GFX802-SDAG-NEXT: v_addc_u32_e32 v23, vcc, 0, v1, vcc +; GFX802-SDAG-NEXT: flat_load_dwordx4 v[2:5], v[22:23] +; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v9 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s10, v15 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s11, v14 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s12, v13 +; GFX802-SDAG-NEXT: 
v_readfirstlane_b32 s13, v12 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s14, v11 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s15, v10 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s9, v16 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX802-SDAG-NEXT: v_writelane_b32 v21, s5, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v20, s6, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v19, s7, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v18, s8, m0 +; GFX802-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[18:21] +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v8 +; GFX802-SDAG-NEXT: v_add_u32_e32 v18, vcc, 32, v0 +; GFX802-SDAG-NEXT: v_addc_u32_e32 v19, vcc, 0, v1, vcc +; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, 48, v0 +; GFX802-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v7 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v6 +; GFX802-SDAG-NEXT: flat_load_dwordx4 v[6:9], v[0:1] +; GFX802-SDAG-NEXT: flat_load_dwordx4 v[12:15], v[18:19] +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s8, v17 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(3) +; GFX802-SDAG-NEXT: v_writelane_b32 v5, s4, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v4, s5, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v3, s6, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v2, s7, m0 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX802-SDAG-NEXT: v_writelane_b32 v9, s8, m0 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: v_writelane_b32 v15, s12, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v14, s13, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v13, s14, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v12, s15, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v8, s9, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v7, s10, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v6, s11, m0 +; GFX802-SDAG-NEXT: flat_store_dwordx4 v[18:19], v[12:15] +; GFX802-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[6:9] +; GFX802-SDAG-NEXT: flat_store_dwordx4 v[22:23], v[2:5] +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-SDAG-LABEL: test_writelane_v8f64: 
+; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-SDAG-NEXT: s_clause 0x3 +; GFX1010-SDAG-NEXT: global_load_dwordx4 v[19:22], v[0:1], off offset:16 +; GFX1010-SDAG-NEXT: global_load_dwordx4 v[23:26], v[0:1], off +; GFX1010-SDAG-NEXT: global_load_dwordx4 v[27:30], v[0:1], off offset:48 +; GFX1010-SDAG-NEXT: global_load_dwordx4 v[31:34], v[0:1], off offset:32 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v18 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s17, v13 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s18, v12 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s19, v11 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s20, v10 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s13, v17 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s14, v16 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s15, v15 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s16, v14 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s9, v5 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s10, v4 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s11, v3 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s12, v2 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v9 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s6, v8 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s7, v7 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s8, v6 +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(3) +; GFX1010-SDAG-NEXT: v_writelane_b32 v22, s4, s5 +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(2) +; GFX1010-SDAG-NEXT: v_writelane_b32 v26, s9, s5 +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX1010-SDAG-NEXT: v_writelane_b32 v30, s13, s5 +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1010-SDAG-NEXT: v_writelane_b32 v34, s17, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v33, s18, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v32, s19, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v31, s20, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v29, s14, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v28, s15, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v27, s16, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v25, s10, s5 +; 
GFX1010-SDAG-NEXT: v_writelane_b32 v24, s11, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v23, s12, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v21, s6, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v20, s7, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v19, s8, s5 +; GFX1010-SDAG-NEXT: global_store_dwordx4 v[0:1], v[31:34], off offset:32 +; GFX1010-SDAG-NEXT: global_store_dwordx4 v[0:1], v[27:30], off offset:48 +; GFX1010-SDAG-NEXT: global_store_dwordx4 v[0:1], v[23:26], off +; GFX1010-SDAG-NEXT: global_store_dwordx4 v[0:1], v[19:22], off offset:16 +; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: test_writelane_v8f64: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_clause 0x3 +; GFX1100-SDAG-NEXT: global_load_b128 v[19:22], v[0:1], off offset:16 +; GFX1100-SDAG-NEXT: global_load_b128 v[23:26], v[0:1], off +; GFX1100-SDAG-NEXT: global_load_b128 v[27:30], v[0:1], off offset:48 +; GFX1100-SDAG-NEXT: global_load_b128 v[31:34], v[0:1], off offset:32 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v18 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s13, v13 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s14, v12 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s15, v11 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s16, v10 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s9, v17 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s10, v16 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s11, v15 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s12, v14 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s5, v5 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s6, v4 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s7, v3 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s8, v2 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v9 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v8 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v7 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s4, v6 +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(3) +; GFX1100-SDAG-NEXT: v_writelane_b32 v22, s0, s1 +; GFX1100-SDAG-NEXT: s_waitcnt 
vmcnt(2) +; GFX1100-SDAG-NEXT: v_writelane_b32 v26, s5, s1 +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX1100-SDAG-NEXT: v_writelane_b32 v30, s9, s1 +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: v_writelane_b32 v34, s13, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v33, s14, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v32, s15, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v31, s16, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v29, s10, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v28, s11, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v27, s12, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v25, s6, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v24, s7, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v23, s8, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v21, s2, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v20, s3, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v19, s4, s1 +; GFX1100-SDAG-NEXT: s_clause 0x3 +; GFX1100-SDAG-NEXT: global_store_b128 v[0:1], v[31:34], off offset:32 +; GFX1100-SDAG-NEXT: global_store_b128 v[0:1], v[27:30], off offset:48 +; GFX1100-SDAG-NEXT: global_store_b128 v[0:1], v[23:26], off +; GFX1100-SDAG-NEXT: global_store_b128 v[0:1], v[19:22], off offset:16 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX802-GISEL-LABEL: test_writelane_v8f64: +; GFX802-GISEL: ; %bb.0: +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v18 +; GFX802-GISEL-NEXT: flat_load_dwordx4 v[18:21], v[0:1] +; GFX802-GISEL-NEXT: v_add_u32_e32 v22, vcc, 16, v0 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s7, v4 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s8, v5 +; GFX802-GISEL-NEXT: v_addc_u32_e32 v23, vcc, 0, v1, vcc +; GFX802-GISEL-NEXT: flat_load_dwordx4 v[2:5], v[22:23] +; GFX802-GISEL-NEXT: s_mov_b32 m0, s5 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v7 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s9, v11 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s10, v12 
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s11, v13 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s12, v14 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s13, v15 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s14, v16 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s15, v17 +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(1) +; GFX802-GISEL-NEXT: v_writelane_b32 v18, s4, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v19, s6, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v20, s7, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v21, s8, m0 +; GFX802-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[18:21] +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v6 +; GFX802-GISEL-NEXT: v_add_u32_e32 v18, vcc, 32, v0 +; GFX802-GISEL-NEXT: v_addc_u32_e32 v19, vcc, 0, v1, vcc +; GFX802-GISEL-NEXT: v_add_u32_e32 v0, vcc, 48, v0 +; GFX802-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s6, v8 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s7, v9 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s8, v10 +; GFX802-GISEL-NEXT: flat_load_dwordx4 v[6:9], v[18:19] +; GFX802-GISEL-NEXT: flat_load_dwordx4 v[10:13], v[0:1] +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(3) +; GFX802-GISEL-NEXT: v_writelane_b32 v2, s4, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v3, s5, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v4, s6, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v5, s7, m0 +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(1) +; GFX802-GISEL-NEXT: v_writelane_b32 v6, s8, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v7, s9, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v8, s10, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v9, s11, m0 +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX802-GISEL-NEXT: v_writelane_b32 v10, s12, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v11, s13, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v12, s14, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v13, s15, m0 +; GFX802-GISEL-NEXT: flat_store_dwordx4 v[22:23], v[2:5] +; GFX802-GISEL-NEXT: flat_store_dwordx4 v[18:19], v[6:9] +; GFX802-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[10:13] +; 
GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-GISEL-LABEL: test_writelane_v8f64: +; GFX1010-GISEL: ; %bb.0: +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-GISEL-NEXT: s_clause 0x3 +; GFX1010-GISEL-NEXT: global_load_dwordx4 v[19:22], v[0:1], off +; GFX1010-GISEL-NEXT: global_load_dwordx4 v[23:26], v[0:1], off offset:16 +; GFX1010-GISEL-NEXT: global_load_dwordx4 v[27:30], v[0:1], off offset:32 +; GFX1010-GISEL-NEXT: global_load_dwordx4 v[31:34], v[0:1], off offset:48 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v18 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s7, v4 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s8, v5 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s9, v6 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s10, v7 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s11, v8 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s12, v9 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s13, v10 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s14, v11 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s15, v12 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s16, v13 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s17, v14 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s18, v15 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s19, v16 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s20, v17 +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(3) +; GFX1010-GISEL-NEXT: v_writelane_b32 v19, s4, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v20, s6, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v21, s7, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v22, s8, s5 +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(2) +; GFX1010-GISEL-NEXT: v_writelane_b32 v23, s9, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v24, s10, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v25, s11, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v26, s12, s5 +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(1) +; GFX1010-GISEL-NEXT: v_writelane_b32 v27, 
s13, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v28, s14, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v29, s15, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v30, s16, s5 +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-GISEL-NEXT: v_writelane_b32 v31, s17, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v32, s18, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v33, s19, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v34, s20, s5 +; GFX1010-GISEL-NEXT: global_store_dwordx4 v[0:1], v[19:22], off +; GFX1010-GISEL-NEXT: global_store_dwordx4 v[0:1], v[23:26], off offset:16 +; GFX1010-GISEL-NEXT: global_store_dwordx4 v[0:1], v[27:30], off offset:32 +; GFX1010-GISEL-NEXT: global_store_dwordx4 v[0:1], v[31:34], off offset:48 +; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: test_writelane_v8f64: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_clause 0x3 +; GFX1100-GISEL-NEXT: global_load_b128 v[19:22], v[0:1], off +; GFX1100-GISEL-NEXT: global_load_b128 v[23:26], v[0:1], off offset:16 +; GFX1100-GISEL-NEXT: global_load_b128 v[27:30], v[0:1], off offset:32 +; GFX1100-GISEL-NEXT: global_load_b128 v[31:34], v[0:1], off offset:48 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v18 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s4, v5 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s5, v6 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s6, v7 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s7, v8 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s8, v9 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s9, v10 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s10, v11 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s11, v12 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s12, v13 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s13, v14 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s14, v15 +; GFX1100-GISEL-NEXT: 
v_readfirstlane_b32 s15, v16 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s16, v17 +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(3) +; GFX1100-GISEL-NEXT: v_writelane_b32 v19, s0, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v20, s2, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v21, s3, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v22, s4, s1 +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(2) +; GFX1100-GISEL-NEXT: v_writelane_b32 v23, s5, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v24, s6, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v25, s7, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v26, s8, s1 +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(1) +; GFX1100-GISEL-NEXT: v_writelane_b32 v27, s9, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v28, s10, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v29, s11, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v30, s12, s1 +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1100-GISEL-NEXT: v_writelane_b32 v31, s13, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v32, s14, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v33, s15, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v34, s16, s1 +; GFX1100-GISEL-NEXT: s_clause 0x3 +; GFX1100-GISEL-NEXT: global_store_b128 v[0:1], v[19:22], off +; GFX1100-GISEL-NEXT: global_store_b128 v[0:1], v[23:26], off offset:16 +; GFX1100-GISEL-NEXT: global_store_b128 v[0:1], v[27:30], off offset:32 +; GFX1100-GISEL-NEXT: global_store_b128 v[0:1], v[31:34], off offset:48 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] + %oldval = load <8 x double>, ptr addrspace(1) %out + %writelane = call <8 x double> @llvm.amdgcn.writelane.v8f64(<8 x double> %src, i32 %src1, <8 x double> %oldval) + store <8 x double> %writelane, ptr addrspace(1) %out, align 4 + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() #2 attributes #0 = { nounwind readnone convergent } From f3c77445791b510858561cb424ffa1cd7513250b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A9sz=C3=A1ros=20Gergely?= Date: Wed, 16 Apr 2025 08:02:42 +0200 Subject: [PATCH 081/710] [Clang][Sema] Fix 
-Whigher-precision-for-complex-division (#131477) - Fix false positive when divisor is a real number. - Fix false negative when divident is real, but divisor is complex. - Fix false negative when due to promotion the division is performed in higher precision than the divident. - Fix false negative in divide and assign (`a /= b`). Fixes: #131127 --------- Co-authored-by: Zahira Ammarguellat --- clang/docs/ReleaseNotes.rst | 8 ++ clang/lib/Sema/SemaExpr.cpp | 78 ++++++++-------- .../complex-div-warn-higher-precision.cpp | 93 +++++++++++++++++++ 3 files changed, 141 insertions(+), 38 deletions(-) create mode 100644 clang/test/Sema/complex-div-warn-higher-precision.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 84ad253c1ec4f..5af4c08f64cd8 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -371,6 +371,14 @@ Improvements to Clang's diagnostics - An error is now emitted when a ``musttail`` call is made to a function marked with the ``not_tail_called`` attribute. (#GH133509). +- ``-Whigher-precisision-for-complex-divison`` warns when: + + - The divisor is complex. + - When the complex division happens in a higher precision type due to arithmetic promotion. + - When using the divide and assign operator (``/=``). + + Fixes #GH131127 + Improvements to Clang's time-trace ---------------------------------- diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index c65b4eadf9c67..6830bb5c01c7d 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -10602,6 +10602,45 @@ static void checkArithmeticNull(Sema &S, ExprResult &LHS, ExprResult &RHS, << LHS.get()->getSourceRange() << RHS.get()->getSourceRange(); } +static void DetectPrecisionLossInComplexDivision(Sema &S, QualType DivisorTy, + SourceLocation OpLoc) { + // If the divisor is real, then this is real/real or complex/real division. + // Either way there can be no precision loss. 
+ auto *CT = DivisorTy->getAs(); + if (!CT) + return; + + QualType ElementType = CT->getElementType(); + bool IsComplexRangePromoted = S.getLangOpts().getComplexRange() == + LangOptions::ComplexRangeKind::CX_Promoted; + if (!ElementType->isFloatingType() || !IsComplexRangePromoted) + return; + + ASTContext &Ctx = S.getASTContext(); + QualType HigherElementType = Ctx.GetHigherPrecisionFPType(ElementType); + const llvm::fltSemantics &ElementTypeSemantics = + Ctx.getFloatTypeSemantics(ElementType); + const llvm::fltSemantics &HigherElementTypeSemantics = + Ctx.getFloatTypeSemantics(HigherElementType); + + if ((llvm::APFloat::semanticsMaxExponent(ElementTypeSemantics) * 2 + 1 > + llvm::APFloat::semanticsMaxExponent(HigherElementTypeSemantics)) || + (HigherElementType == Ctx.LongDoubleTy && + !Ctx.getTargetInfo().hasLongDoubleType())) { + // Retain the location of the first use of higher precision type. + if (!S.LocationOfExcessPrecisionNotSatisfied.isValid()) + S.LocationOfExcessPrecisionNotSatisfied = OpLoc; + for (auto &[Type, Num] : S.ExcessPrecisionNotSatisfied) { + if (Type == HigherElementType) { + Num++; + return; + } + } + S.ExcessPrecisionNotSatisfied.push_back(std::make_pair( + HigherElementType, S.ExcessPrecisionNotSatisfied.size())); + } +} + static void DiagnoseDivisionSizeofPointerOrArray(Sema &S, Expr *LHS, Expr *RHS, SourceLocation Loc) { const auto *LUE = dyn_cast(LHS); @@ -10696,6 +10735,7 @@ QualType Sema::CheckMultiplyDivideOperands(ExprResult &LHS, ExprResult &RHS, if (compType.isNull() || !compType->isArithmeticType()) return InvalidOperands(Loc, LHS, RHS); if (IsDiv) { + DetectPrecisionLossInComplexDivision(*this, RHS.get()->getType(), Loc); DiagnoseBadDivideOrRemainderValues(*this, LHS, RHS, Loc, IsDiv); DiagnoseDivisionSizeofPointerOrArray(*this, LHS.get(), RHS.get(), Loc); } @@ -15347,39 +15387,6 @@ static void DiagnoseBinOpPrecedence(Sema &Self, BinaryOperatorKind Opc, DiagnoseShiftCompare(Self, OpLoc, LHSExpr, RHSExpr); } -static void 
DetectPrecisionLossInComplexDivision(Sema &S, SourceLocation OpLoc, - Expr *Operand) { - if (auto *CT = Operand->getType()->getAs()) { - QualType ElementType = CT->getElementType(); - bool IsComplexRangePromoted = S.getLangOpts().getComplexRange() == - LangOptions::ComplexRangeKind::CX_Promoted; - if (ElementType->isFloatingType() && IsComplexRangePromoted) { - ASTContext &Ctx = S.getASTContext(); - QualType HigherElementType = Ctx.GetHigherPrecisionFPType(ElementType); - const llvm::fltSemantics &ElementTypeSemantics = - Ctx.getFloatTypeSemantics(ElementType); - const llvm::fltSemantics &HigherElementTypeSemantics = - Ctx.getFloatTypeSemantics(HigherElementType); - if ((llvm::APFloat::semanticsMaxExponent(ElementTypeSemantics) * 2 + 1 > - llvm::APFloat::semanticsMaxExponent(HigherElementTypeSemantics)) || - (HigherElementType == Ctx.LongDoubleTy && - !Ctx.getTargetInfo().hasLongDoubleType())) { - // Retain the location of the first use of higher precision type. - if (!S.LocationOfExcessPrecisionNotSatisfied.isValid()) - S.LocationOfExcessPrecisionNotSatisfied = OpLoc; - for (auto &[Type, Num] : S.ExcessPrecisionNotSatisfied) { - if (Type == HigherElementType) { - Num++; - return; - } - } - S.ExcessPrecisionNotSatisfied.push_back(std::make_pair( - HigherElementType, S.ExcessPrecisionNotSatisfied.size())); - } - } - } -} - ExprResult Sema::ActOnBinOp(Scope *S, SourceLocation TokLoc, tok::TokenKind Kind, Expr *LHSExpr, Expr *RHSExpr) { @@ -15390,11 +15397,6 @@ ExprResult Sema::ActOnBinOp(Scope *S, SourceLocation TokLoc, // Emit warnings for tricky precedence issues, e.g. "bitfield & 0x4 == 0" DiagnoseBinOpPrecedence(*this, Opc, TokLoc, LHSExpr, RHSExpr); - // Emit warnings if the requested higher precision type equal to the current - // type precision. - if (Kind == tok::TokenKind::slash) - DetectPrecisionLossInComplexDivision(*this, TokLoc, LHSExpr); - BuiltinCountedByRefKind K = BinaryOperator::isAssignmentOp(Opc) ? 
AssignmentKind : BinaryExprKind; diff --git a/clang/test/Sema/complex-div-warn-higher-precision.cpp b/clang/test/Sema/complex-div-warn-higher-precision.cpp new file mode 100644 index 0000000000000..a7cc2fbc7ed21 --- /dev/null +++ b/clang/test/Sema/complex-div-warn-higher-precision.cpp @@ -0,0 +1,93 @@ +// RUN: %clang_cc1 %s -complex-range=promoted -fsyntax-only -triple x86_64-unknown-linux -verify=no-diag \ +// RUN: -DDIV_CC -DDIV_RC -DDIVASSIGN -DDIVMIXEDFD -DDIVMIXEDFD2 -DDIVMIXEDID -DDIVASSIGN_MIXEDFD + +// RUN: %clang_cc1 %s -complex-range=promoted -fsyntax-only -triple x86_64-unknown-windows -verify=no-diag +// RUN: %clang_cc1 %s -complex-range=promoted -fsyntax-only -triple x86_64-unknown-windows -verify -DDIV_CC +// RUN: %clang_cc1 %s -complex-range=promoted -fsyntax-only -triple x86_64-unknown-windows -verify -DDIV_RC +// RUN: %clang_cc1 %s -complex-range=promoted -fsyntax-only -triple x86_64-unknown-windows -verify -DDIVASSIGN +// RUN: %clang_cc1 %s -complex-range=promoted -fsyntax-only -triple x86_64-unknown-windows -verify -DDIVMIXEDFD +// RUN: %clang_cc1 %s -complex-range=promoted -fsyntax-only -triple x86_64-unknown-windows -verify -DDIVMIXEDFD2 +// RUN: %clang_cc1 %s -complex-range=promoted -fsyntax-only -triple x86_64-unknown-windows -verify -DDIVMIXEDID +// RUN: %clang_cc1 %s -complex-range=promoted -fsyntax-only -triple x86_64-unknown-windows -verify -DDIVASSIGN_MIXEDFD + +_Complex double div_ccf(_Complex float a, _Complex float b) { + return a / b; +} + +_Complex double div_cr(_Complex double a, double b) { + return a / b; +} + +_Complex double div_cr_mixed1(_Complex double a, float b) { + return a / b; +} + +_Complex double div_cr_mixed2(_Complex float a, double b) { + return a / b; +} + +_Complex double div_rr(double a, double b) { + return a / b; +} + +_Complex int div_ii(_Complex int a, _Complex int b) { + return a / b; +} + +struct UserT { + friend UserT operator/(UserT, _Complex double); + friend UserT operator/(_Complex double, UserT); +}; 
+ +UserT div_uc(UserT a, _Complex double b) { + return a / b; +} + +UserT div_cu(_Complex double a, UserT b) { + return a / b; +} + +#ifdef DIV_CC +_Complex double div_cc(_Complex double a, const _Complex double b) { + return a / b; // #1 +} +#endif // DIV_CC + +#ifdef DIV_RC +_Complex double div_rc(double a, _Complex float b) { + return a / b; // #1 +} +#endif // DIV_RC + +#ifdef DIVASSIGN +_Complex double divassign(_Complex double a, _Complex double b) { + return a /= b; // #1 +} +#endif // DIVASSIGN + +#ifdef DIVMIXEDFD +_Complex double divmixedfd(_Complex float a, _Complex double b) { + return a / b; // #1 +} +#endif // DIVMIXEDFD + +#ifdef DIVMIXEDFD2 +_Complex double divmixedfd2(_Complex double a, _Complex float b) { + return a / b; // #1 +} +#endif // DIVMIXEDFD2 + +#ifdef DIVMIXEDID +_Complex double divmixedid(_Complex int a, _Complex double b) { + return a / b; // #1 +} +#endif // DIVMIXEDID + +#ifdef DIVASSIGN_MIXEDFD +_Complex double divassign_mixedfd(_Complex float a, _Complex double b) { + return a /= b; // #1 +} +#endif // DIVMIXEDFD + +// no-diag-no-diagnostics +// expected-warning@#1 {{excess precision is requested but the target does not support excess precision which may result in observable differences in complex division behavior}} From dfb5b6e27ca3f8b79ebd3346d11b3088c1600b81 Mon Sep 17 00:00:00 2001 From: leecheechen Date: Wed, 16 Apr 2025 14:12:00 +0800 Subject: [PATCH 082/710] [LoongArch] Don't crash on instruction prefetch intrinsics (#135760) Instead of failing to select during isel, drop the intrinsic in lowering. Similar to the X86's PR. See: https://reviews.llvm.org/D151050.
Fixes #134624 --- .../LoongArch/LoongArchISelLowering.cpp | 16 ++++++++- .../Target/LoongArch/LoongArchISelLowering.h | 1 + llvm/test/CodeGen/LoongArch/prefetchi.ll | 33 +++++++++++++++++++ 3 files changed, 49 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/LoongArch/prefetchi.ll diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index e5ccbe897d19c..f72b55e1d175c 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -99,7 +99,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); - setOperationAction(ISD::PREFETCH, MVT::Other, Legal); + setOperationAction(ISD::PREFETCH, MVT::Other, Custom); // Expand bitreverse.i16 with native-width bitrev and shift for now, before // we get to know which of sll and revb.2h is faster. @@ -469,10 +469,24 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op, return lowerBITREVERSE(Op, DAG); case ISD::SCALAR_TO_VECTOR: return lowerSCALAR_TO_VECTOR(Op, DAG); + case ISD::PREFETCH: + return lowerPREFETCH(Op, DAG); } return SDValue(); } +SDValue LoongArchTargetLowering::lowerPREFETCH(SDValue Op, + SelectionDAG &DAG) const { + unsigned IsData = Op.getConstantOperandVal(4); + + // We don't support non-data prefetch. + // Just preserve the chain. 
+ if (!IsData) + return Op.getOperand(0); + + return Op; +} + SDValue LoongArchTargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h index 52d88b9b24a6b..8c2d2597a26ec 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h @@ -353,6 +353,7 @@ class LoongArchTargetLowering : public TargetLowering { SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerBITREVERSE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const; bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override; diff --git a/llvm/test/CodeGen/LoongArch/prefetchi.ll b/llvm/test/CodeGen/LoongArch/prefetchi.ll new file mode 100644 index 0000000000000..a00f6f8161862 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/prefetchi.ll @@ -0,0 +1,33 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch32 < %s | FileCheck %s --check-prefix=LA32 +; RUN: llc --mtriple=loongarch64 < %s | FileCheck %s --check-prefix=LA64 + +declare void @llvm.prefetch(ptr, i32, i32, i32) nounwind + +define dso_local void @prefetch_no_offset(ptr %ptr) nounwind { +; LA32-LABEL: prefetch_no_offset: +; LA32: # %bb.0: # %entry +; LA32-NEXT: ret +; +; LA64-LABEL: prefetch_no_offset: +; LA64: # %bb.0: # %entry +; LA64-NEXT: ret +entry: + tail call void @llvm.prefetch(ptr %ptr, i32 0, i32 3, i32 0) + ret void +} + + +define dso_local void @prefetch_with_offset(ptr %ptr) nounwind { +; LA32-LABEL: prefetch_with_offset: +; LA32: # %bb.0: # %entry +; LA32-NEXT: ret +; +; LA64-LABEL: prefetch_with_offset: +; LA64: # %bb.0: # %entry +; LA64-NEXT: ret +entry: + %addr = getelementptr i8, ptr %ptr, i64 200 + tail call 
void @llvm.prefetch(ptr %addr, i32 0, i32 3, i32 0) + ret void +} From 9e650349bfdcd34998c461e1441f02a79c664d38 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 15 Apr 2025 23:16:52 -0700 Subject: [PATCH 083/710] [mlir] Construct SmallVector with ArrayRef (NFC) (#135899) --- mlir/lib/Dialect/Tosa/IR/TosaOps.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp index bce5b226635f3..9a1dfd8e17b85 100644 --- a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp +++ b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp @@ -483,8 +483,7 @@ LogicalResult tosa::ArgMaxOp::verify() { const ArrayRef inputShape = inputType.getShape(); const ArrayRef outputShape = resultType.getShape(); - llvm::SmallVector expectedOutputShape(inputShape.begin(), - inputShape.end()); + llvm::SmallVector expectedOutputShape(inputShape); expectedOutputShape.erase(expectedOutputShape.begin() + axis); if (failed(verifyCompatibleShape(expectedOutputShape, outputShape))) return emitOpError("expected output shape '") From 52e3f3d68cbabf81c4c118cfb823828f03b712c4 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 15 Apr 2025 23:17:33 -0700 Subject: [PATCH 084/710] [mlir] Use llvm::make_first_range (NFC) (#135900) --- mlir/lib/Bytecode/Reader/BytecodeReader.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mlir/lib/Bytecode/Reader/BytecodeReader.cpp b/mlir/lib/Bytecode/Reader/BytecodeReader.cpp index 0f2057cb32ff1..1052946d4550b 100644 --- a/mlir/lib/Bytecode/Reader/BytecodeReader.cpp +++ b/mlir/lib/Bytecode/Reader/BytecodeReader.cpp @@ -1984,8 +1984,7 @@ LogicalResult BytecodeReader::Impl::sortUseListOrder(Value value) { // If the bytecode file did not contain any custom use-list order, it means // that the order was descending useID. Hence, shuffle by the first index // of the `currentOrder` pair. 
- SmallVector shuffle = SmallVector( - llvm::map_range(currentOrder, [&](auto item) { return item.first; })); + SmallVector shuffle(llvm::make_first_range(currentOrder)); value.shuffleUseList(shuffle); return success(); } From ac4712482e3ff886eee7c044dd33dd4b5d648036 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 15 Apr 2025 23:55:49 -0700 Subject: [PATCH 085/710] [LoongArch] Use FirstRelocationKind to remove ELFObjectWriter::recordRelocation special case The current implementation of R_LARCH_SUB{8,16,32,64} and TLS relocation types relies on fixup kinds FirstLiteralRelocationKind + offset (originally intended for .reloc directives). While this is clever and prevents switch cases like ``` case fixup_...sub8: return ELF::R_LARCH_SUB8; ``` it needs revision. GNU Assembler treats .reloc directives differently from standard relocations, notably by: * Skipping STT_SECTION adjustments (when a referenced symbol is local and satisfies certain conditions, it can be redirected to a section symbol). * Skipping STT_TLS symbol type setting for TLS relocations. Encode relocation type t with FirstRelocationKind+t instead of FirstLiteralRelocationKind+t. The new value is less than FirstLiteralRelocationKind and will not be treated as a .reloc directive. Close #135521 --- llvm/include/llvm/MC/MCFixup.h | 5 ++- llvm/lib/MC/ELFObjectWriter.cpp | 3 +- .../MCTargetDesc/LoongArchAsmBackend.cpp | 34 +++++++++---------- .../MCTargetDesc/LoongArchELFObjectWriter.cpp | 14 ++++---- .../MCTargetDesc/LoongArchFixupKinds.h | 15 ++++---- 5 files changed, 36 insertions(+), 35 deletions(-) diff --git a/llvm/include/llvm/MC/MCFixup.h b/llvm/include/llvm/MC/MCFixup.h index 7cf8ac2e39092..f27ddeae8b173 100644 --- a/llvm/include/llvm/MC/MCFixup.h +++ b/llvm/include/llvm/MC/MCFixup.h @@ -36,10 +36,13 @@ enum MCFixupKind { FirstTargetFixupKind = 128, + /// Targets can use FirstRelocationKind+t to encode relocation type t.
+ FirstRelocationKind = 256, + /// The range [FirstLiteralRelocationKind, MaxTargetFixupKind) is used for /// relocations coming from .reloc directive. Fixup kind /// FirstLiteralRelocationKind+V represents the relocation type with number V. - FirstLiteralRelocationKind = 256, + FirstLiteralRelocationKind = 256 + 1032 + 32, /// Set limit to accommodate the highest reloc type in use for all Targets, /// currently R_AARCH64_IRELATIVE at 1032, including room for expansion. diff --git a/llvm/lib/MC/ELFObjectWriter.cpp b/llvm/lib/MC/ELFObjectWriter.cpp index db4b41e754581..46c71e28ebc13 100644 --- a/llvm/lib/MC/ELFObjectWriter.cpp +++ b/llvm/lib/MC/ELFObjectWriter.cpp @@ -1385,8 +1385,7 @@ void ELFObjectWriter::recordRelocation(MCAssembler &Asm, auto EMachine = TargetObjectWriter->getEMachine(); unsigned Type; - if (Fixup.getKind() >= FirstLiteralRelocationKind && - EMachine != ELF::EM_LOONGARCH) + if (Fixup.getKind() >= FirstLiteralRelocationKind) Type = Fixup.getKind() - FirstLiteralRelocationKind; else Type = TargetObjectWriter->getRelocType(Ctx, Target, Fixup, IsPCRel); diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp index b6a98b3ff9aeb..78a54a9385b08 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp @@ -70,7 +70,7 @@ LoongArchAsmBackend::getFixupKindInfo(MCFixupKind Kind) const { // Fixup kinds from .reloc directive are like R_LARCH_NONE. They // do not require any extra processing. - if (Kind >= FirstLiteralRelocationKind) + if (unsigned(Kind) >= FirstRelocationKind) return MCAsmBackend::getFixupKindInfo(FK_NONE); if (Kind < FirstTargetFixupKind) @@ -152,10 +152,10 @@ void LoongArchAsmBackend::applyFixup(const MCAssembler &Asm, if (!Value) return; // Doesn't change encoding. 
- MCFixupKind Kind = Fixup.getKind(); - if (Kind >= FirstLiteralRelocationKind) + auto Kind = Fixup.getTargetKind(); + if (Kind >= FirstRelocationKind) return; - MCFixupKindInfo Info = getFixupKindInfo(Kind); + MCFixupKindInfo Info = getFixupKindInfo(MCFixupKind(Kind)); MCContext &Ctx = Asm.getContext(); // Fixup leb128 separately. @@ -271,29 +271,27 @@ getRelocPairForSize(unsigned Size) { default: llvm_unreachable("unsupported fixup size"); case 6: - return std::make_pair( - MCFixupKind(FirstLiteralRelocationKind + ELF::R_LARCH_ADD6), - MCFixupKind(FirstLiteralRelocationKind + ELF::R_LARCH_SUB6)); + return std::make_pair(MCFixupKind(FirstRelocationKind + ELF::R_LARCH_ADD6), + MCFixupKind(FirstRelocationKind + ELF::R_LARCH_SUB6)); case 8: - return std::make_pair( - MCFixupKind(FirstLiteralRelocationKind + ELF::R_LARCH_ADD8), - MCFixupKind(FirstLiteralRelocationKind + ELF::R_LARCH_SUB8)); + return std::make_pair(MCFixupKind(FirstRelocationKind + ELF::R_LARCH_ADD8), + MCFixupKind(FirstRelocationKind + ELF::R_LARCH_SUB8)); case 16: return std::make_pair( - MCFixupKind(FirstLiteralRelocationKind + ELF::R_LARCH_ADD16), - MCFixupKind(FirstLiteralRelocationKind + ELF::R_LARCH_SUB16)); + MCFixupKind(FirstRelocationKind + ELF::R_LARCH_ADD16), + MCFixupKind(FirstRelocationKind + ELF::R_LARCH_SUB16)); case 32: return std::make_pair( - MCFixupKind(FirstLiteralRelocationKind + ELF::R_LARCH_ADD32), - MCFixupKind(FirstLiteralRelocationKind + ELF::R_LARCH_SUB32)); + MCFixupKind(FirstRelocationKind + ELF::R_LARCH_ADD32), + MCFixupKind(FirstRelocationKind + ELF::R_LARCH_SUB32)); case 64: return std::make_pair( - MCFixupKind(FirstLiteralRelocationKind + ELF::R_LARCH_ADD64), - MCFixupKind(FirstLiteralRelocationKind + ELF::R_LARCH_SUB64)); + MCFixupKind(FirstRelocationKind + ELF::R_LARCH_ADD64), + MCFixupKind(FirstRelocationKind + ELF::R_LARCH_SUB64)); case 128: return std::make_pair( - MCFixupKind(FirstLiteralRelocationKind + ELF::R_LARCH_ADD_ULEB128), - 
MCFixupKind(FirstLiteralRelocationKind + ELF::R_LARCH_SUB_ULEB128)); + MCFixupKind(FirstRelocationKind + ELF::R_LARCH_ADD_ULEB128), + MCFixupKind(FirstRelocationKind + ELF::R_LARCH_SUB_ULEB128)); } } diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp index 2e2a503d5304f..c117e9a60939f 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp @@ -50,6 +50,12 @@ unsigned LoongArchELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const { + // Determine the type of the relocation + unsigned Kind = Fixup.getTargetKind(); + + if (Kind >= FirstLiteralRelocationKind) + return Kind - FirstLiteralRelocationKind; + switch (Target.getSpecifier()) { case LoongArchMCExpr::VK_TLS_LE_HI20: case LoongArchMCExpr::VK_TLS_IE_PC_HI20: @@ -71,12 +77,8 @@ unsigned LoongArchELFObjectWriter::getRelocType(MCContext &Ctx, break; } - // Determine the type of the relocation - unsigned Kind = Fixup.getTargetKind(); - - if (Kind >= FirstLiteralRelocationKind) - return Kind - FirstLiteralRelocationKind; - + if (Kind >= FirstRelocationKind) + return Kind - FirstRelocationKind; switch (Kind) { default: Ctx.reportError(Fixup.getLoc(), "Unsupported relocation type"); diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h index 370f5b0189b51..aac295c35db69 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h @@ -52,11 +52,10 @@ enum Fixups { fixup_loongarch_invalid, NumTargetFixupKinds = fixup_loongarch_invalid - FirstTargetFixupKind, - // Define fixups for force relocation as FirstLiteralRelocationKind+V + // Define fixups for force relocation as FirstRelocationKind+V // represents the 
relocation type with number V. // 20-bit fixup corresponding to %pc_hi20(foo) for instruction pcalau12i. - fixup_loongarch_pcala_hi20 = - FirstLiteralRelocationKind + ELF::R_LARCH_PCALA_HI20, + fixup_loongarch_pcala_hi20 = FirstRelocationKind + ELF::R_LARCH_PCALA_HI20, // 12-bit fixup corresponding to %pc_lo12(foo) for instructions like addi.w/d. fixup_loongarch_pcala_lo12, // 20-bit fixup corresponding to %pc64_lo20(foo) for instruction lu32i.d. @@ -83,7 +82,7 @@ enum Fixups { // Skip R_LARCH_TLS_LE_*. // 20-bit fixup corresponding to %ie_pc_hi20(foo) for instruction pcalau12i. fixup_loongarch_tls_ie_pc_hi20 = - FirstLiteralRelocationKind + ELF::R_LARCH_TLS_IE_PC_HI20, + FirstRelocationKind + ELF::R_LARCH_TLS_IE_PC_HI20, // 12-bit fixup corresponding to %ie_pc_lo12(foo) for instructions // ld.w/ld.d/add.d. fixup_loongarch_tls_ie_pc_lo12, @@ -108,17 +107,17 @@ enum Fixups { // 20-bit fixup corresponding to %gd_hi20(foo) for instruction lu12i.w. fixup_loongarch_tls_gd_hi20, // Generate an R_LARCH_RELAX which indicates the linker may relax here. - fixup_loongarch_relax = FirstLiteralRelocationKind + ELF::R_LARCH_RELAX, + fixup_loongarch_relax = FirstRelocationKind + ELF::R_LARCH_RELAX, // Generate an R_LARCH_ALIGN which indicates the linker may fixup align here. - fixup_loongarch_align = FirstLiteralRelocationKind + ELF::R_LARCH_ALIGN, + fixup_loongarch_align = FirstRelocationKind + ELF::R_LARCH_ALIGN, // 20-bit fixup corresponding to %pcrel_20(foo) for instruction pcaddi. fixup_loongarch_pcrel20_s2, // 36-bit fixup corresponding to %call36(foo) for a pair instructions: // pcaddu18i+jirl. - fixup_loongarch_call36 = FirstLiteralRelocationKind + ELF::R_LARCH_CALL36, + fixup_loongarch_call36 = FirstRelocationKind + ELF::R_LARCH_CALL36, // 20-bit fixup corresponding to %desc_pc_hi20(foo) for instruction pcalau12i. 
fixup_loongarch_tls_desc_pc_hi20 = - FirstLiteralRelocationKind + ELF::R_LARCH_TLS_DESC_PC_HI20, + FirstRelocationKind + ELF::R_LARCH_TLS_DESC_PC_HI20, // 12-bit fixup corresponding to %desc_pc_lo12(foo) for instructions like // addi.w/d. fixup_loongarch_tls_desc_pc_lo12, From a56f966417bc53051fa39e3db6fcc95f9abf0b5c Mon Sep 17 00:00:00 2001 From: Mythreya Date: Wed, 16 Apr 2025 00:00:14 -0700 Subject: [PATCH 086/710] [clangd][docs] Fix incorrect docstring for header-insertion "Never" (#135921) Docstring fix for changes introduced in PR #128503 --- clang-tools-extra/clangd/ConfigFragment.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang-tools-extra/clangd/ConfigFragment.h b/clang-tools-extra/clangd/ConfigFragment.h index f05ed4d1acdfc..2363b483ab96d 100644 --- a/clang-tools-extra/clangd/ConfigFragment.h +++ b/clang-tools-extra/clangd/ConfigFragment.h @@ -347,7 +347,7 @@ struct Fragment { /// "IWYU": Include what you use. Insert the owning header for top-level /// symbols, unless the header is already directly included or the /// symbol is forward-declared - /// "NeverInsert": Never insert headers + /// "Never": Never insert headers std::optional> HeaderInsertion; }; CompletionBlock Completion; From 05eafd9f2b14f2e8d2d95f46465c5cc53aafbc56 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Wed, 16 Apr 2025 09:00:52 +0200 Subject: [PATCH 087/710] [clang][bytecode] Explicitly mark constexpr-unknown variables as such (#135806) Instead of trying to figure out what's constexpr-unknown later on. 
--- clang/lib/AST/ByteCode/Compiler.cpp | 24 +++++++++++++------- clang/lib/AST/ByteCode/Compiler.h | 12 ++++++---- clang/lib/AST/ByteCode/Descriptor.h | 1 + clang/lib/AST/ByteCode/Disasm.cpp | 2 ++ clang/lib/AST/ByteCode/Interp.cpp | 9 ++++---- clang/lib/AST/ByteCode/Program.cpp | 2 +- clang/test/AST/ByteCode/codegen.cpp | 35 +++++++++++++++++++++++++++-- 7 files changed, 65 insertions(+), 20 deletions(-) diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 2e22c85ed5f6d..afd8d09a088cd 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -4293,7 +4293,8 @@ bool Compiler::emitConst(const APSInt &Value, const Expr *E) { template unsigned Compiler::allocateLocalPrimitive( - DeclTy &&Src, PrimType Ty, bool IsConst, const ValueDecl *ExtendingDecl) { + DeclTy &&Src, PrimType Ty, bool IsConst, const ValueDecl *ExtendingDecl, + bool IsConstexprUnknown) { // Make sure we don't accidentally register the same decl twice. if (const auto *VD = dyn_cast_if_present(Src.dyn_cast())) { @@ -4307,6 +4308,7 @@ unsigned Compiler::allocateLocalPrimitive( // or isa(). Descriptor *D = P.createDescriptor(Src, Ty, nullptr, Descriptor::InlineDescMD, IsConst, isa(Src)); + D->IsConstexprUnknown = IsConstexprUnknown; Scope::Local Local = this->createLocal(D); if (auto *VD = dyn_cast_if_present(Src.dyn_cast())) Locals.insert({VD, Local}); @@ -4320,7 +4322,8 @@ unsigned Compiler::allocateLocalPrimitive( template std::optional Compiler::allocateLocal(DeclTy &&Src, QualType Ty, - const ValueDecl *ExtendingDecl) { + const ValueDecl *ExtendingDecl, + bool IsConstexprUnknown) { // Make sure we don't accidentally register the same decl twice. 
if ([[maybe_unused]] const auto *VD = dyn_cast_if_present(Src.dyn_cast())) { @@ -4349,6 +4352,7 @@ Compiler::allocateLocal(DeclTy &&Src, QualType Ty, IsTemporary, /*IsMutable=*/false, Init); if (!D) return std::nullopt; + D->IsConstexprUnknown = IsConstexprUnknown; Scope::Local Local = this->createLocal(D); if (Key) @@ -4460,9 +4464,10 @@ bool Compiler::visitExpr(const Expr *E, bool DestroyToplevelScope) { } template -VarCreationState Compiler::visitDecl(const VarDecl *VD) { +VarCreationState Compiler::visitDecl(const VarDecl *VD, + bool IsConstexprUnknown) { - auto R = this->visitVarDecl(VD, /*Toplevel=*/true); + auto R = this->visitVarDecl(VD, /*Toplevel=*/true, IsConstexprUnknown); if (R.notCreated()) return R; @@ -4550,7 +4555,8 @@ bool Compiler::visitDeclAndReturn(const VarDecl *VD, template VarCreationState Compiler::visitVarDecl(const VarDecl *VD, - bool Toplevel) { + bool Toplevel, + bool IsConstexprUnknown) { // We don't know what to do with these, so just return false. if (VD->getType().isNull()) return false; @@ -4620,7 +4626,8 @@ VarCreationState Compiler::visitVarDecl(const VarDecl *VD, if (VarT) { unsigned Offset = this->allocateLocalPrimitive( - VD, *VarT, VD->getType().isConstQualified()); + VD, *VarT, VD->getType().isConstQualified(), nullptr, + IsConstexprUnknown); if (Init) { // If this is a toplevel declaration, create a scope for the // initializer. @@ -4636,7 +4643,8 @@ VarCreationState Compiler::visitVarDecl(const VarDecl *VD, } } } else { - if (std::optional Offset = this->allocateLocal(VD)) { + if (std::optional Offset = this->allocateLocal( + VD, VD->getType(), nullptr, IsConstexprUnknown)) { if (!Init) return true; @@ -6461,7 +6469,7 @@ bool Compiler::visitDeclRef(const ValueDecl *D, const Expr *E) { // In case we need to re-visit a declaration. 
auto revisit = [&](const VarDecl *VD) -> bool { - auto VarState = this->visitDecl(VD); + auto VarState = this->visitDecl(VD, /*IsConstexprUnknown=*/true); if (VarState.notCreated()) return true; diff --git a/clang/lib/AST/ByteCode/Compiler.h b/clang/lib/AST/ByteCode/Compiler.h index 858957367d85d..a3090a8a31189 100644 --- a/clang/lib/AST/ByteCode/Compiler.h +++ b/clang/lib/AST/ByteCode/Compiler.h @@ -286,8 +286,10 @@ class Compiler : public ConstStmtVisitor, bool>, /// intact. bool delegate(const Expr *E); /// Creates and initializes a variable from the given decl. - VarCreationState visitVarDecl(const VarDecl *VD, bool Toplevel = false); - VarCreationState visitDecl(const VarDecl *VD); + VarCreationState visitVarDecl(const VarDecl *VD, bool Toplevel = false, + bool IsConstexprUnknown = false); + VarCreationState visitDecl(const VarDecl *VD, + bool IsConstexprUnknown = false); /// Visit an APValue. bool visitAPValue(const APValue &Val, PrimType ValType, const Expr *E); bool visitAPValueInitializer(const APValue &Val, const Expr *E, QualType T); @@ -303,12 +305,14 @@ class Compiler : public ConstStmtVisitor, bool>, /// Creates a local primitive value. unsigned allocateLocalPrimitive(DeclTy &&Decl, PrimType Ty, bool IsConst, - const ValueDecl *ExtendingDecl = nullptr); + const ValueDecl *ExtendingDecl = nullptr, + bool IsConstexprUnknown = false); /// Allocates a space storing a local given its type. 
std::optional allocateLocal(DeclTy &&Decl, QualType Ty = QualType(), - const ValueDecl *ExtendingDecl = nullptr); + const ValueDecl *ExtendingDecl = nullptr, + bool IsConstexprUnknown = false); std::optional allocateTemporary(const Expr *E); private: diff --git a/clang/lib/AST/ByteCode/Descriptor.h b/clang/lib/AST/ByteCode/Descriptor.h index 9acabfd31d80b..a0705cc8c377f 100644 --- a/clang/lib/AST/ByteCode/Descriptor.h +++ b/clang/lib/AST/ByteCode/Descriptor.h @@ -168,6 +168,7 @@ struct Descriptor final { const bool IsArray = false; /// Flag indicating if this is a dummy descriptor. bool IsDummy = false; + bool IsConstexprUnknown = false; /// Storage management methods. const BlockCtorFn CtorFn = nullptr; diff --git a/clang/lib/AST/ByteCode/Disasm.cpp b/clang/lib/AST/ByteCode/Disasm.cpp index d4c9ce6050b85..4bdf0f0afb1b0 100644 --- a/clang/lib/AST/ByteCode/Disasm.cpp +++ b/clang/lib/AST/ByteCode/Disasm.cpp @@ -376,6 +376,8 @@ LLVM_DUMP_METHOD void Descriptor::dump(llvm::raw_ostream &OS) const { if (isDummy()) OS << " dummy"; + if (IsConstexprUnknown) + OS << " constexpr-unknown"; } /// Dump descriptor, including all valid offsets. diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp index 3e1f36da8925f..4625154ac353d 100644 --- a/clang/lib/AST/ByteCode/Interp.cpp +++ b/clang/lib/AST/ByteCode/Interp.cpp @@ -299,15 +299,14 @@ void cleanupAfterFunctionCall(InterpState &S, CodePtr OpPC, TYPE_SWITCH(Ty, S.Stk.discard()); } -// FIXME: Instead of using this fairly expensive test, we should -// just mark constexpr-unknown values when creating them. 
bool isConstexprUnknown(const Pointer &P) { if (!P.isBlockPointer()) return false; + if (P.isDummy()) - return false; - const VarDecl *VD = P.block()->getDescriptor()->asVarDecl(); - return VD && VD->hasLocalStorage() && !isa(VD); + return isa_and_nonnull(P.getDeclDesc()->asValueDecl()); + + return P.getDeclDesc()->IsConstexprUnknown; } bool CheckBCPResult(InterpState &S, const Pointer &Ptr) { diff --git a/clang/lib/AST/ByteCode/Program.cpp b/clang/lib/AST/ByteCode/Program.cpp index c83f5579fd55f..fdb7b960ff06d 100644 --- a/clang/lib/AST/ByteCode/Program.cpp +++ b/clang/lib/AST/ByteCode/Program.cpp @@ -156,7 +156,7 @@ unsigned Program::getOrCreateDummy(const DeclTy &D) { if (const auto *E = dyn_cast(D)) { QT = E->getType(); } else { - const ValueDecl *VD = cast(cast(D)); + const auto *VD = cast(cast(D)); IsWeak = VD->isWeak(); QT = VD->getType(); if (const auto *RT = QT->getAs()) diff --git a/clang/test/AST/ByteCode/codegen.cpp b/clang/test/AST/ByteCode/codegen.cpp index 7c853a20362b8..6f9e75eac6026 100644 --- a/clang/test/AST/ByteCode/codegen.cpp +++ b/clang/test/AST/ByteCode/codegen.cpp @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm -o - %s -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm -o - %s -fcxx-exceptions | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm -o - %s -fcxx-exceptions -fexperimental-new-constant-interpreter | FileCheck %s #ifdef __SIZEOF_INT128__ // CHECK: @PR11705 = global i128 0 @@ -104,3 +104,34 @@ int notdead() { } // CHECK: _ZZ7notdeadvEN3$_0clEv // CHECK: ret i32 %cond + +/// The conmparison of those two parameters should NOT work. +bool paramcmp(const int& lhs, const int& rhs) { + if (&lhs == &rhs) + return true; + return false; +} +// CHECK: _Z8paramcmpRKiS0_ +// CHECK: if.then +// CHECK: if.end + +/// &x == &OuterX should work and return 0. 
+class X { +public: + X(); + X(const X&); + X(const volatile X &); + ~X(); +}; + +extern X OuterX; + +X test24() { + X x; + if (&x == &OuterX) + throw 0; + return x; +} +// CHECK: _Z6test24v +// CHECK-NOT: eh.resume +// CHECK-NOT: unreachable From c1fc4c6a69c6ca0ae6aa060e4377ab5040505a66 Mon Sep 17 00:00:00 2001 From: Clo91eaf Date: Wed, 16 Apr 2025 15:05:15 +0800 Subject: [PATCH 088/710] [mlir][SMT] fix the operation name in ArrayBroadcastOp description (#135746) --- mlir/include/mlir/Dialect/SMT/IR/SMTArrayOps.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/include/mlir/Dialect/SMT/IR/SMTArrayOps.td b/mlir/include/mlir/Dialect/SMT/IR/SMTArrayOps.td index 1869f4ae81595..ac1c2f3ed409a 100644 --- a/mlir/include/mlir/Dialect/SMT/IR/SMTArrayOps.td +++ b/mlir/include/mlir/Dialect/SMT/IR/SMTArrayOps.td @@ -75,7 +75,7 @@ def ArrayBroadcastOp : SMTArrayOp<"broadcast", [ This operation represents a broadcast of the 'value' operand to all indices of the array. It is equivalent to ``` - %0 = smt.declare "array" : !smt.array<[!smt.int -> !smt.bool]> + %0 = smt.declare_fun "array" : !smt.array<[!smt.int -> !smt.bool]> %1 = smt.forall ["idx"] { ^bb0(%idx: !smt.int): %2 = smt.array.select %0[%idx] : !smt.array<[!smt.int -> !smt.bool]> From a630ef71e84a6bf09a99053ea42d37632ca0d18a Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Wed, 16 Apr 2025 00:07:16 -0700 Subject: [PATCH 089/710] ELFObjectWriter: Disable STT_SECTION adjustment for .reloc ... to match GNU Assembler. This generalizes the SHT_LLVM_CALL_GRAPH_PROFILE special case (which uses .reloc with BFD_RELOC_NONE https://reviews.llvm.org/D104080). Targets that want STT_SECTION adjustment cannot use FirstLiteralRelocationKind derived fixup kinds. 
Depends on the fix of #135521 Pull Request: https://github.com/llvm/llvm-project/pull/135519 --- llvm/lib/MC/ELFObjectWriter.cpp | 5 ++--- llvm/test/MC/ELF/reloc-directive.s | 7 ++++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/lib/MC/ELFObjectWriter.cpp b/llvm/lib/MC/ELFObjectWriter.cpp index 46c71e28ebc13..2ae9c21271624 100644 --- a/llvm/lib/MC/ELFObjectWriter.cpp +++ b/llvm/lib/MC/ELFObjectWriter.cpp @@ -1395,9 +1395,8 @@ void ELFObjectWriter::recordRelocation(MCAssembler &Asm, if (UseSectionSym) { UseSectionSym = useSectionSymbol(Asm, Target, SymA, C, Type); - // Disable STT_SECTION adjustment for CG Profile to help with --cg-profile. - const auto *Parent = cast(Fragment->getParent()); - UseSectionSym &= Parent->getType() != ELF::SHT_LLVM_CALL_GRAPH_PROFILE; + // Disable STT_SECTION adjustment for .reloc directives. + UseSectionSym &= Fixup.getKind() < FirstLiteralRelocationKind; } uint64_t Addend = UseSectionSym ? C + Asm.getSymbolOffset(*SymA) : C; diff --git a/llvm/test/MC/ELF/reloc-directive.s b/llvm/test/MC/ELF/reloc-directive.s index f4121ef071810..42995aa9e7d81 100644 --- a/llvm/test/MC/ELF/reloc-directive.s +++ b/llvm/test/MC/ELF/reloc-directive.s @@ -8,24 +8,25 @@ # ASM-NEXT: .Ltmp1: # ASM-NEXT: .reloc .Ltmp1-1, R_X86_64_NONE, foo # ASM-NEXT: .Ltmp2: -# ASM-NEXT: .reloc 2+.Ltmp2, R_X86_64_NONE, foo +# ASM-NEXT: .reloc 2+.Ltmp2, R_X86_64_NONE, local # ASM-NEXT: .reloc 1+foo+3, R_X86_64_NONE, data+1 # ASM-NEXT: .Ltmp3: # ASM-NEXT: .reloc .Ltmp3, BFD_RELOC_NONE, unused # CHECK: 0x2 R_X86_64_NONE foo 0x0 # CHECK-NEXT: 0x0 R_X86_64_NONE foo 0x0 -# CHECK-NEXT: 0x3 R_X86_64_NONE foo 0x0 +# CHECK-NEXT: 0x3 R_X86_64_NONE local 0x0 # CHECK-NEXT: 0x4 R_X86_64_NONE data 0x1 # CHECK-NEXT: 0x1 R_X86_64_NONE unused 0x0 .text .globl foo foo: +local: ret .reloc .+3-2, R_X86_64_NONE, foo .reloc .-1, R_X86_64_NONE, foo - .reloc 2+., R_X86_64_NONE, foo + .reloc 2+., R_X86_64_NONE, local .reloc 1+foo+3, R_X86_64_NONE, data+1 .reloc ., 
BFD_RELOC_NONE, unused From 507d7dc651b28d8a975ba8ca6e8f5906b07e37e7 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Wed, 16 Apr 2025 00:09:07 -0700 Subject: [PATCH 090/710] [LoongArch] Simplify getRelocType --- .../LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp index c117e9a60939f..d129e0e2497e4 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp @@ -50,12 +50,6 @@ unsigned LoongArchELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const { - // Determine the type of the relocation - unsigned Kind = Fixup.getTargetKind(); - - if (Kind >= FirstLiteralRelocationKind) - return Kind - FirstLiteralRelocationKind; - switch (Target.getSpecifier()) { case LoongArchMCExpr::VK_TLS_LE_HI20: case LoongArchMCExpr::VK_TLS_IE_PC_HI20: @@ -77,6 +71,7 @@ unsigned LoongArchELFObjectWriter::getRelocType(MCContext &Ctx, break; } + unsigned Kind = Fixup.getTargetKind(); if (Kind >= FirstRelocationKind) return Kind - FirstRelocationKind; switch (Kind) { From 3d97d71e66036f51cf0b45cc7d5f3a0a14192eb4 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 16 Apr 2025 00:33:29 -0700 Subject: [PATCH 091/710] [Lex] Use llvm::make_second_range (NFC) (#135902) --- clang/lib/Lex/HeaderSearch.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/clang/lib/Lex/HeaderSearch.cpp b/clang/lib/Lex/HeaderSearch.cpp index 9283a0f4fce55..2665580e5afce 100644 --- a/clang/lib/Lex/HeaderSearch.cpp +++ b/clang/lib/Lex/HeaderSearch.cpp @@ -1932,8 +1932,7 @@ void HeaderSearch::collectAllModules(SmallVectorImpl &Modules) { } // Populate the list of modules. 
- llvm::transform(ModMap.modules(), std::back_inserter(Modules), - [](const auto &NameAndMod) { return NameAndMod.second; }); + llvm::append_range(Modules, llvm::make_second_range(ModMap.modules())); } void HeaderSearch::loadTopLevelSystemModules() { From 1256ca04c2064f2ef05625ff93a7954642af84a1 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 16 Apr 2025 00:36:36 -0700 Subject: [PATCH 092/710] [CodeGen] Call DenseMap::erase directly (NFC) (#135898) --- llvm/lib/CodeGen/MachineFunction.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp index eb0a16732b058..833b765b14d35 100644 --- a/llvm/lib/CodeGen/MachineFunction.cpp +++ b/llvm/lib/CodeGen/MachineFunction.cpp @@ -946,9 +946,7 @@ void MachineFunction::eraseAdditionalCallInfo(const MachineInstr *MI) { if (CSIt != CallSitesInfo.end()) CallSitesInfo.erase(CSIt); - CalledGlobalsMap::iterator CGIt = CalledGlobalsInfo.find(CallMI); - if (CGIt != CalledGlobalsInfo.end()) - CalledGlobalsInfo.erase(CGIt); + CalledGlobalsInfo.erase(CallMI); } void MachineFunction::copyAdditionalCallInfo(const MachineInstr *Old, From bf0de88696095342aaa58e5e0a0105403d5ebd5e Mon Sep 17 00:00:00 2001 From: Nuno Lopes Date: Wed, 16 Apr 2025 08:38:37 +0100 Subject: [PATCH 093/710] code format checker: fix python error when the diff becomes empty --- llvm/utils/git/code-format-helper.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llvm/utils/git/code-format-helper.py b/llvm/utils/git/code-format-helper.py index 69d654b87e856..ed102b54f9b52 100755 --- a/llvm/utils/git/code-format-helper.py +++ b/llvm/utils/git/code-format-helper.py @@ -385,7 +385,9 @@ def format_run(self, changed_files: List[str], args: FormatArgs) -> Optional[str # Each file is prefixed like: # diff --git a/file b/file for file in re.split("^diff --git ", stdout, 0, re.MULTILINE): - filename = re.match("a/([^ ]+)", file.splitlines()[0])[1] + lines = 
file.splitlines() + match = re.match("a/([^ ]+)", lines[0] if lines else "") + filename = match[1] if match else "" if filename.endswith(".ll"): undef_regex = r"(? Date: Wed, 16 Apr 2025 13:10:56 +0530 Subject: [PATCH 094/710] [RISCV] Add basic ISel patterns for Xqcisls instructions (#135918) This patch adds basic instruction selection patterns for generating the scaled load/store instructions that are a part of the Qualcomm uC Xqcisls vendor extension. --- llvm/lib/Target/RISCV/RISCVInstrInfo.td | 2 +- llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td | 28 +++ llvm/test/CodeGen/RISCV/xqcisls.ll | 207 ++++++++++++++++++++ 3 files changed, 236 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/RISCV/xqcisls.ll diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 1104d9089536f..baf2bae367df1 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -248,7 +248,7 @@ def InsnDirectiveOpcode : AsmOperandClass { def uimm1 : RISCVUImmLeafOp<1>; def uimm2 : RISCVUImmLeafOp<2>; -def uimm3 : RISCVUImmOp<3>; +def uimm3 : RISCVUImmLeafOp<3>; def uimm4 : RISCVUImmLeafOp<4>; def uimm5 : RISCVUImmLeafOp<5>; def uimm6 : RISCVUImmLeafOp<6>; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index 6736b0f1d0328..bb0818b001d38 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -164,6 +164,9 @@ def AddLike: PatFrags<(ops node:$A, node:$B), return CurDAG->isBaseWithConstantOffset(SDValue(N, 0)); }]>; +def AddShl : PatFrag<(ops node:$Ra, node:$Rb, node:$SH3), + (add node:$Ra, (shl node:$Rb, node:$SH3))>; + //===----------------------------------------------------------------------===// // Instruction Formats //===----------------------------------------------------------------------===// @@ -1252,6 +1255,14 @@ class QC48StPat : Pat<(StoreOp (i32 GPR:$rs2), (AddLike (i32 
GPR:$rs1), simm26_nosimm12:$imm26)), (Inst GPR:$rs2, GPR:$rs1, simm26_nosimm12:$imm26)>; +class QCScaledLdPat + : Pat<(i32 (LoadOp (AddShl (i32 GPRMem:$rs1), (i32 GPRNoX0:$rs2), uimm3:$shamt))), + (Inst GPRMem:$rs1, GPRNoX0:$rs2, uimm3:$shamt)>; + +class QCScaledStPat + : Pat<(StoreOp (i32 GPR:$rd), (AddShl (i32 GPRMem:$rs1), (i32 GPRNoX0:$rs2), uimm3:$shamt)), + (Inst GPR:$rd, GPRMem:$rs1, GPRNoX0:$rs2, uimm3:$shamt)>; + /// Simple arithmetic operations let Predicates = [HasVendorXqcilia, IsRV32] in { @@ -1266,6 +1277,8 @@ def : PatGprNoX0Simm26NoSimm12; def : PatGprNoX0Simm26NoSimm12; } // Predicates = [HasVendorXqcilia, IsRV32] +/// Load/Store operations + let Predicates = [HasVendorXqcilo, IsRV32], AddedComplexity = 2 in { def : QC48LdPat; def : QC48LdPat; // Prefer unsigned due to no c.lb in Zcb. @@ -1280,5 +1293,20 @@ let Predicates = [HasVendorXqcilo, IsRV32], AddedComplexity = 2 in { def : QC48StPat; } // Predicates = [HasVendorXqcilo, IsRV32], AddedComplexity = 2 + +let Predicates = [HasVendorXqcisls, IsRV32], AddedComplexity = 1 in { + def : QCScaledLdPat; + def : QCScaledLdPat; + def : QCScaledLdPat; + def : QCScaledLdPat; + def : QCScaledLdPat; + def : QCScaledLdPat; + def : QCScaledLdPat; + + def : QCScaledStPat; + def : QCScaledStPat; + def : QCScaledStPat; +} // Predicates = [HasVendorXqcisls, IsRV32], AddedComplexity = 1 + let Predicates = [HasVendorXqciint, IsRV32] in def : Pat<(riscv_mileaveret_glue), (QC_C_MILEAVERET)>; diff --git a/llvm/test/CodeGen/RISCV/xqcisls.ll b/llvm/test/CodeGen/RISCV/xqcisls.ll new file mode 100644 index 0000000000000..b9263d487b60f --- /dev/null +++ b/llvm/test/CodeGen/RISCV/xqcisls.ll @@ -0,0 +1,207 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefixes=RV32I +; RUN: llc -mtriple=riscv32 --mattr=+zba -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefixes=RV32IZBA 
+; RUN: llc -mtriple=riscv32 -mattr=+zba,+experimental-xqcisls -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefixes=RV32IZBAXQCISLS + +define i32 @lb_ri(i8* %a, i32 %b) { +; RV32I-LABEL: lb_ri: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a1, a1, 3 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: lb a0, 0(a0) +; RV32I-NEXT: ret +; +; RV32IZBA-LABEL: lb_ri: +; RV32IZBA: # %bb.0: +; RV32IZBA-NEXT: sh3add a0, a1, a0 +; RV32IZBA-NEXT: lb a0, 0(a0) +; RV32IZBA-NEXT: ret +; +; RV32IZBAXQCISLS-LABEL: lb_ri: +; RV32IZBAXQCISLS: # %bb.0: +; RV32IZBAXQCISLS-NEXT: qc.lrb a0, a0, a1, 3 +; RV32IZBAXQCISLS-NEXT: ret + %shl = shl i32 %b, 3 + %1 = getelementptr i8, i8* %a, i32 %shl + %2 = load i8, i8* %1 + %3 = sext i8 %2 to i32 + ret i32 %3 +} + +define i32 @lbu_ri(i8* %a, i32 %b) { +; RV32I-LABEL: lbu_ri: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a1, a1, 2 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: lbu a0, 0(a0) +; RV32I-NEXT: ret +; +; RV32IZBA-LABEL: lbu_ri: +; RV32IZBA: # %bb.0: +; RV32IZBA-NEXT: sh2add a0, a1, a0 +; RV32IZBA-NEXT: lbu a0, 0(a0) +; RV32IZBA-NEXT: ret +; +; RV32IZBAXQCISLS-LABEL: lbu_ri: +; RV32IZBAXQCISLS: # %bb.0: +; RV32IZBAXQCISLS-NEXT: qc.lrbu a0, a0, a1, 2 +; RV32IZBAXQCISLS-NEXT: ret + %shl = shl i32 %b, 2 + %1 = getelementptr i8, i8* %a, i32 %shl + %2 = load i8, i8* %1 + %3 = zext i8 %2 to i32 + ret i32 %3 +} + +define i32 @lh_ri(i16* %a, i32 %b) { +; RV32I-LABEL: lh_ri: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a1, a1, 5 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: lh a0, 0(a0) +; RV32I-NEXT: ret +; +; RV32IZBA-LABEL: lh_ri: +; RV32IZBA: # %bb.0: +; RV32IZBA-NEXT: slli a1, a1, 5 +; RV32IZBA-NEXT: add a0, a0, a1 +; RV32IZBA-NEXT: lh a0, 0(a0) +; RV32IZBA-NEXT: ret +; +; RV32IZBAXQCISLS-LABEL: lh_ri: +; RV32IZBAXQCISLS: # %bb.0: +; RV32IZBAXQCISLS-NEXT: qc.lrh a0, a0, a1, 5 +; RV32IZBAXQCISLS-NEXT: ret + %shl = shl i32 %b, 4 + %1 = getelementptr i16, i16* %a, i32 %shl + %2 = load i16, i16* %1 + %3 = sext i16 %2 to i32 + ret i32 %3 +} + +define i32 
@lhu_ri(i16* %a, i32 %b) { +; RV32I-LABEL: lhu_ri: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a1, a1, 6 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: lhu a0, 0(a0) +; RV32I-NEXT: ret +; +; RV32IZBA-LABEL: lhu_ri: +; RV32IZBA: # %bb.0: +; RV32IZBA-NEXT: slli a1, a1, 6 +; RV32IZBA-NEXT: add a0, a0, a1 +; RV32IZBA-NEXT: lhu a0, 0(a0) +; RV32IZBA-NEXT: ret +; +; RV32IZBAXQCISLS-LABEL: lhu_ri: +; RV32IZBAXQCISLS: # %bb.0: +; RV32IZBAXQCISLS-NEXT: qc.lrhu a0, a0, a1, 6 +; RV32IZBAXQCISLS-NEXT: ret + %shl = shl i32 %b, 5 + %1 = getelementptr i16, i16* %a, i32 %shl + %2 = load i16, i16* %1 + %3 = zext i16 %2 to i32 + ret i32 %3 +} + +define i32 @lw_ri(i32* %a, i32 %b) { +; RV32I-LABEL: lw_ri: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a1, a1, 6 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: ret +; +; RV32IZBA-LABEL: lw_ri: +; RV32IZBA: # %bb.0: +; RV32IZBA-NEXT: slli a1, a1, 6 +; RV32IZBA-NEXT: add a0, a0, a1 +; RV32IZBA-NEXT: lw a0, 0(a0) +; RV32IZBA-NEXT: ret +; +; RV32IZBAXQCISLS-LABEL: lw_ri: +; RV32IZBAXQCISLS: # %bb.0: +; RV32IZBAXQCISLS-NEXT: qc.lrw a0, a0, a1, 6 +; RV32IZBAXQCISLS-NEXT: ret + %shl = shl i32 %b, 4 + %1 = getelementptr i32, i32* %a, i32 %shl + %2 = load i32, i32* %1 + ret i32 %2 +} + +define void @sb_ri(i8* %a, i8 %b, i32 %c) { +; RV32I-LABEL: sb_ri: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a2, a2, 7 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: sb a1, 0(a0) +; RV32I-NEXT: ret +; +; RV32IZBA-LABEL: sb_ri: +; RV32IZBA: # %bb.0: +; RV32IZBA-NEXT: slli a2, a2, 7 +; RV32IZBA-NEXT: add a0, a0, a2 +; RV32IZBA-NEXT: sb a1, 0(a0) +; RV32IZBA-NEXT: ret +; +; RV32IZBAXQCISLS-LABEL: sb_ri: +; RV32IZBAXQCISLS: # %bb.0: +; RV32IZBAXQCISLS-NEXT: qc.srb a1, a0, a2, 7 +; RV32IZBAXQCISLS-NEXT: ret + %shl = shl i32 %c, 7 + %1 = getelementptr i8, i8* %a, i32 %shl + store i8 %b, i8* %1 + ret void +} + +define void @sh_ri(i16* %a, i16 %b, i32 %c) { +; RV32I-LABEL: sh_ri: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a2, a2, 3 +; RV32I-NEXT: add a0, a0, a2 +; 
RV32I-NEXT: sh a1, 0(a0) +; RV32I-NEXT: ret +; +; RV32IZBA-LABEL: sh_ri: +; RV32IZBA: # %bb.0: +; RV32IZBA-NEXT: sh3add a0, a2, a0 +; RV32IZBA-NEXT: sh a1, 0(a0) +; RV32IZBA-NEXT: ret +; +; RV32IZBAXQCISLS-LABEL: sh_ri: +; RV32IZBAXQCISLS: # %bb.0: +; RV32IZBAXQCISLS-NEXT: qc.srh a1, a0, a2, 3 +; RV32IZBAXQCISLS-NEXT: ret + %shl = shl i32 %c, 2 + %1 = getelementptr i16, i16* %a, i32 %shl + store i16 %b, i16* %1 + ret void +} + +define void @sw_ri(i32* %a, i32 %b, i32 %c) { +; RV32I-LABEL: sw_ri: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a2, a2, 3 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: sw a1, 0(a0) +; RV32I-NEXT: ret +; +; RV32IZBA-LABEL: sw_ri: +; RV32IZBA: # %bb.0: +; RV32IZBA-NEXT: sh3add a0, a2, a0 +; RV32IZBA-NEXT: sw a1, 0(a0) +; RV32IZBA-NEXT: ret +; +; RV32IZBAXQCISLS-LABEL: sw_ri: +; RV32IZBAXQCISLS: # %bb.0: +; RV32IZBAXQCISLS-NEXT: qc.srw a1, a0, a2, 3 +; RV32IZBAXQCISLS-NEXT: ret + %shl = shl i32 %c, 1 + %1 = getelementptr i32, i32* %a, i32 %shl + store i32 %b, i32* %1 + ret void +} From 559df834df4c5653227cb85129904004164e4c4f Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Wed, 16 Apr 2025 10:48:42 +0200 Subject: [PATCH 095/710] [clang][bytecode] Fix subtracting zero-sized pointers (#135929) Add the appropriate diagnostic and fix the d-d case. --- clang/lib/AST/ByteCode/Interp.h | 30 ++++++++++++-------------- clang/lib/AST/ByteCode/Pointer.h | 9 ++++++-- clang/test/AST/ByteCode/arrays.cpp | 6 +++++- clang/test/AST/ByteCode/new-delete.cpp | 25 +++++++++++++++++++++ 4 files changed, 51 insertions(+), 19 deletions(-) diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index b4e15b3ffbe68..88a011efe708e 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -2141,12 +2141,25 @@ static inline bool DecPtr(InterpState &S, CodePtr OpPC) { /// 1) Pops a Pointer from the stack. /// 2) Pops another Pointer from the stack. -/// 3) Pushes the different of the indices of the two pointers on the stack. 
+/// 3) Pushes the difference of the indices of the two pointers on the stack. template ::T> inline bool SubPtr(InterpState &S, CodePtr OpPC) { const Pointer &LHS = S.Stk.pop(); const Pointer &RHS = S.Stk.pop(); + if (!Pointer::hasSameBase(LHS, RHS) && S.getLangOpts().CPlusPlus) { + S.FFDiag(S.Current->getSource(OpPC), + diag::note_constexpr_pointer_arith_unspecified) + << LHS.toDiagnosticString(S.getASTContext()) + << RHS.toDiagnosticString(S.getASTContext()); + return false; + } + + if (LHS == RHS) { + S.Stk.push(); + return true; + } + for (const Pointer &P : {LHS, RHS}) { if (P.isZeroSizeArray()) { QualType PtrT = P.getType(); @@ -2163,21 +2176,6 @@ inline bool SubPtr(InterpState &S, CodePtr OpPC) { } } - if (RHS.isZero()) { - S.Stk.push(T::from(LHS.getIndex())); - return true; - } - - if (!Pointer::hasSameBase(LHS, RHS) && S.getLangOpts().CPlusPlus) { - // TODO: Diagnose. - return false; - } - - if (LHS.isZero() && RHS.isZero()) { - S.Stk.push(); - return true; - } - T A = LHS.isBlockPointer() ? (LHS.isElementPastEnd() ? 
T::from(LHS.getNumElems()) : T::from(LHS.getIndex())) diff --git a/clang/lib/AST/ByteCode/Pointer.h b/clang/lib/AST/ByteCode/Pointer.h index 5eef9d2e1885e..8ede706f2736f 100644 --- a/clang/lib/AST/ByteCode/Pointer.h +++ b/clang/lib/AST/ByteCode/Pointer.h @@ -129,12 +129,17 @@ class Pointer { return false; if (isIntegralPointer()) return P.asIntPointer().Value == asIntPointer().Value && - Offset == P.Offset; + P.asIntPointer().Desc == asIntPointer().Desc && P.Offset == Offset; + + if (isFunctionPointer()) + return P.asFunctionPointer().getFunction() == + asFunctionPointer().getFunction() && + P.Offset == Offset; assert(isBlockPointer()); return P.asBlockPointer().Pointee == asBlockPointer().Pointee && P.asBlockPointer().Base == asBlockPointer().Base && - Offset == P.Offset; + P.Offset == Offset; } bool operator!=(const Pointer &P) const { return !(P == *this); } diff --git a/clang/test/AST/ByteCode/arrays.cpp b/clang/test/AST/ByteCode/arrays.cpp index 8af82163fd815..f60cc19b09bd2 100644 --- a/clang/test/AST/ByteCode/arrays.cpp +++ b/clang/test/AST/ByteCode/arrays.cpp @@ -106,7 +106,8 @@ constexpr int k1 = &arr[1] - &arr[0]; static_assert(k1 == 1, ""); static_assert((&arr[0] - &arr[1]) == -1, ""); -constexpr int k2 = &arr2[1] - &arr[0]; // both-error {{must be initialized by a constant expression}} +constexpr int k2 = &arr2[1] - &arr[0]; // both-error {{must be initialized by a constant expression}} \ + // expected-note {{arithmetic involving unrelated objects}} static_assert((arr + 0) == arr, ""); static_assert(&arr[0] == arr, ""); @@ -735,6 +736,9 @@ namespace ZeroSizeTypes { return &arr[3] - &arr[0]; // both-note {{subtraction of pointers to type 'int[0]' of zero size}} \ // both-warning {{subtraction of pointers to type 'int[0]' of zero size has undefined behavior}} } + + constexpr int z[0]{}; + static_assert((z - z) == 0); } namespace InvalidIndex { diff --git a/clang/test/AST/ByteCode/new-delete.cpp b/clang/test/AST/ByteCode/new-delete.cpp index 
bd7351cbc3d4c..5ddd7070f6710 100644 --- a/clang/test/AST/ByteCode/new-delete.cpp +++ b/clang/test/AST/ByteCode/new-delete.cpp @@ -967,6 +967,31 @@ namespace PR45350 { static_assert(f(6) == 543210); } +namespace ZeroSizeSub { + consteval unsigned ptr_diff1() { + int *b = new int[0]; + unsigned d = 0; + d = b - b; + delete[] b; + + return d; + } + static_assert(ptr_diff1() == 0); + + + consteval unsigned ptr_diff2() { // both-error {{never produces a constant expression}} + int *a = new int[0]; + int *b = new int[0]; + + unsigned d = a - b; // both-note 2{{arithmetic involving unrelated objects}} + delete[] b; + delete[] a; + return d; + } + static_assert(ptr_diff2() == 0); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} +} + #else /// Make sure we reject this prior to C++20 constexpr int a() { // both-error {{never produces a constant expression}} From 70d34e4bfd32d8518a70226d3d68398d94c1d68f Mon Sep 17 00:00:00 2001 From: Hendrik_Klug <43926224+Jimmy2027@users.noreply.github.com> Date: Wed, 16 Apr 2025 11:00:48 +0200 Subject: [PATCH 096/710] [MLIR][Linalg] Remove debug print from FoldIntoElementwise pass (#135928) --- mlir/lib/Dialect/Linalg/Transforms/FoldIntoElementwise.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/Transforms/FoldIntoElementwise.cpp b/mlir/lib/Dialect/Linalg/Transforms/FoldIntoElementwise.cpp index bdd4f6025b051..fcf049e9ce722 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/FoldIntoElementwise.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/FoldIntoElementwise.cpp @@ -72,7 +72,6 @@ struct LinalgFoldIntoElementwisePass LinalgFoldIntoElementwisePass>::LinalgFoldIntoElementwisePassBase; void runOnOperation() override { - llvm::outs() << "Hellow from fold into elemenwise \n"; Operation *op = getOperation(); RewritePatternSet patterns(op->getContext()); populateLinalgFoldIntoElementwisePatterns(patterns); @@ -86,4 +85,4 @@ struct LinalgFoldIntoElementwisePass void 
mlir::linalg::populateLinalgFoldIntoElementwisePatterns( RewritePatternSet &patterns) { patterns.add(patterns.getContext()); -} \ No newline at end of file +} From 51b8c66b0867154730d07e1ee4016b5116440293 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=BC=D0=B8=D1=82=D1=80=D0=B8=D0=B9=20=D0=98=D0=B7?= =?UTF-8?q?=D0=B2=D0=BE=D0=BB=D0=BE=D0=B2?= Date: Wed, 16 Apr 2025 12:34:11 +0300 Subject: [PATCH 097/710] [libc++] Extend the scope of radix sorting inside std::stable_sort to floating-point types (#129452) These changes speed up `std::stable_sort` in the case of sorting floating-point types. This applies only to IEEE 754 floats. The speedup is similar to that achieved for integers in PR #104683 (see benchmarks below). Why does this worth doing? Previously, `std::stable_sort` had almost no chance of beating `std::sort`. Now there are cases when `std::stable_sort` is preferrable, and the difference is significant. ``` --------------------------------------------------------------------------- Benchmark | std::stable_sort | std::sort | std::stable_sort | without radix_sort | | with radix_sort --------------------------------------------------------------------------- float_Random_1 | 1.62 ns | 2.15 ns | 1.61 ns float_Random_4 | 18.0 ns | 2.71 ns | 16.3 ns float_Random_16 | 118 ns | 113 ns | 112 ns float_Random_64 | 751 ns | 647 ns | 730 ns float_Random_256 | 4715 ns | 2937 ns | 4669 ns float_Random_1024 | 25713 ns | 13172 ns | 5959 ns <-- float_Random_4096 | 131307 ns | 56870 ns | 19294 ns <-- float_Random_16384 | 624996 ns | 242953 ns | 64264 ns <-- float_Random_65536 | 2895661 ns | 1027279 ns | 288553 ns <-- float_Random_262144 | 13285372 ns | 4342593 ns | 3022377 ns <-- float_Random_1048576 | 60595871 ns | 19087591 ns | 18690457 ns <-- float_Random_2097152 | 131336117 ns | 38800396 ns | 52325016 ns float_Random_4194304 | 270043042 ns | 79978019 ns | 102907726 ns double_Random_1 | 1.60 ns | 2.15 ns | 1.61 ns double_Random_4 | 15.2 ns | 2.70 ns | 16.9 ns 
double_Random_16 | 104 ns | 112 ns | 119 ns double_Random_64 | 712 ns | 614 ns | 755 ns double_Random_256 | 4496 ns | 2966 ns | 4820 ns double_Random_1024 | 24722 ns | 12679 ns | 6189 ns <-- double_Random_4096 | 126075 ns | 54484 ns | 20999 ns <-- double_Random_16384 | 613782 ns | 232557 ns | 110276 ns <-- double_Random_65536 | 2894972 ns | 988531 ns | 774302 ns <-- double_Random_262144 | 13460273 ns | 4278059 ns | 5115123 ns double_Random_1048576 | 61119996 ns | 18408462 ns | 27166574 ns double_Random_2097152 | 132511525 ns | 37986158 ns | 54423869 ns double_Random_4194304 | 272949862 ns | 77912616 ns | 147670834 ns ``` Comparison for only `std::stable_sort`: ``` Benchmark Time Time Old Time New -------------------------------------------------------------------------------------------------- BM_StableSort_float_Random_1024 -0.7997 25438 5096 BM_StableSort_float_Random_4096 -0.8731 128157 16260 BM_StableSort_float_Random_16384 -0.9024 621271 60623 BM_StableSort_float_Random_65536 -0.9081 2922413 268619 BM_StableSort_float_Random_262144 -0.7766 13386345 2990408 BM_StableSort_float_Random_1048576 -0.6954 60673010 18481751 BM_StableSort_float_Random_2097152 -0.6026 130977358 52052182 BM_StableSort_float_Random_4194304 -0.6252 271556583 101770500 BM_StableSort_float_Ascending_1024 -0.6430 6711 2396 BM_StableSort_float_Ascending_4096 -0.7979 38460 7773 BM_StableSort_float_Ascending_16384 -0.8471 191069 29222 BM_StableSort_float_Ascending_65536 -0.8683 882321 116194 BM_StableSort_float_Ascending_262144 -0.8346 3868552 639937 BM_StableSort_float_Ascending_1048576 -0.7460 16521233 4195953 BM_StableSort_float_Ascending_2097152 -0.5439 21757532 9922776 BM_StableSort_float_Ascending_4194304 -0.7525 67847496 16791582 BM_StableSort_float_Descending_1024 -0.6359 15038 5475 BM_StableSort_float_Descending_4096 -0.7090 62810 18278 BM_StableSort_float_Descending_16384 -0.7763 311844 69750 BM_StableSort_float_Descending_65536 -0.7228 1270513 352202 
BM_StableSort_float_Descending_262144 -0.6785 5484173 1763045 BM_StableSort_float_Descending_1048576 -0.5084 20223149 9941852 BM_StableSort_float_Descending_2097152 -0.7646 60523254 14247014 BM_StableSort_float_Descending_4194304 -0.5638 95706839 41748858 BM_StableSort_float_SingleElement_1024 +0.3715 1732 2375 BM_StableSort_float_SingleElement_4096 -0.1685 9357 7781 BM_StableSort_float_SingleElement_16384 -0.3793 47307 29362 BM_StableSort_float_SingleElement_65536 -0.4925 227666 115536 BM_StableSort_float_SingleElement_262144 -0.4271 1075853 616387 BM_StableSort_float_SingleElement_1048576 -0.3736 5097599 3193279 BM_StableSort_float_SingleElement_2097152 -0.2470 9854161 7420158 BM_StableSort_float_SingleElement_4194304 -0.3384 22175964 14670720 BM_StableSort_float_PipeOrgan_1024 -0.4885 10664 5455 BM_StableSort_float_PipeOrgan_4096 -0.6340 50095 18337 BM_StableSort_float_PipeOrgan_16384 -0.7078 238700 69739 BM_StableSort_float_PipeOrgan_65536 -0.6740 1102419 359378 BM_StableSort_float_PipeOrgan_262144 -0.7460 4698739 1193511 BM_StableSort_float_PipeOrgan_1048576 -0.5657 18493972 8032392 BM_StableSort_float_PipeOrgan_2097152 -0.7116 41089206 11850349 BM_StableSort_float_PipeOrgan_4194304 -0.6650 83445011 27955737 BM_StableSort_float_QuickSortAdversary_1024 -0.6863 17402 5460 BM_StableSort_float_QuickSortAdversary_4096 -0.7715 79864 18247 BM_StableSort_float_QuickSortAdversary_16384 -0.7800 317480 69839 BM_StableSort_float_QuickSortAdversary_65536 -0.7400 1357601 352967 BM_StableSort_float_QuickSortAdversary_262144 -0.6450 5662094 2009769 BM_StableSort_float_QuickSortAdversary_1048576 -0.5092 21173627 10392107 BM_StableSort_float_QuickSortAdversary_2097152 -0.7333 61748178 16469993 BM_StableSort_float_QuickSortAdversary_4194304 -0.5607 98459863 43250182 BM_StableSort_double_Random_1024 -0.7657 24769 5802 BM_StableSort_double_Random_4096 -0.8441 126449 19717 BM_StableSort_double_Random_16384 -0.8269 614910 106447 BM_StableSort_double_Random_65536 -0.7413 2905000 
751427 BM_StableSort_double_Random_262144 -0.6287 13449514 4994348 BM_StableSort_double_Random_1048576 -0.5635 60863246 26568349 BM_StableSort_double_Random_2097152 -0.5959 130293892 52654532 BM_StableSort_double_Random_4194304 -0.4772 272616445 142526267 BM_StableSort_double_Ascending_1024 -0.4870 6757 3466 BM_StableSort_double_Ascending_4096 -0.7360 37592 9923 BM_StableSort_double_Ascending_16384 -0.7971 183967 37324 BM_StableSort_double_Ascending_65536 -0.7465 897116 227398 BM_StableSort_double_Ascending_262144 -0.6764 4020980 1301033 BM_StableSort_double_Ascending_1048576 -0.6407 16421799 5900751 BM_StableSort_double_Ascending_2097152 -0.6380 29347139 10622419 BM_StableSort_double_Ascending_4194304 -0.5934 70439925 28644185 BM_StableSort_double_Descending_1024 -0.5988 15216 6105 BM_StableSort_double_Descending_4096 -0.6857 65069 20449 BM_StableSort_double_Descending_16384 -0.6922 329321 101381 BM_StableSort_double_Descending_65536 -0.7038 1367970 405242 BM_StableSort_double_Descending_262144 -0.6472 5361644 1891429 BM_StableSort_double_Descending_1048576 -0.6656 22031404 7366459 BM_StableSort_double_Descending_2097152 -0.7593 68922467 16591242 BM_StableSort_double_Descending_4194304 -0.6392 96283643 34743223 BM_StableSort_double_SingleElement_1024 +0.9128 1895 3625 BM_StableSort_double_SingleElement_4096 +0.1475 10013 11490 BM_StableSort_double_SingleElement_16384 -0.1901 52382 42424 BM_StableSort_double_SingleElement_65536 -0.2096 254698 201313 BM_StableSort_double_SingleElement_262144 -0.1833 1248478 1019648 BM_StableSort_double_SingleElement_1048576 -0.1741 5703397 4710603 BM_StableSort_double_SingleElement_2097152 -0.1751 10922197 9009835 BM_StableSort_double_SingleElement_4194304 -0.1538 26571923 22485137 BM_StableSort_double_PipeOrgan_1024 -0.4406 10752 6014 BM_StableSort_double_PipeOrgan_4096 -0.5917 49456 20195 BM_StableSort_double_PipeOrgan_16384 -0.6258 270515 101221 BM_StableSort_double_PipeOrgan_65536 -0.7098 1159462 336457 
BM_StableSort_double_PipeOrgan_262144 -0.6591 4735711 1614433 BM_StableSort_double_PipeOrgan_1048576 -0.6620 19353110 6541172 BM_StableSort_double_PipeOrgan_2097152 -0.7288 49131812 13323391 BM_StableSort_double_PipeOrgan_4194304 -0.5988 81958974 32878171 BM_StableSort_double_QuickSortAdversary_1024 -0.6516 17948 6254 BM_StableSort_double_QuickSortAdversary_4096 -0.7527 82359 20363 BM_StableSort_double_QuickSortAdversary_16384 -0.7009 340410 101811 BM_StableSort_double_QuickSortAdversary_65536 -0.6854 1487480 467928 BM_StableSort_double_QuickSortAdversary_262144 -0.6386 5648460 2041377 BM_StableSort_double_QuickSortAdversary_1048576 -0.6127 22859142 8852587 BM_StableSort_double_QuickSortAdversary_2097152 -0.7161 68693975 19499381 BM_StableSort_double_QuickSortAdversary_4194304 -0.5909 95532179 39077491 OVERALL_GEOMEAN -0.6472 0 0 ``` --- libcxx/docs/ReleaseNotes/21.rst | 3 + libcxx/include/__algorithm/radix_sort.h | 100 +++++++++++++++++- libcxx/include/__algorithm/stable_sort.h | 15 ++- .../alg.sort/stable.sort/stable_sort.pass.cpp | 59 ++++++++++- 4 files changed, 166 insertions(+), 11 deletions(-) diff --git a/libcxx/docs/ReleaseNotes/21.rst b/libcxx/docs/ReleaseNotes/21.rst index ac54922eb4c59..2091a713ea200 100644 --- a/libcxx/docs/ReleaseNotes/21.rst +++ b/libcxx/docs/ReleaseNotes/21.rst @@ -64,6 +64,9 @@ Improvements and New Features - The ``num_put::do_put`` integral overloads have been optimized, resulting in a performance improvement of up to 2.4x. +- The ``std::stable_sort`` algorithm uses radix sort for floating-point types now, which can improve the performance + up to 10x, depending on type of sorted elements and the initial state of the sorted array. 
+ Deprecations and Removals ------------------------- diff --git a/libcxx/include/__algorithm/radix_sort.h b/libcxx/include/__algorithm/radix_sort.h index f6d9fb1ad7ca9..055d8a0765d7c 100644 --- a/libcxx/include/__algorithm/radix_sort.h +++ b/libcxx/include/__algorithm/radix_sort.h @@ -29,9 +29,10 @@ #include <__algorithm/for_each.h> #include <__algorithm/move.h> +#include <__bit/bit_cast.h> #include <__bit/bit_log2.h> -#include <__bit/countl.h> #include <__config> +#include <__cstddef/size_t.h> #include <__functional/identity.h> #include <__iterator/access.h> #include <__iterator/distance.h> @@ -44,9 +45,12 @@ #include <__type_traits/enable_if.h> #include <__type_traits/invoke.h> #include <__type_traits/is_assignable.h> +#include <__type_traits/is_enum.h> #include <__type_traits/is_integral.h> #include <__type_traits/is_unsigned.h> #include <__type_traits/make_unsigned.h> +#include <__type_traits/void_t.h> +#include <__utility/declval.h> #include <__utility/forward.h> #include <__utility/integer_sequence.h> #include <__utility/move.h> @@ -298,6 +302,96 @@ _LIBCPP_HIDE_FROM_ABI constexpr auto __shift_to_unsigned(_Ip __n) { return static_cast >(__n ^ __min_value); } +template +struct __unsigned_integer_of_size; + +template <> +struct __unsigned_integer_of_size<1> { + using type _LIBCPP_NODEBUG = uint8_t; +}; + +template <> +struct __unsigned_integer_of_size<2> { + using type _LIBCPP_NODEBUG = uint16_t; +}; + +template <> +struct __unsigned_integer_of_size<4> { + using type _LIBCPP_NODEBUG = uint32_t; +}; + +template <> +struct __unsigned_integer_of_size<8> { + using type _LIBCPP_NODEBUG = uint64_t; +}; + +# if _LIBCPP_HAS_INT128 +template <> +struct __unsigned_integer_of_size<16> { + using type _LIBCPP_NODEBUG = unsigned __int128; +}; +# endif + +template +using __unsigned_integer_of_size_t _LIBCPP_NODEBUG = typename __unsigned_integer_of_size<_Size>::type; + +template +using __unsigned_representation_for_t _LIBCPP_NODEBUG = __unsigned_integer_of_size_t; + +// The 
function `__to_ordered_integral` is defined for integers and IEEE 754 floating-point numbers. +// Returns an integer representation such that for any `x` and `y` such that `x < y`, the expression +// `__to_ordered_integral(x) < __to_ordered_integral(y)` is true, where `x`, `y` are integers or IEEE 754 floats. +template ::value, int> = 0> +_LIBCPP_HIDE_FROM_ABI constexpr auto __to_ordered_integral(_Integral __n) { + return __n; +} + +// An overload for IEEE 754 floating-point numbers + +// For the floats conforming to IEEE 754 (IEC 559) standard, we know that: +// 1. The bit representation of positive floats directly reflects their order: +// When comparing floats by magnitude, the number with the larger exponent is greater, and if the exponents are +// equal, the one with the larger mantissa is greater. +// 2. The bit representation of negative floats reflects their reverse order (for the same reasons). +// 3. The most significant bit (sign bit) is zero for positive floats and one for negative floats. Therefore, in the raw +// bit representation, any negative number will be greater than any positive number. + +// The only exception from this rule is `NaN`, which is unordered by definition. + +// Based on the above, to obtain correctly ordered integral representation of floating-point numbers, we need to: +// 1. Invert the bit representation (including the sign bit) of negative floats to switch from reverse order to direct +// order; +// 2. Invert the sign bit for positive floats. + +// Thus, in final integral representation, we have reversed the order for negative floats and made all negative floats +// smaller than all positive numbers (by inverting the sign bit). 
+template ::is_iec559, int> = 0> +_LIBCPP_HIDE_FROM_ABI constexpr auto __to_ordered_integral(_Floating __f) { + using __integral_type = __unsigned_representation_for_t<_Floating>; + constexpr auto __bit_count = std::numeric_limits<__integral_type>::digits; + constexpr auto __sign_bit_mask = static_cast<__integral_type>(__integral_type{1} << (__bit_count - 1)); + + const auto __u = std::__bit_cast<__integral_type>(__f); + + return static_cast<__integral_type>(__u & __sign_bit_mask ? ~__u : __u ^ __sign_bit_mask); +} + +// There may exist user-defined comparison for enum, so we cannot compare enums just like integers. +template ::value, int> = 0> +_LIBCPP_HIDE_FROM_ABI constexpr auto __to_ordered_integral(_Enum __e) = delete; + +// `long double` varies significantly across platforms and compilers, making it practically +// impossible to determine its actual bit width for conversion to an ordered integer. +inline _LIBCPP_HIDE_FROM_ABI constexpr auto __to_ordered_integral(long double) = delete; + +template +inline const bool __is_ordered_integer_representable_v = false; + +template +inline const bool + __is_ordered_integer_representable_v<_Tp, __void_t()))>> = + true; + struct __low_byte_fn { template _LIBCPP_HIDE_FROM_ABI constexpr uint8_t operator()(_Ip __integer) const { @@ -314,7 +408,9 @@ __radix_sort(_RandomAccessIterator1 __first, _RandomAccessIterator2 __buffer, _Map __map, _Radix __radix) { - auto __map_to_unsigned = [__map = std::move(__map)](const auto& __x) { return std::__shift_to_unsigned(__map(__x)); }; + auto __map_to_unsigned = [__map = std::move(__map)](const auto& __x) { + return std::__shift_to_unsigned(__map(std::__to_ordered_integral(__x))); + }; std::__radix_sort_impl(__first, __last, __buffer, __map_to_unsigned, __radix); } diff --git a/libcxx/include/__algorithm/stable_sort.h b/libcxx/include/__algorithm/stable_sort.h index c7f9780e3f627..1ca66f6a51687 100644 --- a/libcxx/include/__algorithm/stable_sort.h +++ 
b/libcxx/include/__algorithm/stable_sort.h @@ -26,7 +26,6 @@ #include <__type_traits/desugars_to.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_constant_evaluated.h> -#include <__type_traits/is_integral.h> #include <__type_traits/is_same.h> #include <__type_traits/is_trivially_assignable.h> #include <__utility/move.h> @@ -201,7 +200,7 @@ struct __stable_sort_switch { #if _LIBCPP_STD_VER >= 17 template _LIBCPP_HIDE_FROM_ABI constexpr unsigned __radix_sort_min_bound() { - static_assert(is_integral<_Tp>::value); + static_assert(__is_ordered_integer_representable_v<_Tp>); if constexpr (sizeof(_Tp) == 1) { return 1 << 8; } @@ -211,7 +210,7 @@ _LIBCPP_HIDE_FROM_ABI constexpr unsigned __radix_sort_min_bound() { template _LIBCPP_HIDE_FROM_ABI constexpr unsigned __radix_sort_max_bound() { - static_assert(is_integral<_Tp>::value); + static_assert(__is_ordered_integer_representable_v<_Tp>); if constexpr (sizeof(_Tp) >= 8) { return 1 << 15; } @@ -245,11 +244,11 @@ _LIBCPP_CONSTEXPR_SINCE_CXX26 void __stable_sort( } #if _LIBCPP_STD_VER >= 17 - constexpr auto __default_comp = __desugars_to_v<__totally_ordered_less_tag, _Compare, value_type, value_type >; - constexpr auto __integral_value = - is_integral_v && is_same_v< value_type&, __iter_reference<_RandomAccessIterator>>; - constexpr auto __allowed_radix_sort = __default_comp && __integral_value; - if constexpr (__allowed_radix_sort) { + constexpr auto __default_comp = __desugars_to_v<__less_tag, _Compare, value_type, value_type >; + constexpr auto __radix_sortable = + __is_ordered_integer_representable_v && + is_same_v< value_type&, __iter_reference<_RandomAccessIterator>>; + if constexpr (__default_comp && __radix_sortable) { if (__len <= __buff_size && __len >= static_cast(std::__radix_sort_min_bound()) && __len <= static_cast(std::__radix_sort_max_bound())) { if (__libcpp_is_constant_evaluated()) { diff --git a/libcxx/test/std/algorithms/alg.sorting/alg.sort/stable.sort/stable_sort.pass.cpp 
b/libcxx/test/std/algorithms/alg.sorting/alg.sort/stable.sort/stable_sort.pass.cpp index 4ee1d795a23b2..e05457492db32 100644 --- a/libcxx/test/std/algorithms/alg.sorting/alg.sort/stable.sort/stable_sort.pass.cpp +++ b/libcxx/test/std/algorithms/alg.sorting/alg.sort/stable.sort/stable_sort.pass.cpp @@ -18,7 +18,9 @@ #include #include #include +#include #include +#include #include #include "count_new.h" @@ -68,6 +70,13 @@ TEST_CONSTEXPR_CXX26 std::vector generate_sawtooth(int N, int M) { if (++x == M) x = 0; } + + if (std::is_signed::value) { + for (auto& a : v) { + a -= (M / 2); + } + } + return v; } @@ -193,12 +202,60 @@ TEST_CONSTEXPR_CXX26 bool test() { return true; } +template +bool test_floating_special_values() { + static_assert(std::is_floating_point::value, ""); + + auto v = generate_sawtooth(1024, 512); + v.insert(v.end(), 256, static_cast(0.0)); + v.insert(v.end(), 256, static_cast(-0.0)); + v.insert(v.end(), 256, std::numeric_limits::infinity()); + v.insert(v.end(), 256, -std::numeric_limits::infinity()); + + std::mt19937 randomness; + std::shuffle(v.begin(), v.end(), randomness); + + std::stable_sort(v.begin(), v.end()); + assert(std::is_sorted(v.begin(), v.end())); + + return true; +} + +template +bool test_floating() { + return test() && test_floating_special_values(); +} + +enum struct Enum : int { a, b, c, d, e, f, g, h }; +TEST_CONSTEXPR_CXX26 bool operator<(Enum x, Enum y) { return static_cast(x) > static_cast(y); } + +TEST_CONSTEXPR_CXX26 bool test_enum() { + auto v = std::vector(128, Enum::a); + v.resize(v.size() + 128, Enum::b); + v.resize(v.size() + 128, Enum::c); + v.resize(v.size() + 128, Enum::d); + v.resize(v.size() + 128, Enum::e); + v.resize(v.size() + 128, Enum::f); + v.resize(v.size() + 128, Enum::g); + v.resize(v.size() + 128, Enum::h); + + // Order is reversed by definition + std::stable_sort(v.begin(), v.end()); + assert(std::is_sorted(v.begin(), v.end())); + + return true; +} + int main(int, char**) { test(); - test(); + 
test_floating(); + test_floating(); + test_floating(); + test_enum(); #if TEST_STD_VER >= 26 static_assert(test()); static_assert(test()); + static_assert(test()); // test constexprness of radix sort branch static_assert(test()); #endif From 1f96aea037e612d87b4e2e20825973e45680921c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 16 Apr 2025 10:44:40 +0100 Subject: [PATCH 098/710] [X86] Add test coverage for #135917 --- llvm/test/CodeGen/X86/pr135917.ll | 58 +++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 llvm/test/CodeGen/X86/pr135917.ll diff --git a/llvm/test/CodeGen/X86/pr135917.ll b/llvm/test/CodeGen/X86/pr135917.ll new file mode 100644 index 0000000000000..9eed955128b74 --- /dev/null +++ b/llvm/test/CodeGen/X86/pr135917.ll @@ -0,0 +1,58 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefix=SSE4 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefix=AVX512 + +define i32 @PR135917(i1 %a0) { +; SSE2-LABEL: PR135917: +; SSE2: # %bb.0: +; SSE2-NEXT: movd %edi, %xmm0 +; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: addl %ecx, %eax +; SSE2-NEXT: retq +; +; SSE4-LABEL: PR135917: +; SSE4: # %bb.0: +; SSE4-NEXT: movd %edi, %xmm0 +; SSE4-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE4-NEXT: movd %xmm0, %ecx +; SSE4-NEXT: pextrd $1, %xmm0, %eax +; SSE4-NEXT: addl %ecx, %eax +; SSE4-NEXT: retq +; +; AVX2-LABEL: PR135917: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovd %edi, %xmm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = 
[1,1,1,1] +; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %ecx +; AVX2-NEXT: vpextrd $1, %xmm0, %eax +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: retq +; +; AVX512-LABEL: PR135917: +; AVX512: # %bb.0: +; AVX512-NEXT: andb $1, %dil +; AVX512-NEXT: negb %dil +; AVX512-NEXT: kmovd %edi, %k0 +; AVX512-NEXT: knotw %k0, %k0 +; AVX512-NEXT: vpmovm2d %k0, %xmm0 +; AVX512-NEXT: vpsrld $31, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %ecx +; AVX512-NEXT: vpextrd $1, %xmm0, %eax +; AVX512-NEXT: addl %ecx, %eax +; AVX512-NEXT: retq + %splat = insertelement <4 x i1> poison, i1 %a0, i64 0 + %xor = xor <4 x i1> %splat, splat (i1 true) + %not = shufflevector <4 x i1> %xor, <4 x i1> poison, <4 x i32> zeroinitializer + %zext = zext <4 x i1> %not to <4 x i32> + %elt0 = extractelement <4 x i32> %zext, i64 0 + %elt1 = extractelement <4 x i32> %zext, i64 1 + %res = add i32 %elt0, %elt1 + ret i32 %res +} + From b9ce185d4e542dde5e8d152f30314b6637a0d87b Mon Sep 17 00:00:00 2001 From: Robert Konicar Date: Wed, 16 Apr 2025 11:49:19 +0200 Subject: [PATCH 099/710] [MLIR][LLVM] Fix #llvm.constant_range crashing in storage uniquer (#135772) Add APIntParameter with custom implementation for comparison and use it in llvm.constant_range attribute. This is necessary because the default equality operator of APInt asserts when the bit widths of the compared APInts differ. The comparison is used by StorageUniquer when hashes of two ranges with different bit widths collide. 
--- mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td | 4 ++-- mlir/include/mlir/IR/AttrTypeBase.td | 8 ++++++++ mlir/test/Dialect/LLVMIR/range-attr.mlir | 10 ++++++++++ 3 files changed, 20 insertions(+), 2 deletions(-) create mode 100644 mlir/test/Dialect/LLVMIR/range-attr.mlir diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td index 690243525ede4..bb528fec8c684 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td @@ -1095,8 +1095,8 @@ def LLVM_TBAATagArrayAttr //===----------------------------------------------------------------------===// def LLVM_ConstantRangeAttr : LLVM_Attr<"ConstantRange", "constant_range"> { let parameters = (ins - "::llvm::APInt":$lower, - "::llvm::APInt":$upper + APIntParameter<"">:$lower, + APIntParameter<"">:$upper ); let summary = "A range of two integers, corresponding to LLVM's ConstantRange"; let description = [{ diff --git a/mlir/include/mlir/IR/AttrTypeBase.td b/mlir/include/mlir/IR/AttrTypeBase.td index 38d38cf098df3..f6ec4989c9787 100644 --- a/mlir/include/mlir/IR/AttrTypeBase.td +++ b/mlir/include/mlir/IR/AttrTypeBase.td @@ -383,6 +383,14 @@ class StringRefParameter : let defaultValue = value; } +// For APInts, which require comparison supporting different bitwidths. The +// default APInt comparison operator asserts when the bitwidths differ, so +// a custom implementation is necessary. +class APIntParameter : + AttrOrTypeParameter<"::llvm::APInt", desc> { + let comparator = "$_lhs.getBitWidth() == $_rhs.getBitWidth() && $_lhs == $_rhs"; +} + // For APFloats, which require comparison. 
class APFloatParameter : AttrOrTypeParameter<"::llvm::APFloat", desc> { diff --git a/mlir/test/Dialect/LLVMIR/range-attr.mlir b/mlir/test/Dialect/LLVMIR/range-attr.mlir new file mode 100644 index 0000000000000..5f2b67609743b --- /dev/null +++ b/mlir/test/Dialect/LLVMIR/range-attr.mlir @@ -0,0 +1,10 @@ +// RUN: mlir-opt %s -o - | FileCheck %s + +// CHECK: #llvm.constant_range +llvm.func external @foo1(!llvm.ptr, i64) -> (i32 {llvm.range = #llvm.constant_range}) +// CHECK: #llvm.constant_range +llvm.func external @foo2(!llvm.ptr, i64) -> (i8 {llvm.range = #llvm.constant_range}) +// CHECK: #llvm.constant_range +llvm.func external @foo3(!llvm.ptr, i64) -> (i64 {llvm.range = #llvm.constant_range}) +// CHECK: #llvm.constant_range +llvm.func external @foo4(!llvm.ptr, i64) -> (i32 {llvm.range = #llvm.constant_range}) From 3d7e56fd28cd2195e7f330f933d491530e274401 Mon Sep 17 00:00:00 2001 From: Jonathan Thackray Date: Wed, 16 Apr 2025 10:59:07 +0100 Subject: [PATCH 100/710] [AArch64][clang][llvm] Add structured sparsity outer product (TMOP) intrinsics (#135145) Implement all {BF/F/S/U/SU/US}TMOP intrinsics in clang and llvm following the ACLE in https://github.com/ARM-software/acle/pull/380/files --- clang/include/clang/Basic/arm_sme.td | 27 +++ .../AArch64/sme2-intrinsics/acle_sme2_tmop.c | 202 ++++++++++++++++++ .../acle_sme2_tmop.cpp | 120 +++++++++++ llvm/include/llvm/IR/IntrinsicsAArch64.td | 18 ++ .../lib/Target/AArch64/AArch64RegisterInfo.td | 6 +- .../lib/Target/AArch64/AArch64SMEInstrInfo.td | 53 +++-- llvm/lib/Target/AArch64/SMEInstrFormats.td | 68 +++--- .../AArch64/GlobalISel/regbank-inlineasm.mir | 8 +- .../emit_fneg_with_non_register_operand.mir | 8 +- .../CodeGen/AArch64/peephole-insvigpr.mir | 4 +- .../CodeGen/AArch64/sme2-intrinsics-tmop.ll | 162 ++++++++++++++ 11 files changed, 605 insertions(+), 71 deletions(-) create mode 100644 clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_tmop.c create mode 100644 
clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_tmop.cpp create mode 100644 llvm/test/CodeGen/AArch64/sme2-intrinsics-tmop.ll diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td index 1bfcf4c31d552..3647fc7a27d83 100644 --- a/clang/include/clang/Basic/arm_sme.td +++ b/clang/include/clang/Basic/arm_sme.td @@ -907,6 +907,33 @@ let SMETargetGuard = "sme-f16f16" in { } +//////////////////////////////////////////////////////////////////////////////// +// SME2 - TMOP, SUTMOP, USTMOP + +let SMETargetGuard = "sme2,sme-tmop" in { + def SVTMOPA_ZA32 : Inst<"svtmopa_lane_za32[_{d}_{d}]", "vi2d[i", "hbf", MergeNone, "aarch64_sme_ftmopa_za32", [IsStreaming, IsInOutZA], [ImmCheck<0, ImmCheck0_3>, ImmCheck<4, ImmCheck0_3>]>; + def SVSTMOPA_ZA32 : Inst<"svtmopa_lane_za32[_{d}_{d}]", "vi2d[i", "cs", MergeNone, "aarch64_sme_stmopa_za32", [IsStreaming, IsInOutZA], [ImmCheck<0, ImmCheck0_3>, ImmCheck<4, ImmCheck0_3>]>; + def SVUTMOPA_ZA32 : Inst<"svtmopa_lane_za32[_{d}_{d}]", "vi2d[i", "UcUs", MergeNone, "aarch64_sme_utmopa_za32", [IsStreaming, IsInOutZA], [ImmCheck<0, ImmCheck0_3>, ImmCheck<4, ImmCheck0_3>]>; + def SVSUTMOPA_ZA32 : Inst<"svtmopa_lane_za32[_{d}_{3}]", "vi2u[i", "c", MergeNone, "aarch64_sme_sutmopa_za32", [IsStreaming, IsInOutZA], [ImmCheck<0, ImmCheck0_3>, ImmCheck<4, ImmCheck0_3>]>; + def SVUSTMOPA_ZA32 : Inst<"svtmopa_lane_za32[_{d}_{3}]", "vi2x[i", "Uc", MergeNone, "aarch64_sme_ustmopa_za32", [IsStreaming, IsInOutZA], [ImmCheck<0, ImmCheck0_3>, ImmCheck<4, ImmCheck0_3>]>; +} + +let SMETargetGuard = "sme2,sme-tmop,sme-f16f16" in { + def SVTMOPA_F16 : Inst<"svtmopa_lane_za16[_{d}_{d}]", "vi2d[i", "h", MergeNone, "aarch64_sme_ftmopa_za16", [IsStreaming, IsInOutZA], [ImmCheck<0, ImmCheck0_1>, ImmCheck<4, ImmCheck0_3>]>; +} + +let SMETargetGuard = "sme2,sme-tmop,sme-b16b16" in { + def SVTMOPA_BF16 : Inst<"svtmopa_lane_za16[_{d}_{d}]", "vi2d[i", "b", MergeNone, "aarch64_sme_ftmopa_za16", [IsStreaming, IsInOutZA], [ImmCheck<0, 
ImmCheck0_1>, ImmCheck<4, ImmCheck0_3>]>; +} + +let SMETargetGuard = "sme2,sme-tmop,sme-f8f16" in { + def SVTMOPA_ZA16_FPM : Inst<"svtmopa_lane_za16[_{d}_{d}]", "vi2.dd[i>", "m", MergeNone, "aarch64_sme_ftmopa_za16", [IsStreaming, IsInOutZA], [ImmCheck<0, ImmCheck0_1>, ImmCheck<4, ImmCheck0_3>]>; +} + +let SMETargetGuard = "sme2,sme-tmop,sme-f8f32" in { + def SVTMOPA_ZA32_FPM : Inst<"svtmopa_lane_za32[_{d}_{d}]", "vi2.dd[i>", "m", MergeNone, "aarch64_sme_ftmopa_za32", [IsStreaming, IsInOutZA], [ImmCheck<0, ImmCheck0_3>, ImmCheck<4, ImmCheck0_3>]>; +} + multiclass ZAReadz ch> { let SMETargetGuard = "sme2p1" in { def NAME # _H : SInst<"svreadz_hor_" # n_suffix # "_{d}_vg" # vg_num, vg_num # "im", t, diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_tmop.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_tmop.c new file mode 100644 index 0000000000000..55d0074663bc9 --- /dev/null +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_tmop.c @@ -0,0 +1,202 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme-tmop -target-feature +sme-f16f16 -target-feature +sme-f8f32 -target-feature +sme-b16b16 -target-feature +sme-f8f16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme-tmop -target-feature +sme-f16f16 -target-feature +sme-f8f32 -target-feature +sme-b16b16 -target-feature +sme-f8f16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme-tmop -target-feature +sme-f16f16 -target-feature +sme-f8f32 -target-feature +sme-b16b16 -target-feature +sme-f8f16 
-target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme-tmop -target-feature +sme-f16f16 -target-feature +sme-f8f32 -target-feature +sme-b16b16 -target-feature +sme-f8f16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme-tmop -target-feature +sme-f16f16 -target-feature +sme-f8f32 -target-feature +sme-b16b16 -target-feature +sme-f8f16 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SME_OVERLOADED_FORMS +#define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#else +#define SME_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#endif + +// CHECK-LABEL: @test_svtmopa_lane_za32_s8_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.stmopa.za32.nxv16i8(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]], [[ZK:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svtmopa_lane_za32_s8_s810svint8x2_tu10__SVInt8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.stmopa.za32.nxv16i8(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]], [[ZK:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svtmopa_lane_za32_s8_s8(svint8x2_t zn, svint8_t zm, svuint8_t zk) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svtmopa_lane_za32,_s8_s8,)(1, zn, zm, zk, 3); +} + +// CHECK-LABEL: @test_svtmopa_lane_za32_u8_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.utmopa.za32.nxv16i8(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]], [[ZK:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: 
@_Z28test_svtmopa_lane_za32_u8_u811svuint8x2_tu11__SVUint8_tS0_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.utmopa.za32.nxv16i8(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]], [[ZK:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svtmopa_lane_za32_u8_u8(svuint8x2_t zn, svuint8_t zm, svuint8_t zk) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svtmopa_lane_za32,_u8_u8,)(1, zn, zm, zk, 3); +} + +// CHECK-LABEL: @test_svtmopa_lane_za32_s8_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sutmopa.za32.nxv16i8(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]], [[ZK:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svtmopa_lane_za32_s8_u810svint8x2_tu11__SVUint8_tS0_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sutmopa.za32.nxv16i8(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]], [[ZK:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svtmopa_lane_za32_s8_u8(svint8x2_t zn, svuint8_t zm, svuint8_t zk) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svtmopa_lane_za32,_s8_u8,)(1, zn, zm, zk, 3); +} + +// CHECK-LABEL: @test_svtmopa_lane_za32_u8_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ustmopa.za32.nxv16i8(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]], [[ZK:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svtmopa_lane_za32_u8_s811svuint8x2_tu10__SVInt8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ustmopa.za32.nxv16i8(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]], [[ZK:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svtmopa_lane_za32_u8_s8(svuint8x2_t zn, svint8_t zm, svuint8_t zk) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svtmopa_lane_za32,_u8_s8,)(1, zn, zm, zk, 3); +} + +// CHECK-LABEL: @test_svtmopa_lane_za32_s16_s16( +// CHECK-NEXT: entry: +// 
CHECK-NEXT: tail call void @llvm.aarch64.sme.stmopa.za32.nxv8i16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]], [[ZK:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svtmopa_lane_za32_s16_s1611svint16x2_tu11__SVInt16_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.stmopa.za32.nxv8i16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]], [[ZK:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svtmopa_lane_za32_s16_s16(svint16x2_t zn, svint16_t zm, svuint8_t zk) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svtmopa_lane_za32,_s16_s16,)(1, zn, zm, zk, 3); +} + +// CHECK-LABEL: @test_svtmopa_lane_za32_u16_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.utmopa.za32.nxv8i16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]], [[ZK:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svtmopa_lane_za32_u16_u1612svuint16x2_tu12__SVUint16_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.utmopa.za32.nxv8i16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]], [[ZK:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svtmopa_lane_za32_u16_u16(svuint16x2_t zn, svuint16_t zm, svuint8_t zk) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svtmopa_lane_za32,_u16_u16,)(1, zn, zm, zk, 3); +} + +// CHECK-LABEL: @test_svtmopa_lane_za32_f16_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ftmopa.za32.nxv8f16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]], [[ZK:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svtmopa_lane_za32_f16_f1613svfloat16x2_tu13__SVFloat16_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ftmopa.za32.nxv8f16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]], [[ZK:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void 
test_svtmopa_lane_za32_f16_f16(svfloat16x2_t zn, svfloat16_t zm, svuint8_t zk) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svtmopa_lane_za32,_f16_f16,)(1, zn, zm, zk, 3); +} + +// CHECK-LABEL: @test_svtmopa_lane_za32_f32_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ftmopa.za32.nxv4f32(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]], [[ZK:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svtmopa_lane_za32_f32_f3213svfloat32x2_tu13__SVFloat32_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ftmopa.za32.nxv4f32(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]], [[ZK:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svtmopa_lane_za32_f32_f32(svfloat32x2_t zn, svfloat32_t zm, svuint8_t zk) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svtmopa_lane_za32,_f32_f32,)(1, zn, zm, zk, 3); +} + +// CHECK-LABEL: @test_svtmopa_lane_za32_bf16_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ftmopa.za32.nxv8bf16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]], [[ZK:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svtmopa_lane_za32_bf16_bf1614svbfloat16x2_tu14__SVBfloat16_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ftmopa.za32.nxv8bf16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]], [[ZK:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svtmopa_lane_za32_bf16_bf16(svbfloat16x2_t zn, svbfloat16_t zm, svuint8_t zk) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svtmopa_lane_za32,_bf16_bf16,)(1, zn, zm, zk, 3); +} + +// CHECK-LABEL: @test_svtmopa_lane_za16_f16_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ftmopa.za16.nxv8f16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]], [[ZK:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: 
@_Z30test_svtmopa_lane_za16_f16_f1613svfloat16x2_tu13__SVFloat16_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ftmopa.za16.nxv8f16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]], [[ZK:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svtmopa_lane_za16_f16_f16(svfloat16x2_t zn, svfloat16_t zm, svuint8_t zk) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svtmopa_lane_za16,_f16_f16,)(1, zn, zm, zk, 3); +} + +// CHECK-LABEL: @test_svtmopa_lane_za16_bf16_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ftmopa.za16.nxv8bf16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]], [[ZK:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svtmopa_lane_za16_bf16_bf1614svbfloat16x2_tu14__SVBfloat16_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ftmopa.za16.nxv8bf16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]], [[ZK:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svtmopa_lane_za16_bf16_bf16(svbfloat16x2_t zn, svbfloat16_t zm, svuint8_t zk) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svtmopa_lane_za16,_bf16_bf16,)(1, zn, zm, zk, 3); +} + +// CHECK-LABEL: @test_svtmopa_lane_za16_mf8_mf8_fpm( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ftmopa.za16.nxv16i8(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]], [[ZK:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z34test_svtmopa_lane_za16_mf8_mf8_fpm13svmfloat8x2_tu13__SVMfloat8_tu11__SVUint8_tm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ftmopa.za16.nxv16i8(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]], [[ZK:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void 
test_svtmopa_lane_za16_mf8_mf8_fpm(svmfloat8x2_t zn, svmfloat8_t zm, svuint8_t zk, fpm_t fpmr) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svtmopa_lane_za16,_mf8_mf8,_fpm)(1, zn, zm, zk, 3, fpmr); +} + +// CHECK-LABEL: @test_svtmopa_lane_za32_mf8_mf8_fpm( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ftmopa.za32.nxv16i8(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]], [[ZK:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z34test_svtmopa_lane_za32_mf8_mf8_fpm13svmfloat8x2_tu13__SVMfloat8_tu11__SVUint8_tm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ftmopa.za32.nxv16i8(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]], [[ZK:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svtmopa_lane_za32_mf8_mf8_fpm(svmfloat8x2_t zn, svmfloat8_t zm, svuint8_t zk, fpm_t fpmr) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svtmopa_lane_za32,_mf8_mf8,_fpm)(1, zn, zm, zk, 3, fpmr); +} diff --git a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_tmop.cpp b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_tmop.cpp new file mode 100644 index 0000000000000..5c229d0825366 --- /dev/null +++ b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_tmop.cpp @@ -0,0 +1,120 @@ +// RUN: %clang_cc1 -triple aarch64 \ +// RUN: -target-feature +sme -target-feature +sme2 -verify -emit-llvm -o - %s + +// REQUIRES: aarch64-registered-target + +#include + +void test_features(svuint8x2_t zn_u8, svuint8_t zm_u8, + svint8x2_t zn_s8, svint8_t zm_s8, + svint16x2_t zn_s16, svint16_t zm_s16, + svuint16x2_t zn_u16, svuint16_t zm_u16, + svfloat16x2_t zn_f16, svfloat16_t zm_f16, + svbfloat16x2_t zn_bf16, svbfloat16_t zm_bf16, + svfloat32x2_t zn_f32, svfloat32_t zm_f32, + svmfloat8x2_t zn_f8, svmfloat8_t zm_f8, + svuint8_t zk, fpm_t fpm) __arm_streaming 
__arm_inout("za") { + +// expected-error@+1 {{'svtmopa_lane_za32_s8_s8' needs target feature sme,sme2,sme-tmop}} + svtmopa_lane_za32_s8_s8(0, zn_s8, zm_s8, zk, 0); +// expected-error@+1 {{'svtmopa_lane_za32_u8_u8' needs target feature sme,sme2,sme-tmop}} + svtmopa_lane_za32_u8_u8(0, zn_u8, zm_u8, zk, 0); +// expected-error@+1 {{'svtmopa_lane_za32_s8_u8' needs target feature sme,sme2,sme-tmop}} + svtmopa_lane_za32_s8_u8(0, zn_s8, zm_u8, zk, 0); +// expected-error@+1 {{'svtmopa_lane_za32_u8_s8' needs target feature sme,sme2,sme-tmop}} + svtmopa_lane_za32_u8_s8(0, zn_u8, zm_s8, zk, 0); +// expected-error@+1 {{'svtmopa_lane_za32_s16_s16' needs target feature sme,sme2,sme-tmop}} + svtmopa_lane_za32_s16_s16(0, zn_s16, zm_s16, zk, 0); +// expected-error@+1 {{'svtmopa_lane_za32_u16_u16' needs target feature sme,sme2,sme-tmop}} + svtmopa_lane_za32_u16_u16(0, zn_u16, zm_u16, zk, 0); +// expected-error@+1 {{'svtmopa_lane_za32_f16_f16' needs target feature sme,sme2,sme-tmop}} + svtmopa_lane_za32_f16_f16(0, zn_f16, zm_f16, zk, 0); +// expected-error@+1 {{'svtmopa_lane_za32_f32_f32' needs target feature sme,sme2,sme-tmop}} + svtmopa_lane_za32_f32_f32(0, zn_f32, zm_f32, zk, 0); +// expected-error@+1 {{'svtmopa_lane_za32_bf16_bf16' needs target feature sme,sme2,sme-tmop}} + svtmopa_lane_za32_bf16_bf16(0, zn_bf16, zm_bf16, zk, 0); +// expected-error@+1 {{'svtmopa_lane_za16_f16_f16' needs target feature sme,sme2,sme-tmop,sme-f16f16}} + svtmopa_lane_za16_f16_f16(0, zn_f16, zm_f16, zk, 0); +// expected-error@+1 {{'svtmopa_lane_za16_bf16_bf16' needs target feature sme,sme2,sme-tmop,sme-b16b16}} + svtmopa_lane_za16_bf16_bf16(0, zn_bf16, zm_bf16, zk, 0); +// expected-error@+1 {{'svtmopa_lane_za16_mf8_mf8_fpm' needs target feature sme,sme2,sme-tmop,sme-f8f16}} + svtmopa_lane_za16_mf8_mf8_fpm(0, zn_f8, zm_f8, zk, 0, fpm); +// expected-error@+1 {{'svtmopa_lane_za32_mf8_mf8_fpm' needs target feature sme,sme2,sme-tmop,sme-f8f32}} + svtmopa_lane_za32_mf8_mf8_fpm(0, zn_f8, zm_f8, zk, 0, fpm); 
+} + +void test_imm(svuint8x2_t zn_u8, svuint8_t zm_u8, + svint8x2_t zn_s8, svint8_t zm_s8, + svint16x2_t zn_s16, svint16_t zm_s16, + svuint16x2_t zn_u16, svuint16_t zm_u16, + svfloat16x2_t zn_f16, svfloat16_t zm_f16, + svbfloat16x2_t zn_bf16, svbfloat16_t zm_bf16, + svfloat32x2_t zn_f32, svfloat32_t zm_f32, + svmfloat8x2_t zn_f8, svmfloat8_t zm_f8, + svuint8_t zk, fpm_t fpm) __arm_streaming __arm_inout("za") { + +// expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svtmopa_lane_za32_s8_s8(0, zn_s8, zm_s8, zk, 4); +// expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svtmopa_lane_za32_s8_s8(4, zn_s8, zm_s8, zk, 0); + +// expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svtmopa_lane_za32_u8_u8(0, zn_u8, zm_u8, zk, 4); +// expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svtmopa_lane_za32_u8_u8(4, zn_u8, zm_u8, zk, 0); + +// expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svtmopa_lane_za32_s8_u8(0, zn_s8, zm_u8, zk, 4); +// expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svtmopa_lane_za32_s8_u8(4, zn_s8, zm_u8, zk, 0); + +// expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svtmopa_lane_za32_u8_s8(0, zn_u8, zm_s8, zk, 4); +// expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svtmopa_lane_za32_u8_s8(4, zn_u8, zm_s8, zk, 0); + +// expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svtmopa_lane_za32_s16_s16(0, zn_s16, zm_s16, zk, 4); +// expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svtmopa_lane_za32_s16_s16(4, zn_s16, zm_s16, zk, 0); + +// expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svtmopa_lane_za32_u16_u16(0, zn_u16, zm_u16, zk, 4); +// expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svtmopa_lane_za32_u16_u16(4, zn_u16, zm_u16, zk, 0); + +// 
expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svtmopa_lane_za32_f16_f16(0, zn_f16, zm_f16, zk, 4); +// expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svtmopa_lane_za32_f16_f16(4, zn_f16, zm_f16, zk, 0); + +// expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svtmopa_lane_za32_f32_f32(0, zn_f32, zm_f32, zk, 4); +// expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svtmopa_lane_za32_f32_f32(4, zn_f32, zm_f32, zk, 0); + +// expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svtmopa_lane_za32_bf16_bf16(0, zn_bf16, zm_bf16, zk, 4); +// expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svtmopa_lane_za32_bf16_bf16(4, zn_bf16, zm_bf16, zk, 0); + +// expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svtmopa_lane_za16_f16_f16(0, zn_f16, zm_f16, zk, 4); +// expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} + svtmopa_lane_za16_f16_f16(2, zn_f16, zm_f16, zk, 0); + +// expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svtmopa_lane_za16_bf16_bf16(0, zn_bf16, zm_bf16, zk, 4); +// expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} + svtmopa_lane_za16_bf16_bf16(2, zn_bf16, zm_bf16, zk, 0); + +// expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svtmopa_lane_za16_mf8_mf8_fpm(0, zn_f8, zm_f8, zk, 4, fpm); +// expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} + svtmopa_lane_za16_mf8_mf8_fpm(2, zn_f8, zm_f8, zk, 0, fpm); + +// expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svtmopa_lane_za32_mf8_mf8_fpm(0, zn_f8, zm_f8, zk, 4, fpm); +// expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svtmopa_lane_za32_mf8_mf8_fpm(4, zn_f8, zm_f8, zk, 0, fpm); +} diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td 
b/llvm/include/llvm/IR/IntrinsicsAArch64.td index 2c6129cedebbf..c493198e8ef8f 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -3107,6 +3107,24 @@ let TargetPrefix = "aarch64" in { } } + class SME_OuterProduct_TMOP_Intrinsic + : DefaultAttrsIntrinsic<[], + [llvm_i32_ty, + llvm_anyvector_ty, + LLVMMatchType<0>, + LLVMMatchType<0>, + llvm_nxv16i8_ty, + llvm_i32_ty], + [ImmArg>, ImmArg>, + IntrInaccessibleMemOnly]>; + + def int_aarch64_sme_ftmopa_za16 : SME_OuterProduct_TMOP_Intrinsic; + def int_aarch64_sme_ftmopa_za32 : SME_OuterProduct_TMOP_Intrinsic; + def int_aarch64_sme_stmopa_za32 : SME_OuterProduct_TMOP_Intrinsic; + def int_aarch64_sme_utmopa_za32 : SME_OuterProduct_TMOP_Intrinsic; + def int_aarch64_sme_sutmopa_za32 : SME_OuterProduct_TMOP_Intrinsic; + def int_aarch64_sme_ustmopa_za32 : SME_OuterProduct_TMOP_Intrinsic; + class SME_AddVectorToTile_Intrinsic : DefaultAttrsIntrinsic<[], [llvm_i32_ty, diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td index fed9b7b173e9c..40553aff04919 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td @@ -1269,8 +1269,10 @@ def ZPRMul2AsmOp32_Hi : ZPRAsmOperand<"VectorS_Hi", 32, "Mul2_Hi">; def ZPRMul2AsmOp64_Lo : ZPRAsmOperand<"VectorD_Lo", 64, "Mul2_Lo">; def ZPRMul2AsmOp64_Hi : ZPRAsmOperand<"VectorD_Hi", 64, "Mul2_Hi">; -def ZPR_K : RegisterClass<"AArch64", [untyped], 128, - (add Z20, Z21, Z22, Z23, Z28, Z29, Z30, Z31)>; +def ZPR_K : RegisterClass<"AArch64", [nxv16i8], 128, + (add Z20, Z21, Z22, Z23, Z28, Z29, Z30, Z31)>{ + let Size = 128; +} def ZK : RegisterOperand">{ let EncoderMethod = "EncodeZK"; diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index f992f73171e0e..a83b17f7c9000 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ 
b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -175,12 +175,31 @@ let Predicates = [HasSME_MOP4, HasSMEI16I64] in { } let Predicates = [HasSME_TMOP] in { -def STMOPA_M2ZZZI_BtoS : sme_int_sparse_outer_product_i32<0b00100, ZZ_b_mul_r, ZPR8, "stmopa">; -def STMOPA_M2ZZZI_HtoS : sme_int_sparse_outer_product_i32<0b00101, ZZ_h_mul_r, ZPR16, "stmopa">; -def UTMOPA_M2ZZZI_BtoS : sme_int_sparse_outer_product_i32<0b11100, ZZ_b_mul_r, ZPR8, "utmopa">; -def UTMOPA_M2ZZZI_HtoS : sme_int_sparse_outer_product_i32<0b10101, ZZ_h_mul_r, ZPR16, "utmopa">; -def SUTMOPA_M2ZZZI_BtoS : sme_int_sparse_outer_product_i32<0b01100, ZZ_b_mul_r, ZPR8, "sutmopa">; -def USTMOPA_M2ZZZI_BtoS : sme_int_sparse_outer_product_i32<0b10100, ZZ_b_mul_r, ZPR8, "ustmopa">; + defm STMOPA_M2ZZZI_BtoS : sme_tmopa_32b<0b00100, ZZ_b_mul_r, ZPR8, nxv16i8, "stmopa", int_aarch64_sme_stmopa_za32>; + defm STMOPA_M2ZZZI_HtoS : sme_tmopa_32b<0b00101, ZZ_h_mul_r, ZPR16, nxv8i16, "stmopa", int_aarch64_sme_stmopa_za32>; + defm UTMOPA_M2ZZZI_BtoS : sme_tmopa_32b<0b11100, ZZ_b_mul_r, ZPR8, nxv16i8, "utmopa", int_aarch64_sme_utmopa_za32>; + defm UTMOPA_M2ZZZI_HtoS : sme_tmopa_32b<0b10101, ZZ_h_mul_r, ZPR16, nxv8i16, "utmopa", int_aarch64_sme_utmopa_za32>; + defm SUTMOPA_M2ZZZI_BtoS : sme_tmopa_32b<0b01100, ZZ_b_mul_r, ZPR8, nxv16i8, "sutmopa", int_aarch64_sme_sutmopa_za32>; + defm USTMOPA_M2ZZZI_BtoS : sme_tmopa_32b<0b10100, ZZ_b_mul_r, ZPR8, nxv16i8, "ustmopa", int_aarch64_sme_ustmopa_za32>; + defm FTMOPA_M2ZZZI_HtoS : sme_tmopa_32b<0b11000, ZZ_h_mul_r, ZPR16, nxv8f16, "ftmopa", int_aarch64_sme_ftmopa_za32, [FPCR]>; + defm FTMOPA_M2ZZZI_StoS : sme_tmopa_32b<0b00000, ZZ_s_mul_r, ZPR32, nxv4f32, "ftmopa", int_aarch64_sme_ftmopa_za32, [FPCR]>; + defm BFTMOPA_M2ZZZI_HtoS : sme_tmopa_32b<0b10000, ZZ_h_mul_r, ZPR16, nxv8bf16, "bftmopa", int_aarch64_sme_ftmopa_za32, [FPCR]>; +} + +let Predicates = [HasSME_TMOP, HasSMEF16F16] in { + defm FTMOPA_M2ZZZI_HtoH : sme_tmopa_16b<0b10001, ZZ_h_mul_r, ZPR16, nxv8f16, "ftmopa", 
int_aarch64_sme_ftmopa_za16, [FPCR]>; +} + +let Predicates = [HasSME_TMOP, HasSMEB16B16] in { + defm BFTMOPA_M2ZZZI_HtoH : sme_tmopa_16b<0b11001, ZZ_h_mul_r, ZPR16, nxv8bf16, "bftmopa", int_aarch64_sme_ftmopa_za16, [FPCR]>; +} + +let Predicates = [HasSME_TMOP, HasSMEF8F16] in { + defm FTMOPA_M2ZZZI_BtoH : sme_tmopa_16b<0b01001, ZZ_b_mul_r, ZPR8, nxv16i8, "ftmopa", int_aarch64_sme_ftmopa_za16, [FPMR, FPCR]>; +} + +let Predicates = [HasSME_TMOP, HasSMEF8F32] in { + defm FTMOPA_M2ZZZI_BtoS : sme_tmopa_32b<0b01000, ZZ_b_mul_r, ZPR8, nxv16i8, "ftmopa", int_aarch64_sme_ftmopa_za32, [FPMR, FPCR]>; } let Predicates = [HasSME] in { @@ -1064,12 +1083,6 @@ let Predicates = [HasSME_MOP4] in { defm FMOP4S : sme2_fmop4as_fp32_non_widening<1, "fmop4s", "int_aarch64_sme_mop4s">; } -let Predicates = [HasSME_TMOP] in { - def FTMOPA_M2ZZZI_HtoS : sme_tmopa_32b<0b11000, ZZ_h_mul_r, ZPR16, "ftmopa">; - def FTMOPA_M2ZZZI_StoS : sme_tmopa_32b<0b00000, ZZ_s_mul_r, ZPR32, "ftmopa">; - def BFTMOPA_M2ZZZI_HtoS : sme_tmopa_32b<0b10000, ZZ_h_mul_r, ZPR16, "bftmopa">; -} - let Predicates = [HasSME2p2] in { defm FMUL_2ZZ : sme2_multi2_fmul_sm<"fmul">; defm FMUL_2Z2Z : sme2_multi2_fmul_mm< "fmul">; @@ -1078,26 +1091,10 @@ let Predicates = [HasSME2p2] in { } // [HasSME2p2] -let Predicates = [HasSME_TMOP, HasSMEB16B16] in { - def BFTMOPA_M2ZZZI_HtoH : sme_tmopa_16b<0b11001, ZZ_h_mul_r, ZPR16, "bftmopa">; -} - -let Predicates = [HasSME_TMOP, HasSMEF8F32], Uses = [FPMR, FPCR] in { - def FTMOPA_M2ZZZI_BtoS : sme_tmopa_32b<0b01000, ZZ_b_mul_r, ZPR8, "ftmopa">; -} - -let Predicates = [HasSME_TMOP, HasSMEF8F16], Uses = [FPMR, FPCR] in { - def FTMOPA_M2ZZZI_BtoH : sme_tmopa_16b<0b01001, ZZ_b_mul_r, ZPR8, "ftmopa">; -} - let Predicates = [HasSME_MOP4, HasSMEF8F16], Uses = [FPMR, FPCR] in { defm FMOP4A : sme2_fmop4a_fp8_fp16_2way<"fmop4a">; } -let Predicates = [HasSME_TMOP, HasSMEF16F16] in { - def FTMOPA_M2ZZZI_HtoH : sme_tmopa_16b<0b10001, ZZ_h_mul_r, ZPR16, "ftmopa">; -} - let Predicates = [HasSME_MOP4, 
HasSMEF16F16] in { defm FMOP4A : sme2_fmop4as_fp16_non_widening<0, "fmop4a", "int_aarch64_sme_mop4a">; defm FMOP4S : sme2_fmop4as_fp16_non_widening<1, "fmop4s", "int_aarch64_sme_mop4s">; diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td index b611dddb0b045..b174a2a733851 100644 --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -106,6 +106,16 @@ class sme_outer_product_pseudo let mayStore = 1; } +class sme_sparse_outer_product_pseudo + : Pseudo<(outs), (ins i32imm:$tile, zn_ty:$zn, zm_ty:$zm, ZK:$zk, i32imm:$idx), []>, + Sched<[]> { + // Translated to the actual instructions in AArch64ISelLowering.cpp + let SMEMatrixType = za_flag; + let usesCustomInserter = 1; + let mayLoad = 1; + let mayStore = 1; +} + class sme2_quarter_tile_outer_product_pseudo : Pseudo<(outs), (ins i32imm:$tile, zn_ty:$zn, zm_ty:$zm), []>, @@ -296,6 +306,12 @@ class SME2_ZA_Tile_Vec_Multi_Single_Pat : Pat<(intrinsic imm_ty:$tile, vt:$Zn1, vt:$Zn2, vt:$Zm1, vt:$Zm2), (!cast(name # _PSEUDO) $tile, (REG_SEQUENCE ZPR2Mul2, vt:$Zn1, zsub0, vt:$Zn2, zsub1), (REG_SEQUENCE ZPR2Mul2, vt:$Zm1, zsub0, vt:$Zm2, zsub1))>; + +class SME2_ZA_TMOP_Pat + : Pat<(intrinsic tile_imm:$tile, vt:$Zn1, vt:$Zn2, vt:$Zm, nxv16i8:$Zk, timm32_0_3:$idx), + (!cast(name # _PSEUDO) $tile, (REG_SEQUENCE ZPR2Mul2, vt:$Zn1, zsub0, vt:$Zn2, zsub1), $Zm, $Zk, $idx)>; + + //===----------------------------------------------------------------------===// // SME pattern match helpers. 
//===----------------------------------------------------------------------===// @@ -489,35 +505,6 @@ multiclass sme_int_outer_product_i64 opc, string mnemonic, def : SME_ZA_Tile_TwoPred_TwoVec_Pat; } -class sme_int_sparse_outer_product_i32 opc, RegisterOperand zn_ty, RegisterOperand zm_ty, string mnemonic> - : I<(outs TileOp32:$ZAda), - (ins TileOp32:$_ZAda, zn_ty:$Zn, zm_ty:$Zm, ZK:$Zk, VectorIndexS32b:$imm), - mnemonic, "\t$ZAda, $Zn, $Zm, $Zk$imm", - "", []>, - Sched<[]> { - bits<2> ZAda; - bits<4> Zn; - bits<5> Zm; - bits<3> Zk; - bits<2> imm; - let Inst{31-25} = 0b1000000; - let Inst{24} = opc{4}; - let Inst{23-22} = 0b01; - let Inst{21} = opc{3}; - let Inst{20-16} = Zm; - let Inst{15} = opc{2}; - let Inst{14} = 0b0; - let Inst{13} = opc{1}; - let Inst{12-10} = Zk; - let Inst{9-6} = Zn; - let Inst{5-4} = imm; - let Inst{3} = opc{0}; - let Inst{2} = 0b0; - let Inst{1-0} = ZAda; - - let Constraints = "$ZAda = $_ZAda"; -} - class sme_outer_product_widening_inst opc, ZPRRegOp zpr_ty, string mnemonic> : I<(outs TileOp32:$ZAda), (ins TileOp32:$_ZAda, PPR3bAny:$Pn, PPR3bAny:$Pm, zpr_ty:$Zn, zpr_ty:$Zm), @@ -3562,7 +3549,7 @@ multiclass sme2_int_bmopx_tile op, SDPatternOperator i //===----------------------------------------------------------------------===// // SME2 Sparse Outer Product and Accumulate -class sme_tmopa_16b opc, RegisterOperand zn_ty, RegisterOperand zm_ty, string mnemonic> +class sme_int_sparse_outer_product_i16 opc, RegisterOperand zn_ty, RegisterOperand zm_ty, string mnemonic> : I<(outs TileOp16:$ZAda), (ins TileOp16:$_ZAda, zn_ty:$Zn, zm_ty:$Zm, ZK:$Zk, VectorIndexS32b:$imm), mnemonic, "\t$ZAda, $Zn, $Zm, $Zk$imm", @@ -3591,7 +3578,7 @@ class sme_tmopa_16b opc, RegisterOperand zn_ty, RegisterOperand zm_ty, s let Constraints = "$ZAda = $_ZAda"; } -class sme_tmopa_32b opc, RegisterOperand zn_ty, RegisterOperand zm_ty, string mnemonic> +class sme_int_sparse_outer_product_i32 opc, RegisterOperand zn_ty, RegisterOperand zm_ty, string mnemonic> : 
I<(outs TileOp32:$ZAda), (ins TileOp32:$_ZAda, zn_ty:$Zn, zm_ty:$Zm, ZK:$Zk, VectorIndexS32b:$imm), mnemonic, "\t$ZAda, $Zn, $Zm, $Zk$imm", @@ -3620,6 +3607,25 @@ class sme_tmopa_32b opc, RegisterOperand zn_ty, RegisterOperand zm_ty, s let Constraints = "$ZAda = $_ZAda"; } +multiclass sme_tmopa_16b opc, RegisterOperand zn_ty, RegisterOperand zm_ty, ValueType vt, string mnemonic, SDPatternOperator intrinsic, list uses=[]> { + def NAME : sme_int_sparse_outer_product_i16, SMEPseudo2Instr { + let Uses = uses; + } + + def NAME # _PSEUDO : sme_sparse_outer_product_pseudo, SMEPseudo2Instr; + + def : SME2_ZA_TMOP_Pat; +} + +multiclass sme_tmopa_32b opc, RegisterOperand zn_ty, RegisterOperand zm_ty, ValueType vt, string mnemonic, SDPatternOperator intrinsic, list uses=[]> { + def NAME : sme_int_sparse_outer_product_i32, SMEPseudo2Instr { + let Uses = uses; + } + + def NAME # _PSEUDO : sme_sparse_outer_product_pseudo, SMEPseudo2Instr; + + def : SME2_ZA_TMOP_Pat; +} //===----------------------------------------------------------------------===/// // SME2 Zero Lookup Table. 
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/regbank-inlineasm.mir b/llvm/test/CodeGen/AArch64/GlobalISel/regbank-inlineasm.mir index 216f94b2b51e3..37d6e852429fe 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/regbank-inlineasm.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/regbank-inlineasm.mir @@ -57,11 +57,11 @@ tracksRegLiveness: true body: | bb.1: ; CHECK-LABEL: name: inlineasm_virt_reg_output - ; CHECK: INLINEASM &"mov ${0:w}, 7", 0 /* attdialect */, 2883594 /* regdef:GPR32common */, def %0 + ; CHECK: INLINEASM &"mov ${0:w}, 7", 0 /* attdialect */, 2818058 /* regdef:GPR32common */, def %0 ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr(s32) = COPY %0 ; CHECK-NEXT: $w0 = COPY [[COPY]](s32) ; CHECK-NEXT: RET_ReallyLR implicit $w0 - INLINEASM &"mov ${0:w}, 7", 0 /* attdialect */, 2883594 /* regdef:GPR32common */, def %0:gpr32common + INLINEASM &"mov ${0:w}, 7", 0 /* attdialect */, 2818058 /* regdef:GPR32common */, def %0:gpr32common %1:_(s32) = COPY %0 $w0 = COPY %1(s32) RET_ReallyLR implicit $w0 @@ -75,12 +75,12 @@ tracksRegLiveness: true body: | bb.1: ; CHECK-LABEL: name: inlineasm_virt_mixed_types - ; CHECK: INLINEASM &"mov $0, #0; mov $1, #0", 0 /* attdialect */, 2883594 /* regdef:GPR32common */, def %0, 3735562 /* regdef:FPR64 */, def %1 + ; CHECK: INLINEASM &"mov $0, #0; mov $1, #0", 0 /* attdialect */, 2818058 /* regdef:GPR32common */, def %0, 3670026 /* regdef:FPR64 */, def %1 ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr(s32) = COPY %0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:fpr(s64) = COPY %1 ; CHECK-NEXT: $d0 = COPY [[COPY1]](s64) ; CHECK-NEXT: RET_ReallyLR implicit $d0 - INLINEASM &"mov $0, #0; mov $1, #0", 0 /* attdialect */, 2883594 /* regdef:GPR32common */, def %0:gpr32common, 3735562 /* regdef:FPR64 */, def %1:fpr64 + INLINEASM &"mov $0, #0; mov $1, #0", 0 /* attdialect */, 2818058 /* regdef:GPR32common */, def %0:gpr32common, 3670026 /* regdef:FPR64 */, def %1:fpr64 %3:_(s32) = COPY %0 %4:_(s64) = COPY %1 $d0 = COPY %4(s64) diff --git 
a/llvm/test/CodeGen/AArch64/emit_fneg_with_non_register_operand.mir b/llvm/test/CodeGen/AArch64/emit_fneg_with_non_register_operand.mir index 253e6a9c076c6..615accc7f75e5 100644 --- a/llvm/test/CodeGen/AArch64/emit_fneg_with_non_register_operand.mir +++ b/llvm/test/CodeGen/AArch64/emit_fneg_with_non_register_operand.mir @@ -91,10 +91,10 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[LOADgot:%[0-9]+]]:gpr64common = LOADgot target-flags(aarch64-got) @c ; CHECK-NEXT: [[LDRDui:%[0-9]+]]:fpr64 = LDRDui [[LOADgot]], 0 :: (dereferenceable load (s64) from @c) - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 3735562 /* regdef:FPR64 */, def %2, 2147483657 /* reguse tiedto:$0 */, [[LDRDui]](tied-def 3) + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 3670026 /* regdef:FPR64 */, def %2, 2147483657 /* reguse tiedto:$0 */, [[LDRDui]](tied-def 3) ; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr64 = COPY %2 ; CHECK-NEXT: [[LDRDui1:%[0-9]+]]:fpr64 = LDRDui [[LOADgot]], 0 :: (dereferenceable load (s64) from @c) - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 3735562 /* regdef:FPR64 */, def %4, 2147483657 /* reguse tiedto:$0 */, [[LDRDui1]](tied-def 3) + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 3670026 /* regdef:FPR64 */, def %4, 2147483657 /* reguse tiedto:$0 */, [[LDRDui1]](tied-def 3) ; CHECK-NEXT: [[FNEGDr:%[0-9]+]]:fpr64 = FNEGDr %2 ; CHECK-NEXT: nofpexcept FCMPDrr %4, killed [[FNEGDr]], implicit-def $nzcv, implicit $fpcr ; CHECK-NEXT: Bcc 1, %bb.2, implicit $nzcv @@ -111,10 +111,10 @@ body: | %6:gpr64common = LOADgot target-flags(aarch64-got) @c %3:fpr64 = LDRDui %6, 0 :: (dereferenceable load (s64) from @c) - INLINEASM &"", 1 /* sideeffect attdialect */, 3735562 /* regdef:FPR64 */, def %2, 2147483657 /* reguse tiedto:$0 */, %3(tied-def 3) + INLINEASM &"", 1 /* sideeffect attdialect */, 3670026 /* regdef:FPR64 */, def %2, 2147483657 /* reguse tiedto:$0 */, %3(tied-def 3) %0:fpr64 = COPY %2 %5:fpr64 = LDRDui %6, 0 :: (dereferenceable 
load (s64) from @c) - INLINEASM &"", 1 /* sideeffect attdialect */, 3735562 /* regdef:FPR64 */, def %4, 2147483657 /* reguse tiedto:$0 */, %5(tied-def 3) + INLINEASM &"", 1 /* sideeffect attdialect */, 3670026 /* regdef:FPR64 */, def %4, 2147483657 /* reguse tiedto:$0 */, %5(tied-def 3) %7:fpr64 = FNEGDr %2 nofpexcept FCMPDrr %4, killed %7, implicit-def $nzcv, implicit $fpcr Bcc 1, %bb.2, implicit $nzcv diff --git a/llvm/test/CodeGen/AArch64/peephole-insvigpr.mir b/llvm/test/CodeGen/AArch64/peephole-insvigpr.mir index 3174d3c8c1a73..aef01e42ed7cc 100644 --- a/llvm/test/CodeGen/AArch64/peephole-insvigpr.mir +++ b/llvm/test/CodeGen/AArch64/peephole-insvigpr.mir @@ -487,7 +487,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 ; CHECK-NEXT: [[DEF:%[0-9]+]]:gpr64all = IMPLICIT_DEF ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY [[DEF]] - ; CHECK-NEXT: INLINEASM &"ldr ${0:s}, $1", 8 /* mayload attdialect */, 3735562 /* regdef:FPR64 */, def %1, 262158 /* mem:m */, killed [[COPY1]] + ; CHECK-NEXT: INLINEASM &"ldr ${0:s}, $1", 8 /* mayload attdialect */, 3670026 /* regdef:FPR64 */, def %1, 262158 /* mem:m */, killed [[COPY1]] ; CHECK-NEXT: [[MOVIv2d_ns:%[0-9]+]]:fpr128 = MOVIv2d_ns 0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:fpr64 = COPY [[MOVIv2d_ns]].dsub ; CHECK-NEXT: [[DEF1:%[0-9]+]]:fpr128 = IMPLICIT_DEF @@ -505,7 +505,7 @@ body: | %0:gpr64common = COPY $x0 %2:gpr64all = IMPLICIT_DEF %3:gpr64sp = COPY %2 - INLINEASM &"ldr ${0:s}, $1", 8 /* mayload attdialect */, 3735562 /* regdef:FPR64 */, def %1, 262158 /* mem:m */, killed %3 + INLINEASM &"ldr ${0:s}, $1", 8 /* mayload attdialect */, 3670026 /* regdef:FPR64 */, def %1, 262158 /* mem:m */, killed %3 %4:fpr128 = MOVIv2d_ns 0 %5:fpr64 = COPY %4.dsub %7:fpr128 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-tmop.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-tmop.ll new file mode 100644 index 0000000000000..efd5f951eced3 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-tmop.ll @@ 
-0,0 +1,162 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -force-streaming -verify-machineinstrs < %s | FileCheck %s + +target triple = "aarch64-linux" + +define void @stmopa_za32_s8( %zn1, %zn2, %zm, %zk) #0 { +; CHECK-LABEL: stmopa_za32_s8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z28.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: stmopa za0.s, { z0.b, z1.b }, z2.b, z28[0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.stmopa.za32.nxv16i8(i32 0, %zn1, %zn2, %zm, %zk, i32 0) + ret void +} + +define void @utmopa_za32_u8( %zn1, %zn2, %zm, %zk) #0 { +; CHECK-LABEL: utmopa_za32_u8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z28.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: utmopa za0.s, { z0.b, z1.b }, z2.b, z28[0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.utmopa.za32.nxv16i8(i32 0, %zn1, %zn2, %zm, %zk, i32 0) + ret void +} + +define void @ustmopa_za32_u8_s8( %zn1, %zn2, %zm, %zk) #0 { +; CHECK-LABEL: ustmopa_za32_u8_s8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z28.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: ustmopa za0.s, { z0.b, z1.b }, z2.b, z28[0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.ustmopa.za32.nxv16i8(i32 0, %zn1, %zn2, %zm, %zk, i32 0) + ret void +} + +define void @sutmopa_za32_s8_u8( %zn1, %zn2, %zm, %zk) #0 { +; CHECK-LABEL: sutmopa_za32_s8_u8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z28.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: sutmopa za0.s, { z0.b, z1.b }, z2.b, z28[0] +; CHECK-NEXT: ret + call void 
@llvm.aarch64.sme.sutmopa.za32.nxv16i8(i32 0, %zn1, %zn2, %zm, %zk, i32 0) + ret void +} + +define void @stmopa_za32_s16( %zn1, %zn2, %zm, %zk) #0 { +; CHECK-LABEL: stmopa_za32_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z28.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: stmopa za0.s, { z0.h, z1.h }, z2.h, z28[0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.stmopa.za32.nxv8i16(i32 0, %zn1, %zn2, %zm, %zk, i32 0) + ret void +} + +define void @utmopa_za32_u16( %zn1, %zn2, %zm, %zk) #0 { +; CHECK-LABEL: utmopa_za32_u16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z28.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: utmopa za0.s, { z0.h, z1.h }, z2.h, z28[0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.utmopa.za32.nxv8i16(i32 0, %zn1, %zn2, %zm, %zk, i32 0) + ret void +} + +define void @ftmopa_za32_f16( %zn1, %zn2, %zm, %zk) #0 { +; CHECK-LABEL: ftmopa_za32_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z28.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: ftmopa za0.s, { z0.h, z1.h }, z2.h, z28[0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.ftmopa.za32.nxv8f16(i32 0, %zn1, %zn2, %zm, %zk, i32 0) + ret void +} + +define void @bftmopa_za32_bf16( %zn1, %zn2, %zm, %zk) #0 { +; CHECK-LABEL: bftmopa_za32_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z28.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: bftmopa za0.s, { z0.h, z1.h }, z2.h, z28[0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.ftmopa.za32.nxv8bf16(i32 0, %zn1, %zn2, %zm, %zk, i32 0) + ret void +} + +define void @ftmopa_za32_f32( %zn1, %zn2, %zm, %zk) #0 { +; CHECK-LABEL: 
ftmopa_za32_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z28.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: ftmopa za0.s, { z0.s, z1.s }, z2.s, z28[0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.ftmopa.za32.nxv4f32(i32 0, %zn1, %zn2, %zm, %zk, i32 0) + ret void +} + +define void @ftmopa_za16_f16( %zn1, %zn2, %zm, %zk) #0 { +; CHECK-LABEL: ftmopa_za16_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z28.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: ftmopa za0.h, { z0.h, z1.h }, z2.h, z28[0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.ftmopa.za16.nxv8f16(i32 0, %zn1, %zn2, %zm, %zk, i32 0) + ret void +} + +define void @bftmopa_za16_bf16( %zn1, %zn2, %zm, %zk) #0 { +; CHECK-LABEL: bftmopa_za16_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z28.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: bftmopa za0.h, { z0.h, z1.h }, z2.h, z28[0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.ftmopa.za16.nxv8bf16(i32 0, %zn1, %zn2, %zm, %zk, i32 0) + ret void +} + +define void @ftmopa_za16_f8( %zn1, %zn2, %zm, %zk) #0 { +; CHECK-LABEL: ftmopa_za16_f8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z28.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: ftmopa za0.h, { z0.b, z1.b }, z2.b, z28[0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.ftmopa.za16.nxv16i8(i32 0, %zn1, %zn2, %zm, %zk, i32 0) + ret void +} + +define void @ftmopa_za32_f8( %zn1, %zn2, %zm, %zk) #0 { +; CHECK-LABEL: ftmopa_za32_f8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z28.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 
killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: ftmopa za0.s, { z0.b, z1.b }, z2.b, z28[0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.ftmopa.za32.nxv16i8(i32 0, %zn1, %zn2, %zm, %zk, i32 0) + ret void +} + +attributes #0 = {nounwind "target-features" = "+sme2,+sme-tmop,+sme-f16f16,+sme-b16b16,+sme-f8f16,+sme-f8f32,+sme2p1,+bf16" } From d508f0cb009a0be98ca97f4dc0498294e0681a66 Mon Sep 17 00:00:00 2001 From: Lukacma Date: Wed, 16 Apr 2025 11:41:07 +0100 Subject: [PATCH 101/710] [AArch64] Fix FPMR handling when switching streaming mode (#135827) According to the [documentation](https://developer.arm.com/documentation/ddi0601/latest/AArch64-Registers/FPMR--Floating-point-Mode-Register), the FPMR register is set to 0 when entering or exiting streaming mode. This patch models that behavior by adding FPMR as an implicit def to the instructions used for entering and exiting streaming mode. --- .../Target/AArch64/AArch64ISelLowering.cpp | 3 +++ llvm/test/CodeGen/AArch64/sme-write-fpmr.ll | 23 +++++++++++++++++++ 2 files changed, 26 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/sme-write-fpmr.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index bea8087750d6e..a95d8d343adf2 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -8882,12 +8882,15 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, MI.removeOperand(I); // The SVE vector length can change when entering/leaving streaming mode. + // FPMR is set to 0 when entering/leaving streaming mode. 
if (MI.getOperand(0).getImm() == AArch64SVCR::SVCRSM || MI.getOperand(0).getImm() == AArch64SVCR::SVCRSMZA) { MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false, /*IsImplicit=*/true)); MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/true, /*IsImplicit=*/true)); + MI.addOperand(MachineOperand::CreateReg(AArch64::FPMR, /*IsDef=*/true, + /*IsImplicit=*/true)); } } diff --git a/llvm/test/CodeGen/AArch64/sme-write-fpmr.ll b/llvm/test/CodeGen/AArch64/sme-write-fpmr.ll new file mode 100644 index 0000000000000..074d65831a584 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-write-fpmr.ll @@ -0,0 +1,23 @@ +; RUN: llc -mattr=+sme -stop-after=finalize-isel < %s | FileCheck %s + +target triple = "aarch64" + +; Check that we don't define FPMR for 'smstart za' and 'smstop za' +define void @smstart_za() "aarch64_new_za" nounwind { + ; CHECK-LABEL: name: smstart_za + ; CHECK-NOT: implicit-def {{[^,]*}}$fpmr + ret void +} + +; Check that we do define FPMR for 'smstart sm' and 'smstop sm' +define void @smstart_sm() nounwind { + ; CHECK-LABEL: name: smstart_sm + ; CHECK: MSRpstatesvcrImm1 1, 1, + ; CHECK-SAME: implicit-def {{[^,]*}}$fpmr + ; CHECK: MSRpstatesvcrImm1 1, 0, + ; CHECK-SAME: implicit-def {{[^,]*}}$fpmr + call void @require_sm() + ret void +} + +declare void @require_sm() "aarch64_pstate_sm_enabled" From 41c97afea055a5b7264167ec47b8c14c0f471f2f Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 16 Apr 2025 06:53:10 -0400 Subject: [PATCH 102/710] [SLP][NFC]Remove handling of duplicates from getGatherCost Duplicates are handled in BoUpSLP::processBuildVector (see TryPackScalars), support for duplicates in getGatherCost is not needed anymore. 
Reviewers: hiraditya, RKSimon Reviewed By: hiraditya, RKSimon Pull Request: https://github.com/llvm/llvm-project/pull/135834 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 34 ++----------------- 1 file changed, 2 insertions(+), 32 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index b48674f6993e3..f9acc276f37f9 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -15727,13 +15727,10 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef VL, bool ForPoisonSrc, Type *ScalarTy) const { const unsigned VF = VL.size(); auto *VecTy = getWidenedType(ScalarTy, VF); - bool DuplicateNonConst = false; // Find the cost of inserting/extracting values from the vector. // Check if the same elements are inserted several times and count them as // shuffle candidates. - APInt ShuffledElements = APInt::getZero(VF); APInt DemandedElements = APInt::getZero(VF); - DenseMap UniqueElements; constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; InstructionCost Cost; auto EstimateInsertCost = [&](unsigned I, Value *V) { @@ -15742,32 +15739,18 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef VL, bool ForPoisonSrc, Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(), TTI::CastContextHint::None, CostKind); }; - SmallVector ShuffleMask(VF, PoisonMaskElem); SmallVector ConstantShuffleMask(VF, PoisonMaskElem); std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0); for (auto [I, V] : enumerate(VL)) { // No need to shuffle duplicates for constants. - if ((ForPoisonSrc && isConstant(V)) || isa(V)) { - ShuffledElements.setBit(I); - ShuffleMask[I] = isa(V) ? 
PoisonMaskElem : I; + if ((ForPoisonSrc && isConstant(V)) || isa(V)) continue; - } if (isConstant(V)) { ConstantShuffleMask[I] = I + VF; - ShuffleMask[I] = I; - continue; - } - auto Res = UniqueElements.try_emplace(V, I); - if (Res.second) { - EstimateInsertCost(I, V); - ShuffleMask[I] = I; continue; } - - DuplicateNonConst = true; - ShuffledElements.setBit(I); - ShuffleMask[I] = Res.first->second; + EstimateInsertCost(I, V); } // FIXME: add a cost for constant vector materialization. bool IsAnyNonUndefConst = @@ -15776,15 +15759,6 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef VL, bool ForPoisonSrc, if (!ForPoisonSrc && IsAnyNonUndefConst) { Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteTwoSrc, VecTy, ConstantShuffleMask); - // Update the shuffle mask for shuffling with incoming source (all elements - // are used!) or with constant subvector. - for_each(enumerate(ShuffleMask), [&](auto P) { - if ((!ForPoisonSrc && P.value() == PoisonMaskElem) || - ConstantShuffleMask[P.index()] != PoisonMaskElem) - P.value() = P.index(); - else if (P.value() != PoisonMaskElem) - P.value() += VF; - }); } // 2. Insert unique non-constants. @@ -15793,10 +15767,6 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef VL, bool ForPoisonSrc, /*Insert=*/true, /*Extract=*/false, CostKind, ForPoisonSrc && !IsAnyNonUndefConst, VL); - // 3. Shuffle duplicates. - if (DuplicateNonConst) - Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc, - VecTy, ShuffleMask, CostKind); return Cost; } From 1e61b374ba3ba2891dc1abda732b0b9263216785 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= Date: Wed, 16 Apr 2025 13:16:04 +0200 Subject: [PATCH 103/710] [mlir][vector] Tighten the semantics of vector.gather (#135749) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch restricts `vector.gather` to only accept tensors and memrefs as valid sources. 
Currently, the source is typed as `AnyShaped`, which also includes vectors—allowing the following (invalid) construct to pass verification: ```mlir %0 = vector.gather %base[%c0][%indices], %mask, %pass_thru : vector<16xf32>, vector<16xi32>, vector<16xi1>, vector<16xf32> into vector<16xf32> ``` (Note: the source %base here is a vector, which is incorrect.) In contrast, `vector.scatter` currently only accepts memrefs, so some asymmetry remains between the two ops. This PR is a step toward aligning their semantics. --- .../mlir/Dialect/Vector/IR/VectorOps.td | 2 +- mlir/include/mlir/IR/CommonTypeConstraints.td | 15 ++++++++++++- mlir/test/Dialect/Vector/invalid.mlir | 21 +++++++++++++++++++ 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td index 7fc56b1aa4e7e..d7518943229ea 100644 --- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td +++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td @@ -1972,7 +1972,7 @@ def Vector_GatherOp : DeclareOpInterfaceMethods, DeclareOpInterfaceMethods ]>, - Arguments<(ins Arg:$base, + Arguments<(ins Arg, "", [MemRead]>:$base, Variadic:$indices, VectorOfNonZeroRankOf<[AnyInteger, Index]>:$index_vec, VectorOfNonZeroRankOf<[I1]>:$mask, diff --git a/mlir/include/mlir/IR/CommonTypeConstraints.td b/mlir/include/mlir/IR/CommonTypeConstraints.td index e6f17ded4628b..45ec1846580f2 100644 --- a/mlir/include/mlir/IR/CommonTypeConstraints.td +++ b/mlir/include/mlir/IR/CommonTypeConstraints.td @@ -63,6 +63,9 @@ def IsTensorTypePred : CPred<"::llvm::isa<::mlir::TensorType>($_self)">; // Whether a type is a MemRefType. def IsMemRefTypePred : CPred<"::llvm::isa<::mlir::MemRefType>($_self)">; +// Whether a type is a TensorType or a MemRefType. 
+def IsTensorOrMemRefTypePred : Or<[IsTensorTypePred, IsMemRefTypePred]>; + // Whether a type is an UnrankedMemRefType def IsUnrankedMemRefTypePred : CPred<"::llvm::isa<::mlir::UnrankedMemRefType>($_self)">; @@ -426,7 +429,9 @@ class ValueSemanticsContainerOf allowedTypes> : ShapedContainerType; +//===----------------------------------------------------------------------===// // Vector types. +//===----------------------------------------------------------------------===// class VectorOfNonZeroRankOf allowedTypes> : ShapedContainerType allowedTypes> def AnyStaticShapeTensor : StaticShapeTensorOf<[AnyType]>; //===----------------------------------------------------------------------===// -// Memref type. +// Memref types. //===----------------------------------------------------------------------===// // Any unranked memref whose element type is from the given `allowedTypes` list. @@ -878,6 +883,14 @@ class NestedTupleOf allowedTypes> : "getFlattenedTypes(::llvm::cast<::mlir::TupleType>($_self))", "nested tuple">; +//===----------------------------------------------------------------------===// +// Mixed types +//===----------------------------------------------------------------------===// + +class TensorOrMemRef allowedTypes> : + ShapedContainerType; + //===----------------------------------------------------------------------===// // Common type constraints //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/Vector/invalid.mlir b/mlir/test/Dialect/Vector/invalid.mlir index dbf829e014b8d..3a8320971bac4 100644 --- a/mlir/test/Dialect/Vector/invalid.mlir +++ b/mlir/test/Dialect/Vector/invalid.mlir @@ -1409,6 +1409,16 @@ func.func @maskedstore_memref_mismatch(%base: memref, %mask: vector<16xi1 // ----- +func.func @gather_from_vector(%base: vector<16xf32>, %indices: vector<16xi32>, + %mask: vector<16xi1>, %pass_thru: vector<16xf32>) { + %c0 = arith.constant 0 : index + // expected-error@+1 {{'vector.gather' 
op operand #0 must be Tensor or MemRef of any type values, but got 'vector<16xf32>'}} + %0 = vector.gather %base[%c0][%indices], %mask, %pass_thru + : vector<16xf32>, vector<16xi32>, vector<16xi1>, vector<16xf32> into vector<16xf32> +} + +// ----- + func.func @gather_base_type_mismatch(%base: memref, %indices: vector<16xi32>, %mask: vector<16xi1>, %pass_thru: vector<16xf32>) { %c0 = arith.constant 0 : index @@ -1469,6 +1479,17 @@ func.func @gather_pass_thru_type_mismatch(%base: memref, %indices: vector // ----- +func.func @scatter_to_vector(%base: vector<16xf32>, %indices: vector<16xi32>, + %mask: vector<16xi1>, %pass_thru: vector<16xf32>) { + %c0 = arith.constant 0 : index + // expected-error@+2 {{custom op 'vector.scatter' invalid kind of type specified}} + vector.scatter %base[%c0][%indices], %mask, %pass_thru + : vector<16xf32>, vector<16xi32>, vector<16xi1>, vector<16xf32> into vector<16xf32> +} + +// ----- + + func.func @scatter_base_type_mismatch(%base: memref, %indices: vector<16xi32>, %mask: vector<16xi1>, %value: vector<16xf32>) { %c0 = arith.constant 0 : index From 2d63faead4e6339e679ab62113f47112d67a5b06 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Wed, 16 Apr 2025 13:21:25 +0200 Subject: [PATCH 104/710] [clang][bytecode][NFC] Remove PT_FnPtr (#135947) We don't need this anymore since we don't return it from classify() anymore. 
--- clang/lib/AST/ByteCode/Compiler.cpp | 15 ++++++------- clang/lib/AST/ByteCode/Disasm.cpp | 2 -- clang/lib/AST/ByteCode/EvalEmitter.cpp | 12 ----------- clang/lib/AST/ByteCode/FunctionPointer.cpp | 13 ++++------- clang/lib/AST/ByteCode/FunctionPointer.h | 25 ++-------------------- clang/lib/AST/ByteCode/Interp.h | 21 ------------------ clang/lib/AST/ByteCode/InterpBuiltin.cpp | 6 +----- clang/lib/AST/ByteCode/InterpStack.h | 2 -- clang/lib/AST/ByteCode/Opcodes.td | 7 ++---- clang/lib/AST/ByteCode/Pointer.cpp | 6 +++--- clang/lib/AST/ByteCode/PrimType.h | 9 ++------ 11 files changed, 20 insertions(+), 98 deletions(-) diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index afd8d09a088cd..157e306e5cdb3 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -4057,7 +4057,7 @@ template bool Compiler::visitBool(const Expr *E) { return true; // Convert pointers to bool. - if (T == PT_Ptr || T == PT_FnPtr) { + if (T == PT_Ptr) { if (!this->emitNull(*T, 0, nullptr, E)) return false; return this->emitNE(*T, E); @@ -4103,8 +4103,6 @@ bool Compiler::visitZeroInitializer(PrimType T, QualType QT, case PT_Ptr: return this->emitNullPtr(Ctx.getASTContext().getTargetNullPointerValue(QT), nullptr, E); - case PT_FnPtr: - return this->emitNullFnPtr(0, nullptr, E); case PT_MemberPtr: return this->emitNullMemberPtr(0, nullptr, E); case PT_Float: @@ -4255,7 +4253,6 @@ bool Compiler::emitConst(T Value, PrimType Ty, const Expr *E) { case PT_Bool: return this->emitConstBool(Value, E); case PT_Ptr: - case PT_FnPtr: case PT_MemberPtr: case PT_Float: case PT_IntAP: @@ -4956,7 +4953,7 @@ bool Compiler::VisitCallExpr(const CallExpr *E) { // If we know the callee already, check the known parametrs for nullability. 
if (FuncDecl && NonNullArgs[ArgIndex]) { PrimType ArgT = classify(Arg).value_or(PT_Ptr); - if (ArgT == PT_Ptr || ArgT == PT_FnPtr) { + if (ArgT == PT_Ptr) { if (!this->emitCheckNonNullArg(ArgT, Arg)) return false; } @@ -5997,7 +5994,7 @@ bool Compiler::VisitUnaryOperator(const UnaryOperator *E) { if (!this->visit(SubExpr)) return false; - if (T == PT_Ptr || T == PT_FnPtr) { + if (T == PT_Ptr) { if (!this->emitIncPtr(E)) return false; @@ -6021,7 +6018,7 @@ bool Compiler::VisitUnaryOperator(const UnaryOperator *E) { if (!this->visit(SubExpr)) return false; - if (T == PT_Ptr || T == PT_FnPtr) { + if (T == PT_Ptr) { if (!this->emitDecPtr(E)) return false; @@ -6045,7 +6042,7 @@ bool Compiler::VisitUnaryOperator(const UnaryOperator *E) { if (!this->visit(SubExpr)) return false; - if (T == PT_Ptr || T == PT_FnPtr) { + if (T == PT_Ptr) { if (!this->emitLoadPtr(E)) return false; if (!this->emitConstUint8(1, E)) @@ -6088,7 +6085,7 @@ bool Compiler::VisitUnaryOperator(const UnaryOperator *E) { if (!this->visit(SubExpr)) return false; - if (T == PT_Ptr || T == PT_FnPtr) { + if (T == PT_Ptr) { if (!this->emitLoadPtr(E)) return false; if (!this->emitConstUint8(1, E)) diff --git a/clang/lib/AST/ByteCode/Disasm.cpp b/clang/lib/AST/ByteCode/Disasm.cpp index 4bdf0f0afb1b0..83b9af9de0796 100644 --- a/clang/lib/AST/ByteCode/Disasm.cpp +++ b/clang/lib/AST/ByteCode/Disasm.cpp @@ -256,8 +256,6 @@ static const char *primTypeToString(PrimType T) { return "Float"; case PT_Ptr: return "Ptr"; - case PT_FnPtr: - return "FnPtr"; case PT_MemberPtr: return "MemberPtr"; case PT_FixedPoint: diff --git a/clang/lib/AST/ByteCode/EvalEmitter.cpp b/clang/lib/AST/ByteCode/EvalEmitter.cpp index a37be38c7a839..71d688498ffa5 100644 --- a/clang/lib/AST/ByteCode/EvalEmitter.cpp +++ b/clang/lib/AST/ByteCode/EvalEmitter.cpp @@ -142,10 +142,6 @@ bool EvalEmitter::speculate(const CallExpr *E, const LabelTy &EndLabel) { if (T == PT_Ptr) { const auto &Ptr = S.Stk.pop(); return this->emitBool(CheckBCPResult(S, Ptr), 
E); - } else if (T == PT_FnPtr) { - S.Stk.discard(); - // Never accepted - return this->emitBool(false, E); } // Otherwise, this is fine! @@ -210,14 +206,6 @@ template <> bool EvalEmitter::emitRet(const SourceInfo &Info) { return true; } -template <> bool EvalEmitter::emitRet(const SourceInfo &Info) { - if (!isActive()) - return true; - - // Function pointers cannot be converted to rvalues. - EvalResult.setFunctionPointer(S.Stk.pop()); - return true; -} bool EvalEmitter::emitRetVoid(const SourceInfo &Info) { EvalResult.setValid(); diff --git a/clang/lib/AST/ByteCode/FunctionPointer.cpp b/clang/lib/AST/ByteCode/FunctionPointer.cpp index 6b0b559a63386..4ab7af170efe4 100644 --- a/clang/lib/AST/ByteCode/FunctionPointer.cpp +++ b/clang/lib/AST/ByteCode/FunctionPointer.cpp @@ -16,27 +16,22 @@ APValue FunctionPointer::toAPValue(const ASTContext &) const { return APValue(static_cast(nullptr), CharUnits::Zero(), {}, /*OnePastTheEnd=*/false, /*IsNull=*/true); - if (!Valid) - return APValue(static_cast(nullptr), - CharUnits::fromQuantity(getIntegerRepresentation()), {}, - /*OnePastTheEnd=*/false, /*IsNull=*/false); - if (Func->getDecl()) - return APValue(Func->getDecl(), CharUnits::fromQuantity(Offset), {}, + return APValue(Func->getDecl(), CharUnits::fromQuantity(0), {}, /*OnePastTheEnd=*/false, /*IsNull=*/false); - return APValue(Func->getExpr(), CharUnits::fromQuantity(Offset), {}, + return APValue(Func->getExpr(), CharUnits::fromQuantity(0), {}, /*OnePastTheEnd=*/false, /*IsNull=*/false); } void FunctionPointer::print(llvm::raw_ostream &OS) const { OS << "FnPtr("; - if (Func && Valid) + if (Func) OS << Func->getName(); else if (Func) OS << reinterpret_cast(Func); else OS << "nullptr"; - OS << ") + " << Offset; + OS << ")"; } } // namespace interp diff --git a/clang/lib/AST/ByteCode/FunctionPointer.h b/clang/lib/AST/ByteCode/FunctionPointer.h index e2b45b2344fdc..9e8ea2f1af5f8 100644 --- a/clang/lib/AST/ByteCode/FunctionPointer.h +++ 
b/clang/lib/AST/ByteCode/FunctionPointer.h @@ -20,24 +20,15 @@ namespace interp { class FunctionPointer final { private: const Function *Func; - uint64_t Offset; - bool Valid; public: FunctionPointer() = default; - FunctionPointer(const Function *Func, uint64_t Offset = 0) - : Func(Func), Offset(Offset), Valid(true) {} - - FunctionPointer(uintptr_t IntVal, const Descriptor *Desc = nullptr) - : Func(reinterpret_cast(IntVal)), Offset(0), - Valid(false) {} + FunctionPointer(const Function *Func) : Func(Func) {} const Function *getFunction() const { return Func; } - uint64_t getOffset() const { return Offset; } bool isZero() const { return !Func; } - bool isValid() const { return Valid; } bool isWeak() const { - if (!Func || !Valid || !Func->getDecl()) + if (!Func || !Func->getDecl()) return false; return Func->getDecl()->isWeak(); @@ -56,20 +47,8 @@ class FunctionPointer final { uint64_t getIntegerRepresentation() const { return static_cast(reinterpret_cast(Func)); } - - ComparisonCategoryResult compare(const FunctionPointer &RHS) const { - if (Func == RHS.Func && Offset == RHS.Offset) - return ComparisonCategoryResult::Equal; - return ComparisonCategoryResult::Unordered; - } }; -inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, - FunctionPointer FP) { - FP.print(OS); - return OS; -} - } // namespace interp } // namespace clang diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index 88a011efe708e..bd58c2a88e9d9 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -20,7 +20,6 @@ #include "FixedPoint.h" #include "Floating.h" #include "Function.h" -#include "FunctionPointer.h" #include "InterpBuiltinBitCast.h" #include "InterpFrame.h" #include "InterpStack.h" @@ -984,26 +983,6 @@ bool CmpHelperEQ(InterpState &S, CodePtr OpPC, CompareFn Fn) { return CmpHelper(S, OpPC, Fn); } -template <> -inline bool CmpHelperEQ(InterpState &S, CodePtr OpPC, - CompareFn Fn) { - const auto &RHS = S.Stk.pop(); - const 
auto &LHS = S.Stk.pop(); - - // We cannot compare against weak declarations at compile time. - for (const auto &FP : {LHS, RHS}) { - if (FP.isWeak()) { - const SourceInfo &Loc = S.Current->getSource(OpPC); - S.FFDiag(Loc, diag::note_constexpr_pointer_weak_comparison) - << FP.toDiagnosticString(S.getASTContext()); - return false; - } - } - - S.Stk.push(Boolean::from(Fn(LHS.compare(RHS)))); - return true; -} - template <> inline bool CmpHelper(InterpState &S, CodePtr OpPC, CompareFn Fn) { using BoolT = PrimConv::T; diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index bde416d98edd3..d06941bf10fe0 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -130,7 +130,6 @@ static bool retPrimValue(InterpState &S, CodePtr OpPC, return Ret(S, OpPC); switch (*T) { RET_CASE(PT_Ptr); - RET_CASE(PT_FnPtr); RET_CASE(PT_Float); RET_CASE(PT_Bool); RET_CASE(PT_Sint8); @@ -766,10 +765,7 @@ static bool interp__builtin_addressof(InterpState &S, CodePtr OpPC, assert(Call->getArg(0)->isLValue()); PrimType PtrT = S.getContext().classify(Call->getArg(0)).value_or(PT_Ptr); - if (PtrT == PT_FnPtr) { - const FunctionPointer &Arg = S.Stk.peek(); - S.Stk.push(Arg); - } else if (PtrT == PT_Ptr) { + if (PtrT == PT_Ptr) { const Pointer &Arg = S.Stk.peek(); S.Stk.push(Arg); } else { diff --git a/clang/lib/AST/ByteCode/InterpStack.h b/clang/lib/AST/ByteCode/InterpStack.h index f7b8c386bcc13..0b76f1d650580 100644 --- a/clang/lib/AST/ByteCode/InterpStack.h +++ b/clang/lib/AST/ByteCode/InterpStack.h @@ -183,8 +183,6 @@ class InterpStack final { return PT_Uint64; else if constexpr (std::is_same_v) return PT_Float; - else if constexpr (std::is_same_v) - return PT_FnPtr; else if constexpr (std::is_same_v>) return PT_IntAP; else if constexpr (std::is_same_v>) diff --git a/clang/lib/AST/ByteCode/Opcodes.td b/clang/lib/AST/ByteCode/Opcodes.td index 798771bf91f05..5a9079fea0846 100644 --- 
a/clang/lib/AST/ByteCode/Opcodes.td +++ b/clang/lib/AST/ByteCode/Opcodes.td @@ -29,7 +29,6 @@ def IntAP : Type; def IntAPS : Type; def Float : Type; def Ptr : Type; -def FnPtr : Type; def MemberPtr : Type; def FixedPoint : Type; @@ -106,9 +105,7 @@ def AluTypeClass : TypeClass { let Types = !listconcat(IntegerTypeClass.Types, [Bool], [FixedPoint]); } -def PtrTypeClass : TypeClass { - let Types = [Ptr, FnPtr, MemberPtr]; -} +def PtrTypeClass : TypeClass { let Types = [Ptr, MemberPtr]; } def NonPtrTypeClass : TypeClass { let Types = !listconcat(IntegerTypeClass.Types, [Bool], [Float], [FixedPoint]); @@ -119,7 +116,7 @@ def AllTypeClass : TypeClass { } def ComparableTypeClass : TypeClass { - let Types = !listconcat(AluTypeClass.Types, [Ptr], [Float], [FnPtr]); + let Types = !listconcat(AluTypeClass.Types, [Ptr], [Float]); } class SingletonTypeClass : TypeClass { diff --git a/clang/lib/AST/ByteCode/Pointer.cpp b/clang/lib/AST/ByteCode/Pointer.cpp index c09d3224b1f36..c43c0a063bd9e 100644 --- a/clang/lib/AST/ByteCode/Pointer.cpp +++ b/clang/lib/AST/ByteCode/Pointer.cpp @@ -155,10 +155,10 @@ APValue Pointer::toAPValue(const ASTContext &ASTCtx) const { if (isFunctionPointer()) { const FunctionPointer &FP = asFunctionPointer(); if (const FunctionDecl *FD = FP.getFunction()->getDecl()) - return APValue(FD, CharUnits::fromQuantity(FP.getOffset() + Offset), {}, + return APValue(FD, CharUnits::fromQuantity(Offset), {}, /*OnePastTheEnd=*/false, /*IsNull=*/false); - return APValue(FP.getFunction()->getExpr(), - CharUnits::fromQuantity(FP.getOffset() + Offset), {}, + return APValue(FP.getFunction()->getExpr(), CharUnits::fromQuantity(Offset), + {}, /*OnePastTheEnd=*/false, /*IsNull=*/false); } diff --git a/clang/lib/AST/ByteCode/PrimType.h b/clang/lib/AST/ByteCode/PrimType.h index 59c04c4673d93..a3c0b0f3ceca8 100644 --- a/clang/lib/AST/ByteCode/PrimType.h +++ b/clang/lib/AST/ByteCode/PrimType.h @@ -46,12 +46,11 @@ enum PrimType : unsigned { PT_FixedPoint = 11, PT_Float = 12, 
PT_Ptr = 13, - PT_FnPtr = 14, - PT_MemberPtr = 15, + PT_MemberPtr = 14, }; inline constexpr bool isPtrType(PrimType T) { - return T == PT_Ptr || T == PT_FnPtr || T == PT_MemberPtr; + return T == PT_Ptr || T == PT_MemberPtr; } enum class CastKind : uint8_t { @@ -114,9 +113,6 @@ template <> struct PrimConv { template <> struct PrimConv { using T = Pointer; }; -template <> struct PrimConv { - using T = FunctionPointer; -}; template <> struct PrimConv { using T = MemberPointer; }; @@ -166,7 +162,6 @@ static inline bool aligned(const void *P) { TYPE_SWITCH_CASE(PT_Float, B) \ TYPE_SWITCH_CASE(PT_Bool, B) \ TYPE_SWITCH_CASE(PT_Ptr, B) \ - TYPE_SWITCH_CASE(PT_FnPtr, B) \ TYPE_SWITCH_CASE(PT_MemberPtr, B) \ TYPE_SWITCH_CASE(PT_FixedPoint, B) \ } \ From bc03d6cce25712601423398350f56114e64e4e29 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 16 Apr 2025 13:30:45 +0200 Subject: [PATCH 105/710] [VPlan] Introduce all loop regions as VPlan transform. (NFC) (#129402) Further simplify VPlan CFG builder by moving introduction of inner regions to a VPlan transform, building on https://github.com/llvm/llvm-project/pull/128419. The HCFG builder now only constructs plain CFGs. I will move it to VPlanConstruction as follow-up. Depends on https://github.com/llvm/llvm-project/pull/128419. 
PR: https://github.com/llvm/llvm-project/pull/129402 --- .../Transforms/Vectorize/LoopVectorize.cpp | 16 +- llvm/lib/Transforms/Vectorize/VPlan.h | 8 + .../Vectorize/VPlanConstruction.cpp | 103 +++++++++--- .../Transforms/Vectorize/VPlanHCFGBuilder.cpp | 150 +++--------------- .../Transforms/Vectorize/VPlanHCFGBuilder.h | 8 +- .../Transforms/Vectorize/VPlanTransforms.h | 19 ++- llvm/lib/Transforms/Vectorize/VPlanValue.h | 6 + .../vplan-printing-outer-loop.ll | 29 ++-- .../LoopVectorize/vplan_hcfg_stress_test.ll | 2 +- .../Transforms/Vectorize/VPlanTestBase.h | 6 +- 10 files changed, 149 insertions(+), 198 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index af94dc01c8c5c..dd7f05465a50b 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9544,14 +9544,14 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { Range); auto Plan = std::make_unique(OrigLoop); // Build hierarchical CFG. - // Convert to VPlan-transform and consoliate all transforms for VPlan + // TODO: Convert to VPlan-transform and consolidate all transforms for VPlan // creation. 
VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); - HCFGBuilder.buildHierarchicalCFG(); + HCFGBuilder.buildPlainCFG(); - VPlanTransforms::introduceTopLevelVectorLoopRegion( - *Plan, Legal->getWidestInductionType(), PSE, RequiresScalarEpilogueCheck, - CM.foldTailByMasking(), OrigLoop); + VPlanTransforms::createLoopRegions(*Plan, Legal->getWidestInductionType(), + PSE, RequiresScalarEpilogueCheck, + CM.foldTailByMasking(), OrigLoop); // Don't use getDecisionAndClampRange here, because we don't know the UF // so this function is better to be conservative, rather than to split @@ -9851,10 +9851,10 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) { auto Plan = std::make_unique(OrigLoop); // Build hierarchical CFG VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); - HCFGBuilder.buildHierarchicalCFG(); + HCFGBuilder.buildPlainCFG(); - VPlanTransforms::introduceTopLevelVectorLoopRegion( - *Plan, Legal->getWidestInductionType(), PSE, true, false, OrigLoop); + VPlanTransforms::createLoopRegions(*Plan, Legal->getWidestInductionType(), + PSE, true, false, OrigLoop); for (ElementCount VF : Range) Plan->addVF(VF); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 94b5167c60089..7084676af6d5b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -57,6 +57,7 @@ class SCEV; class Type; class VPBasicBlock; class VPBuilder; +class VPDominatorTree; class VPRegionBlock; class VPlan; class VPLane; @@ -303,6 +304,13 @@ class VPBlockBase { /// Remove all the successors of this block. void clearSuccessors() { Successors.clear(); } + /// Swap predecessors of the block. The block must have exactly 2 + /// predecessors. + void swapPredecessors() { + assert(Predecessors.size() == 2 && "must have 2 predecessors to swap"); + std::swap(Predecessors[0], Predecessors[1]); + } + /// Swap successors of the block. The block must have exactly 2 successors. 
// TODO: This should be part of introducing conditional branch recipes rather // than being independent. diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index f58f0290b5fa9..1e687d0879f18 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -14,26 +14,88 @@ #include "LoopVectorizationPlanner.h" #include "VPlan.h" #include "VPlanCFG.h" +#include "VPlanDominatorTree.h" #include "VPlanTransforms.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolution.h" using namespace llvm; -void VPlanTransforms::introduceTopLevelVectorLoopRegion( - VPlan &Plan, Type *InductionTy, PredicatedScalarEvolution &PSE, - bool RequiresScalarEpilogueCheck, bool TailFolded, Loop *TheLoop) { - // TODO: Generalize to introduce all loop regions. - auto *HeaderVPBB = cast(Plan.getEntry()->getSingleSuccessor()); - VPBlockUtils::disconnectBlocks(Plan.getEntry(), HeaderVPBB); +/// Checks if \p HeaderVPB is a loop header block in the plain CFG; that is, it +/// has exactly 2 predecessors (preheader and latch), where the block +/// dominates the latch and the preheader dominates the block. If it is a +/// header block return true, making sure the preheader appears first and +/// the latch second. Otherwise return false. 
+static bool canonicalHeader(VPBlockBase *HeaderVPB, + const VPDominatorTree &VPDT) { + ArrayRef Preds = HeaderVPB->getPredecessors(); + if (Preds.size() != 2) + return false; - VPBasicBlock *OriginalLatch = - cast(HeaderVPBB->getSinglePredecessor()); - VPBlockUtils::disconnectBlocks(OriginalLatch, HeaderVPBB); - VPBasicBlock *VecPreheader = Plan.createVPBasicBlock("vector.ph"); - VPBlockUtils::connectBlocks(Plan.getEntry(), VecPreheader); - assert(OriginalLatch->getNumSuccessors() == 0 && - "Plan should end at top level latch"); + auto *PreheaderVPBB = Preds[0]; + auto *LatchVPBB = Preds[1]; + if (VPDT.dominates(PreheaderVPBB, HeaderVPB) && + VPDT.dominates(HeaderVPB, LatchVPBB)) + return true; + + std::swap(PreheaderVPBB, LatchVPBB); + + if (VPDT.dominates(PreheaderVPBB, HeaderVPB) && + VPDT.dominates(HeaderVPB, LatchVPBB)) { + // Canonicalize predecessors of header so that preheader is first and latch + // second. + HeaderVPB->swapPredecessors(); + for (VPRecipeBase &R : cast(HeaderVPB)->phis()) + R.swapOperands(); + return true; + } + + return false; +} + +/// Create a new VPRegionBlock for the loop starting at \p HeaderVPB. +static void createLoopRegion(VPlan &Plan, VPBlockBase *HeaderVPB) { + auto *PreheaderVPBB = HeaderVPB->getPredecessors()[0]; + auto *LatchVPBB = HeaderVPB->getPredecessors()[1]; + + VPBlockUtils::disconnectBlocks(PreheaderVPBB, HeaderVPB); + VPBlockUtils::disconnectBlocks(LatchVPBB, HeaderVPB); + VPBlockBase *Succ = LatchVPBB->getSingleSuccessor(); + assert(LatchVPBB->getNumSuccessors() <= 1 && + "Latch has more than one successor"); + if (Succ) + VPBlockUtils::disconnectBlocks(LatchVPBB, Succ); + + auto *R = Plan.createVPRegionBlock(HeaderVPB, LatchVPBB, "", + false /*isReplicator*/); + R->setParent(HeaderVPB->getParent()); + // All VPBB's reachable shallowly from HeaderVPB belong to top level loop, + // because VPlan is expected to end at top level latch disconnected above. 
+ for (VPBlockBase *VPBB : vp_depth_first_shallow(HeaderVPB)) + VPBB->setParent(R); + + VPBlockUtils::insertBlockAfter(R, PreheaderVPBB); + if (Succ) + VPBlockUtils::connectBlocks(R, Succ); +} + +void VPlanTransforms::createLoopRegions(VPlan &Plan, Type *InductionTy, + PredicatedScalarEvolution &PSE, + bool RequiresScalarEpilogueCheck, + bool TailFolded, Loop *TheLoop) { + VPDominatorTree VPDT; + VPDT.recalculate(Plan); + for (VPBlockBase *HeaderVPB : vp_depth_first_shallow(Plan.getEntry())) + if (canonicalHeader(HeaderVPB, VPDT)) + createLoopRegion(Plan, HeaderVPB); + + VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); + auto *OrigExiting = TopRegion->getExiting(); + VPBasicBlock *LatchVPBB = Plan.createVPBasicBlock("vector.latch"); + VPBlockUtils::insertBlockAfter(LatchVPBB, OrigExiting); + TopRegion->setExiting(LatchVPBB); + TopRegion->setName("vector loop"); + TopRegion->getEntryBasicBlock()->setName("vector.body"); // Create SCEV and VPValue for the trip count. // We use the symbolic max backedge-taken-count, which works also when @@ -47,18 +109,9 @@ void VPlanTransforms::introduceTopLevelVectorLoopRegion( Plan.setTripCount( vputils::getOrCreateVPValueForSCEVExpr(Plan, TripCount, SE)); - // Create VPRegionBlock, with existing header and new empty latch block, to be - // filled. - VPBasicBlock *LatchVPBB = Plan.createVPBasicBlock("vector.latch"); - VPBlockUtils::insertBlockAfter(LatchVPBB, OriginalLatch); - auto *TopRegion = Plan.createVPRegionBlock( - HeaderVPBB, LatchVPBB, "vector loop", false /*isReplicator*/); - // All VPBB's reachable shallowly from HeaderVPBB belong to top level loop, - // because VPlan is expected to end at top level latch. 
- for (VPBlockBase *VPBB : vp_depth_first_shallow(HeaderVPBB)) - VPBB->setParent(TopRegion); - - VPBlockUtils::insertBlockAfter(TopRegion, VecPreheader); + VPBasicBlock *VecPreheader = Plan.createVPBasicBlock("vector.ph"); + VPBlockUtils::insertBlockAfter(VecPreheader, Plan.getEntry()); + VPBasicBlock *MiddleVPBB = Plan.createVPBasicBlock("middle.block"); VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion); diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp index 4b8a2420b3037..5bacd2d4e6d88 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp @@ -12,9 +12,7 @@ /// components and steps: // /// 1. PlainCFGBuilder class: builds a plain VPBasicBlock-based CFG that -/// faithfully represents the CFG in the incoming IR. A VPRegionBlock (Top -/// Region) is created to enclose and serve as parent of all the VPBasicBlocks -/// in the plain CFG. +/// faithfully represents the CFG in the incoming IR. /// NOTE: At this point, there is a direct correspondence between all the /// VPBasicBlocks created for the initial plain CFG and the incoming /// BasicBlocks. However, this might change in the future. @@ -57,12 +55,8 @@ class PlainCFGBuilder { // Hold phi node's that need to be fixed once the plain CFG has been built. SmallVector PhisToFix; - /// Maps loops in the original IR to their corresponding region. - DenseMap Loop2Region; - // Utility functions. void setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB); - void setRegionPredsFromBB(VPRegionBlock *VPBB, BasicBlock *BB); void fixHeaderPhis(); VPBasicBlock *getOrCreateVPBB(BasicBlock *BB); #ifndef NDEBUG @@ -83,25 +77,6 @@ class PlainCFGBuilder { // Set predecessors of \p VPBB in the same order as they are in \p BB. \p VPBB // must have no predecessors. 
void PlainCFGBuilder::setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB) { - auto GetLatchOfExit = [this](BasicBlock *BB) -> BasicBlock * { - auto *SinglePred = BB->getSinglePredecessor(); - Loop *LoopForBB = LI->getLoopFor(BB); - if (!SinglePred || LI->getLoopFor(SinglePred) == LoopForBB) - return nullptr; - // The input IR must be in loop-simplify form, ensuring a single predecessor - // for exit blocks. - assert(SinglePred == LI->getLoopFor(SinglePred)->getLoopLatch() && - "SinglePred must be the only loop latch"); - return SinglePred; - }; - if (auto *LatchBB = GetLatchOfExit(BB)) { - auto *PredRegion = getOrCreateVPBB(LatchBB)->getParent(); - assert(VPBB == cast(PredRegion->getSingleSuccessor()) && - "successor must already be set for PredRegion; it must have VPBB " - "as single successor"); - VPBB->setPredecessors({PredRegion}); - return; - } // Collect VPBB predecessors. SmallVector VPBBPreds; for (BasicBlock *Pred : predecessors(BB)) @@ -113,13 +88,6 @@ static bool isHeaderBB(BasicBlock *BB, Loop *L) { return L && BB == L->getHeader(); } -void PlainCFGBuilder::setRegionPredsFromBB(VPRegionBlock *Region, - BasicBlock *BB) { - // BB is a loop header block. Connect the region to the loop preheader. - Loop *LoopOfBB = LI->getLoopFor(BB); - Region->setPredecessors({getOrCreateVPBB(LoopOfBB->getLoopPredecessor())}); -} - // Add operands to VPInstructions representing phi nodes from the input IR. void PlainCFGBuilder::fixHeaderPhis() { for (auto *Phi : PhisToFix) { @@ -130,43 +98,18 @@ void PlainCFGBuilder::fixHeaderPhis() { auto *VPPhi = cast(VPVal); assert(VPPhi->getNumOperands() == 0 && "Expected VPInstruction with no operands."); - - Loop *L = LI->getLoopFor(Phi->getParent()); - assert(isHeaderBB(Phi->getParent(), L)); - // For header phis, make sure the incoming value from the loop - // predecessor is the first operand of the recipe. 
+ assert(isHeaderBB(Phi->getParent(), LI->getLoopFor(Phi->getParent())) && + "Expected Phi in header block."); assert(Phi->getNumOperands() == 2 && "header phi must have exactly 2 operands"); - BasicBlock *LoopPred = L->getLoopPredecessor(); - VPPhi->addOperand( - getOrCreateVPOperand(Phi->getIncomingValueForBlock(LoopPred))); - BasicBlock *LoopLatch = L->getLoopLatch(); - VPPhi->addOperand( - getOrCreateVPOperand(Phi->getIncomingValueForBlock(LoopLatch))); - } -} - -static bool isHeaderVPBB(VPBasicBlock *VPBB) { - return VPBB->getParent() && VPBB->getParent()->getEntry() == VPBB; -} - -/// Return true of \p L loop is contained within \p OuterLoop. -static bool doesContainLoop(const Loop *L, const Loop *OuterLoop) { - if (L->getLoopDepth() < OuterLoop->getLoopDepth()) - return false; - const Loop *P = L; - while (P) { - if (P == OuterLoop) - return true; - P = P->getParentLoop(); + for (BasicBlock *Pred : predecessors(Phi->getParent())) + VPPhi->addOperand( + getOrCreateVPOperand(Phi->getIncomingValueForBlock(Pred))); } - return false; } -// Create a new empty VPBasicBlock for an incoming BasicBlock in the region -// corresponding to the containing loop or retrieve an existing one if it was -// already created. If no region exists yet for the loop containing \p BB, a new -// one is created. +// Create a new empty VPBasicBlock for an incoming BasicBlock or retrieve an +// existing one if it was already created. VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) { if (auto *VPBB = BB2VPBB.lookup(BB)) { // Retrieve existing VPBB. @@ -174,32 +117,10 @@ VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) { } // Create new VPBB. - StringRef Name = isHeaderBB(BB, TheLoop) ? 
"vector.body" : BB->getName(); + StringRef Name = BB->getName(); LLVM_DEBUG(dbgs() << "Creating VPBasicBlock for " << Name << "\n"); VPBasicBlock *VPBB = Plan.createVPBasicBlock(Name); BB2VPBB[BB] = VPBB; - - // Get or create a region for the loop containing BB, except for the top - // region of TheLoop which is created later. - Loop *LoopOfBB = LI->getLoopFor(BB); - if (!LoopOfBB || LoopOfBB == TheLoop || !doesContainLoop(LoopOfBB, TheLoop)) - return VPBB; - - auto *RegionOfVPBB = Loop2Region.lookup(LoopOfBB); - if (!isHeaderBB(BB, LoopOfBB)) { - assert(RegionOfVPBB && - "Region should have been created by visiting header earlier"); - VPBB->setParent(RegionOfVPBB); - return VPBB; - } - - assert(!RegionOfVPBB && - "First visit of a header basic block expects to register its region."); - // Handle a header - take care of its Region. - RegionOfVPBB = Plan.createVPRegionBlock(Name.str(), false /*isReplicator*/); - RegionOfVPBB->setParent(Loop2Region[LoopOfBB->getParentLoop()]); - RegionOfVPBB->setEntry(VPBB); - Loop2Region[LoopOfBB] = RegionOfVPBB; return VPBB; } @@ -351,6 +272,8 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB, // Main interface to build the plain CFG. void PlainCFGBuilder::buildPlainCFG( DenseMap &VPB2IRBB) { + VPIRBasicBlock *Entry = cast(Plan.getEntry()); + BB2VPBB[Entry->getIRBasicBlock()] = Entry; // 1. Scan the body of the loop in a topological order to visit each basic // block after having visited its predecessor basic blocks. Create a VPBB for @@ -376,26 +299,13 @@ void PlainCFGBuilder::buildPlainCFG( for (BasicBlock *BB : RPO) { // Create or retrieve the VPBasicBlock for this BB. VPBasicBlock *VPBB = getOrCreateVPBB(BB); - VPRegionBlock *Region = VPBB->getParent(); Loop *LoopForBB = LI->getLoopFor(BB); // Set VPBB predecessors in the same order as they are in the incoming BB. 
- if (!isHeaderBB(BB, LoopForBB)) { - setVPBBPredsFromBB(VPBB, BB); - } else if (Region) { - // BB is a loop header and there's a corresponding region, set the - // predecessor for it. - setRegionPredsFromBB(Region, BB); - } + setVPBBPredsFromBB(VPBB, BB); // Create VPInstructions for BB. createVPInstructionsForVPBB(VPBB, BB); - if (BB == TheLoop->getLoopLatch()) { - VPBasicBlock *HeaderVPBB = getOrCreateVPBB(LoopForBB->getHeader()); - VPBlockUtils::connectBlocks(VPBB, HeaderVPBB); - continue; - } - // Set VPBB successors. We create empty VPBBs for successors if they don't // exist already. Recipes will be created when the successor is visited // during the RPO traversal. @@ -410,10 +320,7 @@ void PlainCFGBuilder::buildPlainCFG( auto *BI = cast(BB->getTerminator()); unsigned NumSuccs = succ_size(BB); if (NumSuccs == 1) { - auto *Successor = getOrCreateVPBB(BB->getSingleSuccessor()); - VPBB->setOneSuccessor(isHeaderVPBB(Successor) - ? Successor->getParent() - : static_cast(Successor)); + VPBB->setOneSuccessor(getOrCreateVPBB(BB->getSingleSuccessor())); continue; } assert(BI->isConditional() && NumSuccs == 2 && BI->isConditional() && @@ -423,21 +330,11 @@ void PlainCFGBuilder::buildPlainCFG( BasicBlock *IRSucc1 = BI->getSuccessor(1); VPBasicBlock *Successor0 = getOrCreateVPBB(IRSucc0); VPBasicBlock *Successor1 = getOrCreateVPBB(IRSucc1); - if (BB == LoopForBB->getLoopLatch()) { - // For a latch we need to set the successor of the region rather than that - // of VPBB and it should be set to the exit, i.e., non-header successor, - // except for the top region, which is handled elsewhere. - assert(LoopForBB != TheLoop && - "Latch of the top region should have been handled earlier"); - Region->setOneSuccessor(isHeaderVPBB(Successor0) ? Successor1 - : Successor0); - Region->setExiting(VPBB); - continue; - } - // Don't connect any blocks outside the current loop except the latch for - // now. The latch is handled above. 
- if (LoopForBB) { + // Don't connect any blocks outside the current loop except the latches for + // inner loops. + // TODO: Also connect exit blocks during initial VPlan construction. + if (LoopForBB == TheLoop || BB != LoopForBB->getLoopLatch()) { if (!LoopForBB->contains(IRSucc0)) { VPBB->setOneSuccessor(Successor1); continue; @@ -456,21 +353,16 @@ void PlainCFGBuilder::buildPlainCFG( // corresponding VPlan operands. fixHeaderPhis(); - VPBlockUtils::connectBlocks(Plan.getEntry(), - getOrCreateVPBB(TheLoop->getHeader())); + Plan.getEntry()->setOneSuccessor(getOrCreateVPBB(TheLoop->getHeader())); + Plan.getEntry()->setPlan(&Plan); for (const auto &[IRBB, VPB] : BB2VPBB) VPB2IRBB[VPB] = IRBB; + + LLVM_DEBUG(Plan.setName("Plain CFG\n"); dbgs() << Plan); } void VPlanHCFGBuilder::buildPlainCFG() { PlainCFGBuilder PCFGBuilder(TheLoop, LI, Plan); PCFGBuilder.buildPlainCFG(VPB2IRBB); } - -// Public interface to build a H-CFG. -void VPlanHCFGBuilder::buildHierarchicalCFG() { - // Build Top Region enclosing the plain CFG. - buildPlainCFG(); - LLVM_DEBUG(Plan.setName("HCFGBuilder: Plain CFG\n"); dbgs() << Plan); -} diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h index f7f98ed7b1755..f2e90d3f4d9b3 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h @@ -30,7 +30,6 @@ namespace llvm { class Loop; class LoopInfo; -class VPRegionBlock; class VPlan; class VPlanTestIRBase; class VPBlockBase; @@ -54,15 +53,12 @@ class VPlanHCFGBuilder { /// created for a input IR basic block. DenseMap VPB2IRBB; - /// Build plain CFG for TheLoop and connects it to Plan's entry. - void buildPlainCFG(); - public: VPlanHCFGBuilder(Loop *Lp, LoopInfo *LI, VPlan &P) : TheLoop(Lp), LI(LI), Plan(P) {} - /// Build H-CFG for TheLoop and update Plan accordingly. - void buildHierarchicalCFG(); + /// Build plain CFG for TheLoop and connects it to Plan's entry. 
+ void buildPlainCFG(); /// Return the input IR BasicBlock corresponding to \p VPB. Returns nullptr if /// there is no such corresponding block. diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index ee3642a8aff73..a9461b261ddb6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -52,20 +52,19 @@ struct VPlanTransforms { verifyVPlanIsValid(Plan); } - /// Introduce the top-level VPRegionBlock for the main loop in \p Plan. Coming - /// into this function, \p Plan's top-level loop is modeled using a plain CFG. - /// This transform wraps the plain CFG of the top-level loop within a - /// VPRegionBlock and creates a VPValue expression for the original trip - /// count. It will also introduce a dedicated VPBasicBlock for the vector - /// pre-header as well a VPBasicBlock as exit block of the region - /// (middle.block). If a check is needed to guard executing the scalar + /// Replace loops in \p Plan's flat CFG with VPRegionBlocks, turning \p Plan's + /// flat CFG into a hierarchical CFG. It also creates a VPValue expression for + /// the original trip count. It will also introduce a dedicated VPBasicBlock + /// for the vector pre-header as well as a VPBasicBlock as exit block of the + /// region (middle.block). If a check is needed to guard executing the scalar /// epilogue loop, it will be added to the middle block, together with /// VPBasicBlocks for the scalar preheader and exit blocks. \p InductionTy is /// the type of the canonical induction and used for related values, like the /// trip count expression.
- static void introduceTopLevelVectorLoopRegion( - VPlan &Plan, Type *InductionTy, PredicatedScalarEvolution &PSE, - bool RequiresScalarEpilogueCheck, bool TailFolded, Loop *TheLoop); + static void createLoopRegions(VPlan &Plan, Type *InductionTy, + PredicatedScalarEvolution &PSE, + bool RequiresScalarEpilogueCheck, + bool TailFolded, Loop *TheLoop); /// Replaces the VPInstructions in \p Plan with corresponding /// widen recipes. Returns false if any VPInstructions could not be converted diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index ced60a30ad56e..638156eab7a84 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -246,6 +246,12 @@ class VPUser { New->addUser(*this); } + /// Swap operands of the VPUser. It must have exactly 2 operands. + void swapOperands() { + assert(Operands.size() == 2 && "must have 2 operands to swap"); + std::swap(Operands[0], Operands[1]); + } + /// Replaces all uses of \p From in the VPUser with \p To. 
void replaceUsesOfWith(VPValue *From, VPValue *To); diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-outer-loop.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-outer-loop.ll index 625a32c098f94..91a5ea6b7fe36 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing-outer-loop.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-outer-loop.ll @@ -6,35 +6,32 @@ @arr = external global [8 x [8 x i64]], align 16 define void @foo(i64 %n) { -; CHECK: VPlan 'HCFGBuilder: Plain CFG +; CHECK: VPlan 'Plain CFG ; CHECK-NEXT: { ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: -; CHECK-NEXT: Successor(s): vector.body +; CHECK-NEXT: Successor(s): outer.header ; CHECK-EMPTY: -; CHECK-NEXT: vector.body: -; CHECK-NEXT: WIDEN-PHI ir<%outer.iv> = phi ir<0>, ir<%outer.iv.next> +; CHECK-NEXT: outer.header: +; CHECK-NEXT: WIDEN-PHI ir<%outer.iv> = phi ir<%outer.iv.next>, ir<0> ; CHECK-NEXT: EMIT ir<%gep.1> = getelementptr ir<@arr2>, ir<0>, ir<%outer.iv> ; CHECK-NEXT: EMIT store ir<%outer.iv>, ir<%gep.1> ; CHECK-NEXT: EMIT ir<%add> = add ir<%outer.iv>, ir<%n> ; CHECK-NEXT: Successor(s): inner ; CHECK-EMPTY: -; CHECK-NEXT: inner: { -; CHECK-NEXT: inner: -; CHECK-NEXT: WIDEN-PHI ir<%inner.iv> = phi ir<0>, ir<%inner.iv.next> -; CHECK-NEXT: EMIT ir<%gep.2> = getelementptr ir<@arr>, ir<0>, ir<%inner.iv>, ir<%outer.iv> -; CHECK-NEXT: EMIT store ir<%add>, ir<%gep.2> -; CHECK-NEXT: EMIT ir<%inner.iv.next> = add ir<%inner.iv>, ir<1> -; CHECK-NEXT: EMIT ir<%inner.ec> = icmp ir<%inner.iv.next>, ir<8> -; CHECK-NEXT: EMIT branch-on-cond ir<%inner.ec> -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; CHECK-NEXT: Successor(s): outer.latch +; CHECK-NEXT: inner: +; CHECK-NEXT: WIDEN-PHI ir<%inner.iv> = phi ir<%inner.iv.next>, ir<0> +; CHECK-NEXT: EMIT ir<%gep.2> = getelementptr ir<@arr>, ir<0>, ir<%inner.iv>, ir<%outer.iv> +; CHECK-NEXT: EMIT store ir<%add>, ir<%gep.2> +; CHECK-NEXT: EMIT ir<%inner.iv.next> = add ir<%inner.iv>, ir<1> +; CHECK-NEXT: EMIT ir<%inner.ec> = icmp 
ir<%inner.iv.next>, ir<8> +; CHECK-NEXT: EMIT branch-on-cond ir<%inner.ec> +; CHECK-NEXT: Successor(s): outer.latch, inner ; CHECK-EMPTY: ; CHECK-NEXT: outer.latch: ; CHECK-NEXT: EMIT ir<%outer.iv.next> = add ir<%outer.iv>, ir<1> ; CHECK-NEXT: EMIT ir<%outer.ec> = icmp ir<%outer.iv.next>, ir<8> -; CHECK-NEXT: Successor(s): vector.body +; CHECK-NEXT: Successor(s): outer.header ; CHECK-NEXT: } entry: br label %outer.header diff --git a/llvm/test/Transforms/LoopVectorize/vplan_hcfg_stress_test.ll b/llvm/test/Transforms/LoopVectorize/vplan_hcfg_stress_test.ll index 89eaca0cfa8c8..29aeb7c4e97f9 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan_hcfg_stress_test.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan_hcfg_stress_test.ll @@ -4,7 +4,7 @@ ; Verify that the stress testing flag for the VPlan H-CFG builder works as ; expected with and without enabling the VPlan H-CFG Verifier. -; CHECK: VPlan 'HCFGBuilder: Plain CFG +; CHECK: VPlan 'Plain CFG target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h b/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h index caf5d2357411d..486296535996b 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h +++ b/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h @@ -73,9 +73,9 @@ class VPlanTestIRBase : public testing::Test { PredicatedScalarEvolution PSE(*SE, *L); auto Plan = std::make_unique(L); VPlanHCFGBuilder HCFGBuilder(L, LI.get(), *Plan); - HCFGBuilder.buildHierarchicalCFG(); - VPlanTransforms::introduceTopLevelVectorLoopRegion( - *Plan, IntegerType::get(*Ctx, 64), PSE, true, false, L); + HCFGBuilder.buildPlainCFG(); + VPlanTransforms::createLoopRegions(*Plan, IntegerType::get(*Ctx, 64), PSE, + true, false, L); return Plan; } }; From bb5f53c727419c90e3ad6ca7db49330c64a8f54c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 16 Apr 2025 12:34:11 +0100 Subject: [PATCH 106/710] [DAG] isSplatValue - only treat binop splats with repeated undef 
elements as undef (#135945) #135597 didn't correctly fix the issue of binops with an undef element from only one operand - only reporting the common undef elements could incorrectly recognise splats where the (binop X, undef) fold might actually be different - we need to ensure both operands have the same demanded undefs for certainty. Fixes #135917 --- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 10 +- .../rvv/fixed-vectors-buildvec-of-binop.ll | 3 +- llvm/test/CodeGen/X86/pr134602.ll | 2 +- llvm/test/CodeGen/X86/pr135917.ll | 46 +--- .../CodeGen/X86/vector-fshl-rot-sub128.ll | 92 +++++-- llvm/test/CodeGen/X86/vector-fshl-sub128.ll | 250 ++++++++++++++--- .../CodeGen/X86/vector-fshr-rot-sub128.ll | 106 ++++++-- llvm/test/CodeGen/X86/vector-fshr-sub128.ll | 256 +++++++++++++----- 8 files changed, 568 insertions(+), 197 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 46fc8856640de..8682c40898046 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -3002,12 +3002,14 @@ bool SelectionDAG::isSplatValue(SDValue V, const APInt &DemandedElts, APInt UndefLHS, UndefRHS; SDValue LHS = V.getOperand(0); SDValue RHS = V.getOperand(1); - // Only propagate common undef elts for both operands, otherwise we might - // fail to handle binop-specific undef handling. + // Only recognize splats with the same demanded undef elements for both + // operands, otherwise we might fail to handle binop-specific undef + // handling. // e.g. (and undef, 0) -> 0 etc. 
if (isSplatValue(LHS, DemandedElts, UndefLHS, Depth + 1) && - isSplatValue(RHS, DemandedElts, UndefRHS, Depth + 1)) { - UndefElts = UndefLHS & UndefRHS; + isSplatValue(RHS, DemandedElts, UndefRHS, Depth + 1) && + (DemandedElts & UndefLHS) == (DemandedElts & UndefRHS)) { + UndefElts = UndefLHS | UndefRHS; return true; } return false; diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll index 3df63b4de82e3..dbbb8362144ca 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll @@ -452,7 +452,8 @@ define void @buggy(i32 %0) #0 { ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV64-NEXT: vmv.v.x v8, a0 ; RV64-NEXT: vor.vi v8, v8, 1 -; RV64-NEXT: vse32.v v8, (zero) +; RV64-NEXT: vrgather.vi v9, v8, 0 +; RV64-NEXT: vse32.v v9, (zero) ; RV64-NEXT: ret entry: %mul.us.us.i.3 = shl i32 %0, 1 diff --git a/llvm/test/CodeGen/X86/pr134602.ll b/llvm/test/CodeGen/X86/pr134602.ll index 50efcde462532..063b6f31fe630 100644 --- a/llvm/test/CodeGen/X86/pr134602.ll +++ b/llvm/test/CodeGen/X86/pr134602.ll @@ -17,7 +17,7 @@ define i32 @PR134602(i16 %a0) { ; X64-NEXT: movzwl %di, %eax ; X64-NEXT: movd %eax, %xmm0 ; X64-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[2,2,2,2,4,5,6,7] +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-NEXT: paddw %xmm0, %xmm1 ; X64-NEXT: movdqa %xmm1, %xmm0 ; X64-NEXT: psrld $16, %xmm0 diff --git a/llvm/test/CodeGen/X86/pr135917.ll b/llvm/test/CodeGen/X86/pr135917.ll index 9eed955128b74..2061e3e7cc395 100644 --- a/llvm/test/CodeGen/X86/pr135917.ll +++ b/llvm/test/CodeGen/X86/pr135917.ll @@ -1,46 +1,26 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefix=SSE2 -; RUN: llc < %s 
-mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefix=SSE4 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefix=AVX512 define i32 @PR135917(i1 %a0) { -; SSE2-LABEL: PR135917: -; SSE2: # %bb.0: -; SSE2-NEXT: movd %edi, %xmm0 -; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: movd %xmm0, %ecx -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: addl %ecx, %eax -; SSE2-NEXT: retq -; -; SSE4-LABEL: PR135917: -; SSE4: # %bb.0: -; SSE4-NEXT: movd %edi, %xmm0 -; SSE4-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE4-NEXT: movd %xmm0, %ecx -; SSE4-NEXT: pextrd $1, %xmm0, %eax -; SSE4-NEXT: addl %ecx, %eax -; SSE4-NEXT: retq -; -; AVX2-LABEL: PR135917: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovd %edi, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %ecx -; AVX2-NEXT: vpextrd $1, %xmm0, %eax -; AVX2-NEXT: addl %ecx, %eax -; AVX2-NEXT: retq +; CHECK-LABEL: PR135917: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $edi killed $edi def $rdi +; CHECK-NEXT: notl %edi +; CHECK-NEXT: andl $1, %edi +; CHECK-NEXT: leal (%rdi,%rdi), %eax +; CHECK-NEXT: retq ; ; AVX512-LABEL: PR135917: ; AVX512: # %bb.0: -; AVX512-NEXT: andb $1, %dil -; AVX512-NEXT: negb %dil ; AVX512-NEXT: kmovd %edi, %k0 ; AVX512-NEXT: knotw %k0, %k0 ; AVX512-NEXT: vpmovm2d %k0, %xmm0 +; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX512-NEXT: vpmovd2m %xmm0, %k0 +; AVX512-NEXT: vpmovm2d %k0, %xmm0 ; AVX512-NEXT: vpsrld $31, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %ecx ; AVX512-NEXT: 
vpextrd $1, %xmm0, %eax diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll index 8523cb4973827..9ecc6296a844a 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll @@ -162,42 +162,72 @@ define <2 x i32> @var_funnnel_v2i32(<2 x i32> %x, <2 x i32> %amt) nounwind { define <2 x i32> @splatvar_funnnel_v2i32(<2 x i32> %x, <2 x i32> %amt) nounwind { ; SSE2-LABEL: splatvar_funnnel_v2i32: ; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE2-NEXT: pslld $23, %xmm1 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] -; SSE2-NEXT: psllq %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE2-NEXT: psllq %xmm1, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3] +; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_funnnel_v2i32: ; SSE41: # %bb.0: +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT: pslld $23, %xmm1 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] -; SSE41-NEXT: psllq %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE41-NEXT: psllq %xmm1, %xmm0 -; SSE41-NEXT: shufps 
{{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3] +; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; SSE41-NEXT: pmuludq %xmm2, %xmm3 +; SSE41-NEXT: pmuludq %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatvar_funnnel_v2i32: ; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] -; AVX1-NEXT: vpsllq %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3] +; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_funnnel_v2i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] -; AVX2-NEXT: vpsllq %xmm1, %xmm2, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX2-NEXT: vpsllq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vshufps {{.*#+}} xmm0 = 
xmm0[1,3],xmm2[1,3] +; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero +; AVX2-NEXT: vpslld %xmm2, %xmm0, %xmm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32] +; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1 +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX2-NEXT: vpsrld %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: splatvar_funnnel_v2i32: @@ -259,12 +289,22 @@ define <2 x i32> @splatvar_funnnel_v2i32(<2 x i32> %x, <2 x i32> %amt) nounwind ; ; X86-SSE2-LABEL: splatvar_funnnel_v2i32: ; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; X86-SSE2-NEXT: pslld $23, %xmm1 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] -; X86-SSE2-NEXT: psllq %xmm1, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; X86-SSE2-NEXT: psllq %xmm1, %xmm0 -; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3] +; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-SSE2-NEXT: por %xmm3, %xmm0 ; X86-SSE2-NEXT: retl %splat = shufflevector <2 x i32> %amt, <2 x i32> undef, <2 x i32> zeroinitializer %res = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %x, <2 x i32> 
%splat) diff --git a/llvm/test/CodeGen/X86/vector-fshl-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-sub128.ll index eb4d84b8d7dd6..322ebe22671e6 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-sub128.ll @@ -248,27 +248,162 @@ define <2 x i32> @var_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %amt) ; define <2 x i32> @splatvar_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %amt) nounwind { -; SSE-LABEL: splatvar_funnnel_v2i32: -; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE-NEXT: psllq %xmm2, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: psllq %xmm2, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: splatvar_funnnel_v2i32: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31] +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: pandn %xmm4, %xmm5 +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[2,3,3,3,4,5,6,7] +; SSE2-NEXT: psrld $1, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: psrld %xmm2, %xmm6 +; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psrld %xmm7, %xmm2 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: psrld %xmm6, %xmm7 +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] +; SSE2-NEXT: psrld %xmm5, %xmm1 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[0,3] +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pslld $23, %xmm3 +; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE2-NEXT: 
cvttps2dq %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: retq ; -; AVX-LABEL: splatvar_funnnel_v2i32: -; AVX: # %bb.0: -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpsllq %xmm2, %xmm3, %xmm3 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX-NEXT: vpsllq %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[1,3] -; AVX-NEXT: retq +; SSE41-LABEL: splatvar_funnnel_v2i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm3 = [31,31,31,31] +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: pandn %xmm3, %xmm4 +; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7] +; SSE41-NEXT: psrld $1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm6 +; SSE41-NEXT: psrld %xmm5, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] +; SSE41-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[2,3,3,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm8 +; SSE41-NEXT: psrld %xmm7, %xmm8 +; SSE41-NEXT: pblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm8[4,5,6,7] +; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm6 +; SSE41-NEXT: psrld %xmm4, %xmm6 +; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,1,1,1,4,5,6,7] +; SSE41-NEXT: psrld %xmm4, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5],xmm8[6,7] +; SSE41-NEXT: pand %xmm3, %xmm2 +; SSE41-NEXT: pslld $23, %xmm2 +; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE41-NEXT: 
cvttps2dq %xmm2, %xmm1 +; SSE41-NEXT: pmulld %xmm1, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: splatvar_funnnel_v2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31] +; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vpsrldq {{.*#+}} xmm5 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 +; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm5 +; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm6 +; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6 +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7] +; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero +; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7] +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 +; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_funnnel_v2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd %xmm2, %xmm3 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [31,31,31,31] +; AVX2-NEXT: vpandn %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpsrld $1, %xmm1, %xmm1 +; AVX2-NEXT: vpsrlvd %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm4, %xmm2, %xmm2 +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero +; AVX2-NEXT: vpslld %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: splatvar_funnnel_v2i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastd %xmm2, %xmm3 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = 
[31,31,31,31] +; AVX512F-NEXT: vpandn %xmm4, %xmm3, %xmm3 +; AVX512F-NEXT: vpsrld $1, %xmm1, %xmm1 +; AVX512F-NEXT: vpsrlvd %xmm3, %xmm1, %xmm1 +; AVX512F-NEXT: vpand %xmm4, %xmm2, %xmm2 +; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero +; AVX512F-NEXT: vpslld %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: splatvar_funnnel_v2i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpbroadcastd %xmm2, %xmm3 +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [31,31,31,31] +; AVX512VL-NEXT: vpandn %xmm4, %xmm3, %xmm3 +; AVX512VL-NEXT: vpsrld $1, %xmm1, %xmm1 +; AVX512VL-NEXT: vpsrlvd %xmm3, %xmm1, %xmm1 +; AVX512VL-NEXT: vpand %xmm4, %xmm2, %xmm2 +; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero +; AVX512VL-NEXT: vpslld %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: splatvar_funnnel_v2i32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastd %xmm2, %xmm3 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [31,31,31,31] +; AVX512BW-NEXT: vpandn %xmm4, %xmm3, %xmm3 +; AVX512BW-NEXT: vpsrld $1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpsrlvd %xmm3, %xmm1, %xmm1 +; AVX512BW-NEXT: vpand %xmm4, %xmm2, %xmm2 +; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero +; AVX512BW-NEXT: vpslld %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: retq +; +; AVX512VLBW-LABEL: splatvar_funnnel_v2i32: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %xmm3 +; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [31,31,31,31] +; AVX512VLBW-NEXT: vpandn %xmm4, %xmm3, %xmm3 +; AVX512VLBW-NEXT: vpsrld $1, %xmm1, %xmm1 +; AVX512VLBW-NEXT: vpsrlvd %xmm3, %xmm1, %xmm1 +; AVX512VLBW-NEXT: vpand %xmm4, %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero +; AVX512VLBW-NEXT: vpslld %xmm2, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512VLBW-NEXT: retq ; ; 
AVX512VBMI2-LABEL: splatvar_funnnel_v2i32: ; AVX512VBMI2: # %bb.0: @@ -286,26 +421,67 @@ define <2 x i32> @splatvar_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> % ; AVX512VLVBMI2-NEXT: vpshldvd %xmm2, %xmm1, %xmm0 ; AVX512VLVBMI2-NEXT: retq ; -; XOP-LABEL: splatvar_funnnel_v2i32: -; XOP: # %bb.0: -; XOP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; XOP-NEXT: vpsllq %xmm2, %xmm3, %xmm3 -; XOP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; XOP-NEXT: vpsllq %xmm2, %xmm0, %xmm0 -; XOP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[1,3] -; XOP-NEXT: retq +; XOPAVX1-LABEL: splatvar_funnnel_v2i32: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31] +; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 +; XOPAVX1-NEXT: vpshld %xmm4, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; XOPAVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2 +; XOPAVX1-NEXT: vpsrld $1, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpshld %xmm2, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: splatvar_funnnel_v2i32: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastd %xmm2, %xmm3 +; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [31,31,31,31] +; XOPAVX2-NEXT: vpandn %xmm4, %xmm3, %xmm3 +; XOPAVX2-NEXT: vpsrld $1, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpsrlvd %xmm3, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpand %xmm4, %xmm2, %xmm2 +; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero +; XOPAVX2-NEXT: vpslld %xmm2, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq ; ; X86-SSE2-LABEL: splatvar_funnnel_v2i32: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 -; X86-SSE2-NEXT: psllq %xmm2, 
%xmm3 -; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X86-SSE2-NEXT: psllq %xmm2, %xmm1 -; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3] -; X86-SSE2-NEXT: movaps %xmm1, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31] +; X86-SSE2-NEXT: movdqa %xmm3, %xmm5 +; X86-SSE2-NEXT: pandn %xmm4, %xmm5 +; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[2,3,3,3,4,5,6,7] +; X86-SSE2-NEXT: psrld $1, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 +; X86-SSE2-NEXT: psrld %xmm2, %xmm6 +; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: psrld %xmm7, %xmm2 +; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] +; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm7 +; X86-SSE2-NEXT: psrld %xmm6, %xmm7 +; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] +; X86-SSE2-NEXT: psrld %xmm5, %xmm1 +; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; X86-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[0,3] +; X86-SSE2-NEXT: pand %xmm4, %xmm3 +; X86-SSE2-NEXT: pslld $23, %xmm3 +; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 +; X86-SSE2-NEXT: cvttps2dq %xmm3, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE2-NEXT: pmuludq %xmm3, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-SSE2-NEXT: por %xmm2, %xmm0 ; X86-SSE2-NEXT: retl %splat = shufflevector <2 x i32> %amt, <2 x i32> undef, <2 x i32> zeroinitializer %res = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %splat) diff --git 
a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll index d4874ad2cbd78..178c02f384f9b 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll @@ -172,42 +172,78 @@ define <2 x i32> @var_funnnel_v2i32(<2 x i32> %x, <2 x i32> %amt) nounwind { define <2 x i32> @splatvar_funnnel_v2i32(<2 x i32> %x, <2 x i32> %amt) nounwind { ; SSE2-LABEL: splatvar_funnnel_v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] -; SSE2-NEXT: psrlq %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE2-NEXT: psrlq %xmm1, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: psubd %xmm1, %xmm2 +; SSE2-NEXT: pslld $23, %xmm2 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: cvttps2dq %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_funnnel_v2i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] -; SSE41-NEXT: psrlq %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE41-NEXT: psrlq %xmm1, %xmm0 -; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; SSE41-NEXT: 
pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: psubd %xmm1, %xmm2 +; SSE41-NEXT: pslld $23, %xmm2 +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE41-NEXT: cvttps2dq %xmm2, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: pmuludq %xmm2, %xmm3 +; SSE41-NEXT: pmuludq %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatvar_funnnel_v2i32: ; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] -; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_funnnel_v2i32: ; AVX2: # %bb.0: -; 
AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] -; AVX2-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX2-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero +; AVX2-NEXT: vpsrld %xmm2, %xmm0, %xmm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32] +; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1 +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX2-NEXT: vpslld %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: splatvar_funnnel_v2i32: @@ -273,12 +309,24 @@ define <2 x i32> @splatvar_funnnel_v2i32(<2 x i32> %x, <2 x i32> %amt) nounwind ; ; X86-SSE2-LABEL: splatvar_funnnel_v2i32: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] -; X86-SSE2-NEXT: psrlq %xmm1, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; X86-SSE2-NEXT: psrlq %xmm1, %xmm0 -; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; X86-SSE2-NEXT: pxor %xmm2, %xmm2 +; X86-SSE2-NEXT: psubd %xmm1, %xmm2 +; X86-SSE2-NEXT: pslld $23, %xmm2 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 +; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 +; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = 
xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-SSE2-NEXT: por %xmm3, %xmm0 ; X86-SSE2-NEXT: retl %splat = shufflevector <2 x i32> %amt, <2 x i32> undef, <2 x i32> zeroinitializer %res = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %x, <2 x i32> %splat) diff --git a/llvm/test/CodeGen/X86/vector-fshr-sub128.ll b/llvm/test/CodeGen/X86/vector-fshr-sub128.ll index 58dc17988b646..372deb05e550c 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-sub128.ll @@ -249,78 +249,161 @@ define <2 x i32> @var_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %amt) ; define <2 x i32> @splatvar_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %amt) nounwind { -; SSE-LABEL: splatvar_funnnel_v2i32: -; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE-NEXT: psrlq %xmm2, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: psrlq %xmm2, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: splatvar_funnnel_v2i32: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31] +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: psrld %xmm2, %xmm6 +; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psrld %xmm7, %xmm2 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] +; 
SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: psrld %xmm6, %xmm7 +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] +; SSE2-NEXT: psrld %xmm5, %xmm1 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[0,3] +; SSE2-NEXT: pandn %xmm4, %xmm3 +; SSE2-NEXT: pslld $23, %xmm3 +; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE2-NEXT: cvttps2dq %xmm3, %xmm1 +; SSE2-NEXT: paddd %xmm0, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_funnnel_v2i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm3 = [31,31,31,31] +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: pand %xmm3, %xmm4 +; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm6 +; SSE41-NEXT: psrld %xmm5, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] +; SSE41-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[2,3,3,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm8 +; SSE41-NEXT: psrld %xmm7, %xmm8 +; SSE41-NEXT: pblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm8[4,5,6,7] +; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm6 +; SSE41-NEXT: psrld %xmm4, %xmm6 +; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,1,1,1,4,5,6,7] +; SSE41-NEXT: psrld %xmm4, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5],xmm8[6,7] +; SSE41-NEXT: pandn %xmm3, %xmm2 +; SSE41-NEXT: pslld $23, %xmm2 +; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE41-NEXT: cvttps2dq 
%xmm2, %xmm1 +; SSE41-NEXT: paddd %xmm0, %xmm0 +; SSE41-NEXT: pmulld %xmm1, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: retq ; ; AVX1-LABEL: splatvar_funnnel_v2i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm3 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31] +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vpsrldq {{.*#+}} xmm5 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm5 +; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm6 +; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6 +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7] +; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero +; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7] +; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_funnnel_v2i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX2-NEXT: vpsrlq %xmm2, %xmm3, %xmm3 -; AVX2-NEXT: vpunpckldq {{.*#+}} 
xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-NEXT: vpsrlq %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; AVX2-NEXT: vpbroadcastd %xmm2, %xmm3 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [31,31,31,31] +; AVX2-NEXT: vpand %xmm4, %xmm2, %xmm2 +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero +; AVX2-NEXT: vpsrld %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpandn %xmm4, %xmm3, %xmm2 +; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: splatvar_funnnel_v2i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX512F-NEXT: vpsrlq %xmm2, %xmm3, %xmm3 -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-NEXT: vpsrlq %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; AVX512F-NEXT: vpbroadcastd %xmm2, %xmm3 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [31,31,31,31] +; AVX512F-NEXT: vpand %xmm4, %xmm2, %xmm2 +; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero +; AVX512F-NEXT: vpsrld %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpandn %xmm4, %xmm3, %xmm2 +; AVX512F-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatvar_funnnel_v2i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1 -; AVX512VL-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0 -; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: vpbroadcastd %xmm2, %xmm3 +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = 
[31,31,31,31] +; AVX512VL-NEXT: vpand %xmm4, %xmm2, %xmm2 +; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero +; AVX512VL-NEXT: vpsrld %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpandn %xmm4, %xmm3, %xmm2 +; AVX512VL-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_funnnel_v2i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX512BW-NEXT: vpsrlq %xmm2, %xmm3, %xmm3 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512BW-NEXT: vpsrlq %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; AVX512BW-NEXT: vpbroadcastd %xmm2, %xmm3 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [31,31,31,31] +; AVX512BW-NEXT: vpand %xmm4, %xmm2, %xmm2 +; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero +; AVX512BW-NEXT: vpsrld %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpandn %xmm4, %xmm3, %xmm2 +; AVX512BW-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX512BW-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512VLBW-LABEL: splatvar_funnnel_v2i32: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512VLBW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512VLBW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1 -; AVX512VLBW-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 -; AVX512VLBW-NEXT: vpmovqd %ymm0, %xmm0 -; AVX512VLBW-NEXT: vzeroupper +; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %xmm3 +; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [31,31,31,31] +; AVX512VLBW-NEXT: vpand %xmm4, %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero +; AVX512VLBW-NEXT: 
vpsrld %xmm2, %xmm1, %xmm1 +; AVX512VLBW-NEXT: vpandn %xmm4, %xmm3, %xmm2 +; AVX512VLBW-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatvar_funnnel_v2i32: @@ -340,26 +423,67 @@ define <2 x i32> @splatvar_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> % ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VLVBMI2-NEXT: retq ; -; XOP-LABEL: splatvar_funnnel_v2i32: -; XOP: # %bb.0: -; XOP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; XOP-NEXT: vpsrlq %xmm2, %xmm3, %xmm3 -; XOP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; XOP-NEXT: vpsrlq %xmm2, %xmm0, %xmm0 -; XOP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] -; XOP-NEXT: retq +; XOPAVX1-LABEL: splatvar_funnnel_v2i32: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31] +; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 +; XOPAVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpshld %xmm4, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; XOPAVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2 +; XOPAVX1-NEXT: vpshld %xmm2, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: splatvar_funnnel_v2i32: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastd %xmm2, %xmm3 +; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [31,31,31,31] +; XOPAVX2-NEXT: vpand %xmm4, %xmm2, %xmm2 +; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero +; XOPAVX2-NEXT: vpsrld %xmm2, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpandn %xmm4, %xmm3, %xmm2 +; XOPAVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq ; ; X86-SSE2-LABEL: splatvar_funnnel_v2i32: ; 
X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 -; X86-SSE2-NEXT: psrlq %xmm2, %xmm3 -; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X86-SSE2-NEXT: psrlq %xmm2, %xmm1 -; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] -; X86-SSE2-NEXT: movaps %xmm1, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31] +; X86-SSE2-NEXT: movdqa %xmm3, %xmm5 +; X86-SSE2-NEXT: pand %xmm4, %xmm5 +; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[2,3,3,3,4,5,6,7] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 +; X86-SSE2-NEXT: psrld %xmm2, %xmm6 +; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: psrld %xmm7, %xmm2 +; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] +; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm7 +; X86-SSE2-NEXT: psrld %xmm6, %xmm7 +; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] +; X86-SSE2-NEXT: psrld %xmm5, %xmm1 +; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; X86-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[0,3] +; X86-SSE2-NEXT: pandn %xmm4, %xmm3 +; X86-SSE2-NEXT: pslld $23, %xmm3 +; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 +; X86-SSE2-NEXT: cvttps2dq %xmm3, %xmm1 +; X86-SSE2-NEXT: paddd %xmm0, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE2-NEXT: pmuludq %xmm3, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-SSE2-NEXT: por %xmm2, %xmm0 ; 
X86-SSE2-NEXT: retl %splat = shufflevector <2 x i32> %amt, <2 x i32> undef, <2 x i32> zeroinitializer %res = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %splat) From cf2399a2ee5b8a721eff385acbaa68fb9b00127c Mon Sep 17 00:00:00 2001 From: Wenju He Date: Wed, 16 Apr 2025 11:34:59 +0000 Subject: [PATCH 107/710] [CI] enable code-format-helper for .cl files (#135748) In clang-format, OpenCL .cl file uses default C++ formatting. There are many pull-requests in libclc project that change OpenCL files. It is beneficial to enable clang-format for them in CI. --- llvm/utils/git/code-format-helper.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/llvm/utils/git/code-format-helper.py b/llvm/utils/git/code-format-helper.py index ed102b54f9b52..4e4145dcbb8c2 100755 --- a/llvm/utils/git/code-format-helper.py +++ b/llvm/utils/git/code-format-helper.py @@ -186,7 +186,17 @@ def filter_changed_files(self, changed_files: List[str]) -> List[str]: filtered_files = [] for path in changed_files: _, ext = os.path.splitext(path) - if ext in (".cpp", ".c", ".h", ".hpp", ".hxx", ".cxx", ".inc", ".cppm"): + if ext in ( + ".cpp", + ".c", + ".h", + ".hpp", + ".hxx", + ".cxx", + ".inc", + ".cppm", + ".cl", + ): filtered_files.append(path) elif ext == "" and self.should_include_extensionless_file(path): filtered_files.append(path) From 38ca73db223031b1831cd24ef66ddb6a8546a16c Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Wed, 16 Apr 2025 13:37:03 +0200 Subject: [PATCH 108/710] [clang][bytecode] Give typeinfo APValues an LValuePath (#135948) That's what the current interpreter does as well. 
--- clang/lib/AST/ByteCode/Pointer.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/clang/lib/AST/ByteCode/Pointer.cpp b/clang/lib/AST/ByteCode/Pointer.cpp index c43c0a063bd9e..686ec381d232a 100644 --- a/clang/lib/AST/ByteCode/Pointer.cpp +++ b/clang/lib/AST/ByteCode/Pointer.cpp @@ -167,7 +167,8 @@ APValue Pointer::toAPValue(const ASTContext &ASTCtx) const { return APValue( APValue::LValueBase::getTypeInfo( TypeInfo, QualType(PointeeStorage.Typeid.TypeInfoType, 0)), - CharUnits::Zero(), APValue::NoLValuePath{}); + CharUnits::Zero(), {}, + /*OnePastTheEnd=*/false, /*IsNull=*/false); } // Build the lvalue base from the block. From 27c1aa9b9cf9e0b14211758ff8f7d3aaba24ffcf Mon Sep 17 00:00:00 2001 From: mgschossmann <109181247+mgschossmann@users.noreply.github.com> Date: Wed, 16 Apr 2025 13:42:37 +0200 Subject: [PATCH 109/710] [Clang,debuginfo] added vtt parameter in destructor DISubroutineType (#130674) Fixes issue #104765: When creating a virtual destructor with an artificial "vtt" argument, the type of "vtt" was previously missing in the `DISubroutineType` `types` array. This commit fixes this behavior and adds a regression test. 
--- clang/lib/CodeGen/CGDebugInfo.cpp | 21 +++++++++++++--- clang/lib/CodeGen/CGDebugInfo.h | 7 +++++- .../debug-info-dtor-implicit-args.cpp | 24 +++++++++++++++++++ 3 files changed, 48 insertions(+), 4 deletions(-) create mode 100644 clang/test/CodeGenCXX/debug-info-dtor-implicit-args.cpp diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index f3ec498d4064b..1582d911ba052 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -2018,8 +2018,17 @@ CGDebugInfo::getOrCreateMethodType(const CXXMethodDecl *Method, return getOrCreateInstanceMethodType(ThisType, Func, Unit); } -llvm::DISubroutineType *CGDebugInfo::getOrCreateInstanceMethodType( - QualType ThisPtr, const FunctionProtoType *Func, llvm::DIFile *Unit) { +llvm::DISubroutineType *CGDebugInfo::getOrCreateMethodTypeForDestructor( + const CXXMethodDecl *Method, llvm::DIFile *Unit, QualType FNType) { + const FunctionProtoType *Func = FNType->getAs(); + // skip the first param since it is also this + return getOrCreateInstanceMethodType(Method->getThisType(), Func, Unit, true); +} + +llvm::DISubroutineType * +CGDebugInfo::getOrCreateInstanceMethodType(QualType ThisPtr, + const FunctionProtoType *Func, + llvm::DIFile *Unit, bool SkipFirst) { FunctionProtoType::ExtProtoInfo EPI = Func->getExtProtoInfo(); Qualifiers &Qc = EPI.TypeQuals; Qc.removeConst(); @@ -2059,7 +2068,7 @@ llvm::DISubroutineType *CGDebugInfo::getOrCreateInstanceMethodType( } // Copy rest of the arguments. - for (unsigned i = 1, e = Args.size(); i != e; ++i) + for (unsigned i = (SkipFirst ? 2 : 1), e = Args.size(); i != e; ++i) Elts.push_back(Args[i]); // Attach FlagObjectPointer to the explicit "this" parameter. @@ -4372,6 +4381,12 @@ llvm::DISubroutineType *CGDebugInfo::getOrCreateFunctionType(const Decl *D, // subprogram DIE will miss DW_AT_decl_file and DW_AT_decl_line fields. 
return DBuilder.createSubroutineType(DBuilder.getOrCreateTypeArray({})); + if (const auto *Method = dyn_cast(D)) { + // Read method type from 'FnType' because 'D.getType()' does not cover + // implicit arguments for destructors. + return getOrCreateMethodTypeForDestructor(Method, F, FnType); + } + if (const auto *Method = dyn_cast(D)) return getOrCreateMethodType(Method, F); diff --git a/clang/lib/CodeGen/CGDebugInfo.h b/clang/lib/CodeGen/CGDebugInfo.h index b287ce7b92eee..771c129230eea 100644 --- a/clang/lib/CodeGen/CGDebugInfo.h +++ b/clang/lib/CodeGen/CGDebugInfo.h @@ -249,9 +249,14 @@ class CGDebugInfo { /// to get a method type which includes \c this pointer. llvm::DISubroutineType *getOrCreateMethodType(const CXXMethodDecl *Method, llvm::DIFile *F); + + llvm::DISubroutineType * + getOrCreateMethodTypeForDestructor(const CXXMethodDecl *Method, + llvm::DIFile *F, QualType FNType); + llvm::DISubroutineType * getOrCreateInstanceMethodType(QualType ThisPtr, const FunctionProtoType *Func, - llvm::DIFile *Unit); + llvm::DIFile *Unit, bool SkipFirst = false); llvm::DISubroutineType * getOrCreateFunctionType(const Decl *D, QualType FnType, llvm::DIFile *F); /// \return debug info descriptor for vtable. 
diff --git a/clang/test/CodeGenCXX/debug-info-dtor-implicit-args.cpp b/clang/test/CodeGenCXX/debug-info-dtor-implicit-args.cpp new file mode 100644 index 0000000000000..4bb51dcc4da51 --- /dev/null +++ b/clang/test/CodeGenCXX/debug-info-dtor-implicit-args.cpp @@ -0,0 +1,24 @@ +// RUN: %clang_cc1 -triple x86_64-none-linux-gnu -emit-llvm -debug-info-kind=limited %s -o - | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-pc-windows-msvc -emit-llvm -debug-info-kind=limited %s -o - | FileCheck --check-prefix MSVC %s + +struct B { + virtual ~B() {} +}; + +struct A : virtual B { +}; + +A a; + + +// CHECK-DAG: !{{[0-9]+}} = !DILocalVariable(name: "vtt", arg: 2, scope: ![[destructor:[0-9]+]], type: ![[vtttype:[0-9]+]], flags: DIFlagArtificial) +// CHECK-DAG: ![[destructor]] = distinct !DISubprogram(name: "~A", {{.*}}, type: ![[subroutinetype:[0-9]+]] +// CHECK-DAG: ![[subroutinetype]] = !DISubroutineType(types: ![[types:[0-9]+]]) +// CHECK-DAG: [[types]] = !{null, !{{[0-9]+}}, ![[vtttype]]} + +// MSVC-DAG: ![[inttype:[0-9]+]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +// MSVC-DAG: ![[voidpointertype:[0-9]+]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64) +// MSVC-DAG: ![[destructor:[0-9]+]] = distinct !DISubprogram(name: "~A", linkageName: "??_GA@@UEAAPEAXI@Z", {{.*}}, type: ![[subroutinetype:[0-9]+]] +// MSVC-DAG: !{{[0-9]+}} = !DILocalVariable(name: "should_call_delete", arg: 2, scope: ![[destructor]], type: ![[inttype]], flags: DIFlagArtificial) +// MSVC-DAG: ![[subroutinetype]] = !DISubroutineType(types: ![[types:[0-9]+]]) +// MSVC-DAG: [[types]] = !{![[voidpointertype]], !{{[0-9]+}}, ![[inttype]]} From 616613c80b75614736d0781d12c0e1237d79738f Mon Sep 17 00:00:00 2001 From: Sirraide Date: Wed, 16 Apr 2025 14:08:59 +0200 Subject: [PATCH 110/710] [Clang] [Sema] Fix a crash when a `friend` function is redefined as deleted (#135679) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit NB: This 
only fixes the crash introduced in Clang 19; we still accept this code even though we shouldn’t: ```c++ struct S { friend int f() { return 3; } friend int f() = delete; }; ``` I tried figuring out a way to diagnose this redeclaration, but it seems tricky because I kept running into issues around defaulted comparison operators. From my testing, however, this fix here would still be required even once we do start diagnosing this. Fixes #135506. --- clang/docs/ReleaseNotes.rst | 2 ++ clang/lib/Sema/SemaDecl.cpp | 15 ++++------ .../SemaCXX/cxx2c-delete-with-message.cpp | 30 +++++++++++++++++++ 3 files changed, 37 insertions(+), 10 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 5af4c08f64cd8..0891fd058bb57 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -425,6 +425,8 @@ Bug Fixes in This Version - Fixed a clang 20 regression where diagnostics attached to some calls to member functions using C++23 "deducing this" did not have a diagnostic location (#GH135522) +- Fixed a crash when a ``friend`` function is redefined as deleted. (#GH135506) + Bug Fixes to Compiler Builtins ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 5f811c824e11d..127c0a4500a43 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -16199,16 +16199,11 @@ Decl *Sema::ActOnFinishFunctionBody(Decl *dcl, Stmt *Body, // This is meant to pop the context added in ActOnStartOfFunctionDef(). ExitFunctionBodyRAII ExitRAII(*this, isLambdaCallOperator(FD)); if (FD) { - // If this is called by Parser::ParseFunctionDefinition() after marking - // the declaration as deleted, and if the deleted-function-body contains - // a message (C++26), then a DefaultedOrDeletedInfo will have already been - // added to store that message; do not overwrite it in that case. 
- // - // Since this would always set the body to 'nullptr' in that case anyway, - // which is already done when the function decl is initially created, - // always skipping this irrespective of whether there is a delete message - // should not be a problem. - if (!FD->isDeletedAsWritten()) + // The function body and the DefaultedOrDeletedInfo, if present, use + // the same storage; don't overwrite the latter if the former is null + // (the body is initialised to null anyway, so even if the latter isn't + // present, this would still be a no-op). + if (Body) FD->setBody(Body); FD->setWillHaveBody(false); diff --git a/clang/test/SemaCXX/cxx2c-delete-with-message.cpp b/clang/test/SemaCXX/cxx2c-delete-with-message.cpp index 22e65d902ecd4..5609da18c05aa 100644 --- a/clang/test/SemaCXX/cxx2c-delete-with-message.cpp +++ b/clang/test/SemaCXX/cxx2c-delete-with-message.cpp @@ -271,3 +271,33 @@ void operators() { if (to_int_int) {} // expected-error {{attempt to use a deleted function: deleted (TO, operator bool)}} static_cast(to_int_int); // expected-error {{static_cast from 'TO' to 'bool' uses deleted function: deleted (TO, operator bool)}} }; + +namespace gh135506 { +struct a { + // FIXME: We currently don't diagnose these invalid redeclarations if the + // second declaration is defaulted or deleted. This probably needs to be + // handled in ParseCXXInlineMethodDef() after parsing the defaulted/deleted + // body. 
+ friend consteval int f() { return 3; } + friend consteval int f() = delete("foo"); + + friend consteval int g() { return 3; } + friend consteval int g() = delete; + + friend int h() { return 3; } + friend int h() = delete; + + friend consteval int i() = delete; // expected-note {{previous definition is here}} + friend consteval int i() { return 3; } // expected-error {{redefinition of 'i'}} +}; + +struct b { + friend consteval bool operator==(b, b) { return true; } // expected-note {{previous declaration is here}} + friend consteval bool operator==(b, b) = default; // expected-error {{defaulting this equality comparison operator is not allowed because it was already declared outside the class}} +}; + +struct c { + friend consteval bool operator==(c, c) = default; // expected-note {{previous definition is here}} + friend consteval bool operator==(c, c) { return true; } // expected-error {{redefinition of 'operator=='}} +}; +} From 30990c09c99bdcbfa7084d32b2b9851e19b6fb2a Mon Sep 17 00:00:00 2001 From: Kareem Ergawy Date: Wed, 16 Apr 2025 14:20:27 +0200 Subject: [PATCH 111/710] Revert "[flang][fir] Lower `do concurrent` loop nests to `fir.do_concurrent` (#132904)" (#135904) This reverts commit 04b87e15e40f8857e29ade8321b8b67691545a50. The reasons for reverting is that the following: 1. I still need need to upstream some part of the do concurrent to OpenMP pass from our downstream implementation and taking this in downstream will make things more difficult. 2. I still need to work on a solution for modeling locality specifiers on `hlfir.do_concurrent` ops. I would prefer to do that and merge the entire stack together instead of having a partial solution. After merging the revert I will reopen the origianl PR and keep it updated against main until I finish the above. 
--- flang/lib/Lower/Bridge.cpp | 228 +++++++----------- flang/lib/Optimizer/Builder/FIRBuilder.cpp | 3 - flang/test/Lower/do_concurrent.f90 | 39 +-- .../do_concurrent_local_default_init.f90 | 4 +- flang/test/Lower/loops.f90 | 37 ++- flang/test/Lower/loops3.f90 | 4 +- flang/test/Lower/nsw.f90 | 5 +- .../Transforms/DoConcurrent/basic_host.f90 | 3 - .../DoConcurrent/locally_destroyed_temp.f90 | 3 - .../DoConcurrent/loop_nest_test.f90 | 3 - .../multiple_iteration_ranges.f90 | 3 - .../DoConcurrent/non_const_bounds.f90 | 3 - .../DoConcurrent/not_perfectly_nested.f90 | 3 - 13 files changed, 130 insertions(+), 208 deletions(-) diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index 625dd116fe726..b4d1197822a43 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -94,11 +94,10 @@ struct IncrementLoopInfo { template explicit IncrementLoopInfo(Fortran::semantics::Symbol &sym, const T &lower, const T &upper, const std::optional &step, - bool isConcurrent = false) + bool isUnordered = false) : loopVariableSym{&sym}, lowerExpr{Fortran::semantics::GetExpr(lower)}, upperExpr{Fortran::semantics::GetExpr(upper)}, - stepExpr{Fortran::semantics::GetExpr(step)}, - isConcurrent{isConcurrent} {} + stepExpr{Fortran::semantics::GetExpr(step)}, isUnordered{isUnordered} {} IncrementLoopInfo(IncrementLoopInfo &&) = default; IncrementLoopInfo &operator=(IncrementLoopInfo &&x) = default; @@ -121,7 +120,7 @@ struct IncrementLoopInfo { const Fortran::lower::SomeExpr *upperExpr; const Fortran::lower::SomeExpr *stepExpr; const Fortran::lower::SomeExpr *maskExpr = nullptr; - bool isConcurrent; + bool isUnordered; // do concurrent, forall llvm::SmallVector localSymList; llvm::SmallVector localInitSymList; llvm::SmallVector< @@ -131,7 +130,7 @@ struct IncrementLoopInfo { mlir::Value loopVariable = nullptr; // Data members for structured loops. - mlir::Operation *loopOp = nullptr; + fir::DoLoopOp doLoop = nullptr; // Data members for unstructured loops. 
bool hasRealControl = false; @@ -1981,7 +1980,7 @@ class FirConverter : public Fortran::lower::AbstractConverter { llvm_unreachable("illegal reduction operator"); } - /// Collect DO CONCURRENT loop control information. + /// Collect DO CONCURRENT or FORALL loop control information. IncrementLoopNestInfo getConcurrentControl( const Fortran::parser::ConcurrentHeader &header, const std::list &localityList = {}) { @@ -2292,14 +2291,8 @@ class FirConverter : public Fortran::lower::AbstractConverter { mlir::LLVM::LoopAnnotationAttr la = mlir::LLVM::LoopAnnotationAttr::get( builder->getContext(), {}, /*vectorize=*/va, {}, /*unroll*/ ua, /*unroll_and_jam*/ uja, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}); - if (has_attrs) { - if (auto loopOp = mlir::dyn_cast(info.loopOp)) - loopOp.setLoopAnnotationAttr(la); - - if (auto doConcurrentOp = - mlir::dyn_cast(info.loopOp)) - doConcurrentOp.setLoopAnnotationAttr(la); - } + if (has_attrs) + info.doLoop.setLoopAnnotationAttr(la); } /// Generate FIR to begin a structured or unstructured increment loop nest. 
@@ -2308,77 +2301,96 @@ class FirConverter : public Fortran::lower::AbstractConverter { llvm::SmallVectorImpl &dirs) { assert(!incrementLoopNestInfo.empty() && "empty loop nest"); mlir::Location loc = toLocation(); + mlir::Operation *boundsAndStepIP = nullptr; mlir::arith::IntegerOverflowFlags iofBackup{}; - llvm::SmallVector nestLBs; - llvm::SmallVector nestUBs; - llvm::SmallVector nestSts; - llvm::SmallVector nestReduceOperands; - llvm::SmallVector nestReduceAttrs; - bool genDoConcurrent = false; - for (IncrementLoopInfo &info : incrementLoopNestInfo) { - genDoConcurrent = info.isStructured() && info.isConcurrent; - - if (!genDoConcurrent) - info.loopVariable = genLoopVariableAddress(loc, *info.loopVariableSym, - info.isConcurrent); - - if (!getLoweringOptions().getIntegerWrapAround()) { - iofBackup = builder->getIntegerOverflowFlags(); - builder->setIntegerOverflowFlags( - mlir::arith::IntegerOverflowFlags::nsw); - } + mlir::Value lowerValue; + mlir::Value upperValue; + mlir::Value stepValue; - nestLBs.push_back(genControlValue(info.lowerExpr, info)); - nestUBs.push_back(genControlValue(info.upperExpr, info)); - bool isConst = true; - nestSts.push_back(genControlValue( - info.stepExpr, info, info.isStructured() ? nullptr : &isConst)); + { + mlir::OpBuilder::InsertionGuard guard(*builder); - if (!getLoweringOptions().getIntegerWrapAround()) - builder->setIntegerOverflowFlags(iofBackup); + // Set the IP before the first loop in the nest so that all nest bounds + // and step values are created outside the nest. + if (boundsAndStepIP) + builder->setInsertionPointAfter(boundsAndStepIP); - // Use a temp variable for unstructured loops with non-const step. 
- if (!isConst) { - mlir::Value stepValue = nestSts.back(); - info.stepVariable = builder->createTemporary(loc, stepValue.getType()); - builder->create(loc, stepValue, info.stepVariable); - } - - if (genDoConcurrent && nestReduceOperands.empty()) { - // Create DO CONCURRENT reduce operands and attributes - for (const auto &reduceSym : info.reduceSymList) { - const fir::ReduceOperationEnum reduceOperation = reduceSym.first; - const Fortran::semantics::Symbol *sym = reduceSym.second; - fir::ExtendedValue exv = getSymbolExtendedValue(*sym, nullptr); - nestReduceOperands.push_back(fir::getBase(exv)); - auto reduceAttr = - fir::ReduceAttr::get(builder->getContext(), reduceOperation); - nestReduceAttrs.push_back(reduceAttr); + info.loopVariable = genLoopVariableAddress(loc, *info.loopVariableSym, + info.isUnordered); + if (!getLoweringOptions().getIntegerWrapAround()) { + iofBackup = builder->getIntegerOverflowFlags(); + builder->setIntegerOverflowFlags( + mlir::arith::IntegerOverflowFlags::nsw); + } + lowerValue = genControlValue(info.lowerExpr, info); + upperValue = genControlValue(info.upperExpr, info); + bool isConst = true; + stepValue = genControlValue(info.stepExpr, info, + info.isStructured() ? nullptr : &isConst); + if (!getLoweringOptions().getIntegerWrapAround()) + builder->setIntegerOverflowFlags(iofBackup); + boundsAndStepIP = stepValue.getDefiningOp(); + + // Use a temp variable for unstructured loops with non-const step. + if (!isConst) { + info.stepVariable = + builder->createTemporary(loc, stepValue.getType()); + boundsAndStepIP = + builder->create(loc, stepValue, info.stepVariable); } } - } - for (auto [info, lowerValue, upperValue, stepValue] : - llvm::zip_equal(incrementLoopNestInfo, nestLBs, nestUBs, nestSts)) { // Structured loop - generate fir.do_loop. if (info.isStructured()) { - if (genDoConcurrent) - continue; - - // The loop variable is a doLoop op argument. 
mlir::Type loopVarType = info.getLoopVariableType(); - auto loopOp = builder->create( - loc, lowerValue, upperValue, stepValue, /*unordered=*/false, - /*finalCountValue=*/true, - builder->createConvert(loc, loopVarType, lowerValue)); - info.loopOp = loopOp; - builder->setInsertionPointToStart(loopOp.getBody()); - mlir::Value loopValue = loopOp.getRegionIterArgs()[0]; - + mlir::Value loopValue; + if (info.isUnordered) { + llvm::SmallVector reduceOperands; + llvm::SmallVector reduceAttrs; + // Create DO CONCURRENT reduce operands and attributes + for (const auto &reduceSym : info.reduceSymList) { + const fir::ReduceOperationEnum reduce_operation = reduceSym.first; + const Fortran::semantics::Symbol *sym = reduceSym.second; + fir::ExtendedValue exv = getSymbolExtendedValue(*sym, nullptr); + reduceOperands.push_back(fir::getBase(exv)); + auto reduce_attr = + fir::ReduceAttr::get(builder->getContext(), reduce_operation); + reduceAttrs.push_back(reduce_attr); + } + // The loop variable value is explicitly updated. + info.doLoop = builder->create( + loc, lowerValue, upperValue, stepValue, /*unordered=*/true, + /*finalCountValue=*/false, /*iterArgs=*/std::nullopt, + llvm::ArrayRef(reduceOperands), reduceAttrs); + builder->setInsertionPointToStart(info.doLoop.getBody()); + loopValue = builder->createConvert(loc, loopVarType, + info.doLoop.getInductionVar()); + } else { + // The loop variable is a doLoop op argument. + info.doLoop = builder->create( + loc, lowerValue, upperValue, stepValue, /*unordered=*/false, + /*finalCountValue=*/true, + builder->createConvert(loc, loopVarType, lowerValue)); + builder->setInsertionPointToStart(info.doLoop.getBody()); + loopValue = info.doLoop.getRegionIterArgs()[0]; + } // Update the loop variable value in case it has non-index references. 
builder->create(loc, loopValue, info.loopVariable); + if (info.maskExpr) { + Fortran::lower::StatementContext stmtCtx; + mlir::Value maskCond = createFIRExpr(loc, info.maskExpr, stmtCtx); + stmtCtx.finalizeAndReset(); + mlir::Value maskCondCast = + builder->createConvert(loc, builder->getI1Type(), maskCond); + auto ifOp = builder->create(loc, maskCondCast, + /*withElseRegion=*/false); + builder->setInsertionPointToStart(&ifOp.getThenRegion().front()); + } + if (info.hasLocalitySpecs()) + handleLocalitySpecs(info); + addLoopAnnotationAttr(info, dirs); continue; } @@ -2442,60 +2454,6 @@ class FirConverter : public Fortran::lower::AbstractConverter { builder->restoreInsertionPoint(insertPt); } } - - if (genDoConcurrent) { - auto loopWrapperOp = builder->create(loc); - builder->setInsertionPointToStart( - builder->createBlock(&loopWrapperOp.getRegion())); - - for (IncrementLoopInfo &info : llvm::reverse(incrementLoopNestInfo)) { - info.loopVariable = genLoopVariableAddress(loc, *info.loopVariableSym, - info.isConcurrent); - } - - builder->setInsertionPointToEnd(loopWrapperOp.getBody()); - auto loopOp = builder->create( - loc, nestLBs, nestUBs, nestSts, nestReduceOperands, - nestReduceAttrs.empty() - ? 
nullptr - : mlir::ArrayAttr::get(builder->getContext(), nestReduceAttrs), - nullptr); - - llvm::SmallVector loopBlockArgTypes( - incrementLoopNestInfo.size(), builder->getIndexType()); - llvm::SmallVector loopBlockArgLocs( - incrementLoopNestInfo.size(), loc); - mlir::Region &loopRegion = loopOp.getRegion(); - mlir::Block *loopBlock = builder->createBlock( - &loopRegion, loopRegion.begin(), loopBlockArgTypes, loopBlockArgLocs); - builder->setInsertionPointToStart(loopBlock); - - for (auto [info, blockArg] : - llvm::zip_equal(incrementLoopNestInfo, loopBlock->getArguments())) { - info.loopOp = loopOp; - mlir::Value loopValue = - builder->createConvert(loc, info.getLoopVariableType(), blockArg); - builder->create(loc, loopValue, info.loopVariable); - - if (info.maskExpr) { - Fortran::lower::StatementContext stmtCtx; - mlir::Value maskCond = createFIRExpr(loc, info.maskExpr, stmtCtx); - stmtCtx.finalizeAndReset(); - mlir::Value maskCondCast = - builder->createConvert(loc, builder->getI1Type(), maskCond); - auto ifOp = builder->create(loc, maskCondCast, - /*withElseRegion=*/false); - builder->setInsertionPointToStart(&ifOp.getThenRegion().front()); - } - } - - IncrementLoopInfo &innermostInfo = incrementLoopNestInfo.back(); - - if (innermostInfo.hasLocalitySpecs()) - handleLocalitySpecs(innermostInfo); - - addLoopAnnotationAttr(innermostInfo, dirs); - } } /// Generate FIR to end a structured or unstructured increment loop nest. @@ -2512,31 +2470,29 @@ class FirConverter : public Fortran::lower::AbstractConverter { it != rend; ++it) { IncrementLoopInfo &info = *it; if (info.isStructured()) { - // End fir.do_concurent.loop. - if (info.isConcurrent) { - builder->setInsertionPointAfter(info.loopOp->getParentOp()); + // End fir.do_loop. + if (info.isUnordered) { + builder->setInsertionPointAfter(info.doLoop); continue; } - - // End fir.do_loop. // Decrement tripVariable. 
- auto doLoopOp = mlir::cast(info.loopOp); - builder->setInsertionPointToEnd(doLoopOp.getBody()); + builder->setInsertionPointToEnd(info.doLoop.getBody()); llvm::SmallVector results; results.push_back(builder->create( - loc, doLoopOp.getInductionVar(), doLoopOp.getStep(), iofAttr)); + loc, info.doLoop.getInductionVar(), info.doLoop.getStep(), + iofAttr)); // Step loopVariable to help optimizations such as vectorization. // Induction variable elimination will clean up as necessary. mlir::Value step = builder->createConvert( - loc, info.getLoopVariableType(), doLoopOp.getStep()); + loc, info.getLoopVariableType(), info.doLoop.getStep()); mlir::Value loopVar = builder->create(loc, info.loopVariable); results.push_back( builder->create(loc, loopVar, step, iofAttr)); builder->create(loc, results); - builder->setInsertionPointAfter(doLoopOp); + builder->setInsertionPointAfter(info.doLoop); // The loop control variable may be used after the loop. - builder->create(loc, doLoopOp.getResult(1), + builder->create(loc, info.doLoop.getResult(1), info.loopVariable); continue; } diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp index d35367d7657cf..3cf9b5ae72d9e 100644 --- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp +++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp @@ -280,9 +280,6 @@ mlir::Block *fir::FirOpBuilder::getAllocaBlock() { if (auto cufKernelOp = getRegion().getParentOfType()) return &cufKernelOp.getRegion().front(); - if (auto doConcurentOp = getRegion().getParentOfType()) - return doConcurentOp.getBody(); - return getEntryBlock(); } diff --git a/flang/test/Lower/do_concurrent.f90 b/flang/test/Lower/do_concurrent.f90 index cc113f59c35e3..ef93d2d6b035b 100644 --- a/flang/test/Lower/do_concurrent.f90 +++ b/flang/test/Lower/do_concurrent.f90 @@ -14,9 +14,6 @@ subroutine sub1(n) implicit none integer :: n, m, i, j, k integer, dimension(n) :: a -!CHECK: %[[N_DECL:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{.*}} 
{uniq_name = "_QFsub1En"} -!CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFsub1Ea"} - !CHECK: %[[LB1:.*]] = arith.constant 1 : i32 !CHECK: %[[LB1_CVT:.*]] = fir.convert %[[LB1]] : (i32) -> index !CHECK: %[[UB1:.*]] = fir.load %{{.*}}#0 : !fir.ref @@ -32,30 +29,10 @@ subroutine sub1(n) !CHECK: %[[UB3:.*]] = arith.constant 10 : i32 !CHECK: %[[UB3_CVT:.*]] = fir.convert %[[UB3]] : (i32) -> index -!CHECK: fir.do_concurrent -!CHECK: %[[I:.*]] = fir.alloca i32 {bindc_name = "i"} -!CHECK: %[[I_DECL:.*]]:2 = hlfir.declare %[[I]] -!CHECK: %[[J:.*]] = fir.alloca i32 {bindc_name = "j"} -!CHECK: %[[J_DECL:.*]]:2 = hlfir.declare %[[J]] -!CHECK: %[[K:.*]] = fir.alloca i32 {bindc_name = "k"} -!CHECK: %[[K_DECL:.*]]:2 = hlfir.declare %[[K]] - -!CHECK: fir.do_concurrent.loop (%[[I_IV:.*]], %[[J_IV:.*]], %[[K_IV:.*]]) = -!CHECK-SAME: (%[[LB1_CVT]], %[[LB2_CVT]], %[[LB3_CVT]]) to -!CHECK-SAME: (%[[UB1_CVT]], %[[UB2_CVT]], %[[UB3_CVT]]) step -!CHECK-SAME: (%{{.*}}, %{{.*}}, %{{.*}}) { -!CHECK: %[[I_IV_CVT:.*]] = fir.convert %[[I_IV]] : (index) -> i32 -!CHECK: fir.store %[[I_IV_CVT]] to %[[I_DECL]]#0 : !fir.ref -!CHECK: %[[J_IV_CVT:.*]] = fir.convert %[[J_IV]] : (index) -> i32 -!CHECK: fir.store %[[J_IV_CVT]] to %[[J_DECL]]#0 : !fir.ref -!CHECK: %[[K_IV_CVT:.*]] = fir.convert %[[K_IV]] : (index) -> i32 -!CHECK: fir.store %[[K_IV_CVT]] to %[[K_DECL]]#0 : !fir.ref +!CHECK: fir.do_loop %{{.*}} = %[[LB1_CVT]] to %[[UB1_CVT]] step %{{.*}} unordered +!CHECK: fir.do_loop %{{.*}} = %[[LB2_CVT]] to %[[UB2_CVT]] step %{{.*}} unordered +!CHECK: fir.do_loop %{{.*}} = %[[LB3_CVT]] to %[[UB3_CVT]] step %{{.*}} unordered -!CHECK: %[[N_VAL:.*]] = fir.load %[[N_DECL]]#0 : !fir.ref -!CHECK: %[[I_VAL:.*]] = fir.load %[[I_DECL]]#0 : !fir.ref -!CHECK: %[[I_VAL_CVT:.*]] = fir.convert %[[I_VAL]] : (i32) -> i64 -!CHECK: %[[A_ELEM:.*]] = hlfir.designate %[[A_DECL]]#0 (%[[I_VAL_CVT]]) -!CHECK: hlfir.assign %[[N_VAL]] to %[[A_ELEM]] : i32, !fir.ref do concurrent(i=1:n, j=1:bar(n*m, 
n/m), k=5:10) a(i) = n end do @@ -68,17 +45,14 @@ subroutine sub2(n) integer, dimension(n) :: a !CHECK: %[[LB1:.*]] = arith.constant 1 : i32 !CHECK: %[[LB1_CVT:.*]] = fir.convert %[[LB1]] : (i32) -> index -!CHECK: %[[UB1:.*]] = fir.load %{{.*}}#0 : !fir.ref +!CHECK: %[[UB1:.*]] = fir.load %5#0 : !fir.ref !CHECK: %[[UB1_CVT:.*]] = fir.convert %[[UB1]] : (i32) -> index -!CHECK: fir.do_concurrent -!CHECK: fir.do_concurrent.loop (%{{.*}}) = (%[[LB1_CVT]]) to (%[[UB1_CVT]]) step (%{{.*}}) - +!CHECK: fir.do_loop %{{.*}} = %[[LB1_CVT]] to %[[UB1_CVT]] step %{{.*}} unordered !CHECK: %[[LB2:.*]] = arith.constant 1 : i32 !CHECK: %[[LB2_CVT:.*]] = fir.convert %[[LB2]] : (i32) -> index !CHECK: %[[UB2:.*]] = fir.call @_QPbar(%{{.*}}, %{{.*}}) proc_attrs fastmath : (!fir.ref, !fir.ref) -> i32 !CHECK: %[[UB2_CVT:.*]] = fir.convert %[[UB2]] : (i32) -> index -!CHECK: fir.do_concurrent -!CHECK: fir.do_concurrent.loop (%{{.*}}) = (%[[LB2_CVT]]) to (%[[UB2_CVT]]) step (%{{.*}}) +!CHECK: fir.do_loop %{{.*}} = %[[LB2_CVT]] to %[[UB2_CVT]] step %{{.*}} unordered do concurrent(i=1:n) do concurrent(j=1:bar(n*m, n/m)) a(i) = n @@ -86,6 +60,7 @@ subroutine sub2(n) end do end subroutine + !CHECK-LABEL: unstructured subroutine unstructured(inner_step) integer(4) :: i, j, inner_step diff --git a/flang/test/Lower/do_concurrent_local_default_init.f90 b/flang/test/Lower/do_concurrent_local_default_init.f90 index 207704ac1a990..7652e4fcd0402 100644 --- a/flang/test/Lower/do_concurrent_local_default_init.f90 +++ b/flang/test/Lower/do_concurrent_local_default_init.f90 @@ -29,7 +29,7 @@ subroutine test_default_init() ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>>>> {fir.bindc_name = "p"}) { ! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_0]] : !fir.ref>>>> ! CHECK: %[[VAL_7:.*]] = fir.box_elesize %[[VAL_6]] : (!fir.box>>>) -> index -! CHECK: fir.do_concurrent.loop +! CHECK: fir.do_loop ! CHECK: %[[VAL_16:.*]] = fir.alloca !fir.box>>> {bindc_name = "p", pinned, uniq_name = "_QFtest_ptrEp"} ! 
CHECK: %[[VAL_17:.*]] = fir.zero_bits !fir.ptr>> ! CHECK: %[[VAL_18:.*]] = arith.constant 0 : index @@ -43,7 +43,7 @@ subroutine test_default_init() ! CHECK: } ! CHECK-LABEL: func.func @_QPtest_default_init( -! CHECK: fir.do_concurrent.loop +! CHECK: fir.do_loop ! CHECK: %[[VAL_26:.*]] = fir.alloca !fir.type<_QFtest_default_initTt{i:i32}> {bindc_name = "a", pinned, uniq_name = "_QFtest_default_initEa"} ! CHECK: %[[VAL_27:.*]] = fir.embox %[[VAL_26]] : (!fir.ref>) -> !fir.box> ! CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_27]] : (!fir.box>) -> !fir.box diff --git a/flang/test/Lower/loops.f90 b/flang/test/Lower/loops.f90 index 60df27a591dc3..ea65ba3e4d66d 100644 --- a/flang/test/Lower/loops.f90 +++ b/flang/test/Lower/loops.f90 @@ -2,6 +2,15 @@ ! CHECK-LABEL: loop_test subroutine loop_test + ! CHECK: %[[VAL_2:.*]] = fir.alloca i16 {bindc_name = "i"} + ! CHECK: %[[VAL_3:.*]] = fir.alloca i16 {bindc_name = "i"} + ! CHECK: %[[VAL_4:.*]] = fir.alloca i16 {bindc_name = "i"} + ! CHECK: %[[VAL_5:.*]] = fir.alloca i8 {bindc_name = "k"} + ! CHECK: %[[VAL_6:.*]] = fir.alloca i8 {bindc_name = "j"} + ! CHECK: %[[VAL_7:.*]] = fir.alloca i8 {bindc_name = "i"} + ! CHECK: %[[VAL_8:.*]] = fir.alloca i32 {bindc_name = "k"} + ! CHECK: %[[VAL_9:.*]] = fir.alloca i32 {bindc_name = "j"} + ! CHECK: %[[VAL_10:.*]] = fir.alloca i32 {bindc_name = "i"} ! CHECK: %[[VAL_11:.*]] = fir.alloca !fir.array<5x5x5xi32> {bindc_name = "a", uniq_name = "_QFloop_testEa"} ! CHECK: %[[VAL_12:.*]] = fir.alloca i32 {bindc_name = "asum", uniq_name = "_QFloop_testEasum"} ! CHECK: %[[VAL_13:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFloop_testEi"} @@ -16,7 +25,7 @@ subroutine loop_test j = 200 k = 300 - ! CHECK: fir.do_concurrent.loop (%{{.*}}, %{{.*}}, %{{.*}}) = {{.*}} + ! CHECK-COUNT-3: fir.do_loop {{.*}} unordered do concurrent (i=1:5, j=1:5, k=1:5) ! shared(a) ! CHECK: fir.coordinate_of a(i,j,k) = 0 @@ -24,7 +33,7 @@ subroutine loop_test ! 
CHECK: fir.call @_FortranAioBeginExternalListOutput print*, 'A:', i, j, k - ! CHECK: fir.do_concurrent.loop (%{{.*}}, %{{.*}}, %{{.*}}) = {{.*}} + ! CHECK-COUNT-3: fir.do_loop {{.*}} unordered ! CHECK: fir.if do concurrent (integer(1)::i=1:5, j=1:5, k=1:5, i.ne.j .and. k.ne.3) shared(a) ! CHECK-COUNT-2: fir.coordinate_of @@ -44,7 +53,7 @@ subroutine loop_test ! CHECK: fir.call @_FortranAioBeginExternalListOutput print*, 'B:', i, j, k, '-', asum - ! CHECK: fir.do_concurrent.loop (%{{.*}}) = {{.*}} + ! CHECK: fir.do_loop {{.*}} unordered ! CHECK-COUNT-2: fir.if do concurrent (integer(2)::i=1:5, i.ne.3) if (i.eq.2 .or. i.eq.4) goto 5 ! fir.if @@ -53,7 +62,7 @@ subroutine loop_test 5 continue enddo - ! CHECK: fir.do_concurrent.loop (%{{.*}}) = {{.*}} + ! CHECK: fir.do_loop {{.*}} unordered ! CHECK-COUNT-2: fir.if do concurrent (integer(2)::i=1:5, i.ne.3) if (i.eq.2 .or. i.eq.4) then ! fir.if @@ -84,6 +93,10 @@ end subroutine loop_test ! CHECK-LABEL: c.func @_QPlis subroutine lis(n) + ! CHECK-DAG: fir.alloca i32 {bindc_name = "m"} + ! CHECK-DAG: fir.alloca i32 {bindc_name = "j"} + ! CHECK-DAG: fir.alloca i32 {bindc_name = "i"} + ! CHECK-DAG: fir.alloca i8 {bindc_name = "i"} ! CHECK-DAG: fir.alloca i32 {bindc_name = "j", uniq_name = "_QFlisEj"} ! CHECK-DAG: fir.alloca i32 {bindc_name = "k", uniq_name = "_QFlisEk"} ! CHECK-DAG: fir.alloca !fir.box>> {bindc_name = "p", uniq_name = "_QFlisEp"} @@ -104,8 +117,8 @@ subroutine lis(n) ! CHECK: } r = 0 - ! CHECK: fir.do_concurrent { - ! CHECK: fir.do_concurrent.loop (%{{.*}}) = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) { + ! CHECK: fir.do_loop %arg1 = %{{.*}} to %{{.*}} step %{{.*}} unordered { + ! CHECK: fir.do_loop %arg2 = %{{.*}} to %{{.*}} step %c1{{.*}} iter_args(%arg3 = %{{.*}}) -> (index, i32) { ! CHECK: } ! CHECK: } do concurrent (integer(kind=1)::i=n:1:-1) @@ -115,18 +128,16 @@ subroutine lis(n) enddo enddo - ! 
CHECK: fir.do_concurrent.loop (%{{.*}}, %{{.*}}) = (%{{.*}}, %{{.*}}) to (%{{.*}}, %{{.*}}) step (%{{.*}}, %{{.*}}) { + ! CHECK: fir.do_loop %arg1 = %{{.*}} to %{{.*}} step %c1{{.*}} unordered { + ! CHECK: fir.do_loop %arg2 = %{{.*}} to %{{.*}} step %c1{{.*}} unordered { ! CHECK: fir.if %{{.*}} { ! CHECK: %[[V_95:[0-9]+]] = fir.alloca !fir.array, %{{.*}}, %{{.*}} {bindc_name = "t", pinned, uniq_name = "_QFlisEt"} ! CHECK: %[[V_96:[0-9]+]] = fir.alloca !fir.box>> {bindc_name = "p", pinned, uniq_name = "_QFlisEp"} ! CHECK: fir.store %{{.*}} to %[[V_96]] : !fir.ref>>> ! CHECK: fir.do_loop %arg3 = %{{.*}} to %{{.*}} step %c1{{.*}} iter_args(%arg4 = %{{.*}}) -> (index, i32) { - ! CHECK: fir.do_concurrent { - ! CHECK: fir.alloca i32 {bindc_name = "m"} - ! CHECK: fir.do_concurrent.loop (%{{.*}}) = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) { - ! CHECK: fir.load %[[V_96]] : !fir.ref>>> - ! CHECK: fir.convert %[[V_95]] : (!fir.ref>) -> !fir.ref> - ! CHECK: } + ! CHECK: fir.do_loop %arg5 = %{{.*}} to %{{.*}} step %c1{{.*}} unordered { + ! CHECK: fir.load %[[V_96]] : !fir.ref>>> + ! CHECK: fir.convert %[[V_95]] : (!fir.ref>) -> !fir.ref> ! CHECK: } ! CHECK: } ! CHECK: fir.convert %[[V_95]] : (!fir.ref>) -> !fir.ref> diff --git a/flang/test/Lower/loops3.f90 b/flang/test/Lower/loops3.f90 index 84db1972cca16..78f39e1013082 100644 --- a/flang/test/Lower/loops3.f90 +++ b/flang/test/Lower/loops3.f90 @@ -12,7 +12,9 @@ subroutine loop_test ! CHECK: %[[VAL_0:.*]] = fir.alloca f32 {bindc_name = "m", uniq_name = "_QFloop_testEm"} ! CHECK: %[[VAL_1:.*]] = fir.address_of(@_QFloop_testEsum) : !fir.ref - ! CHECK: fir.do_concurrent.loop ({{.*}}) = ({{.*}}) to ({{.*}}) step ({{.*}}) reduce(#fir.reduce_attr -> %[[VAL_1:.*]] : !fir.ref, #fir.reduce_attr -> %[[VAL_0:.*]] : !fir.ref) { + ! CHECK: fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} unordered reduce(#fir.reduce_attr -> %[[VAL_1:.*]] : !fir.ref, #fir.reduce_attr -> %[[VAL_0:.*]] : !fir.ref) { + ! 
CHECK: fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} unordered reduce(#fir.reduce_attr -> %[[VAL_1:.*]] : !fir.ref, #fir.reduce_attr -> %[[VAL_0:.*]] : !fir.ref) { + ! CHECK: fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} unordered reduce(#fir.reduce_attr -> %[[VAL_1:.*]] : !fir.ref, #fir.reduce_attr -> %[[VAL_0:.*]] : !fir.ref) { do concurrent (i=1:5, j=1:5, k=1:5) local(tmp) reduce(+:sum) reduce(max:m) tmp = i + j + k sum = tmp + sum diff --git a/flang/test/Lower/nsw.f90 b/flang/test/Lower/nsw.f90 index 2ec1efb2af42a..4ee9e5da829e6 100644 --- a/flang/test/Lower/nsw.f90 +++ b/flang/test/Lower/nsw.f90 @@ -139,6 +139,7 @@ subroutine loop_params3(a,lb,ub,st) ! CHECK-LABEL: func.func @_QPloop_params3( ! CHECK: %[[VAL_4:.*]] = arith.constant 2 : i32 ! CHECK: %[[VAL_5:.*]] = arith.constant 1 : i32 +! CHECK: %[[VAL_9:.*]] = fir.declare %{{.*}}i"} : (!fir.ref) -> !fir.ref ! CHECK: %[[VAL_11:.*]] = fir.declare %{{.*}}lb"} : (!fir.ref, !fir.dscope) -> !fir.ref ! CHECK: %[[VAL_12:.*]] = fir.declare %{{.*}}ub"} : (!fir.ref, !fir.dscope) -> !fir.ref ! CHECK: %[[VAL_14:.*]] = fir.declare %{{.*}}i"} : (!fir.ref) -> !fir.ref @@ -152,6 +153,4 @@ subroutine loop_params3(a,lb,ub,st) ! CHECK: %[[VAL_31:.*]] = fir.load %[[VAL_15]] : !fir.ref ! CHECK: %[[VAL_32:.*]] = arith.muli %[[VAL_31]], %[[VAL_4]] overflow : i32 ! CHECK: %[[VAL_33:.*]] = fir.convert %[[VAL_32]] : (i32) -> index -! CHECK: fir.do_concurrent { -! CHECK: %[[VAL_9:.*]] = fir.declare %{{.*}}i"} : (!fir.ref) -> !fir.ref -! CHECK: fir.do_concurrent.loop (%[[VAL_34:.*]]) = (%[[VAL_28]]) to (%[[VAL_30]]) step (%[[VAL_33]]) { +! CHECK: fir.do_loop %[[VAL_34:.*]] = %[[VAL_28]] to %[[VAL_30]] step %[[VAL_33]] unordered { diff --git a/flang/test/Transforms/DoConcurrent/basic_host.f90 b/flang/test/Transforms/DoConcurrent/basic_host.f90 index b84d4481ac766..12f63031cbaee 100644 --- a/flang/test/Transforms/DoConcurrent/basic_host.f90 +++ b/flang/test/Transforms/DoConcurrent/basic_host.f90 @@ -1,6 +1,3 @@ -! 
Fails until we update the pass to use the `fir.do_concurrent` op. -! XFAIL: * - ! Tests mapping of a basic `do concurrent` loop to `!$omp parallel do`. ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host %s -o - \ diff --git a/flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90 b/flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90 index 4e13c0919589a..f82696669eca6 100644 --- a/flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90 +++ b/flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90 @@ -1,6 +1,3 @@ -! Fails until we update the pass to use the `fir.do_concurrent` op. -! XFAIL: * - ! Tests that "loop-local values" are properly handled by localizing them to the ! body of the loop nest. See `collectLoopLocalValues` and `localizeLoopLocalValue` ! for a definition of "loop-local values" and how they are handled. diff --git a/flang/test/Transforms/DoConcurrent/loop_nest_test.f90 b/flang/test/Transforms/DoConcurrent/loop_nest_test.f90 index adc4a488d1ec9..32bed61fe69e4 100644 --- a/flang/test/Transforms/DoConcurrent/loop_nest_test.f90 +++ b/flang/test/Transforms/DoConcurrent/loop_nest_test.f90 @@ -1,6 +1,3 @@ -! Fails until we update the pass to use the `fir.do_concurrent` op. -! XFAIL: * - ! Tests loop-nest detection algorithm for do-concurrent mapping. ! REQUIRES: asserts diff --git a/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 b/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 index 26800678d381c..d0210726de83e 100644 --- a/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 +++ b/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 @@ -1,6 +1,3 @@ -! Fails until we update the pass to use the `fir.do_concurrent` op. -! XFAIL: * - ! Tests mapping of a `do concurrent` loop with multiple iteration ranges. ! 
RUN: split-file %s %t diff --git a/flang/test/Transforms/DoConcurrent/non_const_bounds.f90 b/flang/test/Transforms/DoConcurrent/non_const_bounds.f90 index 23a3aae976c07..cd1bd4f98a3f5 100644 --- a/flang/test/Transforms/DoConcurrent/non_const_bounds.f90 +++ b/flang/test/Transforms/DoConcurrent/non_const_bounds.f90 @@ -1,6 +1,3 @@ -! Fails until we update the pass to use the `fir.do_concurrent` op. -! XFAIL: * - ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host %s -o - \ ! RUN: | FileCheck %s diff --git a/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90 b/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90 index d1c02101318ab..184fdfe00d397 100644 --- a/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90 +++ b/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90 @@ -1,6 +1,3 @@ -! Fails until we update the pass to use the `fir.do_concurrent` op. -! XFAIL: * - ! Tests that if `do concurrent` is not perfectly nested in its parent loop, that ! we skip converting the not-perfectly nested `do concurrent` loop. From fe4a31d59db7b18dc45c3593bf100c101e725b79 Mon Sep 17 00:00:00 2001 From: Benjamin Chetioui <3920784+bchetioui@users.noreply.github.com> Date: Wed, 16 Apr 2025 14:29:44 +0200 Subject: [PATCH 112/710] [bazel] Fix bazel build after 00eaff3e9c897c263a879416d0f151d7ca7eeaff. 
(#135949) --- .../IR/BufferizationTypeInterfaces.h | 1 + .../llvm-project-overlay/mlir/BUILD.bazel | 24 +++++++++++++++++++ .../mlir/test/BUILD.bazel | 7 ++++++ 3 files changed, 32 insertions(+) diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationTypeInterfaces.h b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationTypeInterfaces.h index f6b296eccd748..8672aa60a43c5 100644 --- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationTypeInterfaces.h +++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationTypeInterfaces.h @@ -14,5 +14,6 @@ //===----------------------------------------------------------------------===// #include "mlir/Dialect/Bufferization/IR/BufferizationTypeInterfaces.h.inc" +#include "mlir/IR/Types.h" #endif // MLIR_DIALECT_BUFFERIZATION_IR_BUFFERIZATIONTYPEINTERFACES_H_ diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 10503fe1d123b..85049ff8339c1 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -12520,6 +12520,28 @@ gentbl_cc_library( deps = [":BufferizationEnumsTdFiles"], ) +td_library( + name = "BufferizationTypeInterfacesTdFiles", + srcs = [ + "include/mlir/Dialect/Bufferization/IR/BufferizationTypeInterfaces.td", + ], + includes = ["include"], + deps = [ + ":OpBaseTdFiles", + ], +) + +gentbl_cc_library( + name = "BufferizationTypeInterfacesIncGen", + tbl_outs = { + "include/mlir/Dialect/Bufferization/IR/BufferizationTypeInterfaces.h.inc": ["-gen-type-interface-decls"], + "include/mlir/Dialect/Bufferization/IR/BufferizationTypeInterfaces.cpp.inc": ["-gen-type-interface-defs"], + }, + tblgen = ":mlir-tblgen", + td_file = "include/mlir/Dialect/Bufferization/IR/BufferizationTypeInterfaces.td", + deps = [":BufferizationTypeInterfacesTdFiles"], +) + td_library( name = "BufferizationTransformOpsTdFiles", srcs = [ @@ -12593,6 +12615,7 @@ cc_library( 
"include/mlir/Dialect/Bufferization/IR/BufferDeallocationOpInterface.h", "include/mlir/Dialect/Bufferization/IR/BufferViewFlowOpInterface.h", "include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h", + "include/mlir/Dialect/Bufferization/IR/BufferizationTypeInterfaces.h", ], includes = ["include"], deps = [ @@ -12600,6 +12623,7 @@ cc_library( ":BufferViewFlowOpInterfaceIncGen", ":BufferizableOpInterfaceIncGen", ":BufferizationEnumsIncGen", + ":BufferizationTypeInterfacesIncGen", ":FunctionInterfaces", ":IR", ":Support", diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel index 611fbecd215cd..59410286f22d8 100644 --- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel @@ -110,6 +110,7 @@ td_library( name = "TestOpTdFiles", srcs = glob(["lib/Dialect/Test/*.td"]), deps = [ + "//mlir:BufferizationTypeInterfacesTdFiles", "//mlir:BuiltinDialectTdFiles", "//mlir:CallInterfacesTdFiles", "//mlir:ControlFlowInterfacesTdFiles", @@ -240,6 +241,7 @@ gentbl_cc_library( test = True, deps = [ ":TestOpTdFiles", + "//mlir:BufferizationTypeInterfacesTdFiles", "//mlir:BuiltinDialectTdFiles", ], ) @@ -342,6 +344,7 @@ cc_library( "//llvm:IRReader", "//llvm:Support", "//mlir:ArithDialect", + "//mlir:BufferizationInterfaces", "//mlir:BytecodeOpInterface", "//mlir:CallOpInterfaces", "//mlir:ControlFlowInterfaces", @@ -404,6 +407,7 @@ cc_library( "//llvm:Support", "//mlir:Analysis", "//mlir:ArithDialect", + "//mlir:BufferizationInterfaces", "//mlir:BytecodeOpInterface", "//mlir:BytecodeReader", "//mlir:BytecodeWriter", @@ -986,8 +990,11 @@ cc_library( srcs = glob(["lib/Dialect/Bufferization/*.cpp"]), includes = ["lib/Dialect/Test"], deps = [ + ":TestDialect", "//mlir:BufferizationDialect", + "//mlir:BufferizationInterfaces", "//mlir:BufferizationTransforms", + "//mlir:FuncDialect", "//mlir:IR", "//mlir:Pass", ], From 
d3153ad66c539ad146062b6e65741901e5b5e1cc Mon Sep 17 00:00:00 2001 From: yronglin Date: Wed, 16 Apr 2025 20:53:25 +0800 Subject: [PATCH 113/710] [clang] Unify `SourceLocation` and `IdentifierInfo*` pair-like data structures to `IdentifierLoc` (#135808) I found this issue when I working on https://github.com/llvm/llvm-project/pull/107168. Currently we have many similiar data structures like: - `std::pair`. - Element type of `ModuleIdPath`. - `IdentifierLocPair`. - `IdentifierLoc`. This PR unify these data structures to `IdentifierLoc`, moved `IdentifierLoc` definition to SourceLocation.h, and deleted other similer data structures. --------- Signed-off-by: yronglin --- .../pp-trace/PPCallbacksTracker.cpp | 4 +- clang/include/clang/AST/OpenACCClause.h | 20 +-- clang/include/clang/Basic/IdentifierTable.h | 26 +++- clang/include/clang/Lex/ModuleLoader.h | 3 +- clang/include/clang/Lex/PPCallbacks.h | 1 + clang/include/clang/Lex/Preprocessor.h | 9 +- clang/include/clang/Parse/LoopHint.h | 2 +- clang/include/clang/Parse/Parser.h | 13 +- clang/include/clang/Sema/ParsedAttr.h | 10 -- clang/include/clang/Sema/Sema.h | 2 +- clang/include/clang/Sema/SemaCodeCompletion.h | 3 +- clang/include/clang/Sema/SemaObjC.h | 4 +- clang/include/clang/Sema/SemaOpenACC.h | 2 +- clang/lib/AST/OpenACCClause.cpp | 4 +- clang/lib/AST/TextNodeDumper.cpp | 4 +- clang/lib/Frontend/CompilerInstance.cpp | 53 ++++---- clang/lib/Frontend/FrontendActions.cpp | 4 +- clang/lib/Lex/PPDirectives.cpp | 22 ++-- clang/lib/Lex/PPLexerChange.cpp | 6 +- clang/lib/Lex/Pragma.cpp | 73 +++++------ clang/lib/Lex/Preprocessor.cpp | 16 +-- clang/lib/Parse/ParseDecl.cpp | 28 ++-- clang/lib/Parse/ParseExpr.cpp | 7 +- clang/lib/Parse/ParseHLSL.cpp | 2 +- clang/lib/Parse/ParseObjc.cpp | 38 +++--- clang/lib/Parse/ParseOpenACC.cpp | 12 +- clang/lib/Parse/ParsePragma.cpp | 15 +-- clang/lib/Parse/ParseStmt.cpp | 6 +- clang/lib/Parse/Parser.cpp | 19 ++- clang/lib/Sema/ParsedAttr.cpp | 8 -- clang/lib/Sema/SemaARM.cpp | 2 +- 
clang/lib/Sema/SemaCodeComplete.cpp | 8 +- clang/lib/Sema/SemaDeclAttr.cpp | 124 +++++++++--------- clang/lib/Sema/SemaDeclObjC.cpp | 35 ++--- clang/lib/Sema/SemaHLSL.cpp | 12 +- clang/lib/Sema/SemaModule.cpp | 42 +++--- clang/lib/Sema/SemaObjC.cpp | 45 +++---- clang/lib/Sema/SemaOpenACCClause.cpp | 11 +- clang/lib/Sema/SemaStmtAttr.cpp | 29 ++-- clang/lib/Sema/SemaSwift.cpp | 24 ++-- clang/lib/Sema/SemaTemplateVariadic.cpp | 10 +- clang/lib/Sema/SemaType.cpp | 13 +- clang/lib/Serialization/ASTReader.cpp | 2 +- clang/lib/Serialization/ASTWriter.cpp | 8 +- .../DependencyScanning/ModuleDepCollector.cpp | 2 +- 45 files changed, 399 insertions(+), 384 deletions(-) diff --git a/clang-tools-extra/pp-trace/PPCallbacksTracker.cpp b/clang-tools-extra/pp-trace/PPCallbacksTracker.cpp index 3bb30fd15b2e1..4c916fa30685b 100644 --- a/clang-tools-extra/pp-trace/PPCallbacksTracker.cpp +++ b/clang-tools-extra/pp-trace/PPCallbacksTracker.cpp @@ -547,8 +547,8 @@ void PPCallbacksTracker::appendArgument(const char *Name, ModuleIdPath Value) { if (I) SS << ", "; SS << "{" - << "Name: " << Value[I].first->getName() << ", " - << "Loc: " << getSourceLocationString(PP, Value[I].second) << "}"; + << "Name: " << Value[I].getIdentifierInfo()->getName() << ", " + << "Loc: " << getSourceLocationString(PP, Value[I].getLoc()) << "}"; } SS << "]"; appendArgument(Name, SS.str()); diff --git a/clang/include/clang/AST/OpenACCClause.h b/clang/include/clang/AST/OpenACCClause.h index 681567228cbb0..f18a6cf62f2c5 100644 --- a/clang/include/clang/AST/OpenACCClause.h +++ b/clang/include/clang/AST/OpenACCClause.h @@ -258,7 +258,7 @@ inline bool operator!=(const OpenACCBindClause &LHS, return !(LHS == RHS); } -using DeviceTypeArgument = std::pair; +using DeviceTypeArgument = IdentifierLoc; /// A 'device_type' or 'dtype' clause, takes a list of either an 'asterisk' or /// an identifier. The 'asterisk' means 'the rest'. 
class OpenACCDeviceTypeClause final @@ -280,16 +280,16 @@ class OpenACCDeviceTypeClause final "Invalid clause kind for device-type"); assert(!llvm::any_of(Archs, [](const DeviceTypeArgument &Arg) { - return Arg.second.isInvalid(); + return Arg.getLoc().isInvalid(); }) && "Invalid SourceLocation for an argument"); - assert( - (Archs.size() == 1 || !llvm::any_of(Archs, - [](const DeviceTypeArgument &Arg) { - return Arg.first == nullptr; - })) && - "Only a single asterisk version is permitted, and must be the " - "only one"); + assert((Archs.size() == 1 || + !llvm::any_of(Archs, + [](const DeviceTypeArgument &Arg) { + return Arg.getIdentifierInfo() == nullptr; + })) && + "Only a single asterisk version is permitted, and must be the " + "only one"); std::uninitialized_copy(Archs.begin(), Archs.end(), getTrailingObjects()); @@ -302,7 +302,7 @@ class OpenACCDeviceTypeClause final } bool hasAsterisk() const { return getArchitectures().size() > 0 && - getArchitectures()[0].first == nullptr; + getArchitectures()[0].getIdentifierInfo() == nullptr; } ArrayRef getArchitectures() const { diff --git a/clang/include/clang/Basic/IdentifierTable.h b/clang/include/clang/Basic/IdentifierTable.h index 0347880244a40..1275b056227b5 100644 --- a/clang/include/clang/Basic/IdentifierTable.h +++ b/clang/include/clang/Basic/IdentifierTable.h @@ -18,6 +18,7 @@ #include "clang/Basic/Builtins.h" #include "clang/Basic/DiagnosticIDs.h" #include "clang/Basic/LLVM.h" +#include "clang/Basic/SourceLocation.h" #include "clang/Basic/TokenKinds.h" #include "llvm/ADT/DenseMapInfo.h" #include "llvm/ADT/FoldingSet.h" @@ -76,9 +77,6 @@ inline bool isReservedInAllContexts(ReservedIdentifierStatus Status) { Status != ReservedIdentifierStatus::StartsWithUnderscoreAndIsExternC; } -/// A simple pair of identifier info and location. 
-using IdentifierLocPair = std::pair; - /// IdentifierInfo and other related classes are aligned to /// 8 bytes so that DeclarationName can use the lower 3 bits /// of a pointer to one of these classes. @@ -1165,6 +1163,28 @@ class SelectorTable { static std::string getPropertyNameFromSetterSelector(Selector Sel); }; +/// A simple pair of identifier info and location. +class IdentifierLoc { + SourceLocation Loc; + IdentifierInfo *II = nullptr; + +public: + IdentifierLoc() = default; + IdentifierLoc(SourceLocation L, IdentifierInfo *Ident) : Loc(L), II(Ident) {} + + void setLoc(SourceLocation L) { Loc = L; } + void setIdentifierInfo(IdentifierInfo *Ident) { II = Ident; } + SourceLocation getLoc() const { return Loc; } + IdentifierInfo *getIdentifierInfo() const { return II; } + + bool operator==(const IdentifierLoc &X) const { + return Loc == X.Loc && II == X.II; + } + + bool operator!=(const IdentifierLoc &X) const { + return Loc != X.Loc || II != X.II; + } +}; } // namespace clang namespace llvm { diff --git a/clang/include/clang/Lex/ModuleLoader.h b/clang/include/clang/Lex/ModuleLoader.h index f880a9091a2ed..a58407200c41c 100644 --- a/clang/include/clang/Lex/ModuleLoader.h +++ b/clang/include/clang/Lex/ModuleLoader.h @@ -14,6 +14,7 @@ #ifndef LLVM_CLANG_LEX_MODULELOADER_H #define LLVM_CLANG_LEX_MODULELOADER_H +#include "clang/Basic/IdentifierTable.h" #include "clang/Basic/LLVM.h" #include "clang/Basic/Module.h" #include "clang/Basic/SourceLocation.h" @@ -29,7 +30,7 @@ class IdentifierInfo; /// A sequence of identifier/location pairs used to describe a particular /// module or submodule, e.g., std.vector. -using ModuleIdPath = ArrayRef>; +using ModuleIdPath = ArrayRef; /// Describes the result of attempting to load a module. 
class ModuleLoadResult { diff --git a/clang/include/clang/Lex/PPCallbacks.h b/clang/include/clang/Lex/PPCallbacks.h index 46cc564086f1c..313b730afbab8 100644 --- a/clang/include/clang/Lex/PPCallbacks.h +++ b/clang/include/clang/Lex/PPCallbacks.h @@ -15,6 +15,7 @@ #define LLVM_CLANG_LEX_PPCALLBACKS_H #include "clang/Basic/DiagnosticIDs.h" +#include "clang/Basic/IdentifierTable.h" #include "clang/Basic/SourceLocation.h" #include "clang/Basic/SourceManager.h" #include "clang/Lex/ModuleLoader.h" diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h index 24bb524783e93..f8f2f567f9171 100644 --- a/clang/include/clang/Lex/Preprocessor.h +++ b/clang/include/clang/Lex/Preprocessor.h @@ -327,7 +327,7 @@ class Preprocessor { SourceLocation ModuleImportLoc; /// The import path for named module that we're currently processing. - SmallVector, 2> NamedModuleImportPath; + SmallVector NamedModuleImportPath; llvm::DenseMap> CheckPoints; unsigned CheckPointCounter = 0; @@ -622,7 +622,7 @@ class Preprocessor { /// The identifier and source location of the currently-active /// \#pragma clang arc_cf_code_audited begin. - std::pair PragmaARCCFCodeAuditedInfo; + IdentifierLoc PragmaARCCFCodeAuditedInfo; /// The source location of the currently-active /// \#pragma clang assume_nonnull begin. @@ -1998,8 +1998,7 @@ class Preprocessor { /// arc_cf_code_audited begin. /// /// Returns an invalid location if there is no such pragma active. - std::pair - getPragmaARCCFCodeAuditedInfo() const { + IdentifierLoc getPragmaARCCFCodeAuditedInfo() const { return PragmaARCCFCodeAuditedInfo; } @@ -2007,7 +2006,7 @@ class Preprocessor { /// arc_cf_code_audited begin. An invalid location ends the pragma. 
void setPragmaARCCFCodeAuditedInfo(IdentifierInfo *Ident, SourceLocation Loc) { - PragmaARCCFCodeAuditedInfo = {Ident, Loc}; + PragmaARCCFCodeAuditedInfo = IdentifierLoc(Loc, Ident); } /// The location of the currently-active \#pragma clang diff --git a/clang/include/clang/Parse/LoopHint.h b/clang/include/clang/Parse/LoopHint.h index cec5605ea3615..72be043d3c5a4 100644 --- a/clang/include/clang/Parse/LoopHint.h +++ b/clang/include/clang/Parse/LoopHint.h @@ -9,12 +9,12 @@ #ifndef LLVM_CLANG_PARSE_LOOPHINT_H #define LLVM_CLANG_PARSE_LOOPHINT_H +#include "clang/Basic/IdentifierTable.h" #include "clang/Basic/SourceLocation.h" namespace clang { class Expr; -struct IdentifierLoc; /// Loop optimization hint for loop and unroll pragmas. struct LoopHint { diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index 9ebcf144ba59e..662f54d0e8d8a 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -1725,8 +1725,8 @@ class Parser : public CodeCompletionHandler { ObjCTypeParamList *parseObjCTypeParamList(); ObjCTypeParamList *parseObjCTypeParamListOrProtocolRefs( ObjCTypeParamListScope &Scope, SourceLocation &lAngleLoc, - SmallVectorImpl &protocolIdents, - SourceLocation &rAngleLoc, bool mayBeProtocolList = true); + SmallVectorImpl &protocolIdents, SourceLocation &rAngleLoc, + bool mayBeProtocolList = true); void HelperActionsForIvarDeclarations(ObjCContainerDecl *interfaceDecl, SourceLocation atLoc, @@ -3818,8 +3818,7 @@ class Parser : public CodeCompletionHandler { SourceLocation Loc, llvm::SmallVectorImpl &IntExprs); /// Parses the 'device-type-list', which is a list of identifiers. - bool ParseOpenACCDeviceTypeList( - llvm::SmallVector> &Archs); + bool ParseOpenACCDeviceTypeList(llvm::SmallVector &Archs); /// Parses the 'async-argument', which is an integral value with two /// 'special' values that are likely negative (but come from Macros). 
OpenACCIntExprParseResult ParseOpenACCAsyncArgument(OpenACCDirectiveKind DK, @@ -3951,10 +3950,8 @@ class Parser : public CodeCompletionHandler { return false; } - bool ParseModuleName( - SourceLocation UseLoc, - SmallVectorImpl> &Path, - bool IsImport); + bool ParseModuleName(SourceLocation UseLoc, + SmallVectorImpl &Path, bool IsImport); //===--------------------------------------------------------------------===// // C++11/G++: Type Traits [Type-Traits.html in the GCC manual] diff --git a/clang/include/clang/Sema/ParsedAttr.h b/clang/include/clang/Sema/ParsedAttr.h index b88b871dc8821..428d3111de80d 100644 --- a/clang/include/clang/Sema/ParsedAttr.h +++ b/clang/include/clang/Sema/ParsedAttr.h @@ -40,7 +40,6 @@ class LangOptions; class Sema; class Stmt; class TargetInfo; -struct IdentifierLoc; /// Represents information about a change in availability for /// an entity, which is part of the encoding of the 'availability' @@ -99,15 +98,6 @@ struct PropertyData { } // namespace detail -/// Wraps an identifier and optional source location for the identifier. -struct IdentifierLoc { - SourceLocation Loc; - IdentifierInfo *Ident; - - static IdentifierLoc *create(ASTContext &Ctx, SourceLocation Loc, - IdentifierInfo *Ident); -}; - /// A union of the various pointer types that can be passed to an /// ParsedAttr as an argument. 
using ArgsUnion = llvm::PointerUnion; diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index fe37fd7701ce3..1f23b754a69cb 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -143,7 +143,7 @@ enum class LangAS : unsigned int; class LocalInstantiationScope; class LookupResult; class MangleNumberingContext; -typedef ArrayRef> ModuleIdPath; +typedef ArrayRef ModuleIdPath; class ModuleLoader; class MultiLevelTemplateArgumentList; struct NormalizedConstraint; diff --git a/clang/include/clang/Sema/SemaCodeCompletion.h b/clang/include/clang/Sema/SemaCodeCompletion.h index 72159de3a6e72..3029e56e5cfe2 100644 --- a/clang/include/clang/Sema/SemaCodeCompletion.h +++ b/clang/include/clang/Sema/SemaCodeCompletion.h @@ -193,8 +193,7 @@ class SemaCodeCompletion : public SemaBase { void CodeCompleteObjCForCollection(Scope *S, DeclGroupPtrTy IterationVar); void CodeCompleteObjCSelector(Scope *S, ArrayRef SelIdents); - void - CodeCompleteObjCProtocolReferences(ArrayRef Protocols); + void CodeCompleteObjCProtocolReferences(ArrayRef Protocols); void CodeCompleteObjCProtocolDecl(Scope *S); void CodeCompleteObjCInterfaceDecl(Scope *S); void CodeCompleteObjCClassForwardDecl(Scope *S); diff --git a/clang/include/clang/Sema/SemaObjC.h b/clang/include/clang/Sema/SemaObjC.h index 791a7f45b832f..4cda41a82b61f 100644 --- a/clang/include/clang/Sema/SemaObjC.h +++ b/clang/include/clang/Sema/SemaObjC.h @@ -307,11 +307,11 @@ class SemaObjC : public SemaBase { DeclGroupPtrTy ActOnForwardProtocolDeclaration(SourceLocation AtProtoclLoc, - ArrayRef IdentList, + ArrayRef IdentList, const ParsedAttributesView &attrList); void FindProtocolDeclaration(bool WarnOnDeclarations, bool ForObjCContainer, - ArrayRef ProtocolId, + ArrayRef ProtocolId, SmallVectorImpl &Protocols); void DiagnoseTypeArgsAndProtocols(IdentifierInfo *ProtocolId, diff --git a/clang/include/clang/Sema/SemaOpenACC.h b/clang/include/clang/Sema/SemaOpenACC.h index 
4c3a13a3b044f..8d31d46444c7e 100644 --- a/clang/include/clang/Sema/SemaOpenACC.h +++ b/clang/include/clang/Sema/SemaOpenACC.h @@ -212,7 +212,7 @@ class SemaOpenACC : public SemaBase { } LoopWithoutSeqInfo; // Redeclaration of the version in OpenACCClause.h. - using DeviceTypeArgument = std::pair; + using DeviceTypeArgument = IdentifierLoc; /// A type to represent all the data for an OpenACC Clause that has been /// parsed, but not yet created/semantically analyzed. This is effectively a diff --git a/clang/lib/AST/OpenACCClause.cpp b/clang/lib/AST/OpenACCClause.cpp index d7cbb51335359..2820d7b288658 100644 --- a/clang/lib/AST/OpenACCClause.cpp +++ b/clang/lib/AST/OpenACCClause.cpp @@ -891,10 +891,10 @@ void OpenACCClausePrinter::VisitDeviceTypeClause( OS << "("; llvm::interleaveComma(C.getArchitectures(), OS, [&](const DeviceTypeArgument &Arch) { - if (Arch.first == nullptr) + if (Arch.getIdentifierInfo() == nullptr) OS << "*"; else - OS << Arch.first->getName(); + OS << Arch.getIdentifierInfo()->getName(); }); OS << ")"; } diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp index c8b459ee78e6b..1bd94a3ac6431 100644 --- a/clang/lib/AST/TextNodeDumper.cpp +++ b/clang/lib/AST/TextNodeDumper.cpp @@ -500,10 +500,10 @@ void TextNodeDumper::Visit(const OpenACCClause *C) { llvm::interleaveComma( cast(C)->getArchitectures(), OS, [&](const DeviceTypeArgument &Arch) { - if (Arch.first == nullptr) + if (Arch.getIdentifierInfo() == nullptr) OS << "*"; else - OS << Arch.first->getName(); + OS << Arch.getIdentifierInfo()->getName(); }); OS << ")"; break; diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp index 243e0a3c15b05..93e4e31c2891d 100644 --- a/clang/lib/Frontend/CompilerInstance.cpp +++ b/clang/lib/Frontend/CompilerInstance.cpp @@ -35,6 +35,7 @@ #include "clang/Lex/Preprocessor.h" #include "clang/Lex/PreprocessorOptions.h" #include "clang/Sema/CodeCompleteConsumer.h" +#include 
"clang/Sema/ParsedAttr.h" #include "clang/Sema/Sema.h" #include "clang/Serialization/ASTReader.h" #include "clang/Serialization/GlobalModuleIndex.h" @@ -2009,8 +2010,8 @@ CompilerInstance::loadModule(SourceLocation ImportLoc, Module::NameVisibilityKind Visibility, bool IsInclusionDirective) { // Determine what file we're searching from. - StringRef ModuleName = Path[0].first->getName(); - SourceLocation ModuleNameLoc = Path[0].second; + StringRef ModuleName = Path[0].getIdentifierInfo()->getName(); + SourceLocation ModuleNameLoc = Path[0].getLoc(); // If we've already handled this import, just return the cached result. // This one-element cache is important to eliminate redundant diagnostics @@ -2026,7 +2027,7 @@ CompilerInstance::loadModule(SourceLocation ImportLoc, // If we don't already have information on this module, load the module now. Module *Module = nullptr; ModuleMap &MM = getPreprocessor().getHeaderSearchInfo().getModuleMap(); - if (auto MaybeModule = MM.getCachedModuleLoad(*Path[0].first)) { + if (auto MaybeModule = MM.getCachedModuleLoad(*Path[0].getIdentifierInfo())) { // Use the cached result, which may be nullptr. Module = *MaybeModule; // Config macros are already checked before building a module, but they need @@ -2046,7 +2047,7 @@ CompilerInstance::loadModule(SourceLocation ImportLoc, // * `Preprocessor::HandleHeaderIncludeOrImport` will never call this // function as the `#include` or `#import` is textual. - MM.cacheModuleLoad(*Path[0].first, Module); + MM.cacheModuleLoad(*Path[0].getIdentifierInfo(), Module); } else { ModuleLoadResult Result = findOrCompileModuleAndReadAST( ModuleName, ImportLoc, ModuleNameLoc, IsInclusionDirective); @@ -2055,7 +2056,7 @@ CompilerInstance::loadModule(SourceLocation ImportLoc, if (!Result) DisableGeneratingGlobalModuleIndex = true; Module = Result; - MM.cacheModuleLoad(*Path[0].first, Module); + MM.cacheModuleLoad(*Path[0].getIdentifierInfo(), Module); } // If we never found the module, fail. 
Otherwise, verify the module and link @@ -2067,7 +2068,7 @@ CompilerInstance::loadModule(SourceLocation ImportLoc, // a submodule. bool MapPrivateSubModToTopLevel = false; for (unsigned I = 1, N = Path.size(); I != N; ++I) { - StringRef Name = Path[I].first->getName(); + StringRef Name = Path[I].getIdentifierInfo()->getName(); clang::Module *Sub = Module->findSubmodule(Name); // If the user is requesting Foo.Private and it doesn't exist, try to @@ -2078,10 +2079,10 @@ CompilerInstance::loadModule(SourceLocation ImportLoc, SmallString<128> PrivateModule(Module->Name); PrivateModule.append("_Private"); - SmallVector, 2> PrivPath; + SmallVector PrivPath; auto &II = PP->getIdentifierTable().get( PrivateModule, PP->getIdentifierInfo(Module->Name)->getTokenID()); - PrivPath.push_back(std::make_pair(&II, Path[0].second)); + PrivPath.emplace_back(Path[0].getLoc(), &II); std::string FileName; // If there is a modulemap module or prebuilt module, load it. @@ -2095,11 +2096,12 @@ CompilerInstance::loadModule(SourceLocation ImportLoc, PP->markClangModuleAsAffecting(Module); if (!getDiagnostics().isIgnored( diag::warn_no_priv_submodule_use_toplevel, ImportLoc)) { - getDiagnostics().Report(Path[I].second, + getDiagnostics().Report(Path[I].getLoc(), diag::warn_no_priv_submodule_use_toplevel) - << Path[I].first << Module->getFullModuleName() << PrivateModule - << SourceRange(Path[0].second, Path[I].second) - << FixItHint::CreateReplacement(SourceRange(Path[0].second), + << Path[I].getIdentifierInfo() << Module->getFullModuleName() + << PrivateModule + << SourceRange(Path[0].getLoc(), Path[I].getLoc()) + << FixItHint::CreateReplacement(SourceRange(Path[0].getLoc()), PrivateModule); getDiagnostics().Report(Sub->DefinitionLoc, diag::note_private_top_level_defined); @@ -2128,10 +2130,11 @@ CompilerInstance::loadModule(SourceLocation ImportLoc, // If there was a clear winner, user it. 
if (Best.size() == 1) { - getDiagnostics().Report(Path[I].second, diag::err_no_submodule_suggest) - << Path[I].first << Module->getFullModuleName() << Best[0] - << SourceRange(Path[0].second, Path[I - 1].second) - << FixItHint::CreateReplacement(SourceRange(Path[I].second), + getDiagnostics().Report(Path[I].getLoc(), + diag::err_no_submodule_suggest) + << Path[I].getIdentifierInfo() << Module->getFullModuleName() + << Best[0] << SourceRange(Path[0].getLoc(), Path[I - 1].getLoc()) + << FixItHint::CreateReplacement(SourceRange(Path[I].getLoc()), Best[0]); Sub = Module->findSubmodule(Best[0]); @@ -2141,9 +2144,9 @@ CompilerInstance::loadModule(SourceLocation ImportLoc, if (!Sub) { // No submodule by this name. Complain, and don't look for further // submodules. - getDiagnostics().Report(Path[I].second, diag::err_no_submodule) - << Path[I].first << Module->getFullModuleName() - << SourceRange(Path[0].second, Path[I - 1].second); + getDiagnostics().Report(Path[I].getLoc(), diag::err_no_submodule) + << Path[I].getIdentifierInfo() << Module->getFullModuleName() + << SourceRange(Path[0].getLoc(), Path[I - 1].getLoc()); break; } @@ -2161,8 +2164,8 @@ CompilerInstance::loadModule(SourceLocation ImportLoc, // FIXME: Should we detect this at module load time? It seems fairly // expensive (and rare). 
getDiagnostics().Report(ImportLoc, diag::warn_missing_submodule) - << Module->getFullModuleName() - << SourceRange(Path.front().second, Path.back().second); + << Module->getFullModuleName() + << SourceRange(Path.front().getLoc(), Path.back().getLoc()); return ModuleLoadResult(Module, ModuleLoadResult::MissingExpected); } @@ -2171,7 +2174,7 @@ CompilerInstance::loadModule(SourceLocation ImportLoc, if (Preprocessor::checkModuleIsAvailable(getLangOpts(), getTarget(), *Module, getDiagnostics())) { getDiagnostics().Report(ImportLoc, diag::note_module_import_here) - << SourceRange(Path.front().second, Path.back().second); + << SourceRange(Path.front().getLoc(), Path.back().getLoc()); LastModuleImportLoc = ImportLoc; LastModuleImportResult = ModuleLoadResult(); return ModuleLoadResult(); @@ -2296,9 +2299,9 @@ GlobalModuleIndex *CompilerInstance::loadGlobalModuleIndex( Module *TheModule = I->second; OptionalFileEntryRef Entry = TheModule->getASTFile(); if (!Entry) { - SmallVector, 2> Path; - Path.push_back(std::make_pair( - getPreprocessor().getIdentifierInfo(TheModule->Name), TriggerLoc)); + SmallVector Path; + Path.emplace_back(TriggerLoc, + getPreprocessor().getIdentifierInfo(TheModule->Name)); std::reverse(Path.begin(), Path.end()); // Load a module as hidden. This also adds it to the global index. 
loadModule(TheModule->DefinitionLoc, Path, Module::Hidden, false); diff --git a/clang/lib/Frontend/FrontendActions.cpp b/clang/lib/Frontend/FrontendActions.cpp index c5aeb92c7af73..e6c7b9f32c29b 100644 --- a/clang/lib/Frontend/FrontendActions.cpp +++ b/clang/lib/Frontend/FrontendActions.cpp @@ -1216,9 +1216,9 @@ void GetDependenciesByModuleNameAction::ExecuteAction() { SourceManager &SM = PP.getSourceManager(); FileID MainFileID = SM.getMainFileID(); SourceLocation FileStart = SM.getLocForStartOfFile(MainFileID); - SmallVector, 2> Path; + SmallVector Path; IdentifierInfo *ModuleID = PP.getIdentifierInfo(ModuleName); - Path.push_back(std::make_pair(ModuleID, FileStart)); + Path.emplace_back(FileStart, ModuleID); auto ModResult = CI.loadModule(FileStart, Path, Module::Hidden, false); PPCallbacks *CB = PP.getPPCallbacks(); CB->moduleImport(SourceLocation(), Path, ModResult); diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp index 8411526019f3e..21ec83b437ef4 100644 --- a/clang/lib/Lex/PPDirectives.cpp +++ b/clang/lib/Lex/PPDirectives.cpp @@ -1916,15 +1916,15 @@ void Preprocessor::EnterAnnotationToken(SourceRange Range, /// Produce a diagnostic informing the user that a #include or similar /// was implicitly treated as a module import. -static void diagnoseAutoModuleImport( - Preprocessor &PP, SourceLocation HashLoc, Token &IncludeTok, - ArrayRef> Path, - SourceLocation PathEnd) { +static void diagnoseAutoModuleImport(Preprocessor &PP, SourceLocation HashLoc, + Token &IncludeTok, + ArrayRef Path, + SourceLocation PathEnd) { SmallString<128> PathString; for (size_t I = 0, N = Path.size(); I != N; ++I) { if (I) PathString += '.'; - PathString += Path[I].first->getName(); + PathString += Path[I].getIdentifierInfo()->getName(); } int IncludeKind = 0; @@ -2273,12 +2273,12 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport( SourceLocation StartLoc = IsImportDecl ? 
IncludeTok.getLocation() : HashLoc; // Complain about attempts to #include files in an audit pragma. - if (PragmaARCCFCodeAuditedInfo.second.isValid()) { + if (PragmaARCCFCodeAuditedInfo.getLoc().isValid()) { Diag(StartLoc, diag::err_pp_include_in_arc_cf_code_audited) << IsImportDecl; - Diag(PragmaARCCFCodeAuditedInfo.second, diag::note_pragma_entered_here); + Diag(PragmaARCCFCodeAuditedInfo.getLoc(), diag::note_pragma_entered_here); // Immediately leave the pragma. - PragmaARCCFCodeAuditedInfo = {nullptr, SourceLocation()}; + PragmaARCCFCodeAuditedInfo = IdentifierLoc(); } // Complain about attempts to #include files in an assume-nonnull pragma. @@ -2403,10 +2403,10 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport( // Compute the module access path corresponding to this module. // FIXME: Should we have a second loadModule() overload to avoid this // extra lookup step? - SmallVector, 2> Path; + SmallVector Path; for (Module *Mod = ModuleToImport; Mod; Mod = Mod->Parent) - Path.push_back(std::make_pair(getIdentifierInfo(Mod->Name), - FilenameTok.getLocation())); + Path.emplace_back(FilenameTok.getLocation(), + getIdentifierInfo(Mod->Name)); std::reverse(Path.begin(), Path.end()); // Warn that we're replacing the include/import with a module import. diff --git a/clang/lib/Lex/PPLexerChange.cpp b/clang/lib/Lex/PPLexerChange.cpp index a373a52506a24..db6069e31fa46 100644 --- a/clang/lib/Lex/PPLexerChange.cpp +++ b/clang/lib/Lex/PPLexerChange.cpp @@ -409,13 +409,13 @@ bool Preprocessor::HandleEndOfFile(Token &Result, bool isEndOfMacro) { // Complain about reaching a true EOF within arc_cf_code_audited. // We don't want to complain about reaching the end of a macro // instantiation or a _Pragma. 
- if (PragmaARCCFCodeAuditedInfo.second.isValid() && !isEndOfMacro && + if (PragmaARCCFCodeAuditedInfo.getLoc().isValid() && !isEndOfMacro && !(CurLexer && CurLexer->Is_PragmaLexer)) { - Diag(PragmaARCCFCodeAuditedInfo.second, + Diag(PragmaARCCFCodeAuditedInfo.getLoc(), diag::err_pp_eof_in_arc_cf_code_audited); // Recover by leaving immediately. - PragmaARCCFCodeAuditedInfo = {nullptr, SourceLocation()}; + PragmaARCCFCodeAuditedInfo = IdentifierLoc(); } // Complain about reaching a true EOF within assume_nonnull. diff --git a/clang/lib/Lex/Pragma.cpp b/clang/lib/Lex/Pragma.cpp index 91c1619e35623..5b6a29bdad910 100644 --- a/clang/lib/Lex/Pragma.cpp +++ b/clang/lib/Lex/Pragma.cpp @@ -763,20 +763,19 @@ void Preprocessor::HandlePragmaIncludeAlias(Token &Tok) { // Lex a component of a module name: either an identifier or a string literal; // for components that can be expressed both ways, the two forms are equivalent. -static bool LexModuleNameComponent( - Preprocessor &PP, Token &Tok, - std::pair &ModuleNameComponent, - bool First) { +static bool LexModuleNameComponent(Preprocessor &PP, Token &Tok, + IdentifierLoc &ModuleNameComponent, + bool First) { PP.LexUnexpandedToken(Tok); if (Tok.is(tok::string_literal) && !Tok.hasUDSuffix()) { StringLiteralParser Literal(Tok, PP); if (Literal.hadError) return true; - ModuleNameComponent = std::make_pair( - PP.getIdentifierInfo(Literal.GetString()), Tok.getLocation()); + ModuleNameComponent = IdentifierLoc( + Tok.getLocation(), PP.getIdentifierInfo(Literal.GetString())); } else if (!Tok.isAnnotation() && Tok.getIdentifierInfo()) { ModuleNameComponent = - std::make_pair(Tok.getIdentifierInfo(), Tok.getLocation()); + IdentifierLoc(Tok.getLocation(), Tok.getIdentifierInfo()); } else { PP.Diag(Tok.getLocation(), diag::err_pp_expected_module_name) << First; return true; @@ -784,12 +783,10 @@ static bool LexModuleNameComponent( return false; } -static bool LexModuleName( - Preprocessor &PP, Token &Tok, - llvm::SmallVectorImpl> - 
&ModuleName) { +static bool LexModuleName(Preprocessor &PP, Token &Tok, + llvm::SmallVectorImpl &ModuleName) { while (true) { - std::pair NameComponent; + IdentifierLoc NameComponent; if (LexModuleNameComponent(PP, Tok, NameComponent, ModuleName.empty())) return true; ModuleName.push_back(NameComponent); @@ -803,10 +800,10 @@ static bool LexModuleName( void Preprocessor::HandlePragmaModuleBuild(Token &Tok) { SourceLocation Loc = Tok.getLocation(); - std::pair ModuleNameLoc; + IdentifierLoc ModuleNameLoc; if (LexModuleNameComponent(*this, Tok, ModuleNameLoc, true)) return; - IdentifierInfo *ModuleName = ModuleNameLoc.first; + IdentifierInfo *ModuleName = ModuleNameLoc.getIdentifierInfo(); LexUnexpandedToken(Tok); if (Tok.isNot(tok::eod)) { @@ -1109,17 +1106,17 @@ struct PragmaDebugHandler : public PragmaHandler { PP.Diag(MacroName, diag::warn_pragma_debug_missing_argument) << II->getName(); } else if (II->isStr("module_map")) { - llvm::SmallVector, 8> - ModuleName; + llvm::SmallVector ModuleName; if (LexModuleName(PP, Tok, ModuleName)) return; ModuleMap &MM = PP.getHeaderSearchInfo().getModuleMap(); Module *M = nullptr; for (auto IIAndLoc : ModuleName) { - M = MM.lookupModuleQualified(IIAndLoc.first->getName(), M); + M = MM.lookupModuleQualified(IIAndLoc.getIdentifierInfo()->getName(), + M); if (!M) { - PP.Diag(IIAndLoc.second, diag::warn_pragma_debug_unknown_module) - << IIAndLoc.first->getName(); + PP.Diag(IIAndLoc.getLoc(), diag::warn_pragma_debug_unknown_module) + << IIAndLoc.getIdentifierInfo()->getName(); return; } } @@ -1707,8 +1704,7 @@ struct PragmaModuleImportHandler : public PragmaHandler { SourceLocation ImportLoc = Tok.getLocation(); // Read the module name. 
- llvm::SmallVector, 8> - ModuleName; + llvm::SmallVector ModuleName; if (LexModuleName(PP, Tok, ModuleName)) return; @@ -1723,7 +1719,7 @@ struct PragmaModuleImportHandler : public PragmaHandler { return; PP.makeModuleVisible(Imported, ImportLoc); - PP.EnterAnnotationToken(SourceRange(ImportLoc, ModuleName.back().second), + PP.EnterAnnotationToken(SourceRange(ImportLoc, ModuleName.back().getLoc()), tok::annot_module_include, Imported); if (auto *CB = PP.getPPCallbacks()) CB->moduleImport(ImportLoc, ModuleName, Imported); @@ -1744,8 +1740,7 @@ struct PragmaModuleBeginHandler : public PragmaHandler { SourceLocation BeginLoc = Tok.getLocation(); // Read the module name. - llvm::SmallVector, 8> - ModuleName; + llvm::SmallVector ModuleName; if (LexModuleName(PP, Tok, ModuleName)) return; @@ -1754,10 +1749,11 @@ struct PragmaModuleBeginHandler : public PragmaHandler { // We can only enter submodules of the current module. StringRef Current = PP.getLangOpts().CurrentModule; - if (ModuleName.front().first->getName() != Current) { - PP.Diag(ModuleName.front().second, diag::err_pp_module_begin_wrong_module) - << ModuleName.front().first << (ModuleName.size() > 1) - << Current.empty() << Current; + if (ModuleName.front().getIdentifierInfo()->getName() != Current) { + PP.Diag(ModuleName.front().getLoc(), + diag::err_pp_module_begin_wrong_module) + << ModuleName.front().getIdentifierInfo() << (ModuleName.size() > 1) + << Current.empty() << Current; return; } @@ -1765,17 +1761,19 @@ struct PragmaModuleBeginHandler : public PragmaHandler { // be loaded or implicitly loadable. 
auto &HSI = PP.getHeaderSearchInfo(); auto &MM = HSI.getModuleMap(); - Module *M = HSI.lookupModule(Current, ModuleName.front().second); + Module *M = HSI.lookupModule(Current, ModuleName.front().getLoc()); if (!M) { - PP.Diag(ModuleName.front().second, - diag::err_pp_module_begin_no_module_map) << Current; + PP.Diag(ModuleName.front().getLoc(), + diag::err_pp_module_begin_no_module_map) + << Current; return; } for (unsigned I = 1; I != ModuleName.size(); ++I) { - auto *NewM = MM.findOrInferSubmodule(M, ModuleName[I].first->getName()); + auto *NewM = MM.findOrInferSubmodule( + M, ModuleName[I].getIdentifierInfo()->getName()); if (!NewM) { - PP.Diag(ModuleName[I].second, diag::err_pp_module_begin_no_submodule) - << M->getFullModuleName() << ModuleName[I].first; + PP.Diag(ModuleName[I].getLoc(), diag::err_pp_module_begin_no_submodule) + << M->getFullModuleName() << ModuleName[I].getIdentifierInfo(); return; } M = NewM; @@ -1791,7 +1789,7 @@ struct PragmaModuleBeginHandler : public PragmaHandler { // Enter the scope of the submodule. PP.EnterSubmodule(M, BeginLoc, /*ForPragma*/true); - PP.EnterAnnotationToken(SourceRange(BeginLoc, ModuleName.back().second), + PP.EnterAnnotationToken(SourceRange(BeginLoc, ModuleName.back().getLoc()), tok::annot_module_begin, M); } }; @@ -1835,8 +1833,7 @@ struct PragmaModuleLoadHandler : public PragmaHandler { SourceLocation Loc = Tok.getLocation(); // Read the module name. - llvm::SmallVector, 8> - ModuleName; + llvm::SmallVector ModuleName; if (LexModuleName(PP, Tok, ModuleName)) return; @@ -1901,7 +1898,7 @@ struct PragmaARCCFCodeAuditedHandler : public PragmaHandler { PP.Diag(Tok, diag::ext_pp_extra_tokens_at_eol) << "pragma"; // The start location of the active audit. - SourceLocation BeginLoc = PP.getPragmaARCCFCodeAuditedInfo().second; + SourceLocation BeginLoc = PP.getPragmaARCCFCodeAuditedInfo().getLoc(); // The start location we want after processing this. 
SourceLocation NewLoc; diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp index c25a3efd899e0..4c050bf1f5bb2 100644 --- a/clang/lib/Lex/Preprocessor.cpp +++ b/clang/lib/Lex/Preprocessor.cpp @@ -1159,8 +1159,8 @@ bool Preprocessor::LexAfterModuleImport(Token &Result) { if (Result.is(tok::colon) && ModuleDeclState.isNamedModule()) { std::string Name = ModuleDeclState.getPrimaryName().str(); Name += ":"; - NamedModuleImportPath.push_back( - {getIdentifierInfo(Name), Result.getLocation()}); + NamedModuleImportPath.emplace_back(Result.getLocation(), + getIdentifierInfo(Name)); CurLexerCallback = CLK_LexAfterModuleImport; return true; } @@ -1258,8 +1258,8 @@ bool Preprocessor::LexAfterModuleImport(Token &Result) { if (ModuleImportExpectsIdentifier && Result.getKind() == tok::identifier) { // We expected to see an identifier here, and we did; continue handling // identifiers. - NamedModuleImportPath.push_back( - std::make_pair(Result.getIdentifierInfo(), Result.getLocation())); + NamedModuleImportPath.emplace_back(Result.getLocation(), + Result.getIdentifierInfo()); ModuleImportExpectsIdentifier = false; CurLexerCallback = CLK_LexAfterModuleImport; return true; @@ -1302,12 +1302,12 @@ bool Preprocessor::LexAfterModuleImport(Token &Result) { // If the FlatModuleName ends with colon, it implies it is a partition. 
if (!FlatModuleName.empty() && FlatModuleName.back() != ':') FlatModuleName += "."; - FlatModuleName += Piece.first->getName(); + FlatModuleName += Piece.getIdentifierInfo()->getName(); } - SourceLocation FirstPathLoc = NamedModuleImportPath[0].second; + SourceLocation FirstPathLoc = NamedModuleImportPath[0].getLoc(); NamedModuleImportPath.clear(); - NamedModuleImportPath.push_back( - std::make_pair(getIdentifierInfo(FlatModuleName), FirstPathLoc)); + NamedModuleImportPath.emplace_back(FirstPathLoc, + getIdentifierInfo(FlatModuleName)); } Module *Imported = nullptr; diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index 8fa74ecff19aa..8444ff3332e08 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -432,9 +432,8 @@ static bool attributeParsedArgsUnevaluated(const IdentifierInfo &II, IdentifierLoc *Parser::ParseIdentifierLoc() { assert(Tok.is(tok::identifier) && "expected an identifier"); - IdentifierLoc *IL = IdentifierLoc::create(Actions.Context, - Tok.getLocation(), - Tok.getIdentifierInfo()); + IdentifierLoc *IL = new (Actions.Context) + IdentifierLoc(Tok.getLocation(), Tok.getIdentifierInfo()); ConsumeToken(); return IL; } @@ -1353,20 +1352,21 @@ void Parser::ParseAvailabilityAttribute( return; } IdentifierLoc *Platform = ParseIdentifierLoc(); - if (const IdentifierInfo *const Ident = Platform->Ident) { + if (const IdentifierInfo *const Ident = Platform->getIdentifierInfo()) { // Disallow xrOS for availability attributes. if (Ident->getName().contains("xrOS") || Ident->getName().contains("xros")) - Diag(Platform->Loc, diag::warn_availability_unknown_platform) << Ident; + Diag(Platform->getLoc(), diag::warn_availability_unknown_platform) + << Ident; // Canonicalize platform name from "macosx" to "macos". 
else if (Ident->getName() == "macosx") - Platform->Ident = PP.getIdentifierInfo("macos"); + Platform->setIdentifierInfo(PP.getIdentifierInfo("macos")); // Canonicalize platform name from "macosx_app_extension" to // "macos_app_extension". else if (Ident->getName() == "macosx_app_extension") - Platform->Ident = PP.getIdentifierInfo("macos_app_extension"); + Platform->setIdentifierInfo(PP.getIdentifierInfo("macos_app_extension")); else - Platform->Ident = PP.getIdentifierInfo( - AvailabilityAttr::canonicalizePlatformName(Ident->getName())); + Platform->setIdentifierInfo(PP.getIdentifierInfo( + AvailabilityAttr::canonicalizePlatformName(Ident->getName()))); } // Parse the ',' following the platform name. @@ -1418,8 +1418,8 @@ void Parser::ParseAvailabilityAttribute( continue; } - if (Keyword == Ident_deprecated && Platform->Ident && - Platform->Ident->isStr("swift")) { + if (Keyword == Ident_deprecated && Platform->getIdentifierInfo() && + Platform->getIdentifierInfo()->isStr("swift")) { // For swift, we deprecate for all versions. 
if (Changes[Deprecated].KeywordLoc.isValid()) { Diag(KeywordLoc, diag::err_availability_redundant) @@ -1436,7 +1436,7 @@ void Parser::ParseAvailabilityAttribute( if (Keyword == Ident_environment) { if (EnvironmentLoc != nullptr) { Diag(KeywordLoc, diag::err_availability_redundant) - << Keyword << SourceRange(EnvironmentLoc->Loc); + << Keyword << SourceRange(EnvironmentLoc->getLoc()); } } @@ -1792,8 +1792,8 @@ void Parser::ParseSwiftNewTypeAttribute( return; } - auto *SwiftType = IdentifierLoc::create(Actions.Context, Tok.getLocation(), - Tok.getIdentifierInfo()); + auto *SwiftType = new (Actions.Context) + IdentifierLoc(Tok.getLocation(), Tok.getIdentifierInfo()); ConsumeToken(); // Closing ')' diff --git a/clang/lib/Parse/ParseExpr.cpp b/clang/lib/Parse/ParseExpr.cpp index 0a22f7372a9f9..1416d52157dca 100644 --- a/clang/lib/Parse/ParseExpr.cpp +++ b/clang/lib/Parse/ParseExpr.cpp @@ -4006,19 +4006,20 @@ std::optional Parser::ParseAvailabilitySpec() { if (Version.empty()) return std::nullopt; - StringRef GivenPlatform = PlatformIdentifier->Ident->getName(); + StringRef GivenPlatform = + PlatformIdentifier->getIdentifierInfo()->getName(); StringRef Platform = AvailabilityAttr::canonicalizePlatformName(GivenPlatform); if (AvailabilityAttr::getPrettyPlatformName(Platform).empty() || (GivenPlatform.contains("xros") || GivenPlatform.contains("xrOS"))) { - Diag(PlatformIdentifier->Loc, + Diag(PlatformIdentifier->getLoc(), diag::err_avail_query_unrecognized_platform_name) << GivenPlatform; return std::nullopt; } - return AvailabilitySpec(Version, Platform, PlatformIdentifier->Loc, + return AvailabilitySpec(Version, Platform, PlatformIdentifier->getLoc(), VersionRange.getEnd()); } } diff --git a/clang/lib/Parse/ParseHLSL.cpp b/clang/lib/Parse/ParseHLSL.cpp index f4c109f9a81a2..7ffacc4b79f79 100644 --- a/clang/lib/Parse/ParseHLSL.cpp +++ b/clang/lib/Parse/ParseHLSL.cpp @@ -115,7 +115,7 @@ static void fixSeparateAttrArgAndNumber(StringRef ArgStr, SourceLocation ArgLoc, << 
FixedArg << FixItHint::CreateReplacement(SourceRange(ArgLoc, EndNumLoc), FixedArg); ArgsUnion &Slot = ArgExprs.back(); - Slot = IdentifierLoc::create(Ctx, ArgLoc, PP.getIdentifierInfo(FixedArg)); + Slot = new (Ctx) IdentifierLoc(ArgLoc, PP.getIdentifierInfo(FixedArg)); } void Parser::ParseHLSLAnnotations(ParsedAttributes &Attrs, diff --git a/clang/lib/Parse/ParseObjc.cpp b/clang/lib/Parse/ParseObjc.cpp index bcbf4dfbabafa..d872177b3d7aa 100644 --- a/clang/lib/Parse/ParseObjc.cpp +++ b/clang/lib/Parse/ParseObjc.cpp @@ -261,7 +261,7 @@ Decl *Parser::ParseObjCAtInterfaceDeclaration(SourceLocation AtLoc, // case, LAngleLoc will be valid and ProtocolIdents will capture the // protocol references (that have not yet been resolved). SourceLocation LAngleLoc, EndProtoLoc; - SmallVector ProtocolIdents; + SmallVector ProtocolIdents; ObjCTypeParamList *typeParameterList = nullptr; ObjCTypeParamListScope typeParamScope(Actions, getCurScope()); if (Tok.is(tok::less)) @@ -361,8 +361,8 @@ Decl *Parser::ParseObjCAtInterfaceDeclaration(SourceLocation AtLoc, if (!ProtocolIdents.empty()) { // We already parsed the protocols named when we thought we had a // type parameter list. Translate them into actual protocol references. - for (const auto &pair : ProtocolIdents) { - protocolLocs.push_back(pair.second); + for (const auto &Loc : ProtocolIdents) { + protocolLocs.push_back(Loc.getLoc()); } Actions.ObjC().FindProtocolDeclaration(/*WarnOnDeclarations=*/true, /*ForObjCContainer=*/true, @@ -459,8 +459,8 @@ static void addContextSensitiveTypeNullability(Parser &P, /// \param rAngleLoc The location of the ending '>'. 
ObjCTypeParamList *Parser::parseObjCTypeParamListOrProtocolRefs( ObjCTypeParamListScope &Scope, SourceLocation &lAngleLoc, - SmallVectorImpl &protocolIdents, - SourceLocation &rAngleLoc, bool mayBeProtocolList) { + SmallVectorImpl &protocolIdents, SourceLocation &rAngleLoc, + bool mayBeProtocolList) { assert(Tok.is(tok::less) && "Not at the beginning of a type parameter list"); // Within the type parameter list, don't treat '>' as an operator. @@ -474,7 +474,8 @@ ObjCTypeParamList *Parser::parseObjCTypeParamListOrProtocolRefs( for (const auto &pair : protocolIdents) { DeclResult typeParam = Actions.ObjC().actOnObjCTypeParam( getCurScope(), ObjCTypeParamVariance::Invariant, SourceLocation(), - index++, pair.first, pair.second, SourceLocation(), nullptr); + index++, pair.getIdentifierInfo(), pair.getLoc(), SourceLocation(), + nullptr); if (typeParam.isUsable()) typeParams.push_back(typeParam.get()); } @@ -546,7 +547,7 @@ ObjCTypeParamList *Parser::parseObjCTypeParamListOrProtocolRefs( } else if (mayBeProtocolList) { // If this could still be a protocol list, just capture the identifier. // We don't want to turn it into a parameter. - protocolIdents.push_back(std::make_pair(paramName, paramLoc)); + protocolIdents.emplace_back(paramLoc, paramName); continue; } @@ -606,7 +607,7 @@ ObjCTypeParamList *Parser::parseObjCTypeParamListOrProtocolRefs( /// Parse an objc-type-parameter-list. 
ObjCTypeParamList *Parser::parseObjCTypeParamList() { SourceLocation lAngleLoc; - SmallVector protocolIdents; + SmallVector protocolIdents; SourceLocation rAngleLoc; ObjCTypeParamListScope Scope(Actions, getCurScope()); @@ -1598,7 +1599,7 @@ ParseObjCProtocolReferences(SmallVectorImpl &Protocols, LAngleLoc = ConsumeToken(); // the "<" - SmallVector ProtocolIdents; + SmallVector ProtocolIdents; while (true) { if (Tok.is(tok::code_completion)) { @@ -1612,8 +1613,7 @@ ParseObjCProtocolReferences(SmallVectorImpl &Protocols, SkipUntil(tok::greater, StopAtSemi); return true; } - ProtocolIdents.push_back(std::make_pair(Tok.getIdentifierInfo(), - Tok.getLocation())); + ProtocolIdents.emplace_back(Tok.getLocation(), Tok.getIdentifierInfo()); ProtocolLocs.push_back(Tok.getLocation()); ConsumeToken(); @@ -1693,10 +1693,9 @@ void Parser::parseObjCTypeArgsOrProtocolQualifiers( if (Tok.is(tok::code_completion)) { // FIXME: Also include types here. - SmallVector identifierLocPairs; + SmallVector identifierLocPairs; for (unsigned i = 0, n = identifiers.size(); i != n; ++i) { - identifierLocPairs.push_back(IdentifierLocPair(identifiers[i], - identifierLocs[i])); + identifierLocPairs.emplace_back(identifierLocs[i], identifiers[i]); } QualType BaseT = Actions.GetTypeFromParser(baseType); @@ -2094,7 +2093,7 @@ Parser::ParseObjCAtProtocolDeclaration(SourceLocation AtLoc, SourceLocation nameLoc = ConsumeToken(); if (TryConsumeToken(tok::semi)) { // forward declaration of one protocol. - IdentifierLocPair ProtoInfo(protocolName, nameLoc); + IdentifierLoc ProtoInfo(nameLoc, protocolName); return Actions.ObjC().ActOnForwardProtocolDeclaration(AtLoc, ProtoInfo, attrs); } @@ -2102,8 +2101,8 @@ Parser::ParseObjCAtProtocolDeclaration(SourceLocation AtLoc, CheckNestedObjCContexts(AtLoc); if (Tok.is(tok::comma)) { // list of forward declarations. 
- SmallVector ProtocolRefs; - ProtocolRefs.push_back(std::make_pair(protocolName, nameLoc)); + SmallVector ProtocolRefs; + ProtocolRefs.emplace_back(nameLoc, protocolName); // Parse the list of forward declarations. while (true) { @@ -2112,8 +2111,7 @@ Parser::ParseObjCAtProtocolDeclaration(SourceLocation AtLoc, SkipUntil(tok::semi); return nullptr; } - ProtocolRefs.push_back(IdentifierLocPair(Tok.getIdentifierInfo(), - Tok.getLocation())); + ProtocolRefs.emplace_back(Tok.getLocation(), Tok.getIdentifierInfo()); ConsumeToken(); // the identifier if (Tok.isNot(tok::comma)) @@ -2196,7 +2194,7 @@ Parser::ParseObjCAtImplementationDeclaration(SourceLocation AtLoc, // permitted here. Parse and diagnose them. if (Tok.is(tok::less)) { SourceLocation lAngleLoc, rAngleLoc; - SmallVector protocolIdents; + SmallVector protocolIdents; SourceLocation diagLoc = Tok.getLocation(); ObjCTypeParamListScope typeParamScope(Actions, getCurScope()); if (parseObjCTypeParamListOrProtocolRefs(typeParamScope, lAngleLoc, diff --git a/clang/lib/Parse/ParseOpenACC.cpp b/clang/lib/Parse/ParseOpenACC.cpp index 64916995907c5..337b3eca49764 100644 --- a/clang/lib/Parse/ParseOpenACC.cpp +++ b/clang/lib/Parse/ParseOpenACC.cpp @@ -15,6 +15,7 @@ #include "clang/Basic/OpenACCKinds.h" #include "clang/Parse/Parser.h" #include "clang/Parse/RAIIObjectsForParser.h" +#include "clang/Sema/ParsedAttr.h" #include "clang/Sema/SemaOpenACC.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" @@ -814,7 +815,7 @@ bool Parser::ParseOpenACCIntExprList(OpenACCDirectiveKind DK, /// /// The device_type clause may be abbreviated to dtype. 
bool Parser::ParseOpenACCDeviceTypeList( - llvm::SmallVector> &Archs) { + llvm::SmallVector &Archs) { if (expectIdentifierOrKeyword(*this)) { SkipUntil(tok::r_paren, tok::annot_pragma_openacc_end, @@ -822,7 +823,7 @@ bool Parser::ParseOpenACCDeviceTypeList( return true; } IdentifierInfo *Ident = getCurToken().getIdentifierInfo(); - Archs.emplace_back(Ident, ConsumeToken()); + Archs.emplace_back(ConsumeToken(), Ident); while (!getCurToken().isOneOf(tok::r_paren, tok::annot_pragma_openacc_end)) { ExpectAndConsume(tok::comma); @@ -833,7 +834,7 @@ bool Parser::ParseOpenACCDeviceTypeList( return true; } Ident = getCurToken().getIdentifierInfo(); - Archs.emplace_back(Ident, ConsumeToken()); + Archs.emplace_back(ConsumeToken(), Ident); } return false; } @@ -1154,11 +1155,12 @@ Parser::OpenACCClauseParseResult Parser::ParseOpenACCClauseParams( } case OpenACCClauseKind::DType: case OpenACCClauseKind::DeviceType: { - llvm::SmallVector> Archs; + llvm::SmallVector Archs; if (getCurToken().is(tok::star)) { // FIXME: We want to mark that this is an 'everything else' type of // device_type in Sema. 
- ParsedClause.setDeviceTypeDetails({{nullptr, ConsumeToken()}}); + ParsedClause.setDeviceTypeDetails( + {IdentifierLoc(ConsumeToken(), nullptr)}); } else if (!ParseOpenACCDeviceTypeList(Archs)) { ParsedClause.setDeviceTypeDetails(std::move(Archs)); } else { diff --git a/clang/lib/Parse/ParsePragma.cpp b/clang/lib/Parse/ParsePragma.cpp index 21ebff1e50559..17b2b30942582 100644 --- a/clang/lib/Parse/ParsePragma.cpp +++ b/clang/lib/Parse/ParsePragma.cpp @@ -1419,16 +1419,16 @@ bool Parser::HandlePragmaLoopHint(LoopHint &Hint) { static_cast(Tok.getAnnotationValue()); IdentifierInfo *PragmaNameInfo = Info->PragmaName.getIdentifierInfo(); - Hint.PragmaNameLoc = IdentifierLoc::create( - Actions.Context, Info->PragmaName.getLocation(), PragmaNameInfo); + Hint.PragmaNameLoc = new (Actions.Context) + IdentifierLoc(Info->PragmaName.getLocation(), PragmaNameInfo); // It is possible that the loop hint has no option identifier, such as // #pragma unroll(4). IdentifierInfo *OptionInfo = Info->Option.is(tok::identifier) ? 
Info->Option.getIdentifierInfo() : nullptr; - Hint.OptionLoc = IdentifierLoc::create( - Actions.Context, Info->Option.getLocation(), OptionInfo); + Hint.OptionLoc = new (Actions.Context) + IdentifierLoc(Info->Option.getLocation(), OptionInfo); llvm::ArrayRef Toks = Info->Toks; @@ -1508,7 +1508,7 @@ bool Parser::HandlePragmaLoopHint(LoopHint &Hint) { if (Toks.size() > 2) Diag(Tok.getLocation(), diag::warn_pragma_extra_tokens_at_eol) << PragmaLoopHintString(Info->PragmaName, Info->Option); - Hint.StateLoc = IdentifierLoc::create(Actions.Context, StateLoc, StateInfo); + Hint.StateLoc = new (Actions.Context) IdentifierLoc(StateLoc, StateInfo); } else if (OptionInfo && OptionInfo->getName() == "vectorize_width") { PP.EnterTokenStream(Toks, /*DisableMacroExpansion=*/false, /*IsReinject=*/false); @@ -1529,8 +1529,7 @@ bool Parser::HandlePragmaLoopHint(LoopHint &Hint) { ConsumeAnyToken(); } - Hint.StateLoc = - IdentifierLoc::create(Actions.Context, StateLoc, StateInfo); + Hint.StateLoc = new (Actions.Context) IdentifierLoc(StateLoc, StateInfo); ConsumeToken(); // Consume the constant expression eof terminator. 
} else { @@ -1554,7 +1553,7 @@ bool Parser::HandlePragmaLoopHint(LoopHint &Hint) { Arg2Error = true; } else Hint.StateLoc = - IdentifierLoc::create(Actions.Context, StateLoc, StateInfo); + new (Actions.Context) IdentifierLoc(StateLoc, StateInfo); PP.Lex(Tok); // Identifier } diff --git a/clang/lib/Parse/ParseStmt.cpp b/clang/lib/Parse/ParseStmt.cpp index e8ec140fbe3e5..4a82d57fe566b 100644 --- a/clang/lib/Parse/ParseStmt.cpp +++ b/clang/lib/Parse/ParseStmt.cpp @@ -2545,9 +2545,9 @@ StmtResult Parser::ParsePragmaLoopHint(StmtVector &Stmts, ArgsUnion ArgHints[] = {Hint.PragmaNameLoc, Hint.OptionLoc, Hint.StateLoc, ArgsUnion(Hint.ValueExpr)}; - TempAttrs.addNew(Hint.PragmaNameLoc->Ident, Hint.Range, nullptr, - Hint.PragmaNameLoc->Loc, ArgHints, 4, - ParsedAttr::Form::Pragma()); + TempAttrs.addNew(Hint.PragmaNameLoc->getIdentifierInfo(), Hint.Range, + /*scopeName=*/nullptr, Hint.PragmaNameLoc->getLoc(), + ArgHints, /*numArgs=*/4, ParsedAttr::Form::Pragma()); } // Get the next statement. diff --git a/clang/lib/Parse/Parser.cpp b/clang/lib/Parse/Parser.cpp index f3191762b1244..d528664bca352 100644 --- a/clang/lib/Parse/Parser.cpp +++ b/clang/lib/Parse/Parser.cpp @@ -2541,17 +2541,17 @@ Parser::ParseModuleDecl(Sema::ModuleImportState &ImportState) { return Actions.ActOnPrivateModuleFragmentDecl(ModuleLoc, PrivateLoc); } - SmallVector, 2> Path; + SmallVector Path; if (ParseModuleName(ModuleLoc, Path, /*IsImport*/ false)) return nullptr; // Parse the optional module-partition. - SmallVector, 2> Partition; + SmallVector Partition; if (Tok.is(tok::colon)) { SourceLocation ColonLoc = ConsumeToken(); if (!getLangOpts().CPlusPlusModules) Diag(ColonLoc, diag::err_unsupported_module_partition) - << SourceRange(ColonLoc, Partition.back().second); + << SourceRange(ColonLoc, Partition.back().getLoc()); // Recover by ignoring the partition name. 
else if (ParseModuleName(ModuleLoc, Partition, /*IsImport*/ false)) return nullptr; @@ -2600,7 +2600,7 @@ Decl *Parser::ParseModuleImport(SourceLocation AtLoc, SourceLocation ImportLoc = ConsumeToken(); // For C++20 modules, we can have "name" or ":Partition name" as valid input. - SmallVector, 2> Path; + SmallVector Path; bool IsPartition = false; Module *HeaderUnit = nullptr; if (Tok.is(tok::header_name)) { @@ -2616,7 +2616,7 @@ Decl *Parser::ParseModuleImport(SourceLocation AtLoc, SourceLocation ColonLoc = ConsumeToken(); if (!getLangOpts().CPlusPlusModules) Diag(ColonLoc, diag::err_unsupported_module_partition) - << SourceRange(ColonLoc, Path.back().second); + << SourceRange(ColonLoc, Path.back().getLoc()); // Recover by leaving partition empty. else if (ParseModuleName(ColonLoc, Path, /*IsImport*/ true)) return nullptr; @@ -2718,10 +2718,9 @@ Decl *Parser::ParseModuleImport(SourceLocation AtLoc, /// module-name-qualifier[opt] identifier /// module-name-qualifier: /// module-name-qualifier[opt] identifier '.' -bool Parser::ParseModuleName( - SourceLocation UseLoc, - SmallVectorImpl> &Path, - bool IsImport) { +bool Parser::ParseModuleName(SourceLocation UseLoc, + SmallVectorImpl &Path, + bool IsImport) { // Parse the module path. while (true) { if (!Tok.is(tok::identifier)) { @@ -2737,7 +2736,7 @@ bool Parser::ParseModuleName( } // Record this part of the module path. 
- Path.push_back(std::make_pair(Tok.getIdentifierInfo(), Tok.getLocation())); + Path.emplace_back(Tok.getLocation(), Tok.getIdentifierInfo()); ConsumeToken(); if (Tok.isNot(tok::period)) diff --git a/clang/lib/Sema/ParsedAttr.cpp b/clang/lib/Sema/ParsedAttr.cpp index b19a02b8c1a09..c149cef478539 100644 --- a/clang/lib/Sema/ParsedAttr.cpp +++ b/clang/lib/Sema/ParsedAttr.cpp @@ -23,14 +23,6 @@ using namespace clang; -IdentifierLoc *IdentifierLoc::create(ASTContext &Ctx, SourceLocation Loc, - IdentifierInfo *Ident) { - IdentifierLoc *Result = new (Ctx) IdentifierLoc; - Result->Loc = Loc; - Result->Ident = Ident; - return Result; -} - size_t ParsedAttr::allocated_size() const { if (IsAvailability) return AttributeFactory::AvailabilityAllocSize; else if (IsTypeTagForDatatype) diff --git a/clang/lib/Sema/SemaARM.cpp b/clang/lib/Sema/SemaARM.cpp index 3f53fb200a93d..5bcbe78e9d633 100644 --- a/clang/lib/Sema/SemaARM.cpp +++ b/clang/lib/Sema/SemaARM.cpp @@ -1178,7 +1178,7 @@ void SemaARM::handleBuiltinAliasAttr(Decl *D, const ParsedAttr &AL) { return; } - IdentifierInfo *Ident = AL.getArgAsIdent(0)->Ident; + IdentifierInfo *Ident = AL.getArgAsIdent(0)->getIdentifierInfo(); unsigned BuiltinID = Ident->getBuiltinID(); StringRef AliasName = cast(D)->getIdentifier()->getName(); diff --git a/clang/lib/Sema/SemaCodeComplete.cpp b/clang/lib/Sema/SemaCodeComplete.cpp index f6ec4cb0f069e..1e4e6fdc78351 100644 --- a/clang/lib/Sema/SemaCodeComplete.cpp +++ b/clang/lib/Sema/SemaCodeComplete.cpp @@ -8718,7 +8718,7 @@ static void AddProtocolResults(DeclContext *Ctx, DeclContext *CurContext, } void SemaCodeCompletion::CodeCompleteObjCProtocolReferences( - ArrayRef Protocols) { + ArrayRef Protocols) { ResultBuilder Results(SemaRef, CodeCompleter->getAllocator(), CodeCompleter->getCodeCompletionTUInfo(), CodeCompletionContext::CCC_ObjCProtocolName); @@ -8729,9 +8729,9 @@ void SemaCodeCompletion::CodeCompleteObjCProtocolReferences( // Tell the result set to ignore all of the protocols we 
have // already seen. // FIXME: This doesn't work when caching code-completion results. - for (const IdentifierLocPair &Pair : Protocols) - if (ObjCProtocolDecl *Protocol = - SemaRef.ObjC().LookupProtocol(Pair.first, Pair.second)) + for (const IdentifierLoc &Pair : Protocols) + if (ObjCProtocolDecl *Protocol = SemaRef.ObjC().LookupProtocol( + Pair.getIdentifierInfo(), Pair.getLoc())) Results.Ignore(Protocol); // Add all protocols. diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index bc891fb009410..7dd20a8795fc9 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -135,13 +135,13 @@ bool Sema::checkStringLiteralArgumentAttr(const ParsedAttr &AL, unsigned ArgNum, // Look for identifiers. If we have one emit a hint to fix it to a literal. if (AL.isArgIdent(ArgNum)) { IdentifierLoc *Loc = AL.getArgAsIdent(ArgNum); - Diag(Loc->Loc, diag::err_attribute_argument_type) + Diag(Loc->getLoc(), diag::err_attribute_argument_type) << AL << AANT_ArgumentString - << FixItHint::CreateInsertion(Loc->Loc, "\"") - << FixItHint::CreateInsertion(getLocForEndOfToken(Loc->Loc), "\""); - Str = Loc->Ident->getName(); + << FixItHint::CreateInsertion(Loc->getLoc(), "\"") + << FixItHint::CreateInsertion(getLocForEndOfToken(Loc->getLoc()), "\""); + Str = Loc->getIdentifierInfo()->getName(); if (ArgLocation) - *ArgLocation = Loc->Loc; + *ArgLocation = Loc->getLoc(); return true; } @@ -768,7 +768,7 @@ static void handleDiagnoseAsBuiltinAttr(Sema &S, Decl *D, auto Union = AL.getArg(Index - 1); if (auto *E = dyn_cast(Union)) return E->getBeginLoc(); - return cast(Union)->Loc; + return cast(Union)->getLoc(); }(); S.Diag(Loc, diag::err_attribute_argument_n_type) << AL << Index << T; @@ -960,10 +960,10 @@ static void handleConsumableAttr(Sema &S, Decl *D, const ParsedAttr &AL) { if (AL.isArgIdent(0)) { IdentifierLoc *IL = AL.getArgAsIdent(0); - if (!ConsumableAttr::ConvertStrToConsumedState(IL->Ident->getName(), - DefaultState)) { - 
S.Diag(IL->Loc, diag::warn_attribute_type_not_supported) << AL - << IL->Ident; + if (!ConsumableAttr::ConvertStrToConsumedState( + IL->getIdentifierInfo()->getName(), DefaultState)) { + S.Diag(IL->getLoc(), diag::warn_attribute_type_not_supported) + << AL << IL->getIdentifierInfo(); return; } } else { @@ -1005,8 +1005,8 @@ static void handleCallableWhenAttr(Sema &S, Decl *D, const ParsedAttr &AL) { SourceLocation Loc; if (AL.isArgIdent(ArgIndex)) { IdentifierLoc *Ident = AL.getArgAsIdent(ArgIndex); - StateString = Ident->Ident->getName(); - Loc = Ident->Loc; + StateString = Ident->getIdentifierInfo()->getName(); + Loc = Ident->getLoc(); } else { if (!S.checkStringLiteralArgumentAttr(AL, ArgIndex, StateString, &Loc)) return; @@ -1030,11 +1030,11 @@ static void handleParamTypestateAttr(Sema &S, Decl *D, const ParsedAttr &AL) { if (AL.isArgIdent(0)) { IdentifierLoc *Ident = AL.getArgAsIdent(0); - StringRef StateString = Ident->Ident->getName(); + StringRef StateString = Ident->getIdentifierInfo()->getName(); if (!ParamTypestateAttr::ConvertStrToConsumedState(StateString, ParamState)) { - S.Diag(Ident->Loc, diag::warn_attribute_type_not_supported) + S.Diag(Ident->getLoc(), diag::warn_attribute_type_not_supported) << AL << StateString; return; } @@ -1064,10 +1064,10 @@ static void handleReturnTypestateAttr(Sema &S, Decl *D, const ParsedAttr &AL) { if (AL.isArgIdent(0)) { IdentifierLoc *IL = AL.getArgAsIdent(0); - if (!ReturnTypestateAttr::ConvertStrToConsumedState(IL->Ident->getName(), - ReturnState)) { - S.Diag(IL->Loc, diag::warn_attribute_type_not_supported) << AL - << IL->Ident; + if (!ReturnTypestateAttr::ConvertStrToConsumedState( + IL->getIdentifierInfo()->getName(), ReturnState)) { + S.Diag(IL->getLoc(), diag::warn_attribute_type_not_supported) + << AL << IL->getIdentifierInfo(); return; } } else { @@ -1111,10 +1111,10 @@ static void handleSetTypestateAttr(Sema &S, Decl *D, const ParsedAttr &AL) { SetTypestateAttr::ConsumedState NewState; if (AL.isArgIdent(0)) { 
IdentifierLoc *Ident = AL.getArgAsIdent(0); - StringRef Param = Ident->Ident->getName(); + StringRef Param = Ident->getIdentifierInfo()->getName(); if (!SetTypestateAttr::ConvertStrToConsumedState(Param, NewState)) { - S.Diag(Ident->Loc, diag::warn_attribute_type_not_supported) << AL - << Param; + S.Diag(Ident->getLoc(), diag::warn_attribute_type_not_supported) + << AL << Param; return; } } else { @@ -1133,10 +1133,10 @@ static void handleTestTypestateAttr(Sema &S, Decl *D, const ParsedAttr &AL) { TestTypestateAttr::ConsumedState TestState; if (AL.isArgIdent(0)) { IdentifierLoc *Ident = AL.getArgAsIdent(0); - StringRef Param = Ident->Ident->getName(); + StringRef Param = Ident->getIdentifierInfo()->getName(); if (!TestTypestateAttr::ConvertStrToConsumedState(Param, TestState)) { - S.Diag(Ident->Loc, diag::warn_attribute_type_not_supported) << AL - << Param; + S.Diag(Ident->getLoc(), diag::warn_attribute_type_not_supported) + << AL << Param; return; } } else { @@ -1497,7 +1497,7 @@ static void handleOwnershipAttr(Sema &S, Decl *D, const ParsedAttr &AL) { return; } - IdentifierInfo *Module = AL.getArgAsIdent(0)->Ident; + IdentifierInfo *Module = AL.getArgAsIdent(0)->getIdentifierInfo(); StringRef ModuleName = Module->getName(); if (normalizeName(ModuleName)) { @@ -1864,10 +1864,10 @@ static void handleCPUSpecificAttr(Sema &S, Decl *D, const ParsedAttr &AL) { } IdentifierLoc *CPUArg = AL.getArgAsIdent(ArgNo); - StringRef CPUName = CPUArg->Ident->getName().trim(); + StringRef CPUName = CPUArg->getIdentifierInfo()->getName().trim(); if (!S.Context.getTargetInfo().validateCPUSpecificCPUDispatch(CPUName)) { - S.Diag(CPUArg->Loc, diag::err_invalid_cpu_specific_dispatch_value) + S.Diag(CPUArg->getLoc(), diag::err_invalid_cpu_specific_dispatch_value) << CPUName << (AL.getKind() == ParsedAttr::AT_CPUDispatch); return; } @@ -1880,7 +1880,7 @@ static void handleCPUSpecificAttr(Sema &S, Decl *D, const ParsedAttr &AL) { S.Diag(AL.getLoc(), 
diag::warn_multiversion_duplicate_entries); return; } - CPUs.push_back(CPUArg->Ident); + CPUs.push_back(CPUArg->getIdentifierInfo()); } FD->setIsMultiVersion(true); @@ -2358,10 +2358,10 @@ static void handleAvailabilityAttr(Sema &S, Decl *D, const ParsedAttr &AL) { return; IdentifierLoc *Platform = AL.getArgAsIdent(0); - IdentifierInfo *II = Platform->Ident; + IdentifierInfo *II = Platform->getIdentifierInfo(); if (AvailabilityAttr::getPrettyPlatformName(II->getName()).empty()) - S.Diag(Platform->Loc, diag::warn_availability_unknown_platform) - << Platform->Ident; + S.Diag(Platform->getLoc(), diag::warn_availability_unknown_platform) + << Platform->getIdentifierInfo(); auto *ND = dyn_cast(D); if (!ND) // We warned about this already, so just return. @@ -2410,14 +2410,16 @@ static void handleAvailabilityAttr(Sema &S, Decl *D, const ParsedAttr &AL) { IdentifierInfo *IIEnvironment = nullptr; if (EnvironmentLoc) { if (S.getLangOpts().HLSL) { - IIEnvironment = EnvironmentLoc->Ident; + IIEnvironment = EnvironmentLoc->getIdentifierInfo(); if (AvailabilityAttr::getEnvironmentType( - EnvironmentLoc->Ident->getName()) == + EnvironmentLoc->getIdentifierInfo()->getName()) == llvm::Triple::EnvironmentType::UnknownEnvironment) - S.Diag(EnvironmentLoc->Loc, diag::warn_availability_unknown_environment) - << EnvironmentLoc->Ident; + S.Diag(EnvironmentLoc->getLoc(), + diag::warn_availability_unknown_environment) + << EnvironmentLoc->getIdentifierInfo(); } else { - S.Diag(EnvironmentLoc->Loc, diag::err_availability_unexpected_parameter) + S.Diag(EnvironmentLoc->getLoc(), + diag::err_availability_unexpected_parameter) << "environment" << /* C/C++ */ 1; } } @@ -3630,7 +3632,7 @@ static void handleEnumExtensibilityAttr(Sema &S, Decl *D, } EnumExtensibilityAttr::Kind ExtensibilityKind; - IdentifierInfo *II = AL.getArgAsIdent(0)->Ident; + IdentifierInfo *II = AL.getArgAsIdent(0)->getIdentifierInfo(); if (!EnumExtensibilityAttr::ConvertStrToKind(II->getName(), ExtensibilityKind)) { 
S.Diag(AL.getLoc(), diag::warn_attribute_type_not_supported) << AL << II; @@ -3853,7 +3855,7 @@ static bool handleFormatAttrCommon(Sema &S, Decl *D, const ParsedAttr &AL, bool HasImplicitThisParam = isInstanceMethod(D); Info->NumArgs = getFunctionOrMethodNumParams(D) + HasImplicitThisParam; - Info->Identifier = AL.getArgAsIdent(0)->Ident; + Info->Identifier = AL.getArgAsIdent(0)->getIdentifierInfo(); StringRef Format = Info->Identifier->getName(); if (normalizeName(Format)) { @@ -4017,14 +4019,14 @@ static void handleCallbackAttr(Sema &S, Decl *D, const ParsedAttr &AL) { if (AL.isArgIdent(I)) { IdentifierLoc *IdLoc = AL.getArgAsIdent(I); - auto It = NameIdxMapping.find(IdLoc->Ident->getName()); + auto It = NameIdxMapping.find(IdLoc->getIdentifierInfo()->getName()); if (It == UnknownName) { S.Diag(AL.getLoc(), diag::err_callback_attribute_argument_unknown) - << IdLoc->Ident << IdLoc->Loc; + << IdLoc->getIdentifierInfo() << IdLoc->getLoc(); return; } - SR = SourceRange(IdLoc->Loc); + SR = SourceRange(IdLoc->getLoc()); ArgIdx = It->second; } else if (AL.isArgExpr(I)) { Expr *IdxExpr = AL.getArgAsExpr(I); @@ -4142,13 +4144,14 @@ LifetimeCaptureByAttr *Sema::ParseLifetimeCaptureByAttr(const ParsedAttr &AL, } assert(AL.isArgIdent(I)); IdentifierLoc *IdLoc = AL.getArgAsIdent(I); - if (IdLoc->Ident->getName() == ParamName) { - Diag(IdLoc->Loc, diag::err_capture_by_references_itself) << IdLoc->Loc; + if (IdLoc->getIdentifierInfo()->getName() == ParamName) { + Diag(IdLoc->getLoc(), diag::err_capture_by_references_itself) + << IdLoc->getLoc(); IsValid = false; continue; } - ParamIdents[I] = IdLoc->Ident; - ParamLocs[I] = IdLoc->Loc; + ParamIdents[I] = IdLoc->getIdentifierInfo(); + ParamLocs[I] = IdLoc->getLoc(); } if (!IsValid) return nullptr; @@ -4754,7 +4757,7 @@ static void handleModeAttr(Sema &S, Decl *D, const ParsedAttr &AL) { return; } - IdentifierInfo *Name = AL.getArgAsIdent(0)->Ident; + IdentifierInfo *Name = AL.getArgAsIdent(0)->getIdentifierInfo(); 
S.AddModeAttr(D, AL, Name); } @@ -5727,8 +5730,8 @@ static void handleArgumentWithTypeTagAttr(Sema &S, Decl *D, } D->addAttr(::new (S.Context) ArgumentWithTypeTagAttr( - S.Context, AL, AL.getArgAsIdent(0)->Ident, ArgumentIdx, TypeTagIdx, - IsPointer)); + S.Context, AL, AL.getArgAsIdent(0)->getIdentifierInfo(), ArgumentIdx, + TypeTagIdx, IsPointer)); } static void handleTypeTagForDatatypeAttr(Sema &S, Decl *D, @@ -5748,7 +5751,7 @@ static void handleTypeTagForDatatypeAttr(Sema &S, Decl *D, return; } - IdentifierInfo *PointerKind = AL.getArgAsIdent(0)->Ident; + IdentifierInfo *PointerKind = AL.getArgAsIdent(0)->getIdentifierInfo(); TypeSourceInfo *MatchingCTypeLoc = nullptr; S.GetTypeFromParser(AL.getMatchingCType(), &MatchingCTypeLoc); assert(MatchingCTypeLoc && "no type source info for attribute argument"); @@ -5819,7 +5822,7 @@ static void handleBuiltinAliasAttr(Sema &S, Decl *D, const ParsedAttr &AL) { return; } - IdentifierInfo *Ident = AL.getArgAsIdent(0)->Ident; + IdentifierInfo *Ident = AL.getArgAsIdent(0)->getIdentifierInfo(); unsigned BuiltinID = Ident->getBuiltinID(); StringRef AliasName = cast(D)->getIdentifier()->getName(); @@ -6585,7 +6588,7 @@ static void handleCFGuardAttr(Sema &S, Decl *D, const ParsedAttr &AL) { } CFGuardAttr::GuardArg Arg; - IdentifierInfo *II = AL.getArgAsIdent(0)->Ident; + IdentifierInfo *II = AL.getArgAsIdent(0)->getIdentifierInfo(); if (!CFGuardAttr::ConvertStrToGuardArg(II->getName(), Arg)) { S.Diag(AL.getLoc(), diag::warn_attribute_type_not_supported) << AL << II; return; @@ -6687,8 +6690,9 @@ static void handleVTablePointerAuthentication(Sema &S, Decl *D, if (AL.isArgIdent(0)) { IdentifierLoc *IL = AL.getArgAsIdent(0); if (!VTablePointerAuthenticationAttr::ConvertStrToVPtrAuthKeyType( - IL->Ident->getName(), KeyType)) { - S.Diag(IL->Loc, diag::err_invalid_authentication_key) << IL->Ident; + IL->getIdentifierInfo()->getName(), KeyType)) { + S.Diag(IL->getLoc(), diag::err_invalid_authentication_key) + << 
IL->getIdentifierInfo(); AL.setInvalid(); } if (KeyType == VTablePointerAuthenticationAttr::DefaultKey && @@ -6708,15 +6712,16 @@ static void handleVTablePointerAuthentication(Sema &S, Decl *D, if (AL.isArgIdent(1)) { IdentifierLoc *IL = AL.getArgAsIdent(1); if (!VTablePointerAuthenticationAttr:: - ConvertStrToAddressDiscriminationMode(IL->Ident->getName(), - AddressDiversityMode)) { - S.Diag(IL->Loc, diag::err_invalid_address_discrimination) << IL->Ident; + ConvertStrToAddressDiscriminationMode( + IL->getIdentifierInfo()->getName(), AddressDiversityMode)) { + S.Diag(IL->getLoc(), diag::err_invalid_address_discrimination) + << IL->getIdentifierInfo(); AL.setInvalid(); } if (AddressDiversityMode == VTablePointerAuthenticationAttr::DefaultAddressDiscrimination && !S.getLangOpts().PointerAuthCalls) { - S.Diag(IL->Loc, diag::err_no_default_vtable_pointer_auth) << 1; + S.Diag(IL->getLoc(), diag::err_no_default_vtable_pointer_auth) << 1; AL.setInvalid(); } } else { @@ -6731,8 +6736,9 @@ static void handleVTablePointerAuthentication(Sema &S, Decl *D, if (AL.isArgIdent(2)) { IdentifierLoc *IL = AL.getArgAsIdent(2); if (!VTablePointerAuthenticationAttr::ConvertStrToExtraDiscrimination( - IL->Ident->getName(), ED)) { - S.Diag(IL->Loc, diag::err_invalid_extra_discrimination) << IL->Ident; + IL->getIdentifierInfo()->getName(), ED)) { + S.Diag(IL->getLoc(), diag::err_invalid_extra_discrimination) + << IL->getIdentifierInfo(); AL.setInvalid(); } if (ED == VTablePointerAuthenticationAttr::DefaultExtraDiscrimination && diff --git a/clang/lib/Sema/SemaDeclObjC.cpp b/clang/lib/Sema/SemaDeclObjC.cpp index ba9d3dcf19617..0a14ce23a396e 100644 --- a/clang/lib/Sema/SemaDeclObjC.cpp +++ b/clang/lib/Sema/SemaDeclObjC.cpp @@ -1310,24 +1310,26 @@ static bool NestedProtocolHasNoDefinition(ObjCProtocolDecl *PDecl, /// protocol declarations in its 'Protocols' argument. 
void SemaObjC::FindProtocolDeclaration(bool WarnOnDeclarations, bool ForObjCContainer, - ArrayRef ProtocolId, + ArrayRef ProtocolId, SmallVectorImpl &Protocols) { - for (const IdentifierLocPair &Pair : ProtocolId) { - ObjCProtocolDecl *PDecl = LookupProtocol(Pair.first, Pair.second); + for (const IdentifierLoc &Pair : ProtocolId) { + ObjCProtocolDecl *PDecl = + LookupProtocol(Pair.getIdentifierInfo(), Pair.getLoc()); if (!PDecl) { DeclFilterCCC CCC{}; - TypoCorrection Corrected = - SemaRef.CorrectTypo(DeclarationNameInfo(Pair.first, Pair.second), - Sema::LookupObjCProtocolName, SemaRef.TUScope, - nullptr, CCC, Sema::CTK_ErrorRecovery); + TypoCorrection Corrected = SemaRef.CorrectTypo( + DeclarationNameInfo(Pair.getIdentifierInfo(), Pair.getLoc()), + Sema::LookupObjCProtocolName, SemaRef.TUScope, nullptr, CCC, + Sema::CTK_ErrorRecovery); if ((PDecl = Corrected.getCorrectionDeclAs())) SemaRef.diagnoseTypo(Corrected, PDiag(diag::err_undeclared_protocol_suggest) - << Pair.first); + << Pair.getIdentifierInfo()); } if (!PDecl) { - Diag(Pair.second, diag::err_undeclared_protocol) << Pair.first; + Diag(Pair.getLoc(), diag::err_undeclared_protocol) + << Pair.getIdentifierInfo(); continue; } // If this is a forward protocol declaration, get its definition. @@ -1337,7 +1339,7 @@ void SemaObjC::FindProtocolDeclaration(bool WarnOnDeclarations, // For an objc container, delay protocol reference checking until after we // can set the objc decl as the availability context, otherwise check now. 
if (!ForObjCContainer) { - (void)SemaRef.DiagnoseUseOfDecl(PDecl, Pair.second); + (void)SemaRef.DiagnoseUseOfDecl(PDecl, Pair.getLoc()); } // If this is a forward declaration and we are supposed to warn in this @@ -1347,7 +1349,8 @@ void SemaObjC::FindProtocolDeclaration(bool WarnOnDeclarations, if (WarnOnDeclarations && NestedProtocolHasNoDefinition(PDecl, UndefinedProtocol)) { - Diag(Pair.second, diag::warn_undef_protocolref) << Pair.first; + Diag(Pair.getLoc(), diag::warn_undef_protocolref) + << Pair.getIdentifierInfo(); Diag(UndefinedProtocol->getLocation(), diag::note_protocol_decl_undefined) << UndefinedProtocol; } @@ -1784,17 +1787,17 @@ void SemaObjC::DiagnoseClassExtensionDupMethods(ObjCCategoryDecl *CAT, /// ActOnForwardProtocolDeclaration - Handle \@protocol foo; SemaObjC::DeclGroupPtrTy SemaObjC::ActOnForwardProtocolDeclaration( - SourceLocation AtProtocolLoc, ArrayRef IdentList, + SourceLocation AtProtocolLoc, ArrayRef IdentList, const ParsedAttributesView &attrList) { ASTContext &Context = getASTContext(); SmallVector DeclsInGroup; - for (const IdentifierLocPair &IdentPair : IdentList) { - IdentifierInfo *Ident = IdentPair.first; + for (const IdentifierLoc &IdentPair : IdentList) { + IdentifierInfo *Ident = IdentPair.getIdentifierInfo(); ObjCProtocolDecl *PrevDecl = LookupProtocol( - Ident, IdentPair.second, SemaRef.forRedeclarationInCurContext()); + Ident, IdentPair.getLoc(), SemaRef.forRedeclarationInCurContext()); ObjCProtocolDecl *PDecl = ObjCProtocolDecl::Create(Context, SemaRef.CurContext, Ident, - IdentPair.second, AtProtocolLoc, PrevDecl); + IdentPair.getLoc(), AtProtocolLoc, PrevDecl); SemaRef.PushOnScopeChains(PDecl, SemaRef.TUScope); CheckObjCDeclScope(PDecl); diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 0b442b75d174d..11f156ae09216 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -1274,8 +1274,8 @@ bool SemaHLSL::handleResourceTypeAttr(QualType T, const ParsedAttr &AL) { } 
IdentifierLoc *Loc = AL.getArgAsIdent(0); - StringRef Identifier = Loc->Ident->getName(); - SourceLocation ArgLoc = Loc->Loc; + StringRef Identifier = Loc->getIdentifierInfo()->getName(); + SourceLocation ArgLoc = Loc->getLoc(); // Validate resource class value ResourceClass RC; @@ -1534,8 +1534,8 @@ void SemaHLSL::handleResourceBindingAttr(Decl *TheDecl, const ParsedAttr &AL) { } IdentifierLoc *Loc = AL.getArgAsIdent(0); - StringRef Str = Loc->Ident->getName(); - SourceLocation ArgLoc = Loc->Loc; + StringRef Str = Loc->getIdentifierInfo()->getName(); + SourceLocation ArgLoc = Loc->getLoc(); SourceLocation SpaceArgLoc; bool SpecifiedSpace = false; @@ -1549,8 +1549,8 @@ void SemaHLSL::handleResourceBindingAttr(Decl *TheDecl, const ParsedAttr &AL) { } IdentifierLoc *Loc = AL.getArgAsIdent(1); - Space = Loc->Ident->getName(); - SpaceArgLoc = Loc->Loc; + Space = Loc->getIdentifierInfo()->getName(); + SpaceArgLoc = Loc->getLoc(); } else { Slot = Str; } diff --git a/clang/lib/Sema/SemaModule.cpp b/clang/lib/Sema/SemaModule.cpp index 76589bff40be9..4bba57193ded6 100644 --- a/clang/lib/Sema/SemaModule.cpp +++ b/clang/lib/Sema/SemaModule.cpp @@ -15,6 +15,7 @@ #include "clang/AST/ASTMutationListener.h" #include "clang/Lex/HeaderSearch.h" #include "clang/Lex/Preprocessor.h" +#include "clang/Sema/ParsedAttr.h" #include "clang/Sema/SemaInternal.h" #include "llvm/ADT/StringExtras.h" @@ -68,7 +69,7 @@ static std::string stringFromPath(ModuleIdPath Path) { for (auto &Piece : Path) { if (!Name.empty()) Name += "."; - Name += Piece.first->getName(); + Name += Piece.getIdentifierInfo()->getName(); } return Name; } @@ -350,17 +351,18 @@ Sema::ActOnModuleDecl(SourceLocation StartLoc, SourceLocation ModuleLoc, // Test the first part of the path to see if it's std[0-9]+ but allow the // name in a system header. 
- StringRef FirstComponentName = Path[0].first->getName(); - if (!getSourceManager().isInSystemHeader(Path[0].second) && + StringRef FirstComponentName = Path[0].getIdentifierInfo()->getName(); + if (!getSourceManager().isInSystemHeader(Path[0].getLoc()) && (FirstComponentName == "std" || (FirstComponentName.starts_with("std") && llvm::all_of(FirstComponentName.drop_front(3), &llvm::isDigit)))) - Diag(Path[0].second, diag::warn_reserved_module_name) << Path[0].first; + Diag(Path[0].getLoc(), diag::warn_reserved_module_name) + << Path[0].getIdentifierInfo(); // Then test all of the components in the path to see if any of them are // using another kind of reserved or invalid identifier. for (auto Part : Path) { - if (DiagReservedModuleName(*this, Part.first, Part.second)) + if (DiagReservedModuleName(*this, Part.getIdentifierInfo(), Part.getLoc())) return nullptr; } @@ -376,10 +378,10 @@ Sema::ActOnModuleDecl(SourceLocation StartLoc, SourceLocation ModuleLoc, // correct. if (!getLangOpts().CurrentModule.empty() && getLangOpts().CurrentModule != ModuleName) { - Diag(Path.front().second, diag::err_current_module_name_mismatch) - << SourceRange(Path.front().second, IsPartition - ? Partition.back().second - : Path.back().second) + Diag(Path.front().getLoc(), diag::err_current_module_name_mismatch) + << SourceRange(Path.front().getLoc(), IsPartition + ? Partition.back().getLoc() + : Path.back().getLoc()) << getLangOpts().CurrentModule; return nullptr; } @@ -394,7 +396,7 @@ Sema::ActOnModuleDecl(SourceLocation StartLoc, SourceLocation ModuleLoc, // We can't have parsed or imported a definition of this module or parsed a // module map defining it already. 
if (auto *M = Map.findModule(ModuleName)) { - Diag(Path[0].second, diag::err_module_redefinition) << ModuleName; + Diag(Path[0].getLoc(), diag::err_module_redefinition) << ModuleName; if (M->DefinitionLoc.isValid()) Diag(M->DefinitionLoc, diag::note_prev_module_definition); else if (OptionalFileEntryRef FE = M->getASTFile()) @@ -417,8 +419,8 @@ Sema::ActOnModuleDecl(SourceLocation StartLoc, SourceLocation ModuleLoc, // keyword nor a module-partition implicitly imports the primary // module interface unit of the module as if by a module-import- // declaration. - std::pair ModuleNameLoc( - PP.getIdentifierInfo(ModuleName), Path[0].second); + IdentifierLoc ModuleNameLoc(Path[0].getLoc(), + PP.getIdentifierInfo(ModuleName)); // The module loader will assume we're trying to import the module that // we're building if `LangOpts.CurrentModule` equals to 'ModuleName'. @@ -490,7 +492,7 @@ Sema::ActOnModuleDecl(SourceLocation StartLoc, SourceLocation ModuleLoc, // Make the import decl for the interface in the impl module. ImportDecl *Import = ImportDecl::Create(Context, CurContext, ModuleLoc, - Interface, Path[0].second); + Interface, Path[0].getLoc()); CurContext->addDecl(Import); // Sequence initialization of the imported module before that of the current @@ -579,7 +581,7 @@ DeclResult Sema::ActOnModuleImport(SourceLocation StartLoc, // For a C++20 module name, flatten into a single identifier with the source // location of the first component. 
- std::pair ModuleNameLoc; + IdentifierLoc ModuleNameLoc; std::string ModuleName; if (IsPartition) { @@ -591,11 +593,13 @@ DeclResult Sema::ActOnModuleImport(SourceLocation StartLoc, ModuleName = NamedMod->getPrimaryModuleInterfaceName().str(); ModuleName += ":"; ModuleName += stringFromPath(Path); - ModuleNameLoc = {PP.getIdentifierInfo(ModuleName), Path[0].second}; + ModuleNameLoc = + IdentifierLoc(Path[0].getLoc(), PP.getIdentifierInfo(ModuleName)); Path = ModuleIdPath(ModuleNameLoc); } else if (getLangOpts().CPlusPlusModules) { ModuleName = stringFromPath(Path); - ModuleNameLoc = {PP.getIdentifierInfo(ModuleName), Path[0].second}; + ModuleNameLoc = + IdentifierLoc(Path[0].getLoc(), PP.getIdentifierInfo(ModuleName)); Path = ModuleIdPath(ModuleNameLoc); } @@ -680,7 +684,7 @@ DeclResult Sema::ActOnModuleImport(SourceLocation StartLoc, IdentifierLocs.push_back(SourceLocation()); } else if (getLangOpts().CPlusPlusModules && !Mod->Parent) { // A single identifier for the whole name. - IdentifierLocs.push_back(Path[0].second); + IdentifierLocs.push_back(Path[0].getLoc()); } else { Module *ModCheck = Mod; for (unsigned I = 0, N = Path.size(); I != N; ++I) { @@ -690,7 +694,7 @@ DeclResult Sema::ActOnModuleImport(SourceLocation StartLoc, break; ModCheck = ModCheck->Parent; - IdentifierLocs.push_back(Path[I].second); + IdentifierLocs.push_back(Path[I].getLoc()); } } @@ -707,7 +711,7 @@ DeclResult Sema::ActOnModuleImport(SourceLocation StartLoc, if (getLangOpts().CPlusPlusModules && ExportLoc.isValid() && Mod->Kind == Module::ModuleKind::ModulePartitionImplementation) { Diag(ExportLoc, diag::err_export_partition_impl) - << SourceRange(ExportLoc, Path.back().second); + << SourceRange(ExportLoc, Path.back().getLoc()); } else if (!ModuleScopes.empty() && !currentModuleIsImplementation()) { // Re-export the module if the imported module is exported. 
// Note that we don't need to add re-exported module to Imports field diff --git a/clang/lib/Sema/SemaObjC.cpp b/clang/lib/Sema/SemaObjC.cpp index 073d9791d037b..9b24b5f052119 100644 --- a/clang/lib/Sema/SemaObjC.cpp +++ b/clang/lib/Sema/SemaObjC.cpp @@ -1446,10 +1446,8 @@ SemaObjC::ObjCSubscriptKind SemaObjC::CheckSubscriptingKind(Expr *FromE) { void SemaObjC::AddCFAuditedAttribute(Decl *D) { ASTContext &Context = getASTContext(); - IdentifierInfo *Ident; - SourceLocation Loc; - std::tie(Ident, Loc) = SemaRef.PP.getPragmaARCCFCodeAuditedInfo(); - if (!Loc.isValid()) + auto IdLoc = SemaRef.PP.getPragmaARCCFCodeAuditedInfo(); + if (!IdLoc.getLoc().isValid()) return; // Don't add a redundant or conflicting attribute. @@ -1457,7 +1455,8 @@ void SemaObjC::AddCFAuditedAttribute(Decl *D) { D->hasAttr()) return; - AttributeCommonInfo Info(Ident, SourceRange(Loc), + AttributeCommonInfo Info(IdLoc.getIdentifierInfo(), + SourceRange(IdLoc.getLoc()), AttributeCommonInfo::Form::Pragma()); D->addAttr(CFAuditedTransferAttr::CreateImplicit(Context, Info)); } @@ -1642,8 +1641,10 @@ void SemaObjC::handleMethodFamilyAttr(Decl *D, const ParsedAttr &AL) { IdentifierLoc *IL = AL.getArgAsIdent(0); ObjCMethodFamilyAttr::FamilyKind F; - if (!ObjCMethodFamilyAttr::ConvertStrToFamilyKind(IL->Ident->getName(), F)) { - Diag(IL->Loc, diag::warn_attribute_type_not_supported) << AL << IL->Ident; + if (!ObjCMethodFamilyAttr::ConvertStrToFamilyKind( + IL->getIdentifierInfo()->getName(), F)) { + Diag(IL->getLoc(), diag::warn_attribute_type_not_supported) + << AL << IL->getIdentifierInfo(); return; } @@ -1706,7 +1707,7 @@ void SemaObjC::handleBlocksAttr(Decl *D, const ParsedAttr &AL) { return; } - IdentifierInfo *II = AL.getArgAsIdent(0)->Ident; + IdentifierInfo *II = AL.getArgAsIdent(0)->getIdentifierInfo(); BlocksAttr::BlockType type; if (!BlocksAttr::ConvertStrToBlockType(II->getName(), type)) { Diag(AL.getLoc(), diag::warn_attribute_type_not_supported) << AL << II; @@ -1998,7 +1999,7 @@ void 
SemaObjC::handleNSErrorDomain(Decl *D, const ParsedAttr &Attr) { IdentifierLoc *IdentLoc = Attr.isArgIdent(0) ? Attr.getArgAsIdent(0) : nullptr; - if (!IdentLoc || !IdentLoc->Ident) { + if (!IdentLoc || !IdentLoc->getIdentifierInfo()) { // Try to locate the argument directly. SourceLocation Loc = Attr.getLoc(); if (Attr.isArgExpr(0) && Attr.getArgAsExpr(0)) @@ -2009,18 +2010,18 @@ void SemaObjC::handleNSErrorDomain(Decl *D, const ParsedAttr &Attr) { } // Verify that the identifier is a valid decl in the C decl namespace. - LookupResult Result(SemaRef, DeclarationName(IdentLoc->Ident), + LookupResult Result(SemaRef, DeclarationName(IdentLoc->getIdentifierInfo()), SourceLocation(), Sema::LookupNameKind::LookupOrdinaryName); if (!SemaRef.LookupName(Result, SemaRef.TUScope) || !Result.getAsSingle()) { - Diag(IdentLoc->Loc, diag::err_nserrordomain_invalid_decl) - << 1 << IdentLoc->Ident; + Diag(IdentLoc->getLoc(), diag::err_nserrordomain_invalid_decl) + << 1 << IdentLoc->getIdentifierInfo(); return; } - D->addAttr(::new (getASTContext()) - NSErrorDomainAttr(getASTContext(), Attr, IdentLoc->Ident)); + D->addAttr(::new (getASTContext()) NSErrorDomainAttr( + getASTContext(), Attr, IdentLoc->getIdentifierInfo())); } void SemaObjC::handleBridgeAttr(Decl *D, const ParsedAttr &AL) { @@ -2033,7 +2034,7 @@ void SemaObjC::handleBridgeAttr(Decl *D, const ParsedAttr &AL) { // Typedefs only allow objc_bridge(id) and have some additional checking. 
if (const auto *TD = dyn_cast(D)) { - if (!Parm->Ident->isStr("id")) { + if (!Parm->getIdentifierInfo()->isStr("id")) { Diag(AL.getLoc(), diag::err_objc_attr_typedef_not_id) << AL; return; } @@ -2046,8 +2047,8 @@ void SemaObjC::handleBridgeAttr(Decl *D, const ParsedAttr &AL) { } } - D->addAttr(::new (getASTContext()) - ObjCBridgeAttr(getASTContext(), AL, Parm->Ident)); + D->addAttr(::new (getASTContext()) ObjCBridgeAttr(getASTContext(), AL, + Parm->getIdentifierInfo())); } void SemaObjC::handleBridgeMutableAttr(Decl *D, const ParsedAttr &AL) { @@ -2058,21 +2059,21 @@ void SemaObjC::handleBridgeMutableAttr(Decl *D, const ParsedAttr &AL) { return; } - D->addAttr(::new (getASTContext()) - ObjCBridgeMutableAttr(getASTContext(), AL, Parm->Ident)); + D->addAttr(::new (getASTContext()) ObjCBridgeMutableAttr( + getASTContext(), AL, Parm->getIdentifierInfo())); } void SemaObjC::handleBridgeRelatedAttr(Decl *D, const ParsedAttr &AL) { IdentifierInfo *RelatedClass = - AL.isArgIdent(0) ? AL.getArgAsIdent(0)->Ident : nullptr; + AL.isArgIdent(0) ? AL.getArgAsIdent(0)->getIdentifierInfo() : nullptr; if (!RelatedClass) { Diag(D->getBeginLoc(), diag::err_objc_attr_not_id) << AL << 0; return; } IdentifierInfo *ClassMethod = - AL.getArgAsIdent(1) ? AL.getArgAsIdent(1)->Ident : nullptr; + AL.getArgAsIdent(1) ? AL.getArgAsIdent(1)->getIdentifierInfo() : nullptr; IdentifierInfo *InstanceMethod = - AL.getArgAsIdent(2) ? AL.getArgAsIdent(2)->Ident : nullptr; + AL.getArgAsIdent(2) ? 
AL.getArgAsIdent(2)->getIdentifierInfo() : nullptr; D->addAttr(::new (getASTContext()) ObjCBridgeRelatedAttr( getASTContext(), AL, RelatedClass, ClassMethod, InstanceMethod)); } diff --git a/clang/lib/Sema/SemaOpenACCClause.cpp b/clang/lib/Sema/SemaOpenACCClause.cpp index ab25dcfd1a081..049baead031a1 100644 --- a/clang/lib/Sema/SemaOpenACCClause.cpp +++ b/clang/lib/Sema/SemaOpenACCClause.cpp @@ -1343,7 +1343,7 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitDeviceTypeClause( // the limitation, since the Dialect requires this. if (Clause.getDirectiveKind() == OpenACCDirectiveKind::Set && Clause.getDeviceTypeArchitectures().size() > 1) { - SemaRef.Diag(Clause.getDeviceTypeArchitectures()[1].second, + SemaRef.Diag(Clause.getDeviceTypeArchitectures()[1].getLoc(), diag::err_acc_device_type_multiple_archs); return nullptr; } @@ -1369,16 +1369,17 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitDeviceTypeClause( bool Diagnosed = false; auto FilterPred = [&](const DeviceTypeArgument &Arch) { // The '*' case. 
- if (!Arch.first) + if (!Arch.getIdentifierInfo()) return false; return llvm::find_if(ValidValues, [&](StringRef RHS) { - return Arch.first->getName().equals_insensitive(RHS); + return Arch.getIdentifierInfo()->getName().equals_insensitive(RHS); }) == ValidValues.end(); }; auto Diagnose = [&](const DeviceTypeArgument &Arch) { - Diagnosed = SemaRef.Diag(Arch.second, diag::err_acc_invalid_default_type) - << Arch.first << Clause.getClauseKind() << ValidValuesString; + Diagnosed = SemaRef.Diag(Arch.getLoc(), diag::err_acc_invalid_default_type) + << Arch.getIdentifierInfo() << Clause.getClauseKind() + << ValidValuesString; }; // There aren't stable enumertor versions of 'for-each-then-erase', so do it diff --git a/clang/lib/Sema/SemaStmtAttr.cpp b/clang/lib/Sema/SemaStmtAttr.cpp index 2f719c6d7a21e..a09626c3a9a8c 100644 --- a/clang/lib/Sema/SemaStmtAttr.cpp +++ b/clang/lib/Sema/SemaStmtAttr.cpp @@ -79,9 +79,10 @@ static Attr *handleLoopHintAttr(Sema &S, Stmt *St, const ParsedAttr &A, Expr *ValueExpr = A.getArgAsExpr(3); StringRef PragmaName = - llvm::StringSwitch(PragmaNameLoc->Ident->getName()) + llvm::StringSwitch( + PragmaNameLoc->getIdentifierInfo()->getName()) .Cases("unroll", "nounroll", "unroll_and_jam", "nounroll_and_jam", - PragmaNameLoc->Ident->getName()) + PragmaNameLoc->getIdentifierInfo()->getName()) .Default("clang loop"); // This could be handled automatically by adding a Subjects definition in @@ -127,10 +128,10 @@ static Attr *handleLoopHintAttr(Sema &S, Stmt *St, const ParsedAttr &A, SetHints(LoopHintAttr::UnrollAndJam, LoopHintAttr::Enable); } else { // #pragma clang loop ... 
- assert(OptionLoc && OptionLoc->Ident && + assert(OptionLoc && OptionLoc->getIdentifierInfo() && "Attribute must have valid option info."); Option = llvm::StringSwitch( - OptionLoc->Ident->getName()) + OptionLoc->getIdentifierInfo()->getName()) .Case("vectorize", LoopHintAttr::Vectorize) .Case("vectorize_width", LoopHintAttr::VectorizeWidth) .Case("interleave", LoopHintAttr::Interleave) @@ -144,12 +145,13 @@ static Attr *handleLoopHintAttr(Sema &S, Stmt *St, const ParsedAttr &A, .Case("distribute", LoopHintAttr::Distribute) .Default(LoopHintAttr::Vectorize); if (Option == LoopHintAttr::VectorizeWidth) { - assert((ValueExpr || (StateLoc && StateLoc->Ident)) && + assert((ValueExpr || (StateLoc && StateLoc->getIdentifierInfo())) && "Attribute must have a valid value expression or argument."); if (ValueExpr && S.CheckLoopHintExpr(ValueExpr, St->getBeginLoc(), /*AllowZero=*/false)) return nullptr; - if (StateLoc && StateLoc->Ident && StateLoc->Ident->isStr("scalable")) + if (StateLoc && StateLoc->getIdentifierInfo() && + StateLoc->getIdentifierInfo()->isStr("scalable")) State = LoopHintAttr::ScalableWidth; else State = LoopHintAttr::FixedWidth; @@ -167,14 +169,15 @@ static Attr *handleLoopHintAttr(Sema &S, Stmt *St, const ParsedAttr &A, Option == LoopHintAttr::Unroll || Option == LoopHintAttr::Distribute || Option == LoopHintAttr::PipelineDisabled) { - assert(StateLoc && StateLoc->Ident && "Loop hint must have an argument"); - if (StateLoc->Ident->isStr("disable")) + assert(StateLoc && StateLoc->getIdentifierInfo() && + "Loop hint must have an argument"); + if (StateLoc->getIdentifierInfo()->isStr("disable")) State = LoopHintAttr::Disable; - else if (StateLoc->Ident->isStr("assume_safety")) + else if (StateLoc->getIdentifierInfo()->isStr("assume_safety")) State = LoopHintAttr::AssumeSafety; - else if (StateLoc->Ident->isStr("full")) + else if (StateLoc->getIdentifierInfo()->isStr("full")) State = LoopHintAttr::Full; - else if (StateLoc->Ident->isStr("enable")) + else 
if (StateLoc->getIdentifierInfo()->isStr("enable")) State = LoopHintAttr::Enable; else llvm_unreachable("bad loop hint argument"); @@ -644,8 +647,8 @@ static Attr *handleAtomicAttr(Sema &S, Stmt *St, const ParsedAttr &AL, } IdentifierLoc *Ident = AL.getArgAsIdent(ArgIndex); - OptionString = Ident->Ident->getName(); - Loc = Ident->Loc; + OptionString = Ident->getIdentifierInfo()->getName(); + Loc = Ident->getLoc(); if (!AtomicAttr::ConvertStrToConsumedOption(OptionString, Option)) { S.Diag(Loc, diag::err_attribute_invalid_atomic_argument) << OptionString; return nullptr; diff --git a/clang/lib/Sema/SemaSwift.cpp b/clang/lib/Sema/SemaSwift.cpp index fe72d6c85c37a..4aae855a24b8f 100644 --- a/clang/lib/Sema/SemaSwift.cpp +++ b/clang/lib/Sema/SemaSwift.cpp @@ -148,8 +148,8 @@ void SemaSwift::handleError(Decl *D, const ParsedAttr &AL) { return true; S.Diag(AL.getLoc(), diag::err_attr_swift_error_return_type) - << AL << AL.getArgAsIdent(0)->Ident->getName() << isa(D) - << /*pointer*/ 1; + << AL << AL.getArgAsIdent(0)->getIdentifierInfo()->getName() + << isa(D) << /*pointer*/ 1; return false; }; @@ -159,8 +159,8 @@ void SemaSwift::handleError(Decl *D, const ParsedAttr &AL) { return true; S.Diag(AL.getLoc(), diag::err_attr_swift_error_return_type) - << AL << AL.getArgAsIdent(0)->Ident->getName() << isa(D) - << /*integral*/ 0; + << AL << AL.getArgAsIdent(0)->getIdentifierInfo()->getName() + << isa(D) << /*integral*/ 0; return false; }; @@ -169,10 +169,10 @@ void SemaSwift::handleError(Decl *D, const ParsedAttr &AL) { IdentifierLoc *Loc = AL.getArgAsIdent(0); SwiftErrorAttr::ConventionKind Convention; - if (!SwiftErrorAttr::ConvertStrToConventionKind(Loc->Ident->getName(), - Convention)) { + if (!SwiftErrorAttr::ConvertStrToConventionKind( + Loc->getIdentifierInfo()->getName(), Convention)) { Diag(AL.getLoc(), diag::warn_attribute_type_not_supported) - << AL << Loc->Ident; + << AL << Loc->getIdentifierInfo(); return; } @@ -287,10 +287,10 @@ static void 
checkSwiftAsyncErrorBlock(Sema &S, Decl *D, void SemaSwift::handleAsyncError(Decl *D, const ParsedAttr &AL) { IdentifierLoc *IDLoc = AL.getArgAsIdent(0); SwiftAsyncErrorAttr::ConventionKind ConvKind; - if (!SwiftAsyncErrorAttr::ConvertStrToConventionKind(IDLoc->Ident->getName(), - ConvKind)) { + if (!SwiftAsyncErrorAttr::ConvertStrToConventionKind( + IDLoc->getIdentifierInfo()->getName(), ConvKind)) { Diag(AL.getLoc(), diag::warn_attribute_type_not_supported) - << AL << IDLoc->Ident; + << AL << IDLoc->getIdentifierInfo(); return; } @@ -643,7 +643,7 @@ void SemaSwift::handleNewType(Decl *D, const ParsedAttr &AL) { } SwiftNewTypeAttr::NewtypeKind Kind; - IdentifierInfo *II = AL.getArgAsIdent(0)->Ident; + IdentifierInfo *II = AL.getArgAsIdent(0)->getIdentifierInfo(); if (!SwiftNewTypeAttr::ConvertStrToNewtypeKind(II->getName(), Kind)) { Diag(AL.getLoc(), diag::warn_attribute_type_not_supported) << AL << II; return; @@ -667,7 +667,7 @@ void SemaSwift::handleAsyncAttr(Decl *D, const ParsedAttr &AL) { } SwiftAsyncAttr::Kind Kind; - IdentifierInfo *II = AL.getArgAsIdent(0)->Ident; + IdentifierInfo *II = AL.getArgAsIdent(0)->getIdentifierInfo(); if (!SwiftAsyncAttr::ConvertStrToKind(II->getName(), Kind)) { Diag(AL.getLoc(), diag::err_swift_async_no_access) << AL << II; return; diff --git a/clang/lib/Sema/SemaTemplateVariadic.cpp b/clang/lib/Sema/SemaTemplateVariadic.cpp index dc7e3a0bf8875..87682233c5246 100644 --- a/clang/lib/Sema/SemaTemplateVariadic.cpp +++ b/clang/lib/Sema/SemaTemplateVariadic.cpp @@ -14,6 +14,7 @@ #include "clang/AST/ExprObjC.h" #include "clang/AST/TypeLoc.h" #include "clang/Sema/Lookup.h" +#include "clang/Sema/ParsedAttr.h" #include "clang/Sema/ParsedTemplate.h" #include "clang/Sema/ScopeInfo.h" #include "clang/Sema/Sema.h" @@ -755,7 +756,7 @@ bool Sema::CheckParameterPacksForExpansion( bool &RetainExpansion, UnsignedOrNone &NumExpansions) { ShouldExpand = true; RetainExpansion = false; - std::pair FirstPack; + IdentifierLoc FirstPack; bool 
HaveFirstPack = false; UnsignedOrNone NumPartialExpansions = std::nullopt; SourceLocation PartiallySubstitutedPackLoc; @@ -867,8 +868,7 @@ bool Sema::CheckParameterPacksForExpansion( // This is the first pack we've seen for which we have an argument. // Record it. NumExpansions = NewPackSize; - FirstPack.first = Name; - FirstPack.second = ParmPack.second; + FirstPack = IdentifierLoc(ParmPack.second, Name); HaveFirstPack = true; continue; } @@ -905,9 +905,9 @@ bool Sema::CheckParameterPacksForExpansion( // the same number of arguments specified. if (HaveFirstPack) Diag(EllipsisLoc, diag::err_pack_expansion_length_conflict) - << FirstPack.first << Name << *NumExpansions + << FirstPack.getIdentifierInfo() << Name << *NumExpansions << (LeastNewPackSize != NewPackSize) << LeastNewPackSize - << SourceRange(FirstPack.second) << SourceRange(ParmPack.second); + << SourceRange(FirstPack.getLoc()) << SourceRange(ParmPack.second); else Diag(EllipsisLoc, diag::err_pack_expansion_length_conflict_multilevel) << Name << *NumExpansions << (LeastNewPackSize != NewPackSize) diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index eba7267904fb2..6e7ee8b5506ff 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -99,8 +99,8 @@ static void diagnoseBadTypeAttribute(Sema &S, const ParsedAttr &attr, StringRef name = attr.getAttrName()->getName(); // The GC attributes are usually written with macros; special-case them. - IdentifierInfo *II = attr.isArgIdent(0) ? attr.getArgAsIdent(0)->Ident - : nullptr; + IdentifierInfo *II = + attr.isArgIdent(0) ? 
attr.getArgAsIdent(0)->getIdentifierInfo() : nullptr; if (useExpansionLoc && loc.isMacroID() && II) { if (II->isStr("strong")) { if (S.findMacroSpelling(loc, "__strong")) name = "__strong"; @@ -5732,8 +5732,7 @@ static void transferARCOwnershipToDeclaratorChunk(TypeProcessingState &state, } IdentifierLoc *Arg = new (S.Context) IdentifierLoc; - Arg->Ident = &S.Context.Idents.get(attrStr); - Arg->Loc = SourceLocation(); + Arg->setIdentifierInfo(&S.Context.Idents.get(attrStr)); ArgsUnion Args(Arg); @@ -6633,7 +6632,7 @@ static bool handleObjCOwnershipTypeAttr(TypeProcessingState &state, return true; } - IdentifierInfo *II = attr.getArgAsIdent(0)->Ident; + IdentifierInfo *II = attr.getArgAsIdent(0)->getIdentifierInfo(); Qualifiers::ObjCLifetime lifetime; if (II->isStr("none")) lifetime = Qualifiers::OCL_ExplicitNone; @@ -6811,7 +6810,7 @@ static bool handleObjCGCTypeAttr(TypeProcessingState &state, ParsedAttr &attr, return true; } - IdentifierInfo *II = attr.getArgAsIdent(0)->Ident; + IdentifierInfo *II = attr.getArgAsIdent(0)->getIdentifierInfo(); if (II->isStr("weak")) GCAttr = Qualifiers::Weak; else if (II->isStr("strong")) @@ -7541,7 +7540,7 @@ static Attr *getCCTypeAttr(ASTContext &Ctx, ParsedAttr &Attr) { if (Attr.isArgExpr(0)) Str = cast(Attr.getArgAsExpr(0))->getString(); else - Str = Attr.getArgAsIdent(0)->Ident->getName(); + Str = Attr.getArgAsIdent(0)->getIdentifierInfo()->getName(); PcsAttr::PCSType Type; if (!PcsAttr::ConvertStrToPCSType(Str, Type)) llvm_unreachable("already validated the attribute"); diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 02c31dff620ec..b404015867087 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -12811,7 +12811,7 @@ OpenACCClause *ASTRecordReader::readOpenACCClause() { for (unsigned I = 0; I < NumArchs; ++I) { IdentifierInfo *Ident = readBool() ? 
readIdentifier() : nullptr; SourceLocation Loc = readSourceLocation(); - Archs.emplace_back(Ident, Loc); + Archs.emplace_back(Loc, Ident); } return OpenACCDeviceTypeClause::Create(getContext(), ClauseKind, BeginLoc, diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 95b5718f1d140..4dca0613cb9ae 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -8774,10 +8774,10 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) { writeSourceLocation(DTC->getLParenLoc()); writeUInt32(DTC->getArchitectures().size()); for (const DeviceTypeArgument &Arg : DTC->getArchitectures()) { - writeBool(Arg.first); - if (Arg.first) - AddIdentifierRef(Arg.first); - writeSourceLocation(Arg.second); + writeBool(Arg.getIdentifierInfo()); + if (Arg.getIdentifierInfo()) + AddIdentifierRef(Arg.getIdentifierInfo()); + writeSourceLocation(Arg.getLoc()); } return; } diff --git a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp index 429bf823616da..07856dbdba4b4 100644 --- a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp +++ b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp @@ -664,7 +664,7 @@ void ModuleDepCollectorPP::moduleImport(SourceLocation ImportLoc, const Module *Imported) { if (MDC.ScanInstance.getPreprocessor().isInImportingCXXNamedModules()) { P1689ModuleInfo RequiredModule; - RequiredModule.ModuleName = Path[0].first->getName().str(); + RequiredModule.ModuleName = Path[0].getIdentifierInfo()->getName().str(); RequiredModule.Type = P1689ModuleInfo::ModuleType::NamedCXXModule; MDC.RequiredStdCXXModules.push_back(RequiredModule); return; From af28c9c65a23806a09d7929792df5ed2e9bdf946 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 16 Apr 2025 06:20:34 -0700 Subject: [PATCH 114/710] [SLP]Do not reorder split node operand with reuses, if not possible Need to check if the operand node of 
the split vectorize node has reuses and check if it is possible to build the order for this node to reorder it correctly. Fixes #135912 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 4 +- .../X86/split-vector-operand-with-reuses.ll | 151 ++++++++++++++++++ 2 files changed, 153 insertions(+), 2 deletions(-) create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/split-vector-operand-with-reuses.ll diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index f9acc276f37f9..f97386159d029 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -7479,8 +7479,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { for (const auto &P : Data.first->CombinedEntriesWithIndices) { TreeEntry &OpTE = *VectorizableTree[P.first].get(); OrdersType Order = OpTE.ReorderIndices; - if (Order.empty()) { - if (!OpTE.isGather()) + if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) { + if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty()) continue; const auto BestOrder = getReorderingData(OpTE, /*TopToBottom=*/false, IgnoreReorder); diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-vector-operand-with-reuses.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-vector-operand-with-reuses.ll new file mode 100644 index 0000000000000..dd804663ff121 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-vector-operand-with-reuses.ll @@ -0,0 +1,151 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +define void @test(ptr %p) { +; CHECK-LABEL: define void @test( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ARRAYIDX7_US_I_841:%.*]] = getelementptr i8, ptr [[P]], i64 36 +; CHECK-NEXT: [[ARRAYIDX7_US_I_1261:%.*]] = getelementptr i8, ptr [[P]], i64 52 +; CHECK-NEXT: [[TMP0:%.*]] 
= load i32, ptr [[P]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX7_US_I_1261]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v4i32(<16 x i32> , <4 x i32> [[TMP2]], i64 4) +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX7_US_I_841]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <12 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = call <12 x i32> @llvm.vector.insert.v12i32.v4i32(<12 x i32> [[TMP6]], <4 x i32> [[TMP5]], i64 8) +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x i32> , <16 x i32> [[TMP9]], <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[TMP0]], i32 6 +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = add <16 x i32> [[TMP3]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = srem <16 x i32> [[TMP13]], +; CHECK-NEXT: [[TMP15:%.*]] = or <12 x i32> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = srem <12 x i32> [[TMP15]], +; CHECK-NEXT: br label %[[FOR_COND1_PREHEADER_US_I:.*]] +; CHECK: [[FOR_COND1_PREHEADER_US_I]]: +; CHECK-NEXT: [[A_PROMOTED253537_US_I:%.*]] = phi i32 [ [[OP_RDX8:%.*]], %[[FOR_COND1_PREHEADER_US_I]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP14]]) +; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.v12i32(<12 x i32> [[TMP16]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP18]], [[TMP17]] +; CHECK-NEXT: [[OP_RDX8]] = add i32 [[OP_RDX]], 0 +; CHECK-NEXT: br label 
%[[FOR_COND1_PREHEADER_US_I]] +; +entry: + %arrayidx7.us.i.841 = getelementptr i8, ptr %p, i64 36 + %arrayidx7.us.i.946 = getelementptr i8, ptr %p, i64 40 + %arrayidx7.us.i.1051 = getelementptr i8, ptr %p, i64 44 + %arrayidx7.us.i.1156 = getelementptr i8, ptr %p, i64 48 + %arrayidx7.us.i.1261 = getelementptr i8, ptr %p, i64 52 + %arrayidx7.us.i.1366 = getelementptr i8, ptr %p, i64 56 + %arrayidx7.us.i.1471 = getelementptr i8, ptr %p, i64 60 + %arrayidx7.us.i.1576 = getelementptr i8, ptr %p, i64 64 + %add8.us.i.1.4 = add i32 0, 0 + %rem.us.i.1.4 = srem i32 %add8.us.i.1.4, 1 + %add8.us.i.1.5 = add i32 0, 0 + %rem.us.i.1.5 = srem i32 %add8.us.i.1.5, 1 + %invariant.op91 = add i32 %rem.us.i.1.4, %rem.us.i.1.5 + %add8.us.i.1.6 = add i32 0, 0 + %rem.us.i.1.6 = srem i32 %add8.us.i.1.6, 1 + %invariant.op92 = add i32 %invariant.op91, %rem.us.i.1.6 + %0 = load i32, ptr %arrayidx7.us.i.841, align 4 + %1 = load i32, ptr %arrayidx7.us.i.946, align 4 + %2 = load i32, ptr %arrayidx7.us.i.1051, align 4 + %3 = load i32, ptr %arrayidx7.us.i.1156, align 4 + %4 = load i32, ptr %arrayidx7.us.i.1261, align 4 + %5 = load i32, ptr %arrayidx7.us.i.1366, align 4 + %add8.us.i.7.6 = or i32 %5, 0 + %rem.us.i.7.6 = srem i32 %add8.us.i.7.6, 1 + %6 = load i32, ptr %arrayidx7.us.i.1471, align 4 + %add8.us.i.7.7 = or i32 %6, 0 + %rem.us.i.7.7 = srem i32 %add8.us.i.7.7, 1 + %invariant.op165 = add i32 %rem.us.i.7.6, %rem.us.i.7.7 + %7 = load i32, ptr %arrayidx7.us.i.1576, align 4 + %add8.us.i.7.8 = or i32 %7, 0 + %rem.us.i.7.8 = srem i32 %add8.us.i.7.8, 1 + %invariant.op166 = add i32 %invariant.op165, %rem.us.i.7.8 + %add8.us.i.8 = or i32 %0, 0 + %rem.us.i.8 = srem i32 %add8.us.i.8, 1 + %invariant.op167 = add i32 %invariant.op166, %rem.us.i.8 + %add8.us.i.8.1 = or i32 %1, 0 + %rem.us.i.8.1 = srem i32 %add8.us.i.8.1, 1 + %invariant.op168 = add i32 %invariant.op167, %rem.us.i.8.1 + %add8.us.i.8.2 = or i32 %2, 0 + %rem.us.i.8.2 = srem i32 %add8.us.i.8.2, 1 + %invariant.op169 = add i32 %invariant.op168, 
%rem.us.i.8.2 + %add8.us.i.8.3 = or i32 %3, 0 + %rem.us.i.8.3 = srem i32 %add8.us.i.8.3, 1 + %invariant.op170 = add i32 %invariant.op169, %rem.us.i.8.3 + %add8.us.i.8.4 = or i32 %4, 0 + %rem.us.i.8.4 = srem i32 %add8.us.i.8.4, 1 + %invariant.op171 = add i32 %invariant.op170, %rem.us.i.8.4 + %add8.us.i.8.5 = or i32 %5, 0 + %rem.us.i.8.5 = srem i32 %add8.us.i.8.5, 1 + %invariant.op172 = add i32 %invariant.op171, %rem.us.i.8.5 + %add8.us.i.8.6 = or i32 %6, 0 + %rem.us.i.8.6 = srem i32 %add8.us.i.8.6, 0 + %invariant.op173 = add i32 %invariant.op172, %rem.us.i.8.6 + %add8.us.i.8.7 = or i32 %7, 0 + %rem.us.i.8.7 = srem i32 %add8.us.i.8.7, 0 + %invariant.op174 = add i32 %invariant.op173, %rem.us.i.8.7 + %invariant.op181 = add i32 %invariant.op174, 0 + %invariant.op182 = add i32 %invariant.op181, 0 + %invariant.op183 = add i32 %invariant.op182, 0 + %invariant.op184 = add i32 %invariant.op183, 0 + %invariant.op185 = add i32 %invariant.op184, 0 + %invariant.op186 = add i32 %invariant.op185, 0 + %invariant.op187 = add i32 %invariant.op186, 0 + %invariant.op188 = add i32 %invariant.op187, 0 + %add8.us.i.11.1 = or i32 %4, 0 + %rem.us.i.11.1 = srem i32 %add8.us.i.11.1, 1 + %invariant.op189 = add i32 %invariant.op188, %rem.us.i.11.1 + %add8.us.i.11.2 = add i32 0, 0 + %rem.us.i.11.2 = srem i32 %add8.us.i.11.2, 1 + %invariant.op190 = add i32 %invariant.op189, %rem.us.i.11.2 + %add8.us.i.11.3 = add i32 %6, %2 + %rem.us.i.11.3 = srem i32 %add8.us.i.11.3, 1 + %invariant.op191 = add i32 %invariant.op190, %rem.us.i.11.3 + %add8.us.i.11.4 = add i32 %7, %2 + %rem.us.i.11.4 = srem i32 %add8.us.i.11.4, 1 + %invariant.op192 = add i32 %invariant.op191, %rem.us.i.11.4 + %8 = load i32, ptr %p, align 4 + %add8.us.i.12 = add i32 %4, %8 + %rem.us.i.12 = srem i32 %add8.us.i.12, 1 + %invariant.op193 = add i32 %invariant.op192, %rem.us.i.12 + %add8.us.i.12.1 = add i32 %5, %8 + %rem.us.i.12.1 = srem i32 %add8.us.i.12.1, 1 + %invariant.op194 = add i32 %invariant.op193, %rem.us.i.12.1 + %add8.us.i.12.2 
= add i32 0, 0 + %rem.us.i.12.2 = srem i32 %add8.us.i.12.2, 1 + %invariant.op195 = add i32 %invariant.op194, %rem.us.i.12.2 + %add8.us.i.12.3 = add i32 0, 0 + %rem.us.i.12.3 = srem i32 %add8.us.i.12.3, 1 + %invariant.op196 = add i32 %invariant.op195, %rem.us.i.12.3 + %add8.us.i.13 = add i32 0, 0 + %rem.us.i.13 = srem i32 %add8.us.i.13, 0 + %invariant.op197 = add i32 %invariant.op196, %rem.us.i.13 + %add8.us.i.13.1 = add i32 0, 0 + %rem.us.i.13.1 = srem i32 %add8.us.i.13.1, 0 + %invariant.op198 = add i32 %invariant.op197, %rem.us.i.13.1 + %add8.us.i.13.2 = add i32 0, 0 + %rem.us.i.13.2 = srem i32 %add8.us.i.13.2, 1 + %invariant.op199 = add i32 %invariant.op198, %rem.us.i.13.2 + %add8.us.i.14 = add i32 0, 0 + %rem.us.i.14 = srem i32 %add8.us.i.14, 1 + %invariant.op200 = add i32 %invariant.op199, %rem.us.i.14 + %add8.us.i.14.1 = add i32 0, 0 + %rem.us.i.14.1 = srem i32 %add8.us.i.14.1, 1 + %invariant.op201 = add i32 %invariant.op200, %rem.us.i.14.1 + %add8.us.i.15 = add i32 0, 0 + %rem.us.i.15 = srem i32 %add8.us.i.15, 1 + %invariant.op202 = add i32 %invariant.op201, %rem.us.i.15 + br label %for.cond1.preheader.us.i + +for.cond1.preheader.us.i: + %a.promoted253537.us.i = phi i32 [ %add9.us.i.15.reass, %for.cond1.preheader.us.i ], [ 0, %entry ] + %add9.us.i.15.reass = add i32 %invariant.op92, %invariant.op202 + br label %for.cond1.preheader.us.i +} + From 1bfd44462886b167f0d82e44e6a9856a830c1f8b Mon Sep 17 00:00:00 2001 From: Alex MacLean Date: Wed, 16 Apr 2025 06:48:42 -0700 Subject: [PATCH 115/710] [DAGCombiner] Fold and/or of NaN SETCC (#135645) Fold an AND or OR of two NaN SETCC nodes into a single SETCC where possible. This optimization already exists in InstCombine but adding in here as well can allow for additional folding if more logical operations are exposed. 
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 6 ++ llvm/test/CodeGen/NVPTX/and-or-setcc.ll | 45 +++++++++++++++ llvm/test/CodeGen/X86/and-or-setcc.ll | 57 +++++++++++++++++++ 3 files changed, 108 insertions(+) create mode 100644 llvm/test/CodeGen/NVPTX/and-or-setcc.ll create mode 100644 llvm/test/CodeGen/X86/and-or-setcc.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index d72be359867ca..ab8e18267f3f5 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -6448,6 +6448,12 @@ static SDValue foldAndOrOfSETCC(SDNode *LogicOp, SelectionDAG &DAG) { } } + if (LHS0 == LHS1 && RHS0 == RHS1 && CCL == CCR && + LHS0.getValueType() == RHS0.getValueType() && + ((LogicOp->getOpcode() == ISD::AND && CCL == ISD::SETO) || + (LogicOp->getOpcode() == ISD::OR && CCL == ISD::SETUO))) + return DAG.getSetCC(DL, VT, LHS0, RHS0, CCL); + if (TargetPreference == AndOrSETCCFoldKind::None) return SDValue(); diff --git a/llvm/test/CodeGen/NVPTX/and-or-setcc.ll b/llvm/test/CodeGen/NVPTX/and-or-setcc.ll new file mode 100644 index 0000000000000..21be9df94d553 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/and-or-setcc.ll @@ -0,0 +1,45 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 | %ptxas-verify %} + +target triple = "nvptx64-nvidia-cuda" + +define i1 @and_ord(float %a, float %b) { +; CHECK-LABEL: and_ord( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [and_ord_param_0]; +; CHECK-NEXT: ld.param.f32 %f2, [and_ord_param_1]; +; CHECK-NEXT: setp.num.f32 %p1, %f1, %f2; +; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %c = fcmp ord 
float %a, 0.0 + %d = fcmp ord float %b, 0.0 + %e = and i1 %c, %d + ret i1 %e +} + +define i1 @or_uno(float %a, float %b) { +; CHECK-LABEL: or_uno( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [or_uno_param_0]; +; CHECK-NEXT: ld.param.f32 %f2, [or_uno_param_1]; +; CHECK-NEXT: setp.nan.f32 %p1, %f1, %f2; +; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %c = fcmp uno float %a, 0.0 + %d = fcmp uno float %b, 0.0 + %e = or i1 %c, %d + ret i1 %e +} diff --git a/llvm/test/CodeGen/X86/and-or-setcc.ll b/llvm/test/CodeGen/X86/and-or-setcc.ll new file mode 100644 index 0000000000000..cb8ecca9348e6 --- /dev/null +++ b/llvm/test/CodeGen/X86/and-or-setcc.ll @@ -0,0 +1,57 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 + +define i1 @and_ord(float %a, float %b) { +; X86-LABEL: and_ord: +; X86: # %bb.0: +; X86-NEXT: flds {{[0-9]+}}(%esp) +; X86-NEXT: flds {{[0-9]+}}(%esp) +; X86-NEXT: fucompp +; X86-NEXT: fnstsw %ax +; X86-NEXT: # kill: def $ah killed $ah killed $ax +; X86-NEXT: sahf +; X86-NEXT: setnp %al +; X86-NEXT: retl +; +; X64-LABEL: and_ord: +; X64: # %bb.0: +; X64-NEXT: xorps %xmm2, %xmm2 +; X64-NEXT: cmpordps %xmm2, %xmm1 +; X64-NEXT: cmpordps %xmm2, %xmm0 +; X64-NEXT: andps %xmm1, %xmm0 +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq + %c = fcmp ord float %a, 0.0 + %d = fcmp ord float %b, 0.0 + %e = and i1 %c, %d + ret i1 %e +} + +define i1 @or_uno(float %a, float %b) { +; X86-LABEL: or_uno: +; X86: # %bb.0: +; X86-NEXT: flds {{[0-9]+}}(%esp) +; X86-NEXT: flds {{[0-9]+}}(%esp) +; X86-NEXT: fucompp +; 
X86-NEXT: fnstsw %ax +; X86-NEXT: # kill: def $ah killed $ah killed $ax +; X86-NEXT: sahf +; X86-NEXT: setp %al +; X86-NEXT: retl +; +; X64-LABEL: or_uno: +; X64: # %bb.0: +; X64-NEXT: xorps %xmm2, %xmm2 +; X64-NEXT: cmpunordps %xmm2, %xmm1 +; X64-NEXT: cmpunordps %xmm2, %xmm0 +; X64-NEXT: orps %xmm1, %xmm0 +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq + %c = fcmp uno float %a, 0.0 + %d = fcmp uno float %b, 0.0 + %e = or i1 %c, %d + ret i1 %e +} From 181872ffcc7dc7f20ed2b84e8fa39beba41cb6d3 Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Wed, 16 Apr 2025 10:03:08 -0400 Subject: [PATCH 116/710] [AMDGPU][True16][MC] update a few mc test for true16 (#135816) This is another NFC patch. Update mc test for a few true16 instructions by duplicating the file to fake16 versions and udpate `mattr` flag with +/-real-true16. Also added some fake16 file that are not properly created before --- llvm/test/MC/AMDGPU/bf16_imm-fake16.s | 114 ++++++ llvm/test/MC/AMDGPU/bf16_imm.s | 64 ++-- llvm/test/MC/AMDGPU/gfx11-promotions-fake16.s | 353 ++++++++++++++++++ llvm/test/MC/AMDGPU/gfx11-promotions.s | 66 ++-- .../MC/AMDGPU/gfx1150_asm_features-fake16.s | 48 +++ llvm/test/MC/AMDGPU/gfx1150_asm_features.s | 20 +- .../MC/AMDGPU/gfx11_asm_vinterp_err-fake16.s | 43 +++ llvm/test/MC/AMDGPU/gfx11_asm_vop1.s | 30 ++ .../MC/AMDGPU/gfx11_asm_vop3_alias-fake16.s | 15 + llvm/test/MC/AMDGPU/gfx11_asm_vop3_alias.s | 12 +- llvm/test/MC/AMDGPU/gfx11_asm_vop3_features.s | 77 ++++ llvm/test/MC/AMDGPU/gfx12_asm_vop1.s | 31 ++ 12 files changed, 792 insertions(+), 81 deletions(-) create mode 100644 llvm/test/MC/AMDGPU/bf16_imm-fake16.s create mode 100644 llvm/test/MC/AMDGPU/gfx11-promotions-fake16.s create mode 100644 llvm/test/MC/AMDGPU/gfx1150_asm_features-fake16.s create mode 100644 llvm/test/MC/AMDGPU/gfx11_asm_vinterp_err-fake16.s create mode 100644 llvm/test/MC/AMDGPU/gfx11_asm_vop3_alias-fake16.s create mode 100644 
llvm/test/MC/AMDGPU/gfx11_asm_vop3_features.s diff --git a/llvm/test/MC/AMDGPU/bf16_imm-fake16.s b/llvm/test/MC/AMDGPU/bf16_imm-fake16.s new file mode 100644 index 0000000000000..ee697bee6ab2d --- /dev/null +++ b/llvm/test/MC/AMDGPU/bf16_imm-fake16.s @@ -0,0 +1,114 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --unique --version 5 +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -show-encoding %s | FileCheck %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -show-encoding %s | FileCheck %s + +v_dot2_bf16_bf16 v5, v1, v2, 100.0 +// CHECK: v_dot2_bf16_bf16 v5, v1, v2, 0x42c8 ; encoding: [0x05,0x00,0x67,0xd6,0x01,0x05,0xfe,0x03,0xc8,0x42,0x00,0x00] + +v_dot2_bf16_bf16 v2, v0, 1.0, v2 +// CHECK: v_dot2_bf16_bf16 v2, v0, 1.0, v2 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xe5,0x09,0x04] + +v_dot2_bf16_bf16 v2, 1.0, v0, v2 +// CHECK: v_dot2_bf16_bf16 v2, 1.0, v0, v2 ; encoding: [0x02,0x00,0x67,0xd6,0xf2,0x00,0x0a,0x04] + +v_dot2_bf16_bf16 v5, v1, v2, 1.0 +// CHECK: v_dot2_bf16_bf16 v5, v1, v2, 1.0 ; encoding: [0x05,0x00,0x67,0xd6,0x01,0x05,0xca,0x03] + +v_dot2_bf16_bf16 v2, v0, -1.0, v2 +// CHECK: v_dot2_bf16_bf16 v2, v0, -1.0, v2 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xe7,0x09,0x04] + +v_dot2_bf16_bf16 v2, v0, 0.5, v2 +// CHECK: v_dot2_bf16_bf16 v2, v0, 0.5, v2 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xe1,0x09,0x04] + +v_dot2_bf16_bf16 v2, v0, -0.5, v2 +// CHECK: v_dot2_bf16_bf16 v2, v0, -0.5, v2 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xe3,0x09,0x04] + +v_dot2_bf16_bf16 v2, v0, 2.0, v2 +// CHECK: v_dot2_bf16_bf16 v2, v0, 2.0, v2 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xe9,0x09,0x04] + +v_dot2_bf16_bf16 v2, v0, -2.0, v2 +// CHECK: v_dot2_bf16_bf16 v2, v0, -2.0, v2 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xeb,0x09,0x04] + +v_dot2_bf16_bf16 v2, v0, 4.0, v2 +// CHECK: v_dot2_bf16_bf16 v2, v0, 4.0, v2 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xed,0x09,0x04] + +v_dot2_bf16_bf16 v2, v0, -4.0, v2 +// CHECK: 
v_dot2_bf16_bf16 v2, v0, -4.0, v2 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xef,0x09,0x04] + +// Check 1/(2*pi) rounded value and ideomatic fp32 0.15915494 value +// which cannot be accurately represented in bf16. + +v_dot2_bf16_bf16 v2, v0, 0.158203125, v2 +// CHECK: v_dot2_bf16_bf16 v2, v0, 0.15915494, v2 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xf1,0x09,0x04] + +v_dot2_bf16_bf16 v2, v0, 0.15915494, v2 +// CHECK: v_dot2_bf16_bf16 v2, v0, 0.15915494, v2 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xf1,0x09,0x04] + +v_dot2_bf16_bf16 v2, v0, 0x3e22, v2 +// CHECK: v_dot2_bf16_bf16 v2, v0, 0.15915494, v2 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xf1,0x09,0x04] + +v_dot2_bf16_bf16 v2, v0, v2, 0.15915494 +// CHECK: v_dot2_bf16_bf16 v2, v0, v2, 0.15915494 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0x05,0xe2,0x03] + +v_dot2_f32_bf16 v2, v1, 0, v2 +// CHECK: v_dot2_f32_bf16 v2, v1, 0, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0x01,0x09,0x1c] + +v_dot2_f32_bf16 v2, v1, 0.5, v2 +// CHECK: v_dot2_f32_bf16 v2, v1, 0.5, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xe1,0x09,0x1c] + +v_dot2_f32_bf16 v2, v1, -0.5, v2 +// CHECK: v_dot2_f32_bf16 v2, v1, -0.5, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xe3,0x09,0x1c] + +v_dot2_f32_bf16 v2, v1, 1.0, v2 +// CHECK: v_dot2_f32_bf16 v2, v1, 1.0, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xe5,0x09,0x1c] + +v_dot2_f32_bf16 v2, v1, -1.0, v2 +// CHECK: v_dot2_f32_bf16 v2, v1, -1.0, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xe7,0x09,0x1c] + +v_dot2_f32_bf16 v2, v1, 2.0, v2 +// CHECK: v_dot2_f32_bf16 v2, v1, 2.0, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xe9,0x09,0x1c] + +v_dot2_f32_bf16 v2, v1, -2.0, v2 +// CHECK: v_dot2_f32_bf16 v2, v1, -2.0, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xeb,0x09,0x1c] + +v_dot2_f32_bf16 v2, v1, 4.0, v2 +// CHECK: v_dot2_f32_bf16 v2, v1, 4.0, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xed,0x09,0x1c] + +v_dot2_f32_bf16 v2, v1, -4.0, v2 +// CHECK: v_dot2_f32_bf16 v2, v1, -4.0, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xef,0x09,0x1c] + 
+v_dot2_f32_bf16 v2, v1, 0.15915494, v2 +// CHECK: v_dot2_f32_bf16 v2, v1, 0.15915494, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xf1,0x09,0x1c] + +v_dot2_f32_bf16 v2, v1, 0x3e22, v2 +// CHECK: v_dot2_f32_bf16 v2, v1, 0.15915494, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xf1,0x09,0x1c] + +v_dot2_f32_bf16 v2, 0.5, v1, v2 +// CHECK: v_dot2_f32_bf16 v2, 0.5, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xf0,0x02,0x0a,0x1c] + +v_dot2_f32_bf16 v2, -0.5, v1, v2 +// CHECK: v_dot2_f32_bf16 v2, -0.5, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xf1,0x02,0x0a,0x1c] + +v_dot2_f32_bf16 v2, 1.0, v1, v2 +// CHECK: v_dot2_f32_bf16 v2, 1.0, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xf2,0x02,0x0a,0x1c] + +v_dot2_f32_bf16 v2, -1.0, v1, v2 +// CHECK: v_dot2_f32_bf16 v2, -1.0, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xf3,0x02,0x0a,0x1c] + +v_dot2_f32_bf16 v2, 2.0, v1, v2 +// CHECK: v_dot2_f32_bf16 v2, 2.0, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xf4,0x02,0x0a,0x1c] + +v_dot2_f32_bf16 v2, -2.0, v1, v2 +// CHECK: v_dot2_f32_bf16 v2, -2.0, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xf5,0x02,0x0a,0x1c] + +v_dot2_f32_bf16 v2, 4.0, v1, v2 +// CHECK: v_dot2_f32_bf16 v2, 4.0, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xf6,0x02,0x0a,0x1c] + +v_dot2_f32_bf16 v2, -4.0, v1, v2 +// CHECK: v_dot2_f32_bf16 v2, -4.0, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xf7,0x02,0x0a,0x1c] + +v_dot2_f32_bf16 v2, 100.0, v1, v2 +// CHECK: v_dot2_f32_bf16 v2, 0x42c8, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xff,0x02,0x0a,0x1c,0xc8,0x42,0x00,0x00] + +v_dot2_f32_bf16 v2, v1, 100.0, v2 +// CHECK: v_dot2_f32_bf16 v2, v1, 0x42c8, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xff,0x09,0x1c,0xc8,0x42,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/bf16_imm.s b/llvm/test/MC/AMDGPU/bf16_imm.s index 7cf18103adfe5..d79649073aa89 100644 --- a/llvm/test/MC/AMDGPU/bf16_imm.s +++ b/llvm/test/MC/AMDGPU/bf16_imm.s @@ -1,54 +1,54 @@ // NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --unique --version 5 -// RUN: 
llvm-mc -triple=amdgcn -mcpu=gfx1100 -show-encoding %s | FileCheck %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -show-encoding %s | FileCheck %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -show-encoding %s | FileCheck %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -show-encoding %s | FileCheck %s -v_dot2_bf16_bf16 v5, v1, v2, 100.0 -// CHECK: v_dot2_bf16_bf16 v5, v1, v2, 0x42c8 ; encoding: [0x05,0x00,0x67,0xd6,0x01,0x05,0xfe,0x03,0xc8,0x42,0x00,0x00] +v_dot2_bf16_bf16 v5.l, v1, v2, 100.0 +// CHECK: v_dot2_bf16_bf16 v5.l, v1, v2, 0x42c8 ; encoding: [0x05,0x00,0x67,0xd6,0x01,0x05,0xfe,0x03,0xc8,0x42,0x00,0x00] -v_dot2_bf16_bf16 v2, v0, 1.0, v2 -// CHECK: v_dot2_bf16_bf16 v2, v0, 1.0, v2 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xe5,0x09,0x04] +v_dot2_bf16_bf16 v2.l, v0, 1.0, v2.l +// CHECK: v_dot2_bf16_bf16 v2.l, v0, 1.0, v2.l ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xe5,0x09,0x04] -v_dot2_bf16_bf16 v2, 1.0, v0, v2 -// CHECK: v_dot2_bf16_bf16 v2, 1.0, v0, v2 ; encoding: [0x02,0x00,0x67,0xd6,0xf2,0x00,0x0a,0x04] +v_dot2_bf16_bf16 v2.l, 1.0, v0, v2.l +// CHECK: v_dot2_bf16_bf16 v2.l, 1.0, v0, v2.l ; encoding: [0x02,0x00,0x67,0xd6,0xf2,0x00,0x0a,0x04] -v_dot2_bf16_bf16 v5, v1, v2, 1.0 -// CHECK: v_dot2_bf16_bf16 v5, v1, v2, 1.0 ; encoding: [0x05,0x00,0x67,0xd6,0x01,0x05,0xca,0x03] +v_dot2_bf16_bf16 v5.l, v1, v2, 1.0 +// CHECK: v_dot2_bf16_bf16 v5.l, v1, v2, 1.0 ; encoding: [0x05,0x00,0x67,0xd6,0x01,0x05,0xca,0x03] -v_dot2_bf16_bf16 v2, v0, -1.0, v2 -// CHECK: v_dot2_bf16_bf16 v2, v0, -1.0, v2 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xe7,0x09,0x04] +v_dot2_bf16_bf16 v2.l, v0, -1.0, v2.l +// CHECK: v_dot2_bf16_bf16 v2.l, v0, -1.0, v2.l ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xe7,0x09,0x04] -v_dot2_bf16_bf16 v2, v0, 0.5, v2 -// CHECK: v_dot2_bf16_bf16 v2, v0, 0.5, v2 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xe1,0x09,0x04] +v_dot2_bf16_bf16 v2.l, v0, 0.5, v2.l +// CHECK: v_dot2_bf16_bf16 v2.l, v0, 0.5, v2.l ; encoding: 
[0x02,0x00,0x67,0xd6,0x00,0xe1,0x09,0x04] -v_dot2_bf16_bf16 v2, v0, -0.5, v2 -// CHECK: v_dot2_bf16_bf16 v2, v0, -0.5, v2 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xe3,0x09,0x04] +v_dot2_bf16_bf16 v2.l, v0, -0.5, v2.l +// CHECK: v_dot2_bf16_bf16 v2.l, v0, -0.5, v2.l ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xe3,0x09,0x04] -v_dot2_bf16_bf16 v2, v0, 2.0, v2 -// CHECK: v_dot2_bf16_bf16 v2, v0, 2.0, v2 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xe9,0x09,0x04] +v_dot2_bf16_bf16 v2.l, v0, 2.0, v2.l +// CHECK: v_dot2_bf16_bf16 v2.l, v0, 2.0, v2.l ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xe9,0x09,0x04] -v_dot2_bf16_bf16 v2, v0, -2.0, v2 -// CHECK: v_dot2_bf16_bf16 v2, v0, -2.0, v2 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xeb,0x09,0x04] +v_dot2_bf16_bf16 v2.l, v0, -2.0, v2.l +// CHECK: v_dot2_bf16_bf16 v2.l, v0, -2.0, v2.l ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xeb,0x09,0x04] -v_dot2_bf16_bf16 v2, v0, 4.0, v2 -// CHECK: v_dot2_bf16_bf16 v2, v0, 4.0, v2 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xed,0x09,0x04] +v_dot2_bf16_bf16 v2.l, v0, 4.0, v2.l +// CHECK: v_dot2_bf16_bf16 v2.l, v0, 4.0, v2.l ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xed,0x09,0x04] -v_dot2_bf16_bf16 v2, v0, -4.0, v2 -// CHECK: v_dot2_bf16_bf16 v2, v0, -4.0, v2 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xef,0x09,0x04] +v_dot2_bf16_bf16 v2.l, v0, -4.0, v2.l +// CHECK: v_dot2_bf16_bf16 v2.l, v0, -4.0, v2.l ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xef,0x09,0x04] // Check 1/(2*pi) rounded value and ideomatic fp32 0.15915494 value // which cannot be accurately represented in bf16. 
-v_dot2_bf16_bf16 v2, v0, 0.158203125, v2 -// CHECK: v_dot2_bf16_bf16 v2, v0, 0.15915494, v2 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xf1,0x09,0x04] +v_dot2_bf16_bf16 v2.l, v0, 0.158203125, v2.l +// CHECK: v_dot2_bf16_bf16 v2.l, v0, 0.15915494, v2.l ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xf1,0x09,0x04] -v_dot2_bf16_bf16 v2, v0, 0.15915494, v2 -// CHECK: v_dot2_bf16_bf16 v2, v0, 0.15915494, v2 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xf1,0x09,0x04] +v_dot2_bf16_bf16 v2.l, v0, 0.15915494, v2.l +// CHECK: v_dot2_bf16_bf16 v2.l, v0, 0.15915494, v2.l ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xf1,0x09,0x04] -v_dot2_bf16_bf16 v2, v0, 0x3e22, v2 -// CHECK: v_dot2_bf16_bf16 v2, v0, 0.15915494, v2 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xf1,0x09,0x04] +v_dot2_bf16_bf16 v2.l, v0, 0x3e22, v2.l +// CHECK: v_dot2_bf16_bf16 v2.l, v0, 0.15915494, v2.l ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xf1,0x09,0x04] -v_dot2_bf16_bf16 v2, v0, v2, 0.15915494 -// CHECK: v_dot2_bf16_bf16 v2, v0, v2, 0.15915494 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0x05,0xe2,0x03] +v_dot2_bf16_bf16 v2.l, v0, v2, 0.15915494 +// CHECK: v_dot2_bf16_bf16 v2.l, v0, v2, 0.15915494 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0x05,0xe2,0x03] v_dot2_f32_bf16 v2, v1, 0, v2 // CHECK: v_dot2_f32_bf16 v2, v1, 0, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0x01,0x09,0x1c] diff --git a/llvm/test/MC/AMDGPU/gfx11-promotions-fake16.s b/llvm/test/MC/AMDGPU/gfx11-promotions-fake16.s new file mode 100644 index 0000000000000..95a52ffe103fa --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx11-promotions-fake16.s @@ -0,0 +1,353 @@ +// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1100 -mattr=+wavefrontsize32,-real-true16 %s | FileCheck --check-prefix=GFX11 %s + +// Check opcode promotions and forced suffixes. +// 1. When a suffix is optional, check that it may be omitted. +// 2. When a suffix is optional, check that it may be specified w/o any effect. +// 3. When a suffix is required, check that specifying it enforces opcode promotion. +// 4. 
When a suffix is required, check that omitting the suffix results in a different encoding. + +//===----------------------------------------------------------------------===// +// VOP1. +//===----------------------------------------------------------------------===// + +v_mov_b32 v0, v1 +// GFX11: v_mov_b32_e32 v0, v1 ; encoding: [0x01,0x03,0x00,0x7e] + +v_mov_b32_e32 v0, v1 +// GFX11: v_mov_b32_e32 v0, v1 ; encoding: [0x01,0x03,0x00,0x7e] + +//===----------------------------------------------------------------------===// +// VOP2. +//===----------------------------------------------------------------------===// + +v_add_f16 v5, v1, v2 +// GFX11: v_add_f16_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x64] + +v_add_f16_e32 v5, v1, v2 +// GFX11: v_add_f16_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x64] + +//===----------------------------------------------------------------------===// +// VOPC. +//===----------------------------------------------------------------------===// + +v_cmp_lt_f32 vcc_lo, v1, v2 +// GFX11: v_cmp_lt_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x22,0x7c] + +v_cmp_lt_f32_e32 vcc_lo, v1, v2 +// GFX11: v_cmp_lt_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x22,0x7c] + +//===----------------------------------------------------------------------===// +// VOPCX. +//===----------------------------------------------------------------------===// + +v_cmpx_class_f16 v1, v2 +// GFX11: v_cmpx_class_f16_e32 v1, v2 ; encoding: [0x01,0x05,0xfa,0x7d] + +v_cmpx_class_f16_e32 v1, v2 +// GFX11: v_cmpx_class_f16_e32 v1, v2 ; encoding: [0x01,0x05,0xfa,0x7d] + +//===----------------------------------------------------------------------===// +// VOP1.DPP8. 
+//===----------------------------------------------------------------------===// + +v_bfrev_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_bfrev_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x70,0x0a,0x7e,0x01,0x77,0x39,0x05] + +v_bfrev_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_bfrev_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x70,0x0a,0x7e,0x01,0x77,0x39,0x05] + +//===----------------------------------------------------------------------===// +// VOP1.DPP16. +//===----------------------------------------------------------------------===// + +v_bfrev_b32 v5, v1 quad_perm:[3,2,1,0] +// GFX11: v_bfrev_b32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x70,0x0a,0x7e,0x01,0x1b,0x00,0xff] + +v_bfrev_b32_dpp v5, v1 quad_perm:[3,2,1,0] +// GFX11: v_bfrev_b32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x70,0x0a,0x7e,0x01,0x1b,0x00,0xff] + +//===----------------------------------------------------------------------===// +// VOP2.DPP8. +//===----------------------------------------------------------------------===// + +v_add_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_add_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x64,0x01,0x77,0x39,0x05] + +v_add_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_add_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x64,0x01,0x77,0x39,0x05] + +//===----------------------------------------------------------------------===// +// VOP2.DPP16. 
+//===----------------------------------------------------------------------===// + +v_add_f16 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: v_add_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x1b,0x00,0xff] + +v_add_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: v_add_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x1b,0x00,0xff] + +//===----------------------------------------------------------------------===// +// VOPC.DPP8. +//===----------------------------------------------------------------------===// + +v_cmp_le_u16 v1, v2 dpp8:[7,7,7,3,4,4,6,7] fi:1 +// GFX11: v_cmp_le_u16 vcc_lo, v1, v2 dpp8:[7,7,7,3,4,4,6,7] fi:1 ; encoding: [0xea,0x04,0x76,0x7c,0x01,0xff,0x47,0xfa] + +v_cmp_le_u16_dpp v1, v2 dpp8:[7,7,7,3,4,4,6,7] fi:1 +// GFX11: v_cmp_le_u16 vcc_lo, v1, v2 dpp8:[7,7,7,3,4,4,6,7] fi:1 ; encoding: [0xea,0x04,0x76,0x7c,0x01,0xff,0x47,0xfa] + +//===----------------------------------------------------------------------===// +// VOPC.DPP16. +//===----------------------------------------------------------------------===// + +v_cmp_gt_u16 v1, v2 row_shl:0x7 row_mask:0x0 bank_mask:0x0 fi:1 +// GFX11: v_cmp_gt_u16 vcc_lo, v1, v2 row_shl:7 row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x07,0x05,0x00] + +v_cmp_gt_u16_dpp v1, v2 row_shl:0x7 row_mask:0x0 bank_mask:0x0 fi:1 +// GFX11: v_cmp_gt_u16 vcc_lo, v1, v2 row_shl:7 row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x07,0x05,0x00] + +//===----------------------------------------------------------------------===// +// VOPCX.DPP8. 
+//===----------------------------------------------------------------------===// + +v_cmpx_class_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_class_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfa,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_class_f16_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_class_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfa,0x7d,0x01,0x77,0x39,0x05] + +//===----------------------------------------------------------------------===// +// VOPCX.DPP16. +//===----------------------------------------------------------------------===// + +v_cmpx_class_f16 v1, v2 quad_perm:[3,2,1,0] +// GFX11: v_cmpx_class_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x1b,0x00,0xff] + +v_cmpx_class_f16_dpp v1, v2 quad_perm:[3,2,1,0] +// GFX11: v_cmpx_class_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x1b,0x00,0xff] + +//===----------------------------------------------------------------------===// +// VOP1 -> VOP3. +//===----------------------------------------------------------------------===// + +v_sin_f32 v5, 0.5 mul:2 +// GFX11: v_sin_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xb5,0xd5,0xf0,0x00,0x00,0x08] + +v_sin_f32_e64 v5, 0.5 mul:2 +// GFX11: v_sin_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xb5,0xd5,0xf0,0x00,0x00,0x08] + +v_sin_f32_e64 v5, v1 +// GFX11: v_sin_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xb5,0xd5,0x01,0x01,0x00,0x00] + +v_sin_f32 v5, v1 +// GFX11: v_sin_f32_e32 v5, v1 ; encoding: [0x01,0x6b,0x0a,0x7e] + +//===----------------------------------------------------------------------===// +// VOP2 -> VOP3. 
+//===----------------------------------------------------------------------===// + +v_add_f32 v5, v1, -v2 +// GFX11: v_add_f32_e64 v5, v1, -v2 ; encoding: [0x05,0x00,0x03,0xd5,0x01,0x05,0x02,0x40] + +v_add_f32_e64 v5, v1, -v2 +// GFX11: v_add_f32_e64 v5, v1, -v2 ; encoding: [0x05,0x00,0x03,0xd5,0x01,0x05,0x02,0x40] + +v_add_f32_e64 v5, v1, v2 +// GFX11: v_add_f32_e64 v5, v1, v2 ; encoding: [0x05,0x00,0x03,0xd5,0x01,0x05,0x02,0x00] + +v_add_f32 v5, v1, v2 +// GFX11: v_add_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x06] + +//===----------------------------------------------------------------------===// +// VOPC -> VOP3. +//===----------------------------------------------------------------------===// + +v_cmp_f_f32 s10, -v1, v2 +// GFX11: v_cmp_f_f32_e64 s10, -v1, v2 ; encoding: [0x0a,0x00,0x10,0xd4,0x01,0x05,0x02,0x20] + +v_cmp_f_f32_e64 s10, -v1, v2 +// GFX11: v_cmp_f_f32_e64 s10, -v1, v2 ; encoding: [0x0a,0x00,0x10,0xd4,0x01,0x05,0x02,0x20] + +v_cmp_f_f32_e64 vcc_lo, v1, v2 +// GFX11: v_cmp_f_f32_e64 vcc_lo, v1, v2 ; encoding: [0x6a,0x00,0x10,0xd4,0x01,0x05,0x02,0x00] + +v_cmp_f_f32 vcc_lo, v1, v2 +// GFX11: v_cmp_f_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x20,0x7c] + +//===----------------------------------------------------------------------===// +// VOPCX -> VOP3. +//===----------------------------------------------------------------------===// + +v_cmpx_f_f32 -v1, v2 +// GFX11: v_cmpx_f_f32_e64 -v1, v2 ; encoding: [0x7e,0x00,0x90,0xd4,0x01,0x05,0x02,0x20] + +v_cmpx_f_f32_e64 -v1, v2 +// GFX11: v_cmpx_f_f32_e64 -v1, v2 ; encoding: [0x7e,0x00,0x90,0xd4,0x01,0x05,0x02,0x20] + +v_cmpx_f_f32_e64 v1, v2 +// GFX11: v_cmpx_f_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x90,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_f_f32 v1, v2 +// GFX11: v_cmpx_f_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x20,0x7d] + +//===----------------------------------------------------------------------===// +// VOP3. 
+//===----------------------------------------------------------------------===// + +v_add3_u32 v5, v1, v2, s3 +// GFX11: v_add3_u32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x55,0xd6,0x01,0x05,0x0e,0x00] + +v_add3_u32_e64 v5, v1, v2, s3 +// GFX11: v_add3_u32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x55,0xd6,0x01,0x05,0x0e,0x00] + +//===----------------------------------------------------------------------===// +// VOP1 -> VOP3.DPP8. +//===----------------------------------------------------------------------===// + +v_sin_f32 v5, v1 div:2 dpp8:[0,0,0,0,0,0,0,0] +// GFX11: v_sin_f32_e64_dpp v5, v1 div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x05,0x00,0xb5,0xd5,0xe9,0x00,0x00,0x18,0x01,0x00,0x00,0x00] + +v_sin_f32_e64_dpp v5, v1 div:2 dpp8:[0,0,0,0,0,0,0,0] +// GFX11: v_sin_f32_e64_dpp v5, v1 div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x05,0x00,0xb5,0xd5,0xe9,0x00,0x00,0x18,0x01,0x00,0x00,0x00] + +v_sin_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_sin_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb5,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_sin_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_sin_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6a,0x0a,0x7e,0x01,0x77,0x39,0x05] + +//===----------------------------------------------------------------------===// +// VOP2 -> VOP3.DPP8. 
+//===----------------------------------------------------------------------===// + +v_add_f32 v5, v1, v2 mul:4 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_add_f32_e64_dpp v5, v1, v2 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd5,0xe9,0x04,0x02,0x10,0x01,0x77,0x39,0x05] + +v_add_f32_e64_dpp v5, v1, v2 mul:4 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_add_f32_e64_dpp v5, v1, v2 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd5,0xe9,0x04,0x02,0x10,0x01,0x77,0x39,0x05] + +v_add_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_add_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_add_f32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x06,0x01,0x77,0x39,0x05] + +//===----------------------------------------------------------------------===// +// VOPC -> VOP3.DPP8. +//===----------------------------------------------------------------------===// + +v_cmp_class_f32 s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmp_class_f32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmp_class_f32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmp_class_f32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmp_class_f32_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmp_class_f32_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmp_class_f32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmp_class_f32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfc,0x7c,0x01,0x77,0x39,0x05] + +//===----------------------------------------------------------------------===// +// VOPCX -> VOP3.DPP8. 
+//===----------------------------------------------------------------------===// + +v_cmpx_class_f32 -v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_class_f32_e64_dpp -v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfe,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_class_f32_e64_dpp -v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_class_f32_e64_dpp -v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfe,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_class_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_class_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_class_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_class_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfc,0x7d,0x01,0x77,0x39,0x05] + +//===----------------------------------------------------------------------===// +// VOP1 -> VOP3.DPP16. +//===----------------------------------------------------------------------===// + +v_sin_f32 v5, v1 div:2 row_xmask:15 +// GFX11: v_sin_f32_e64_dpp v5, v1 div:2 row_xmask:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb5,0xd5,0xfa,0x00,0x00,0x18,0x01,0x6f,0x01,0xff] + +v_sin_f32_e64_dpp v5, v1 div:2 row_xmask:15 +// GFX11: v_sin_f32_e64_dpp v5, v1 div:2 row_xmask:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb5,0xd5,0xfa,0x00,0x00,0x18,0x01,0x6f,0x01,0xff] + +v_sin_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] +// GFX11: v_sin_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_sin_f32 v5, v1 quad_perm:[3,2,1,0] +// GFX11: v_sin_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0x1b,0x00,0xff] + +//===----------------------------------------------------------------------===// +// VOP2 -> VOP3.DPP16. 
+//===----------------------------------------------------------------------===// + +v_add_f32 v5, v1, v2 div:2 quad_perm:[3,2,1,0] +// GFX11: v_add_f32_e64_dpp v5, v1, v2 div:2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd5,0xfa,0x04,0x02,0x18,0x01,0x1b,0x00,0xff] + +v_add_f32_e64_dpp v5, v1, v2 div:2 quad_perm:[3,2,1,0] +// GFX11: v_add_f32_e64_dpp v5, v1, v2 div:2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd5,0xfa,0x04,0x02,0x18,0x01,0x1b,0x00,0xff] + +v_add_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: v_add_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_add_f32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: v_add_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x1b,0x00,0xff] + +//===----------------------------------------------------------------------===// +// VOPC -> VOP3.DPP16. 
+//===----------------------------------------------------------------------===// + +v_cmp_class_f32 s5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: v_cmp_class_f32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmp_class_f32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: v_cmp_class_f32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmp_class_f32_e64_dpp vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmp_class_f32_e64_dpp vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x6a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_cmp_class_f32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmp_class_f32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x50,0x01,0xff] + +//===----------------------------------------------------------------------===// +// VOPCX -> VOP3.DPP16. +//===----------------------------------------------------------------------===// + +v_cmpx_class_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] +// GFX11: v_cmpx_class_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_class_f32 v1, v2 quad_perm:[3,2,1,0] +// GFX11: v_cmpx_class_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x1b,0x00,0xff] + +//===----------------------------------------------------------------------===// +// VOP3P. 
+//===----------------------------------------------------------------------===// + +v_dot2_f32_f16 v0, v1, v2, v3 +// GFX11: v_dot2_f32_f16 v0, v1, v2, v3 ; encoding: [0x00,0x40,0x13,0xcc,0x01,0x05,0x0e,0x1c] + +v_dot2_f32_f16_e64 v0, v1, v2, v3 +// GFX11: v_dot2_f32_f16 v0, v1, v2, v3 ; encoding: [0x00,0x40,0x13,0xcc,0x01,0x05,0x0e,0x1c] + +//===----------------------------------------------------------------------===// +// VOP3P.DPP8. +//===----------------------------------------------------------------------===// + +v_dot2_f32_f16 v0, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x00,0x40,0x13,0xcc,0xe9,0x04,0x0e,0x1c,0x01,0x77,0x39,0x05] + +v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x00,0x40,0x13,0xcc,0xe9,0x04,0x0e,0x1c,0x01,0x77,0x39,0x05] + +//===----------------------------------------------------------------------===// +// VOP3P.DPP16. 
+//===----------------------------------------------------------------------===// + +v_dot2_f32_f16 v0, v1, v2, v3 quad_perm:[1,2,3,0] +// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x00,0x40,0x13,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x39,0x00,0xff] + +v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[1,2,3,0] +// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x00,0x40,0x13,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x39,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11-promotions.s b/llvm/test/MC/AMDGPU/gfx11-promotions.s index 0d1568e7c2765..d9499b37b6994 100644 --- a/llvm/test/MC/AMDGPU/gfx11-promotions.s +++ b/llvm/test/MC/AMDGPU/gfx11-promotions.s @@ -1,5 +1,5 @@ // NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --unique --version 5 -// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1100 -mattr=+wavefrontsize32 %s | FileCheck --check-prefix=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1100 -mattr=+real-true16 %s | FileCheck --check-prefix=GFX11 %s // Check opcode promotions and forced suffices. // 1. When a suffix is optional, check that it may be omitted. @@ -21,11 +21,11 @@ v_mov_b32_e32 v0, v1 // VOP2. //===----------------------------------------------------------------------===// -v_add_f16 v5, v1, v2 -// GFX11: v_add_f16_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x64] +v_add_f16 v5.l, v1.l, v2.l +// GFX11: v_add_f16_e32 v5.l, v1.l, v2.l ; encoding: [0x01,0x05,0x0a,0x64] -v_add_f16_e32 v5, v1, v2 -// GFX11: v_add_f16_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x64] +v_add_f16_e32 v5.l, v1.l, v2.l +// GFX11: v_add_f16_e32 v5.l, v1.l, v2.l ; encoding: [0x01,0x05,0x0a,0x64] //===----------------------------------------------------------------------===// // VOPC. @@ -41,11 +41,11 @@ v_cmp_lt_f32_e32 vcc_lo, v1, v2 // VOPCX. 
//===----------------------------------------------------------------------===// -v_cmpx_class_f16 v1, v2 -// GFX11: v_cmpx_class_f16_e32 v1, v2 ; encoding: [0x01,0x05,0xfa,0x7d] +v_cmpx_class_f16 v1.l, v2.l +// GFX11: v_cmpx_class_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0xfa,0x7d] -v_cmpx_class_f16_e32 v1, v2 -// GFX11: v_cmpx_class_f16_e32 v1, v2 ; encoding: [0x01,0x05,0xfa,0x7d] +v_cmpx_class_f16_e32 v1.l, v2.l +// GFX11: v_cmpx_class_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0xfa,0x7d] //===----------------------------------------------------------------------===// // VOP1.DPP8. @@ -71,61 +71,61 @@ v_bfrev_b32_dpp v5, v1 quad_perm:[3,2,1,0] // VOP2.DPP8. //===----------------------------------------------------------------------===// -v_add_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_add_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x64,0x01,0x77,0x39,0x05] +v_add_f16 v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_add_f16_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x64,0x01,0x77,0x39,0x05] -v_add_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_add_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x64,0x01,0x77,0x39,0x05] +v_add_f16_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_add_f16_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x64,0x01,0x77,0x39,0x05] //===----------------------------------------------------------------------===// // VOP2.DPP16. 
//===----------------------------------------------------------------------===// -v_add_f16 v5, v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_add_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x1b,0x00,0xff] +v_add_f16 v5.l, v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_add_f16_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x1b,0x00,0xff] -v_add_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_add_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x1b,0x00,0xff] +v_add_f16_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_add_f16_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x1b,0x00,0xff] //===----------------------------------------------------------------------===// // VOPC.DPP8. //===----------------------------------------------------------------------===// -v_cmp_le_u16 v1, v2 dpp8:[7,7,7,3,4,4,6,7] fi:1 -// GFX11: v_cmp_le_u16 vcc_lo, v1, v2 dpp8:[7,7,7,3,4,4,6,7] fi:1 ; encoding: [0xea,0x04,0x76,0x7c,0x01,0xff,0x47,0xfa] +v_cmp_le_u16 v1.l, v2.l dpp8:[7,7,7,3,4,4,6,7] fi:1 +// GFX11: v_cmp_le_u16 vcc_lo, v1.l, v2.l dpp8:[7,7,7,3,4,4,6,7] fi:1 ; encoding: [0xea,0x04,0x76,0x7c,0x01,0xff,0x47,0xfa] -v_cmp_le_u16_dpp v1, v2 dpp8:[7,7,7,3,4,4,6,7] fi:1 -// GFX11: v_cmp_le_u16 vcc_lo, v1, v2 dpp8:[7,7,7,3,4,4,6,7] fi:1 ; encoding: [0xea,0x04,0x76,0x7c,0x01,0xff,0x47,0xfa] +v_cmp_le_u16_dpp v1.l, v2.l dpp8:[7,7,7,3,4,4,6,7] fi:1 +// GFX11: v_cmp_le_u16 vcc_lo, v1.l, v2.l dpp8:[7,7,7,3,4,4,6,7] fi:1 ; encoding: [0xea,0x04,0x76,0x7c,0x01,0xff,0x47,0xfa] //===----------------------------------------------------------------------===// // VOPC.DPP16. 
//===----------------------------------------------------------------------===// -v_cmp_gt_u16 v1, v2 row_shl:0x7 row_mask:0x0 bank_mask:0x0 fi:1 -// GFX11: v_cmp_gt_u16 vcc_lo, v1, v2 row_shl:7 row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x07,0x05,0x00] +v_cmp_gt_u16 v1.l, v2.l row_shl:0x7 row_mask:0x0 bank_mask:0x0 fi:1 +// GFX11: v_cmp_gt_u16 vcc_lo, v1.l, v2.l row_shl:7 row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x07,0x05,0x00] -v_cmp_gt_u16_dpp v1, v2 row_shl:0x7 row_mask:0x0 bank_mask:0x0 fi:1 -// GFX11: v_cmp_gt_u16 vcc_lo, v1, v2 row_shl:7 row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x07,0x05,0x00] +v_cmp_gt_u16_dpp v1.l, v2.l row_shl:0x7 row_mask:0x0 bank_mask:0x0 fi:1 +// GFX11: v_cmp_gt_u16 vcc_lo, v1.l, v2.l row_shl:7 row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x07,0x05,0x00] //===----------------------------------------------------------------------===// // VOPCX.DPP8. //===----------------------------------------------------------------------===// -v_cmpx_class_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_class_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfa,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_class_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_class_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfa,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_class_f16_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_class_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfa,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_class_f16_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_class_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfa,0x7d,0x01,0x77,0x39,0x05] //===----------------------------------------------------------------------===// // VOPCX.DPP16. 
//===----------------------------------------------------------------------===// -v_cmpx_class_f16 v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_class_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_class_f16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_class_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x1b,0x00,0xff] -v_cmpx_class_f16_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_class_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_class_f16_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_class_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x1b,0x00,0xff] //===----------------------------------------------------------------------===// // VOP1 -> VOP3. diff --git a/llvm/test/MC/AMDGPU/gfx1150_asm_features-fake16.s b/llvm/test/MC/AMDGPU/gfx1150_asm_features-fake16.s new file mode 100644 index 0000000000000..b3297377ed584 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx1150_asm_features-fake16.s @@ -0,0 +1,48 @@ +// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1150 -mattr=-real-true16 %s | FileCheck --check-prefix=GFX1150 %s +// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1151 -mattr=-real-true16 %s | FileCheck --check-prefix=GFX1150 %s +// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1152 -mattr=-real-true16 %s | FileCheck --check-prefix=GFX1150 %s +// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1153 -mattr=-real-true16 %s | FileCheck --check-prefix=GFX1150 %s + +// +// Subtargets allow src1 of VOP3 DPP instructions to be SGPR or inlinable +// constant. 
+// + +v_add3_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf +// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff] + +v_add3_u32_e64_dpp v5, v1, 42, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf +// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xfa,0x54,0x0d,0x04,0x01,0x1b,0x00,0xff] + +v_add3_u32_e64_dpp v5, v1, s2, v0 dpp8:[7,6,5,4,3,2,1,0] +// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xe9,0x04,0x00,0x04,0x01,0x77,0x39,0x05] + +v_add3_u32_e64_dpp v5, v1, 42, v0 dpp8:[7,6,5,4,3,2,1,0] +// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xe9,0x54,0x01,0x04,0x01,0x77,0x39,0x05] + +v_add3_u32_e64_dpp v5, v1, s2, s3 dpp8:[7,6,5,4,3,2,1,0] +// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xe9,0x04,0x0c,0x00,0x01,0x77,0x39,0x05] + +v_cmp_ne_i32_e64_dpp vcc_lo, v1, s2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1150: encoding: [0x6a,0x00,0x45,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05] + +v_add_f32_e64_dpp v5, v1, s2 row_mirror +// GFX1150: encoding: [0x05,0x00,0x03,0xd5,0xfa,0x04,0x00,0x00,0x01,0x40,0x01,0xff] + +v_min3_f16 v5, v1, s2, 2.0 op_sel:[1,1,0,1] quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf +// GFX1150: encoding: [0x05,0x58,0x49,0xd6,0xfa,0x04,0xd0,0x03,0x01,0x55,0x00,0xff] + +v_cmp_le_f32 vcc_lo, v1, v2 row_mirror +// GFX1150: encoding: [0xfa,0x04,0x26,0x7c,0x01,0x40,0x01,0xff] + +v_cmp_le_f32 vcc_lo, v1, s2 row_mirror +// GFX1150: encoding: [0x6a,0x00,0x13,0xd4,0xfa,0x04,0x00,0x00,0x01,0x40,0x01,0xff] + +v_cmp_le_f32 vcc_lo, v1, s2 quad_perm:[1,1,1,1] +// GFX1150: encoding: [0x6a,0x00,0x13,0xd4,0xfa,0x04,0x00,0x00,0x01,0x55,0x00,0xff] + +v_cmpx_neq_f16 v1, 2.0 dpp8:[7,6,5,4,3,2,1,0] +// GFX1150: encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_class_f16 v1, 2.0 quad_perm:[1,1,1,1] +// GFX1150: encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x55,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx1150_asm_features.s b/llvm/test/MC/AMDGPU/gfx1150_asm_features.s index 
5c498a28fd8c7..6e5bda3c3d0c9 100644 --- a/llvm/test/MC/AMDGPU/gfx1150_asm_features.s +++ b/llvm/test/MC/AMDGPU/gfx1150_asm_features.s @@ -1,8 +1,8 @@ // NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --unique --version 5 -// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1150 %s | FileCheck --check-prefix=GFX1150 %s -// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1151 %s | FileCheck --check-prefix=GFX1150 %s -// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1152 %s | FileCheck --check-prefix=GFX1150 %s -// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1153 %s | FileCheck --check-prefix=GFX1150 %s +// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1150 -mattr=+real-true16 %s | FileCheck --check-prefix=GFX1150 %s +// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1151 -mattr=+real-true16 %s | FileCheck --check-prefix=GFX1150 %s +// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1152 -mattr=+real-true16 %s | FileCheck --check-prefix=GFX1150 %s +// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1153 -mattr=+real-true16 %s | FileCheck --check-prefix=GFX1150 %s // Subtargets allow src1 of VOP3 DPP instructions to be SGPR or inlinable // constant. 
@@ -28,8 +28,8 @@ v_cmp_ne_i32_e64_dpp vcc_lo, v1, s2 dpp8:[7,6,5,4,3,2,1,0] v_add_f32_e64_dpp v5, v1, s2 row_mirror // GFX1150: v_add_f32_e64_dpp v5, v1, s2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd5,0xfa,0x04,0x00,0x00,0x01,0x40,0x01,0xff] -v_min3_f16 v5, v1, s2, 2.0 op_sel:[1,1,0,1] quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf -// GFX1150: v_min3_f16_e64_dpp v5, v1, s2, 2.0 op_sel:[1,1,0,1] quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x49,0xd6,0xfa,0x04,0xd0,0x03,0x01,0x55,0x00,0xff] +v_min3_f16 v5.h, v1.h, s2, 2.0 op_sel:[1,1,0,1] quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf +// GFX1150: v_min3_f16_e64_dpp v5.h, v1.h, s2, 2.0 op_sel:[1,1,0,1] quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x49,0xd6,0xfa,0x04,0xd0,0x03,0x01,0x55,0x00,0xff] v_cmp_le_f32 vcc_lo, v1, v2 row_mirror // GFX1150: v_cmp_le_f32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x40,0x01,0xff] @@ -40,8 +40,8 @@ v_cmp_le_f32 vcc_lo, v1, s2 row_mirror v_cmp_le_f32 vcc_lo, v1, s2 quad_perm:[1,1,1,1] // GFX1150: v_cmp_le_f32_e64_dpp vcc_lo, v1, s2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf ; encoding: [0x6a,0x00,0x13,0xd4,0xfa,0x04,0x00,0x00,0x01,0x55,0x00,0xff] -v_cmpx_neq_f16 v1, 2.0 dpp8:[7,6,5,4,3,2,1,0] -// GFX1150: v_cmpx_neq_f16_e64_dpp v1, 2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05] +v_cmpx_neq_f16 v1.l, 2.0 dpp8:[7,6,5,4,3,2,1,0] +// GFX1150: v_cmpx_neq_f16_e64_dpp v1.l, 2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05] -v_cmpx_class_f16 v1, 2.0 quad_perm:[1,1,1,1] -// GFX1150: v_cmpx_class_f16_e64_dpp v1, 2.0 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x55,0x00,0xff] +v_cmpx_class_f16 v1.l, 2.0 quad_perm:[1,1,1,1] +// GFX1150: v_cmpx_class_f16_e64_dpp v1.l, 2.0 quad_perm:[1,1,1,1] 
row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x55,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vinterp_err-fake16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vinterp_err-fake16.s new file mode 100644 index 0000000000000..7450d4b86cabc --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vinterp_err-fake16.s @@ -0,0 +1,43 @@ +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 %s 2>&1 | FileCheck %s -check-prefix=GCN-ERR --implicit-check-not=error: --strict-whitespace +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 %s 2>&1 | FileCheck %s -check-prefix=GCN-ERR --implicit-check-not=error: --strict-whitespace + +//===----------------------------------------------------------------------===// +// VINTERP src operands must be VGPRs. +// Check that other operand kinds are rejected by assembler. +//===----------------------------------------------------------------------===// + +v_interp_p10_f32 v0, s1, v2, v3 +// GCN-ERR: :[[@LINE-1]]:22: error: invalid operand for instruction + +v_interp_p10_f32 v0, v1, s2, v3 +// GCN-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_interp_p10_f32 v0, v1, v2, s3 +// GCN-ERR: :[[@LINE-1]]:30: error: invalid operand for instruction + +v_interp_p2_f32 v0, 1, v2, v3 +// GCN-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_interp_p2_f32 v0, v1, 2, v3 +// GCN-ERR: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_interp_p2_f32 v0, v1, v2, 3 +// GCN-ERR: :[[@LINE-1]]:29: error: invalid operand for instruction + +v_interp_p10_f16_f32 v0, s1, v2, v3 +// GCN-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_interp_p10_f16_f32 v0, v1, s2, v3 +// GCN-ERR: :[[@LINE-1]]:30: error: invalid operand for instruction + +v_interp_p10_f16_f32 v0, v1, v2, s3 +// GCN-ERR: :[[@LINE-1]]:34: error: invalid operand for instruction + +v_interp_p2_f16_f32 v0, 1, v2, v3 +// GCN-ERR: :[[@LINE-1]]:25: error: invalid operand for 
instruction + +v_interp_p2_f16_f32 v0, v1, 2, v3 +// GCN-ERR: :[[@LINE-1]]:29: error: invalid operand for instruction + +v_interp_p2_f16_f32 v0, v1, v2, 3 +// GCN-ERR: :[[@LINE-1]]:33: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s index 1aefd1f0a7d19..58641332a08da 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s @@ -2630,6 +2630,36 @@ v_log_f32 v5, src_scc v_log_f32 v255, 0xaf123456 // GFX11: v_log_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x4e,0xfe,0x7f,0x56,0x34,0x12,0xaf] +v_mov_b16_e32 v0.l, v1.l +// GFX11: v_mov_b16_e32 v0.l, v1.l ; encoding: [0x01,0x39,0x00,0x7e] + +v_mov_b16_e32 v0.l, s1 +// GFX11: v_mov_b16_e32 v0.l, s1 ; encoding: [0x01,0x38,0x00,0x7e] + +v_mov_b16_e32 v0.h, 0 +// GFX11: v_mov_b16_e32 v0.h, 0 ; encoding: [0x80,0x38,0x00,0x7f] + +v_mov_b16_e32 v0.h, 1.0 +// GFX11: v_mov_b16_e32 v0.h, 1.0 ; encoding: [0xf2,0x38,0x00,0x7f] + +v_mov_b16_e32 v0.l, 0x1234 +// GFX11: v_mov_b16_e32 v0.l, 0x1234 ; encoding: [0xff,0x38,0x00,0x7e,0x34,0x12,0x00,0x00] + +v_mov_b16_e64 v0.l, v1.l +// GFX11: v_mov_b16_e64 v0.l, v1.l ; encoding: [0x00,0x00,0x9c,0xd5,0x01,0x01,0x00,0x00] + +v_mov_b16_e64 v200.l, v1.h +// GFX11: v_mov_b16_e64 v200.l, v1.h op_sel:[1,0] ; encoding: [0xc8,0x08,0x9c,0xd5,0x01,0x01,0x00,0x00] + +v_mov_b16_e64 v0.l, s1 +// GFX11: v_mov_b16_e64 v0.l, s1 ; encoding: [0x00,0x00,0x9c,0xd5,0x01,0x00,0x00,0x00] + +v_mov_b16_e64 v200.h, 1 +// GFX11: v_mov_b16_e64 v200.h, 1 op_sel:[0,1] ; encoding: [0xc8,0x40,0x9c,0xd5,0x81,0x00,0x00,0x00] + +v_mov_b16_e64 v0.l, 0x1234 +// GFX11: v_mov_b16_e64 v0.l, 0x1234 ; encoding: [0x00,0x00,0x9c,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00] + v_mov_b32 v5, v1 // GFX11: v_mov_b32_e32 v5, v1 ; encoding: [0x01,0x03,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_alias-fake16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_alias-fake16.s new file mode 100644 index 
0000000000000..34f519e1e0e47 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_alias-fake16.s @@ -0,0 +1,15 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --unique --version 5 +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-real-true16 -show-encoding %s | FileCheck -check-prefix=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,-real-true16 -show-encoding %s | FileCheck -check-prefix=GFX11 %s + +v_cvt_pknorm_i16_f16 v5, v1, v2 +// GFX11: v_cvt_pk_norm_i16_f16 v5, v1, v2 ; encoding: [0x05,0x00,0x12,0xd7,0x01,0x05,0x02,0x00] + +v_cvt_pknorm_u16_f16 v5, v1, v2 +// GFX11: v_cvt_pk_norm_u16_f16 v5, v1, v2 ; encoding: [0x05,0x00,0x13,0xd7,0x01,0x05,0x02,0x00] + +v_add3_nc_u32 v5, v1, v2, s3 +// GFX11: v_add3_u32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x55,0xd6,0x01,0x05,0x0e,0x00] + +v_xor_add_u32 v5, v1, v2, s3 +// GFX11: v_xad_u32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x45,0xd6,0x01,0x05,0x0e,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_alias.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_alias.s index f6ac190e16285..b34c94de7160a 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_alias.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_alias.s @@ -1,12 +1,12 @@ // NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --unique --version 5 -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck -check-prefix=GFX11 %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck -check-prefix=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck -check-prefix=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck -check-prefix=GFX11 %s -v_cvt_pknorm_i16_f16 v5, v1, v2 -// GFX11: v_cvt_pk_norm_i16_f16 v5, v1, v2 ; encoding: 
[0x05,0x00,0x12,0xd7,0x01,0x05,0x02,0x00] +v_cvt_pknorm_i16_f16 v5, v1.l, v2.l +// GFX11: v_cvt_pk_norm_i16_f16 v5, v1.l, v2.l ; encoding: [0x05,0x00,0x12,0xd7,0x01,0x05,0x02,0x00] -v_cvt_pknorm_u16_f16 v5, v1, v2 -// GFX11: v_cvt_pk_norm_u16_f16 v5, v1, v2 ; encoding: [0x05,0x00,0x13,0xd7,0x01,0x05,0x02,0x00] +v_cvt_pknorm_u16_f16 v5, v1.l, v2.l +// GFX11: v_cvt_pk_norm_u16_f16 v5, v1.l, v2.l ; encoding: [0x05,0x00,0x13,0xd7,0x01,0x05,0x02,0x00] v_add3_nc_u32 v5, v1, v2, s3 // GFX11: v_add3_u32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x55,0xd6,0x01,0x05,0x0e,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_features.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_features.s new file mode 100644 index 0000000000000..e15a48a469a2a --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_features.s @@ -0,0 +1,77 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --unique --version 5 +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX11,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX11,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,+real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefixes=W32-ERR,GFX11-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefixes=W64-ERR,GFX11-ERR --implicit-check-not=error: %s + +//===----------------------------------------------------------------------===// +// HW correctly handles fp inline constants for src2 (they have f16 type). +// Check that inline constants are not converted to literals. 
+//===----------------------------------------------------------------------===// + +v_cmp_class_f16_e64 s[10:11], v1.l, 0.5 +// W64: v_cmp_class_f16_e64 s[10:11], v1.l, 0.5 ; encoding: [0x0a,0x00,0x7d,0xd4,0x01,0xe1,0x01,0x00] +// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_cmp_class_f16_e64 s10, v1.l, 0.5 +// W32: v_cmp_class_f16_e64 s10, v1.l, 0.5 ; encoding: [0x0a,0x00,0x7d,0xd4,0x01,0xe1,0x01,0x00] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_cmpx_class_f16_e64 v1.l, 0.5 +// GFX11: v_cmpx_class_f16_e64 v1.l, 0.5 ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0xe1,0x01,0x00] + +//===----------------------------------------------------------------------===// +// src0 and src2 are packed operands. +// Check that op_sel is not allowed with these operands. +//===----------------------------------------------------------------------===// + +v_dot2_f16_f16_e64 v0.l, v1.h, v2, v3.l +// GFX11-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_dot2_f16_f16_e64_dpp v0.l, v1, v2.h, v3.l dpp8:[0,1,2,3,4,4,4,4] +// GFX11-ERR: :[[@LINE-1]]:34: error: invalid operand for instruction + +v_dot2_f16_f16_e64_dpp v0.l, v1.h, v2, v3.l quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 +// GFX11-ERR: :[[@LINE-1]]:30: error: invalid operand for instruction + +v_dot2_bf16_bf16_e64 v0.l, v1, v2.h, v3.l +// GFX11-ERR: :[[@LINE-1]]:32: error: invalid operand for instruction + +v_dot2_bf16_bf16_e64_dpp v0.l, v1.h, v2, v3.l dpp8:[0,1,2,3,4,4,4,4] +// GFX11-ERR: :[[@LINE-1]]:32: error: invalid operand for instruction + +v_dot2_bf16_bf16_e64_dpp v0.l, v1, v2.h, v3.l quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 +// GFX11-ERR: :[[@LINE-1]]:36: error: invalid operand for instruction + +//===----------------------------------------------------------------------===// +// src0 and src1 are vector operands. +// Check that SGPRs are not allowed for these operands. 
+//===----------------------------------------------------------------------===// + +v_dot2_f16_f16_e64_dpp v0.l, s1, v2, v3.l dpp8:[0,1,2,3,4,4,4,4] +// GFX11-ERR: :[[@LINE-1]]:30: error: invalid operand for instruction + +v_dot2_f16_f16_e64_dpp v0.l, v1, s2, v3.l dpp8:[0,1,2,3,4,4,4,4] +// GFX11-ERR: :[[@LINE-1]]:34: error: invalid operand for instruction + +v_dot2_f16_f16_e64_dpp v0.l, s1, v2, v3.l quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 +// GFX11-ERR: :[[@LINE-1]]:30: error: invalid operand for instruction + +v_dot2_f16_f16_e64_dpp v0.l, v1, s2, v3.l quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 +// GFX11-ERR: :[[@LINE-1]]:34: error: invalid operand for instruction + +v_dot2_bf16_bf16_e64_dpp v0.l, s1, v2, v3.l dpp8:[0,1,2,3,4,4,4,4] +// GFX11-ERR: :[[@LINE-1]]:32: error: invalid operand for instruction + +v_dot2_bf16_bf16_e64_dpp v0.l, v1, s2, v3.l dpp8:[0,1,2,3,4,4,4,4] +// GFX11-ERR: :[[@LINE-1]]:36: error: invalid operand for instruction + +v_dot2_bf16_bf16_e64_dpp v0.l, s1, v2, v3.l quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 +// GFX11-ERR: :[[@LINE-1]]:32: error: invalid operand for instruction + +v_dot2_bf16_bf16_e64_dpp v0.l, v1, s2, v3.l quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 +// GFX11-ERR: :[[@LINE-1]]:36: error: invalid operand for instruction + +// Ensure bits 8-15 are not zeroed out and .h which should be present on src0 and dst are present. 
+v_mul_f16_e64 v5.h, v1.h, v2.l +// GFX11: v_mul_f16_e64 v5.h, v1.h, v2.l op_sel:[1,0,1] ; encoding: [0x05,0x48,0x35,0xd5,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s index e21e5bf827ed1..fe8858dabfbc2 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s @@ -2696,6 +2696,37 @@ v_log_f32 v5, src_scc v_log_f32 v255, 0xaf123456 // GFX12: v_log_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x4e,0xfe,0x7f,0x56,0x34,0x12,0xaf] +v_mov_b16_e32 v0.l, v1.l +// GFX12: v_mov_b16_e32 v0.l, v1.l ; encoding: [0x01,0x39,0x00,0x7e] + +v_mov_b16_e32 v0.l, s1 +// GFX12: v_mov_b16_e32 v0.l, s1 ; encoding: [0x01,0x38,0x00,0x7e] + +v_mov_b16_e32 v0.h, 0 +// GFX12: v_mov_b16_e32 v0.h, 0 ; encoding: [0x80,0x38,0x00,0x7f] + +v_mov_b16_e32 v0.h, 1.0 +// GFX12-ASM: v_mov_b16_e32 v0.h, 1.0 ; encoding: [0xf2,0x38,0x00,0x7f] +// GFX12-DIS: v_mov_b16_e32 v0.h, 0x3c00 ; encoding: [0xff,0x38,0x00,0x7f,0x00,0x3c,0x00,0x00] + +v_mov_b16_e32 v0.l, 0x1234 +// GFX12: v_mov_b16_e32 v0.l, 0x1234 ; encoding: [0xff,0x38,0x00,0x7e,0x34,0x12,0x00,0x00] + +v_mov_b16_e64 v0.l, v1.l +// GFX12: v_mov_b16_e64 v0.l, v1.l ; encoding: [0x00,0x00,0x9c,0xd5,0x01,0x01,0x00,0x00] + +v_mov_b16_e64 v200.l, v1.h +// GFX12: v_mov_b16_e64 v200.l, v1.h op_sel:[1,0] ; encoding: [0xc8,0x08,0x9c,0xd5,0x01,0x01,0x00,0x00] + +v_mov_b16_e64 v0.l, s1 +// GFX12: v_mov_b16_e64 v0.l, s1 ; encoding: [0x00,0x00,0x9c,0xd5,0x01,0x00,0x00,0x00] + +v_mov_b16_e64 v200.h, 1 +// GFX12: v_mov_b16_e64 v200.h, 1 op_sel:[0,1] ; encoding: [0xc8,0x40,0x9c,0xd5,0x81,0x00,0x00,0x00] + +v_mov_b16_e64 v0.l, 0x1234 +// GFX12: v_mov_b16_e64 v0.l, 0x1234 ; encoding: [0x00,0x00,0x9c,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00] + v_mov_b32 v5, v1 // GFX12: v_mov_b32_e32 v5, v1 ; encoding: [0x01,0x03,0x0a,0x7e] From 6cfec29cb9bc44ec907eeda99df508985ecbd49b Mon Sep 17 00:00:00 2001 From: Arvind Sudarsanam Date: Wed, 16 Apr 2025 10:13:30 -0400 Subject: [PATCH 
117/710] [Offload][SYCL] Refactor OffloadKind implementation (#135809) Following are the changes: 1. Make OffloadKind enum values to be powers of two so we can use them like a bitfield 2. Include OFK_SYCL enum value 3. Modify ActiveOffloadKinds support in clang-linker-wrapper to use bitfields instead of a vector. Thanks --------- Signed-off-by: Arvind Sudarsanam --- clang/test/CodeGenCUDA/offloading-entries.cu | 24 +++++++++---------- clang/test/Driver/linker-wrapper-image.c | 2 +- .../ClangLinkerWrapper.cpp | 10 ++++---- llvm/include/llvm/Object/OffloadBinary.h | 9 +++---- 4 files changed, 24 insertions(+), 21 deletions(-) diff --git a/clang/test/CodeGenCUDA/offloading-entries.cu b/clang/test/CodeGenCUDA/offloading-entries.cu index c053cf586f8f5..ac0680ff08ea2 100644 --- a/clang/test/CodeGenCUDA/offloading-entries.cu +++ b/clang/test/CodeGenCUDA/offloading-entries.cu @@ -34,17 +34,17 @@ // HIP: @managed.managed = global i32 0, align 4 // HIP: @managed = externally_initialized global ptr null // HIP: @.offloading.entry_name = internal unnamed_addr constant [8 x i8] c"_Z3foov\00", section ".llvm.rodata.offloading" -// HIP: @.offloading.entry._Z3foov = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 3, i32 0, ptr @_Z3foov, ptr @.offloading.entry_name, i64 0, i64 0, ptr null }, section "llvm_offload_entries" +// HIP: @.offloading.entry._Z3foov = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 4, i32 0, ptr @_Z3foov, ptr @.offloading.entry_name, i64 0, i64 0, ptr null }, section "llvm_offload_entries" // HIP: @.offloading.entry_name.1 = internal unnamed_addr constant [11 x i8] c"_Z6kernelv\00", section ".llvm.rodata.offloading" -// HIP: @.offloading.entry._Z6kernelv = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 3, i32 0, ptr @_Z6kernelv, ptr @.offloading.entry_name.1, i64 0, i64 0, ptr null }, section "llvm_offload_entries" +// HIP: @.offloading.entry._Z6kernelv = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, 
i16 4, i32 0, ptr @_Z6kernelv, ptr @.offloading.entry_name.1, i64 0, i64 0, ptr null }, section "llvm_offload_entries" // HIP: @.offloading.entry_name.2 = internal unnamed_addr constant [4 x i8] c"var\00", section ".llvm.rodata.offloading" -// HIP: @.offloading.entry.var = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 3, i32 0, ptr @var, ptr @.offloading.entry_name.2, i64 4, i64 0, ptr null }, section "llvm_offload_entries" +// HIP: @.offloading.entry.var = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 4, i32 0, ptr @var, ptr @.offloading.entry_name.2, i64 4, i64 0, ptr null }, section "llvm_offload_entries" // HIP: @.offloading.entry_name.3 = internal unnamed_addr constant [8 x i8] c"managed\00", section ".llvm.rodata.offloading" -// HIP: @.offloading.entry.managed = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 3, i32 1, ptr @managed.managed, ptr @.offloading.entry_name.3, i64 4, i64 4, ptr @managed }, section "llvm_offload_entries" +// HIP: @.offloading.entry.managed = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 4, i32 1, ptr @managed.managed, ptr @.offloading.entry_name.3, i64 4, i64 4, ptr @managed }, section "llvm_offload_entries" // HIP: @.offloading.entry_name.4 = internal unnamed_addr constant [5 x i8] c"surf\00", section ".llvm.rodata.offloading" -// HIP: @.offloading.entry.surf = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 3, i32 2, ptr @surf, ptr @.offloading.entry_name.4, i64 4, i64 1, ptr null }, section "llvm_offload_entries" +// HIP: @.offloading.entry.surf = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 4, i32 2, ptr @surf, ptr @.offloading.entry_name.4, i64 4, i64 1, ptr null }, section "llvm_offload_entries" // HIP: @.offloading.entry_name.5 = internal unnamed_addr constant [4 x i8] c"tex\00", section ".llvm.rodata.offloading" -// HIP: @.offloading.entry.tex = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 3, i32 3, ptr @tex, ptr 
@.offloading.entry_name.5, i64 4, i64 1, ptr null }, section "llvm_offload_entries" +// HIP: @.offloading.entry.tex = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 4, i32 3, ptr @tex, ptr @.offloading.entry_name.5, i64 4, i64 1, ptr null }, section "llvm_offload_entries" //. // CUDA-COFF: @managed = dso_local global i32 undef, align 4 // CUDA-COFF: @.offloading.entry_name = internal unnamed_addr constant [8 x i8] c"_Z3foov\00", section ".llvm.rodata.offloading" @@ -63,17 +63,17 @@ // HIP-COFF: @managed.managed = dso_local global i32 0, align 4 // HIP-COFF: @managed = dso_local externally_initialized global ptr null // HIP-COFF: @.offloading.entry_name = internal unnamed_addr constant [8 x i8] c"_Z3foov\00", section ".llvm.rodata.offloading" -// HIP-COFF: @.offloading.entry._Z3foov = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 3, i32 0, ptr @_Z3foov, ptr @.offloading.entry_name, i64 0, i64 0, ptr null }, section "llvm_offload_entries$OE" +// HIP-COFF: @.offloading.entry._Z3foov = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 4, i32 0, ptr @_Z3foov, ptr @.offloading.entry_name, i64 0, i64 0, ptr null }, section "llvm_offload_entries$OE" // HIP-COFF: @.offloading.entry_name.1 = internal unnamed_addr constant [11 x i8] c"_Z6kernelv\00", section ".llvm.rodata.offloading" -// HIP-COFF: @.offloading.entry._Z6kernelv = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 3, i32 0, ptr @_Z6kernelv, ptr @.offloading.entry_name.1, i64 0, i64 0, ptr null }, section "llvm_offload_entries$OE" +// HIP-COFF: @.offloading.entry._Z6kernelv = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 4, i32 0, ptr @_Z6kernelv, ptr @.offloading.entry_name.1, i64 0, i64 0, ptr null }, section "llvm_offload_entries$OE" // HIP-COFF: @.offloading.entry_name.2 = internal unnamed_addr constant [4 x i8] c"var\00", section ".llvm.rodata.offloading" -// HIP-COFF: @.offloading.entry.var = weak constant %struct.__tgt_offload_entry { 
i64 0, i16 1, i16 3, i32 0, ptr @var, ptr @.offloading.entry_name.2, i64 4, i64 0, ptr null }, section "llvm_offload_entries$OE" +// HIP-COFF: @.offloading.entry.var = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 4, i32 0, ptr @var, ptr @.offloading.entry_name.2, i64 4, i64 0, ptr null }, section "llvm_offload_entries$OE" // HIP-COFF: @.offloading.entry_name.3 = internal unnamed_addr constant [8 x i8] c"managed\00", section ".llvm.rodata.offloading" -// HIP-COFF: @.offloading.entry.managed = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 3, i32 1, ptr @managed.managed, ptr @.offloading.entry_name.3, i64 4, i64 4, ptr @managed }, section "llvm_offload_entries$OE" +// HIP-COFF: @.offloading.entry.managed = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 4, i32 1, ptr @managed.managed, ptr @.offloading.entry_name.3, i64 4, i64 4, ptr @managed }, section "llvm_offload_entries$OE" // HIP-COFF: @.offloading.entry_name.4 = internal unnamed_addr constant [5 x i8] c"surf\00", section ".llvm.rodata.offloading" -// HIP-COFF: @.offloading.entry.surf = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 3, i32 2, ptr @surf, ptr @.offloading.entry_name.4, i64 4, i64 1, ptr null }, section "llvm_offload_entries$OE" +// HIP-COFF: @.offloading.entry.surf = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 4, i32 2, ptr @surf, ptr @.offloading.entry_name.4, i64 4, i64 1, ptr null }, section "llvm_offload_entries$OE" // HIP-COFF: @.offloading.entry_name.5 = internal unnamed_addr constant [4 x i8] c"tex\00", section ".llvm.rodata.offloading" -// HIP-COFF: @.offloading.entry.tex = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 3, i32 3, ptr @tex, ptr @.offloading.entry_name.5, i64 4, i64 1, ptr null }, section "llvm_offload_entries$OE" +// HIP-COFF: @.offloading.entry.tex = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 4, i32 3, ptr @tex, ptr @.offloading.entry_name.5, i64 4, i64 1, ptr 
null }, section "llvm_offload_entries$OE" //. // CUDA-LABEL: @_Z18__device_stub__foov( // CUDA-NEXT: entry: diff --git a/clang/test/Driver/linker-wrapper-image.c b/clang/test/Driver/linker-wrapper-image.c index 775385137c75f..c0de56d58196a 100644 --- a/clang/test/Driver/linker-wrapper-image.c +++ b/clang/test/Driver/linker-wrapper-image.c @@ -220,7 +220,7 @@ // HIP-NEXT: %constant = lshr i32 %11, 4 // HIP-NEXT: %12 = and i32 %flags, 32 // HIP-NEXT: %normalized = lshr i32 %12, 5 -// HIP-NEXT: %13 = icmp eq i16 %kind, 3 +// HIP-NEXT: %13 = icmp eq i16 %kind, 4 // HIP-NEXT: br i1 %13, label %if.kind, label %if.end // // HIP: if.kind: diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp index 52d922abbcaec..082355e6c716f 100644 --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -923,10 +923,9 @@ Expected> linkAndWrapDeviceFiles( }); auto LinkerArgs = getLinkerArgs(Input, BaseArgs); - DenseSet ActiveOffloadKinds; + uint16_t ActiveOffloadKindMask = 0u; for (const auto &File : Input) - if (File.getBinary()->getOffloadKind() != OFK_None) - ActiveOffloadKinds.insert(File.getBinary()->getOffloadKind()); + ActiveOffloadKindMask |= File.getBinary()->getOffloadKind(); // Write any remaining device inputs to an output file. SmallVector InputFiles; @@ -943,7 +942,10 @@ Expected> linkAndWrapDeviceFiles( return OutputOrErr.takeError(); // Store the offloading image for each linked output file. 
- for (OffloadKind Kind : ActiveOffloadKinds) { + for (OffloadKind Kind = OFK_OpenMP; Kind != OFK_LAST; + Kind = static_cast((uint16_t)(Kind) << 1)) { + if ((ActiveOffloadKindMask & Kind) == 0) + continue; llvm::ErrorOr> FileOrErr = llvm::MemoryBuffer::getFileOrSTDIN(*OutputOrErr); if (std::error_code EC = FileOrErr.getError()) { diff --git a/llvm/include/llvm/Object/OffloadBinary.h b/llvm/include/llvm/Object/OffloadBinary.h index c02aec8d956ed..a3b78b8ec6261 100644 --- a/llvm/include/llvm/Object/OffloadBinary.h +++ b/llvm/include/llvm/Object/OffloadBinary.h @@ -32,10 +32,11 @@ namespace object { /// The producer of the associated offloading image. enum OffloadKind : uint16_t { OFK_None = 0, - OFK_OpenMP, - OFK_Cuda, - OFK_HIP, - OFK_LAST, + OFK_OpenMP = (1 << 0), + OFK_Cuda = (1 << 1), + OFK_HIP = (1 << 2), + OFK_SYCL = (1 << 3), + OFK_LAST = (1 << 4), }; /// The type of contents the offloading image contains. From de90487fc17fb928de7d0cd75d47a44db5181c14 Mon Sep 17 00:00:00 2001 From: Lukacma Date: Wed, 16 Apr 2025 15:20:05 +0100 Subject: [PATCH 118/710] [AARCH64] Add FEAT_SSVE_FEXPA and fix unsupported features list (#134368) This patch adds new feature introduced in [2025-03 release](https://developer.arm.com/documentation/ddi0602/2025-03/SVE-Instructions/FEXPA--Floating-point-exponential-accelerator-) and changes feature requirements for fexpa instructions and intrinsics. Additionally it fixes unsupported features list by moving fearures dependent on sme2p1 to correct location. 
--- clang/include/clang/Basic/arm_sve.td | 6 ++++-- .../CodeGen/AArch64/sve-intrinsics/acle_sve_expa.c | 14 +++++++++++--- .../Driver/print-supported-extensions-aarch64.c | 1 + llvm/lib/Target/AArch64/AArch64.td | 9 +++++---- llvm/lib/Target/AArch64/AArch64Features.td | 4 +++- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 4 ++++ llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td | 2 +- llvm/test/CodeGen/AArch64/sve-intrinsics-fexpa.ll | 2 +- llvm/test/MC/AArch64/SVE/fexpa.s | 8 ++++---- 9 files changed, 34 insertions(+), 16 deletions(-) diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index 35263541b67ae..f09f40ce9202e 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -836,9 +836,11 @@ defm SVRINTP : SInstZPZ<"svrintp", "hfd", "aarch64_sve_frintp">; defm SVRINTX : SInstZPZ<"svrintx", "hfd", "aarch64_sve_frintx">; defm SVRINTZ : SInstZPZ<"svrintz", "hfd", "aarch64_sve_frintz">; defm SVSQRT : SInstZPZ<"svsqrt", "hfd", "aarch64_sve_fsqrt">; - +def SVEXPA : SInst<"svexpa[_{d}]", "du", "hfd", MergeNone, "aarch64_sve_fexpa_x", [VerifyRuntimeMode]>{ + let SVETargetGuard = "sve"; + let SMETargetGuard = "sme2,ssve-fexpa"; +} let SVETargetGuard = "sve", SMETargetGuard = InvalidMode in { -def SVEXPA : SInst<"svexpa[_{d}]", "du", "hfd", MergeNone, "aarch64_sve_fexpa_x">; def SVTMAD : SInst<"svtmad[_{d}]", "dddi", "hfd", MergeNone, "aarch64_sve_ftmad_x", [], [ImmCheck<2, ImmCheck0_7>]>; def SVTSMUL : SInst<"svtsmul[_{d}]", "ddu", "hfd", MergeNone, "aarch64_sve_ftsmul_x">; def SVTSSEL : SInst<"svtssel[_{d}]", "ddu", "hfd", MergeNone, "aarch64_sve_ftssel_x">; diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_expa.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_expa.c index 52b6822a833f7..8c34017b7750b 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_expa.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_expa.c @@ -1,10 +1,12 @@ // NOTE: Assertions have 
been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +ssve-fexpa -target-feature +sme2 -target-feature +sme -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +ssve-fexpa -target-feature +sme2 -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #ifdef SVE_OVERLOADED_FORMS @@ -14,6 +16,12 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif +#ifdef __ARM_FEATURE_SME +#define STREAMING __arm_streaming +#else +#define STREAMING +#endif + // CHECK-LABEL: @test_svexpa_f16( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fexpa.x.nxv8f16( [[OP:%.*]]) @@ -24,7 +32,7 @@ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fexpa.x.nxv8f16( [[OP:%.*]]) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat16_t test_svexpa_f16(svuint16_t op) +svfloat16_t test_svexpa_f16(svuint16_t op) STREAMING { return SVE_ACLE_FUNC(svexpa,_f16,,)(op); } @@ -39,7 +47,7 @@ svfloat16_t 
test_svexpa_f16(svuint16_t op) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fexpa.x.nxv4f32( [[OP:%.*]]) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat32_t test_svexpa_f32(svuint32_t op) +svfloat32_t test_svexpa_f32(svuint32_t op) STREAMING { return SVE_ACLE_FUNC(svexpa,_f32,,)(op); } @@ -54,7 +62,7 @@ svfloat32_t test_svexpa_f32(svuint32_t op) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fexpa.x.nxv2f64( [[OP:%.*]]) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat64_t test_svexpa_f64(svuint64_t op) +svfloat64_t test_svexpa_f64(svuint64_t op) STREAMING { return SVE_ACLE_FUNC(svexpa,_f64,,)(op); } diff --git a/clang/test/Driver/print-supported-extensions-aarch64.c b/clang/test/Driver/print-supported-extensions-aarch64.c index 38a3f54eb4794..539c1937a9712 100644 --- a/clang/test/Driver/print-supported-extensions-aarch64.c +++ b/clang/test/Driver/print-supported-extensions-aarch64.c @@ -81,6 +81,7 @@ // CHECK-NEXT: ssbs FEAT_SSBS, FEAT_SSBS2 Enable Speculative Store Bypass Safe bit // CHECK-NEXT: ssve-aes FEAT_SSVE_AES Enable Armv9.6-A SVE AES support in streaming SVE mode // CHECK-NEXT: ssve-bitperm FEAT_SSVE_BitPerm Enable Armv9.6-A SVE BitPerm support in streaming SVE mode +// CHECK-NEXT: ssve-fexpa FEAT_SSVE_FEXPA Enable SVE FEXPA instruction in Streaming SVE mode // CHECK-NEXT: ssve-fp8dot2 FEAT_SSVE_FP8DOT2 Enable SVE2 FP8 2-way dot product instructions // CHECK-NEXT: ssve-fp8dot4 FEAT_SSVE_FP8DOT4 Enable SVE2 FP8 4-way dot product instructions // CHECK-NEXT: ssve-fp8fma FEAT_SSVE_FP8FMA Enable SVE2 FP8 multiply-add instructions diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index 3677f669c3481..29dd6227ba021 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -74,13 +74,14 @@ def SVEUnsupported : AArch64Unsupported { } let F = [HasSME2p2, HasSVE2p2_or_SME2p2, HasNonStreamingSVE_or_SME2p2, - HasNonStreamingSVE2p2_or_SME2p2, HasNonStreamingSVE2_or_SSVE_BitPerm, 
- HasSME_MOP4, HasSME_TMOP] in + HasNonStreamingSVE2p2_or_SME2p2] in def SME2p2Unsupported : AArch64Unsupported; def SME2p1Unsupported : AArch64Unsupported { - let F = !listconcat([HasSME2p1, HasSVE2p1_or_SME2p1, HasNonStreamingSVE2p1_or_SSVE_AES], - SME2p2Unsupported.F); + let F = !listconcat([HasSME2p1, HasSVE2p1_or_SME2p1, HasNonStreamingSVE2p1_or_SSVE_AES, + HasSME_MOP4, HasSME_TMOP, HasNonStreamingSVE_or_SSVE_FEXPA, + HasNonStreamingSVE2_or_SSVE_BitPerm], + SME2p2Unsupported.F); } def SME2Unsupported : AArch64Unsupported { diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td index 357f526d5e308..f4f931a5cdab1 100644 --- a/llvm/lib/Target/AArch64/AArch64Features.td +++ b/llvm/lib/Target/AArch64/AArch64Features.td @@ -576,7 +576,9 @@ def FeatureSME_MOP4: ExtensionWithMArch<"sme-mop4", "SME_MOP4", "FEAT_SME_MOP4", def FeatureSME_TMOP: ExtensionWithMArch<"sme-tmop", "SME_TMOP", "FEAT_SME_TMOP", "Enable SME Structured sparsity outer product instructions.", [FeatureSME2]>; -//===----------------------------------------------------------------------===// +def FeatureSSVE_FEXPA : ExtensionWithMArch<"ssve-fexpa", "SSVE_FEXPA", "FEAT_SSVE_FEXPA", + "Enable SVE FEXPA instruction in Streaming SVE mode", [FeatureSME2]>; + // Other Features //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 99f2b79d31bb7..a7a01ed785afa 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -294,6 +294,10 @@ def HasNonStreamingSVE2_or_SSVE_BitPerm : Predicate<"(Subtarget->isSVEAvailable() && Subtarget->hasSVE2()) ||" "(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSSVE_BitPerm())">, AssemblerPredicateWithAll<(any_of FeatureSVE2, FeatureSSVE_BitPerm), "sve2 or ssve-bitperm">; +def HasNonStreamingSVE_or_SSVE_FEXPA + : 
Predicate<"(Subtarget->isSVEAvailable() && Subtarget->hasSVE()) ||" + "(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSSVE_FEXPA())">, + AssemblerPredicateWithAll<(any_of FeatureSVE, FeatureSSVE_FEXPA), "sve or ssve-fexpa">; // A subset of NEON instructions are legal in Streaming SVE execution mode, // so don't need the additional check for 'isNeonAvailable'. diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index a2f326c994c2f..b40c82a25e7ba 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -980,7 +980,7 @@ let Predicates = [HasSVE_or_SME] in { def MOVPRFX_ZZ : sve_int_bin_cons_misc_0_c<0b00000001, "movprfx", ZPRAny>; } // End HasSVE_or_SME -let Predicates = [HasNonStreamingSVE_or_SME2p2] in { +let Predicates = [HasNonStreamingSVE_or_SSVE_FEXPA] in { defm FEXPA_ZZ : sve_int_bin_cons_misc_0_c_fexpa<"fexpa", int_aarch64_sve_fexpa_x>; } // End HasSVE diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-fexpa.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-fexpa.ll index 00e000f642377..021d4855905e7 100644 --- a/llvm/test/CodeGen/AArch64/sve-intrinsics-fexpa.ll +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-fexpa.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s -; RUN: llc -mtriple=aarch64-linux-gnu -force-streaming -mattr=+sme2p2 < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -force-streaming -mattr=+ssve-fexpa < %s | FileCheck %s define @fexpa_h( %a) { ; CHECK-LABEL: fexpa_h: diff --git a/llvm/test/MC/AArch64/SVE/fexpa.s b/llvm/test/MC/AArch64/SVE/fexpa.s index c51b1e2b1d3e5..c6386255b274e 100644 --- a/llvm/test/MC/AArch64/SVE/fexpa.s +++ b/llvm/test/MC/AArch64/SVE/fexpa.s @@ -1,6 +1,6 @@ // RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve < %s \ // RUN: | FileCheck %s 
--check-prefixes=CHECK-ENCODING,CHECK-INST -// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+ssve-fexpa < %s \ // RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-ERROR @@ -14,17 +14,17 @@ fexpa z0.h, z31.h // CHECK-INST: fexpa z0.h, z31.h // CHECK-ENCODING: [0xe0,0xbb,0x60,0x04] -// CHECK-ERROR: instruction requires: sve or sme2p2 +// CHECK-ERROR: instruction requires: sve or ssve-fexpa // CHECK-UNKNOWN: 0460bbe0 fexpa z0.s, z31.s // CHECK-INST: fexpa z0.s, z31.s // CHECK-ENCODING: [0xe0,0xbb,0xa0,0x04] -// CHECK-ERROR: instruction requires: sve or sme2p2 +// CHECK-ERROR: instruction requires: sve or ssve-fexpa // CHECK-UNKNOWN: 04a0bbe0 fexpa z0.d, z31.d // CHECK-INST: fexpa z0.d, z31.d // CHECK-ENCODING: [0xe0,0xbb,0xe0,0x04] -// CHECK-ERROR: instruction requires: sve or sme2p2 +// CHECK-ERROR: instruction requires: sve or ssve-fexpa // CHECK-UNKNOWN: 04e0bbe0 From 9483aaaaaa427b5dcb9a7af8f232a4696eef94bf Mon Sep 17 00:00:00 2001 From: Benjamin Chetioui <3920784+bchetioui@users.noreply.github.com> Date: Wed, 16 Apr 2025 16:32:14 +0200 Subject: [PATCH 119/710] [bazel] Fix bazel build after 00eaff3 #2. (#135962) The linter messed up the order of includes, which is necessary as is. 
--- .../Dialect/Bufferization/IR/BufferizationTypeInterfaces.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationTypeInterfaces.h b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationTypeInterfaces.h index 8672aa60a43c5..5faa1479ee542 100644 --- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationTypeInterfaces.h +++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationTypeInterfaces.h @@ -13,7 +13,8 @@ // Bufferization Type Interfaces //===----------------------------------------------------------------------===// -#include "mlir/Dialect/Bufferization/IR/BufferizationTypeInterfaces.h.inc" #include "mlir/IR/Types.h" +#include "mlir/Dialect/Bufferization/IR/BufferizationTypeInterfaces.h.inc" + #endif // MLIR_DIALECT_BUFFERIZATION_IR_BUFFERIZATIONTYPEINTERFACES_H_ From 183cb45c1280b80a0022649d1db8a93544bb97b0 Mon Sep 17 00:00:00 2001 From: John Harrison Date: Wed, 16 Apr 2025 07:52:16 -0700 Subject: [PATCH 120/710] [lldb-dap] Fixing a race during disconnect. (#135872) While attempting to disconnect the DAP transport reader thread is setting `disconnecting` as soon as it sees a [disconnect request](https://microsoft.github.io/debug-adapter-protocol/specification#Requests_Disconnect). However, if it is processing another request when this disconnect arrives the `DAP::Loop` handler may exit the loop without replying to the disconnect request. There has been some instability on the CI jobs due to this race, for example https://lab.llvm.org/buildbot/#/builders/59/builds/16076 To address this, ensure we only return from `DAP::Loop` once we've emptied the queue. 
--- lldb/tools/lldb-dap/DAP.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/tools/lldb-dap/DAP.cpp b/lldb/tools/lldb-dap/DAP.cpp index b752e9cfaeb85..597fe3a1e323b 100644 --- a/lldb/tools/lldb-dap/DAP.cpp +++ b/lldb/tools/lldb-dap/DAP.cpp @@ -921,7 +921,7 @@ llvm::Error DAP::Loop() { StopEventHandlers(); }); - while (!disconnecting) { + while (!disconnecting || !m_queue.empty()) { std::unique_lock lock(m_queue_mutex); m_queue_cv.wait(lock, [&] { return disconnecting || !m_queue.empty(); }); From ef1abbe32e66c16118ded6dd9f7b1a55dea8c2b6 Mon Sep 17 00:00:00 2001 From: Yonah Goldberg Date: Wed, 16 Apr 2025 08:02:34 -0700 Subject: [PATCH 121/710] [NVPTX] Remove extraneous initializeNVVMReflectLegacyPassPass declaration (#135825) This was already declared in NVPTX.h and I accidentally added it back in #134416. --- llvm/lib/Target/NVPTX/NVVMReflect.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp index 7273b30e4ae2e..30b522efa4361 100644 --- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp +++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp @@ -52,10 +52,6 @@ using namespace llvm; #define DEBUG_TYPE "nvvm-reflect" -namespace llvm { -void initializeNVVMReflectLegacyPassPass(PassRegistry &); -} // namespace llvm - namespace { class NVVMReflect { // Map from reflect function call arguments to the value to replace the call From 99c08ff1cb96fc4f471aca0dd253060b3f32e8bc Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Wed, 16 Apr 2025 17:05:53 +0200 Subject: [PATCH 122/710] Revert "[clang] Unify `SourceLocation` and `IdentifierInfo*` pair-like data structures to `IdentifierLoc`" (#135974) Reverts llvm/llvm-project#135808 Example from the LLDB macOS CI: https://green.lab.llvm.org/job/llvm.org/view/LLDB/job/as-lldb-cmake/24084/execution/node/54/log/?consoleFull ``` 
/Users/ec2-user/jenkins/workspace/llvm.org/as-lldb-cmake/llvm-project/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.cpp:360:49: error: no viable conversion from 'std::pair' to 'clang::ModuleIdPath' (aka 'ArrayRef') clang::Module *top_level_module = DoGetModule(clang_path.front(), false); ^~~~~~~~~~~~~~~~~~ /Users/ec2-user/jenkins/workspace/llvm.org/as-lldb-cmake/llvm-project/llvm/include/llvm/ADT/ArrayRef.h:41:40: note: candidate constructor (the implicit copy constructor) not viable: no known conversion from 'std::pair' to 'const llvm::ArrayRef &' for 1st argument class LLVM_GSL_POINTER [[nodiscard]] ArrayRef { ^ /Users/ec2-user/jenkins/workspace/llvm.org/as-lldb-cmake/llvm-project/llvm/include/llvm/ADT/ArrayRef.h:41:40: note: candidate constructor (the implicit move constructor) not viable: no known conversion from 'std::pair' to 'llvm::ArrayRef &&' for 1st argument /Users/ec2-user/jenkins/workspace/llvm.org/as-lldb-cmake/llvm-project/llvm/include/llvm/ADT/ArrayRef.h:70:18: note: candidate constructor not viable: no known conversion from 'std::pair' to 'std::nullopt_t' for 1st argument /*implicit*/ ArrayRef(std::nullopt_t) {} ``` --- .../pp-trace/PPCallbacksTracker.cpp | 4 +- clang/include/clang/AST/OpenACCClause.h | 20 +-- clang/include/clang/Basic/IdentifierTable.h | 26 +--- clang/include/clang/Lex/ModuleLoader.h | 3 +- clang/include/clang/Lex/PPCallbacks.h | 1 - clang/include/clang/Lex/Preprocessor.h | 9 +- clang/include/clang/Parse/LoopHint.h | 2 +- clang/include/clang/Parse/Parser.h | 13 +- clang/include/clang/Sema/ParsedAttr.h | 10 ++ clang/include/clang/Sema/Sema.h | 2 +- clang/include/clang/Sema/SemaCodeCompletion.h | 3 +- clang/include/clang/Sema/SemaObjC.h | 4 +- clang/include/clang/Sema/SemaOpenACC.h | 2 +- clang/lib/AST/OpenACCClause.cpp | 4 +- clang/lib/AST/TextNodeDumper.cpp | 4 +- clang/lib/Frontend/CompilerInstance.cpp | 53 ++++---- clang/lib/Frontend/FrontendActions.cpp | 4 +- clang/lib/Lex/PPDirectives.cpp | 22 ++-- 
clang/lib/Lex/PPLexerChange.cpp | 6 +- clang/lib/Lex/Pragma.cpp | 73 ++++++----- clang/lib/Lex/Preprocessor.cpp | 16 +-- clang/lib/Parse/ParseDecl.cpp | 28 ++-- clang/lib/Parse/ParseExpr.cpp | 7 +- clang/lib/Parse/ParseHLSL.cpp | 2 +- clang/lib/Parse/ParseObjc.cpp | 38 +++--- clang/lib/Parse/ParseOpenACC.cpp | 12 +- clang/lib/Parse/ParsePragma.cpp | 15 ++- clang/lib/Parse/ParseStmt.cpp | 6 +- clang/lib/Parse/Parser.cpp | 19 +-- clang/lib/Sema/ParsedAttr.cpp | 8 ++ clang/lib/Sema/SemaARM.cpp | 2 +- clang/lib/Sema/SemaCodeComplete.cpp | 8 +- clang/lib/Sema/SemaDeclAttr.cpp | 124 +++++++++--------- clang/lib/Sema/SemaDeclObjC.cpp | 35 +++-- clang/lib/Sema/SemaHLSL.cpp | 12 +- clang/lib/Sema/SemaModule.cpp | 42 +++--- clang/lib/Sema/SemaObjC.cpp | 45 ++++--- clang/lib/Sema/SemaOpenACCClause.cpp | 11 +- clang/lib/Sema/SemaStmtAttr.cpp | 29 ++-- clang/lib/Sema/SemaSwift.cpp | 24 ++-- clang/lib/Sema/SemaTemplateVariadic.cpp | 10 +- clang/lib/Sema/SemaType.cpp | 13 +- clang/lib/Serialization/ASTReader.cpp | 2 +- clang/lib/Serialization/ASTWriter.cpp | 8 +- .../DependencyScanning/ModuleDepCollector.cpp | 2 +- 45 files changed, 384 insertions(+), 399 deletions(-) diff --git a/clang-tools-extra/pp-trace/PPCallbacksTracker.cpp b/clang-tools-extra/pp-trace/PPCallbacksTracker.cpp index 4c916fa30685b..3bb30fd15b2e1 100644 --- a/clang-tools-extra/pp-trace/PPCallbacksTracker.cpp +++ b/clang-tools-extra/pp-trace/PPCallbacksTracker.cpp @@ -547,8 +547,8 @@ void PPCallbacksTracker::appendArgument(const char *Name, ModuleIdPath Value) { if (I) SS << ", "; SS << "{" - << "Name: " << Value[I].getIdentifierInfo()->getName() << ", " - << "Loc: " << getSourceLocationString(PP, Value[I].getLoc()) << "}"; + << "Name: " << Value[I].first->getName() << ", " + << "Loc: " << getSourceLocationString(PP, Value[I].second) << "}"; } SS << "]"; appendArgument(Name, SS.str()); diff --git a/clang/include/clang/AST/OpenACCClause.h b/clang/include/clang/AST/OpenACCClause.h index 
f18a6cf62f2c5..681567228cbb0 100644 --- a/clang/include/clang/AST/OpenACCClause.h +++ b/clang/include/clang/AST/OpenACCClause.h @@ -258,7 +258,7 @@ inline bool operator!=(const OpenACCBindClause &LHS, return !(LHS == RHS); } -using DeviceTypeArgument = IdentifierLoc; +using DeviceTypeArgument = std::pair; /// A 'device_type' or 'dtype' clause, takes a list of either an 'asterisk' or /// an identifier. The 'asterisk' means 'the rest'. class OpenACCDeviceTypeClause final @@ -280,16 +280,16 @@ class OpenACCDeviceTypeClause final "Invalid clause kind for device-type"); assert(!llvm::any_of(Archs, [](const DeviceTypeArgument &Arg) { - return Arg.getLoc().isInvalid(); + return Arg.second.isInvalid(); }) && "Invalid SourceLocation for an argument"); - assert((Archs.size() == 1 || - !llvm::any_of(Archs, - [](const DeviceTypeArgument &Arg) { - return Arg.getIdentifierInfo() == nullptr; - })) && - "Only a single asterisk version is permitted, and must be the " - "only one"); + assert( + (Archs.size() == 1 || !llvm::any_of(Archs, + [](const DeviceTypeArgument &Arg) { + return Arg.first == nullptr; + })) && + "Only a single asterisk version is permitted, and must be the " + "only one"); std::uninitialized_copy(Archs.begin(), Archs.end(), getTrailingObjects()); @@ -302,7 +302,7 @@ class OpenACCDeviceTypeClause final } bool hasAsterisk() const { return getArchitectures().size() > 0 && - getArchitectures()[0].getIdentifierInfo() == nullptr; + getArchitectures()[0].first == nullptr; } ArrayRef getArchitectures() const { diff --git a/clang/include/clang/Basic/IdentifierTable.h b/clang/include/clang/Basic/IdentifierTable.h index 1275b056227b5..0347880244a40 100644 --- a/clang/include/clang/Basic/IdentifierTable.h +++ b/clang/include/clang/Basic/IdentifierTable.h @@ -18,7 +18,6 @@ #include "clang/Basic/Builtins.h" #include "clang/Basic/DiagnosticIDs.h" #include "clang/Basic/LLVM.h" -#include "clang/Basic/SourceLocation.h" #include "clang/Basic/TokenKinds.h" #include 
"llvm/ADT/DenseMapInfo.h" #include "llvm/ADT/FoldingSet.h" @@ -77,6 +76,9 @@ inline bool isReservedInAllContexts(ReservedIdentifierStatus Status) { Status != ReservedIdentifierStatus::StartsWithUnderscoreAndIsExternC; } +/// A simple pair of identifier info and location. +using IdentifierLocPair = std::pair; + /// IdentifierInfo and other related classes are aligned to /// 8 bytes so that DeclarationName can use the lower 3 bits /// of a pointer to one of these classes. @@ -1163,28 +1165,6 @@ class SelectorTable { static std::string getPropertyNameFromSetterSelector(Selector Sel); }; -/// A simple pair of identifier info and location. -class IdentifierLoc { - SourceLocation Loc; - IdentifierInfo *II = nullptr; - -public: - IdentifierLoc() = default; - IdentifierLoc(SourceLocation L, IdentifierInfo *Ident) : Loc(L), II(Ident) {} - - void setLoc(SourceLocation L) { Loc = L; } - void setIdentifierInfo(IdentifierInfo *Ident) { II = Ident; } - SourceLocation getLoc() const { return Loc; } - IdentifierInfo *getIdentifierInfo() const { return II; } - - bool operator==(const IdentifierLoc &X) const { - return Loc == X.Loc && II == X.II; - } - - bool operator!=(const IdentifierLoc &X) const { - return Loc != X.Loc || II != X.II; - } -}; } // namespace clang namespace llvm { diff --git a/clang/include/clang/Lex/ModuleLoader.h b/clang/include/clang/Lex/ModuleLoader.h index a58407200c41c..f880a9091a2ed 100644 --- a/clang/include/clang/Lex/ModuleLoader.h +++ b/clang/include/clang/Lex/ModuleLoader.h @@ -14,7 +14,6 @@ #ifndef LLVM_CLANG_LEX_MODULELOADER_H #define LLVM_CLANG_LEX_MODULELOADER_H -#include "clang/Basic/IdentifierTable.h" #include "clang/Basic/LLVM.h" #include "clang/Basic/Module.h" #include "clang/Basic/SourceLocation.h" @@ -30,7 +29,7 @@ class IdentifierInfo; /// A sequence of identifier/location pairs used to describe a particular /// module or submodule, e.g., std.vector. 
-using ModuleIdPath = ArrayRef; +using ModuleIdPath = ArrayRef>; /// Describes the result of attempting to load a module. class ModuleLoadResult { diff --git a/clang/include/clang/Lex/PPCallbacks.h b/clang/include/clang/Lex/PPCallbacks.h index 313b730afbab8..46cc564086f1c 100644 --- a/clang/include/clang/Lex/PPCallbacks.h +++ b/clang/include/clang/Lex/PPCallbacks.h @@ -15,7 +15,6 @@ #define LLVM_CLANG_LEX_PPCALLBACKS_H #include "clang/Basic/DiagnosticIDs.h" -#include "clang/Basic/IdentifierTable.h" #include "clang/Basic/SourceLocation.h" #include "clang/Basic/SourceManager.h" #include "clang/Lex/ModuleLoader.h" diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h index f8f2f567f9171..24bb524783e93 100644 --- a/clang/include/clang/Lex/Preprocessor.h +++ b/clang/include/clang/Lex/Preprocessor.h @@ -327,7 +327,7 @@ class Preprocessor { SourceLocation ModuleImportLoc; /// The import path for named module that we're currently processing. - SmallVector NamedModuleImportPath; + SmallVector, 2> NamedModuleImportPath; llvm::DenseMap> CheckPoints; unsigned CheckPointCounter = 0; @@ -622,7 +622,7 @@ class Preprocessor { /// The identifier and source location of the currently-active /// \#pragma clang arc_cf_code_audited begin. - IdentifierLoc PragmaARCCFCodeAuditedInfo; + std::pair PragmaARCCFCodeAuditedInfo; /// The source location of the currently-active /// \#pragma clang assume_nonnull begin. @@ -1998,7 +1998,8 @@ class Preprocessor { /// arc_cf_code_audited begin. /// /// Returns an invalid location if there is no such pragma active. - IdentifierLoc getPragmaARCCFCodeAuditedInfo() const { + std::pair + getPragmaARCCFCodeAuditedInfo() const { return PragmaARCCFCodeAuditedInfo; } @@ -2006,7 +2007,7 @@ class Preprocessor { /// arc_cf_code_audited begin. An invalid location ends the pragma. 
void setPragmaARCCFCodeAuditedInfo(IdentifierInfo *Ident, SourceLocation Loc) { - PragmaARCCFCodeAuditedInfo = IdentifierLoc(Loc, Ident); + PragmaARCCFCodeAuditedInfo = {Ident, Loc}; } /// The location of the currently-active \#pragma clang diff --git a/clang/include/clang/Parse/LoopHint.h b/clang/include/clang/Parse/LoopHint.h index 72be043d3c5a4..cec5605ea3615 100644 --- a/clang/include/clang/Parse/LoopHint.h +++ b/clang/include/clang/Parse/LoopHint.h @@ -9,12 +9,12 @@ #ifndef LLVM_CLANG_PARSE_LOOPHINT_H #define LLVM_CLANG_PARSE_LOOPHINT_H -#include "clang/Basic/IdentifierTable.h" #include "clang/Basic/SourceLocation.h" namespace clang { class Expr; +struct IdentifierLoc; /// Loop optimization hint for loop and unroll pragmas. struct LoopHint { diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index 662f54d0e8d8a..9ebcf144ba59e 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -1725,8 +1725,8 @@ class Parser : public CodeCompletionHandler { ObjCTypeParamList *parseObjCTypeParamList(); ObjCTypeParamList *parseObjCTypeParamListOrProtocolRefs( ObjCTypeParamListScope &Scope, SourceLocation &lAngleLoc, - SmallVectorImpl &protocolIdents, SourceLocation &rAngleLoc, - bool mayBeProtocolList = true); + SmallVectorImpl &protocolIdents, + SourceLocation &rAngleLoc, bool mayBeProtocolList = true); void HelperActionsForIvarDeclarations(ObjCContainerDecl *interfaceDecl, SourceLocation atLoc, @@ -3818,7 +3818,8 @@ class Parser : public CodeCompletionHandler { SourceLocation Loc, llvm::SmallVectorImpl &IntExprs); /// Parses the 'device-type-list', which is a list of identifiers. - bool ParseOpenACCDeviceTypeList(llvm::SmallVector &Archs); + bool ParseOpenACCDeviceTypeList( + llvm::SmallVector> &Archs); /// Parses the 'async-argument', which is an integral value with two /// 'special' values that are likely negative (but come from Macros). 
OpenACCIntExprParseResult ParseOpenACCAsyncArgument(OpenACCDirectiveKind DK, @@ -3950,8 +3951,10 @@ class Parser : public CodeCompletionHandler { return false; } - bool ParseModuleName(SourceLocation UseLoc, - SmallVectorImpl &Path, bool IsImport); + bool ParseModuleName( + SourceLocation UseLoc, + SmallVectorImpl> &Path, + bool IsImport); //===--------------------------------------------------------------------===// // C++11/G++: Type Traits [Type-Traits.html in the GCC manual] diff --git a/clang/include/clang/Sema/ParsedAttr.h b/clang/include/clang/Sema/ParsedAttr.h index 428d3111de80d..b88b871dc8821 100644 --- a/clang/include/clang/Sema/ParsedAttr.h +++ b/clang/include/clang/Sema/ParsedAttr.h @@ -40,6 +40,7 @@ class LangOptions; class Sema; class Stmt; class TargetInfo; +struct IdentifierLoc; /// Represents information about a change in availability for /// an entity, which is part of the encoding of the 'availability' @@ -98,6 +99,15 @@ struct PropertyData { } // namespace detail +/// Wraps an identifier and optional source location for the identifier. +struct IdentifierLoc { + SourceLocation Loc; + IdentifierInfo *Ident; + + static IdentifierLoc *create(ASTContext &Ctx, SourceLocation Loc, + IdentifierInfo *Ident); +}; + /// A union of the various pointer types that can be passed to an /// ParsedAttr as an argument. 
using ArgsUnion = llvm::PointerUnion; diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 1f23b754a69cb..fe37fd7701ce3 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -143,7 +143,7 @@ enum class LangAS : unsigned int; class LocalInstantiationScope; class LookupResult; class MangleNumberingContext; -typedef ArrayRef ModuleIdPath; +typedef ArrayRef> ModuleIdPath; class ModuleLoader; class MultiLevelTemplateArgumentList; struct NormalizedConstraint; diff --git a/clang/include/clang/Sema/SemaCodeCompletion.h b/clang/include/clang/Sema/SemaCodeCompletion.h index 3029e56e5cfe2..72159de3a6e72 100644 --- a/clang/include/clang/Sema/SemaCodeCompletion.h +++ b/clang/include/clang/Sema/SemaCodeCompletion.h @@ -193,7 +193,8 @@ class SemaCodeCompletion : public SemaBase { void CodeCompleteObjCForCollection(Scope *S, DeclGroupPtrTy IterationVar); void CodeCompleteObjCSelector(Scope *S, ArrayRef SelIdents); - void CodeCompleteObjCProtocolReferences(ArrayRef Protocols); + void + CodeCompleteObjCProtocolReferences(ArrayRef Protocols); void CodeCompleteObjCProtocolDecl(Scope *S); void CodeCompleteObjCInterfaceDecl(Scope *S); void CodeCompleteObjCClassForwardDecl(Scope *S); diff --git a/clang/include/clang/Sema/SemaObjC.h b/clang/include/clang/Sema/SemaObjC.h index 4cda41a82b61f..791a7f45b832f 100644 --- a/clang/include/clang/Sema/SemaObjC.h +++ b/clang/include/clang/Sema/SemaObjC.h @@ -307,11 +307,11 @@ class SemaObjC : public SemaBase { DeclGroupPtrTy ActOnForwardProtocolDeclaration(SourceLocation AtProtoclLoc, - ArrayRef IdentList, + ArrayRef IdentList, const ParsedAttributesView &attrList); void FindProtocolDeclaration(bool WarnOnDeclarations, bool ForObjCContainer, - ArrayRef ProtocolId, + ArrayRef ProtocolId, SmallVectorImpl &Protocols); void DiagnoseTypeArgsAndProtocols(IdentifierInfo *ProtocolId, diff --git a/clang/include/clang/Sema/SemaOpenACC.h b/clang/include/clang/Sema/SemaOpenACC.h index 
8d31d46444c7e..4c3a13a3b044f 100644 --- a/clang/include/clang/Sema/SemaOpenACC.h +++ b/clang/include/clang/Sema/SemaOpenACC.h @@ -212,7 +212,7 @@ class SemaOpenACC : public SemaBase { } LoopWithoutSeqInfo; // Redeclaration of the version in OpenACCClause.h. - using DeviceTypeArgument = IdentifierLoc; + using DeviceTypeArgument = std::pair; /// A type to represent all the data for an OpenACC Clause that has been /// parsed, but not yet created/semantically analyzed. This is effectively a diff --git a/clang/lib/AST/OpenACCClause.cpp b/clang/lib/AST/OpenACCClause.cpp index 2820d7b288658..d7cbb51335359 100644 --- a/clang/lib/AST/OpenACCClause.cpp +++ b/clang/lib/AST/OpenACCClause.cpp @@ -891,10 +891,10 @@ void OpenACCClausePrinter::VisitDeviceTypeClause( OS << "("; llvm::interleaveComma(C.getArchitectures(), OS, [&](const DeviceTypeArgument &Arch) { - if (Arch.getIdentifierInfo() == nullptr) + if (Arch.first == nullptr) OS << "*"; else - OS << Arch.getIdentifierInfo()->getName(); + OS << Arch.first->getName(); }); OS << ")"; } diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp index 1bd94a3ac6431..c8b459ee78e6b 100644 --- a/clang/lib/AST/TextNodeDumper.cpp +++ b/clang/lib/AST/TextNodeDumper.cpp @@ -500,10 +500,10 @@ void TextNodeDumper::Visit(const OpenACCClause *C) { llvm::interleaveComma( cast(C)->getArchitectures(), OS, [&](const DeviceTypeArgument &Arch) { - if (Arch.getIdentifierInfo() == nullptr) + if (Arch.first == nullptr) OS << "*"; else - OS << Arch.getIdentifierInfo()->getName(); + OS << Arch.first->getName(); }); OS << ")"; break; diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp index 93e4e31c2891d..243e0a3c15b05 100644 --- a/clang/lib/Frontend/CompilerInstance.cpp +++ b/clang/lib/Frontend/CompilerInstance.cpp @@ -35,7 +35,6 @@ #include "clang/Lex/Preprocessor.h" #include "clang/Lex/PreprocessorOptions.h" #include "clang/Sema/CodeCompleteConsumer.h" -#include 
"clang/Sema/ParsedAttr.h" #include "clang/Sema/Sema.h" #include "clang/Serialization/ASTReader.h" #include "clang/Serialization/GlobalModuleIndex.h" @@ -2010,8 +2009,8 @@ CompilerInstance::loadModule(SourceLocation ImportLoc, Module::NameVisibilityKind Visibility, bool IsInclusionDirective) { // Determine what file we're searching from. - StringRef ModuleName = Path[0].getIdentifierInfo()->getName(); - SourceLocation ModuleNameLoc = Path[0].getLoc(); + StringRef ModuleName = Path[0].first->getName(); + SourceLocation ModuleNameLoc = Path[0].second; // If we've already handled this import, just return the cached result. // This one-element cache is important to eliminate redundant diagnostics @@ -2027,7 +2026,7 @@ CompilerInstance::loadModule(SourceLocation ImportLoc, // If we don't already have information on this module, load the module now. Module *Module = nullptr; ModuleMap &MM = getPreprocessor().getHeaderSearchInfo().getModuleMap(); - if (auto MaybeModule = MM.getCachedModuleLoad(*Path[0].getIdentifierInfo())) { + if (auto MaybeModule = MM.getCachedModuleLoad(*Path[0].first)) { // Use the cached result, which may be nullptr. Module = *MaybeModule; // Config macros are already checked before building a module, but they need @@ -2047,7 +2046,7 @@ CompilerInstance::loadModule(SourceLocation ImportLoc, // * `Preprocessor::HandleHeaderIncludeOrImport` will never call this // function as the `#include` or `#import` is textual. - MM.cacheModuleLoad(*Path[0].getIdentifierInfo(), Module); + MM.cacheModuleLoad(*Path[0].first, Module); } else { ModuleLoadResult Result = findOrCompileModuleAndReadAST( ModuleName, ImportLoc, ModuleNameLoc, IsInclusionDirective); @@ -2056,7 +2055,7 @@ CompilerInstance::loadModule(SourceLocation ImportLoc, if (!Result) DisableGeneratingGlobalModuleIndex = true; Module = Result; - MM.cacheModuleLoad(*Path[0].getIdentifierInfo(), Module); + MM.cacheModuleLoad(*Path[0].first, Module); } // If we never found the module, fail. 
Otherwise, verify the module and link @@ -2068,7 +2067,7 @@ CompilerInstance::loadModule(SourceLocation ImportLoc, // a submodule. bool MapPrivateSubModToTopLevel = false; for (unsigned I = 1, N = Path.size(); I != N; ++I) { - StringRef Name = Path[I].getIdentifierInfo()->getName(); + StringRef Name = Path[I].first->getName(); clang::Module *Sub = Module->findSubmodule(Name); // If the user is requesting Foo.Private and it doesn't exist, try to @@ -2079,10 +2078,10 @@ CompilerInstance::loadModule(SourceLocation ImportLoc, SmallString<128> PrivateModule(Module->Name); PrivateModule.append("_Private"); - SmallVector PrivPath; + SmallVector, 2> PrivPath; auto &II = PP->getIdentifierTable().get( PrivateModule, PP->getIdentifierInfo(Module->Name)->getTokenID()); - PrivPath.emplace_back(Path[0].getLoc(), &II); + PrivPath.push_back(std::make_pair(&II, Path[0].second)); std::string FileName; // If there is a modulemap module or prebuilt module, load it. @@ -2096,12 +2095,11 @@ CompilerInstance::loadModule(SourceLocation ImportLoc, PP->markClangModuleAsAffecting(Module); if (!getDiagnostics().isIgnored( diag::warn_no_priv_submodule_use_toplevel, ImportLoc)) { - getDiagnostics().Report(Path[I].getLoc(), + getDiagnostics().Report(Path[I].second, diag::warn_no_priv_submodule_use_toplevel) - << Path[I].getIdentifierInfo() << Module->getFullModuleName() - << PrivateModule - << SourceRange(Path[0].getLoc(), Path[I].getLoc()) - << FixItHint::CreateReplacement(SourceRange(Path[0].getLoc()), + << Path[I].first << Module->getFullModuleName() << PrivateModule + << SourceRange(Path[0].second, Path[I].second) + << FixItHint::CreateReplacement(SourceRange(Path[0].second), PrivateModule); getDiagnostics().Report(Sub->DefinitionLoc, diag::note_private_top_level_defined); @@ -2130,11 +2128,10 @@ CompilerInstance::loadModule(SourceLocation ImportLoc, // If there was a clear winner, user it. 
if (Best.size() == 1) { - getDiagnostics().Report(Path[I].getLoc(), - diag::err_no_submodule_suggest) - << Path[I].getIdentifierInfo() << Module->getFullModuleName() - << Best[0] << SourceRange(Path[0].getLoc(), Path[I - 1].getLoc()) - << FixItHint::CreateReplacement(SourceRange(Path[I].getLoc()), + getDiagnostics().Report(Path[I].second, diag::err_no_submodule_suggest) + << Path[I].first << Module->getFullModuleName() << Best[0] + << SourceRange(Path[0].second, Path[I - 1].second) + << FixItHint::CreateReplacement(SourceRange(Path[I].second), Best[0]); Sub = Module->findSubmodule(Best[0]); @@ -2144,9 +2141,9 @@ CompilerInstance::loadModule(SourceLocation ImportLoc, if (!Sub) { // No submodule by this name. Complain, and don't look for further // submodules. - getDiagnostics().Report(Path[I].getLoc(), diag::err_no_submodule) - << Path[I].getIdentifierInfo() << Module->getFullModuleName() - << SourceRange(Path[0].getLoc(), Path[I - 1].getLoc()); + getDiagnostics().Report(Path[I].second, diag::err_no_submodule) + << Path[I].first << Module->getFullModuleName() + << SourceRange(Path[0].second, Path[I - 1].second); break; } @@ -2164,8 +2161,8 @@ CompilerInstance::loadModule(SourceLocation ImportLoc, // FIXME: Should we detect this at module load time? It seems fairly // expensive (and rare). 
getDiagnostics().Report(ImportLoc, diag::warn_missing_submodule) - << Module->getFullModuleName() - << SourceRange(Path.front().getLoc(), Path.back().getLoc()); + << Module->getFullModuleName() + << SourceRange(Path.front().second, Path.back().second); return ModuleLoadResult(Module, ModuleLoadResult::MissingExpected); } @@ -2174,7 +2171,7 @@ CompilerInstance::loadModule(SourceLocation ImportLoc, if (Preprocessor::checkModuleIsAvailable(getLangOpts(), getTarget(), *Module, getDiagnostics())) { getDiagnostics().Report(ImportLoc, diag::note_module_import_here) - << SourceRange(Path.front().getLoc(), Path.back().getLoc()); + << SourceRange(Path.front().second, Path.back().second); LastModuleImportLoc = ImportLoc; LastModuleImportResult = ModuleLoadResult(); return ModuleLoadResult(); @@ -2299,9 +2296,9 @@ GlobalModuleIndex *CompilerInstance::loadGlobalModuleIndex( Module *TheModule = I->second; OptionalFileEntryRef Entry = TheModule->getASTFile(); if (!Entry) { - SmallVector Path; - Path.emplace_back(TriggerLoc, - getPreprocessor().getIdentifierInfo(TheModule->Name)); + SmallVector, 2> Path; + Path.push_back(std::make_pair( + getPreprocessor().getIdentifierInfo(TheModule->Name), TriggerLoc)); std::reverse(Path.begin(), Path.end()); // Load a module as hidden. This also adds it to the global index. 
loadModule(TheModule->DefinitionLoc, Path, Module::Hidden, false); diff --git a/clang/lib/Frontend/FrontendActions.cpp b/clang/lib/Frontend/FrontendActions.cpp index e6c7b9f32c29b..c5aeb92c7af73 100644 --- a/clang/lib/Frontend/FrontendActions.cpp +++ b/clang/lib/Frontend/FrontendActions.cpp @@ -1216,9 +1216,9 @@ void GetDependenciesByModuleNameAction::ExecuteAction() { SourceManager &SM = PP.getSourceManager(); FileID MainFileID = SM.getMainFileID(); SourceLocation FileStart = SM.getLocForStartOfFile(MainFileID); - SmallVector Path; + SmallVector, 2> Path; IdentifierInfo *ModuleID = PP.getIdentifierInfo(ModuleName); - Path.emplace_back(FileStart, ModuleID); + Path.push_back(std::make_pair(ModuleID, FileStart)); auto ModResult = CI.loadModule(FileStart, Path, Module::Hidden, false); PPCallbacks *CB = PP.getPPCallbacks(); CB->moduleImport(SourceLocation(), Path, ModResult); diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp index 21ec83b437ef4..8411526019f3e 100644 --- a/clang/lib/Lex/PPDirectives.cpp +++ b/clang/lib/Lex/PPDirectives.cpp @@ -1916,15 +1916,15 @@ void Preprocessor::EnterAnnotationToken(SourceRange Range, /// Produce a diagnostic informing the user that a #include or similar /// was implicitly treated as a module import. -static void diagnoseAutoModuleImport(Preprocessor &PP, SourceLocation HashLoc, - Token &IncludeTok, - ArrayRef Path, - SourceLocation PathEnd) { +static void diagnoseAutoModuleImport( + Preprocessor &PP, SourceLocation HashLoc, Token &IncludeTok, + ArrayRef> Path, + SourceLocation PathEnd) { SmallString<128> PathString; for (size_t I = 0, N = Path.size(); I != N; ++I) { if (I) PathString += '.'; - PathString += Path[I].getIdentifierInfo()->getName(); + PathString += Path[I].first->getName(); } int IncludeKind = 0; @@ -2273,12 +2273,12 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport( SourceLocation StartLoc = IsImportDecl ? 
IncludeTok.getLocation() : HashLoc; // Complain about attempts to #include files in an audit pragma. - if (PragmaARCCFCodeAuditedInfo.getLoc().isValid()) { + if (PragmaARCCFCodeAuditedInfo.second.isValid()) { Diag(StartLoc, diag::err_pp_include_in_arc_cf_code_audited) << IsImportDecl; - Diag(PragmaARCCFCodeAuditedInfo.getLoc(), diag::note_pragma_entered_here); + Diag(PragmaARCCFCodeAuditedInfo.second, diag::note_pragma_entered_here); // Immediately leave the pragma. - PragmaARCCFCodeAuditedInfo = IdentifierLoc(); + PragmaARCCFCodeAuditedInfo = {nullptr, SourceLocation()}; } // Complain about attempts to #include files in an assume-nonnull pragma. @@ -2403,10 +2403,10 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport( // Compute the module access path corresponding to this module. // FIXME: Should we have a second loadModule() overload to avoid this // extra lookup step? - SmallVector Path; + SmallVector, 2> Path; for (Module *Mod = ModuleToImport; Mod; Mod = Mod->Parent) - Path.emplace_back(FilenameTok.getLocation(), - getIdentifierInfo(Mod->Name)); + Path.push_back(std::make_pair(getIdentifierInfo(Mod->Name), + FilenameTok.getLocation())); std::reverse(Path.begin(), Path.end()); // Warn that we're replacing the include/import with a module import. diff --git a/clang/lib/Lex/PPLexerChange.cpp b/clang/lib/Lex/PPLexerChange.cpp index db6069e31fa46..a373a52506a24 100644 --- a/clang/lib/Lex/PPLexerChange.cpp +++ b/clang/lib/Lex/PPLexerChange.cpp @@ -409,13 +409,13 @@ bool Preprocessor::HandleEndOfFile(Token &Result, bool isEndOfMacro) { // Complain about reaching a true EOF within arc_cf_code_audited. // We don't want to complain about reaching the end of a macro // instantiation or a _Pragma. 
- if (PragmaARCCFCodeAuditedInfo.getLoc().isValid() && !isEndOfMacro && + if (PragmaARCCFCodeAuditedInfo.second.isValid() && !isEndOfMacro && !(CurLexer && CurLexer->Is_PragmaLexer)) { - Diag(PragmaARCCFCodeAuditedInfo.getLoc(), + Diag(PragmaARCCFCodeAuditedInfo.second, diag::err_pp_eof_in_arc_cf_code_audited); // Recover by leaving immediately. - PragmaARCCFCodeAuditedInfo = IdentifierLoc(); + PragmaARCCFCodeAuditedInfo = {nullptr, SourceLocation()}; } // Complain about reaching a true EOF within assume_nonnull. diff --git a/clang/lib/Lex/Pragma.cpp b/clang/lib/Lex/Pragma.cpp index 5b6a29bdad910..91c1619e35623 100644 --- a/clang/lib/Lex/Pragma.cpp +++ b/clang/lib/Lex/Pragma.cpp @@ -763,19 +763,20 @@ void Preprocessor::HandlePragmaIncludeAlias(Token &Tok) { // Lex a component of a module name: either an identifier or a string literal; // for components that can be expressed both ways, the two forms are equivalent. -static bool LexModuleNameComponent(Preprocessor &PP, Token &Tok, - IdentifierLoc &ModuleNameComponent, - bool First) { +static bool LexModuleNameComponent( + Preprocessor &PP, Token &Tok, + std::pair &ModuleNameComponent, + bool First) { PP.LexUnexpandedToken(Tok); if (Tok.is(tok::string_literal) && !Tok.hasUDSuffix()) { StringLiteralParser Literal(Tok, PP); if (Literal.hadError) return true; - ModuleNameComponent = IdentifierLoc( - Tok.getLocation(), PP.getIdentifierInfo(Literal.GetString())); + ModuleNameComponent = std::make_pair( + PP.getIdentifierInfo(Literal.GetString()), Tok.getLocation()); } else if (!Tok.isAnnotation() && Tok.getIdentifierInfo()) { ModuleNameComponent = - IdentifierLoc(Tok.getLocation(), Tok.getIdentifierInfo()); + std::make_pair(Tok.getIdentifierInfo(), Tok.getLocation()); } else { PP.Diag(Tok.getLocation(), diag::err_pp_expected_module_name) << First; return true; @@ -783,10 +784,12 @@ static bool LexModuleNameComponent(Preprocessor &PP, Token &Tok, return false; } -static bool LexModuleName(Preprocessor &PP, Token &Tok, - 
llvm::SmallVectorImpl &ModuleName) { +static bool LexModuleName( + Preprocessor &PP, Token &Tok, + llvm::SmallVectorImpl> + &ModuleName) { while (true) { - IdentifierLoc NameComponent; + std::pair NameComponent; if (LexModuleNameComponent(PP, Tok, NameComponent, ModuleName.empty())) return true; ModuleName.push_back(NameComponent); @@ -800,10 +803,10 @@ static bool LexModuleName(Preprocessor &PP, Token &Tok, void Preprocessor::HandlePragmaModuleBuild(Token &Tok) { SourceLocation Loc = Tok.getLocation(); - IdentifierLoc ModuleNameLoc; + std::pair ModuleNameLoc; if (LexModuleNameComponent(*this, Tok, ModuleNameLoc, true)) return; - IdentifierInfo *ModuleName = ModuleNameLoc.getIdentifierInfo(); + IdentifierInfo *ModuleName = ModuleNameLoc.first; LexUnexpandedToken(Tok); if (Tok.isNot(tok::eod)) { @@ -1106,17 +1109,17 @@ struct PragmaDebugHandler : public PragmaHandler { PP.Diag(MacroName, diag::warn_pragma_debug_missing_argument) << II->getName(); } else if (II->isStr("module_map")) { - llvm::SmallVector ModuleName; + llvm::SmallVector, 8> + ModuleName; if (LexModuleName(PP, Tok, ModuleName)) return; ModuleMap &MM = PP.getHeaderSearchInfo().getModuleMap(); Module *M = nullptr; for (auto IIAndLoc : ModuleName) { - M = MM.lookupModuleQualified(IIAndLoc.getIdentifierInfo()->getName(), - M); + M = MM.lookupModuleQualified(IIAndLoc.first->getName(), M); if (!M) { - PP.Diag(IIAndLoc.getLoc(), diag::warn_pragma_debug_unknown_module) - << IIAndLoc.getIdentifierInfo()->getName(); + PP.Diag(IIAndLoc.second, diag::warn_pragma_debug_unknown_module) + << IIAndLoc.first->getName(); return; } } @@ -1704,7 +1707,8 @@ struct PragmaModuleImportHandler : public PragmaHandler { SourceLocation ImportLoc = Tok.getLocation(); // Read the module name. 
- llvm::SmallVector ModuleName; + llvm::SmallVector, 8> + ModuleName; if (LexModuleName(PP, Tok, ModuleName)) return; @@ -1719,7 +1723,7 @@ struct PragmaModuleImportHandler : public PragmaHandler { return; PP.makeModuleVisible(Imported, ImportLoc); - PP.EnterAnnotationToken(SourceRange(ImportLoc, ModuleName.back().getLoc()), + PP.EnterAnnotationToken(SourceRange(ImportLoc, ModuleName.back().second), tok::annot_module_include, Imported); if (auto *CB = PP.getPPCallbacks()) CB->moduleImport(ImportLoc, ModuleName, Imported); @@ -1740,7 +1744,8 @@ struct PragmaModuleBeginHandler : public PragmaHandler { SourceLocation BeginLoc = Tok.getLocation(); // Read the module name. - llvm::SmallVector ModuleName; + llvm::SmallVector, 8> + ModuleName; if (LexModuleName(PP, Tok, ModuleName)) return; @@ -1749,11 +1754,10 @@ struct PragmaModuleBeginHandler : public PragmaHandler { // We can only enter submodules of the current module. StringRef Current = PP.getLangOpts().CurrentModule; - if (ModuleName.front().getIdentifierInfo()->getName() != Current) { - PP.Diag(ModuleName.front().getLoc(), - diag::err_pp_module_begin_wrong_module) - << ModuleName.front().getIdentifierInfo() << (ModuleName.size() > 1) - << Current.empty() << Current; + if (ModuleName.front().first->getName() != Current) { + PP.Diag(ModuleName.front().second, diag::err_pp_module_begin_wrong_module) + << ModuleName.front().first << (ModuleName.size() > 1) + << Current.empty() << Current; return; } @@ -1761,19 +1765,17 @@ struct PragmaModuleBeginHandler : public PragmaHandler { // be loaded or implicitly loadable. 
auto &HSI = PP.getHeaderSearchInfo(); auto &MM = HSI.getModuleMap(); - Module *M = HSI.lookupModule(Current, ModuleName.front().getLoc()); + Module *M = HSI.lookupModule(Current, ModuleName.front().second); if (!M) { - PP.Diag(ModuleName.front().getLoc(), - diag::err_pp_module_begin_no_module_map) - << Current; + PP.Diag(ModuleName.front().second, + diag::err_pp_module_begin_no_module_map) << Current; return; } for (unsigned I = 1; I != ModuleName.size(); ++I) { - auto *NewM = MM.findOrInferSubmodule( - M, ModuleName[I].getIdentifierInfo()->getName()); + auto *NewM = MM.findOrInferSubmodule(M, ModuleName[I].first->getName()); if (!NewM) { - PP.Diag(ModuleName[I].getLoc(), diag::err_pp_module_begin_no_submodule) - << M->getFullModuleName() << ModuleName[I].getIdentifierInfo(); + PP.Diag(ModuleName[I].second, diag::err_pp_module_begin_no_submodule) + << M->getFullModuleName() << ModuleName[I].first; return; } M = NewM; @@ -1789,7 +1791,7 @@ struct PragmaModuleBeginHandler : public PragmaHandler { // Enter the scope of the submodule. PP.EnterSubmodule(M, BeginLoc, /*ForPragma*/true); - PP.EnterAnnotationToken(SourceRange(BeginLoc, ModuleName.back().getLoc()), + PP.EnterAnnotationToken(SourceRange(BeginLoc, ModuleName.back().second), tok::annot_module_begin, M); } }; @@ -1833,7 +1835,8 @@ struct PragmaModuleLoadHandler : public PragmaHandler { SourceLocation Loc = Tok.getLocation(); // Read the module name. - llvm::SmallVector ModuleName; + llvm::SmallVector, 8> + ModuleName; if (LexModuleName(PP, Tok, ModuleName)) return; @@ -1898,7 +1901,7 @@ struct PragmaARCCFCodeAuditedHandler : public PragmaHandler { PP.Diag(Tok, diag::ext_pp_extra_tokens_at_eol) << "pragma"; // The start location of the active audit. - SourceLocation BeginLoc = PP.getPragmaARCCFCodeAuditedInfo().getLoc(); + SourceLocation BeginLoc = PP.getPragmaARCCFCodeAuditedInfo().second; // The start location we want after processing this. 
SourceLocation NewLoc; diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp index 4c050bf1f5bb2..c25a3efd899e0 100644 --- a/clang/lib/Lex/Preprocessor.cpp +++ b/clang/lib/Lex/Preprocessor.cpp @@ -1159,8 +1159,8 @@ bool Preprocessor::LexAfterModuleImport(Token &Result) { if (Result.is(tok::colon) && ModuleDeclState.isNamedModule()) { std::string Name = ModuleDeclState.getPrimaryName().str(); Name += ":"; - NamedModuleImportPath.emplace_back(Result.getLocation(), - getIdentifierInfo(Name)); + NamedModuleImportPath.push_back( + {getIdentifierInfo(Name), Result.getLocation()}); CurLexerCallback = CLK_LexAfterModuleImport; return true; } @@ -1258,8 +1258,8 @@ bool Preprocessor::LexAfterModuleImport(Token &Result) { if (ModuleImportExpectsIdentifier && Result.getKind() == tok::identifier) { // We expected to see an identifier here, and we did; continue handling // identifiers. - NamedModuleImportPath.emplace_back(Result.getLocation(), - Result.getIdentifierInfo()); + NamedModuleImportPath.push_back( + std::make_pair(Result.getIdentifierInfo(), Result.getLocation())); ModuleImportExpectsIdentifier = false; CurLexerCallback = CLK_LexAfterModuleImport; return true; @@ -1302,12 +1302,12 @@ bool Preprocessor::LexAfterModuleImport(Token &Result) { // If the FlatModuleName ends with colon, it implies it is a partition. 
if (!FlatModuleName.empty() && FlatModuleName.back() != ':') FlatModuleName += "."; - FlatModuleName += Piece.getIdentifierInfo()->getName(); + FlatModuleName += Piece.first->getName(); } - SourceLocation FirstPathLoc = NamedModuleImportPath[0].getLoc(); + SourceLocation FirstPathLoc = NamedModuleImportPath[0].second; NamedModuleImportPath.clear(); - NamedModuleImportPath.emplace_back(FirstPathLoc, - getIdentifierInfo(FlatModuleName)); + NamedModuleImportPath.push_back( + std::make_pair(getIdentifierInfo(FlatModuleName), FirstPathLoc)); } Module *Imported = nullptr; diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index 8444ff3332e08..8fa74ecff19aa 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -432,8 +432,9 @@ static bool attributeParsedArgsUnevaluated(const IdentifierInfo &II, IdentifierLoc *Parser::ParseIdentifierLoc() { assert(Tok.is(tok::identifier) && "expected an identifier"); - IdentifierLoc *IL = new (Actions.Context) - IdentifierLoc(Tok.getLocation(), Tok.getIdentifierInfo()); + IdentifierLoc *IL = IdentifierLoc::create(Actions.Context, + Tok.getLocation(), + Tok.getIdentifierInfo()); ConsumeToken(); return IL; } @@ -1352,21 +1353,20 @@ void Parser::ParseAvailabilityAttribute( return; } IdentifierLoc *Platform = ParseIdentifierLoc(); - if (const IdentifierInfo *const Ident = Platform->getIdentifierInfo()) { + if (const IdentifierInfo *const Ident = Platform->Ident) { // Disallow xrOS for availability attributes. if (Ident->getName().contains("xrOS") || Ident->getName().contains("xros")) - Diag(Platform->getLoc(), diag::warn_availability_unknown_platform) - << Ident; + Diag(Platform->Loc, diag::warn_availability_unknown_platform) << Ident; // Canonicalize platform name from "macosx" to "macos". 
else if (Ident->getName() == "macosx") - Platform->setIdentifierInfo(PP.getIdentifierInfo("macos")); + Platform->Ident = PP.getIdentifierInfo("macos"); // Canonicalize platform name from "macosx_app_extension" to // "macos_app_extension". else if (Ident->getName() == "macosx_app_extension") - Platform->setIdentifierInfo(PP.getIdentifierInfo("macos_app_extension")); + Platform->Ident = PP.getIdentifierInfo("macos_app_extension"); else - Platform->setIdentifierInfo(PP.getIdentifierInfo( - AvailabilityAttr::canonicalizePlatformName(Ident->getName()))); + Platform->Ident = PP.getIdentifierInfo( + AvailabilityAttr::canonicalizePlatformName(Ident->getName())); } // Parse the ',' following the platform name. @@ -1418,8 +1418,8 @@ void Parser::ParseAvailabilityAttribute( continue; } - if (Keyword == Ident_deprecated && Platform->getIdentifierInfo() && - Platform->getIdentifierInfo()->isStr("swift")) { + if (Keyword == Ident_deprecated && Platform->Ident && + Platform->Ident->isStr("swift")) { // For swift, we deprecate for all versions. 
if (Changes[Deprecated].KeywordLoc.isValid()) { Diag(KeywordLoc, diag::err_availability_redundant) @@ -1436,7 +1436,7 @@ void Parser::ParseAvailabilityAttribute( if (Keyword == Ident_environment) { if (EnvironmentLoc != nullptr) { Diag(KeywordLoc, diag::err_availability_redundant) - << Keyword << SourceRange(EnvironmentLoc->getLoc()); + << Keyword << SourceRange(EnvironmentLoc->Loc); } } @@ -1792,8 +1792,8 @@ void Parser::ParseSwiftNewTypeAttribute( return; } - auto *SwiftType = new (Actions.Context) - IdentifierLoc(Tok.getLocation(), Tok.getIdentifierInfo()); + auto *SwiftType = IdentifierLoc::create(Actions.Context, Tok.getLocation(), + Tok.getIdentifierInfo()); ConsumeToken(); // Closing ')' diff --git a/clang/lib/Parse/ParseExpr.cpp b/clang/lib/Parse/ParseExpr.cpp index 1416d52157dca..0a22f7372a9f9 100644 --- a/clang/lib/Parse/ParseExpr.cpp +++ b/clang/lib/Parse/ParseExpr.cpp @@ -4006,20 +4006,19 @@ std::optional Parser::ParseAvailabilitySpec() { if (Version.empty()) return std::nullopt; - StringRef GivenPlatform = - PlatformIdentifier->getIdentifierInfo()->getName(); + StringRef GivenPlatform = PlatformIdentifier->Ident->getName(); StringRef Platform = AvailabilityAttr::canonicalizePlatformName(GivenPlatform); if (AvailabilityAttr::getPrettyPlatformName(Platform).empty() || (GivenPlatform.contains("xros") || GivenPlatform.contains("xrOS"))) { - Diag(PlatformIdentifier->getLoc(), + Diag(PlatformIdentifier->Loc, diag::err_avail_query_unrecognized_platform_name) << GivenPlatform; return std::nullopt; } - return AvailabilitySpec(Version, Platform, PlatformIdentifier->getLoc(), + return AvailabilitySpec(Version, Platform, PlatformIdentifier->Loc, VersionRange.getEnd()); } } diff --git a/clang/lib/Parse/ParseHLSL.cpp b/clang/lib/Parse/ParseHLSL.cpp index 7ffacc4b79f79..f4c109f9a81a2 100644 --- a/clang/lib/Parse/ParseHLSL.cpp +++ b/clang/lib/Parse/ParseHLSL.cpp @@ -115,7 +115,7 @@ static void fixSeparateAttrArgAndNumber(StringRef ArgStr, SourceLocation ArgLoc, << 
FixedArg << FixItHint::CreateReplacement(SourceRange(ArgLoc, EndNumLoc), FixedArg); ArgsUnion &Slot = ArgExprs.back(); - Slot = new (Ctx) IdentifierLoc(ArgLoc, PP.getIdentifierInfo(FixedArg)); + Slot = IdentifierLoc::create(Ctx, ArgLoc, PP.getIdentifierInfo(FixedArg)); } void Parser::ParseHLSLAnnotations(ParsedAttributes &Attrs, diff --git a/clang/lib/Parse/ParseObjc.cpp b/clang/lib/Parse/ParseObjc.cpp index d872177b3d7aa..bcbf4dfbabafa 100644 --- a/clang/lib/Parse/ParseObjc.cpp +++ b/clang/lib/Parse/ParseObjc.cpp @@ -261,7 +261,7 @@ Decl *Parser::ParseObjCAtInterfaceDeclaration(SourceLocation AtLoc, // case, LAngleLoc will be valid and ProtocolIdents will capture the // protocol references (that have not yet been resolved). SourceLocation LAngleLoc, EndProtoLoc; - SmallVector ProtocolIdents; + SmallVector ProtocolIdents; ObjCTypeParamList *typeParameterList = nullptr; ObjCTypeParamListScope typeParamScope(Actions, getCurScope()); if (Tok.is(tok::less)) @@ -361,8 +361,8 @@ Decl *Parser::ParseObjCAtInterfaceDeclaration(SourceLocation AtLoc, if (!ProtocolIdents.empty()) { // We already parsed the protocols named when we thought we had a // type parameter list. Translate them into actual protocol references. - for (const auto &Loc : ProtocolIdents) { - protocolLocs.push_back(Loc.getLoc()); + for (const auto &pair : ProtocolIdents) { + protocolLocs.push_back(pair.second); } Actions.ObjC().FindProtocolDeclaration(/*WarnOnDeclarations=*/true, /*ForObjCContainer=*/true, @@ -459,8 +459,8 @@ static void addContextSensitiveTypeNullability(Parser &P, /// \param rAngleLoc The location of the ending '>'. 
ObjCTypeParamList *Parser::parseObjCTypeParamListOrProtocolRefs( ObjCTypeParamListScope &Scope, SourceLocation &lAngleLoc, - SmallVectorImpl &protocolIdents, SourceLocation &rAngleLoc, - bool mayBeProtocolList) { + SmallVectorImpl &protocolIdents, + SourceLocation &rAngleLoc, bool mayBeProtocolList) { assert(Tok.is(tok::less) && "Not at the beginning of a type parameter list"); // Within the type parameter list, don't treat '>' as an operator. @@ -474,8 +474,7 @@ ObjCTypeParamList *Parser::parseObjCTypeParamListOrProtocolRefs( for (const auto &pair : protocolIdents) { DeclResult typeParam = Actions.ObjC().actOnObjCTypeParam( getCurScope(), ObjCTypeParamVariance::Invariant, SourceLocation(), - index++, pair.getIdentifierInfo(), pair.getLoc(), SourceLocation(), - nullptr); + index++, pair.first, pair.second, SourceLocation(), nullptr); if (typeParam.isUsable()) typeParams.push_back(typeParam.get()); } @@ -547,7 +546,7 @@ ObjCTypeParamList *Parser::parseObjCTypeParamListOrProtocolRefs( } else if (mayBeProtocolList) { // If this could still be a protocol list, just capture the identifier. // We don't want to turn it into a parameter. - protocolIdents.emplace_back(paramLoc, paramName); + protocolIdents.push_back(std::make_pair(paramName, paramLoc)); continue; } @@ -607,7 +606,7 @@ ObjCTypeParamList *Parser::parseObjCTypeParamListOrProtocolRefs( /// Parse an objc-type-parameter-list. 
ObjCTypeParamList *Parser::parseObjCTypeParamList() { SourceLocation lAngleLoc; - SmallVector protocolIdents; + SmallVector protocolIdents; SourceLocation rAngleLoc; ObjCTypeParamListScope Scope(Actions, getCurScope()); @@ -1599,7 +1598,7 @@ ParseObjCProtocolReferences(SmallVectorImpl &Protocols, LAngleLoc = ConsumeToken(); // the "<" - SmallVector ProtocolIdents; + SmallVector ProtocolIdents; while (true) { if (Tok.is(tok::code_completion)) { @@ -1613,7 +1612,8 @@ ParseObjCProtocolReferences(SmallVectorImpl &Protocols, SkipUntil(tok::greater, StopAtSemi); return true; } - ProtocolIdents.emplace_back(Tok.getLocation(), Tok.getIdentifierInfo()); + ProtocolIdents.push_back(std::make_pair(Tok.getIdentifierInfo(), + Tok.getLocation())); ProtocolLocs.push_back(Tok.getLocation()); ConsumeToken(); @@ -1693,9 +1693,10 @@ void Parser::parseObjCTypeArgsOrProtocolQualifiers( if (Tok.is(tok::code_completion)) { // FIXME: Also include types here. - SmallVector identifierLocPairs; + SmallVector identifierLocPairs; for (unsigned i = 0, n = identifiers.size(); i != n; ++i) { - identifierLocPairs.emplace_back(identifierLocs[i], identifiers[i]); + identifierLocPairs.push_back(IdentifierLocPair(identifiers[i], + identifierLocs[i])); } QualType BaseT = Actions.GetTypeFromParser(baseType); @@ -2093,7 +2094,7 @@ Parser::ParseObjCAtProtocolDeclaration(SourceLocation AtLoc, SourceLocation nameLoc = ConsumeToken(); if (TryConsumeToken(tok::semi)) { // forward declaration of one protocol. - IdentifierLoc ProtoInfo(nameLoc, protocolName); + IdentifierLocPair ProtoInfo(protocolName, nameLoc); return Actions.ObjC().ActOnForwardProtocolDeclaration(AtLoc, ProtoInfo, attrs); } @@ -2101,8 +2102,8 @@ Parser::ParseObjCAtProtocolDeclaration(SourceLocation AtLoc, CheckNestedObjCContexts(AtLoc); if (Tok.is(tok::comma)) { // list of forward declarations. 
- SmallVector ProtocolRefs; - ProtocolRefs.emplace_back(nameLoc, protocolName); + SmallVector ProtocolRefs; + ProtocolRefs.push_back(std::make_pair(protocolName, nameLoc)); // Parse the list of forward declarations. while (true) { @@ -2111,7 +2112,8 @@ Parser::ParseObjCAtProtocolDeclaration(SourceLocation AtLoc, SkipUntil(tok::semi); return nullptr; } - ProtocolRefs.emplace_back(Tok.getLocation(), Tok.getIdentifierInfo()); + ProtocolRefs.push_back(IdentifierLocPair(Tok.getIdentifierInfo(), + Tok.getLocation())); ConsumeToken(); // the identifier if (Tok.isNot(tok::comma)) @@ -2194,7 +2196,7 @@ Parser::ParseObjCAtImplementationDeclaration(SourceLocation AtLoc, // permitted here. Parse and diagnose them. if (Tok.is(tok::less)) { SourceLocation lAngleLoc, rAngleLoc; - SmallVector protocolIdents; + SmallVector protocolIdents; SourceLocation diagLoc = Tok.getLocation(); ObjCTypeParamListScope typeParamScope(Actions, getCurScope()); if (parseObjCTypeParamListOrProtocolRefs(typeParamScope, lAngleLoc, diff --git a/clang/lib/Parse/ParseOpenACC.cpp b/clang/lib/Parse/ParseOpenACC.cpp index 337b3eca49764..64916995907c5 100644 --- a/clang/lib/Parse/ParseOpenACC.cpp +++ b/clang/lib/Parse/ParseOpenACC.cpp @@ -15,7 +15,6 @@ #include "clang/Basic/OpenACCKinds.h" #include "clang/Parse/Parser.h" #include "clang/Parse/RAIIObjectsForParser.h" -#include "clang/Sema/ParsedAttr.h" #include "clang/Sema/SemaOpenACC.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" @@ -815,7 +814,7 @@ bool Parser::ParseOpenACCIntExprList(OpenACCDirectiveKind DK, /// /// The device_type clause may be abbreviated to dtype. 
bool Parser::ParseOpenACCDeviceTypeList( - llvm::SmallVector &Archs) { + llvm::SmallVector> &Archs) { if (expectIdentifierOrKeyword(*this)) { SkipUntil(tok::r_paren, tok::annot_pragma_openacc_end, @@ -823,7 +822,7 @@ bool Parser::ParseOpenACCDeviceTypeList( return true; } IdentifierInfo *Ident = getCurToken().getIdentifierInfo(); - Archs.emplace_back(ConsumeToken(), Ident); + Archs.emplace_back(Ident, ConsumeToken()); while (!getCurToken().isOneOf(tok::r_paren, tok::annot_pragma_openacc_end)) { ExpectAndConsume(tok::comma); @@ -834,7 +833,7 @@ bool Parser::ParseOpenACCDeviceTypeList( return true; } Ident = getCurToken().getIdentifierInfo(); - Archs.emplace_back(ConsumeToken(), Ident); + Archs.emplace_back(Ident, ConsumeToken()); } return false; } @@ -1155,12 +1154,11 @@ Parser::OpenACCClauseParseResult Parser::ParseOpenACCClauseParams( } case OpenACCClauseKind::DType: case OpenACCClauseKind::DeviceType: { - llvm::SmallVector Archs; + llvm::SmallVector> Archs; if (getCurToken().is(tok::star)) { // FIXME: We want to mark that this is an 'everything else' type of // device_type in Sema. 
- ParsedClause.setDeviceTypeDetails( - {IdentifierLoc(ConsumeToken(), nullptr)}); + ParsedClause.setDeviceTypeDetails({{nullptr, ConsumeToken()}}); } else if (!ParseOpenACCDeviceTypeList(Archs)) { ParsedClause.setDeviceTypeDetails(std::move(Archs)); } else { diff --git a/clang/lib/Parse/ParsePragma.cpp b/clang/lib/Parse/ParsePragma.cpp index 17b2b30942582..21ebff1e50559 100644 --- a/clang/lib/Parse/ParsePragma.cpp +++ b/clang/lib/Parse/ParsePragma.cpp @@ -1419,16 +1419,16 @@ bool Parser::HandlePragmaLoopHint(LoopHint &Hint) { static_cast(Tok.getAnnotationValue()); IdentifierInfo *PragmaNameInfo = Info->PragmaName.getIdentifierInfo(); - Hint.PragmaNameLoc = new (Actions.Context) - IdentifierLoc(Info->PragmaName.getLocation(), PragmaNameInfo); + Hint.PragmaNameLoc = IdentifierLoc::create( + Actions.Context, Info->PragmaName.getLocation(), PragmaNameInfo); // It is possible that the loop hint has no option identifier, such as // #pragma unroll(4). IdentifierInfo *OptionInfo = Info->Option.is(tok::identifier) ? 
Info->Option.getIdentifierInfo() : nullptr; - Hint.OptionLoc = new (Actions.Context) - IdentifierLoc(Info->Option.getLocation(), OptionInfo); + Hint.OptionLoc = IdentifierLoc::create( + Actions.Context, Info->Option.getLocation(), OptionInfo); llvm::ArrayRef Toks = Info->Toks; @@ -1508,7 +1508,7 @@ bool Parser::HandlePragmaLoopHint(LoopHint &Hint) { if (Toks.size() > 2) Diag(Tok.getLocation(), diag::warn_pragma_extra_tokens_at_eol) << PragmaLoopHintString(Info->PragmaName, Info->Option); - Hint.StateLoc = new (Actions.Context) IdentifierLoc(StateLoc, StateInfo); + Hint.StateLoc = IdentifierLoc::create(Actions.Context, StateLoc, StateInfo); } else if (OptionInfo && OptionInfo->getName() == "vectorize_width") { PP.EnterTokenStream(Toks, /*DisableMacroExpansion=*/false, /*IsReinject=*/false); @@ -1529,7 +1529,8 @@ bool Parser::HandlePragmaLoopHint(LoopHint &Hint) { ConsumeAnyToken(); } - Hint.StateLoc = new (Actions.Context) IdentifierLoc(StateLoc, StateInfo); + Hint.StateLoc = + IdentifierLoc::create(Actions.Context, StateLoc, StateInfo); ConsumeToken(); // Consume the constant expression eof terminator. 
} else { @@ -1553,7 +1554,7 @@ bool Parser::HandlePragmaLoopHint(LoopHint &Hint) { Arg2Error = true; } else Hint.StateLoc = - new (Actions.Context) IdentifierLoc(StateLoc, StateInfo); + IdentifierLoc::create(Actions.Context, StateLoc, StateInfo); PP.Lex(Tok); // Identifier } diff --git a/clang/lib/Parse/ParseStmt.cpp b/clang/lib/Parse/ParseStmt.cpp index 4a82d57fe566b..e8ec140fbe3e5 100644 --- a/clang/lib/Parse/ParseStmt.cpp +++ b/clang/lib/Parse/ParseStmt.cpp @@ -2545,9 +2545,9 @@ StmtResult Parser::ParsePragmaLoopHint(StmtVector &Stmts, ArgsUnion ArgHints[] = {Hint.PragmaNameLoc, Hint.OptionLoc, Hint.StateLoc, ArgsUnion(Hint.ValueExpr)}; - TempAttrs.addNew(Hint.PragmaNameLoc->getIdentifierInfo(), Hint.Range, - /*scopeName=*/nullptr, Hint.PragmaNameLoc->getLoc(), - ArgHints, /*numArgs=*/4, ParsedAttr::Form::Pragma()); + TempAttrs.addNew(Hint.PragmaNameLoc->Ident, Hint.Range, nullptr, + Hint.PragmaNameLoc->Loc, ArgHints, 4, + ParsedAttr::Form::Pragma()); } // Get the next statement. diff --git a/clang/lib/Parse/Parser.cpp b/clang/lib/Parse/Parser.cpp index d528664bca352..f3191762b1244 100644 --- a/clang/lib/Parse/Parser.cpp +++ b/clang/lib/Parse/Parser.cpp @@ -2541,17 +2541,17 @@ Parser::ParseModuleDecl(Sema::ModuleImportState &ImportState) { return Actions.ActOnPrivateModuleFragmentDecl(ModuleLoc, PrivateLoc); } - SmallVector Path; + SmallVector, 2> Path; if (ParseModuleName(ModuleLoc, Path, /*IsImport*/ false)) return nullptr; // Parse the optional module-partition. - SmallVector Partition; + SmallVector, 2> Partition; if (Tok.is(tok::colon)) { SourceLocation ColonLoc = ConsumeToken(); if (!getLangOpts().CPlusPlusModules) Diag(ColonLoc, diag::err_unsupported_module_partition) - << SourceRange(ColonLoc, Partition.back().getLoc()); + << SourceRange(ColonLoc, Partition.back().second); // Recover by ignoring the partition name. 
else if (ParseModuleName(ModuleLoc, Partition, /*IsImport*/ false)) return nullptr; @@ -2600,7 +2600,7 @@ Decl *Parser::ParseModuleImport(SourceLocation AtLoc, SourceLocation ImportLoc = ConsumeToken(); // For C++20 modules, we can have "name" or ":Partition name" as valid input. - SmallVector Path; + SmallVector, 2> Path; bool IsPartition = false; Module *HeaderUnit = nullptr; if (Tok.is(tok::header_name)) { @@ -2616,7 +2616,7 @@ Decl *Parser::ParseModuleImport(SourceLocation AtLoc, SourceLocation ColonLoc = ConsumeToken(); if (!getLangOpts().CPlusPlusModules) Diag(ColonLoc, diag::err_unsupported_module_partition) - << SourceRange(ColonLoc, Path.back().getLoc()); + << SourceRange(ColonLoc, Path.back().second); // Recover by leaving partition empty. else if (ParseModuleName(ColonLoc, Path, /*IsImport*/ true)) return nullptr; @@ -2718,9 +2718,10 @@ Decl *Parser::ParseModuleImport(SourceLocation AtLoc, /// module-name-qualifier[opt] identifier /// module-name-qualifier: /// module-name-qualifier[opt] identifier '.' -bool Parser::ParseModuleName(SourceLocation UseLoc, - SmallVectorImpl &Path, - bool IsImport) { +bool Parser::ParseModuleName( + SourceLocation UseLoc, + SmallVectorImpl> &Path, + bool IsImport) { // Parse the module path. while (true) { if (!Tok.is(tok::identifier)) { @@ -2736,7 +2737,7 @@ bool Parser::ParseModuleName(SourceLocation UseLoc, } // Record this part of the module path. 
- Path.emplace_back(Tok.getLocation(), Tok.getIdentifierInfo()); + Path.push_back(std::make_pair(Tok.getIdentifierInfo(), Tok.getLocation())); ConsumeToken(); if (Tok.isNot(tok::period)) diff --git a/clang/lib/Sema/ParsedAttr.cpp b/clang/lib/Sema/ParsedAttr.cpp index c149cef478539..b19a02b8c1a09 100644 --- a/clang/lib/Sema/ParsedAttr.cpp +++ b/clang/lib/Sema/ParsedAttr.cpp @@ -23,6 +23,14 @@ using namespace clang; +IdentifierLoc *IdentifierLoc::create(ASTContext &Ctx, SourceLocation Loc, + IdentifierInfo *Ident) { + IdentifierLoc *Result = new (Ctx) IdentifierLoc; + Result->Loc = Loc; + Result->Ident = Ident; + return Result; +} + size_t ParsedAttr::allocated_size() const { if (IsAvailability) return AttributeFactory::AvailabilityAllocSize; else if (IsTypeTagForDatatype) diff --git a/clang/lib/Sema/SemaARM.cpp b/clang/lib/Sema/SemaARM.cpp index 5bcbe78e9d633..3f53fb200a93d 100644 --- a/clang/lib/Sema/SemaARM.cpp +++ b/clang/lib/Sema/SemaARM.cpp @@ -1178,7 +1178,7 @@ void SemaARM::handleBuiltinAliasAttr(Decl *D, const ParsedAttr &AL) { return; } - IdentifierInfo *Ident = AL.getArgAsIdent(0)->getIdentifierInfo(); + IdentifierInfo *Ident = AL.getArgAsIdent(0)->Ident; unsigned BuiltinID = Ident->getBuiltinID(); StringRef AliasName = cast(D)->getIdentifier()->getName(); diff --git a/clang/lib/Sema/SemaCodeComplete.cpp b/clang/lib/Sema/SemaCodeComplete.cpp index 1e4e6fdc78351..f6ec4cb0f069e 100644 --- a/clang/lib/Sema/SemaCodeComplete.cpp +++ b/clang/lib/Sema/SemaCodeComplete.cpp @@ -8718,7 +8718,7 @@ static void AddProtocolResults(DeclContext *Ctx, DeclContext *CurContext, } void SemaCodeCompletion::CodeCompleteObjCProtocolReferences( - ArrayRef Protocols) { + ArrayRef Protocols) { ResultBuilder Results(SemaRef, CodeCompleter->getAllocator(), CodeCompleter->getCodeCompletionTUInfo(), CodeCompletionContext::CCC_ObjCProtocolName); @@ -8729,9 +8729,9 @@ void SemaCodeCompletion::CodeCompleteObjCProtocolReferences( // Tell the result set to ignore all of the protocols we 
have // already seen. // FIXME: This doesn't work when caching code-completion results. - for (const IdentifierLoc &Pair : Protocols) - if (ObjCProtocolDecl *Protocol = SemaRef.ObjC().LookupProtocol( - Pair.getIdentifierInfo(), Pair.getLoc())) + for (const IdentifierLocPair &Pair : Protocols) + if (ObjCProtocolDecl *Protocol = + SemaRef.ObjC().LookupProtocol(Pair.first, Pair.second)) Results.Ignore(Protocol); // Add all protocols. diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 7dd20a8795fc9..bc891fb009410 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -135,13 +135,13 @@ bool Sema::checkStringLiteralArgumentAttr(const ParsedAttr &AL, unsigned ArgNum, // Look for identifiers. If we have one emit a hint to fix it to a literal. if (AL.isArgIdent(ArgNum)) { IdentifierLoc *Loc = AL.getArgAsIdent(ArgNum); - Diag(Loc->getLoc(), diag::err_attribute_argument_type) + Diag(Loc->Loc, diag::err_attribute_argument_type) << AL << AANT_ArgumentString - << FixItHint::CreateInsertion(Loc->getLoc(), "\"") - << FixItHint::CreateInsertion(getLocForEndOfToken(Loc->getLoc()), "\""); - Str = Loc->getIdentifierInfo()->getName(); + << FixItHint::CreateInsertion(Loc->Loc, "\"") + << FixItHint::CreateInsertion(getLocForEndOfToken(Loc->Loc), "\""); + Str = Loc->Ident->getName(); if (ArgLocation) - *ArgLocation = Loc->getLoc(); + *ArgLocation = Loc->Loc; return true; } @@ -768,7 +768,7 @@ static void handleDiagnoseAsBuiltinAttr(Sema &S, Decl *D, auto Union = AL.getArg(Index - 1); if (auto *E = dyn_cast(Union)) return E->getBeginLoc(); - return cast(Union)->getLoc(); + return cast(Union)->Loc; }(); S.Diag(Loc, diag::err_attribute_argument_n_type) << AL << Index << T; @@ -960,10 +960,10 @@ static void handleConsumableAttr(Sema &S, Decl *D, const ParsedAttr &AL) { if (AL.isArgIdent(0)) { IdentifierLoc *IL = AL.getArgAsIdent(0); - if (!ConsumableAttr::ConvertStrToConsumedState( - IL->getIdentifierInfo()->getName(), 
DefaultState)) { - S.Diag(IL->getLoc(), diag::warn_attribute_type_not_supported) - << AL << IL->getIdentifierInfo(); + if (!ConsumableAttr::ConvertStrToConsumedState(IL->Ident->getName(), + DefaultState)) { + S.Diag(IL->Loc, diag::warn_attribute_type_not_supported) << AL + << IL->Ident; return; } } else { @@ -1005,8 +1005,8 @@ static void handleCallableWhenAttr(Sema &S, Decl *D, const ParsedAttr &AL) { SourceLocation Loc; if (AL.isArgIdent(ArgIndex)) { IdentifierLoc *Ident = AL.getArgAsIdent(ArgIndex); - StateString = Ident->getIdentifierInfo()->getName(); - Loc = Ident->getLoc(); + StateString = Ident->Ident->getName(); + Loc = Ident->Loc; } else { if (!S.checkStringLiteralArgumentAttr(AL, ArgIndex, StateString, &Loc)) return; @@ -1030,11 +1030,11 @@ static void handleParamTypestateAttr(Sema &S, Decl *D, const ParsedAttr &AL) { if (AL.isArgIdent(0)) { IdentifierLoc *Ident = AL.getArgAsIdent(0); - StringRef StateString = Ident->getIdentifierInfo()->getName(); + StringRef StateString = Ident->Ident->getName(); if (!ParamTypestateAttr::ConvertStrToConsumedState(StateString, ParamState)) { - S.Diag(Ident->getLoc(), diag::warn_attribute_type_not_supported) + S.Diag(Ident->Loc, diag::warn_attribute_type_not_supported) << AL << StateString; return; } @@ -1064,10 +1064,10 @@ static void handleReturnTypestateAttr(Sema &S, Decl *D, const ParsedAttr &AL) { if (AL.isArgIdent(0)) { IdentifierLoc *IL = AL.getArgAsIdent(0); - if (!ReturnTypestateAttr::ConvertStrToConsumedState( - IL->getIdentifierInfo()->getName(), ReturnState)) { - S.Diag(IL->getLoc(), diag::warn_attribute_type_not_supported) - << AL << IL->getIdentifierInfo(); + if (!ReturnTypestateAttr::ConvertStrToConsumedState(IL->Ident->getName(), + ReturnState)) { + S.Diag(IL->Loc, diag::warn_attribute_type_not_supported) << AL + << IL->Ident; return; } } else { @@ -1111,10 +1111,10 @@ static void handleSetTypestateAttr(Sema &S, Decl *D, const ParsedAttr &AL) { SetTypestateAttr::ConsumedState NewState; if 
(AL.isArgIdent(0)) { IdentifierLoc *Ident = AL.getArgAsIdent(0); - StringRef Param = Ident->getIdentifierInfo()->getName(); + StringRef Param = Ident->Ident->getName(); if (!SetTypestateAttr::ConvertStrToConsumedState(Param, NewState)) { - S.Diag(Ident->getLoc(), diag::warn_attribute_type_not_supported) - << AL << Param; + S.Diag(Ident->Loc, diag::warn_attribute_type_not_supported) << AL + << Param; return; } } else { @@ -1133,10 +1133,10 @@ static void handleTestTypestateAttr(Sema &S, Decl *D, const ParsedAttr &AL) { TestTypestateAttr::ConsumedState TestState; if (AL.isArgIdent(0)) { IdentifierLoc *Ident = AL.getArgAsIdent(0); - StringRef Param = Ident->getIdentifierInfo()->getName(); + StringRef Param = Ident->Ident->getName(); if (!TestTypestateAttr::ConvertStrToConsumedState(Param, TestState)) { - S.Diag(Ident->getLoc(), diag::warn_attribute_type_not_supported) - << AL << Param; + S.Diag(Ident->Loc, diag::warn_attribute_type_not_supported) << AL + << Param; return; } } else { @@ -1497,7 +1497,7 @@ static void handleOwnershipAttr(Sema &S, Decl *D, const ParsedAttr &AL) { return; } - IdentifierInfo *Module = AL.getArgAsIdent(0)->getIdentifierInfo(); + IdentifierInfo *Module = AL.getArgAsIdent(0)->Ident; StringRef ModuleName = Module->getName(); if (normalizeName(ModuleName)) { @@ -1864,10 +1864,10 @@ static void handleCPUSpecificAttr(Sema &S, Decl *D, const ParsedAttr &AL) { } IdentifierLoc *CPUArg = AL.getArgAsIdent(ArgNo); - StringRef CPUName = CPUArg->getIdentifierInfo()->getName().trim(); + StringRef CPUName = CPUArg->Ident->getName().trim(); if (!S.Context.getTargetInfo().validateCPUSpecificCPUDispatch(CPUName)) { - S.Diag(CPUArg->getLoc(), diag::err_invalid_cpu_specific_dispatch_value) + S.Diag(CPUArg->Loc, diag::err_invalid_cpu_specific_dispatch_value) << CPUName << (AL.getKind() == ParsedAttr::AT_CPUDispatch); return; } @@ -1880,7 +1880,7 @@ static void handleCPUSpecificAttr(Sema &S, Decl *D, const ParsedAttr &AL) { S.Diag(AL.getLoc(), 
diag::warn_multiversion_duplicate_entries); return; } - CPUs.push_back(CPUArg->getIdentifierInfo()); + CPUs.push_back(CPUArg->Ident); } FD->setIsMultiVersion(true); @@ -2358,10 +2358,10 @@ static void handleAvailabilityAttr(Sema &S, Decl *D, const ParsedAttr &AL) { return; IdentifierLoc *Platform = AL.getArgAsIdent(0); - IdentifierInfo *II = Platform->getIdentifierInfo(); + IdentifierInfo *II = Platform->Ident; if (AvailabilityAttr::getPrettyPlatformName(II->getName()).empty()) - S.Diag(Platform->getLoc(), diag::warn_availability_unknown_platform) - << Platform->getIdentifierInfo(); + S.Diag(Platform->Loc, diag::warn_availability_unknown_platform) + << Platform->Ident; auto *ND = dyn_cast(D); if (!ND) // We warned about this already, so just return. @@ -2410,16 +2410,14 @@ static void handleAvailabilityAttr(Sema &S, Decl *D, const ParsedAttr &AL) { IdentifierInfo *IIEnvironment = nullptr; if (EnvironmentLoc) { if (S.getLangOpts().HLSL) { - IIEnvironment = EnvironmentLoc->getIdentifierInfo(); + IIEnvironment = EnvironmentLoc->Ident; if (AvailabilityAttr::getEnvironmentType( - EnvironmentLoc->getIdentifierInfo()->getName()) == + EnvironmentLoc->Ident->getName()) == llvm::Triple::EnvironmentType::UnknownEnvironment) - S.Diag(EnvironmentLoc->getLoc(), - diag::warn_availability_unknown_environment) - << EnvironmentLoc->getIdentifierInfo(); + S.Diag(EnvironmentLoc->Loc, diag::warn_availability_unknown_environment) + << EnvironmentLoc->Ident; } else { - S.Diag(EnvironmentLoc->getLoc(), - diag::err_availability_unexpected_parameter) + S.Diag(EnvironmentLoc->Loc, diag::err_availability_unexpected_parameter) << "environment" << /* C/C++ */ 1; } } @@ -3632,7 +3630,7 @@ static void handleEnumExtensibilityAttr(Sema &S, Decl *D, } EnumExtensibilityAttr::Kind ExtensibilityKind; - IdentifierInfo *II = AL.getArgAsIdent(0)->getIdentifierInfo(); + IdentifierInfo *II = AL.getArgAsIdent(0)->Ident; if (!EnumExtensibilityAttr::ConvertStrToKind(II->getName(), ExtensibilityKind)) { 
S.Diag(AL.getLoc(), diag::warn_attribute_type_not_supported) << AL << II; @@ -3855,7 +3853,7 @@ static bool handleFormatAttrCommon(Sema &S, Decl *D, const ParsedAttr &AL, bool HasImplicitThisParam = isInstanceMethod(D); Info->NumArgs = getFunctionOrMethodNumParams(D) + HasImplicitThisParam; - Info->Identifier = AL.getArgAsIdent(0)->getIdentifierInfo(); + Info->Identifier = AL.getArgAsIdent(0)->Ident; StringRef Format = Info->Identifier->getName(); if (normalizeName(Format)) { @@ -4019,14 +4017,14 @@ static void handleCallbackAttr(Sema &S, Decl *D, const ParsedAttr &AL) { if (AL.isArgIdent(I)) { IdentifierLoc *IdLoc = AL.getArgAsIdent(I); - auto It = NameIdxMapping.find(IdLoc->getIdentifierInfo()->getName()); + auto It = NameIdxMapping.find(IdLoc->Ident->getName()); if (It == UnknownName) { S.Diag(AL.getLoc(), diag::err_callback_attribute_argument_unknown) - << IdLoc->getIdentifierInfo() << IdLoc->getLoc(); + << IdLoc->Ident << IdLoc->Loc; return; } - SR = SourceRange(IdLoc->getLoc()); + SR = SourceRange(IdLoc->Loc); ArgIdx = It->second; } else if (AL.isArgExpr(I)) { Expr *IdxExpr = AL.getArgAsExpr(I); @@ -4144,14 +4142,13 @@ LifetimeCaptureByAttr *Sema::ParseLifetimeCaptureByAttr(const ParsedAttr &AL, } assert(AL.isArgIdent(I)); IdentifierLoc *IdLoc = AL.getArgAsIdent(I); - if (IdLoc->getIdentifierInfo()->getName() == ParamName) { - Diag(IdLoc->getLoc(), diag::err_capture_by_references_itself) - << IdLoc->getLoc(); + if (IdLoc->Ident->getName() == ParamName) { + Diag(IdLoc->Loc, diag::err_capture_by_references_itself) << IdLoc->Loc; IsValid = false; continue; } - ParamIdents[I] = IdLoc->getIdentifierInfo(); - ParamLocs[I] = IdLoc->getLoc(); + ParamIdents[I] = IdLoc->Ident; + ParamLocs[I] = IdLoc->Loc; } if (!IsValid) return nullptr; @@ -4757,7 +4754,7 @@ static void handleModeAttr(Sema &S, Decl *D, const ParsedAttr &AL) { return; } - IdentifierInfo *Name = AL.getArgAsIdent(0)->getIdentifierInfo(); + IdentifierInfo *Name = AL.getArgAsIdent(0)->Ident; 
S.AddModeAttr(D, AL, Name); } @@ -5730,8 +5727,8 @@ static void handleArgumentWithTypeTagAttr(Sema &S, Decl *D, } D->addAttr(::new (S.Context) ArgumentWithTypeTagAttr( - S.Context, AL, AL.getArgAsIdent(0)->getIdentifierInfo(), ArgumentIdx, - TypeTagIdx, IsPointer)); + S.Context, AL, AL.getArgAsIdent(0)->Ident, ArgumentIdx, TypeTagIdx, + IsPointer)); } static void handleTypeTagForDatatypeAttr(Sema &S, Decl *D, @@ -5751,7 +5748,7 @@ static void handleTypeTagForDatatypeAttr(Sema &S, Decl *D, return; } - IdentifierInfo *PointerKind = AL.getArgAsIdent(0)->getIdentifierInfo(); + IdentifierInfo *PointerKind = AL.getArgAsIdent(0)->Ident; TypeSourceInfo *MatchingCTypeLoc = nullptr; S.GetTypeFromParser(AL.getMatchingCType(), &MatchingCTypeLoc); assert(MatchingCTypeLoc && "no type source info for attribute argument"); @@ -5822,7 +5819,7 @@ static void handleBuiltinAliasAttr(Sema &S, Decl *D, const ParsedAttr &AL) { return; } - IdentifierInfo *Ident = AL.getArgAsIdent(0)->getIdentifierInfo(); + IdentifierInfo *Ident = AL.getArgAsIdent(0)->Ident; unsigned BuiltinID = Ident->getBuiltinID(); StringRef AliasName = cast(D)->getIdentifier()->getName(); @@ -6588,7 +6585,7 @@ static void handleCFGuardAttr(Sema &S, Decl *D, const ParsedAttr &AL) { } CFGuardAttr::GuardArg Arg; - IdentifierInfo *II = AL.getArgAsIdent(0)->getIdentifierInfo(); + IdentifierInfo *II = AL.getArgAsIdent(0)->Ident; if (!CFGuardAttr::ConvertStrToGuardArg(II->getName(), Arg)) { S.Diag(AL.getLoc(), diag::warn_attribute_type_not_supported) << AL << II; return; @@ -6690,9 +6687,8 @@ static void handleVTablePointerAuthentication(Sema &S, Decl *D, if (AL.isArgIdent(0)) { IdentifierLoc *IL = AL.getArgAsIdent(0); if (!VTablePointerAuthenticationAttr::ConvertStrToVPtrAuthKeyType( - IL->getIdentifierInfo()->getName(), KeyType)) { - S.Diag(IL->getLoc(), diag::err_invalid_authentication_key) - << IL->getIdentifierInfo(); + IL->Ident->getName(), KeyType)) { + S.Diag(IL->Loc, diag::err_invalid_authentication_key) << 
IL->Ident; AL.setInvalid(); } if (KeyType == VTablePointerAuthenticationAttr::DefaultKey && @@ -6712,16 +6708,15 @@ static void handleVTablePointerAuthentication(Sema &S, Decl *D, if (AL.isArgIdent(1)) { IdentifierLoc *IL = AL.getArgAsIdent(1); if (!VTablePointerAuthenticationAttr:: - ConvertStrToAddressDiscriminationMode( - IL->getIdentifierInfo()->getName(), AddressDiversityMode)) { - S.Diag(IL->getLoc(), diag::err_invalid_address_discrimination) - << IL->getIdentifierInfo(); + ConvertStrToAddressDiscriminationMode(IL->Ident->getName(), + AddressDiversityMode)) { + S.Diag(IL->Loc, diag::err_invalid_address_discrimination) << IL->Ident; AL.setInvalid(); } if (AddressDiversityMode == VTablePointerAuthenticationAttr::DefaultAddressDiscrimination && !S.getLangOpts().PointerAuthCalls) { - S.Diag(IL->getLoc(), diag::err_no_default_vtable_pointer_auth) << 1; + S.Diag(IL->Loc, diag::err_no_default_vtable_pointer_auth) << 1; AL.setInvalid(); } } else { @@ -6736,9 +6731,8 @@ static void handleVTablePointerAuthentication(Sema &S, Decl *D, if (AL.isArgIdent(2)) { IdentifierLoc *IL = AL.getArgAsIdent(2); if (!VTablePointerAuthenticationAttr::ConvertStrToExtraDiscrimination( - IL->getIdentifierInfo()->getName(), ED)) { - S.Diag(IL->getLoc(), diag::err_invalid_extra_discrimination) - << IL->getIdentifierInfo(); + IL->Ident->getName(), ED)) { + S.Diag(IL->Loc, diag::err_invalid_extra_discrimination) << IL->Ident; AL.setInvalid(); } if (ED == VTablePointerAuthenticationAttr::DefaultExtraDiscrimination && diff --git a/clang/lib/Sema/SemaDeclObjC.cpp b/clang/lib/Sema/SemaDeclObjC.cpp index 0a14ce23a396e..ba9d3dcf19617 100644 --- a/clang/lib/Sema/SemaDeclObjC.cpp +++ b/clang/lib/Sema/SemaDeclObjC.cpp @@ -1310,26 +1310,24 @@ static bool NestedProtocolHasNoDefinition(ObjCProtocolDecl *PDecl, /// protocol declarations in its 'Protocols' argument. 
void SemaObjC::FindProtocolDeclaration(bool WarnOnDeclarations, bool ForObjCContainer, - ArrayRef ProtocolId, + ArrayRef ProtocolId, SmallVectorImpl &Protocols) { - for (const IdentifierLoc &Pair : ProtocolId) { - ObjCProtocolDecl *PDecl = - LookupProtocol(Pair.getIdentifierInfo(), Pair.getLoc()); + for (const IdentifierLocPair &Pair : ProtocolId) { + ObjCProtocolDecl *PDecl = LookupProtocol(Pair.first, Pair.second); if (!PDecl) { DeclFilterCCC CCC{}; - TypoCorrection Corrected = SemaRef.CorrectTypo( - DeclarationNameInfo(Pair.getIdentifierInfo(), Pair.getLoc()), - Sema::LookupObjCProtocolName, SemaRef.TUScope, nullptr, CCC, - Sema::CTK_ErrorRecovery); + TypoCorrection Corrected = + SemaRef.CorrectTypo(DeclarationNameInfo(Pair.first, Pair.second), + Sema::LookupObjCProtocolName, SemaRef.TUScope, + nullptr, CCC, Sema::CTK_ErrorRecovery); if ((PDecl = Corrected.getCorrectionDeclAs())) SemaRef.diagnoseTypo(Corrected, PDiag(diag::err_undeclared_protocol_suggest) - << Pair.getIdentifierInfo()); + << Pair.first); } if (!PDecl) { - Diag(Pair.getLoc(), diag::err_undeclared_protocol) - << Pair.getIdentifierInfo(); + Diag(Pair.second, diag::err_undeclared_protocol) << Pair.first; continue; } // If this is a forward protocol declaration, get its definition. @@ -1339,7 +1337,7 @@ void SemaObjC::FindProtocolDeclaration(bool WarnOnDeclarations, // For an objc container, delay protocol reference checking until after we // can set the objc decl as the availability context, otherwise check now. 
if (!ForObjCContainer) { - (void)SemaRef.DiagnoseUseOfDecl(PDecl, Pair.getLoc()); + (void)SemaRef.DiagnoseUseOfDecl(PDecl, Pair.second); } // If this is a forward declaration and we are supposed to warn in this @@ -1349,8 +1347,7 @@ void SemaObjC::FindProtocolDeclaration(bool WarnOnDeclarations, if (WarnOnDeclarations && NestedProtocolHasNoDefinition(PDecl, UndefinedProtocol)) { - Diag(Pair.getLoc(), diag::warn_undef_protocolref) - << Pair.getIdentifierInfo(); + Diag(Pair.second, diag::warn_undef_protocolref) << Pair.first; Diag(UndefinedProtocol->getLocation(), diag::note_protocol_decl_undefined) << UndefinedProtocol; } @@ -1787,17 +1784,17 @@ void SemaObjC::DiagnoseClassExtensionDupMethods(ObjCCategoryDecl *CAT, /// ActOnForwardProtocolDeclaration - Handle \@protocol foo; SemaObjC::DeclGroupPtrTy SemaObjC::ActOnForwardProtocolDeclaration( - SourceLocation AtProtocolLoc, ArrayRef IdentList, + SourceLocation AtProtocolLoc, ArrayRef IdentList, const ParsedAttributesView &attrList) { ASTContext &Context = getASTContext(); SmallVector DeclsInGroup; - for (const IdentifierLoc &IdentPair : IdentList) { - IdentifierInfo *Ident = IdentPair.getIdentifierInfo(); + for (const IdentifierLocPair &IdentPair : IdentList) { + IdentifierInfo *Ident = IdentPair.first; ObjCProtocolDecl *PrevDecl = LookupProtocol( - Ident, IdentPair.getLoc(), SemaRef.forRedeclarationInCurContext()); + Ident, IdentPair.second, SemaRef.forRedeclarationInCurContext()); ObjCProtocolDecl *PDecl = ObjCProtocolDecl::Create(Context, SemaRef.CurContext, Ident, - IdentPair.getLoc(), AtProtocolLoc, PrevDecl); + IdentPair.second, AtProtocolLoc, PrevDecl); SemaRef.PushOnScopeChains(PDecl, SemaRef.TUScope); CheckObjCDeclScope(PDecl); diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 11f156ae09216..0b442b75d174d 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -1274,8 +1274,8 @@ bool SemaHLSL::handleResourceTypeAttr(QualType T, const ParsedAttr &AL) { } 
IdentifierLoc *Loc = AL.getArgAsIdent(0); - StringRef Identifier = Loc->getIdentifierInfo()->getName(); - SourceLocation ArgLoc = Loc->getLoc(); + StringRef Identifier = Loc->Ident->getName(); + SourceLocation ArgLoc = Loc->Loc; // Validate resource class value ResourceClass RC; @@ -1534,8 +1534,8 @@ void SemaHLSL::handleResourceBindingAttr(Decl *TheDecl, const ParsedAttr &AL) { } IdentifierLoc *Loc = AL.getArgAsIdent(0); - StringRef Str = Loc->getIdentifierInfo()->getName(); - SourceLocation ArgLoc = Loc->getLoc(); + StringRef Str = Loc->Ident->getName(); + SourceLocation ArgLoc = Loc->Loc; SourceLocation SpaceArgLoc; bool SpecifiedSpace = false; @@ -1549,8 +1549,8 @@ void SemaHLSL::handleResourceBindingAttr(Decl *TheDecl, const ParsedAttr &AL) { } IdentifierLoc *Loc = AL.getArgAsIdent(1); - Space = Loc->getIdentifierInfo()->getName(); - SpaceArgLoc = Loc->getLoc(); + Space = Loc->Ident->getName(); + SpaceArgLoc = Loc->Loc; } else { Slot = Str; } diff --git a/clang/lib/Sema/SemaModule.cpp b/clang/lib/Sema/SemaModule.cpp index 4bba57193ded6..76589bff40be9 100644 --- a/clang/lib/Sema/SemaModule.cpp +++ b/clang/lib/Sema/SemaModule.cpp @@ -15,7 +15,6 @@ #include "clang/AST/ASTMutationListener.h" #include "clang/Lex/HeaderSearch.h" #include "clang/Lex/Preprocessor.h" -#include "clang/Sema/ParsedAttr.h" #include "clang/Sema/SemaInternal.h" #include "llvm/ADT/StringExtras.h" @@ -69,7 +68,7 @@ static std::string stringFromPath(ModuleIdPath Path) { for (auto &Piece : Path) { if (!Name.empty()) Name += "."; - Name += Piece.getIdentifierInfo()->getName(); + Name += Piece.first->getName(); } return Name; } @@ -351,18 +350,17 @@ Sema::ActOnModuleDecl(SourceLocation StartLoc, SourceLocation ModuleLoc, // Test the first part of the path to see if it's std[0-9]+ but allow the // name in a system header. 
- StringRef FirstComponentName = Path[0].getIdentifierInfo()->getName(); - if (!getSourceManager().isInSystemHeader(Path[0].getLoc()) && + StringRef FirstComponentName = Path[0].first->getName(); + if (!getSourceManager().isInSystemHeader(Path[0].second) && (FirstComponentName == "std" || (FirstComponentName.starts_with("std") && llvm::all_of(FirstComponentName.drop_front(3), &llvm::isDigit)))) - Diag(Path[0].getLoc(), diag::warn_reserved_module_name) - << Path[0].getIdentifierInfo(); + Diag(Path[0].second, diag::warn_reserved_module_name) << Path[0].first; // Then test all of the components in the path to see if any of them are // using another kind of reserved or invalid identifier. for (auto Part : Path) { - if (DiagReservedModuleName(*this, Part.getIdentifierInfo(), Part.getLoc())) + if (DiagReservedModuleName(*this, Part.first, Part.second)) return nullptr; } @@ -378,10 +376,10 @@ Sema::ActOnModuleDecl(SourceLocation StartLoc, SourceLocation ModuleLoc, // correct. if (!getLangOpts().CurrentModule.empty() && getLangOpts().CurrentModule != ModuleName) { - Diag(Path.front().getLoc(), diag::err_current_module_name_mismatch) - << SourceRange(Path.front().getLoc(), IsPartition - ? Partition.back().getLoc() - : Path.back().getLoc()) + Diag(Path.front().second, diag::err_current_module_name_mismatch) + << SourceRange(Path.front().second, IsPartition + ? Partition.back().second + : Path.back().second) << getLangOpts().CurrentModule; return nullptr; } @@ -396,7 +394,7 @@ Sema::ActOnModuleDecl(SourceLocation StartLoc, SourceLocation ModuleLoc, // We can't have parsed or imported a definition of this module or parsed a // module map defining it already. 
if (auto *M = Map.findModule(ModuleName)) { - Diag(Path[0].getLoc(), diag::err_module_redefinition) << ModuleName; + Diag(Path[0].second, diag::err_module_redefinition) << ModuleName; if (M->DefinitionLoc.isValid()) Diag(M->DefinitionLoc, diag::note_prev_module_definition); else if (OptionalFileEntryRef FE = M->getASTFile()) @@ -419,8 +417,8 @@ Sema::ActOnModuleDecl(SourceLocation StartLoc, SourceLocation ModuleLoc, // keyword nor a module-partition implicitly imports the primary // module interface unit of the module as if by a module-import- // declaration. - IdentifierLoc ModuleNameLoc(Path[0].getLoc(), - PP.getIdentifierInfo(ModuleName)); + std::pair ModuleNameLoc( + PP.getIdentifierInfo(ModuleName), Path[0].second); // The module loader will assume we're trying to import the module that // we're building if `LangOpts.CurrentModule` equals to 'ModuleName'. @@ -492,7 +490,7 @@ Sema::ActOnModuleDecl(SourceLocation StartLoc, SourceLocation ModuleLoc, // Make the import decl for the interface in the impl module. ImportDecl *Import = ImportDecl::Create(Context, CurContext, ModuleLoc, - Interface, Path[0].getLoc()); + Interface, Path[0].second); CurContext->addDecl(Import); // Sequence initialization of the imported module before that of the current @@ -581,7 +579,7 @@ DeclResult Sema::ActOnModuleImport(SourceLocation StartLoc, // For a C++20 module name, flatten into a single identifier with the source // location of the first component. 
- IdentifierLoc ModuleNameLoc; + std::pair ModuleNameLoc; std::string ModuleName; if (IsPartition) { @@ -593,13 +591,11 @@ DeclResult Sema::ActOnModuleImport(SourceLocation StartLoc, ModuleName = NamedMod->getPrimaryModuleInterfaceName().str(); ModuleName += ":"; ModuleName += stringFromPath(Path); - ModuleNameLoc = - IdentifierLoc(Path[0].getLoc(), PP.getIdentifierInfo(ModuleName)); + ModuleNameLoc = {PP.getIdentifierInfo(ModuleName), Path[0].second}; Path = ModuleIdPath(ModuleNameLoc); } else if (getLangOpts().CPlusPlusModules) { ModuleName = stringFromPath(Path); - ModuleNameLoc = - IdentifierLoc(Path[0].getLoc(), PP.getIdentifierInfo(ModuleName)); + ModuleNameLoc = {PP.getIdentifierInfo(ModuleName), Path[0].second}; Path = ModuleIdPath(ModuleNameLoc); } @@ -684,7 +680,7 @@ DeclResult Sema::ActOnModuleImport(SourceLocation StartLoc, IdentifierLocs.push_back(SourceLocation()); } else if (getLangOpts().CPlusPlusModules && !Mod->Parent) { // A single identifier for the whole name. - IdentifierLocs.push_back(Path[0].getLoc()); + IdentifierLocs.push_back(Path[0].second); } else { Module *ModCheck = Mod; for (unsigned I = 0, N = Path.size(); I != N; ++I) { @@ -694,7 +690,7 @@ DeclResult Sema::ActOnModuleImport(SourceLocation StartLoc, break; ModCheck = ModCheck->Parent; - IdentifierLocs.push_back(Path[I].getLoc()); + IdentifierLocs.push_back(Path[I].second); } } @@ -711,7 +707,7 @@ DeclResult Sema::ActOnModuleImport(SourceLocation StartLoc, if (getLangOpts().CPlusPlusModules && ExportLoc.isValid() && Mod->Kind == Module::ModuleKind::ModulePartitionImplementation) { Diag(ExportLoc, diag::err_export_partition_impl) - << SourceRange(ExportLoc, Path.back().getLoc()); + << SourceRange(ExportLoc, Path.back().second); } else if (!ModuleScopes.empty() && !currentModuleIsImplementation()) { // Re-export the module if the imported module is exported. 
// Note that we don't need to add re-exported module to Imports field diff --git a/clang/lib/Sema/SemaObjC.cpp b/clang/lib/Sema/SemaObjC.cpp index 9b24b5f052119..073d9791d037b 100644 --- a/clang/lib/Sema/SemaObjC.cpp +++ b/clang/lib/Sema/SemaObjC.cpp @@ -1446,8 +1446,10 @@ SemaObjC::ObjCSubscriptKind SemaObjC::CheckSubscriptingKind(Expr *FromE) { void SemaObjC::AddCFAuditedAttribute(Decl *D) { ASTContext &Context = getASTContext(); - auto IdLoc = SemaRef.PP.getPragmaARCCFCodeAuditedInfo(); - if (!IdLoc.getLoc().isValid()) + IdentifierInfo *Ident; + SourceLocation Loc; + std::tie(Ident, Loc) = SemaRef.PP.getPragmaARCCFCodeAuditedInfo(); + if (!Loc.isValid()) return; // Don't add a redundant or conflicting attribute. @@ -1455,8 +1457,7 @@ void SemaObjC::AddCFAuditedAttribute(Decl *D) { D->hasAttr()) return; - AttributeCommonInfo Info(IdLoc.getIdentifierInfo(), - SourceRange(IdLoc.getLoc()), + AttributeCommonInfo Info(Ident, SourceRange(Loc), AttributeCommonInfo::Form::Pragma()); D->addAttr(CFAuditedTransferAttr::CreateImplicit(Context, Info)); } @@ -1641,10 +1642,8 @@ void SemaObjC::handleMethodFamilyAttr(Decl *D, const ParsedAttr &AL) { IdentifierLoc *IL = AL.getArgAsIdent(0); ObjCMethodFamilyAttr::FamilyKind F; - if (!ObjCMethodFamilyAttr::ConvertStrToFamilyKind( - IL->getIdentifierInfo()->getName(), F)) { - Diag(IL->getLoc(), diag::warn_attribute_type_not_supported) - << AL << IL->getIdentifierInfo(); + if (!ObjCMethodFamilyAttr::ConvertStrToFamilyKind(IL->Ident->getName(), F)) { + Diag(IL->Loc, diag::warn_attribute_type_not_supported) << AL << IL->Ident; return; } @@ -1707,7 +1706,7 @@ void SemaObjC::handleBlocksAttr(Decl *D, const ParsedAttr &AL) { return; } - IdentifierInfo *II = AL.getArgAsIdent(0)->getIdentifierInfo(); + IdentifierInfo *II = AL.getArgAsIdent(0)->Ident; BlocksAttr::BlockType type; if (!BlocksAttr::ConvertStrToBlockType(II->getName(), type)) { Diag(AL.getLoc(), diag::warn_attribute_type_not_supported) << AL << II; @@ -1999,7 +1998,7 @@ void 
SemaObjC::handleNSErrorDomain(Decl *D, const ParsedAttr &Attr) { IdentifierLoc *IdentLoc = Attr.isArgIdent(0) ? Attr.getArgAsIdent(0) : nullptr; - if (!IdentLoc || !IdentLoc->getIdentifierInfo()) { + if (!IdentLoc || !IdentLoc->Ident) { // Try to locate the argument directly. SourceLocation Loc = Attr.getLoc(); if (Attr.isArgExpr(0) && Attr.getArgAsExpr(0)) @@ -2010,18 +2009,18 @@ void SemaObjC::handleNSErrorDomain(Decl *D, const ParsedAttr &Attr) { } // Verify that the identifier is a valid decl in the C decl namespace. - LookupResult Result(SemaRef, DeclarationName(IdentLoc->getIdentifierInfo()), + LookupResult Result(SemaRef, DeclarationName(IdentLoc->Ident), SourceLocation(), Sema::LookupNameKind::LookupOrdinaryName); if (!SemaRef.LookupName(Result, SemaRef.TUScope) || !Result.getAsSingle()) { - Diag(IdentLoc->getLoc(), diag::err_nserrordomain_invalid_decl) - << 1 << IdentLoc->getIdentifierInfo(); + Diag(IdentLoc->Loc, diag::err_nserrordomain_invalid_decl) + << 1 << IdentLoc->Ident; return; } - D->addAttr(::new (getASTContext()) NSErrorDomainAttr( - getASTContext(), Attr, IdentLoc->getIdentifierInfo())); + D->addAttr(::new (getASTContext()) + NSErrorDomainAttr(getASTContext(), Attr, IdentLoc->Ident)); } void SemaObjC::handleBridgeAttr(Decl *D, const ParsedAttr &AL) { @@ -2034,7 +2033,7 @@ void SemaObjC::handleBridgeAttr(Decl *D, const ParsedAttr &AL) { // Typedefs only allow objc_bridge(id) and have some additional checking. 
if (const auto *TD = dyn_cast(D)) { - if (!Parm->getIdentifierInfo()->isStr("id")) { + if (!Parm->Ident->isStr("id")) { Diag(AL.getLoc(), diag::err_objc_attr_typedef_not_id) << AL; return; } @@ -2047,8 +2046,8 @@ void SemaObjC::handleBridgeAttr(Decl *D, const ParsedAttr &AL) { } } - D->addAttr(::new (getASTContext()) ObjCBridgeAttr(getASTContext(), AL, - Parm->getIdentifierInfo())); + D->addAttr(::new (getASTContext()) + ObjCBridgeAttr(getASTContext(), AL, Parm->Ident)); } void SemaObjC::handleBridgeMutableAttr(Decl *D, const ParsedAttr &AL) { @@ -2059,21 +2058,21 @@ void SemaObjC::handleBridgeMutableAttr(Decl *D, const ParsedAttr &AL) { return; } - D->addAttr(::new (getASTContext()) ObjCBridgeMutableAttr( - getASTContext(), AL, Parm->getIdentifierInfo())); + D->addAttr(::new (getASTContext()) + ObjCBridgeMutableAttr(getASTContext(), AL, Parm->Ident)); } void SemaObjC::handleBridgeRelatedAttr(Decl *D, const ParsedAttr &AL) { IdentifierInfo *RelatedClass = - AL.isArgIdent(0) ? AL.getArgAsIdent(0)->getIdentifierInfo() : nullptr; + AL.isArgIdent(0) ? AL.getArgAsIdent(0)->Ident : nullptr; if (!RelatedClass) { Diag(D->getBeginLoc(), diag::err_objc_attr_not_id) << AL << 0; return; } IdentifierInfo *ClassMethod = - AL.getArgAsIdent(1) ? AL.getArgAsIdent(1)->getIdentifierInfo() : nullptr; + AL.getArgAsIdent(1) ? AL.getArgAsIdent(1)->Ident : nullptr; IdentifierInfo *InstanceMethod = - AL.getArgAsIdent(2) ? AL.getArgAsIdent(2)->getIdentifierInfo() : nullptr; + AL.getArgAsIdent(2) ? 
AL.getArgAsIdent(2)->Ident : nullptr; D->addAttr(::new (getASTContext()) ObjCBridgeRelatedAttr( getASTContext(), AL, RelatedClass, ClassMethod, InstanceMethod)); } diff --git a/clang/lib/Sema/SemaOpenACCClause.cpp b/clang/lib/Sema/SemaOpenACCClause.cpp index 049baead031a1..ab25dcfd1a081 100644 --- a/clang/lib/Sema/SemaOpenACCClause.cpp +++ b/clang/lib/Sema/SemaOpenACCClause.cpp @@ -1343,7 +1343,7 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitDeviceTypeClause( // the limitation, since the Dialect requires this. if (Clause.getDirectiveKind() == OpenACCDirectiveKind::Set && Clause.getDeviceTypeArchitectures().size() > 1) { - SemaRef.Diag(Clause.getDeviceTypeArchitectures()[1].getLoc(), + SemaRef.Diag(Clause.getDeviceTypeArchitectures()[1].second, diag::err_acc_device_type_multiple_archs); return nullptr; } @@ -1369,17 +1369,16 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitDeviceTypeClause( bool Diagnosed = false; auto FilterPred = [&](const DeviceTypeArgument &Arch) { // The '*' case. 
- if (!Arch.getIdentifierInfo()) + if (!Arch.first) return false; return llvm::find_if(ValidValues, [&](StringRef RHS) { - return Arch.getIdentifierInfo()->getName().equals_insensitive(RHS); + return Arch.first->getName().equals_insensitive(RHS); }) == ValidValues.end(); }; auto Diagnose = [&](const DeviceTypeArgument &Arch) { - Diagnosed = SemaRef.Diag(Arch.getLoc(), diag::err_acc_invalid_default_type) - << Arch.getIdentifierInfo() << Clause.getClauseKind() - << ValidValuesString; + Diagnosed = SemaRef.Diag(Arch.second, diag::err_acc_invalid_default_type) + << Arch.first << Clause.getClauseKind() << ValidValuesString; }; // There aren't stable enumertor versions of 'for-each-then-erase', so do it diff --git a/clang/lib/Sema/SemaStmtAttr.cpp b/clang/lib/Sema/SemaStmtAttr.cpp index a09626c3a9a8c..2f719c6d7a21e 100644 --- a/clang/lib/Sema/SemaStmtAttr.cpp +++ b/clang/lib/Sema/SemaStmtAttr.cpp @@ -79,10 +79,9 @@ static Attr *handleLoopHintAttr(Sema &S, Stmt *St, const ParsedAttr &A, Expr *ValueExpr = A.getArgAsExpr(3); StringRef PragmaName = - llvm::StringSwitch( - PragmaNameLoc->getIdentifierInfo()->getName()) + llvm::StringSwitch(PragmaNameLoc->Ident->getName()) .Cases("unroll", "nounroll", "unroll_and_jam", "nounroll_and_jam", - PragmaNameLoc->getIdentifierInfo()->getName()) + PragmaNameLoc->Ident->getName()) .Default("clang loop"); // This could be handled automatically by adding a Subjects definition in @@ -128,10 +127,10 @@ static Attr *handleLoopHintAttr(Sema &S, Stmt *St, const ParsedAttr &A, SetHints(LoopHintAttr::UnrollAndJam, LoopHintAttr::Enable); } else { // #pragma clang loop ... 
- assert(OptionLoc && OptionLoc->getIdentifierInfo() && + assert(OptionLoc && OptionLoc->Ident && "Attribute must have valid option info."); Option = llvm::StringSwitch( - OptionLoc->getIdentifierInfo()->getName()) + OptionLoc->Ident->getName()) .Case("vectorize", LoopHintAttr::Vectorize) .Case("vectorize_width", LoopHintAttr::VectorizeWidth) .Case("interleave", LoopHintAttr::Interleave) @@ -145,13 +144,12 @@ static Attr *handleLoopHintAttr(Sema &S, Stmt *St, const ParsedAttr &A, .Case("distribute", LoopHintAttr::Distribute) .Default(LoopHintAttr::Vectorize); if (Option == LoopHintAttr::VectorizeWidth) { - assert((ValueExpr || (StateLoc && StateLoc->getIdentifierInfo())) && + assert((ValueExpr || (StateLoc && StateLoc->Ident)) && "Attribute must have a valid value expression or argument."); if (ValueExpr && S.CheckLoopHintExpr(ValueExpr, St->getBeginLoc(), /*AllowZero=*/false)) return nullptr; - if (StateLoc && StateLoc->getIdentifierInfo() && - StateLoc->getIdentifierInfo()->isStr("scalable")) + if (StateLoc && StateLoc->Ident && StateLoc->Ident->isStr("scalable")) State = LoopHintAttr::ScalableWidth; else State = LoopHintAttr::FixedWidth; @@ -169,15 +167,14 @@ static Attr *handleLoopHintAttr(Sema &S, Stmt *St, const ParsedAttr &A, Option == LoopHintAttr::Unroll || Option == LoopHintAttr::Distribute || Option == LoopHintAttr::PipelineDisabled) { - assert(StateLoc && StateLoc->getIdentifierInfo() && - "Loop hint must have an argument"); - if (StateLoc->getIdentifierInfo()->isStr("disable")) + assert(StateLoc && StateLoc->Ident && "Loop hint must have an argument"); + if (StateLoc->Ident->isStr("disable")) State = LoopHintAttr::Disable; - else if (StateLoc->getIdentifierInfo()->isStr("assume_safety")) + else if (StateLoc->Ident->isStr("assume_safety")) State = LoopHintAttr::AssumeSafety; - else if (StateLoc->getIdentifierInfo()->isStr("full")) + else if (StateLoc->Ident->isStr("full")) State = LoopHintAttr::Full; - else if 
(StateLoc->getIdentifierInfo()->isStr("enable")) + else if (StateLoc->Ident->isStr("enable")) State = LoopHintAttr::Enable; else llvm_unreachable("bad loop hint argument"); @@ -647,8 +644,8 @@ static Attr *handleAtomicAttr(Sema &S, Stmt *St, const ParsedAttr &AL, } IdentifierLoc *Ident = AL.getArgAsIdent(ArgIndex); - OptionString = Ident->getIdentifierInfo()->getName(); - Loc = Ident->getLoc(); + OptionString = Ident->Ident->getName(); + Loc = Ident->Loc; if (!AtomicAttr::ConvertStrToConsumedOption(OptionString, Option)) { S.Diag(Loc, diag::err_attribute_invalid_atomic_argument) << OptionString; return nullptr; diff --git a/clang/lib/Sema/SemaSwift.cpp b/clang/lib/Sema/SemaSwift.cpp index 4aae855a24b8f..fe72d6c85c37a 100644 --- a/clang/lib/Sema/SemaSwift.cpp +++ b/clang/lib/Sema/SemaSwift.cpp @@ -148,8 +148,8 @@ void SemaSwift::handleError(Decl *D, const ParsedAttr &AL) { return true; S.Diag(AL.getLoc(), diag::err_attr_swift_error_return_type) - << AL << AL.getArgAsIdent(0)->getIdentifierInfo()->getName() - << isa(D) << /*pointer*/ 1; + << AL << AL.getArgAsIdent(0)->Ident->getName() << isa(D) + << /*pointer*/ 1; return false; }; @@ -159,8 +159,8 @@ void SemaSwift::handleError(Decl *D, const ParsedAttr &AL) { return true; S.Diag(AL.getLoc(), diag::err_attr_swift_error_return_type) - << AL << AL.getArgAsIdent(0)->getIdentifierInfo()->getName() - << isa(D) << /*integral*/ 0; + << AL << AL.getArgAsIdent(0)->Ident->getName() << isa(D) + << /*integral*/ 0; return false; }; @@ -169,10 +169,10 @@ void SemaSwift::handleError(Decl *D, const ParsedAttr &AL) { IdentifierLoc *Loc = AL.getArgAsIdent(0); SwiftErrorAttr::ConventionKind Convention; - if (!SwiftErrorAttr::ConvertStrToConventionKind( - Loc->getIdentifierInfo()->getName(), Convention)) { + if (!SwiftErrorAttr::ConvertStrToConventionKind(Loc->Ident->getName(), + Convention)) { Diag(AL.getLoc(), diag::warn_attribute_type_not_supported) - << AL << Loc->getIdentifierInfo(); + << AL << Loc->Ident; return; } @@ -287,10 
+287,10 @@ static void checkSwiftAsyncErrorBlock(Sema &S, Decl *D, void SemaSwift::handleAsyncError(Decl *D, const ParsedAttr &AL) { IdentifierLoc *IDLoc = AL.getArgAsIdent(0); SwiftAsyncErrorAttr::ConventionKind ConvKind; - if (!SwiftAsyncErrorAttr::ConvertStrToConventionKind( - IDLoc->getIdentifierInfo()->getName(), ConvKind)) { + if (!SwiftAsyncErrorAttr::ConvertStrToConventionKind(IDLoc->Ident->getName(), + ConvKind)) { Diag(AL.getLoc(), diag::warn_attribute_type_not_supported) - << AL << IDLoc->getIdentifierInfo(); + << AL << IDLoc->Ident; return; } @@ -643,7 +643,7 @@ void SemaSwift::handleNewType(Decl *D, const ParsedAttr &AL) { } SwiftNewTypeAttr::NewtypeKind Kind; - IdentifierInfo *II = AL.getArgAsIdent(0)->getIdentifierInfo(); + IdentifierInfo *II = AL.getArgAsIdent(0)->Ident; if (!SwiftNewTypeAttr::ConvertStrToNewtypeKind(II->getName(), Kind)) { Diag(AL.getLoc(), diag::warn_attribute_type_not_supported) << AL << II; return; @@ -667,7 +667,7 @@ void SemaSwift::handleAsyncAttr(Decl *D, const ParsedAttr &AL) { } SwiftAsyncAttr::Kind Kind; - IdentifierInfo *II = AL.getArgAsIdent(0)->getIdentifierInfo(); + IdentifierInfo *II = AL.getArgAsIdent(0)->Ident; if (!SwiftAsyncAttr::ConvertStrToKind(II->getName(), Kind)) { Diag(AL.getLoc(), diag::err_swift_async_no_access) << AL << II; return; diff --git a/clang/lib/Sema/SemaTemplateVariadic.cpp b/clang/lib/Sema/SemaTemplateVariadic.cpp index 87682233c5246..dc7e3a0bf8875 100644 --- a/clang/lib/Sema/SemaTemplateVariadic.cpp +++ b/clang/lib/Sema/SemaTemplateVariadic.cpp @@ -14,7 +14,6 @@ #include "clang/AST/ExprObjC.h" #include "clang/AST/TypeLoc.h" #include "clang/Sema/Lookup.h" -#include "clang/Sema/ParsedAttr.h" #include "clang/Sema/ParsedTemplate.h" #include "clang/Sema/ScopeInfo.h" #include "clang/Sema/Sema.h" @@ -756,7 +755,7 @@ bool Sema::CheckParameterPacksForExpansion( bool &RetainExpansion, UnsignedOrNone &NumExpansions) { ShouldExpand = true; RetainExpansion = false; - IdentifierLoc FirstPack; + std::pair 
FirstPack; bool HaveFirstPack = false; UnsignedOrNone NumPartialExpansions = std::nullopt; SourceLocation PartiallySubstitutedPackLoc; @@ -868,7 +867,8 @@ bool Sema::CheckParameterPacksForExpansion( // This is the first pack we've seen for which we have an argument. // Record it. NumExpansions = NewPackSize; - FirstPack = IdentifierLoc(ParmPack.second, Name); + FirstPack.first = Name; + FirstPack.second = ParmPack.second; HaveFirstPack = true; continue; } @@ -905,9 +905,9 @@ bool Sema::CheckParameterPacksForExpansion( // the same number of arguments specified. if (HaveFirstPack) Diag(EllipsisLoc, diag::err_pack_expansion_length_conflict) - << FirstPack.getIdentifierInfo() << Name << *NumExpansions + << FirstPack.first << Name << *NumExpansions << (LeastNewPackSize != NewPackSize) << LeastNewPackSize - << SourceRange(FirstPack.getLoc()) << SourceRange(ParmPack.second); + << SourceRange(FirstPack.second) << SourceRange(ParmPack.second); else Diag(EllipsisLoc, diag::err_pack_expansion_length_conflict_multilevel) << Name << *NumExpansions << (LeastNewPackSize != NewPackSize) diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index 6e7ee8b5506ff..eba7267904fb2 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -99,8 +99,8 @@ static void diagnoseBadTypeAttribute(Sema &S, const ParsedAttr &attr, StringRef name = attr.getAttrName()->getName(); // The GC attributes are usually written with macros; special-case them. - IdentifierInfo *II = - attr.isArgIdent(0) ? attr.getArgAsIdent(0)->getIdentifierInfo() : nullptr; + IdentifierInfo *II = attr.isArgIdent(0) ? 
attr.getArgAsIdent(0)->Ident + : nullptr; if (useExpansionLoc && loc.isMacroID() && II) { if (II->isStr("strong")) { if (S.findMacroSpelling(loc, "__strong")) name = "__strong"; @@ -5732,7 +5732,8 @@ static void transferARCOwnershipToDeclaratorChunk(TypeProcessingState &state, } IdentifierLoc *Arg = new (S.Context) IdentifierLoc; - Arg->setIdentifierInfo(&S.Context.Idents.get(attrStr)); + Arg->Ident = &S.Context.Idents.get(attrStr); + Arg->Loc = SourceLocation(); ArgsUnion Args(Arg); @@ -6632,7 +6633,7 @@ static bool handleObjCOwnershipTypeAttr(TypeProcessingState &state, return true; } - IdentifierInfo *II = attr.getArgAsIdent(0)->getIdentifierInfo(); + IdentifierInfo *II = attr.getArgAsIdent(0)->Ident; Qualifiers::ObjCLifetime lifetime; if (II->isStr("none")) lifetime = Qualifiers::OCL_ExplicitNone; @@ -6810,7 +6811,7 @@ static bool handleObjCGCTypeAttr(TypeProcessingState &state, ParsedAttr &attr, return true; } - IdentifierInfo *II = attr.getArgAsIdent(0)->getIdentifierInfo(); + IdentifierInfo *II = attr.getArgAsIdent(0)->Ident; if (II->isStr("weak")) GCAttr = Qualifiers::Weak; else if (II->isStr("strong")) @@ -7540,7 +7541,7 @@ static Attr *getCCTypeAttr(ASTContext &Ctx, ParsedAttr &Attr) { if (Attr.isArgExpr(0)) Str = cast(Attr.getArgAsExpr(0))->getString(); else - Str = Attr.getArgAsIdent(0)->getIdentifierInfo()->getName(); + Str = Attr.getArgAsIdent(0)->Ident->getName(); PcsAttr::PCSType Type; if (!PcsAttr::ConvertStrToPCSType(Str, Type)) llvm_unreachable("already validated the attribute"); diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index b404015867087..02c31dff620ec 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -12811,7 +12811,7 @@ OpenACCClause *ASTRecordReader::readOpenACCClause() { for (unsigned I = 0; I < NumArchs; ++I) { IdentifierInfo *Ident = readBool() ? 
readIdentifier() : nullptr; SourceLocation Loc = readSourceLocation(); - Archs.emplace_back(Loc, Ident); + Archs.emplace_back(Ident, Loc); } return OpenACCDeviceTypeClause::Create(getContext(), ClauseKind, BeginLoc, diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 4dca0613cb9ae..95b5718f1d140 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -8774,10 +8774,10 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) { writeSourceLocation(DTC->getLParenLoc()); writeUInt32(DTC->getArchitectures().size()); for (const DeviceTypeArgument &Arg : DTC->getArchitectures()) { - writeBool(Arg.getIdentifierInfo()); - if (Arg.getIdentifierInfo()) - AddIdentifierRef(Arg.getIdentifierInfo()); - writeSourceLocation(Arg.getLoc()); + writeBool(Arg.first); + if (Arg.first) + AddIdentifierRef(Arg.first); + writeSourceLocation(Arg.second); } return; } diff --git a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp index 07856dbdba4b4..429bf823616da 100644 --- a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp +++ b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp @@ -664,7 +664,7 @@ void ModuleDepCollectorPP::moduleImport(SourceLocation ImportLoc, const Module *Imported) { if (MDC.ScanInstance.getPreprocessor().isInImportingCXXNamedModules()) { P1689ModuleInfo RequiredModule; - RequiredModule.ModuleName = Path[0].getIdentifierInfo()->getName().str(); + RequiredModule.ModuleName = Path[0].first->getName().str(); RequiredModule.Type = P1689ModuleInfo::ModuleType::NamedCXXModule; MDC.RequiredStdCXXModules.push_back(RequiredModule); return; From ab7e0c0fc00b2c0ccae735cb0def103831d15b3b Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Wed, 16 Apr 2025 17:08:10 +0200 Subject: [PATCH 123/710] [clang][bytecode] Implement __builtin_wmem{cpy,move} (#135969) --- 
clang/lib/AST/ByteCode/InterpBuiltin.cpp | 30 ++++++--- clang/test/AST/ByteCode/builtin-functions.cpp | 62 +++++++++++++++++++ 2 files changed, 84 insertions(+), 8 deletions(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index d06941bf10fe0..b694a34e47ee0 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -1788,14 +1788,18 @@ static bool interp__builtin_memcpy(InterpState &S, CodePtr OpPC, Pointer DestPtr = getParam(Frame, 0); const ASTContext &ASTCtx = S.getASTContext(); const Pointer &SrcPtr = getParam(Frame, 1); - const APSInt &Size = - peekToAPSInt(S.Stk, *S.getContext().classify(Call->getArg(2))); + APSInt Size = peekToAPSInt(S.Stk, *S.getContext().classify(Call->getArg(2))); assert(!Size.isSigned() && "memcpy and friends take an unsigned size"); if (ID == Builtin::BImemcpy || ID == Builtin::BImemmove) diagnoseNonConstexprBuiltin(S, OpPC, ID); - bool Move = (ID == Builtin::BI__builtin_memmove || ID == Builtin::BImemmove); + bool Move = + (ID == Builtin::BI__builtin_memmove || ID == Builtin::BImemmove || + ID == Builtin::BI__builtin_wmemmove || ID == Builtin::BIwmemmove); + bool WChar = ID == Builtin::BIwmemcpy || ID == Builtin::BIwmemmove || + ID == Builtin::BI__builtin_wmemcpy || + ID == Builtin::BI__builtin_wmemmove; // If the size is zero, we treat this as always being a valid no-op. if (Size.isZero()) { @@ -1806,7 +1810,7 @@ static bool interp__builtin_memcpy(InterpState &S, CodePtr OpPC, if (SrcPtr.isZero() || DestPtr.isZero()) { Pointer DiagPtr = (SrcPtr.isZero() ? SrcPtr : DestPtr); S.FFDiag(S.Current->getSource(OpPC), diag::note_constexpr_memcpy_null) - << /*IsMove=*/Move << /*IsWchar=*/false << !SrcPtr.isZero() + << /*IsMove=*/Move << /*IsWchar=*/WChar << !SrcPtr.isZero() << DiagPtr.toDiagnosticString(ASTCtx); return false; } @@ -1818,7 +1822,7 @@ static bool interp__builtin_memcpy(InterpState &S, CodePtr OpPC, ? 
std::to_string(SrcPtr.getIntegerRepresentation()) : std::to_string(DestPtr.getIntegerRepresentation()); S.FFDiag(S.Current->getSource(OpPC), diag::note_constexpr_memcpy_null) - << Move << false << DestPtr.isIntegralPointer() << DiagVal; + << Move << WChar << DestPtr.isIntegralPointer() << DiagVal; return false; } @@ -1837,11 +1841,17 @@ static bool interp__builtin_memcpy(InterpState &S, CodePtr OpPC, } unsigned DestElemSize = ASTCtx.getTypeSizeInChars(DestElemType).getQuantity(); + if (WChar) { + uint64_t WCharSize = + ASTCtx.getTypeSizeInChars(ASTCtx.getWCharType()).getQuantity(); + Size *= APSInt(APInt(Size.getBitWidth(), WCharSize, /*IsSigned=*/false), + /*IsUnsigned=*/true); + } + if (Size.urem(DestElemSize) != 0) { S.FFDiag(S.Current->getSource(OpPC), diag::note_constexpr_memcpy_unsupported) - << Move << /*IsWchar=*/false << 0 << DestElemType << Size - << DestElemSize; + << Move << WChar << 0 << DestElemType << Size << DestElemSize; return false; } @@ -1869,7 +1879,7 @@ static bool interp__builtin_memcpy(InterpState &S, CodePtr OpPC, APInt N = Size.udiv(DestElemSize); S.FFDiag(S.Current->getSource(OpPC), diag::note_constexpr_memcpy_unsupported) - << Move << /*IsWChar*/ false << (Size.ugt(RemainingSrcBytes) ? 1 : 2) + << Move << WChar << (Size.ugt(RemainingSrcBytes) ?
1 : 2) << DestElemType << toString(N, 10, /*Signed=*/false); return false; } @@ -2587,8 +2597,12 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const Function *F, case Builtin::BI__builtin_memcpy: case Builtin::BImemcpy: + case Builtin::BI__builtin_wmemcpy: + case Builtin::BIwmemcpy: case Builtin::BI__builtin_memmove: case Builtin::BImemmove: + case Builtin::BI__builtin_wmemmove: + case Builtin::BIwmemmove: if (!interp__builtin_memcpy(S, OpPC, Frame, F, Call)) return false; break; diff --git a/clang/test/AST/ByteCode/builtin-functions.cpp b/clang/test/AST/ByteCode/builtin-functions.cpp index 40f7a18119751..a57b4530d2264 100644 --- a/clang/test/AST/ByteCode/builtin-functions.cpp +++ b/clang/test/AST/ByteCode/builtin-functions.cpp @@ -30,6 +30,7 @@ extern "C" { extern wchar_t *wcschr(const wchar_t *s, wchar_t c); extern int wcscmp(const wchar_t *s1, const wchar_t *s2); extern int wcsncmp(const wchar_t *s1, const wchar_t *s2, size_t n); + extern wchar_t *wmemcpy(wchar_t *d, const wchar_t *s, size_t n); } namespace strcmp { @@ -1592,6 +1593,67 @@ namespace WMemChr { // both-note {{non-constexpr function 'wcschr' cannot be used in a constant expression}} } +namespace WMemCpy { + template + constexpr T result(T (&arr)[4]) { + return arr[0] * 1000 + arr[1] * 100 + arr[2] * 10 + arr[3]; + } + constexpr int test_wmemcpy(int a, int b, int n) { + wchar_t arr[4] = {1, 2, 3, 4}; + __builtin_wmemcpy(arr + a, arr + b, n); + // both-note@-1 2{{overlapping memory regions}} + // both-note@-2 {{source is not a contiguous array of at least 2 elements of type 'wchar_t'}} + // both-note@-3 {{destination is not a contiguous array of at least 3 elements of type 'wchar_t'}} + return result(arr); + } + static_assert(test_wmemcpy(1, 2, 1) == 1334); + static_assert(test_wmemcpy(2, 1, 1) == 1224); + static_assert(test_wmemcpy(0, 1, 2) == 2334); // both-error {{constant}} both-note {{in call}} + static_assert(test_wmemcpy(1, 0, 2) == 1124); // both-error {{constant}} both-note {{in 
call}} + static_assert(test_wmemcpy(1, 2, 1) == 1334); + static_assert(test_wmemcpy(0, 3, 1) == 4234); + static_assert(test_wmemcpy(0, 3, 2) == 4234); // both-error {{constant}} both-note {{in call}} + static_assert(test_wmemcpy(2, 0, 3) == 4234); // both-error {{constant}} both-note {{in call}} + + wchar_t global; + constexpr wchar_t *null = 0; + static_assert(__builtin_wmemcpy(&global, null, sizeof(wchar_t))); // both-error {{}} \ + // both-note {{source of 'wmemcpy' is nullptr}} + static_assert(__builtin_wmemcpy(null, &global, sizeof(wchar_t))); // both-error {{}} \ + // both-note {{destination of 'wmemcpy' is nullptr}} +} + +namespace WMemMove { + template + constexpr T result(T (&arr)[4]) { + return arr[0] * 1000 + arr[1] * 100 + arr[2] * 10 + arr[3]; + } + + constexpr int test_wmemmove(int a, int b, int n) { + wchar_t arr[4] = {1, 2, 3, 4}; + __builtin_wmemmove(arr + a, arr + b, n); + // both-note@-1 {{source is not a contiguous array of at least 2 elements of type 'wchar_t'}} + // both-note@-2 {{destination is not a contiguous array of at least 3 elements of type 'wchar_t'}} + return result(arr); + } + + static_assert(test_wmemmove(1, 2, 1) == 1334); + static_assert(test_wmemmove(2, 1, 1) == 1224); + static_assert(test_wmemmove(0, 1, 2) == 2334); + static_assert(test_wmemmove(1, 0, 2) == 1124); + static_assert(test_wmemmove(1, 2, 1) == 1334); + static_assert(test_wmemmove(0, 3, 1) == 4234); + static_assert(test_wmemmove(0, 3, 2) == 4234); // both-error {{constant}} both-note {{in call}} + static_assert(test_wmemmove(2, 0, 3) == 4234); // both-error {{constant}} both-note {{in call}} + + wchar_t global; + constexpr wchar_t *null = 0; + static_assert(__builtin_wmemmove(&global, null, sizeof(wchar_t))); // both-error {{}} \ + // both-note {{source of 'wmemmove' is nullptr}} + static_assert(__builtin_wmemmove(null, &global, sizeof(wchar_t))); // both-error {{}} \ + // both-note {{destination of 'wmemmove' is nullptr}} +} + namespace Invalid { constexpr int 
test() { // both-error {{never produces a constant expression}} __builtin_abort(); // both-note 2{{subexpression not valid in a constant expression}} From 2e9ab7cf96d802a906de342f32bc844036152ada Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Wed, 16 Apr 2025 08:09:34 -0700 Subject: [PATCH 124/710] [NFC][Driver][CFI] Update boolean expression (#135881) Show why we don't need regular CFI runtime, when CFI diag runtime is linked. --- clang/lib/Driver/SanitizerArgs.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp index 3c7cd562a14e3..1db9da1b60939 100644 --- a/clang/lib/Driver/SanitizerArgs.cpp +++ b/clang/lib/Driver/SanitizerArgs.cpp @@ -371,8 +371,7 @@ bool SanitizerArgs::needsUbsanCXXRt() const { } bool SanitizerArgs::needsCfiRt() const { - return !(Sanitizers.Mask & SanitizerKind::CFI & ~TrapSanitizers.Mask) && - CfiCrossDso && !ImplicitCfiRuntime; + return !needsCfiDiagRt() && CfiCrossDso && !ImplicitCfiRuntime; } bool SanitizerArgs::needsCfiDiagRt() const { From f875dd10162dcfb8f4625cef2bfc8e6b9f73f8fc Mon Sep 17 00:00:00 2001 From: Felipe de Azevedo Piovezan Date: Wed, 16 Apr 2025 08:10:29 -0700 Subject: [PATCH 125/710] [lldb][nfc] Remove redundant check in if statement (#135869) We already check this boolean in the `if` statement two lines above. 
--- lldb/source/Target/ThreadPlanStepInRange.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/source/Target/ThreadPlanStepInRange.cpp b/lldb/source/Target/ThreadPlanStepInRange.cpp index 8a2417e9da326..0e93691de68af 100644 --- a/lldb/source/Target/ThreadPlanStepInRange.cpp +++ b/lldb/source/Target/ThreadPlanStepInRange.cpp @@ -370,7 +370,7 @@ bool ThreadPlanStepInRange::DefaultShouldStopHereCallback( if (!should_stop_here) return false; - if (should_stop_here && current_plan->GetKind() == eKindStepInRange && + if (current_plan->GetKind() == eKindStepInRange && operation == eFrameCompareYounger) { ThreadPlanStepInRange *step_in_range_plan = static_cast(current_plan); From d13135134c43af674584780b1494316f6fdaf027 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Wed, 16 Apr 2025 08:12:27 -0700 Subject: [PATCH 126/710] [NFC][Driver][CFI] Rename to clarify purpose of CFI runtime (#135885) CFI runtime (diagnostics, or not) is only needed for cross-dso support. --- clang/include/clang/Driver/SanitizerArgs.h | 4 ++-- clang/lib/Driver/SanitizerArgs.cpp | 12 +++++++----- clang/lib/Driver/ToolChains/CommonArgs.cpp | 6 +++--- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/clang/include/clang/Driver/SanitizerArgs.h b/clang/include/clang/Driver/SanitizerArgs.h index 6a866ded0e75c..528e3b400f3dc 100644 --- a/clang/include/clang/Driver/SanitizerArgs.h +++ b/clang/include/clang/Driver/SanitizerArgs.h @@ -105,8 +105,8 @@ class SanitizerArgs { bool requiresMinimalRuntime() const { return MinimalRuntime; } bool needsDfsanRt() const { return Sanitizers.has(SanitizerKind::DataFlow); } bool needsSafeStackRt() const { return SafeStackRuntime; } - bool needsCfiRt() const; - bool needsCfiDiagRt() const; + bool needsCfiCrossDsoRt() const; + bool needsCfiCrossDsoDiagRt() const; bool needsStatsRt() const { return Stats; } bool needsScudoRt() const { return Sanitizers.has(SanitizerKind::Scudo); } bool needsNsanRt() const { diff --git 
a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp index 1db9da1b60939..ee151d5c68b85 100644 --- a/clang/lib/Driver/SanitizerArgs.cpp +++ b/clang/lib/Driver/SanitizerArgs.cpp @@ -354,8 +354,8 @@ bool SanitizerArgs::needsFuzzerInterceptors() const { bool SanitizerArgs::needsUbsanRt() const { // All of these include ubsan. if (needsAsanRt() || needsMsanRt() || needsNsanRt() || needsHwasanRt() || - needsTsanRt() || needsDfsanRt() || needsLsanRt() || needsCfiDiagRt() || - (needsScudoRt() && !requiresMinimalRuntime())) + needsTsanRt() || needsDfsanRt() || needsLsanRt() || + needsCfiCrossDsoDiagRt() || (needsScudoRt() && !requiresMinimalRuntime())) return false; return (Sanitizers.Mask & NeedsUbsanRt & ~TrapSanitizers.Mask) || @@ -370,11 +370,13 @@ bool SanitizerArgs::needsUbsanCXXRt() const { ~TrapSanitizers.Mask); } -bool SanitizerArgs::needsCfiRt() const { - return !needsCfiDiagRt() && CfiCrossDso && !ImplicitCfiRuntime; +bool SanitizerArgs::needsCfiCrossDsoRt() const { + // Diag runtime includes cross dso runtime. + return !needsCfiCrossDsoDiagRt() && CfiCrossDso && !ImplicitCfiRuntime; } -bool SanitizerArgs::needsCfiDiagRt() const { +bool SanitizerArgs::needsCfiCrossDsoDiagRt() const { + // UBSan handles CFI diagnostics without cross-DSO support.
return (Sanitizers.Mask & SanitizerKind::CFI & ~TrapSanitizers.Mask) && CfiCrossDso && !ImplicitCfiRuntime; } diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index ddeadff8f6dfb..7aab849abe0c1 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -1550,14 +1550,14 @@ collectSanitizerRuntimes(const ToolChain &TC, const ArgList &Args, RequiredSymbols.push_back("__safestack_init"); } if (!(SanArgs.needsSharedRt() && SanArgs.needsUbsanRt())) { - if (SanArgs.needsCfiRt()) + if (SanArgs.needsCfiCrossDsoRt()) StaticRuntimes.push_back("cfi"); - if (SanArgs.needsCfiDiagRt()) + if (SanArgs.needsCfiCrossDsoDiagRt()) StaticRuntimes.push_back("cfi_diag"); } if (SanArgs.linkCXXRuntimes() && !SanArgs.requiresMinimalRuntime() && ((!SanArgs.needsSharedRt() && SanArgs.needsUbsanCXXRt()) || - SanArgs.needsCfiDiagRt())) { + SanArgs.needsCfiCrossDsoDiagRt())) { StaticRuntimes.push_back("ubsan_standalone_cxx"); } if (SanArgs.needsStatsRt()) { From b73e5419f66a2501b4491514a72c7e361c25d57e Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Wed, 16 Apr 2025 08:13:47 -0700 Subject: [PATCH 127/710] [NFC][CFI] Don't mix CFI and non-CFI flags on the same line (#135890) --- clang/test/Driver/sanitizer-ld.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/clang/test/Driver/sanitizer-ld.c b/clang/test/Driver/sanitizer-ld.c index f9c3506e43208..52c1f6bf96242 100644 --- a/clang/test/Driver/sanitizer-ld.c +++ b/clang/test/Driver/sanitizer-ld.c @@ -830,58 +830,63 @@ // CHECK-NSAN-UBSAN: "--whole-archive" "{{[^"]*}}libclang_rt.nsan.a" "--no-whole-archive" // CFI by itself does not link runtime libraries. 
-// RUN: not %clang -fsanitize=cfi -### %s 2>&1 \ +// RUN: not %clang -fsanitize=cfi \ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld -rtlib=platform \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ +// RUN: -### %s 2>&1 \ // RUN: | %{filecheck} --check-prefix=CHECK-CFI-LINUX // CHECK-CFI-LINUX: "{{.*}}ld{{(.exe)?}}" // CFI with diagnostics links the UBSan runtime. // RUN: not %clang -fsanitize=cfi -fno-sanitize-trap=cfi -fsanitize-recover=cfi \ -// RUN: -### %s 2>&1\ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ +// RUN: -### %s 2>&1 \ // RUN: | %{filecheck} --check-prefix=CHECK-CFI-DIAG-LINUX // CHECK-CFI-DIAG-LINUX: "{{.*}}ld{{(.exe)?}}" // CHECK-CFI-DIAG-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.ubsan_standalone.a" "--no-whole-archive" // Cross-DSO CFI links the CFI runtime. -// RUN: not %clang -fsanitize=cfi -fsanitize-cfi-cross-dso -### %s 2>&1 \ +// RUN: not %clang -fsanitize=cfi -fsanitize-cfi-cross-dso \ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ +// RUN: -### %s 2>&1 \ // RUN: | %{filecheck} --check-prefix=CHECK-CFI-CROSS-DSO-LINUX // CHECK-CFI-CROSS-DSO-LINUX: "{{.*}}ld{{(.exe)?}}" // CHECK-CFI-CROSS-DSO-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.cfi.a" "--no-whole-archive" // CHECK-CFI-CROSS-DSO-LINUX: -export-dynamic // Cross-DSO CFI with diagnostics links just the CFI runtime. 
-// RUN: not %clang -fsanitize=cfi -fsanitize-cfi-cross-dso -### %s 2>&1 \ +// RUN: not %clang -fsanitize=cfi -fsanitize-cfi-cross-dso \ // RUN: -fno-sanitize-trap=cfi -fsanitize-recover=cfi \ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ +// RUN: -### %s 2>&1 \ // RUN: | %{filecheck} --check-prefix=CHECK-CFI-CROSS-DSO-DIAG-LINUX // CHECK-CFI-CROSS-DSO-DIAG-LINUX: "{{.*}}ld{{(.exe)?}}" // CHECK-CFI-CROSS-DSO-DIAG-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.cfi_diag.a" "--no-whole-archive" // CHECK-CFI-CROSS-DSO-DIAG-LINUX: -export-dynamic // Cross-DSO CFI on Android does not link runtime libraries. -// RUN: not %clang -fsanitize=cfi -fsanitize-cfi-cross-dso -### %s 2>&1 \ +// RUN: not %clang -fsanitize=cfi -fsanitize-cfi-cross-dso \ // RUN: --target=aarch64-linux-android -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_android_tree \ +// RUN: -### %s 2>&1 \ // RUN: | %{filecheck} --check-prefix=CHECK-CFI-CROSS-DSO-ANDROID // CHECK-CFI-CROSS-DSO-ANDROID: "{{.*}}ld{{(.exe)?}}" // Cross-DSO CFI with diagnostics on Android links just the UBSAN runtime. 
-// RUN: not %clang -fsanitize=cfi -fsanitize-cfi-cross-dso -### %s 2>&1 \ +// RUN: not %clang -fsanitize=cfi -fsanitize-cfi-cross-dso \ // RUN: -fno-sanitize-trap=cfi -fsanitize-recover=cfi \ // RUN: --target=aarch64-linux-android -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_android_tree \ +// RUN: -### %s 2>&1 \ // RUN: | %{filecheck} --check-prefix=CHECK-CFI-CROSS-DSO-DIAG-ANDROID // CHECK-CFI-CROSS-DSO-DIAG-ANDROID: "{{.*}}ld{{(.exe)?}}" // CHECK-CFI-CROSS-DSO-DIAG-ANDROID: "{{[^"]*}}libclang_rt.ubsan_standalone.so" From 8c04656c457e28680c60e8edc15a4b170b684ca2 Mon Sep 17 00:00:00 2001 From: Benjamin Chetioui <3920784+bchetioui@users.noreply.github.com> Date: Wed, 16 Apr 2025 17:19:47 +0200 Subject: [PATCH 128/710] [bazel] Fix bazel build after 2b983a24583dd4e131d727717872a56712b5dd52. (#135976) --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 85049ff8339c1..f7b379c2cd6d8 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -1562,18 +1562,23 @@ cc_library( ":AMDGPUDialect", ":AMDGPUPassIncGen", ":AMDGPUUtils", + ":AffineDialect", ":ArithDialect", + ":ArithUtils", ":ControlFlowDialect", ":FuncDialect", ":GPUDialect", ":IR", ":MemRefDialect", + ":MemRefUtils", ":Pass", ":SCFDialect", ":SideEffectInterfaces", ":Support", ":TransformUtils", ":VectorDialect", + ":VectorTransforms", + "//llvm:Support", ], ) From 30259076fec0af97e604ca943b61fb686b9b21ef Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Wed, 16 Apr 2025 11:26:53 -0400 Subject: [PATCH 129/710] [AMDGPU][True16][MC] fix opsel for v_cmpx 16bit inst (#135441) Fixed inst printer so that no opsel is generated for dst reg of cmpx 16bit insts --- .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 9 +- 
.../MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s | 118 ++++++++--------- .../MC/AMDGPU/gfx11_asm_vopcx_t16_promote.s | 120 +++++++++--------- llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s | 110 ++++++++-------- .../MC/AMDGPU/gfx12_asm_vopcx_t16_promote.s | 108 ++++++++-------- .../AMDGPU/gfx11_dasm_vop3_from_vopcx.txt | 118 ++++++++--------- .../Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt | 110 ++++++++-------- .../AMDGPU/gfx12_dasm_vop3cx_dpp16.txt | 110 ++++++++-------- 8 files changed, 403 insertions(+), 400 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index caff8be3d7348..677df64555623 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -1220,6 +1220,10 @@ void AMDGPUInstPrinter::printPackedModifier(const MCInst *MI, (ModIdx != -1) ? MI->getOperand(ModIdx).getImm() : DefaultValue; } + const bool HasDst = + (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst) != -1) || + (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst) != -1); + // Print three values of neg/opsel for wmma instructions (prints 0 when there // is no src_modifier operand instead of not printing anything). 
if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::IsSWMMAC || @@ -1238,9 +1242,8 @@ void AMDGPUInstPrinter::printPackedModifier(const MCInst *MI, } const bool HasDstSel = - NumOps > 0 && - Mod == SISrcMods::OP_SEL_0 && - MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::VOP3_OPSEL; + HasDst && NumOps > 0 && Mod == SISrcMods::OP_SEL_0 && + MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::VOP3_OPSEL; const bool IsPacked = MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::IsPacked; diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s index b7345ee6208ed..1e517754b0ecd 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s @@ -51,13 +51,13 @@ v_cmpx_class_f16_e64 v1.l, 0.5 // GFX11: v_cmpx_class_f16_e64 v1.l, 0.5 ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0xe1,0x01,0x00] v_cmpx_class_f16_e64 v1.h, v2.h -// GFX11: v_cmpx_class_f16_e64 v1.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xfd,0xd4,0x01,0x05,0x02,0x00] +// GFX11: v_cmpx_class_f16_e64 v1.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xfd,0xd4,0x01,0x05,0x02,0x00] v_cmpx_class_f16_e64 v255.h, v2.l -// GFX11: v_cmpx_class_f16_e64 v255.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xfd,0xd4,0xff,0x05,0x02,0x00] +// GFX11: v_cmpx_class_f16_e64 v255.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xfd,0xd4,0xff,0x05,0x02,0x00] v_cmpx_class_f16_e64 s105, v255.h -// GFX11: v_cmpx_class_f16_e64 s105, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xfd,0xd4,0x69,0xfe,0x03,0x00] +// GFX11: v_cmpx_class_f16_e64 s105, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xfd,0xd4,0x69,0xfe,0x03,0x00] v_cmpx_class_f32_e64 v1, v2 // GFX11: v_cmpx_class_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0xfe,0xd4,0x01,0x05,0x02,0x00] @@ -195,10 +195,10 @@ v_cmpx_eq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_eq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x82,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] v_cmpx_eq_f16_e64 
v1.h, v2.l -// GFX11: v_cmpx_eq_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x82,0xd4,0x01,0x05,0x02,0x00] +// GFX11: v_cmpx_eq_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x82,0xd4,0x01,0x05,0x02,0x00] v_cmpx_eq_f16_e64 v255.l, v255.h -// GFX11: v_cmpx_eq_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x82,0xd4,0xff,0xff,0x03,0x00] +// GFX11: v_cmpx_eq_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x82,0xd4,0xff,0xff,0x03,0x00] v_cmpx_eq_f32_e64 v1, v2 // GFX11: v_cmpx_eq_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x92,0xd4,0x01,0x05,0x02,0x00] @@ -327,10 +327,10 @@ v_cmpx_eq_i16_e64 0xfe0b, vcc_hi // GFX11: v_cmpx_eq_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb2,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] v_cmpx_eq_i16_e64 v1.h, v2.l -// GFX11: v_cmpx_eq_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb2,0xd4,0x01,0x05,0x02,0x00] +// GFX11: v_cmpx_eq_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb2,0xd4,0x01,0x05,0x02,0x00] v_cmpx_eq_i16_e64 v255.l, v255.h -// GFX11: v_cmpx_eq_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb2,0xd4,0xff,0xff,0x03,0x00] +// GFX11: v_cmpx_eq_i16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xb2,0xd4,0xff,0xff,0x03,0x00] v_cmpx_eq_i32_e64 v1, v2 // GFX11: v_cmpx_eq_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc2,0xd4,0x01,0x05,0x02,0x00] @@ -459,10 +459,10 @@ v_cmpx_eq_u16_e64 0xfe0b, vcc_hi // GFX11: v_cmpx_eq_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] v_cmpx_eq_u16_e64 v1.h, v2.l -// GFX11: v_cmpx_eq_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xba,0xd4,0x01,0x05,0x02,0x00] +// GFX11: v_cmpx_eq_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xba,0xd4,0x01,0x05,0x02,0x00] v_cmpx_eq_u16_e64 v255.l, v255.h -// GFX11: v_cmpx_eq_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xba,0xd4,0xff,0xff,0x03,0x00] +// GFX11: v_cmpx_eq_u16_e64 v255.l, v255.h 
op_sel:[0,1] ; encoding: [0x7e,0x10,0xba,0xd4,0xff,0xff,0x03,0x00] v_cmpx_eq_u32_e64 v1, v2 // GFX11: v_cmpx_eq_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xca,0xd4,0x01,0x05,0x02,0x00] @@ -591,10 +591,10 @@ v_cmpx_f_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_f_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x80,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] v_cmpx_f_f16_e64 v1.h, v2.l -// GFX11: v_cmpx_f_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x80,0xd4,0x01,0x05,0x02,0x00] +// GFX11: v_cmpx_f_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x80,0xd4,0x01,0x05,0x02,0x00] v_cmpx_f_f16_e64 v255.l, v255.h -// GFX11: v_cmpx_f_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x80,0xd4,0xff,0xff,0x03,0x00] +// GFX11: v_cmpx_f_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x80,0xd4,0xff,0xff,0x03,0x00] v_cmpx_f_f32_e64 v1, v2 // GFX11: v_cmpx_f_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x90,0xd4,0x01,0x05,0x02,0x00] @@ -885,10 +885,10 @@ v_cmpx_ge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_ge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x86,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] v_cmpx_ge_f16_e64 v1.h, v2.l -// GFX11: v_cmpx_ge_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x86,0xd4,0x01,0x05,0x02,0x00] +// GFX11: v_cmpx_ge_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x86,0xd4,0x01,0x05,0x02,0x00] v_cmpx_ge_f16_e64 v255.l, v255.h -// GFX11: v_cmpx_ge_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x86,0xd4,0xff,0xff,0x03,0x00] +// GFX11: v_cmpx_ge_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x86,0xd4,0xff,0xff,0x03,0x00] v_cmpx_ge_f32_e64 v1, v2 // GFX11: v_cmpx_ge_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x96,0xd4,0x01,0x05,0x02,0x00] @@ -1017,10 +1017,10 @@ v_cmpx_ge_i16_e64 0xfe0b, vcc_hi // GFX11: v_cmpx_ge_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] v_cmpx_ge_i16_e64 v1.h, v2.l 
-// GFX11: v_cmpx_ge_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb6,0xd4,0x01,0x05,0x02,0x00] +// GFX11: v_cmpx_ge_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb6,0xd4,0x01,0x05,0x02,0x00] v_cmpx_ge_i16_e64 v255.l, v255.h -// GFX11: v_cmpx_ge_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb6,0xd4,0xff,0xff,0x03,0x00] +// GFX11: v_cmpx_ge_i16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xb6,0xd4,0xff,0xff,0x03,0x00] v_cmpx_ge_i32_e64 v1, v2 // GFX11: v_cmpx_ge_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc6,0xd4,0x01,0x05,0x02,0x00] @@ -1149,10 +1149,10 @@ v_cmpx_ge_u16_e64 0xfe0b, vcc_hi // GFX11: v_cmpx_ge_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] v_cmpx_ge_u16_e64 v1.h, v2.l -// GFX11: v_cmpx_ge_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xbe,0xd4,0x01,0x05,0x02,0x00] +// GFX11: v_cmpx_ge_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xbe,0xd4,0x01,0x05,0x02,0x00] v_cmpx_ge_u16_e64 v255.l, v255.h -// GFX11: v_cmpx_ge_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xbe,0xd4,0xff,0xff,0x03,0x00] +// GFX11: v_cmpx_ge_u16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xbe,0xd4,0xff,0xff,0x03,0x00] v_cmpx_ge_u32_e64 v1, v2 // GFX11: v_cmpx_ge_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xce,0xd4,0x01,0x05,0x02,0x00] @@ -1281,10 +1281,10 @@ v_cmpx_gt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_gt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x84,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] v_cmpx_gt_f16_e64 v1.h, v2.l -// GFX11: v_cmpx_gt_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x84,0xd4,0x01,0x05,0x02,0x00] +// GFX11: v_cmpx_gt_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x84,0xd4,0x01,0x05,0x02,0x00] v_cmpx_gt_f16_e64 v255.l, v255.h -// GFX11: v_cmpx_gt_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x84,0xd4,0xff,0xff,0x03,0x00] +// GFX11: v_cmpx_gt_f16_e64 
v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x84,0xd4,0xff,0xff,0x03,0x00] v_cmpx_gt_f32_e64 v1, v2 // GFX11: v_cmpx_gt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x94,0xd4,0x01,0x05,0x02,0x00] @@ -1413,10 +1413,10 @@ v_cmpx_gt_i16_e64 0xfe0b, vcc_hi // GFX11: v_cmpx_gt_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb4,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] v_cmpx_gt_i16_e64 v1.h, v2.l -// GFX11: v_cmpx_gt_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb4,0xd4,0x01,0x05,0x02,0x00] +// GFX11: v_cmpx_gt_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb4,0xd4,0x01,0x05,0x02,0x00] v_cmpx_gt_i16_e64 v255.l, v255.h -// GFX11: v_cmpx_gt_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb4,0xd4,0xff,0xff,0x03,0x00] +// GFX11: v_cmpx_gt_i16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xb4,0xd4,0xff,0xff,0x03,0x00] v_cmpx_gt_i32_e64 v1, v2 // GFX11: v_cmpx_gt_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc4,0xd4,0x01,0x05,0x02,0x00] @@ -1545,10 +1545,10 @@ v_cmpx_gt_u16_e64 0xfe0b, vcc_hi // GFX11: v_cmpx_gt_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbc,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] v_cmpx_gt_u16_e64 v1.h, v2.l -// GFX11: v_cmpx_gt_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xbc,0xd4,0x01,0x05,0x02,0x00] +// GFX11: v_cmpx_gt_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xbc,0xd4,0x01,0x05,0x02,0x00] v_cmpx_gt_u16_e64 v255.l, v255.h -// GFX11: v_cmpx_gt_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xbc,0xd4,0xff,0xff,0x03,0x00] +// GFX11: v_cmpx_gt_u16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xbc,0xd4,0xff,0xff,0x03,0x00] v_cmpx_gt_u32_e64 v1, v2 // GFX11: v_cmpx_gt_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xcc,0xd4,0x01,0x05,0x02,0x00] @@ -1677,10 +1677,10 @@ v_cmpx_le_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_le_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x83,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] v_cmpx_le_f16_e64 v1.h, 
v2.l -// GFX11: v_cmpx_le_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x83,0xd4,0x01,0x05,0x02,0x00] +// GFX11: v_cmpx_le_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x83,0xd4,0x01,0x05,0x02,0x00] v_cmpx_le_f16_e64 v255.l, v255.h -// GFX11: v_cmpx_le_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x83,0xd4,0xff,0xff,0x03,0x00] +// GFX11: v_cmpx_le_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x83,0xd4,0xff,0xff,0x03,0x00] v_cmpx_le_f32_e64 v1, v2 // GFX11: v_cmpx_le_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x93,0xd4,0x01,0x05,0x02,0x00] @@ -1809,10 +1809,10 @@ v_cmpx_le_i16_e64 0xfe0b, vcc_hi // GFX11: v_cmpx_le_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb3,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] v_cmpx_le_i16_e64 v1.h, v2.l -// GFX11: v_cmpx_le_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb3,0xd4,0x01,0x05,0x02,0x00] +// GFX11: v_cmpx_le_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb3,0xd4,0x01,0x05,0x02,0x00] v_cmpx_le_i16_e64 v255.l, v255.h -// GFX11: v_cmpx_le_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb3,0xd4,0xff,0xff,0x03,0x00] +// GFX11: v_cmpx_le_i16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xb3,0xd4,0xff,0xff,0x03,0x00] v_cmpx_le_i32_e64 v1, v2 // GFX11: v_cmpx_le_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc3,0xd4,0x01,0x05,0x02,0x00] @@ -1941,10 +1941,10 @@ v_cmpx_le_u16_e64 0xfe0b, vcc_hi // GFX11: v_cmpx_le_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] v_cmpx_le_u16_e64 v1.h, v2.l -// GFX11: v_cmpx_le_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xbb,0xd4,0x01,0x05,0x02,0x00] +// GFX11: v_cmpx_le_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xbb,0xd4,0x01,0x05,0x02,0x00] v_cmpx_le_u16_e64 v255.l, v255.h -// GFX11: v_cmpx_le_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xbb,0xd4,0xff,0xff,0x03,0x00] +// GFX11: v_cmpx_le_u16_e64 v255.l, v255.h 
op_sel:[0,1] ; encoding: [0x7e,0x10,0xbb,0xd4,0xff,0xff,0x03,0x00] v_cmpx_le_u32_e64 v1, v2 // GFX11: v_cmpx_le_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xcb,0xd4,0x01,0x05,0x02,0x00] @@ -2073,10 +2073,10 @@ v_cmpx_lg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_lg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x85,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] v_cmpx_lg_f16_e64 v1.h, v2.l -// GFX11: v_cmpx_lg_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x85,0xd4,0x01,0x05,0x02,0x00] +// GFX11: v_cmpx_lg_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x85,0xd4,0x01,0x05,0x02,0x00] v_cmpx_lg_f16_e64 v255.l, v255.h -// GFX11: v_cmpx_lg_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x85,0xd4,0xff,0xff,0x03,0x00] +// GFX11: v_cmpx_lg_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x85,0xd4,0xff,0xff,0x03,0x00] v_cmpx_lg_f32_e64 v1, v2 // GFX11: v_cmpx_lg_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x95,0xd4,0x01,0x05,0x02,0x00] @@ -2205,10 +2205,10 @@ v_cmpx_lt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_lt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x81,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] v_cmpx_lt_f16_e64 v1.h, v2.l -// GFX11: v_cmpx_lt_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x81,0xd4,0x01,0x05,0x02,0x00] +// GFX11: v_cmpx_lt_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x81,0xd4,0x01,0x05,0x02,0x00] v_cmpx_lt_f16_e64 v255.l, v255.h -// GFX11: v_cmpx_lt_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x81,0xd4,0xff,0xff,0x03,0x00] +// GFX11: v_cmpx_lt_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x81,0xd4,0xff,0xff,0x03,0x00] v_cmpx_lt_f32_e64 v1, v2 // GFX11: v_cmpx_lt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x91,0xd4,0x01,0x05,0x02,0x00] @@ -2337,10 +2337,10 @@ v_cmpx_lt_i16_e64 0xfe0b, vcc_hi // GFX11: v_cmpx_lt_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 
v_cmpx_lt_i16_e64 v1.h, v2.l -// GFX11: v_cmpx_lt_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb1,0xd4,0x01,0x05,0x02,0x00] +// GFX11: v_cmpx_lt_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb1,0xd4,0x01,0x05,0x02,0x00] v_cmpx_lt_i16_e64 v255.l, v255.h -// GFX11: v_cmpx_lt_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb1,0xd4,0xff,0xff,0x03,0x00] +// GFX11: v_cmpx_lt_i16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xb1,0xd4,0xff,0xff,0x03,0x00] v_cmpx_lt_i32_e64 v1, v2 // GFX11: v_cmpx_lt_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc1,0xd4,0x01,0x05,0x02,0x00] @@ -2469,10 +2469,10 @@ v_cmpx_lt_u16_e64 0xfe0b, vcc_hi // GFX11: v_cmpx_lt_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb9,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] v_cmpx_lt_u16_e64 v1.h, v2.l -// GFX11: v_cmpx_lt_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb9,0xd4,0x01,0x05,0x02,0x00] +// GFX11: v_cmpx_lt_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb9,0xd4,0x01,0x05,0x02,0x00] v_cmpx_lt_u16_e64 v255.l, v255.h -// GFX11: v_cmpx_lt_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb9,0xd4,0xff,0xff,0x03,0x00] +// GFX11: v_cmpx_lt_u16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xb9,0xd4,0xff,0xff,0x03,0x00] v_cmpx_lt_u32_e64 v1, v2 // GFX11: v_cmpx_lt_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc9,0xd4,0x01,0x05,0x02,0x00] @@ -2601,10 +2601,10 @@ v_cmpx_ne_i16_e64 0xfe0b, vcc_hi // GFX11: v_cmpx_ne_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb5,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] v_cmpx_ne_i16_e64 v1.h, v2.l -// GFX11: v_cmpx_ne_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb5,0xd4,0x01,0x05,0x02,0x00] +// GFX11: v_cmpx_ne_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb5,0xd4,0x01,0x05,0x02,0x00] v_cmpx_ne_i16_e64 v255.l, v255.h -// GFX11: v_cmpx_ne_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb5,0xd4,0xff,0xff,0x03,0x00] +// GFX11: v_cmpx_ne_i16_e64 
v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xb5,0xd4,0xff,0xff,0x03,0x00] v_cmpx_ne_i32_e64 v1, v2 // GFX11: v_cmpx_ne_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc5,0xd4,0x01,0x05,0x02,0x00] @@ -2733,10 +2733,10 @@ v_cmpx_ne_u16_e64 0xfe0b, vcc_hi // GFX11: v_cmpx_ne_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbd,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] v_cmpx_ne_u16_e64 v1.h, v2.l -// GFX11: v_cmpx_ne_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xbd,0xd4,0x01,0x05,0x02,0x00] +// GFX11: v_cmpx_ne_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xbd,0xd4,0x01,0x05,0x02,0x00] v_cmpx_ne_u16_e64 v255.l, v255.h -// GFX11: v_cmpx_ne_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xbd,0xd4,0xff,0xff,0x03,0x00] +// GFX11: v_cmpx_ne_u16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xbd,0xd4,0xff,0xff,0x03,0x00] v_cmpx_ne_u32_e64 v1, v2 // GFX11: v_cmpx_ne_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xcd,0xd4,0x01,0x05,0x02,0x00] @@ -2865,10 +2865,10 @@ v_cmpx_neq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_neq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8d,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] v_cmpx_neq_f16_e64 v1.h, v2.l -// GFX11: v_cmpx_neq_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8d,0xd4,0x01,0x05,0x02,0x00] +// GFX11: v_cmpx_neq_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x8d,0xd4,0x01,0x05,0x02,0x00] v_cmpx_neq_f16_e64 v255.l, v255.h -// GFX11: v_cmpx_neq_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8d,0xd4,0xff,0xff,0x03,0x00] +// GFX11: v_cmpx_neq_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x8d,0xd4,0xff,0xff,0x03,0x00] v_cmpx_neq_f32_e64 v1, v2 // GFX11: v_cmpx_neq_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9d,0xd4,0x01,0x05,0x02,0x00] @@ -2997,10 +2997,10 @@ v_cmpx_nge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_nge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: 
[0x7e,0x83,0x89,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] v_cmpx_nge_f16_e64 v1.h, v2.l -// GFX11: v_cmpx_nge_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x89,0xd4,0x01,0x05,0x02,0x00] +// GFX11: v_cmpx_nge_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x89,0xd4,0x01,0x05,0x02,0x00] v_cmpx_nge_f16_e64 v255.l, v255.h -// GFX11: v_cmpx_nge_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x89,0xd4,0xff,0xff,0x03,0x00] +// GFX11: v_cmpx_nge_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x89,0xd4,0xff,0xff,0x03,0x00] v_cmpx_nge_f32_e64 v1, v2 // GFX11: v_cmpx_nge_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x99,0xd4,0x01,0x05,0x02,0x00] @@ -3129,10 +3129,10 @@ v_cmpx_ngt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_ngt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8b,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] v_cmpx_ngt_f16_e64 v1.h, v2.l -// GFX11: v_cmpx_ngt_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8b,0xd4,0x01,0x05,0x02,0x00] +// GFX11: v_cmpx_ngt_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x8b,0xd4,0x01,0x05,0x02,0x00] v_cmpx_ngt_f16_e64 v255.l, v255.h -// GFX11: v_cmpx_ngt_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8b,0xd4,0xff,0xff,0x03,0x00] +// GFX11: v_cmpx_ngt_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x8b,0xd4,0xff,0xff,0x03,0x00] v_cmpx_ngt_f32_e64 v1, v2 // GFX11: v_cmpx_ngt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9b,0xd4,0x01,0x05,0x02,0x00] @@ -3261,10 +3261,10 @@ v_cmpx_nle_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_nle_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8c,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] v_cmpx_nle_f16_e64 v1.h, v2.l -// GFX11: v_cmpx_nle_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8c,0xd4,0x01,0x05,0x02,0x00] +// GFX11: v_cmpx_nle_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x8c,0xd4,0x01,0x05,0x02,0x00] v_cmpx_nle_f16_e64 v255.l, v255.h -// 
GFX11: v_cmpx_nle_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8c,0xd4,0xff,0xff,0x03,0x00] +// GFX11: v_cmpx_nle_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x8c,0xd4,0xff,0xff,0x03,0x00] v_cmpx_nle_f32_e64 v1, v2 // GFX11: v_cmpx_nle_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9c,0xd4,0x01,0x05,0x02,0x00] @@ -3393,10 +3393,10 @@ v_cmpx_nlg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_nlg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8a,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] v_cmpx_nlg_f16_e64 v1.h, v2.l -// GFX11: v_cmpx_nlg_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8a,0xd4,0x01,0x05,0x02,0x00] +// GFX11: v_cmpx_nlg_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x8a,0xd4,0x01,0x05,0x02,0x00] v_cmpx_nlg_f16_e64 v255.l, v255.h -// GFX11: v_cmpx_nlg_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8a,0xd4,0xff,0xff,0x03,0x00] +// GFX11: v_cmpx_nlg_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x8a,0xd4,0xff,0xff,0x03,0x00] v_cmpx_nlg_f32_e64 v1, v2 // GFX11: v_cmpx_nlg_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9a,0xd4,0x01,0x05,0x02,0x00] @@ -3525,10 +3525,10 @@ v_cmpx_nlt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_nlt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8e,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] v_cmpx_nlt_f16_e64 v1.h, v2.l -// GFX11: v_cmpx_nlt_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8e,0xd4,0x01,0x05,0x02,0x00] +// GFX11: v_cmpx_nlt_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x8e,0xd4,0x01,0x05,0x02,0x00] v_cmpx_nlt_f16_e64 v255.l, v255.h -// GFX11: v_cmpx_nlt_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8e,0xd4,0xff,0xff,0x03,0x00] +// GFX11: v_cmpx_nlt_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x8e,0xd4,0xff,0xff,0x03,0x00] v_cmpx_nlt_f32_e64 v1, v2 // GFX11: v_cmpx_nlt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9e,0xd4,0x01,0x05,0x02,0x00] @@ -3657,10 
+3657,10 @@ v_cmpx_o_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_o_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x87,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] v_cmpx_o_f16_e64 v1.h, v2.l -// GFX11: v_cmpx_o_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x87,0xd4,0x01,0x05,0x02,0x00] +// GFX11: v_cmpx_o_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x87,0xd4,0x01,0x05,0x02,0x00] v_cmpx_o_f16_e64 v255.l, v255.h -// GFX11: v_cmpx_o_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x87,0xd4,0xff,0xff,0x03,0x00] +// GFX11: v_cmpx_o_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x87,0xd4,0xff,0xff,0x03,0x00] v_cmpx_o_f32_e64 v1, v2 // GFX11: v_cmpx_o_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x97,0xd4,0x01,0x05,0x02,0x00] @@ -4077,10 +4077,10 @@ v_cmpx_tru_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_t_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8f,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] v_cmpx_t_f16_e64 v1.h, v2.l -// GFX11: v_cmpx_t_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8f,0xd4,0x01,0x05,0x02,0x00] +// GFX11: v_cmpx_t_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x8f,0xd4,0x01,0x05,0x02,0x00] v_cmpx_t_f16_e64 v255.l, v255.h -// GFX11: v_cmpx_t_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8f,0xd4,0xff,0xff,0x03,0x00] +// GFX11: v_cmpx_t_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x8f,0xd4,0xff,0xff,0x03,0x00] v_cmpx_tru_f32_e64 v1, v2 // GFX11: v_cmpx_t_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9f,0xd4,0x01,0x05,0x02,0x00] @@ -4209,10 +4209,10 @@ v_cmpx_u_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_u_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x88,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] v_cmpx_u_f16_e64 v1.h, v2.l -// GFX11: v_cmpx_u_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x88,0xd4,0x01,0x05,0x02,0x00] +// GFX11: v_cmpx_u_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: 
[0x7e,0x08,0x88,0xd4,0x01,0x05,0x02,0x00] v_cmpx_u_f16_e64 v255.l, v255.h -// GFX11: v_cmpx_u_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x88,0xd4,0xff,0xff,0x03,0x00] +// GFX11: v_cmpx_u_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x88,0xd4,0xff,0xff,0x03,0x00] v_cmpx_u_f32_e64 v1, v2 // GFX11: v_cmpx_u_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x98,0xd4,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_promote.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_promote.s index 20864f291f71c..50feb32387eda 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_promote.s @@ -2,7 +2,7 @@ // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX11 %s v_cmpx_class_f16 v1.h, v255.h -// GFX11: v_cmpx_class_f16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xfd,0xd4,0x01,0xff,0x03,0x00] +// GFX11: v_cmpx_class_f16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xfd,0xd4,0x01,0xff,0x03,0x00] v_cmpx_class_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_class_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xfd,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -20,7 +20,7 @@ v_cmpx_class_f16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_class_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_class_f16 v255.h, v2.h -// GFX11: v_cmpx_class_f16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xfd,0xd4,0xff,0x05,0x02,0x00] +// GFX11: v_cmpx_class_f16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xfd,0xd4,0xff,0x05,0x02,0x00] v_cmpx_class_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_class_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x18,0xfd,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -38,7 +38,7 @@ v_cmpx_class_f16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_class_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_eq_f16 v1.h, v255.h -// GFX11: v_cmpx_eq_f16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x82,0xd4,0x01,0xff,0x03,0x00] +// GFX11: v_cmpx_eq_f16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x82,0xd4,0x01,0xff,0x03,0x00] v_cmpx_eq_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_eq_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x82,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -56,7 +56,7 @@ v_cmpx_eq_f16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_eq_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_eq_f16 v255.h, v2.h -// GFX11: v_cmpx_eq_f16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x82,0xd4,0xff,0x05,0x02,0x00] +// GFX11: v_cmpx_eq_f16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x82,0xd4,0xff,0x05,0x02,0x00] v_cmpx_eq_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_eq_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x82,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -74,7 +74,7 @@ v_cmpx_eq_f16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_eq_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_eq_i16 v1.h, v255.h -// GFX11: v_cmpx_eq_i16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xb2,0xd4,0x01,0xff,0x03,0x00] +// GFX11: v_cmpx_eq_i16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xb2,0xd4,0x01,0xff,0x03,0x00] v_cmpx_eq_i16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_eq_i16_e64_dpp v1.h, 
v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb2,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -92,7 +92,7 @@ v_cmpx_eq_i16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_eq_i16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_eq_i16 v255.h, v2.h -// GFX11: v_cmpx_eq_i16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xb2,0xd4,0xff,0x05,0x02,0x00] +// GFX11: v_cmpx_eq_i16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xb2,0xd4,0xff,0x05,0x02,0x00] v_cmpx_eq_i16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_eq_i16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb2,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -110,7 +110,7 @@ v_cmpx_eq_i16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_eq_i16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_eq_u16 v1.h, v255.h -// GFX11: v_cmpx_eq_u16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xba,0xd4,0x01,0xff,0x03,0x00] +// GFX11: v_cmpx_eq_u16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xba,0xd4,0x01,0xff,0x03,0x00] v_cmpx_eq_u16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_eq_u16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xba,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -128,7 +128,7 @@ v_cmpx_eq_u16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_eq_u16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_eq_u16 v255.h, v2.h -// GFX11: v_cmpx_eq_u16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xba,0xd4,0xff,0x05,0x02,0x00] +// GFX11: v_cmpx_eq_u16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xba,0xd4,0xff,0x05,0x02,0x00] v_cmpx_eq_u16 v255.h, v2.h 
dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_eq_u16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xba,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -146,7 +146,7 @@ v_cmpx_eq_u16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_eq_u16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_f_f16 v1.h, v255.h -// GFX11: v_cmpx_f_f16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x80,0xd4,0x01,0xff,0x03,0x00] +// GFX11: v_cmpx_f_f16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x80,0xd4,0x01,0xff,0x03,0x00] v_cmpx_f_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_f_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x80,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -164,7 +164,7 @@ v_cmpx_f_f16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_f_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_f_f16 v255.h, v2.h -// GFX11: v_cmpx_f_f16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x80,0xd4,0xff,0x05,0x02,0x00] +// GFX11: v_cmpx_f_f16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x80,0xd4,0xff,0x05,0x02,0x00] v_cmpx_f_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_f_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x80,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -182,7 +182,7 @@ v_cmpx_f_f16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_f_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_ge_f16 v1.h, v255.h -// GFX11: v_cmpx_ge_f16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x86,0xd4,0x01,0xff,0x03,0x00] +// GFX11: v_cmpx_ge_f16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: 
[0x7e,0x18,0x86,0xd4,0x01,0xff,0x03,0x00] v_cmpx_ge_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_ge_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x86,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -200,7 +200,7 @@ v_cmpx_ge_f16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_ge_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_ge_f16 v255.h, v2.h -// GFX11: v_cmpx_ge_f16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x86,0xd4,0xff,0x05,0x02,0x00] +// GFX11: v_cmpx_ge_f16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x86,0xd4,0xff,0x05,0x02,0x00] v_cmpx_ge_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_ge_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x86,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -218,7 +218,7 @@ v_cmpx_ge_f16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_ge_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_ge_i16 v1.h, v255.h -// GFX11: v_cmpx_ge_i16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xb6,0xd4,0x01,0xff,0x03,0x00] +// GFX11: v_cmpx_ge_i16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xb6,0xd4,0x01,0xff,0x03,0x00] v_cmpx_ge_i16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_ge_i16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb6,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -236,7 +236,7 @@ v_cmpx_ge_i16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_ge_i16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_ge_i16 v255.h, v2.h -// GFX11: v_cmpx_ge_i16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xb6,0xd4,0xff,0x05,0x02,0x00] +// GFX11: 
v_cmpx_ge_i16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xb6,0xd4,0xff,0x05,0x02,0x00] v_cmpx_ge_i16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_ge_i16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb6,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -254,7 +254,7 @@ v_cmpx_ge_i16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_ge_i16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_ge_u16 v1.h, v255.h -// GFX11: v_cmpx_ge_u16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xbe,0xd4,0x01,0xff,0x03,0x00] +// GFX11: v_cmpx_ge_u16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xbe,0xd4,0x01,0xff,0x03,0x00] v_cmpx_ge_u16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_ge_u16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbe,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -272,7 +272,7 @@ v_cmpx_ge_u16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_ge_u16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_ge_u16 v255.h, v2.h -// GFX11: v_cmpx_ge_u16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xbe,0xd4,0xff,0x05,0x02,0x00] +// GFX11: v_cmpx_ge_u16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xbe,0xd4,0xff,0x05,0x02,0x00] v_cmpx_ge_u16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_ge_u16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbe,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -290,7 +290,7 @@ v_cmpx_ge_u16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_ge_u16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_gt_f16 v1.h, v255.h -// GFX11: v_cmpx_gt_f16_e64 v1.h, v255.h op_sel:[1,1,0] ; 
encoding: [0x7e,0x18,0x84,0xd4,0x01,0xff,0x03,0x00] +// GFX11: v_cmpx_gt_f16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x84,0xd4,0x01,0xff,0x03,0x00] v_cmpx_gt_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_gt_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x84,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -308,7 +308,7 @@ v_cmpx_gt_f16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_gt_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_gt_f16 v255.h, v2.h -// GFX11: v_cmpx_gt_f16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x84,0xd4,0xff,0x05,0x02,0x00] +// GFX11: v_cmpx_gt_f16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x84,0xd4,0xff,0x05,0x02,0x00] v_cmpx_gt_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_gt_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x84,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -326,7 +326,7 @@ v_cmpx_gt_f16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_gt_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_gt_i16 v1.h, v255.h -// GFX11: v_cmpx_gt_i16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xb4,0xd4,0x01,0xff,0x03,0x00] +// GFX11: v_cmpx_gt_i16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xb4,0xd4,0x01,0xff,0x03,0x00] v_cmpx_gt_i16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_gt_i16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb4,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -344,7 +344,7 @@ v_cmpx_gt_i16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_gt_i16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_gt_i16 v255.h, v2.h -// 
GFX11: v_cmpx_gt_i16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xb4,0xd4,0xff,0x05,0x02,0x00] +// GFX11: v_cmpx_gt_i16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xb4,0xd4,0xff,0x05,0x02,0x00] v_cmpx_gt_i16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_gt_i16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb4,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -362,7 +362,7 @@ v_cmpx_gt_i16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_gt_i16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_gt_u16 v1.h, v255.h -// GFX11: v_cmpx_gt_u16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xbc,0xd4,0x01,0xff,0x03,0x00] +// GFX11: v_cmpx_gt_u16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xbc,0xd4,0x01,0xff,0x03,0x00] v_cmpx_gt_u16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_gt_u16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbc,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -380,7 +380,7 @@ v_cmpx_gt_u16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_gt_u16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_gt_u16 v255.h, v2.h -// GFX11: v_cmpx_gt_u16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xbc,0xd4,0xff,0x05,0x02,0x00] +// GFX11: v_cmpx_gt_u16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xbc,0xd4,0xff,0x05,0x02,0x00] v_cmpx_gt_u16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_gt_u16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbc,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -398,7 +398,7 @@ v_cmpx_gt_u16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_gt_u16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_le_f16 v1.h, v255.h -// GFX11: v_cmpx_le_f16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x83,0xd4,0x01,0xff,0x03,0x00] +// GFX11: v_cmpx_le_f16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x83,0xd4,0x01,0xff,0x03,0x00] v_cmpx_le_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_le_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x83,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -416,7 +416,7 @@ v_cmpx_le_f16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_le_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_le_f16 v255.h, v2.h -// GFX11: v_cmpx_le_f16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x83,0xd4,0xff,0x05,0x02,0x00] +// GFX11: v_cmpx_le_f16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x83,0xd4,0xff,0x05,0x02,0x00] v_cmpx_le_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_le_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x83,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -434,7 +434,7 @@ v_cmpx_le_f16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_le_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_le_i16 v1.h, v255.h -// GFX11: v_cmpx_le_i16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xb3,0xd4,0x01,0xff,0x03,0x00] +// GFX11: v_cmpx_le_i16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xb3,0xd4,0x01,0xff,0x03,0x00] v_cmpx_le_i16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_le_i16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb3,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -452,7 +452,7 @@ v_cmpx_le_i16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_le_i16_e64_dpp v1.l, v255.l 
quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_le_i16 v255.h, v2.h -// GFX11: v_cmpx_le_i16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xb3,0xd4,0xff,0x05,0x02,0x00] +// GFX11: v_cmpx_le_i16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xb3,0xd4,0xff,0x05,0x02,0x00] v_cmpx_le_i16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_le_i16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb3,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -470,7 +470,7 @@ v_cmpx_le_i16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_le_i16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_le_u16 v1.h, v255.h -// GFX11: v_cmpx_le_u16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xbb,0xd4,0x01,0xff,0x03,0x00] +// GFX11: v_cmpx_le_u16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xbb,0xd4,0x01,0xff,0x03,0x00] v_cmpx_le_u16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_le_u16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbb,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -488,7 +488,7 @@ v_cmpx_le_u16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_le_u16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_le_u16 v255.h, v2.h -// GFX11: v_cmpx_le_u16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xbb,0xd4,0xff,0x05,0x02,0x00] +// GFX11: v_cmpx_le_u16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xbb,0xd4,0xff,0x05,0x02,0x00] v_cmpx_le_u16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_le_u16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbb,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -506,7 +506,7 @@ v_cmpx_le_u16 v255.l, v2.l 
quad_perm:[3,2,1,0] // GFX11: v_cmpx_le_u16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_lg_f16 v1.h, v255.h -// GFX11: v_cmpx_lg_f16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x85,0xd4,0x01,0xff,0x03,0x00] +// GFX11: v_cmpx_lg_f16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x85,0xd4,0x01,0xff,0x03,0x00] v_cmpx_lg_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_lg_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x85,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -524,7 +524,7 @@ v_cmpx_lg_f16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_lg_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_lg_f16 v255.h, v2.h -// GFX11: v_cmpx_lg_f16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x85,0xd4,0xff,0x05,0x02,0x00] +// GFX11: v_cmpx_lg_f16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x85,0xd4,0xff,0x05,0x02,0x00] v_cmpx_lg_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_lg_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x85,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -542,7 +542,7 @@ v_cmpx_lg_f16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_lg_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_lt_f16 v1.h, v255.h -// GFX11: v_cmpx_lt_f16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x81,0xd4,0x01,0xff,0x03,0x00] +// GFX11: v_cmpx_lt_f16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x81,0xd4,0x01,0xff,0x03,0x00] v_cmpx_lt_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_lt_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x18,0x81,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -560,7 +560,7 @@ v_cmpx_lt_f16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_lt_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_lt_f16 v255.h, v2.h -// GFX11: v_cmpx_lt_f16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x81,0xd4,0xff,0x05,0x02,0x00] +// GFX11: v_cmpx_lt_f16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x81,0xd4,0xff,0x05,0x02,0x00] v_cmpx_lt_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_lt_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x81,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -578,7 +578,7 @@ v_cmpx_lt_f16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_lt_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_lt_i16 v1.h, v255.h -// GFX11: v_cmpx_lt_i16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xb1,0xd4,0x01,0xff,0x03,0x00] +// GFX11: v_cmpx_lt_i16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xb1,0xd4,0x01,0xff,0x03,0x00] v_cmpx_lt_i16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_lt_i16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb1,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -596,7 +596,7 @@ v_cmpx_lt_i16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_lt_i16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_lt_i16 v255.h, v2.h -// GFX11: v_cmpx_lt_i16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xb1,0xd4,0xff,0x05,0x02,0x00] +// GFX11: v_cmpx_lt_i16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xb1,0xd4,0xff,0x05,0x02,0x00] v_cmpx_lt_i16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_lt_i16_e64_dpp 
v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb1,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -614,7 +614,7 @@ v_cmpx_lt_i16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_lt_i16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_lt_u16 v1.h, v255.h -// GFX11: v_cmpx_lt_u16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xb9,0xd4,0x01,0xff,0x03,0x00] +// GFX11: v_cmpx_lt_u16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xb9,0xd4,0x01,0xff,0x03,0x00] v_cmpx_lt_u16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_lt_u16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb9,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -632,7 +632,7 @@ v_cmpx_lt_u16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_lt_u16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_lt_u16 v255.h, v2.h -// GFX11: v_cmpx_lt_u16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xb9,0xd4,0xff,0x05,0x02,0x00] +// GFX11: v_cmpx_lt_u16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xb9,0xd4,0xff,0x05,0x02,0x00] v_cmpx_lt_u16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_lt_u16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb9,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -650,7 +650,7 @@ v_cmpx_lt_u16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_lt_u16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_ne_i16 v1.h, v255.h -// GFX11: v_cmpx_ne_i16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xb5,0xd4,0x01,0xff,0x03,0x00] +// GFX11: v_cmpx_ne_i16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xb5,0xd4,0x01,0xff,0x03,0x00] v_cmpx_ne_i16 v1.h, v255.h 
dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_ne_i16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb5,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -668,7 +668,7 @@ v_cmpx_ne_i16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_ne_i16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_ne_i16 v255.h, v2.h -// GFX11: v_cmpx_ne_i16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xb5,0xd4,0xff,0x05,0x02,0x00] +// GFX11: v_cmpx_ne_i16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xb5,0xd4,0xff,0x05,0x02,0x00] v_cmpx_ne_i16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_ne_i16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb5,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -686,7 +686,7 @@ v_cmpx_ne_i16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_ne_i16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_ne_u16 v1.h, v255.h -// GFX11: v_cmpx_ne_u16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xbd,0xd4,0x01,0xff,0x03,0x00] +// GFX11: v_cmpx_ne_u16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xbd,0xd4,0x01,0xff,0x03,0x00] v_cmpx_ne_u16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_ne_u16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbd,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -704,7 +704,7 @@ v_cmpx_ne_u16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_ne_u16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_ne_u16 v255.h, v2.h -// GFX11: v_cmpx_ne_u16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xbd,0xd4,0xff,0x05,0x02,0x00] +// GFX11: v_cmpx_ne_u16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: 
[0x7e,0x18,0xbd,0xd4,0xff,0x05,0x02,0x00] v_cmpx_ne_u16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_ne_u16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbd,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -722,7 +722,7 @@ v_cmpx_ne_u16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_ne_u16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_neq_f16 v1.h, v255.h -// GFX11: v_cmpx_neq_f16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x8d,0xd4,0x01,0xff,0x03,0x00] +// GFX11: v_cmpx_neq_f16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x8d,0xd4,0x01,0xff,0x03,0x00] v_cmpx_neq_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_neq_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8d,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -740,7 +740,7 @@ v_cmpx_neq_f16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_neq_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_neq_f16 v255.h, v2.h -// GFX11: v_cmpx_neq_f16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x8d,0xd4,0xff,0x05,0x02,0x00] +// GFX11: v_cmpx_neq_f16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x8d,0xd4,0xff,0x05,0x02,0x00] v_cmpx_neq_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_neq_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8d,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -758,7 +758,7 @@ v_cmpx_neq_f16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_neq_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_nge_f16 v1.h, v255.h -// GFX11: v_cmpx_nge_f16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: 
[0x7e,0x18,0x89,0xd4,0x01,0xff,0x03,0x00] +// GFX11: v_cmpx_nge_f16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x89,0xd4,0x01,0xff,0x03,0x00] v_cmpx_nge_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_nge_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x89,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -776,7 +776,7 @@ v_cmpx_nge_f16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_nge_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_nge_f16 v255.h, v2.h -// GFX11: v_cmpx_nge_f16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x89,0xd4,0xff,0x05,0x02,0x00] +// GFX11: v_cmpx_nge_f16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x89,0xd4,0xff,0x05,0x02,0x00] v_cmpx_nge_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_nge_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x89,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -794,7 +794,7 @@ v_cmpx_nge_f16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_nge_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_ngt_f16 v1.h, v255.h -// GFX11: v_cmpx_ngt_f16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x8b,0xd4,0x01,0xff,0x03,0x00] +// GFX11: v_cmpx_ngt_f16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x8b,0xd4,0x01,0xff,0x03,0x00] v_cmpx_ngt_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_ngt_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8b,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -812,7 +812,7 @@ v_cmpx_ngt_f16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_ngt_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_ngt_f16 
v255.h, v2.h -// GFX11: v_cmpx_ngt_f16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x8b,0xd4,0xff,0x05,0x02,0x00] +// GFX11: v_cmpx_ngt_f16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x8b,0xd4,0xff,0x05,0x02,0x00] v_cmpx_ngt_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_ngt_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8b,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -830,7 +830,7 @@ v_cmpx_ngt_f16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_ngt_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_nle_f16 v1.h, v255.h -// GFX11: v_cmpx_nle_f16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x8c,0xd4,0x01,0xff,0x03,0x00] +// GFX11: v_cmpx_nle_f16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x8c,0xd4,0x01,0xff,0x03,0x00] v_cmpx_nle_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_nle_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8c,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -848,7 +848,7 @@ v_cmpx_nle_f16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_nle_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_nle_f16 v255.h, v2.h -// GFX11: v_cmpx_nle_f16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x8c,0xd4,0xff,0x05,0x02,0x00] +// GFX11: v_cmpx_nle_f16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x8c,0xd4,0xff,0x05,0x02,0x00] v_cmpx_nle_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_nle_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8c,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -866,7 +866,7 @@ v_cmpx_nle_f16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_nle_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_nlg_f16 v1.h, v255.h -// GFX11: v_cmpx_nlg_f16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x8a,0xd4,0x01,0xff,0x03,0x00] +// GFX11: v_cmpx_nlg_f16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x8a,0xd4,0x01,0xff,0x03,0x00] v_cmpx_nlg_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_nlg_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8a,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -884,7 +884,7 @@ v_cmpx_nlg_f16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_nlg_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_nlg_f16 v255.h, v2.h -// GFX11: v_cmpx_nlg_f16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x8a,0xd4,0xff,0x05,0x02,0x00] +// GFX11: v_cmpx_nlg_f16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x8a,0xd4,0xff,0x05,0x02,0x00] v_cmpx_nlg_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_nlg_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8a,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -902,7 +902,7 @@ v_cmpx_nlg_f16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_nlg_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_nlt_f16 v1.h, v255.h -// GFX11: v_cmpx_nlt_f16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x8e,0xd4,0x01,0xff,0x03,0x00] +// GFX11: v_cmpx_nlt_f16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x8e,0xd4,0x01,0xff,0x03,0x00] v_cmpx_nlt_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_nlt_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8e,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -920,7 +920,7 @@ v_cmpx_nlt_f16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX11: 
v_cmpx_nlt_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_nlt_f16 v255.h, v2.h -// GFX11: v_cmpx_nlt_f16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x8e,0xd4,0xff,0x05,0x02,0x00] +// GFX11: v_cmpx_nlt_f16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x8e,0xd4,0xff,0x05,0x02,0x00] v_cmpx_nlt_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_nlt_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8e,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -938,7 +938,7 @@ v_cmpx_nlt_f16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_nlt_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_o_f16 v1.h, v255.h -// GFX11: v_cmpx_o_f16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x87,0xd4,0x01,0xff,0x03,0x00] +// GFX11: v_cmpx_o_f16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x87,0xd4,0x01,0xff,0x03,0x00] v_cmpx_o_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_o_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x87,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -956,7 +956,7 @@ v_cmpx_o_f16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_o_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_o_f16 v255.h, v2.h -// GFX11: v_cmpx_o_f16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x87,0xd4,0xff,0x05,0x02,0x00] +// GFX11: v_cmpx_o_f16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x87,0xd4,0xff,0x05,0x02,0x00] v_cmpx_o_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_o_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x87,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -974,7 +974,7 @@ 
v_cmpx_o_f16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_o_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_t_f16 v1.h, v255.h -// GFX11: v_cmpx_t_f16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x8f,0xd4,0x01,0xff,0x03,0x00] +// GFX11: v_cmpx_t_f16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x8f,0xd4,0x01,0xff,0x03,0x00] v_cmpx_t_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_t_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8f,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -992,7 +992,7 @@ v_cmpx_t_f16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_t_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_t_f16 v255.h, v2.h -// GFX11: v_cmpx_t_f16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x8f,0xd4,0xff,0x05,0x02,0x00] +// GFX11: v_cmpx_t_f16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x8f,0xd4,0xff,0x05,0x02,0x00] v_cmpx_t_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_t_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8f,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -1010,7 +1010,7 @@ v_cmpx_t_f16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_t_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_tru_f16 v1.h, v255.h -// GFX11: v_cmpx_t_f16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x8f,0xd4,0x01,0xff,0x03,0x00] +// GFX11: v_cmpx_t_f16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x8f,0xd4,0x01,0xff,0x03,0x00] v_cmpx_tru_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_t_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x18,0x8f,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -1028,7 +1028,7 @@ v_cmpx_tru_f16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_t_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_tru_f16 v255.h, v2.h -// GFX11: v_cmpx_t_f16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x8f,0xd4,0xff,0x05,0x02,0x00] +// GFX11: v_cmpx_t_f16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x8f,0xd4,0xff,0x05,0x02,0x00] v_cmpx_tru_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_t_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8f,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -1046,7 +1046,7 @@ v_cmpx_tru_f16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_t_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_u_f16 v1.h, v255.h -// GFX11: v_cmpx_u_f16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x88,0xd4,0x01,0xff,0x03,0x00] +// GFX11: v_cmpx_u_f16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x88,0xd4,0x01,0xff,0x03,0x00] v_cmpx_u_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_u_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x88,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -1064,7 +1064,7 @@ v_cmpx_u_f16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_u_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_u_f16 v255.h, v2.h -// GFX11: v_cmpx_u_f16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x88,0xd4,0xff,0x05,0x02,0x00] +// GFX11: v_cmpx_u_f16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x88,0xd4,0xff,0x05,0x02,0x00] v_cmpx_u_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_u_f16_e64_dpp v255.h, v2.h 
op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x88,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s index c654afa57a67c..71245d9dc3286 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s @@ -48,13 +48,13 @@ v_cmpx_class_f16_e64 -|0xfe0b|, vcc_hi // GFX12: v_cmpx_class_f16_e64 -|0xfe0b|, vcc_hi ; encoding: [0x7e,0x01,0xfd,0xd4,0xff,0xd6,0x00,0x20,0x0b,0xfe,0x00,0x00] v_cmpx_class_f16_e64 v1.h, v2.h -// GFX12: v_cmpx_class_f16_e64 v1.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xfd,0xd4,0x01,0x05,0x02,0x00] +// GFX12: v_cmpx_class_f16_e64 v1.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xfd,0xd4,0x01,0x05,0x02,0x00] v_cmpx_class_f16_e64 v255.h, v2.l -// GFX12: v_cmpx_class_f16_e64 v255.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xfd,0xd4,0xff,0x05,0x02,0x00] +// GFX12: v_cmpx_class_f16_e64 v255.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xfd,0xd4,0xff,0x05,0x02,0x00] v_cmpx_class_f16_e64 s105, v255.h -// GFX12: v_cmpx_class_f16_e64 s105, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xfd,0xd4,0x69,0xfe,0x03,0x00] +// GFX12: v_cmpx_class_f16_e64 s105, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xfd,0xd4,0x69,0xfe,0x03,0x00] v_cmpx_class_f32_e64 v1, v2 // GFX12: v_cmpx_class_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0xfe,0xd4,0x01,0x05,0x02,0x00] @@ -192,10 +192,10 @@ v_cmpx_eq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX12: v_cmpx_eq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x82,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] v_cmpx_eq_f16_e64 v1.h, v2.l -// GFX12: v_cmpx_eq_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x82,0xd4,0x01,0x05,0x02,0x00] +// GFX12: v_cmpx_eq_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x82,0xd4,0x01,0x05,0x02,0x00] v_cmpx_eq_f16_e64 v255.l, v255.h -// GFX12: v_cmpx_eq_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x82,0xd4,0xff,0xff,0x03,0x00] 
+// GFX12: v_cmpx_eq_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x82,0xd4,0xff,0xff,0x03,0x00] v_cmpx_eq_f32_e64 v1, v2 // GFX12: v_cmpx_eq_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x92,0xd4,0x01,0x05,0x02,0x00] @@ -324,10 +324,10 @@ v_cmpx_eq_i16_e64 0xfe0b, vcc_hi // GFX12: v_cmpx_eq_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb2,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] v_cmpx_eq_i16_e64 v1.h, v2.l -// GFX12: v_cmpx_eq_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb2,0xd4,0x01,0x05,0x02,0x00] +// GFX12: v_cmpx_eq_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb2,0xd4,0x01,0x05,0x02,0x00] v_cmpx_eq_i16_e64 v255.l, v255.h -// GFX12: v_cmpx_eq_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb2,0xd4,0xff,0xff,0x03,0x00] +// GFX12: v_cmpx_eq_i16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xb2,0xd4,0xff,0xff,0x03,0x00] v_cmpx_eq_i32_e64 v1, v2 // GFX12: v_cmpx_eq_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc2,0xd4,0x01,0x05,0x02,0x00] @@ -456,10 +456,10 @@ v_cmpx_eq_u16_e64 0xfe0b, vcc_hi // GFX12: v_cmpx_eq_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] v_cmpx_eq_u16_e64 v1.h, v2.l -// GFX12: v_cmpx_eq_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xba,0xd4,0x01,0x05,0x02,0x00] +// GFX12: v_cmpx_eq_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xba,0xd4,0x01,0x05,0x02,0x00] v_cmpx_eq_u16_e64 v255.l, v255.h -// GFX12: v_cmpx_eq_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xba,0xd4,0xff,0xff,0x03,0x00] +// GFX12: v_cmpx_eq_u16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xba,0xd4,0xff,0xff,0x03,0x00] v_cmpx_eq_u32_e64 v1, v2 // GFX12: v_cmpx_eq_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xca,0xd4,0x01,0x05,0x02,0x00] @@ -588,10 +588,10 @@ v_cmpx_ge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX12: v_cmpx_ge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x86,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 
v_cmpx_ge_f16_e64 v1.h, v2.l -// GFX12: v_cmpx_ge_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x86,0xd4,0x01,0x05,0x02,0x00] +// GFX12: v_cmpx_ge_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x86,0xd4,0x01,0x05,0x02,0x00] v_cmpx_ge_f16_e64 v255.l, v255.h -// GFX12: v_cmpx_ge_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x86,0xd4,0xff,0xff,0x03,0x00] +// GFX12: v_cmpx_ge_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x86,0xd4,0xff,0xff,0x03,0x00] v_cmpx_ge_f32_e64 v1, v2 // GFX12: v_cmpx_ge_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x96,0xd4,0x01,0x05,0x02,0x00] @@ -720,10 +720,10 @@ v_cmpx_ge_i16_e64 0xfe0b, vcc_hi // GFX12: v_cmpx_ge_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] v_cmpx_ge_i16_e64 v1.h, v2.l -// GFX12: v_cmpx_ge_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb6,0xd4,0x01,0x05,0x02,0x00] +// GFX12: v_cmpx_ge_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb6,0xd4,0x01,0x05,0x02,0x00] v_cmpx_ge_i16_e64 v255.l, v255.h -// GFX12: v_cmpx_ge_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb6,0xd4,0xff,0xff,0x03,0x00] +// GFX12: v_cmpx_ge_i16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xb6,0xd4,0xff,0xff,0x03,0x00] v_cmpx_ge_i32_e64 v1, v2 // GFX12: v_cmpx_ge_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc6,0xd4,0x01,0x05,0x02,0x00] @@ -852,10 +852,10 @@ v_cmpx_ge_u16_e64 0xfe0b, vcc_hi // GFX12: v_cmpx_ge_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] v_cmpx_ge_u16_e64 v1.h, v2.l -// GFX12: v_cmpx_ge_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xbe,0xd4,0x01,0x05,0x02,0x00] +// GFX12: v_cmpx_ge_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xbe,0xd4,0x01,0x05,0x02,0x00] v_cmpx_ge_u16_e64 v255.l, v255.h -// GFX12: v_cmpx_ge_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xbe,0xd4,0xff,0xff,0x03,0x00] +// GFX12: v_cmpx_ge_u16_e64 
v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xbe,0xd4,0xff,0xff,0x03,0x00] v_cmpx_ge_u32_e64 v1, v2 // GFX12: v_cmpx_ge_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xce,0xd4,0x01,0x05,0x02,0x00] @@ -984,10 +984,10 @@ v_cmpx_gt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX12: v_cmpx_gt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x84,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] v_cmpx_gt_f16_e64 v1.h, v2.l -// GFX12: v_cmpx_gt_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x84,0xd4,0x01,0x05,0x02,0x00] +// GFX12: v_cmpx_gt_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x84,0xd4,0x01,0x05,0x02,0x00] v_cmpx_gt_f16_e64 v255.l, v255.h -// GFX12: v_cmpx_gt_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x84,0xd4,0xff,0xff,0x03,0x00] +// GFX12: v_cmpx_gt_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x84,0xd4,0xff,0xff,0x03,0x00] v_cmpx_gt_f32_e64 v1, v2 // GFX12: v_cmpx_gt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x94,0xd4,0x01,0x05,0x02,0x00] @@ -1116,10 +1116,10 @@ v_cmpx_gt_i16_e64 0xfe0b, vcc_hi // GFX12: v_cmpx_gt_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb4,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] v_cmpx_gt_i16_e64 v1.h, v2.l -// GFX12: v_cmpx_gt_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb4,0xd4,0x01,0x05,0x02,0x00] +// GFX12: v_cmpx_gt_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb4,0xd4,0x01,0x05,0x02,0x00] v_cmpx_gt_i16_e64 v255.l, v255.h -// GFX12: v_cmpx_gt_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb4,0xd4,0xff,0xff,0x03,0x00] +// GFX12: v_cmpx_gt_i16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xb4,0xd4,0xff,0xff,0x03,0x00] v_cmpx_gt_i32_e64 v1, v2 // GFX12: v_cmpx_gt_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc4,0xd4,0x01,0x05,0x02,0x00] @@ -1248,10 +1248,10 @@ v_cmpx_gt_u16_e64 0xfe0b, vcc_hi // GFX12: v_cmpx_gt_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbc,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] v_cmpx_gt_u16_e64 v1.h, 
v2.l -// GFX12: v_cmpx_gt_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xbc,0xd4,0x01,0x05,0x02,0x00] +// GFX12: v_cmpx_gt_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xbc,0xd4,0x01,0x05,0x02,0x00] v_cmpx_gt_u16_e64 v255.l, v255.h -// GFX12: v_cmpx_gt_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xbc,0xd4,0xff,0xff,0x03,0x00] +// GFX12: v_cmpx_gt_u16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xbc,0xd4,0xff,0xff,0x03,0x00] v_cmpx_gt_u32_e64 v1, v2 // GFX12: v_cmpx_gt_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xcc,0xd4,0x01,0x05,0x02,0x00] @@ -1380,10 +1380,10 @@ v_cmpx_le_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX12: v_cmpx_le_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x83,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] v_cmpx_le_f16_e64 v1.h, v2.l -// GFX12: v_cmpx_le_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x83,0xd4,0x01,0x05,0x02,0x00] +// GFX12: v_cmpx_le_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x83,0xd4,0x01,0x05,0x02,0x00] v_cmpx_le_f16_e64 v255.l, v255.h -// GFX12: v_cmpx_le_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x83,0xd4,0xff,0xff,0x03,0x00] +// GFX12: v_cmpx_le_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x83,0xd4,0xff,0xff,0x03,0x00] v_cmpx_le_f32_e64 v1, v2 // GFX12: v_cmpx_le_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x93,0xd4,0x01,0x05,0x02,0x00] @@ -1512,10 +1512,10 @@ v_cmpx_le_i16_e64 0xfe0b, vcc_hi // GFX12: v_cmpx_le_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb3,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] v_cmpx_le_i16_e64 v1.h, v2.l -// GFX12: v_cmpx_le_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb3,0xd4,0x01,0x05,0x02,0x00] +// GFX12: v_cmpx_le_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb3,0xd4,0x01,0x05,0x02,0x00] v_cmpx_le_i16_e64 v255.l, v255.h -// GFX12: v_cmpx_le_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb3,0xd4,0xff,0xff,0x03,0x00] +// GFX12: v_cmpx_le_i16_e64 
v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xb3,0xd4,0xff,0xff,0x03,0x00] v_cmpx_le_i32_e64 v1, v2 // GFX12: v_cmpx_le_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc3,0xd4,0x01,0x05,0x02,0x00] @@ -1644,10 +1644,10 @@ v_cmpx_le_u16_e64 0xfe0b, vcc_hi // GFX12: v_cmpx_le_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] v_cmpx_le_u16_e64 v1.h, v2.l -// GFX12: v_cmpx_le_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xbb,0xd4,0x01,0x05,0x02,0x00] +// GFX12: v_cmpx_le_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xbb,0xd4,0x01,0x05,0x02,0x00] v_cmpx_le_u16_e64 v255.l, v255.h -// GFX12: v_cmpx_le_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xbb,0xd4,0xff,0xff,0x03,0x00] +// GFX12: v_cmpx_le_u16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xbb,0xd4,0xff,0xff,0x03,0x00] v_cmpx_le_u32_e64 v1, v2 // GFX12: v_cmpx_le_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xcb,0xd4,0x01,0x05,0x02,0x00] @@ -1776,10 +1776,10 @@ v_cmpx_lg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX12: v_cmpx_lg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x85,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] v_cmpx_lg_f16_e64 v1.h, v2.l -// GFX12: v_cmpx_lg_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x85,0xd4,0x01,0x05,0x02,0x00] +// GFX12: v_cmpx_lg_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x85,0xd4,0x01,0x05,0x02,0x00] v_cmpx_lg_f16_e64 v255.l, v255.h -// GFX12: v_cmpx_lg_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x85,0xd4,0xff,0xff,0x03,0x00] +// GFX12: v_cmpx_lg_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x85,0xd4,0xff,0xff,0x03,0x00] v_cmpx_lg_f32_e64 v1, v2 // GFX12: v_cmpx_lg_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x95,0xd4,0x01,0x05,0x02,0x00] @@ -1908,10 +1908,10 @@ v_cmpx_lt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX12: v_cmpx_lt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x81,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 
v_cmpx_lt_f16_e64 v1.h, v2.l -// GFX12: v_cmpx_lt_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x81,0xd4,0x01,0x05,0x02,0x00] +// GFX12: v_cmpx_lt_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x81,0xd4,0x01,0x05,0x02,0x00] v_cmpx_lt_f16_e64 v255.l, v255.h -// GFX12: v_cmpx_lt_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x81,0xd4,0xff,0xff,0x03,0x00] +// GFX12: v_cmpx_lt_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x81,0xd4,0xff,0xff,0x03,0x00] v_cmpx_lt_f32_e64 v1, v2 // GFX12: v_cmpx_lt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x91,0xd4,0x01,0x05,0x02,0x00] @@ -2040,10 +2040,10 @@ v_cmpx_lt_i16_e64 0xfe0b, vcc_hi // GFX12: v_cmpx_lt_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] v_cmpx_lt_i16_e64 v1.h, v2.l -// GFX12: v_cmpx_lt_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb1,0xd4,0x01,0x05,0x02,0x00] +// GFX12: v_cmpx_lt_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb1,0xd4,0x01,0x05,0x02,0x00] v_cmpx_lt_i16_e64 v255.l, v255.h -// GFX12: v_cmpx_lt_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb1,0xd4,0xff,0xff,0x03,0x00] +// GFX12: v_cmpx_lt_i16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xb1,0xd4,0xff,0xff,0x03,0x00] v_cmpx_lt_i32_e64 v1, v2 // GFX12: v_cmpx_lt_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc1,0xd4,0x01,0x05,0x02,0x00] @@ -2172,10 +2172,10 @@ v_cmpx_lt_u16_e64 0xfe0b, vcc_hi // GFX12: v_cmpx_lt_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb9,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] v_cmpx_lt_u16_e64 v1.h, v2.l -// GFX12: v_cmpx_lt_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb9,0xd4,0x01,0x05,0x02,0x00] +// GFX12: v_cmpx_lt_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb9,0xd4,0x01,0x05,0x02,0x00] v_cmpx_lt_u16_e64 v255.l, v255.h -// GFX12: v_cmpx_lt_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb9,0xd4,0xff,0xff,0x03,0x00] +// GFX12: v_cmpx_lt_u16_e64 
v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xb9,0xd4,0xff,0xff,0x03,0x00] v_cmpx_lt_u32_e64 v1, v2 // GFX12: v_cmpx_lt_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc9,0xd4,0x01,0x05,0x02,0x00] @@ -2304,10 +2304,10 @@ v_cmpx_ne_i16_e64 0xfe0b, vcc_hi // GFX12: v_cmpx_ne_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb5,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] v_cmpx_ne_i16_e64 v1.h, v2.l -// GFX12: v_cmpx_ne_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb5,0xd4,0x01,0x05,0x02,0x00] +// GFX12: v_cmpx_ne_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb5,0xd4,0x01,0x05,0x02,0x00] v_cmpx_ne_i16_e64 v255.l, v255.h -// GFX12: v_cmpx_ne_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb5,0xd4,0xff,0xff,0x03,0x00] +// GFX12: v_cmpx_ne_i16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xb5,0xd4,0xff,0xff,0x03,0x00] v_cmpx_ne_i32_e64 v1, v2 // GFX12: v_cmpx_ne_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc5,0xd4,0x01,0x05,0x02,0x00] @@ -2436,10 +2436,10 @@ v_cmpx_ne_u16_e64 0xfe0b, vcc_hi // GFX12: v_cmpx_ne_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbd,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] v_cmpx_ne_u16_e64 v1.h, v2.l -// GFX12: v_cmpx_ne_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xbd,0xd4,0x01,0x05,0x02,0x00] +// GFX12: v_cmpx_ne_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xbd,0xd4,0x01,0x05,0x02,0x00] v_cmpx_ne_u16_e64 v255.l, v255.h -// GFX12: v_cmpx_ne_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xbd,0xd4,0xff,0xff,0x03,0x00] +// GFX12: v_cmpx_ne_u16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xbd,0xd4,0xff,0xff,0x03,0x00] v_cmpx_ne_u32_e64 v1, v2 // GFX12: v_cmpx_ne_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xcd,0xd4,0x01,0x05,0x02,0x00] @@ -2568,10 +2568,10 @@ v_cmpx_neq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX12: v_cmpx_neq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8d,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] v_cmpx_neq_f16_e64 
v1.h, v2.l -// GFX12: v_cmpx_neq_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8d,0xd4,0x01,0x05,0x02,0x00] +// GFX12: v_cmpx_neq_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x8d,0xd4,0x01,0x05,0x02,0x00] v_cmpx_neq_f16_e64 v255.l, v255.h -// GFX12: v_cmpx_neq_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8d,0xd4,0xff,0xff,0x03,0x00] +// GFX12: v_cmpx_neq_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x8d,0xd4,0xff,0xff,0x03,0x00] v_cmpx_neq_f32_e64 v1, v2 // GFX12: v_cmpx_neq_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9d,0xd4,0x01,0x05,0x02,0x00] @@ -2700,10 +2700,10 @@ v_cmpx_nge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX12: v_cmpx_nge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x89,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] v_cmpx_nge_f16_e64 v1.h, v2.l -// GFX12: v_cmpx_nge_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x89,0xd4,0x01,0x05,0x02,0x00] +// GFX12: v_cmpx_nge_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x89,0xd4,0x01,0x05,0x02,0x00] v_cmpx_nge_f16_e64 v255.l, v255.h -// GFX12: v_cmpx_nge_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x89,0xd4,0xff,0xff,0x03,0x00] +// GFX12: v_cmpx_nge_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x89,0xd4,0xff,0xff,0x03,0x00] v_cmpx_nge_f32_e64 v1, v2 // GFX12: v_cmpx_nge_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x99,0xd4,0x01,0x05,0x02,0x00] @@ -2832,10 +2832,10 @@ v_cmpx_ngt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX12: v_cmpx_ngt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8b,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] v_cmpx_ngt_f16_e64 v1.h, v2.l -// GFX12: v_cmpx_ngt_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8b,0xd4,0x01,0x05,0x02,0x00] +// GFX12: v_cmpx_ngt_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x8b,0xd4,0x01,0x05,0x02,0x00] v_cmpx_ngt_f16_e64 v255.l, v255.h -// GFX12: v_cmpx_ngt_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: 
[0x7e,0x10,0x8b,0xd4,0xff,0xff,0x03,0x00] +// GFX12: v_cmpx_ngt_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x8b,0xd4,0xff,0xff,0x03,0x00] v_cmpx_ngt_f32_e64 v1, v2 // GFX12: v_cmpx_ngt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9b,0xd4,0x01,0x05,0x02,0x00] @@ -2964,10 +2964,10 @@ v_cmpx_nle_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX12: v_cmpx_nle_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8c,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] v_cmpx_nle_f16_e64 v1.h, v2.l -// GFX12: v_cmpx_nle_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8c,0xd4,0x01,0x05,0x02,0x00] +// GFX12: v_cmpx_nle_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x8c,0xd4,0x01,0x05,0x02,0x00] v_cmpx_nle_f16_e64 v255.l, v255.h -// GFX12: v_cmpx_nle_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8c,0xd4,0xff,0xff,0x03,0x00] +// GFX12: v_cmpx_nle_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x8c,0xd4,0xff,0xff,0x03,0x00] v_cmpx_nle_f32_e64 v1, v2 // GFX12: v_cmpx_nle_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9c,0xd4,0x01,0x05,0x02,0x00] @@ -3096,10 +3096,10 @@ v_cmpx_nlg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX12: v_cmpx_nlg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8a,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] v_cmpx_nlg_f16_e64 v1.h, v2.l -// GFX12: v_cmpx_nlg_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8a,0xd4,0x01,0x05,0x02,0x00] +// GFX12: v_cmpx_nlg_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x8a,0xd4,0x01,0x05,0x02,0x00] v_cmpx_nlg_f16_e64 v255.l, v255.h -// GFX12: v_cmpx_nlg_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8a,0xd4,0xff,0xff,0x03,0x00] +// GFX12: v_cmpx_nlg_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x8a,0xd4,0xff,0xff,0x03,0x00] v_cmpx_nlg_f32_e64 v1, v2 // GFX12: v_cmpx_nlg_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9a,0xd4,0x01,0x05,0x02,0x00] @@ -3228,10 +3228,10 @@ v_cmpx_nlt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX12: 
v_cmpx_nlt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8e,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] v_cmpx_nlt_f16_e64 v1.h, v2.l -// GFX12: v_cmpx_nlt_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8e,0xd4,0x01,0x05,0x02,0x00] +// GFX12: v_cmpx_nlt_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x8e,0xd4,0x01,0x05,0x02,0x00] v_cmpx_nlt_f16_e64 v255.l, v255.h -// GFX12: v_cmpx_nlt_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8e,0xd4,0xff,0xff,0x03,0x00] +// GFX12: v_cmpx_nlt_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x8e,0xd4,0xff,0xff,0x03,0x00] v_cmpx_nlt_f32_e64 v1, v2 // GFX12: v_cmpx_nlt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9e,0xd4,0x01,0x05,0x02,0x00] @@ -3360,10 +3360,10 @@ v_cmpx_o_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX12: v_cmpx_o_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x87,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] v_cmpx_o_f16_e64 v1.h, v2.l -// GFX12: v_cmpx_o_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x87,0xd4,0x01,0x05,0x02,0x00] +// GFX12: v_cmpx_o_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x87,0xd4,0x01,0x05,0x02,0x00] v_cmpx_o_f16_e64 v255.l, v255.h -// GFX12: v_cmpx_o_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x87,0xd4,0xff,0xff,0x03,0x00] +// GFX12: v_cmpx_o_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x87,0xd4,0xff,0xff,0x03,0x00] v_cmpx_o_f32_e64 v1, v2 // GFX12: v_cmpx_o_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x97,0xd4,0x01,0x05,0x02,0x00] @@ -3492,10 +3492,10 @@ v_cmpx_u_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX12: v_cmpx_u_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x88,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] v_cmpx_u_f16_e64 v1.h, v2.l -// GFX12: v_cmpx_u_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x88,0xd4,0x01,0x05,0x02,0x00] +// GFX12: v_cmpx_u_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x88,0xd4,0x01,0x05,0x02,0x00] 
v_cmpx_u_f16_e64 v255.l, v255.h -// GFX12: v_cmpx_u_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x88,0xd4,0xff,0xff,0x03,0x00] +// GFX12: v_cmpx_u_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x88,0xd4,0xff,0xff,0x03,0x00] v_cmpx_u_f32_e64 v1, v2 // GFX12: v_cmpx_u_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x98,0xd4,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_promote.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_promote.s index 3c14bec145232..7767e89c5b7c2 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_promote.s @@ -2,7 +2,7 @@ // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12 %s v_cmpx_class_f16 v1.h, v255.h -// GFX12: v_cmpx_class_f16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xfd,0xd4,0x01,0xff,0x03,0x00] +// GFX12: v_cmpx_class_f16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xfd,0xd4,0x01,0xff,0x03,0x00] v_cmpx_class_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_class_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xfd,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -20,7 +20,7 @@ v_cmpx_class_f16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_class_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_class_f16 v255.h, v2.h -// GFX12: v_cmpx_class_f16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xfd,0xd4,0xff,0x05,0x02,0x00] +// GFX12: v_cmpx_class_f16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xfd,0xd4,0xff,0x05,0x02,0x00] v_cmpx_class_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_class_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xfd,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -38,7 +38,7 @@ v_cmpx_class_f16 
v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_class_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_eq_f16 v1.h, v255.h -// GFX12: v_cmpx_eq_f16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x82,0xd4,0x01,0xff,0x03,0x00] +// GFX12: v_cmpx_eq_f16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x82,0xd4,0x01,0xff,0x03,0x00] v_cmpx_eq_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_eq_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x82,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -56,7 +56,7 @@ v_cmpx_eq_f16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_eq_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_eq_f16 v255.h, v2.h -// GFX12: v_cmpx_eq_f16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x82,0xd4,0xff,0x05,0x02,0x00] +// GFX12: v_cmpx_eq_f16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x82,0xd4,0xff,0x05,0x02,0x00] v_cmpx_eq_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_eq_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x82,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -74,7 +74,7 @@ v_cmpx_eq_f16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_eq_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_eq_i16 v1.h, v255.h -// GFX12: v_cmpx_eq_i16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xb2,0xd4,0x01,0xff,0x03,0x00] +// GFX12: v_cmpx_eq_i16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xb2,0xd4,0x01,0xff,0x03,0x00] v_cmpx_eq_i16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_eq_i16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x18,0xb2,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -92,7 +92,7 @@ v_cmpx_eq_i16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_eq_i16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_eq_i16 v255.h, v2.h -// GFX12: v_cmpx_eq_i16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xb2,0xd4,0xff,0x05,0x02,0x00] +// GFX12: v_cmpx_eq_i16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xb2,0xd4,0xff,0x05,0x02,0x00] v_cmpx_eq_i16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_eq_i16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb2,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -110,7 +110,7 @@ v_cmpx_eq_i16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_eq_i16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_eq_u16 v1.h, v255.h -// GFX12: v_cmpx_eq_u16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xba,0xd4,0x01,0xff,0x03,0x00] +// GFX12: v_cmpx_eq_u16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xba,0xd4,0x01,0xff,0x03,0x00] v_cmpx_eq_u16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_eq_u16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xba,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -128,7 +128,7 @@ v_cmpx_eq_u16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_eq_u16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_eq_u16 v255.h, v2.h -// GFX12: v_cmpx_eq_u16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xba,0xd4,0xff,0x05,0x02,0x00] +// GFX12: v_cmpx_eq_u16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xba,0xd4,0xff,0x05,0x02,0x00] v_cmpx_eq_u16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_eq_u16_e64_dpp v255.h, 
v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xba,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -146,7 +146,7 @@ v_cmpx_eq_u16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_eq_u16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_ge_f16 v1.h, v255.h -// GFX12: v_cmpx_ge_f16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x86,0xd4,0x01,0xff,0x03,0x00] +// GFX12: v_cmpx_ge_f16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x86,0xd4,0x01,0xff,0x03,0x00] v_cmpx_ge_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_ge_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x86,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -164,7 +164,7 @@ v_cmpx_ge_f16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_ge_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_ge_f16 v255.h, v2.h -// GFX12: v_cmpx_ge_f16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x86,0xd4,0xff,0x05,0x02,0x00] +// GFX12: v_cmpx_ge_f16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x86,0xd4,0xff,0x05,0x02,0x00] v_cmpx_ge_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_ge_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x86,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -182,7 +182,7 @@ v_cmpx_ge_f16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_ge_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_ge_i16 v1.h, v255.h -// GFX12: v_cmpx_ge_i16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xb6,0xd4,0x01,0xff,0x03,0x00] +// GFX12: v_cmpx_ge_i16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xb6,0xd4,0x01,0xff,0x03,0x00] v_cmpx_ge_i16 v1.h, v255.h 
dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_ge_i16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb6,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -200,7 +200,7 @@ v_cmpx_ge_i16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_ge_i16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_ge_i16 v255.h, v2.h -// GFX12: v_cmpx_ge_i16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xb6,0xd4,0xff,0x05,0x02,0x00] +// GFX12: v_cmpx_ge_i16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xb6,0xd4,0xff,0x05,0x02,0x00] v_cmpx_ge_i16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_ge_i16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb6,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -218,7 +218,7 @@ v_cmpx_ge_i16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_ge_i16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_ge_u16 v1.h, v255.h -// GFX12: v_cmpx_ge_u16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xbe,0xd4,0x01,0xff,0x03,0x00] +// GFX12: v_cmpx_ge_u16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xbe,0xd4,0x01,0xff,0x03,0x00] v_cmpx_ge_u16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_ge_u16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbe,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -236,7 +236,7 @@ v_cmpx_ge_u16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_ge_u16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_ge_u16 v255.h, v2.h -// GFX12: v_cmpx_ge_u16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xbe,0xd4,0xff,0x05,0x02,0x00] +// GFX12: v_cmpx_ge_u16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: 
[0x7e,0x18,0xbe,0xd4,0xff,0x05,0x02,0x00] v_cmpx_ge_u16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_ge_u16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbe,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -254,7 +254,7 @@ v_cmpx_ge_u16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_ge_u16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_gt_f16 v1.h, v255.h -// GFX12: v_cmpx_gt_f16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x84,0xd4,0x01,0xff,0x03,0x00] +// GFX12: v_cmpx_gt_f16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x84,0xd4,0x01,0xff,0x03,0x00] v_cmpx_gt_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_gt_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x84,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -272,7 +272,7 @@ v_cmpx_gt_f16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_gt_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_gt_f16 v255.h, v2.h -// GFX12: v_cmpx_gt_f16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x84,0xd4,0xff,0x05,0x02,0x00] +// GFX12: v_cmpx_gt_f16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x84,0xd4,0xff,0x05,0x02,0x00] v_cmpx_gt_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_gt_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x84,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -290,7 +290,7 @@ v_cmpx_gt_f16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_gt_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_gt_i16 v1.h, v255.h -// GFX12: v_cmpx_gt_i16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xb4,0xd4,0x01,0xff,0x03,0x00] +// GFX12: 
v_cmpx_gt_i16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xb4,0xd4,0x01,0xff,0x03,0x00] v_cmpx_gt_i16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_gt_i16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb4,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -308,7 +308,7 @@ v_cmpx_gt_i16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_gt_i16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_gt_i16 v255.h, v2.h -// GFX12: v_cmpx_gt_i16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xb4,0xd4,0xff,0x05,0x02,0x00] +// GFX12: v_cmpx_gt_i16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xb4,0xd4,0xff,0x05,0x02,0x00] v_cmpx_gt_i16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_gt_i16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb4,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -326,7 +326,7 @@ v_cmpx_gt_i16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_gt_i16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_gt_u16 v1.h, v255.h -// GFX12: v_cmpx_gt_u16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xbc,0xd4,0x01,0xff,0x03,0x00] +// GFX12: v_cmpx_gt_u16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xbc,0xd4,0x01,0xff,0x03,0x00] v_cmpx_gt_u16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_gt_u16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbc,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -344,7 +344,7 @@ v_cmpx_gt_u16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_gt_u16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_gt_u16 v255.h, v2.h -// GFX12: v_cmpx_gt_u16_e64 v255.h, v2.h op_sel:[1,1,0] ; 
encoding: [0x7e,0x18,0xbc,0xd4,0xff,0x05,0x02,0x00] +// GFX12: v_cmpx_gt_u16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xbc,0xd4,0xff,0x05,0x02,0x00] v_cmpx_gt_u16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_gt_u16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbc,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -362,7 +362,7 @@ v_cmpx_gt_u16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_gt_u16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_le_f16 v1.h, v255.h -// GFX12: v_cmpx_le_f16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x83,0xd4,0x01,0xff,0x03,0x00] +// GFX12: v_cmpx_le_f16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x83,0xd4,0x01,0xff,0x03,0x00] v_cmpx_le_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_le_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x83,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -380,7 +380,7 @@ v_cmpx_le_f16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_le_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_le_f16 v255.h, v2.h -// GFX12: v_cmpx_le_f16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x83,0xd4,0xff,0x05,0x02,0x00] +// GFX12: v_cmpx_le_f16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x83,0xd4,0xff,0x05,0x02,0x00] v_cmpx_le_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_le_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x83,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -398,7 +398,7 @@ v_cmpx_le_f16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_le_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_le_i16 v1.h, v255.h -// 
GFX12: v_cmpx_le_i16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xb3,0xd4,0x01,0xff,0x03,0x00] +// GFX12: v_cmpx_le_i16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xb3,0xd4,0x01,0xff,0x03,0x00] v_cmpx_le_i16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_le_i16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb3,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -416,7 +416,7 @@ v_cmpx_le_i16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_le_i16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_le_i16 v255.h, v2.h -// GFX12: v_cmpx_le_i16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xb3,0xd4,0xff,0x05,0x02,0x00] +// GFX12: v_cmpx_le_i16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xb3,0xd4,0xff,0x05,0x02,0x00] v_cmpx_le_i16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_le_i16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb3,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -434,7 +434,7 @@ v_cmpx_le_i16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_le_i16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_le_u16 v1.h, v255.h -// GFX12: v_cmpx_le_u16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xbb,0xd4,0x01,0xff,0x03,0x00] +// GFX12: v_cmpx_le_u16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xbb,0xd4,0x01,0xff,0x03,0x00] v_cmpx_le_u16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_le_u16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbb,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -452,7 +452,7 @@ v_cmpx_le_u16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_le_u16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_le_u16 v255.h, v2.h -// GFX12: v_cmpx_le_u16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xbb,0xd4,0xff,0x05,0x02,0x00] +// GFX12: v_cmpx_le_u16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xbb,0xd4,0xff,0x05,0x02,0x00] v_cmpx_le_u16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_le_u16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbb,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -470,7 +470,7 @@ v_cmpx_le_u16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_le_u16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_lg_f16 v1.h, v255.h -// GFX12: v_cmpx_lg_f16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x85,0xd4,0x01,0xff,0x03,0x00] +// GFX12: v_cmpx_lg_f16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x85,0xd4,0x01,0xff,0x03,0x00] v_cmpx_lg_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_lg_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x85,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -488,7 +488,7 @@ v_cmpx_lg_f16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_lg_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_lg_f16 v255.h, v2.h -// GFX12: v_cmpx_lg_f16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x85,0xd4,0xff,0x05,0x02,0x00] +// GFX12: v_cmpx_lg_f16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x85,0xd4,0xff,0x05,0x02,0x00] v_cmpx_lg_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_lg_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x85,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -506,7 +506,7 @@ v_cmpx_lg_f16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_lg_f16_e64_dpp v255.l, v2.l 
quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_lt_f16 v1.h, v255.h -// GFX12: v_cmpx_lt_f16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x81,0xd4,0x01,0xff,0x03,0x00] +// GFX12: v_cmpx_lt_f16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x81,0xd4,0x01,0xff,0x03,0x00] v_cmpx_lt_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_lt_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x81,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -524,7 +524,7 @@ v_cmpx_lt_f16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_lt_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_lt_f16 v255.h, v2.h -// GFX12: v_cmpx_lt_f16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x81,0xd4,0xff,0x05,0x02,0x00] +// GFX12: v_cmpx_lt_f16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x81,0xd4,0xff,0x05,0x02,0x00] v_cmpx_lt_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_lt_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x81,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -542,7 +542,7 @@ v_cmpx_lt_f16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_lt_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_lt_i16 v1.h, v255.h -// GFX12: v_cmpx_lt_i16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xb1,0xd4,0x01,0xff,0x03,0x00] +// GFX12: v_cmpx_lt_i16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xb1,0xd4,0x01,0xff,0x03,0x00] v_cmpx_lt_i16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_lt_i16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb1,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -560,7 +560,7 @@ v_cmpx_lt_i16 v1.l, v255.l 
quad_perm:[3,2,1,0] // GFX12: v_cmpx_lt_i16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_lt_i16 v255.h, v2.h -// GFX12: v_cmpx_lt_i16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xb1,0xd4,0xff,0x05,0x02,0x00] +// GFX12: v_cmpx_lt_i16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xb1,0xd4,0xff,0x05,0x02,0x00] v_cmpx_lt_i16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_lt_i16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb1,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -578,7 +578,7 @@ v_cmpx_lt_i16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_lt_i16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_lt_u16 v1.h, v255.h -// GFX12: v_cmpx_lt_u16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xb9,0xd4,0x01,0xff,0x03,0x00] +// GFX12: v_cmpx_lt_u16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xb9,0xd4,0x01,0xff,0x03,0x00] v_cmpx_lt_u16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_lt_u16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb9,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -596,7 +596,7 @@ v_cmpx_lt_u16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_lt_u16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_lt_u16 v255.h, v2.h -// GFX12: v_cmpx_lt_u16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xb9,0xd4,0xff,0x05,0x02,0x00] +// GFX12: v_cmpx_lt_u16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xb9,0xd4,0xff,0x05,0x02,0x00] v_cmpx_lt_u16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_lt_u16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x18,0xb9,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -614,7 +614,7 @@ v_cmpx_lt_u16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_lt_u16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_ne_i16 v1.h, v255.h -// GFX12: v_cmpx_ne_i16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xb5,0xd4,0x01,0xff,0x03,0x00] +// GFX12: v_cmpx_ne_i16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xb5,0xd4,0x01,0xff,0x03,0x00] v_cmpx_ne_i16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_ne_i16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb5,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -632,7 +632,7 @@ v_cmpx_ne_i16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_ne_i16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_ne_i16 v255.h, v2.h -// GFX12: v_cmpx_ne_i16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xb5,0xd4,0xff,0x05,0x02,0x00] +// GFX12: v_cmpx_ne_i16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xb5,0xd4,0xff,0x05,0x02,0x00] v_cmpx_ne_i16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_ne_i16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb5,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -650,7 +650,7 @@ v_cmpx_ne_i16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_ne_i16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_ne_u16 v1.h, v255.h -// GFX12: v_cmpx_ne_u16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xbd,0xd4,0x01,0xff,0x03,0x00] +// GFX12: v_cmpx_ne_u16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xbd,0xd4,0x01,0xff,0x03,0x00] v_cmpx_ne_u16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_ne_u16_e64_dpp v1.h, 
v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbd,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -668,7 +668,7 @@ v_cmpx_ne_u16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_ne_u16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_ne_u16 v255.h, v2.h -// GFX12: v_cmpx_ne_u16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xbd,0xd4,0xff,0x05,0x02,0x00] +// GFX12: v_cmpx_ne_u16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xbd,0xd4,0xff,0x05,0x02,0x00] v_cmpx_ne_u16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_ne_u16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbd,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -686,7 +686,7 @@ v_cmpx_ne_u16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_ne_u16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_neq_f16 v1.h, v255.h -// GFX12: v_cmpx_neq_f16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x8d,0xd4,0x01,0xff,0x03,0x00] +// GFX12: v_cmpx_neq_f16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x8d,0xd4,0x01,0xff,0x03,0x00] v_cmpx_neq_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_neq_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8d,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -704,7 +704,7 @@ v_cmpx_neq_f16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_neq_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_neq_f16 v255.h, v2.h -// GFX12: v_cmpx_neq_f16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x8d,0xd4,0xff,0x05,0x02,0x00] +// GFX12: v_cmpx_neq_f16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x8d,0xd4,0xff,0x05,0x02,0x00] v_cmpx_neq_f16 v255.h, 
v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_neq_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8d,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -722,7 +722,7 @@ v_cmpx_neq_f16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_neq_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_nge_f16 v1.h, v255.h -// GFX12: v_cmpx_nge_f16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x89,0xd4,0x01,0xff,0x03,0x00] +// GFX12: v_cmpx_nge_f16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x89,0xd4,0x01,0xff,0x03,0x00] v_cmpx_nge_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_nge_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x89,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -740,7 +740,7 @@ v_cmpx_nge_f16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_nge_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_nge_f16 v255.h, v2.h -// GFX12: v_cmpx_nge_f16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x89,0xd4,0xff,0x05,0x02,0x00] +// GFX12: v_cmpx_nge_f16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x89,0xd4,0xff,0x05,0x02,0x00] v_cmpx_nge_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_nge_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x89,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -758,7 +758,7 @@ v_cmpx_nge_f16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_nge_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_ngt_f16 v1.h, v255.h -// GFX12: v_cmpx_ngt_f16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x8b,0xd4,0x01,0xff,0x03,0x00] +// GFX12: v_cmpx_ngt_f16_e64 v1.h, v255.h op_sel:[1,1] 
; encoding: [0x7e,0x18,0x8b,0xd4,0x01,0xff,0x03,0x00] v_cmpx_ngt_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_ngt_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8b,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -776,7 +776,7 @@ v_cmpx_ngt_f16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_ngt_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_ngt_f16 v255.h, v2.h -// GFX12: v_cmpx_ngt_f16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x8b,0xd4,0xff,0x05,0x02,0x00] +// GFX12: v_cmpx_ngt_f16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x8b,0xd4,0xff,0x05,0x02,0x00] v_cmpx_ngt_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_ngt_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8b,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -794,7 +794,7 @@ v_cmpx_ngt_f16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_ngt_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_nle_f16 v1.h, v255.h -// GFX12: v_cmpx_nle_f16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x8c,0xd4,0x01,0xff,0x03,0x00] +// GFX12: v_cmpx_nle_f16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x8c,0xd4,0x01,0xff,0x03,0x00] v_cmpx_nle_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_nle_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8c,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -812,7 +812,7 @@ v_cmpx_nle_f16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_nle_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_nle_f16 v255.h, v2.h -// GFX12: v_cmpx_nle_f16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: 
[0x7e,0x18,0x8c,0xd4,0xff,0x05,0x02,0x00] +// GFX12: v_cmpx_nle_f16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x8c,0xd4,0xff,0x05,0x02,0x00] v_cmpx_nle_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_nle_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8c,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -830,7 +830,7 @@ v_cmpx_nle_f16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_nle_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_nlg_f16 v1.h, v255.h -// GFX12: v_cmpx_nlg_f16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x8a,0xd4,0x01,0xff,0x03,0x00] +// GFX12: v_cmpx_nlg_f16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x8a,0xd4,0x01,0xff,0x03,0x00] v_cmpx_nlg_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_nlg_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8a,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -848,7 +848,7 @@ v_cmpx_nlg_f16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_nlg_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_nlg_f16 v255.h, v2.h -// GFX12: v_cmpx_nlg_f16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x8a,0xd4,0xff,0x05,0x02,0x00] +// GFX12: v_cmpx_nlg_f16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x8a,0xd4,0xff,0x05,0x02,0x00] v_cmpx_nlg_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_nlg_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8a,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -866,7 +866,7 @@ v_cmpx_nlg_f16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_nlg_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_nlt_f16 v1.h, 
v255.h -// GFX12: v_cmpx_nlt_f16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x8e,0xd4,0x01,0xff,0x03,0x00] +// GFX12: v_cmpx_nlt_f16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x8e,0xd4,0x01,0xff,0x03,0x00] v_cmpx_nlt_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_nlt_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8e,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -884,7 +884,7 @@ v_cmpx_nlt_f16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_nlt_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_nlt_f16 v255.h, v2.h -// GFX12: v_cmpx_nlt_f16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x8e,0xd4,0xff,0x05,0x02,0x00] +// GFX12: v_cmpx_nlt_f16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x8e,0xd4,0xff,0x05,0x02,0x00] v_cmpx_nlt_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_nlt_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8e,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -902,7 +902,7 @@ v_cmpx_nlt_f16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_nlt_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_o_f16 v1.h, v255.h -// GFX12: v_cmpx_o_f16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x87,0xd4,0x01,0xff,0x03,0x00] +// GFX12: v_cmpx_o_f16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x87,0xd4,0x01,0xff,0x03,0x00] v_cmpx_o_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_o_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x87,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -920,7 +920,7 @@ v_cmpx_o_f16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_o_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x87,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_o_f16 v255.h, v2.h -// GFX12: v_cmpx_o_f16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x87,0xd4,0xff,0x05,0x02,0x00] +// GFX12: v_cmpx_o_f16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x87,0xd4,0xff,0x05,0x02,0x00] v_cmpx_o_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_o_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x87,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] @@ -938,7 +938,7 @@ v_cmpx_o_f16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_o_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_u_f16 v1.h, v255.h -// GFX12: v_cmpx_u_f16_e64 v1.h, v255.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x88,0xd4,0x01,0xff,0x03,0x00] +// GFX12: v_cmpx_u_f16_e64 v1.h, v255.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x88,0xd4,0x01,0xff,0x03,0x00] v_cmpx_u_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_u_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x88,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] @@ -956,7 +956,7 @@ v_cmpx_u_f16 v1.l, v255.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_u_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_cmpx_u_f16 v255.h, v2.h -// GFX12: v_cmpx_u_f16_e64 v255.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0x88,0xd4,0xff,0x05,0x02,0x00] +// GFX12: v_cmpx_u_f16_e64 v255.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0x88,0xd4,0xff,0x05,0x02,0x00] v_cmpx_u_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_u_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x88,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopcx.txt 
b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopcx.txt index 40c34708d863e..73da324785749 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopcx.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopcx.txt @@ -58,15 +58,15 @@ # GFX11: v_cmpx_class_f16_e64 -|0xfe0b|, vcc_hi ; encoding: [0x7e,0x01,0xfd,0xd4,0xff,0xd6,0x00,0x20,0x0b,0xfe,0x00,0x00] 0x7e,0x18,0xfd,0xd4,0x01,0x05,0x02,0x00 -# GFX11-REAL16: v_cmpx_class_f16_e64 v1.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xfd,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_class_f16_e64 v1.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xfd,0xd4,0x01,0x05,0x02,0x00] # GFX11-FAKE16: v_cmpx_class_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x08,0xfd,0xd4,0xff,0x05,0x02,0x00 -# GFX11-REAL16: v_cmpx_class_f16_e64 v255.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xfd,0xd4,0xff,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_class_f16_e64 v255.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xfd,0xd4,0xff,0x05,0x02,0x00] # GFX11-FAKE16: v_cmpx_class_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0xff,0x05,0x02,0x00] 0x7e,0x10,0xfd,0xd4,0x69,0xfe,0x03,0x00 -# GFX11-REAL16: v_cmpx_class_f16_e64 s105, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xfd,0xd4,0x69,0xfe,0x03,0x00] +# GFX11-REAL16: v_cmpx_class_f16_e64 s105, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xfd,0xd4,0x69,0xfe,0x03,0x00] # GFX11-FAKE16: v_cmpx_class_f16_e64 s105, v255 ; encoding: [0x7e,0x00,0xfd,0xd4,0x69,0xfe,0x03,0x00] 0x7e,0x00,0xfe,0xd4,0x01,0x05,0x02,0x00 @@ -207,11 +207,11 @@ # GFX11: v_cmpx_eq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x82,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0x82,0xd4,0x01,0x05,0x02,0x00 -# GFX11-REAL16: v_cmpx_eq_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x82,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_eq_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x82,0xd4,0x01,0x05,0x02,0x00] # GFX11-FAKE16: 
v_cmpx_eq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x82,0xd4,0xff,0xff,0x03,0x00 -# GFX11-REAL16: v_cmpx_eq_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x82,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_eq_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x82,0xd4,0xff,0xff,0x03,0x00] # GFX11-FAKE16: v_cmpx_eq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x82,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x92,0xd4,0x01,0x05,0x02,0x00 @@ -343,11 +343,11 @@ # GFX11: v_cmpx_eq_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb2,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0xb2,0xd4,0x01,0x05,0x02,0x00 -# GFX11-REAL16: v_cmpx_eq_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb2,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_eq_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb2,0xd4,0x01,0x05,0x02,0x00] # GFX11-FAKE16: v_cmpx_eq_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb2,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0xb2,0xd4,0xff,0xff,0x03,0x00 -# GFX11-REAL16: v_cmpx_eq_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb2,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_eq_i16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xb2,0xd4,0xff,0xff,0x03,0x00] # GFX11-FAKE16: v_cmpx_eq_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb2,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xc2,0xd4,0x01,0x05,0x02,0x00 @@ -479,11 +479,11 @@ # GFX11: v_cmpx_eq_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0xba,0xd4,0x01,0x05,0x02,0x00 -# GFX11-REAL16: v_cmpx_eq_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xba,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_eq_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xba,0xd4,0x01,0x05,0x02,0x00] # GFX11-FAKE16: v_cmpx_eq_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xba,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0xba,0xd4,0xff,0xff,0x03,0x00 -# GFX11-REAL16: v_cmpx_eq_u16_e64 v255.l, 
v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xba,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_eq_u16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xba,0xd4,0xff,0xff,0x03,0x00] # GFX11-FAKE16: v_cmpx_eq_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xca,0xd4,0x01,0x05,0x02,0x00 @@ -615,11 +615,11 @@ # GFX11: v_cmpx_f_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x80,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0x80,0xd4,0x01,0x05,0x02,0x00 -# GFX11-REAL16: v_cmpx_f_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x80,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_f_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x80,0xd4,0x01,0x05,0x02,0x00] # GFX11-FAKE16: v_cmpx_f_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x80,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x80,0xd4,0xff,0xff,0x03,0x00 -# GFX11-REAL16: v_cmpx_f_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x80,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_f_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x80,0xd4,0xff,0xff,0x03,0x00] # GFX11-FAKE16: v_cmpx_f_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x80,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x90,0xd4,0x01,0x05,0x02,0x00 @@ -913,11 +913,11 @@ # GFX11: v_cmpx_ge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x86,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0x86,0xd4,0x01,0x05,0x02,0x00 -# GFX11-REAL16: v_cmpx_ge_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x86,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_ge_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x86,0xd4,0x01,0x05,0x02,0x00] # GFX11-FAKE16: v_cmpx_ge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x86,0xd4,0xff,0xff,0x03,0x00 -# GFX11-REAL16: v_cmpx_ge_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x86,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_ge_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: 
[0x7e,0x10,0x86,0xd4,0xff,0xff,0x03,0x00] # GFX11-FAKE16: v_cmpx_ge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x86,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x96,0xd4,0x01,0x05,0x02,0x00 @@ -1049,11 +1049,11 @@ # GFX11: v_cmpx_ge_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0xb6,0xd4,0x01,0x05,0x02,0x00 -# GFX11-REAL16: v_cmpx_ge_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb6,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_ge_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb6,0xd4,0x01,0x05,0x02,0x00] # GFX11-FAKE16: v_cmpx_ge_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb6,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0xb6,0xd4,0xff,0xff,0x03,0x00 -# GFX11-REAL16: v_cmpx_ge_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb6,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_ge_i16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xb6,0xd4,0xff,0xff,0x03,0x00] # GFX11-FAKE16: v_cmpx_ge_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xc6,0xd4,0x01,0x05,0x02,0x00 @@ -1185,11 +1185,11 @@ # GFX11: v_cmpx_ge_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0xbe,0xd4,0x01,0x05,0x02,0x00 -# GFX11-REAL16: v_cmpx_ge_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xbe,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_ge_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xbe,0xd4,0x01,0x05,0x02,0x00] # GFX11-FAKE16: v_cmpx_ge_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbe,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0xbe,0xd4,0xff,0xff,0x03,0x00 -# GFX11-REAL16: v_cmpx_ge_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xbe,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_ge_u16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xbe,0xd4,0xff,0xff,0x03,0x00] # GFX11-FAKE16: v_cmpx_ge_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0xff,0x03,0x00] 
0x7e,0x00,0xce,0xd4,0x01,0x05,0x02,0x00 @@ -1321,11 +1321,11 @@ # GFX11: v_cmpx_gt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x84,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0x84,0xd4,0x01,0x05,0x02,0x00 -# GFX11-REAL16: v_cmpx_gt_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x84,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_gt_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x84,0xd4,0x01,0x05,0x02,0x00] # GFX11-FAKE16: v_cmpx_gt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x84,0xd4,0xff,0xff,0x03,0x00 -# GFX11-REAL16: v_cmpx_gt_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x84,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_gt_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x84,0xd4,0xff,0xff,0x03,0x00] # GFX11-FAKE16: v_cmpx_gt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x84,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x94,0xd4,0x01,0x05,0x02,0x00 @@ -1457,11 +1457,11 @@ # GFX11: v_cmpx_gt_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb4,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0xb4,0xd4,0x01,0x05,0x02,0x00 -# GFX11-REAL16: v_cmpx_gt_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb4,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_gt_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb4,0xd4,0x01,0x05,0x02,0x00] # GFX11-FAKE16: v_cmpx_gt_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb4,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0xb4,0xd4,0xff,0xff,0x03,0x00 -# GFX11-REAL16: v_cmpx_gt_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb4,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_gt_i16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xb4,0xd4,0xff,0xff,0x03,0x00] # GFX11-FAKE16: v_cmpx_gt_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb4,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xc4,0xd4,0x01,0x05,0x02,0x00 @@ -1593,11 +1593,11 @@ # GFX11: v_cmpx_gt_u16_e64 0xfe0b, vcc_hi ; encoding: 
[0x7e,0x00,0xbc,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0xbc,0xd4,0x01,0x05,0x02,0x00 -# GFX11-REAL16: v_cmpx_gt_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xbc,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_gt_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xbc,0xd4,0x01,0x05,0x02,0x00] # GFX11-FAKE16: v_cmpx_gt_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbc,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0xbc,0xd4,0xff,0xff,0x03,0x00 -# GFX11-REAL16: v_cmpx_gt_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xbc,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_gt_u16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xbc,0xd4,0xff,0xff,0x03,0x00] # GFX11-FAKE16: v_cmpx_gt_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbc,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xcc,0xd4,0x01,0x05,0x02,0x00 @@ -1729,11 +1729,11 @@ # GFX11: v_cmpx_le_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x83,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0x83,0xd4,0x01,0x05,0x02,0x00 -# GFX11-REAL16: v_cmpx_le_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x83,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_le_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x83,0xd4,0x01,0x05,0x02,0x00] # GFX11-FAKE16: v_cmpx_le_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x83,0xd4,0xff,0xff,0x03,0x00 -# GFX11-REAL16: v_cmpx_le_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x83,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_le_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x83,0xd4,0xff,0xff,0x03,0x00] # GFX11-FAKE16: v_cmpx_le_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x83,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x93,0xd4,0x01,0x05,0x02,0x00 @@ -1865,11 +1865,11 @@ # GFX11: v_cmpx_le_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb3,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0xb3,0xd4,0x01,0x05,0x02,0x00 -# GFX11-REAL16: v_cmpx_le_i16_e64 v1.h, v2.l 
op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb3,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_le_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb3,0xd4,0x01,0x05,0x02,0x00] # GFX11-FAKE16: v_cmpx_le_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb3,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0xb3,0xd4,0xff,0xff,0x03,0x00 -# GFX11-REAL16: v_cmpx_le_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb3,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_le_i16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xb3,0xd4,0xff,0xff,0x03,0x00] # GFX11-FAKE16: v_cmpx_le_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb3,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xc3,0xd4,0x01,0x05,0x02,0x00 @@ -2001,11 +2001,11 @@ # GFX11: v_cmpx_le_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0xbb,0xd4,0x01,0x05,0x02,0x00 -# GFX11-REAL16: v_cmpx_le_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xbb,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_le_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xbb,0xd4,0x01,0x05,0x02,0x00] # GFX11-FAKE16: v_cmpx_le_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbb,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0xbb,0xd4,0xff,0xff,0x03,0x00 -# GFX11-REAL16: v_cmpx_le_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xbb,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_le_u16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xbb,0xd4,0xff,0xff,0x03,0x00] # GFX11-FAKE16: v_cmpx_le_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xcb,0xd4,0x01,0x05,0x02,0x00 @@ -2137,11 +2137,11 @@ # GFX11: v_cmpx_lg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x85,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0x85,0xd4,0x01,0x05,0x02,0x00 -# GFX11-REAL16: v_cmpx_lg_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x85,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_lg_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: 
[0x7e,0x08,0x85,0xd4,0x01,0x05,0x02,0x00] # GFX11-FAKE16: v_cmpx_lg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x85,0xd4,0xff,0xff,0x03,0x00 -# GFX11-REAL16: v_cmpx_lg_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x85,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_lg_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x85,0xd4,0xff,0xff,0x03,0x00] # GFX11-FAKE16: v_cmpx_lg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x85,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x95,0xd4,0x01,0x05,0x02,0x00 @@ -2273,11 +2273,11 @@ # GFX11: v_cmpx_lt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x81,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0x81,0xd4,0x01,0x05,0x02,0x00 -# GFX11-REAL16: v_cmpx_lt_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x81,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_lt_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x81,0xd4,0x01,0x05,0x02,0x00] # GFX11-FAKE16: v_cmpx_lt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x81,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x81,0xd4,0xff,0xff,0x03,0x00 -# GFX11-REAL16: v_cmpx_lt_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x81,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_lt_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x81,0xd4,0xff,0xff,0x03,0x00] # GFX11-FAKE16: v_cmpx_lt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x81,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x91,0xd4,0x01,0x05,0x02,0x00 @@ -2409,11 +2409,11 @@ # GFX11: v_cmpx_lt_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0xb1,0xd4,0x01,0x05,0x02,0x00 -# GFX11-REAL16: v_cmpx_lt_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb1,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_lt_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb1,0xd4,0x01,0x05,0x02,0x00] # GFX11-FAKE16: v_cmpx_lt_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb1,0xd4,0x01,0x05,0x02,0x00] 
0x7e,0x10,0xb1,0xd4,0xff,0xff,0x03,0x00 -# GFX11-REAL16: v_cmpx_lt_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb1,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_lt_i16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xb1,0xd4,0xff,0xff,0x03,0x00] # GFX11-FAKE16: v_cmpx_lt_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xc1,0xd4,0x01,0x05,0x02,0x00 @@ -2545,11 +2545,11 @@ # GFX11: v_cmpx_lt_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb9,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0xb9,0xd4,0x01,0x05,0x02,0x00 -# GFX11-REAL16: v_cmpx_lt_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb9,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_lt_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb9,0xd4,0x01,0x05,0x02,0x00] # GFX11-FAKE16: v_cmpx_lt_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb9,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0xb9,0xd4,0xff,0xff,0x03,0x00 -# GFX11-REAL16: v_cmpx_lt_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb9,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_lt_u16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xb9,0xd4,0xff,0xff,0x03,0x00] # GFX11-FAKE16: v_cmpx_lt_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb9,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xc9,0xd4,0x01,0x05,0x02,0x00 @@ -2681,11 +2681,11 @@ # GFX11: v_cmpx_ne_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb5,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0xb5,0xd4,0x01,0x05,0x02,0x00 -# GFX11-REAL16: v_cmpx_ne_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb5,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_ne_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb5,0xd4,0x01,0x05,0x02,0x00] # GFX11-FAKE16: v_cmpx_ne_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb5,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0xb5,0xd4,0xff,0xff,0x03,0x00 -# GFX11-REAL16: v_cmpx_ne_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb5,0xd4,0xff,0xff,0x03,0x00] +# 
GFX11-REAL16: v_cmpx_ne_i16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xb5,0xd4,0xff,0xff,0x03,0x00] # GFX11-FAKE16: v_cmpx_ne_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb5,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xc5,0xd4,0x01,0x05,0x02,0x00 @@ -2817,11 +2817,11 @@ # GFX11: v_cmpx_ne_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbd,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0xbd,0xd4,0x01,0x05,0x02,0x00 -# GFX11-REAL16: v_cmpx_ne_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xbd,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_ne_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xbd,0xd4,0x01,0x05,0x02,0x00] # GFX11-FAKE16: v_cmpx_ne_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbd,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0xbd,0xd4,0xff,0xff,0x03,0x00 -# GFX11-REAL16: v_cmpx_ne_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xbd,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_ne_u16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xbd,0xd4,0xff,0xff,0x03,0x00] # GFX11-FAKE16: v_cmpx_ne_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbd,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xcd,0xd4,0x01,0x05,0x02,0x00 @@ -2953,11 +2953,11 @@ # GFX11: v_cmpx_neq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8d,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0x8d,0xd4,0x01,0x05,0x02,0x00 -# GFX11-REAL16: v_cmpx_neq_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8d,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_neq_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x8d,0xd4,0x01,0x05,0x02,0x00] # GFX11-FAKE16: v_cmpx_neq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x8d,0xd4,0xff,0xff,0x03,0x00 -# GFX11-REAL16: v_cmpx_neq_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8d,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_neq_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x8d,0xd4,0xff,0xff,0x03,0x00] # GFX11-FAKE16: 
v_cmpx_neq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8d,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x9d,0xd4,0x01,0x05,0x02,0x00 @@ -3089,11 +3089,11 @@ # GFX11: v_cmpx_nge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x89,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0x89,0xd4,0x01,0x05,0x02,0x00 -# GFX11-REAL16: v_cmpx_nge_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x89,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_nge_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x89,0xd4,0x01,0x05,0x02,0x00] # GFX11-FAKE16: v_cmpx_nge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x89,0xd4,0xff,0xff,0x03,0x00 -# GFX11-REAL16: v_cmpx_nge_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x89,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_nge_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x89,0xd4,0xff,0xff,0x03,0x00] # GFX11-FAKE16: v_cmpx_nge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x89,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x99,0xd4,0x01,0x05,0x02,0x00 @@ -3225,11 +3225,11 @@ # GFX11: v_cmpx_ngt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8b,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0x8b,0xd4,0x01,0x05,0x02,0x00 -# GFX11-REAL16: v_cmpx_ngt_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8b,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_ngt_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x8b,0xd4,0x01,0x05,0x02,0x00] # GFX11-FAKE16: v_cmpx_ngt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x8b,0xd4,0xff,0xff,0x03,0x00 -# GFX11-REAL16: v_cmpx_ngt_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8b,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_ngt_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x8b,0xd4,0xff,0xff,0x03,0x00] # GFX11-FAKE16: v_cmpx_ngt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8b,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x9b,0xd4,0x01,0x05,0x02,0x00 
@@ -3361,11 +3361,11 @@ # GFX11: v_cmpx_nle_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8c,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0x8c,0xd4,0x01,0x05,0x02,0x00 -# GFX11-REAL16: v_cmpx_nle_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8c,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_nle_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x8c,0xd4,0x01,0x05,0x02,0x00] # GFX11-FAKE16: v_cmpx_nle_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x8c,0xd4,0xff,0xff,0x03,0x00 -# GFX11-REAL16: v_cmpx_nle_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8c,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_nle_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x8c,0xd4,0xff,0xff,0x03,0x00] # GFX11-FAKE16: v_cmpx_nle_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8c,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x9c,0xd4,0x01,0x05,0x02,0x00 @@ -3497,11 +3497,11 @@ # GFX11: v_cmpx_nlg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8a,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0x8a,0xd4,0x01,0x05,0x02,0x00 -# GFX11-REAL16: v_cmpx_nlg_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8a,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_nlg_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x8a,0xd4,0x01,0x05,0x02,0x00] # GFX11-FAKE16: v_cmpx_nlg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x8a,0xd4,0xff,0xff,0x03,0x00 -# GFX11-REAL16: v_cmpx_nlg_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8a,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_nlg_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x8a,0xd4,0xff,0xff,0x03,0x00] # GFX11-FAKE16: v_cmpx_nlg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8a,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x9a,0xd4,0x01,0x05,0x02,0x00 @@ -3633,11 +3633,11 @@ # GFX11: v_cmpx_nlt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: 
[0x7e,0x83,0x8e,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0x8e,0xd4,0x01,0x05,0x02,0x00 -# GFX11-REAL16: v_cmpx_nlt_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8e,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_nlt_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x8e,0xd4,0x01,0x05,0x02,0x00] # GFX11-FAKE16: v_cmpx_nlt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x8e,0xd4,0xff,0xff,0x03,0x00 -# GFX11-REAL16: v_cmpx_nlt_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8e,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_nlt_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x8e,0xd4,0xff,0xff,0x03,0x00] # GFX11-FAKE16: v_cmpx_nlt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8e,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x9e,0xd4,0x01,0x05,0x02,0x00 @@ -3769,11 +3769,11 @@ # GFX11: v_cmpx_o_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x87,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0x87,0xd4,0x01,0x05,0x02,0x00 -# GFX11-REAL16: v_cmpx_o_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x87,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_o_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x87,0xd4,0x01,0x05,0x02,0x00] # GFX11-FAKE16: v_cmpx_o_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x87,0xd4,0xff,0xff,0x03,0x00 -# GFX11-REAL16: v_cmpx_o_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x87,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_o_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x87,0xd4,0xff,0xff,0x03,0x00] # GFX11-FAKE16: v_cmpx_o_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x87,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x97,0xd4,0x01,0x05,0x02,0x00 @@ -3905,11 +3905,11 @@ # GFX11: v_cmpx_t_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8f,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0x8f,0xd4,0x01,0x05,0x02,0x00 -# GFX11-REAL16: v_cmpx_t_f16_e64 v1.h, 
v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8f,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_t_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x8f,0xd4,0x01,0x05,0x02,0x00] # GFX11-FAKE16: v_cmpx_t_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8f,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x8f,0xd4,0xff,0xff,0x03,0x00 -# GFX11-REAL16: v_cmpx_t_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8f,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_t_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x8f,0xd4,0xff,0xff,0x03,0x00] # GFX11-FAKE16: v_cmpx_t_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8f,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x9f,0xd4,0x01,0x05,0x02,0x00 @@ -4203,11 +4203,11 @@ # GFX11: v_cmpx_u_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x88,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0x88,0xd4,0x01,0x05,0x02,0x00 -# GFX11-REAL16: v_cmpx_u_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x88,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_u_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x88,0xd4,0x01,0x05,0x02,0x00] # GFX11-FAKE16: v_cmpx_u_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x88,0xd4,0xff,0xff,0x03,0x00 -# GFX11-REAL16: v_cmpx_u_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x88,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_u_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x88,0xd4,0xff,0xff,0x03,0x00] # GFX11-FAKE16: v_cmpx_u_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x88,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x98,0xd4,0x01,0x05,0x02,0x00 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt index ab2d154e9ef9f..8001267eb6a86 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt @@ -54,15 +54,15 @@ # GFX12: v_cmpx_class_f16_e64 -|0xfe0b|, vcc_hi ; encoding: 
[0x7e,0x01,0xfd,0xd4,0xff,0xd6,0x00,0x20,0x0b,0xfe,0x00,0x00] 0x7e,0x18,0xfd,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_class_f16_e64 v1.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xfd,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_class_f16_e64 v1.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xfd,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_class_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x08,0xfd,0xd4,0xff,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_class_f16_e64 v255.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xfd,0xd4,0xff,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_class_f16_e64 v255.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xfd,0xd4,0xff,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_class_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0xff,0x05,0x02,0x00] 0x7e,0x10,0xfd,0xd4,0x69,0xfe,0x03,0x00 -# GFX12-REAL16: v_cmpx_class_f16_e64 s105, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xfd,0xd4,0x69,0xfe,0x03,0x00] +# GFX12-REAL16: v_cmpx_class_f16_e64 s105, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xfd,0xd4,0x69,0xfe,0x03,0x00] # GFX12-FAKE16: v_cmpx_class_f16_e64 s105, v255 ; encoding: [0x7e,0x00,0xfd,0xd4,0x69,0xfe,0x03,0x00] 0x7e,0x00,0xfe,0xd4,0x01,0x05,0x02,0x00 @@ -203,11 +203,11 @@ # GFX12: v_cmpx_eq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x82,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0x82,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_eq_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x82,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_eq_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x82,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_eq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x82,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_eq_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x82,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_eq_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: 
[0x7e,0x10,0x82,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_eq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x82,0xd4,0xff,0xff,0x03,0x00] @@ -340,11 +340,11 @@ # GFX12: v_cmpx_eq_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb2,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0xb2,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_eq_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb2,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_eq_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb2,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_eq_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb2,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0xb2,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_eq_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb2,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_eq_i16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xb2,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_eq_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb2,0xd4,0xff,0xff,0x03,0x00] @@ -477,11 +477,11 @@ # GFX12: v_cmpx_eq_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0xba,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_eq_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xba,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_eq_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xba,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_eq_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xba,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0xba,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_eq_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xba,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_eq_u16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xba,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_eq_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0xff,0x03,0x00] @@ -614,11 +614,11 @@ # GFX12: v_cmpx_ge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: 
[0x7e,0x83,0x86,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0x86,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_ge_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x86,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_ge_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x86,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_ge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x86,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_ge_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x86,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_ge_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x86,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_ge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x86,0xd4,0xff,0xff,0x03,0x00] @@ -751,11 +751,11 @@ # GFX12: v_cmpx_ge_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0xb6,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_ge_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb6,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_ge_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb6,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_ge_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb6,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0xb6,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_ge_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb6,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_ge_i16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xb6,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_ge_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0xff,0x03,0x00] @@ -888,11 +888,11 @@ # GFX12: v_cmpx_ge_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0xbe,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_ge_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xbe,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: 
v_cmpx_ge_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xbe,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_ge_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbe,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0xbe,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_ge_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xbe,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_ge_u16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xbe,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_ge_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0xff,0x03,0x00] @@ -1025,11 +1025,11 @@ # GFX12: v_cmpx_gt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x84,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0x84,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_gt_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x84,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_gt_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x84,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_gt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x84,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_gt_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x84,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_gt_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x84,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_gt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x84,0xd4,0xff,0xff,0x03,0x00] @@ -1162,11 +1162,11 @@ # GFX12: v_cmpx_gt_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb4,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0xb4,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_gt_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb4,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_gt_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb4,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_gt_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb4,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0xb4,0xd4,0xff,0xff,0x03,0x00 
-# GFX12-REAL16: v_cmpx_gt_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb4,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_gt_i16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xb4,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_gt_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb4,0xd4,0xff,0xff,0x03,0x00] @@ -1299,11 +1299,11 @@ # GFX12: v_cmpx_gt_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbc,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0xbc,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_gt_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xbc,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_gt_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xbc,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_gt_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbc,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0xbc,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_gt_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xbc,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_gt_u16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xbc,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_gt_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbc,0xd4,0xff,0xff,0x03,0x00] @@ -1436,11 +1436,11 @@ # GFX12: v_cmpx_le_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x83,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0x83,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_le_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x83,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_le_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x83,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_le_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x83,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_le_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x83,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_le_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: 
[0x7e,0x10,0x83,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_le_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x83,0xd4,0xff,0xff,0x03,0x00] @@ -1573,11 +1573,11 @@ # GFX12: v_cmpx_le_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb3,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0xb3,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_le_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb3,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_le_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb3,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_le_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb3,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0xb3,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_le_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb3,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_le_i16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xb3,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_le_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb3,0xd4,0xff,0xff,0x03,0x00] @@ -1710,11 +1710,11 @@ # GFX12: v_cmpx_le_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0xbb,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_le_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xbb,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_le_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xbb,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_le_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbb,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0xbb,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_le_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xbb,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_le_u16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xbb,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_le_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0xff,0x03,0x00] @@ -1847,11 +1847,11 @@ # GFX12: v_cmpx_lg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: 
[0x7e,0x83,0x85,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0x85,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_lg_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x85,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_lg_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x85,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_lg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x85,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_lg_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x85,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_lg_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x85,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_lg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x85,0xd4,0xff,0xff,0x03,0x00] @@ -1984,11 +1984,11 @@ # GFX12: v_cmpx_lt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x81,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0x81,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_lt_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x81,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_lt_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x81,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_lt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x81,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x81,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_lt_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x81,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_lt_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x81,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_lt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x81,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x91,0xd4,0x01,0x05,0x02,0x00 @@ -2120,11 +2120,11 @@ # GFX12: v_cmpx_lt_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0xb1,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_lt_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: 
[0x7e,0x08,0xb1,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_lt_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb1,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_lt_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb1,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0xb1,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_lt_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb1,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_lt_i16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xb1,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_lt_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0xff,0x03,0x00] @@ -2257,11 +2257,11 @@ # GFX12: v_cmpx_lt_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb9,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0xb9,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_lt_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb9,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_lt_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb9,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_lt_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb9,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0xb9,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_lt_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb9,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_lt_u16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xb9,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_lt_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb9,0xd4,0xff,0xff,0x03,0x00] @@ -2394,11 +2394,11 @@ # GFX12: v_cmpx_ne_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb5,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0xb5,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_ne_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb5,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_ne_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb5,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_ne_i16_e64 v1, v2 ; encoding: 
[0x7e,0x00,0xb5,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0xb5,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_ne_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb5,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_ne_i16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xb5,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_ne_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb5,0xd4,0xff,0xff,0x03,0x00] @@ -2531,11 +2531,11 @@ # GFX12: v_cmpx_ne_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbd,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0xbd,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_ne_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xbd,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_ne_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xbd,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_ne_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbd,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0xbd,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_ne_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xbd,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_ne_u16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xbd,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_ne_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbd,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xcd,0xd4,0x01,0x05,0x02,0x00 @@ -2667,11 +2667,11 @@ # GFX12: v_cmpx_neq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8d,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0x8d,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_neq_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8d,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_neq_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x8d,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_neq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x8d,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_neq_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: 
[0x7e,0x10,0x8d,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_neq_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x8d,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_neq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8d,0xd4,0xff,0xff,0x03,0x00] @@ -2804,11 +2804,11 @@ # GFX12: v_cmpx_nge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x89,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0x89,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_nge_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x89,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_nge_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x89,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_nge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x89,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_nge_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x89,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_nge_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x89,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_nge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x89,0xd4,0xff,0xff,0x03,0x00] @@ -2941,11 +2941,11 @@ # GFX12: v_cmpx_ngt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8b,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0x8b,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_ngt_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8b,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_ngt_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x8b,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_ngt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x8b,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_ngt_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8b,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_ngt_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x8b,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_ngt_f16_e64 v255, v255 
; encoding: [0x7e,0x00,0x8b,0xd4,0xff,0xff,0x03,0x00] @@ -3078,11 +3078,11 @@ # GFX12: v_cmpx_nle_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8c,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0x8c,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_nle_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8c,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_nle_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x8c,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_nle_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x8c,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_nle_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8c,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_nle_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x8c,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_nle_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8c,0xd4,0xff,0xff,0x03,0x00] @@ -3215,11 +3215,11 @@ # GFX12: v_cmpx_nlg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8a,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0x8a,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_nlg_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8a,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_nlg_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x8a,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_nlg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x8a,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_nlg_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8a,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_nlg_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x8a,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_nlg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8a,0xd4,0xff,0xff,0x03,0x00] @@ -3352,11 +3352,11 @@ # GFX12: v_cmpx_nlt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: 
[0x7e,0x83,0x8e,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0x8e,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_nlt_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8e,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_nlt_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x8e,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_nlt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x8e,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_nlt_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8e,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_nlt_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x8e,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_nlt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8e,0xd4,0xff,0xff,0x03,0x00] @@ -3489,11 +3489,11 @@ # GFX12: v_cmpx_o_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x87,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0x87,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_o_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x87,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_o_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x87,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_o_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x87,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_o_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x87,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_o_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x87,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_o_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x87,0xd4,0xff,0xff,0x03,0x00] @@ -3626,11 +3626,11 @@ # GFX12: v_cmpx_u_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x88,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] 0x7e,0x08,0x88,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_u_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x88,0xd4,0x01,0x05,0x02,0x00] +# 
GFX12-REAL16: v_cmpx_u_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x88,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_u_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x88,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_u_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x88,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_u_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x88,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_u_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x88,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x98,0xd4,0x01,0x05,0x02,0x00 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt index f8ce4fafc0252..4a7a64d12fc72 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt @@ -65,11 +65,11 @@ # GFX12-FAKE16: v_cmpx_class_f16_e64_dpp -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x01,0xfd,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x0d,0x30] 0x7e,0x18,0xfd,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_class_f16_e64 v1.h, v2.h op_sel:[1,1,0] ; encoding: [0x7e,0x18,0xfd,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_class_f16_e64 v1.h, v2.h op_sel:[1,1] ; encoding: [0x7e,0x18,0xfd,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_class_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x08,0xfd,0xd4,0xff,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_class_f16_e64 v255.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xfd,0xd4,0xff,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_class_f16_e64 v255.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xfd,0xd4,0xff,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_class_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0xff,0x05,0x02,0x00] 0x7e,0x00,0xfd,0xd4,0x01,0x04,0x02,0x00 @@ -77,7 +77,7 @@ # GFX12-FAKE16: v_cmpx_class_f16_e64 s1, v2 ; encoding: 
[0x7e,0x00,0xfd,0xd4,0x01,0x04,0x02,0x00] 0x7e,0x10,0xfd,0xd4,0x69,0xfe,0x03,0x00 -# GFX12-REAL16: v_cmpx_class_f16_e64 s105, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xfd,0xd4,0x69,0xfe,0x03,0x00] +# GFX12-REAL16: v_cmpx_class_f16_e64 s105, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xfd,0xd4,0x69,0xfe,0x03,0x00] # GFX12-FAKE16: v_cmpx_class_f16_e64 s105, v255 ; encoding: [0x7e,0x00,0xfd,0xd4,0x69,0xfe,0x03,0x00] 0x7e,0x00,0xfd,0xd4,0x6a,0x04,0x00,0x00 @@ -231,11 +231,11 @@ # GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x08,0x82,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_eq_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x82,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_eq_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x82,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_eq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x82,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_eq_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x82,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_eq_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x82,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_eq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x82,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x82,0xd4,0x01,0x04,0x00,0x00 @@ -395,11 +395,11 @@ # GFX12-FAKE16: v_cmpx_eq_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x08,0xb2,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_eq_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb2,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_eq_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb2,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_eq_i16_e64 v1, v2 ; encoding: 
[0x7e,0x00,0xb2,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0xb2,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_eq_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb2,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_eq_i16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xb2,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_eq_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb2,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xb2,0xd4,0x01,0x04,0x00,0x00 @@ -562,11 +562,11 @@ # GFX12-FAKE16: v_cmpx_eq_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x08,0xba,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_eq_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xba,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_eq_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xba,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_eq_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xba,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0xba,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_eq_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xba,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_eq_u16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xba,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_eq_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xba,0xd4,0x01,0x04,0x00,0x00 @@ -726,11 +726,11 @@ # GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x08,0x86,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_ge_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x86,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_ge_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x86,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_ge_f16_e64 v1, v2 ; encoding: 
[0x7e,0x00,0x86,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x86,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_ge_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x86,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_ge_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x86,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_ge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x86,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x86,0xd4,0x01,0x04,0x00,0x00 @@ -890,11 +890,11 @@ # GFX12-FAKE16: v_cmpx_ge_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x08,0xb6,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_ge_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb6,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_ge_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb6,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_ge_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb6,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0xb6,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_ge_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb6,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_ge_i16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xb6,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_ge_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xb6,0xd4,0x01,0x04,0x00,0x00 @@ -1054,11 +1054,11 @@ # GFX12-FAKE16: v_cmpx_ge_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x08,0xbe,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_ge_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xbe,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_ge_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xbe,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_ge_u16_e64 v1, v2 ; encoding: 
[0x7e,0x00,0xbe,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0xbe,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_ge_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xbe,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_ge_u16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xbe,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_ge_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xbe,0xd4,0x01,0x04,0x00,0x00 @@ -1218,11 +1218,11 @@ # GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x08,0x84,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_gt_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x84,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_gt_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x84,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_gt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x84,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_gt_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x84,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_gt_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x84,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_gt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x84,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x84,0xd4,0x01,0x04,0x00,0x00 @@ -1382,11 +1382,11 @@ # GFX12-FAKE16: v_cmpx_gt_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x08,0xb4,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_gt_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb4,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_gt_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb4,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_gt_i16_e64 v1, v2 ; encoding: 
[0x7e,0x00,0xb4,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0xb4,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_gt_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb4,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_gt_i16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xb4,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_gt_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb4,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xb4,0xd4,0x01,0x04,0x00,0x00 @@ -1546,11 +1546,11 @@ # GFX12-FAKE16: v_cmpx_gt_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x08,0xbc,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_gt_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xbc,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_gt_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xbc,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_gt_u16_e64 v1, v2 ; encoding: [0x7e,0x00,0xbc,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0xbc,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_gt_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xbc,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_gt_u16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xbc,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_gt_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbc,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xbc,0xd4,0x01,0x04,0x00,0x00 @@ -1710,11 +1710,11 @@ # GFX12-FAKE16: v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x08,0x83,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_le_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x83,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_le_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x83,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_le_f16_e64 v1, v2 ; encoding: 
[0x7e,0x00,0x83,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x83,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_le_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x83,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_le_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x83,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_le_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x83,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x83,0xd4,0x01,0x04,0x00,0x00 @@ -1874,11 +1874,11 @@ # GFX12-FAKE16: v_cmpx_le_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x08,0xb3,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_le_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb3,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_le_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb3,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_le_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb3,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0xb3,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_le_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb3,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_le_i16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xb3,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_le_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb3,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xb3,0xd4,0x01,0x04,0x00,0x00 @@ -2038,11 +2038,11 @@ # GFX12-FAKE16: v_cmpx_le_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x08,0xbb,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_le_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xbb,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_le_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xbb,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_le_u16_e64 v1, v2 ; encoding: 
[0x7e,0x00,0xbb,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0xbb,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_le_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xbb,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_le_u16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xbb,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_le_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xbb,0xd4,0x01,0x04,0x00,0x00 @@ -2202,11 +2202,11 @@ # GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x08,0x85,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_lg_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x85,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_lg_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x85,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_lg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x85,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_lg_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x85,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_lg_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x85,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_lg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x85,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x85,0xd4,0x01,0x04,0x00,0x00 @@ -2366,11 +2366,11 @@ # GFX12-FAKE16: v_cmpx_lt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x81,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x08,0x81,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_lt_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x81,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_lt_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x81,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_lt_f16_e64 v1, v2 ; encoding: 
[0x7e,0x00,0x81,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x81,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_lt_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x81,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_lt_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x81,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_lt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x81,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x81,0xd4,0x01,0x04,0x00,0x00 @@ -2530,11 +2530,11 @@ # GFX12-FAKE16: v_cmpx_lt_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x08,0xb1,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_lt_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb1,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_lt_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb1,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_lt_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb1,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0xb1,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_lt_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb1,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_lt_i16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xb1,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_lt_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xb1,0xd4,0x01,0x04,0x00,0x00 @@ -2694,11 +2694,11 @@ # GFX12-FAKE16: v_cmpx_lt_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x08,0xb9,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_lt_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb9,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_lt_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb9,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_lt_u16_e64 v1, v2 ; encoding: 
[0x7e,0x00,0xb9,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0xb9,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_lt_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb9,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_lt_u16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xb9,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_lt_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb9,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xb9,0xd4,0x01,0x04,0x00,0x00 @@ -2858,11 +2858,11 @@ # GFX12-FAKE16: v_cmpx_ne_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x08,0xb5,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_ne_i16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xb5,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_ne_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb5,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_ne_i16_e64 v1, v2 ; encoding: [0x7e,0x00,0xb5,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0xb5,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_ne_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb5,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_ne_i16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xb5,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_ne_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb5,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xb5,0xd4,0x01,0x04,0x00,0x00 @@ -3022,11 +3022,11 @@ # GFX12-FAKE16: v_cmpx_ne_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x08,0xbd,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_ne_u16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0xbd,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_ne_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xbd,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_ne_u16_e64 v1, v2 ; encoding: 
[0x7e,0x00,0xbd,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0xbd,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_ne_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xbd,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_ne_u16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0xbd,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_ne_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbd,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0xbd,0xd4,0x01,0x04,0x00,0x00 @@ -3186,11 +3186,11 @@ # GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x08,0x8d,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_neq_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8d,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_neq_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x8d,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_neq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x8d,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_neq_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8d,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_neq_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x8d,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_neq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8d,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x8d,0xd4,0x01,0x04,0x00,0x00 @@ -3350,11 +3350,11 @@ # GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x08,0x89,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_nge_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x89,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_nge_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x89,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_nge_f16_e64 v1, v2 ; 
encoding: [0x7e,0x00,0x89,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x89,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_nge_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x89,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_nge_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x89,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_nge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x89,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x89,0xd4,0x01,0x04,0x00,0x00 @@ -3514,11 +3514,11 @@ # GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x08,0x8b,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_ngt_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8b,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_ngt_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x8b,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_ngt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x8b,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_ngt_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8b,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_ngt_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x8b,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_ngt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8b,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x8b,0xd4,0x01,0x04,0x00,0x00 @@ -3678,11 +3678,11 @@ # GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x08,0x8c,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_nle_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8c,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_nle_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x8c,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_nle_f16_e64 v1, 
v2 ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x8c,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_nle_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8c,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_nle_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x8c,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_nle_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8c,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x8c,0xd4,0x01,0x04,0x00,0x00 @@ -3842,11 +3842,11 @@ # GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x08,0x8a,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_nlg_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8a,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_nlg_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x8a,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_nlg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x8a,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_nlg_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8a,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_nlg_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x8a,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_nlg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8a,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x8a,0xd4,0x01,0x04,0x00,0x00 @@ -4006,11 +4006,11 @@ # GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x08,0x8e,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_nlt_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8e,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_nlt_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x8e,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_nlt_f16_e64 
v1, v2 ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x8e,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_nlt_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8e,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_nlt_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x8e,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_nlt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8e,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x8e,0xd4,0x01,0x04,0x00,0x00 @@ -4170,11 +4170,11 @@ # GFX12-FAKE16: v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x08,0x87,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_o_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x87,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_o_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x87,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_o_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x87,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_o_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x87,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_o_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x87,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_o_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x87,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x87,0xd4,0x01,0x04,0x00,0x00 @@ -4334,11 +4334,11 @@ # GFX12-FAKE16: v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x08,0x88,0xd4,0x01,0x05,0x02,0x00 -# GFX12-REAL16: v_cmpx_u_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x88,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_u_f16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0x88,0xd4,0x01,0x05,0x02,0x00] # GFX12-FAKE16: v_cmpx_u_f16_e64 v1, v2 ; 
encoding: [0x7e,0x00,0x88,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x10,0x88,0xd4,0xff,0xff,0x03,0x00 -# GFX12-REAL16: v_cmpx_u_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x88,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_u_f16_e64 v255.l, v255.h op_sel:[0,1] ; encoding: [0x7e,0x10,0x88,0xd4,0xff,0xff,0x03,0x00] # GFX12-FAKE16: v_cmpx_u_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x88,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x88,0xd4,0x01,0x04,0x00,0x00 From aca710ac3655fcd3f057edc1382ceec7fc58ef19 Mon Sep 17 00:00:00 2001 From: Tom Honermann Date: Wed, 16 Apr 2025 08:28:09 -0700 Subject: [PATCH 130/710] [NFC][Clang] Introduce type aliases to replace use of auto in clang/lib/CodeGen/CGCall.cpp. (#135861) CGCall.cpp declares several functions with a return type that is an explicitly spelled out specialization of `SmallVector`. Previously, `auto` was used in several places to avoid repeating the long type name; a use that Clang maintainers find unjustified. This change introduces type aliases and replaces the existing uses of `auto` with the corresponding alias name. --- clang/lib/CodeGen/CGCall.cpp | 75 ++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 38 deletions(-) diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index b25cdf9523ae1..bc1035163a8eb 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -199,15 +199,17 @@ static void appendParameterTypes(const CodeGenTypes &CGT, prefix.size()); } +using ExtParameterInfoList = + SmallVector; + /// Arrange the LLVM function layout for a value of the given function /// type, on top of any implicit parameters already stored. static const CGFunctionInfo & arrangeLLVMFunctionInfo(CodeGenTypes &CGT, bool instanceMethod, SmallVectorImpl &prefix, CanQual FTP) { - SmallVector paramInfos; + ExtParameterInfoList paramInfos; RequiredArgs Required = RequiredArgs::forPrototypePlus(FTP, prefix.size()); - // FIXME: Kill copy. 
appendParameterTypes(CGT, prefix, paramInfos, FTP); CanQualType resultType = FTP->getReturnType().getUnqualifiedType(); @@ -217,11 +219,13 @@ arrangeLLVMFunctionInfo(CodeGenTypes &CGT, bool instanceMethod, FTP->getExtInfo(), paramInfos, Required); } +using CanQualTypeList = SmallVector; + /// Arrange the argument and result information for a value of the /// given freestanding function type. const CGFunctionInfo & CodeGenTypes::arrangeFreeFunctionType(CanQual FTP) { - SmallVector argTypes; + CanQualTypeList argTypes; return ::arrangeLLVMFunctionInfo(*this, /*instanceMethod=*/false, argTypes, FTP); } @@ -319,7 +323,7 @@ const CGFunctionInfo & CodeGenTypes::arrangeCXXMethodType(const CXXRecordDecl *RD, const FunctionProtoType *FTP, const CXXMethodDecl *MD) { - SmallVector argTypes; + CanQualTypeList argTypes; // Add the 'this' pointer. argTypes.push_back(DeriveThisType(RD, MD)); @@ -375,8 +379,8 @@ const CGFunctionInfo & CodeGenTypes::arrangeCXXStructorDeclaration(GlobalDecl GD) { auto *MD = cast(GD.getDecl()); - SmallVector argTypes; - SmallVector paramInfos; + CanQualTypeList argTypes; + ExtParameterInfoList paramInfos; const CXXRecordDecl *ThisType = getCXXABI().getThisArgumentTypeForMethod(GD); argTypes.push_back(DeriveThisType(ThisType, MD)); @@ -421,26 +425,26 @@ CodeGenTypes::arrangeCXXStructorDeclaration(GlobalDecl GD) { argTypes, extInfo, paramInfos, required); } -static SmallVector -getArgTypesForCall(ASTContext &ctx, const CallArgList &args) { - SmallVector argTypes; +static CanQualTypeList getArgTypesForCall(ASTContext &ctx, + const CallArgList &args) { + CanQualTypeList argTypes; for (auto &arg : args) argTypes.push_back(ctx.getCanonicalParamType(arg.Ty)); return argTypes; } -static SmallVector -getArgTypesForDeclaration(ASTContext &ctx, const FunctionArgList &args) { - SmallVector argTypes; +static CanQualTypeList getArgTypesForDeclaration(ASTContext &ctx, + const FunctionArgList &args) { + CanQualTypeList argTypes; for (auto &arg : args) 
argTypes.push_back(ctx.getCanonicalParamType(arg->getType())); return argTypes; } -static llvm::SmallVector -getExtParameterInfosForCall(const FunctionProtoType *proto, - unsigned prefixArgs, unsigned totalArgs) { - llvm::SmallVector result; +static ExtParameterInfoList +getExtParameterInfosForCall(const FunctionProtoType *proto, unsigned prefixArgs, + unsigned totalArgs) { + ExtParameterInfoList result; if (proto->hasExtParameterInfos()) { addExtParameterInfosForCall(result, proto, prefixArgs, totalArgs); } @@ -462,8 +466,7 @@ CodeGenTypes::arrangeCXXConstructorCall(const CallArgList &args, unsigned ExtraPrefixArgs, unsigned ExtraSuffixArgs, bool PassProtoArgs) { - // FIXME: Kill copy. - SmallVector ArgTypes; + CanQualTypeList ArgTypes; for (const auto &Arg : args) ArgTypes.push_back(Context.getCanonicalParamType(Arg.Ty)); @@ -483,7 +486,7 @@ CodeGenTypes::arrangeCXXConstructorCall(const CallArgList &args, : Context.VoidTy; FunctionType::ExtInfo Info = FPT->getExtInfo(); - llvm::SmallVector ParamInfos; + ExtParameterInfoList ParamInfos; // If the prototype args are elided, we should only have ABI-specific args, // which never have param info. if (PassProtoArgs && FPT->hasExtParameterInfos()) { @@ -546,13 +549,11 @@ CodeGenTypes::arrangeObjCMethodDeclaration(const ObjCMethodDecl *MD) { const CGFunctionInfo & CodeGenTypes::arrangeObjCMessageSendSignature(const ObjCMethodDecl *MD, QualType receiverType) { - SmallVector argTys; - SmallVector extParamInfos( - MD->isDirectMethod() ? 1 : 2); + CanQualTypeList argTys; + ExtParameterInfoList extParamInfos(MD->isDirectMethod() ? 1 : 2); argTys.push_back(Context.getCanonicalParamType(receiverType)); if (!MD->isDirectMethod()) argTys.push_back(Context.getCanonicalParamType(Context.getObjCSelType())); - // FIXME: Kill copy? 
for (const auto *I : MD->parameters()) { argTys.push_back(Context.getCanonicalParamType(I->getType())); auto extParamInfo = FunctionProtoType::ExtParameterInfo().withIsNoEscape( @@ -579,7 +580,7 @@ CodeGenTypes::arrangeObjCMessageSendSignature(const ObjCMethodDecl *MD, const CGFunctionInfo & CodeGenTypes::arrangeUnprototypedObjCMessageSend(QualType returnType, const CallArgList &args) { - auto argTypes = getArgTypesForCall(Context, args); + CanQualTypeList argTypes = getArgTypesForCall(Context, args); FunctionType::ExtInfo einfo; return arrangeLLVMFunctionInfo(GetReturnType(returnType), FnInfoOpts::None, @@ -641,7 +642,7 @@ arrangeFreeFunctionLikeCall(CodeGenTypes &CGT, bool chainCall) { assert(args.size() >= numExtraRequiredArgs); - llvm::SmallVector paramInfos; + ExtParameterInfoList paramInfos; // In most cases, there are no optional arguments. RequiredArgs required = RequiredArgs::All; @@ -666,8 +667,7 @@ arrangeFreeFunctionLikeCall(CodeGenTypes &CGT, required = RequiredArgs(args.size()); } - // FIXME: Kill copy. - SmallVector argTypes; + CanQualTypeList argTypes; for (const auto &arg : args) argTypes.push_back(CGT.getContext().getCanonicalParamType(arg.Ty)); FnInfoOpts opts = chainCall ? 
FnInfoOpts::IsChainCall : FnInfoOpts::None; @@ -700,8 +700,9 @@ CodeGenTypes::arrangeBlockFunctionCall(const CallArgList &args, const CGFunctionInfo & CodeGenTypes::arrangeBlockFunctionDeclaration(const FunctionProtoType *proto, const FunctionArgList ¶ms) { - auto paramInfos = getExtParameterInfosForCall(proto, 1, params.size()); - auto argTypes = getArgTypesForDeclaration(Context, params); + ExtParameterInfoList paramInfos = + getExtParameterInfosForCall(proto, 1, params.size()); + CanQualTypeList argTypes = getArgTypesForDeclaration(Context, params); return arrangeLLVMFunctionInfo(GetReturnType(proto->getReturnType()), FnInfoOpts::None, argTypes, @@ -712,8 +713,7 @@ CodeGenTypes::arrangeBlockFunctionDeclaration(const FunctionProtoType *proto, const CGFunctionInfo & CodeGenTypes::arrangeBuiltinFunctionCall(QualType resultType, const CallArgList &args) { - // FIXME: Kill copy. - SmallVector argTypes; + CanQualTypeList argTypes; for (const auto &Arg : args) argTypes.push_back(Context.getCanonicalParamType(Arg.Ty)); return arrangeLLVMFunctionInfo(GetReturnType(resultType), FnInfoOpts::None, @@ -724,7 +724,7 @@ CodeGenTypes::arrangeBuiltinFunctionCall(QualType resultType, const CGFunctionInfo & CodeGenTypes::arrangeBuiltinFunctionDeclaration(QualType resultType, const FunctionArgList &args) { - auto argTypes = getArgTypesForDeclaration(Context, args); + CanQualTypeList argTypes = getArgTypesForDeclaration(Context, args); return arrangeLLVMFunctionInfo(GetReturnType(resultType), FnInfoOpts::None, argTypes, FunctionType::ExtInfo(), {}, @@ -752,11 +752,10 @@ CodeGenTypes::arrangeCXXMethodCall(const CallArgList &args, "Emitting a call with less args than the required prefix?"); // Add one to account for `this`. It's a bit awkward here, but we don't count // `this` in similar places elsewhere. 
- auto paramInfos = - getExtParameterInfosForCall(proto, numPrefixArgs + 1, args.size()); + ExtParameterInfoList paramInfos = + getExtParameterInfosForCall(proto, numPrefixArgs + 1, args.size()); - // FIXME: Kill copy. - auto argTypes = getArgTypesForCall(Context, args); + CanQualTypeList argTypes = getArgTypesForCall(Context, args); FunctionType::ExtInfo info = proto->getExtInfo(); return arrangeLLVMFunctionInfo(GetReturnType(proto->getReturnType()), @@ -777,14 +776,14 @@ CodeGenTypes::arrangeCall(const CGFunctionInfo &signature, if (signature.arg_size() == args.size()) return signature; - SmallVector paramInfos; + ExtParameterInfoList paramInfos; auto sigParamInfos = signature.getExtParameterInfos(); if (!sigParamInfos.empty()) { paramInfos.append(sigParamInfos.begin(), sigParamInfos.end()); paramInfos.resize(args.size()); } - auto argTypes = getArgTypesForCall(Context, args); + CanQualTypeList argTypes = getArgTypesForCall(Context, args); assert(signature.getRequiredArgs().allowsOptionalArgs()); FnInfoOpts opts = FnInfoOpts::None; From 81b4fc2bedc411c257fdf24540318e24fe669b8b Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 16 Apr 2025 08:37:56 -0700 Subject: [PATCH 131/710] [CodeGen] Construct SmallVector with ArrayRef (NFC) (#135930) Note that we can drop the call to reserve because the constructor that takes ArrayRef calls append, which in turn calls reserve. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index ab8e18267f3f5..b175e35385ec6 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -24747,10 +24747,8 @@ static SDValue combineConcatVectorOfShuffleAndItsOperands( // We are going to pad the shuffle operands, so any indice, that was picking // from the second operand, must be adjusted. 
- SmallVector AdjustedMask; - AdjustedMask.reserve(SVN->getMask().size()); + SmallVector AdjustedMask(SVN->getMask()); assert(SVN->getOperand(1).isUndef() && "Expected unary shuffle!"); - append_range(AdjustedMask, SVN->getMask()); // Identity masks for the operands of the (padded) shuffle. SmallVector IdentityMask(2 * OpVT.getVectorNumElements()); From 842bc07946e7ac71692ae235e784b9bc6c3d6535 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 16 Apr 2025 08:38:16 -0700 Subject: [PATCH 132/710] [DebugInfo] Use StringRef::starts_with (NFC) (#135933) --- llvm/lib/DebugInfo/Symbolize/Symbolize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp b/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp index d154f16f272bd..1d8217ad587ec 100644 --- a/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp +++ b/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp @@ -753,7 +753,7 @@ LLVMSymbolizer::DemangleName(StringRef Name, if (nonMicrosoftDemangle(Name, Result)) return Result; - if (!Name.empty() && Name.front() == '?') { + if (Name.starts_with('?')) { // Only do MSVC C++ demangling on symbols starting with '?'. 
int status = 0; char *DemangledName = microsoftDemangle( From 05772406153c390e61809757643ad49bff7dc71d Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 16 Apr 2025 08:39:07 -0700 Subject: [PATCH 133/710] [Utils] Use StringRef::ends_with (NFC) (#135934) --- llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index 4e37c587dc975..941e787f91eff 100644 --- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -1936,7 +1936,7 @@ static Value *optimizeDoubleFP(CallInst *CI, IRBuilderBase &B, bool IsIntrinsic = CalleeFn->isIntrinsic(); if (!IsIntrinsic) { StringRef CallerName = CI->getFunction()->getName(); - if (!CallerName.empty() && CallerName.back() == 'f' && + if (CallerName.ends_with('f') && CallerName.size() == (CalleeName.size() + 1) && CallerName.starts_with(CalleeName)) return nullptr; From 0045b82a42bd36306a14f8b40bd9b54470c299ea Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 16 Apr 2025 08:39:55 -0700 Subject: [PATCH 134/710] [Vectorize] Construct SmallVector with an iterator range (NFC) (#135936) --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index f97386159d029..764f3e2dc64d9 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -8402,11 +8402,7 @@ void BoUpSLP::tryToVectorizeGatheredLoads( continue; } SmallVector> LocalLoadsDists(LoadsDists); - SmallVector OriginalLoads(LocalLoadsDists.size()); - transform(LoadsDists, OriginalLoads.begin(), - [](const std::pair &L) -> LoadInst * { - return L.first; - }); + SmallVector OriginalLoads(make_first_range(LoadsDists)); stable_sort(LocalLoadsDists, LoadSorter); 
SmallVector Loads; unsigned MaxConsecutiveDistance = 0; From 419fa1b06a36336ad85f1c71fc72ffa719ceb659 Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Wed, 16 Apr 2025 17:57:51 +0200 Subject: [PATCH 135/710] [lldb][DataFormatter] Surface CalculateNumChildren errors in std::vector summary (#135944) When the data-formatters happen to break (e.g., due to layout changes in libc++), there's no clear indicator of them failing from a user's perspective. E.g., for `std::vector`s we would just show: ``` (std::vector) v = size=0 {} ``` which is highly misleading, especially if `v.size()` returns a non-zero size. This patch surfaces the various errors that could occur when calculating the number of children of a vector. rdar://146964266 --- .../Language/CPlusPlus/LibCxxVector.cpp | 21 +++++++--- lldb/source/ValueObject/ValueObject.cpp | 12 ++++-- .../libcxx-simulators/invalid-vector/Makefile | 3 ++ ...taFormatterLibcxxInvalidVectorSimulator.py | 39 +++++++++++++++++++ .../libcxx-simulators/invalid-vector/main.cpp | 37 ++++++++++++++++++ 5 files changed, 104 insertions(+), 8 deletions(-) create mode 100644 lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/invalid-vector/Makefile create mode 100644 lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/invalid-vector/TestDataFormatterLibcxxInvalidVectorSimulator.py create mode 100644 lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/invalid-vector/main.cpp diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxVector.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxVector.cpp index d538cac9f9134..ce2261b6f03c3 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxVector.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxVector.cpp @@ -83,19 +83,30 @@ lldb_private::formatters::LibcxxStdVectorSyntheticFrontEnd:: llvm::Expected lldb_private::formatters:: LibcxxStdVectorSyntheticFrontEnd::CalculateNumChildren() { if (!m_start || 
!m_finish) - return 0; + return llvm::createStringError( + "Failed to determine start/end of vector data."); + uint64_t start_val = m_start->GetValueAsUnsigned(0); uint64_t finish_val = m_finish->GetValueAsUnsigned(0); - if (start_val == 0 || finish_val == 0) + // A default-initialized empty vector. + if (start_val == 0 && finish_val == 0) return 0; - if (start_val >= finish_val) - return 0; + if (start_val == 0) + return llvm::createStringError("Invalid value for start of vector."); + + if (finish_val == 0) + return llvm::createStringError("Invalid value for end of vector."); + + if (start_val > finish_val) + return llvm::createStringError( + "Start of vector data begins after end pointer."); size_t num_children = (finish_val - start_val); if (num_children % m_element_size) - return 0; + return llvm::createStringError("Size not multiple of element size."); + return num_children / m_element_size; } diff --git a/lldb/source/ValueObject/ValueObject.cpp b/lldb/source/ValueObject/ValueObject.cpp index eac24353de90b..8741cb7343166 100644 --- a/lldb/source/ValueObject/ValueObject.cpp +++ b/lldb/source/ValueObject/ValueObject.cpp @@ -1521,10 +1521,16 @@ bool ValueObject::DumpPrintableRepresentation( str = GetLocationAsCString(); break; - case eValueObjectRepresentationStyleChildrenCount: - strm.Printf("%" PRIu64 "", (uint64_t)GetNumChildrenIgnoringErrors()); - str = strm.GetString(); + case eValueObjectRepresentationStyleChildrenCount: { + if (auto err = GetNumChildren()) { + strm.Printf("%" PRIu32, *err); + str = strm.GetString(); + } else { + strm << "error: " << toString(err.takeError()); + str = strm.GetString(); + } break; + } case eValueObjectRepresentationStyleType: str = GetTypeName().GetStringRef(); diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/invalid-vector/Makefile b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/invalid-vector/Makefile new file mode 100644 index 
0000000000000..38cfa81053488 --- /dev/null +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/invalid-vector/Makefile @@ -0,0 +1,3 @@ +CXX_SOURCES := main.cpp +override CXXFLAGS_EXTRAS += -std=c++14 +include Makefile.rules diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/invalid-vector/TestDataFormatterLibcxxInvalidVectorSimulator.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/invalid-vector/TestDataFormatterLibcxxInvalidVectorSimulator.py new file mode 100644 index 0000000000000..8788ea7be882d --- /dev/null +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/invalid-vector/TestDataFormatterLibcxxInvalidVectorSimulator.py @@ -0,0 +1,39 @@ +""" +Test we can understand various layouts of the libc++'s std::string +""" + + +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil +import functools + + +class LibcxxInvalidVectorDataFormatterSimulatorTestCase(TestBase): + NO_DEBUG_INFO_TESTCASE = True + + def test(self): + self.build() + lldbutil.run_to_source_breakpoint(self, "return 0", lldb.SBFileSpec("main.cpp")) + + self.expect( + "frame variable v1", + substrs=["size=error: Invalid value for end of vector."], + ) + self.expect( + "frame variable v2", + substrs=["size=error: Invalid value for start of vector."], + ) + self.expect( + "frame variable v3", + substrs=["size=error: Start of vector data begins after end pointer."], + ) + self.expect( + "frame variable v4", + substrs=["size=error: Failed to determine start/end of vector data."], + ) + self.expect( + "frame variable v5", + substrs=["size=error: Size not multiple of element size."], + ) diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/invalid-vector/main.cpp 
b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/invalid-vector/main.cpp new file mode 100644 index 0000000000000..c9f04f60ec24d --- /dev/null +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/invalid-vector/main.cpp @@ -0,0 +1,37 @@ +#define COMPRESSED_PAIR_REV 2 +#include + +namespace std { +namespace __1 { +template struct vector { + T *__begin_; + T *__end_; + _LLDB_COMPRESSED_PAIR(T *, __cap_ = nullptr, void *, __alloc_); +}; +} // namespace __1 + +namespace __2 { +template struct vector {}; +} // namespace __2 + +namespace __3 { +template struct vector { + T *__begin_; + T *__end_; + _LLDB_COMPRESSED_PAIR(short *, __cap_ = nullptr, void *, __alloc_); +}; +} // namespace __3 +} // namespace std + +int main() { + int arr[] = {1, 2, 3}; + std::__1::vector v1{.__begin_ = arr, .__end_ = nullptr}; + std::__1::vector v2{.__begin_ = nullptr, .__end_ = arr}; + std::__1::vector v3{.__begin_ = &arr[2], .__end_ = arr}; + std::__2::vector v4; + + char carr[] = {'a'}; + std::__3::vector v5{.__begin_ = carr, .__end_ = carr + 1}; + + return 0; +} From 34598fdadc06bd3b21aa97342dda05ecd9233912 Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Wed, 16 Apr 2025 18:05:27 +0200 Subject: [PATCH 136/710] [llvm][ItaniumDemangle] Use __LDBL_MANT_DIG__ for configuring demangling of long doubles (#135968) Syncing in the changes from https://github.com/llvm/llvm-project/pull/134976 using the `cp-to-llvm.sh` script. 
--- llvm/include/llvm/Demangle/ItaniumDemangle.h | 22 ++++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/llvm/include/llvm/Demangle/ItaniumDemangle.h b/llvm/include/llvm/Demangle/ItaniumDemangle.h index b0363c1a7a786..28fdfc5eff1b6 100644 --- a/llvm/include/llvm/Demangle/ItaniumDemangle.h +++ b/llvm/include/llvm/Demangle/ItaniumDemangle.h @@ -38,8 +38,10 @@ DEMANGLE_NAMESPACE_BEGIN template class PODSmallVector { - static_assert(std::is_trivial::value, - "T is required to be a trivial type"); + static_assert(std::is_trivially_copyable::value, + "T is required to be a trivially copyable type"); + static_assert(std::is_trivially_default_constructible::value, + "T is required to be trivially default constructible"); T *First = nullptr; T *Last = nullptr; T *Cap = nullptr; @@ -5739,14 +5741,16 @@ struct FloatData template <> struct FloatData { -#if defined(__mips__) && defined(__mips_n64) || defined(__aarch64__) || \ - defined(__wasm__) || defined(__riscv) || defined(__loongarch__) || \ - defined(__ve__) - static const size_t mangled_size = 32; -#elif defined(__arm__) || defined(__mips__) || defined(__hexagon__) - static const size_t mangled_size = 16; +#if __LDBL_MANT_DIG__ == 113 || __LDBL_MANT_DIG__ == 106 + static const size_t mangled_size = 32; +#elif __LDBL_MANT_DIG__ == 53 || defined(_MSC_VER) + // MSVC doesn't define __LDBL_MANT_DIG__, but it has long double equal to + // regular double on all current architectures. + static const size_t mangled_size = 16; +#elif __LDBL_MANT_DIG__ == 64 + static const size_t mangled_size = 20; #else - static const size_t mangled_size = 20; // May need to be adjusted to 16 or 24 on other platforms +#error Unknown size for __LDBL_MANT_DIG__ #endif // `-0x1.ffffffffffffffffffffffffffffp+16383` + 'L' + '\0' == 42 bytes. // 28 'f's * 4 bits == 112 bits, which is the number of mantissa bits. 
From 7f4422d99115efbb770e13ccb60cf6bfc190c245 Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 16 Apr 2025 17:14:14 +0100 Subject: [PATCH 137/710] [AArch64] Add testing for shuffles that extend into new types. NFC --- llvm/test/CodeGen/AArch64/shuffle-extend.ll | 264 ++++++++++++++++++++ 1 file changed, 264 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/shuffle-extend.ll diff --git a/llvm/test/CodeGen/AArch64/shuffle-extend.ll b/llvm/test/CodeGen/AArch64/shuffle-extend.ll new file mode 100644 index 0000000000000..bb31380cc3ade --- /dev/null +++ b/llvm/test/CodeGen/AArch64/shuffle-extend.ll @@ -0,0 +1,264 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=aarch64 | FileCheck %s + +define <2 x i8> @test_v16i8_v2i32_824(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_v16i8_v2i32_824: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w8, v0.b[8] +; CHECK-NEXT: umov w9, v1.b[8] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: mov v0.s[1], w9 +; CHECK-NEXT: add v0.2s, v0.2s, v0.2s +; CHECK-NEXT: ret + %c = shufflevector <16 x i8> %a, <16 x i8> %b, <2 x i32> + %d = add <2 x i8> %c, %c + ret <2 x i8> %d +} + +define <2 x i8> @test_v16i8_v2i32_016(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_v16i8_v2i32_016: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w8, v0.b[0] +; CHECK-NEXT: umov w9, v1.b[0] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: mov v0.s[1], w9 +; CHECK-NEXT: add v0.2s, v0.2s, v0.2s +; CHECK-NEXT: ret + %c = shufflevector <16 x i8> %a, <16 x i8> %b, <2 x i32> + %d = add <2 x i8> %c, %c + ret <2 x i8> %d +} + +define <2 x i8> @test_v8i8_v2i32_08(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: test_v8i8_v2i32_08: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: umov w8, v0.b[0] +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: umov w9, v1.b[0] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: mov v0.s[1], w9 +; CHECK-NEXT: add v0.2s, v0.2s, v0.2s 
+; CHECK-NEXT: ret + %c = shufflevector <8 x i8> %a, <8 x i8> %b, <2 x i32> + %d = add <2 x i8> %c, %c + ret <2 x i8> %d +} + +define <2 x i16> @test_v8i16_v2i32_08(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_v8i16_v2i32_08: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w8, v0.h[0] +; CHECK-NEXT: umov w9, v1.h[0] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: mov v0.s[1], w9 +; CHECK-NEXT: add v0.2s, v0.2s, v0.2s +; CHECK-NEXT: ret + %c = shufflevector <8 x i16> %a, <8 x i16> %b, <2 x i32> + %d = add <2 x i16> %c, %c + ret <2 x i16> %d +} + +define <2 x i16> @test_v4i16_v2i32_04(<4 x i16> %a, <4 x i16> %b) { +; CHECK-LABEL: test_v4i16_v2i32_04: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: umov w8, v0.h[0] +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: umov w9, v1.h[0] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: mov v0.s[1], w9 +; CHECK-NEXT: add v0.2s, v0.2s, v0.2s +; CHECK-NEXT: ret + %c = shufflevector <4 x i16> %a, <4 x i16> %b, <2 x i32> + %d = add <2 x i16> %c, %c + ret <2 x i16> %d +} + + +define <4 x i8> @test_v16i8_v4i16_824(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_v16i8_v4i16_824: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w8, v0.b[8] +; CHECK-NEXT: umov w9, v1.b[8] +; CHECK-NEXT: fmov s2, w8 +; CHECK-NEXT: umov w8, v0.b[0] +; CHECK-NEXT: mov v2.h[1], w9 +; CHECK-NEXT: mov v2.h[2], w8 +; CHECK-NEXT: umov w8, v1.b[0] +; CHECK-NEXT: mov v2.h[3], w8 +; CHECK-NEXT: add v0.4h, v2.4h, v2.4h +; CHECK-NEXT: ret + %c = shufflevector <16 x i8> %a, <16 x i8> %b, <4 x i32> + %d = add <4 x i8> %c, %c + ret <4 x i8> %d +} + +define <4 x i8> @test_v16i8_v4i16_016(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_v16i8_v4i16_016: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w8, v0.b[0] +; CHECK-NEXT: umov w9, v1.b[0] +; CHECK-NEXT: fmov s2, w8 +; CHECK-NEXT: umov w8, v0.b[4] +; CHECK-NEXT: mov v2.h[1], w9 +; CHECK-NEXT: mov v2.h[2], w8 +; CHECK-NEXT: umov w8, v1.b[4] +; CHECK-NEXT: mov v2.h[3], w8 +; CHECK-NEXT: add 
v0.4h, v2.4h, v2.4h +; CHECK-NEXT: ret + %c = shufflevector <16 x i8> %a, <16 x i8> %b, <4 x i32> + %d = add <4 x i8> %c, %c + ret <4 x i8> %d +} + +define <4 x i8> @test_v8i8_v4i16_08(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: test_v8i8_v4i16_08: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: umov w8, v0.b[0] +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: umov w9, v1.b[0] +; CHECK-NEXT: fmov s2, w8 +; CHECK-NEXT: umov w8, v0.b[4] +; CHECK-NEXT: mov v2.h[1], w9 +; CHECK-NEXT: mov v2.h[2], w8 +; CHECK-NEXT: umov w8, v1.b[4] +; CHECK-NEXT: mov v2.h[3], w8 +; CHECK-NEXT: add v0.4h, v2.4h, v2.4h +; CHECK-NEXT: ret + %c = shufflevector <8 x i8> %a, <8 x i8> %b, <4 x i32> + %d = add <4 x i8> %c, %c + ret <4 x i8> %d +} + +define <4 x i16> @test_v8i16_v4i16_08(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_v8i16_v4i16_08: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI8_0 +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI8_0] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-NEXT: add v0.4h, v0.4h, v0.4h +; CHECK-NEXT: ret + %c = shufflevector <8 x i16> %a, <8 x i16> %b, <4 x i32> + %d = add <4 x i16> %c, %c + ret <4 x i16> %d +} + +define <4 x i16> @test_v4i16_v4i16_04(<4 x i16> %a, <4 x i16> %b) { +; CHECK-LABEL: test_v4i16_v4i16_04: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v1.8b, v1.8b, v0.8b, #2 +; CHECK-NEXT: trn2 v0.4h, v1.4h, v0.4h +; CHECK-NEXT: ext v0.8b, v0.8b, v1.8b, #4 +; CHECK-NEXT: add v0.4h, v0.4h, v0.4h +; CHECK-NEXT: ret + %c = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> + %d = add <4 x i16> %c, %c + ret <4 x i16> %d +} + + +define i1 @test1(ptr %add.ptr, ptr %result, <2 x i64> %hi, <2 x i64> %lo) { +; CHECK-LABEL: test1: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q2, [x0] +; CHECK-NEXT: movi v3.16b, #1 +; CHECK-NEXT: mov w12, #1 // =0x1 +; CHECK-NEXT: 
cmgt v0.2d, v2.2d, v0.2d +; CHECK-NEXT: cmgt v4.2d, v1.2d, v2.2d +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: and v3.16b, v4.16b, v3.16b +; CHECK-NEXT: umov w8, v0.b[8] +; CHECK-NEXT: umov w9, v3.b[8] +; CHECK-NEXT: umov w10, v0.b[0] +; CHECK-NEXT: umov w11, v3.b[0] +; CHECK-NEXT: sub v0.2d, v2.2d, v1.2d +; CHECK-NEXT: dup v1.2d, x12 +; CHECK-NEXT: orr w8, w8, w9 +; CHECK-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-NEXT: orr w8, w10, w8, lsl #1 +; CHECK-NEXT: orr w8, w8, w11 +; CHECK-NEXT: str q0, [x1] +; CHECK-NEXT: tst w8, #0x3 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %19 = load <2 x i64>, ptr %add.ptr, align 8 + %cmp = icmp sgt <2 x i64> %19, %hi + %sext = sext <2 x i1> %cmp to <2 x i64> + %20 = bitcast <2 x i64> %sext to <16 x i8> + %21 = and <16 x i8> %20, + %storedv = extractelement <16 x i8> %21, i64 0 + %storedv.1 = extractelement <16 x i8> %21, i64 8 + %22 = shl nuw nsw i8 %storedv.1, 1 + %or.111 = or disjoint i8 %22, %storedv + %cmp101 = icmp slt <2 x i64> %19, %lo + %sext102 = sext <2 x i1> %cmp101 to <2 x i64> + %23 = bitcast <2 x i64> %sext102 to <16 x i8> + %24 = and <16 x i8> %23, + %storedv104 = extractelement <16 x i8> %24, i64 0 + %storedv.1105 = extractelement <16 x i8> %24, i64 8 + %25 = shl nuw nsw i8 %storedv.1105, 1 + %or.111106 = or disjoint i8 %25, %storedv104 + %reass.sub = sub <2 x i64> %19, %lo + %add = add <2 x i64> %reass.sub, splat (i64 1) + store <2 x i64> %add, ptr %result, align 8 + %or118 = or i8 %or.111, %or.111106 + %cmp24.not = icmp eq i8 %or118, 0 + ret i1 %cmp24.not +} + +define i1 @test2(ptr %add.ptr, ptr %result, <2 x i64> %hi, <2 x i64> %lo) { +; CHECK-LABEL: test2: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q2, [x0] +; CHECK-NEXT: movi v3.16b, #1 +; CHECK-NEXT: cmgt v0.2d, v2.2d, v0.2d +; CHECK-NEXT: cmgt v4.2d, v1.2d, v2.2d +; CHECK-NEXT: sub v1.2d, v2.2d, v1.2d +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: and v3.16b, v4.16b, v3.16b +; CHECK-NEXT: umov w8, v0.b[8] +; CHECK-NEXT: umov w9, v3.b[8] +; 
CHECK-NEXT: umov w10, v0.b[0] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: umov w8, v3.b[0] +; CHECK-NEXT: fmov s3, w10 +; CHECK-NEXT: mov v0.s[1], w9 +; CHECK-NEXT: mov w9, #1 // =0x1 +; CHECK-NEXT: mov v3.s[1], w8 +; CHECK-NEXT: dup v2.2d, x9 +; CHECK-NEXT: add v0.2s, v0.2s, v0.2s +; CHECK-NEXT: orr v0.8b, v0.8b, v3.8b +; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: add v0.2d, v1.2d, v2.2d +; CHECK-NEXT: str q0, [x1] +; CHECK-NEXT: orr w8, w9, w8 +; CHECK-NEXT: tst w8, #0xff +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %1 = load <2 x i64>, ptr %add.ptr, align 8 + %cmp = icmp sgt <2 x i64> %1, %hi + %sext = sext <2 x i1> %cmp to <2 x i64> + %2 = bitcast <2 x i64> %sext to <16 x i8> + %3 = and <16 x i8> %2, + %cmp101 = icmp slt <2 x i64> %1, %lo + %sext102 = sext <2 x i1> %cmp101 to <2 x i64> + %4 = bitcast <2 x i64> %sext102 to <16 x i8> + %5 = and <16 x i8> %4, + %6 = shufflevector <16 x i8> %3, <16 x i8> %5, <2 x i32> + %7 = shl nuw nsw <2 x i8> %6, splat (i8 1) + %8 = shufflevector <16 x i8> %3, <16 x i8> %5, <2 x i32> + %9 = or disjoint <2 x i8> %7, %8 + %reass.sub = sub <2 x i64> %1, %lo + %add = add <2 x i64> %reass.sub, splat (i64 1) + store <2 x i64> %add, ptr %result, align 8 + %10 = extractelement <2 x i8> %9, i32 0 + %11 = extractelement <2 x i8> %9, i32 1 + %or118 = or i8 %10, %11 + %cmp24.not = icmp eq i8 %or118, 0 + ret i1 %cmp24.not +} From 24171f4d12a02b49de1cc7a1beb2dc19c740a9f1 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Wed, 16 Apr 2025 09:29:28 -0700 Subject: [PATCH 138/710] [NFC][CFI] Add test to check for '-flto' and '-fvisibility=' flags (#135892) --- clang/test/Driver/sanitizer-ld.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/clang/test/Driver/sanitizer-ld.c b/clang/test/Driver/sanitizer-ld.c index 52c1f6bf96242..67ca33d676d20 100644 --- a/clang/test/Driver/sanitizer-ld.c +++ b/clang/test/Driver/sanitizer-ld.c @@ -829,6 +829,16 @@ // CHECK-NSAN-UBSAN: "--whole-archive" 
"{{[^"]*}}libclang_rt.nsan.a" "--no-whole-archive" +// CFI requirements. +// RUN: not %clang -fsanitize=cfi \ +// RUN: --target=x86_64-unknown-linux -fuse-ld=ld -rtlib=platform \ +// RUN: -resource-dir=%S/Inputs/resource_dir \ +// RUN: --sysroot=%S/Inputs/basic_linux_tree \ +// RUN: -### %s 2>&1 \ +// RUN: | %{filecheck} --check-prefix=CHECK-CFI-PREREQ-LINUX +// CHECK-CFI-PREREQ-LINUX: '-fsanitize=cfi' only allowed with '-flto' +// CHECK-CFI-PREREQ-LINUX: '-fsanitize=cfi' only allowed with '-fvisibility=' + // CFI by itself does not link runtime libraries. // RUN: not %clang -fsanitize=cfi \ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld -rtlib=platform \ From 76b5fcbf975547251faaeed8b567ea09d139a607 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 16 Apr 2025 09:39:52 -0700 Subject: [PATCH 139/710] [TableGen] Store flat source operand number in OperandMap in PseudoLoweringEmitter. NFC (#135886) Previously we stored the index into the source CodeGenInstruction's operand list. Any operand with sub operands stored the same index into all of the OperandMap entries for that operand. The emitting loop would look up the MIOperandNo for the source and add the sub index. This patch moves the logic into the loop that updates the OperandMap. Now the emitting loop only needs to print the value. While there, I've added a check that MIOperandNo is the same for source and destination. 
--- llvm/utils/TableGen/PseudoLoweringEmitter.cpp | 29 +++++++++++-------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/llvm/utils/TableGen/PseudoLoweringEmitter.cpp b/llvm/utils/TableGen/PseudoLoweringEmitter.cpp index 7f67c13c0bbbd..96325eac95004 100644 --- a/llvm/utils/TableGen/PseudoLoweringEmitter.cpp +++ b/llvm/utils/TableGen/PseudoLoweringEmitter.cpp @@ -181,24 +181,32 @@ void PseudoLoweringEmitter::evaluateExpansion(const Record *Rec) { SourceOperands[SrcOp.Name] = Idx; LLVM_DEBUG(dbgs() << " Operand mapping:\n"); - for (unsigned i = 0, e = Insn.Operands.size(); i != e; ++i) { + for (const auto &[Idx, Opnd] : enumerate(Insn.Operands)) { // We've already handled constant values. Just map instruction operands // here. - if (OperandMap[Insn.Operands[i].MIOperandNo].Kind != OpData::Operand) + if (OperandMap[Opnd.MIOperandNo].Kind != OpData::Operand) continue; StringMap::iterator SourceOp = - SourceOperands.find(Dag->getArgNameStr(i)); + SourceOperands.find(Dag->getArgNameStr(Idx)); if (SourceOp == SourceOperands.end()) PrintFatalError(Rec, "In pseudo instruction '" + Rec->getName() + - "', output operand '" + Dag->getArgNameStr(i) + + "', output operand '" + Dag->getArgNameStr(Idx) + "' has no matching source operand"); + const auto &SrcOpnd = SourceInsn.Operands[SourceOp->getValue()]; + if (Opnd.MINumOperands != SrcOpnd.MINumOperands) + PrintFatalError( + Rec, + "In pseudo instruction '" + Rec->getName() + "', output operand '" + + Opnd.Rec->getName() + + "' has a different number of sub operands than source operand '" + + SrcOpnd.Rec->getName() + "'"); + // Map the source operand to the destination operand index for each // MachineInstr operand. 
- for (unsigned I = 0, E = Insn.Operands[i].MINumOperands; I != E; ++I) - OperandMap[Insn.Operands[i].MIOperandNo + I].Data.Operand = - SourceOp->getValue(); + for (unsigned I = 0, E = Opnd.MINumOperands; I != E; ++I) + OperandMap[Opnd.MIOperandNo + I].Data.Operand = SrcOpnd.MIOperandNo + I; - LLVM_DEBUG(dbgs() << " " << SourceOp->getValue() << " ==> " << i + LLVM_DEBUG(dbgs() << " " << SourceOp->getValue() << " ==> " << Idx << "\n"); } @@ -236,10 +244,7 @@ void PseudoLoweringEmitter::emitLoweringEmitter(raw_ostream &o) { switch (Expansion.OperandMap[MIOpNo + i].Kind) { case OpData::Operand: o << " lowerOperand(MI->getOperand(" - << Source.Operands[Expansion.OperandMap[MIOpNo].Data.Operand] - .MIOperandNo + - i - << "), MCOp);\n" + << Expansion.OperandMap[MIOpNo + i].Data.Operand << "), MCOp);\n" << " Inst.addOperand(MCOp);\n"; break; case OpData::Imm: From 76b7ae7e454a1f0d814406d16926aa6722afcda4 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 16 Apr 2025 09:30:23 -0700 Subject: [PATCH 140/710] [SLP][NFC]Remove std::placeholders:: qualifiers, NFC --- .../Transforms/Vectorize/SLPVectorizer.cpp | 24 ++++++++----------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 764f3e2dc64d9..810d44343c4a9 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -21008,14 +21008,13 @@ bool SLPVectorizerPass::vectorizeStores( AnyProfitableGraph = false; unsigned StartIdx = std::distance( RangeSizes.begin(), - find_if(RangeSizes, std::bind(IsNotVectorized, Size >= MaxRegVF, - std::placeholders::_1))); + find_if(RangeSizes, + std::bind(IsNotVectorized, Size >= MaxRegVF, _1))); while (StartIdx < End) { - unsigned EndIdx = - std::distance(RangeSizes.begin(), - find_if(RangeSizes.drop_front(StartIdx), - std::bind(IsVectorized, Size >= MaxRegVF, - std::placeholders::_1))); + unsigned EndIdx = 
std::distance( + RangeSizes.begin(), + find_if(RangeSizes.drop_front(StartIdx), + std::bind(IsVectorized, Size >= MaxRegVF, _1))); unsigned Sz = EndIdx >= End ? End : EndIdx; for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) { if (!checkTreeSizes(RangeSizes.slice(Cnt, Size), @@ -21085,7 +21084,7 @@ bool SLPVectorizerPass::vectorizeStores( if (Size > 2 && Res && !all_of(RangeSizes.slice(Cnt, Size), std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize, - std::placeholders::_1))) { + _1))) { Cnt += Size; continue; } @@ -21093,8 +21092,7 @@ bool SLPVectorizerPass::vectorizeStores( // trees, just with larger number of elements. if (Size > MaxRegVF && TreeSize > 1 && all_of(RangeSizes.slice(Cnt, Size), - std::bind(FirstSizeSame, TreeSize, - std::placeholders::_1))) { + std::bind(FirstSizeSame, TreeSize, _1))) { Cnt += Size; while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize) ++Cnt; @@ -21118,8 +21116,7 @@ bool SLPVectorizerPass::vectorizeStores( StartIdx = std::distance( RangeSizes.begin(), find_if(RangeSizes.drop_front(Sz), - std::bind(IsNotVectorized, Size >= MaxRegVF, - std::placeholders::_1))); + std::bind(IsNotVectorized, Size >= MaxRegVF, _1))); } if (!AnyProfitableGraph && Size >= MaxRegVF && has_single_bit(Size)) break; @@ -21140,8 +21137,7 @@ bool SLPVectorizerPass::vectorizeStores( End - std::distance( RangeSizes.begin(), - find_if(RangeSizes, std::bind(IsNotVectorized, true, - std::placeholders::_1))) + + find_if(RangeSizes, std::bind(IsNotVectorized, true, _1))) + 1)); unsigned VF = bit_ceil(CandidateVFs.front()) * 2; unsigned Limit = From 726a5c2c57c486e69df2dfc296482e1d8014ab62 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Wed, 16 Apr 2025 09:54:27 -0700 Subject: [PATCH 141/710] [NFC][CFI] Avoid clang error in CFI tests (#135981) In these tests we test correct linking flags set, and it's confusing that command fails because of other missing required flags. 
Clang diagnostics opportunistically proceed after error report on required flags, but there is no guaranty that processing of tested flags are the same in supported and erroneous flag sets. --- clang/test/Driver/sanitizer-ld.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/clang/test/Driver/sanitizer-ld.c b/clang/test/Driver/sanitizer-ld.c index 67ca33d676d20..a00ec029d3d46 100644 --- a/clang/test/Driver/sanitizer-ld.c +++ b/clang/test/Driver/sanitizer-ld.c @@ -840,7 +840,8 @@ // CHECK-CFI-PREREQ-LINUX: '-fsanitize=cfi' only allowed with '-fvisibility=' // CFI by itself does not link runtime libraries. -// RUN: not %clang -fsanitize=cfi \ +// RUN: %clang -fsanitize=cfi \ +// RUN: -flto -fvisibility=hidden \ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld -rtlib=platform \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ @@ -849,7 +850,8 @@ // CHECK-CFI-LINUX: "{{.*}}ld{{(.exe)?}}" // CFI with diagnostics links the UBSan runtime. -// RUN: not %clang -fsanitize=cfi -fno-sanitize-trap=cfi -fsanitize-recover=cfi \ +// RUN: %clang -fsanitize=cfi -fno-sanitize-trap=cfi -fsanitize-recover=cfi \ +// RUN: -flto -fvisibility=hidden \ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ @@ -859,7 +861,8 @@ // CHECK-CFI-DIAG-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.ubsan_standalone.a" "--no-whole-archive" // Cross-DSO CFI links the CFI runtime. -// RUN: not %clang -fsanitize=cfi -fsanitize-cfi-cross-dso \ +// RUN: %clang -fsanitize=cfi -fsanitize-cfi-cross-dso \ +// RUN: -flto -fvisibility=hidden \ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ @@ -870,7 +873,8 @@ // CHECK-CFI-CROSS-DSO-LINUX: -export-dynamic // Cross-DSO CFI with diagnostics links just the CFI runtime. 
-// RUN: not %clang -fsanitize=cfi -fsanitize-cfi-cross-dso \ +// RUN: %clang -fsanitize=cfi -fsanitize-cfi-cross-dso \ +// RUN: -flto -fvisibility=hidden \ // RUN: -fno-sanitize-trap=cfi -fsanitize-recover=cfi \ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ @@ -882,7 +886,8 @@ // CHECK-CFI-CROSS-DSO-DIAG-LINUX: -export-dynamic // Cross-DSO CFI on Android does not link runtime libraries. -// RUN: not %clang -fsanitize=cfi -fsanitize-cfi-cross-dso \ +// RUN: %clang -fsanitize=cfi -fsanitize-cfi-cross-dso \ +// RUN: -flto -fvisibility=hidden \ // RUN: --target=aarch64-linux-android -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_android_tree \ @@ -891,7 +896,8 @@ // CHECK-CFI-CROSS-DSO-ANDROID: "{{.*}}ld{{(.exe)?}}" // Cross-DSO CFI with diagnostics on Android links just the UBSAN runtime. -// RUN: not %clang -fsanitize=cfi -fsanitize-cfi-cross-dso \ +// RUN: %clang -fsanitize=cfi -fsanitize-cfi-cross-dso \ +// RUN: -flto -fvisibility=hidden \ // RUN: -fno-sanitize-trap=cfi -fsanitize-recover=cfi \ // RUN: --target=aarch64-linux-android -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ From 72506eb37d9440d32e6dada187785b06aecb415c Mon Sep 17 00:00:00 2001 From: Kostiantyn Lazukin Date: Wed, 16 Apr 2025 17:54:42 +0100 Subject: [PATCH 142/710] [compiler-rt] Fix `addtf3_test.c` being skipped due to misplaced include (#134106) [compiler-rt] The test `addtf3_test.c` is currently guarded by `#if defined(CRT_HAS_IEEE_TF)`, a macro that is declared in `int_lib.h`. However, `int_lib.h` is included *after* the preprocessor check, which results in the macro not being defined in time and causes the test to always be skipped. This patch moves the includes of `fp_test.h` and `int_lib.h` to the top of the file so that `CRT_HAS_IEEE_TF` is defined before it is checked. 
Co-authored-by: Kostiantyn Lazukin --- compiler-rt/test/builtins/Unit/addtf3_test.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/compiler-rt/test/builtins/Unit/addtf3_test.c b/compiler-rt/test/builtins/Unit/addtf3_test.c index cd5872e7dedf4..0ab73172c193a 100644 --- a/compiler-rt/test/builtins/Unit/addtf3_test.c +++ b/compiler-rt/test/builtins/Unit/addtf3_test.c @@ -4,14 +4,14 @@ #include #include +#include "fp_test.h" +#include "int_lib.h" + // The testcase currently assumes IEEE TF format, once that has been // fixed the defined(CRT_HAS_IEEE_TF) guard can be removed to enable it for // IBM 128 floats as well. #if defined(CRT_HAS_IEEE_TF) -# include "fp_test.h" -# include "int_lib.h" - // Returns: a + b COMPILER_RT_ABI tf_float __addtf3(tf_float a, tf_float b); @@ -62,7 +62,13 @@ int main() { defined(i386) || defined(__x86_64__) || \ (defined(__loongarch__) && __loongarch_frlen != 0) // Rounding mode tests on supported architectures - const tf_float m = 1234.0L, n = 0.01L; + // Use explicit values because the binary representation of long double + // is platform dependent. 
Intended values: + // m = 1234.0L, n = 0.01L (where L is a literal for 128 bit long double) + const tf_float m = + fromRep128(UINT64_C(0x4009348000000000), UINT64_C(0x0000000000000000)); + const tf_float n = + fromRep128(UINT64_C(0x3FF847AE147AE147), UINT64_C(0xAE147AE147AE147B)); fesetround(FE_UPWARD); if (test__addtf3(m, n, UINT64_C(0x40093480a3d70a3d), From 51fa6cde7d773aa7f41b410c8263884ad32eca86 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 16 Apr 2025 09:52:27 -0700 Subject: [PATCH 143/710] [SLP][NFC]Add a test with missing unsigned promotion for smax reduction, NFC --- .../smax-reduction-unsigned-missing-sign.ll | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 llvm/test/Transforms/SLPVectorizer/RISCV/smax-reduction-unsigned-missing-sign.ll diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/smax-reduction-unsigned-missing-sign.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/smax-reduction-unsigned-missing-sign.ll new file mode 100644 index 0000000000000..e6408572acf8f --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/smax-reduction-unsigned-missing-sign.ll @@ -0,0 +1,29 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer -mtriple=riscv64-unknown-linux-gnu -mattr=+v < %s | FileCheck %s + +define i32 @test(i8 %0) { +; CHECK-LABEL: define i32 @test( +; CHECK-SAME: i8 [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i8> , i8 [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <4 x i8> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.smax.v4i1(<4 x i1> [[TMP2]]) +; CHECK-NEXT: [[TMP4:%.*]] = zext i1 [[TMP3]] to i32 +; CHECK-NEXT: ret i32 [[TMP4]] +; +entry: + %1 = icmp ne i8 0, 0 + %2 = zext i1 %1 to i32 + %3 = icmp ne i8 %0, 0 + %4 = zext i1 %3 to i32 + %5 = icmp ne i8 0, 0 + %6 = zext i1 %5 to i32 + %7 = icmp ne i8 0, 0 + %8 = 
zext i1 %7 to i32 + %cond27.2 = tail call i32 @llvm.smax.i32(i32 %4, i32 %2) + %cond27.3 = tail call i32 @llvm.smax.i32(i32 %6, i32 %cond27.2) + %cond27.4 = tail call i32 @llvm.smax.i32(i32 %8, i32 %cond27.3) + ret i32 %cond27.4 +} + +declare i32 @llvm.smax.i32(i32, i32) From ed9bcb52954f8e6171563d2b8310b0ca6d03d655 Mon Sep 17 00:00:00 2001 From: Sergei Barannikov Date: Wed, 16 Apr 2025 20:05:13 +0300 Subject: [PATCH 144/710] [CodeGen][RISCV] Add helper class for emitting CFI instructions into MIR (#135845) PR: https://github.com/llvm/llvm-project/pull/135845 --- llvm/include/llvm/CodeGen/CFIInstBuilder.h | 88 ++++++ llvm/lib/Target/RISCV/RISCVFrameLowering.cpp | 272 ++++--------------- 2 files changed, 146 insertions(+), 214 deletions(-) create mode 100644 llvm/include/llvm/CodeGen/CFIInstBuilder.h diff --git a/llvm/include/llvm/CodeGen/CFIInstBuilder.h b/llvm/include/llvm/CodeGen/CFIInstBuilder.h new file mode 100644 index 0000000000000..e799b47a0c974 --- /dev/null +++ b/llvm/include/llvm/CodeGen/CFIInstBuilder.h @@ -0,0 +1,88 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_CFIINSTBUILDER_H +#define LLVM_CODEGEN_CFIINSTBUILDER_H + +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/MC/MCDwarf.h" + +namespace llvm { + +/// Helper class for creating CFI instructions and inserting them into MIR. +class CFIInstBuilder { + MachineFunction &MF; + MachineBasicBlock &MBB; + MachineBasicBlock::iterator InsertPt; + + /// MIFlag to set on a MachineInstr. Typically, FrameSetup or FrameDestroy. 
+ MachineInstr::MIFlag MIFlag; + + /// Selects DWARF register numbering: debug or exception handling. Should be + /// consistent with the choice of the ELF section (.debug_frame or .eh_frame) + /// where CFI will be encoded. + bool IsEH; + + // Cache frequently used variables. + const TargetRegisterInfo &TRI; + const MCInstrDesc &CFIID; + const MIMetadata MIMD; // Default-initialized, no debug location desired. + +public: + CFIInstBuilder(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, + MachineInstr::MIFlag MIFlag, bool IsEH = true) + : MF(*MBB.getParent()), MBB(MBB), MIFlag(MIFlag), IsEH(IsEH), + TRI(*MF.getSubtarget().getRegisterInfo()), + CFIID(MF.getSubtarget().getInstrInfo()->get( + TargetOpcode::CFI_INSTRUCTION)) { + setInsertPoint(InsertPt); + } + + void setInsertPoint(MachineBasicBlock::iterator IP) { InsertPt = IP; } + + void insertCFIInst(const MCCFIInstruction &CFIInst) const { + BuildMI(MBB, InsertPt, MIMD, CFIID) + .addCFIIndex(MF.addFrameInst(CFIInst)) + .setMIFlag(MIFlag); + } + + void buildDefCFA(MCRegister Reg, int64_t Offset) const { + insertCFIInst(MCCFIInstruction::cfiDefCfa( + nullptr, TRI.getDwarfRegNum(Reg, IsEH), Offset)); + } + + void buildDefCFARegister(MCRegister Reg) const { + insertCFIInst(MCCFIInstruction::createDefCfaRegister( + nullptr, TRI.getDwarfRegNum(Reg, IsEH))); + } + + void buildDefCFAOffset(int64_t Offset) const { + insertCFIInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, Offset)); + } + + void buildOffset(MCRegister Reg, int64_t Offset) const { + insertCFIInst(MCCFIInstruction::createOffset( + nullptr, TRI.getDwarfRegNum(Reg, IsEH), Offset)); + } + + void buildRestore(MCRegister Reg) const { + insertCFIInst(MCCFIInstruction::createRestore( + nullptr, TRI.getDwarfRegNum(Reg, IsEH))); + } + + void buildEscape(StringRef Bytes, StringRef Comment = "") const { + insertCFIInst( + MCCFIInstruction::createEscape(nullptr, Bytes, SMLoc(), Comment)); + } +}; + +} // namespace llvm + +#endif // 
LLVM_CODEGEN_CFIINSTBUILDER_H diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index c7b2b781422d1..cefe7a732519d 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -14,6 +14,7 @@ #include "RISCVMachineFunctionInfo.h" #include "RISCVSubtarget.h" #include "llvm/BinaryFormat/Dwarf.h" +#include "llvm/CodeGen/CFIInstBuilder.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -28,64 +29,6 @@ using namespace llvm; -namespace { - -class CFISaveRegisterEmitter { - MachineFunction &MF; - MachineFrameInfo &MFI; - -public: - CFISaveRegisterEmitter(MachineFunction &MF) - : MF{MF}, MFI{MF.getFrameInfo()} {}; - - void emit(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - const RISCVRegisterInfo &RI, const RISCVInstrInfo &TII, - const DebugLoc &DL, const CalleeSavedInfo &CS) const { - int FrameIdx = CS.getFrameIdx(); - int64_t Offset = MFI.getObjectOffset(FrameIdx); - MCRegister Reg = CS.getReg(); - unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset( - nullptr, RI.getDwarfRegNum(Reg, true), Offset)); - BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlag(MachineInstr::FrameSetup); - } -}; - -class CFIRestoreRegisterEmitter { - MachineFunction &MF; - -public: - CFIRestoreRegisterEmitter(MachineFunction &MF) : MF{MF} {}; - - void emit(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - const RISCVRegisterInfo &RI, const RISCVInstrInfo &TII, - const DebugLoc &DL, const CalleeSavedInfo &CS) const { - MCRegister Reg = CS.getReg(); - unsigned CFIIndex = MF.addFrameInst( - MCCFIInstruction::createRestore(nullptr, RI.getDwarfRegNum(Reg, true))); - BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlag(MachineInstr::FrameDestroy); - } -}; - -} // namespace - -template 
-void RISCVFrameLowering::emitCFIForCSI( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - const SmallVector &CSI) const { - MachineFunction *MF = MBB.getParent(); - const RISCVRegisterInfo *RI = STI.getRegisterInfo(); - const RISCVInstrInfo *TII = STI.getInstrInfo(); - DebugLoc DL = MBB.findDebugLoc(MBBI); - - Emitter E{*MF}; - for (const auto &CS : CSI) - E.emit(MBB, MBBI, *RI, *TII, DL, CS); -} - static Align getABIStackAlignment(RISCVABI::ABI ABI) { if (ABI == RISCVABI::ABI_ILP32E) return Align(4); @@ -209,11 +152,8 @@ static void emitSCSPrologue(MachineFunction &MF, MachineBasicBlock &MBB, Offset, // addend (sleb128) }; - unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createEscape( - nullptr, StringRef(CFIInst, sizeof(CFIInst)))); - BuildMI(MBB, MI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlag(MachineInstr::FrameSetup); + CFIInstBuilder(MBB, MI, MachineInstr::FrameSetup) + .buildEscape(StringRef(CFIInst, sizeof(CFIInst))); } static void emitSCSEpilogue(MachineFunction &MF, MachineBasicBlock &MBB, @@ -257,11 +197,7 @@ static void emitSCSEpilogue(MachineFunction &MF, MachineBasicBlock &MBB, .addImm(-SlotSize) .setMIFlag(MachineInstr::FrameDestroy); // Restore the SCS pointer - unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createRestore( - nullptr, STI.getRegisterInfo()->getDwarfRegNum(SCSPReg, /*IsEH*/ true))); - BuildMI(MBB, MI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameDestroy); + CFIInstBuilder(MBB, MI, MachineInstr::FrameDestroy).buildRestore(SCSPReg); } // Get the ID of the libcall used for spilling and restoring callee saved @@ -531,14 +467,10 @@ void RISCVFrameLowering::allocateAndProbeStackForRVV( .setMIFlag(Flag); TII->mulImm(MF, MBB, MBBI, DL, TargetReg, NumOfVReg, Flag); + CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameSetup); if (EmitCFI) { // Set the CFA register to TargetReg. 
- unsigned Reg = STI.getRegisterInfo()->getDwarfRegNum(TargetReg, true); - unsigned CFIIndex = - MF.addFrameInst(MCCFIInstruction::cfiDefCfa(nullptr, Reg, -Amount)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameSetup); + CFIBuilder.buildDefCFA(TargetReg, -Amount); } // It will be expanded to a probe loop in `inlineStackProbe`. @@ -548,12 +480,7 @@ void RISCVFrameLowering::allocateAndProbeStackForRVV( if (EmitCFI) { // Set the CFA register back to SP. - unsigned Reg = STI.getRegisterInfo()->getDwarfRegNum(SPReg, true); - unsigned CFIIndex = - MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, Reg)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameSetup); + CFIBuilder.buildDefCFARegister(SPReg); } // SUB SP, SP, T1 @@ -665,20 +592,15 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB, const RISCVRegisterInfo *RI = STI.getRegisterInfo(); const RISCVInstrInfo *TII = STI.getInstrInfo(); bool IsRV64 = STI.is64Bit(); + CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameSetup); // Simply allocate the stack if it's not big enough to require a probe. 
if (!NeedProbe || Offset <= ProbeSize) { RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackOffset::getFixed(-Offset), MachineInstr::FrameSetup, getStackAlign()); - if (EmitCFI) { - // Emit ".cfi_def_cfa_offset RealStackSize" - unsigned CFIIndex = MF.addFrameInst( - MCCFIInstruction::cfiDefCfaOffset(nullptr, RealStackSize)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlag(MachineInstr::FrameSetup); - } + if (EmitCFI) + CFIBuilder.buildDefCFAOffset(RealStackSize); if (NeedProbe && DynAllocation) { // s[d|w] zero, 0(sp) @@ -707,14 +629,8 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB, .setMIFlags(MachineInstr::FrameSetup); CurrentOffset += ProbeSize; - if (EmitCFI) { - // Emit ".cfi_def_cfa_offset CurrentOffset" - unsigned CFIIndex = MF.addFrameInst( - MCCFIInstruction::cfiDefCfaOffset(nullptr, CurrentOffset)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlag(MachineInstr::FrameSetup); - } + if (EmitCFI) + CFIBuilder.buildDefCFAOffset(CurrentOffset); } uint64_t Residual = Offset - CurrentOffset; @@ -722,14 +638,8 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB, RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackOffset::getFixed(-Residual), MachineInstr::FrameSetup, getStackAlign()); - if (EmitCFI) { - // Emit ".cfi_def_cfa_offset Offset" - unsigned CFIIndex = - MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, Offset)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlag(MachineInstr::FrameSetup); - } + if (EmitCFI) + CFIBuilder.buildDefCFAOffset(Offset); if (DynAllocation) { // s[d|w] zero, 0(sp) @@ -756,12 +666,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB, if (EmitCFI) { // Set the CFA register to TargetReg. 
- unsigned Reg = STI.getRegisterInfo()->getDwarfRegNum(TargetReg, true); - unsigned CFIIndex = - MF.addFrameInst(MCCFIInstruction::cfiDefCfa(nullptr, Reg, RoundedSize)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameSetup); + CFIBuilder.buildDefCFA(TargetReg, RoundedSize); } // It will be expanded to a probe loop in `inlineStackProbe`. @@ -771,12 +676,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB, if (EmitCFI) { // Set the CFA register back to SP. - unsigned Reg = STI.getRegisterInfo()->getDwarfRegNum(SPReg, true); - unsigned CFIIndex = - MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, Reg)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameSetup); + CFIBuilder.buildDefCFARegister(SPReg); } if (Residual) { @@ -792,14 +692,8 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB, } } - if (EmitCFI) { - // Emit ".cfi_def_cfa_offset Offset" - unsigned CFIIndex = - MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, Offset)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameSetup); - } + if (EmitCFI) + CFIBuilder.buildDefCFAOffset(Offset); } static bool isPush(unsigned Opcode) { @@ -855,7 +749,6 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, MachineFrameInfo &MFI = MF.getFrameInfo(); auto *RVFI = MF.getInfo(); const RISCVRegisterInfo *RI = STI.getRegisterInfo(); - const RISCVInstrInfo *TII = STI.getInstrInfo(); MachineBasicBlock::iterator MBBI = MBB.begin(); Register BPReg = RISCVABI::getBPReg(); @@ -888,6 +781,7 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, // callee-saved register. 
MBBI = std::prev(MBBI, getRVVCalleeSavedInfo(MF, CSI).size() + getUnmanagedCSI(MF, CSI).size()); + CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameSetup); // If libcalls are used to spill and restore callee-saved registers, the frame // has two sections; the opaque section managed by the libcalls, and the @@ -915,14 +809,10 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, alignTo((STI.getXLen() / 8) * LibCallRegs, getStackAlign()); RVFI->setLibCallStackSize(LibCallFrameSize); - unsigned CFIIndex = MF.addFrameInst( - MCCFIInstruction::cfiDefCfaOffset(nullptr, LibCallFrameSize)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlag(MachineInstr::FrameSetup); - - emitCFIForCSI(MBB, MBBI, - getPushOrLibCallsSavedInfo(MF, CSI)); + CFIBuilder.buildDefCFAOffset(LibCallFrameSize); + for (const CalleeSavedInfo &CS : getPushOrLibCallsSavedInfo(MF, CSI)) + CFIBuilder.buildOffset(CS.getReg(), + MFI.getObjectOffset(CS.getFrameIdx())); } // FIXME (note copied from Lanai): This appears to be overallocating. 
Needs @@ -949,13 +839,10 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, } if (RVFI->useQCIInterrupt(MF)) { - unsigned CFIIndex = MF.addFrameInst( - MCCFIInstruction::cfiDefCfaOffset(nullptr, QCIInterruptPushAmount)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlag(MachineInstr::FrameSetup); - - emitCFIForCSI(MBB, MBBI, getQCISavedInfo(MF, CSI)); + CFIBuilder.buildDefCFAOffset(QCIInterruptPushAmount); + for (const CalleeSavedInfo &CS : getQCISavedInfo(MF, CSI)) + CFIBuilder.buildOffset(CS.getReg(), + MFI.getObjectOffset(CS.getFrameIdx())); } else if (RVFI->isPushable(MF) && FirstFrameSetup != MBB.end() && isPush(FirstFrameSetup->getOpcode())) { // Use available stack adjustment in push instruction to allocate additional @@ -967,14 +854,10 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, FirstFrameSetup->getOperand(1).setImm(StackAdj); StackSize -= StackAdj; - unsigned CFIIndex = MF.addFrameInst( - MCCFIInstruction::cfiDefCfaOffset(nullptr, RealStackSize - StackSize)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlag(MachineInstr::FrameSetup); - - emitCFIForCSI(MBB, MBBI, - getPushOrLibCallsSavedInfo(MF, CSI)); + CFIBuilder.buildDefCFAOffset(RealStackSize - StackSize); + for (const CalleeSavedInfo &CS : getPushOrLibCallsSavedInfo(MF, CSI)) + CFIBuilder.buildOffset(CS.getReg(), + MFI.getObjectOffset(CS.getFrameIdx())); } // Allocate space on the stack if necessary. @@ -995,10 +878,12 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, // FIXME: assumes exactly one instruction is used to save each callee-saved // register. std::advance(MBBI, getUnmanagedCSI(MF, CSI).size()); + CFIBuilder.setInsertPoint(MBBI); // Iterate over list of callee-saved registers and emit .cfi_offset // directives. 
- emitCFIForCSI(MBB, MBBI, getUnmanagedCSI(MF, CSI)); + for (const CalleeSavedInfo &CS : getUnmanagedCSI(MF, CSI)) + CFIBuilder.buildOffset(CS.getReg(), MFI.getObjectOffset(CS.getFrameIdx())); // Generate new FP. if (hasFP(MF)) { @@ -1017,12 +902,7 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, MachineInstr::FrameSetup, getStackAlign()); } - // Emit ".cfi_def_cfa $fp, RVFI->getVarArgsSaveSize()" - unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa( - nullptr, RI->getDwarfRegNum(FPReg, true), RVFI->getVarArgsSaveSize())); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlag(MachineInstr::FrameSetup); + CFIBuilder.buildDefCFA(FPReg, RVFI->getVarArgsSaveSize()); } uint64_t SecondSPAdjustAmount = 0; @@ -1052,11 +932,8 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, if (!hasFP(MF)) { // Emit .cfi_def_cfa_expression "sp + StackSize + RVVStackSize * vlenb". - unsigned CFIIndex = MF.addFrameInst(createDefCFAExpression( + CFIBuilder.insertCFIInst(createDefCFAExpression( *RI, SPReg, getStackSizeWithRVVPadding(MF), RVVStackSize / 8)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlag(MachineInstr::FrameSetup); } std::advance(MBBI, getRVVCalleeSavedInfo(MF, CSI).size()); @@ -1122,17 +999,13 @@ void RISCVFrameLowering::deallocateStack(MachineFunction &MF, uint64_t &StackSize, int64_t CFAOffset) const { const RISCVRegisterInfo *RI = STI.getRegisterInfo(); - const RISCVInstrInfo *TII = STI.getInstrInfo(); RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackOffset::getFixed(StackSize), MachineInstr::FrameDestroy, getStackAlign()); StackSize = 0; - unsigned CFIIndex = - MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, CFAOffset)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlag(MachineInstr::FrameDestroy); + CFIInstBuilder(MBB, MBBI, MachineInstr::FrameDestroy) + 
.buildDefCFAOffset(CFAOffset); } void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, @@ -1140,7 +1013,6 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, const RISCVRegisterInfo *RI = STI.getRegisterInfo(); MachineFrameInfo &MFI = MF.getFrameInfo(); auto *RVFI = MF.getInfo(); - const RISCVInstrInfo *TII = STI.getInstrInfo(); // All calls are tail calls in GHC calling conv, and functions have no // prologue/epilogue. @@ -1171,6 +1043,8 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, // callee-saved register. auto FirstScalarCSRRestoreInsn = std::next(MBBI, getRVVCalleeSavedInfo(MF, CSI).size()); + CFIInstBuilder CFIBuilder(MBB, FirstScalarCSRRestoreInsn, + MachineInstr::FrameDestroy); uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF); uint64_t RealStackSize = FirstSPAdjustAmount ? FirstSPAdjustAmount @@ -1191,14 +1065,8 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, StackOffset::getScalable(RVVStackSize), MachineInstr::FrameDestroy, getStackAlign()); - if (!hasFP(MF)) { - unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa( - nullptr, RI->getDwarfRegNum(SPReg, true), RealStackSize)); - BuildMI(MBB, FirstScalarCSRRestoreInsn, DL, - TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlag(MachineInstr::FrameDestroy); - } + if (!hasFP(MF)) + CFIBuilder.buildDefCFA(SPReg, RealStackSize); emitCalleeSavedRVVEpilogCFI(MBB, FirstScalarCSRRestoreInsn); } @@ -1216,14 +1084,8 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, StackOffset::getFixed(SecondSPAdjustAmount), MachineInstr::FrameDestroy, getStackAlign()); - if (!hasFP(MF)) { - unsigned CFIIndex = MF.addFrameInst( - MCCFIInstruction::cfiDefCfaOffset(nullptr, FirstSPAdjustAmount)); - BuildMI(MBB, FirstScalarCSRRestoreInsn, DL, - TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlag(MachineInstr::FrameDestroy); - } + if (!hasFP(MF)) + CFIBuilder.buildDefCFAOffset(FirstSPAdjustAmount); } // 
Restore the stack pointer using the value of the frame pointer. Only @@ -1243,19 +1105,14 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, getStackAlign()); } - if (hasFP(MF)) { - unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa( - nullptr, RI->getDwarfRegNum(SPReg, true), RealStackSize)); - BuildMI(MBB, FirstScalarCSRRestoreInsn, DL, - TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlag(MachineInstr::FrameDestroy); - } + if (hasFP(MF)) + CFIBuilder.buildDefCFA(SPReg, RealStackSize); // Skip to after the restores of scalar callee-saved registers // FIXME: assumes exactly one instruction is used to restore each // callee-saved register. MBBI = std::next(FirstScalarCSRRestoreInsn, getUnmanagedCSI(MF, CSI).size()); + CFIBuilder.setInsertPoint(MBBI); if (getLibCallID(MF, CSI) != -1) { // tail __riscv_restore_[0-12] instruction is considered as a terminator, @@ -1271,7 +1128,8 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, } // Recover callee-saved registers. - emitCFIForCSI(MBB, MBBI, getUnmanagedCSI(MF, CSI)); + for (const CalleeSavedInfo &CS : getUnmanagedCSI(MF, CSI)) + CFIBuilder.buildRestore(CS.getReg()); if (RVFI->isPushable(MF) && MBBI != MBB.end() && isPop(MBBI->getOpcode())) { // Use available stack adjustment in pop instruction to deallocate stack @@ -1290,17 +1148,14 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, auto NextI = next_nodbg(MBBI, MBB.end()); if (NextI == MBB.end() || NextI->getOpcode() != RISCV::PseudoRET) { ++MBBI; + CFIBuilder.setInsertPoint(MBBI); - emitCFIForCSI( - MBB, MBBI, getPushOrLibCallsSavedInfo(MF, CSI)); + for (const CalleeSavedInfo &CS : getPushOrLibCallsSavedInfo(MF, CSI)) + CFIBuilder.buildRestore(CS.getReg()); // Update CFA offset. After CM_POP SP should be equal to CFA, so CFA // offset should be a zero. 
- unsigned CFIIndex = - MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 0)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlag(MachineInstr::FrameDestroy); + CFIBuilder.buildDefCFAOffset(0); } } @@ -2052,9 +1907,7 @@ void RISCVFrameLowering::emitCalleeSavedRVVPrologCFI( MachineFunction *MF = MBB.getParent(); const MachineFrameInfo &MFI = MF->getFrameInfo(); RISCVMachineFunctionInfo *RVFI = MF->getInfo(); - const TargetInstrInfo &TII = *STI.getInstrInfo(); const RISCVRegisterInfo &TRI = *STI.getRegisterInfo(); - DebugLoc DL = MBB.findDebugLoc(MI); const auto &RVVCSI = getRVVCalleeSavedInfo(*MF, MFI.getCalleeSavedInfo()); if (RVVCSI.empty()) @@ -2068,17 +1921,15 @@ void RISCVFrameLowering::emitCalleeSavedRVVPrologCFI( FixedSize -= ScalarLocalVarSize; } + CFIInstBuilder CFIBuilder(MBB, MI, MachineInstr::FrameSetup); for (auto &CS : RVVCSI) { // Insert the spill to the stack frame. int FI = CS.getFrameIdx(); MCRegister BaseReg = getRVVBaseRegister(TRI, CS.getReg()); unsigned NumRegs = getCalleeSavedRVVNumRegs(CS.getReg()); for (unsigned i = 0; i < NumRegs; ++i) { - unsigned CFIIndex = MF->addFrameInst(createDefCFAOffset( + CFIBuilder.insertCFIInst(createDefCFAOffset( TRI, BaseReg + i, -FixedSize, MFI.getObjectOffset(FI) / 8 + i)); - BuildMI(MBB, MI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlag(MachineInstr::FrameSetup); } } } @@ -2087,22 +1938,15 @@ void RISCVFrameLowering::emitCalleeSavedRVVEpilogCFI( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const { MachineFunction *MF = MBB.getParent(); const MachineFrameInfo &MFI = MF->getFrameInfo(); - const RISCVRegisterInfo *RI = STI.getRegisterInfo(); - const TargetInstrInfo &TII = *STI.getInstrInfo(); const RISCVRegisterInfo &TRI = *STI.getRegisterInfo(); - DebugLoc DL = MBB.findDebugLoc(MI); + CFIInstBuilder CFIHelper(MBB, MI, MachineInstr::FrameDestroy); const auto &RVVCSI = getRVVCalleeSavedInfo(*MF, 
MFI.getCalleeSavedInfo()); for (auto &CS : RVVCSI) { MCRegister BaseReg = getRVVBaseRegister(TRI, CS.getReg()); unsigned NumRegs = getCalleeSavedRVVNumRegs(CS.getReg()); - for (unsigned i = 0; i < NumRegs; ++i) { - unsigned CFIIndex = MF->addFrameInst(MCCFIInstruction::createRestore( - nullptr, RI->getDwarfRegNum(BaseReg + i, true))); - BuildMI(MBB, MI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlag(MachineInstr::FrameDestroy); - } + for (unsigned i = 0; i < NumRegs; ++i) + CFIHelper.buildRestore(BaseReg + i); } } From 0daf20b3605f19271af7afa4175e7d62194e5578 Mon Sep 17 00:00:00 2001 From: James Newling Date: Wed, 16 Apr 2025 10:08:36 -0700 Subject: [PATCH 145/710] [mlir][vector] transpose(broadcast) -> broadcast canonicalization (#135096) Example seen in the 'real world': ``` %0 = vector.broadcast %arg0 : vector<1xi8> to vector<1x8xi8> %1 = vector.transpose %0, [1, 0] : vector<1x8xi8> to vector<8x1xi8> ``` This PR adds a canonicalizer that rewrites the above as ``` %1 = vector.broadcast %arg0 : vector<1xi8> to vector<8x1xi8> ``` It works by determining if a transpose is only shuffling contiguous broadcast dimensions. --- mlir/lib/Dialect/Vector/IR/VectorOps.cpp | 120 ++++++++++++--- mlir/test/Dialect/Vector/canonicalize.mlir | 24 --- .../Vector/canonicalize/vector-transpose.mlir | 139 ++++++++++++++++++ 3 files changed, 235 insertions(+), 48 deletions(-) create mode 100644 mlir/test/Dialect/Vector/canonicalize/vector-transpose.mlir diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp index bee5c1fd6ed58..504032a398fbe 100644 --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -6085,28 +6085,6 @@ class TransposeFolder final : public OpRewritePattern { } }; -// Folds transpose(broadcast()) into broadcast(). 
-struct FoldTransposedScalarBroadcast final - : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(vector::TransposeOp transposeOp, - PatternRewriter &rewriter) const override { - auto bcastOp = transposeOp.getVector().getDefiningOp(); - if (!bcastOp) - return failure(); - - auto srcVectorType = llvm::dyn_cast(bcastOp.getSourceType()); - if (!srcVectorType || srcVectorType.getNumElements() == 1) { - rewriter.replaceOpWithNewOp( - transposeOp, transposeOp.getResultVectorType(), bcastOp.getSource()); - return success(); - } - - return failure(); - } -}; - // Folds transpose(splat x : src_type) : res_type into splat x : res_type. class FoldTransposeSplat final : public OpRewritePattern { public: @@ -6161,12 +6139,106 @@ class FoldTransposeCreateMask final : public OpRewritePattern { } }; +/// Folds transpose(broadcast(x)) to broadcast(x) if the transpose is +/// 'order preserving', where 'order preserving' means the flattened +/// inputs and outputs of the transpose have identical (numerical) values. +/// +/// Example: +/// ``` +/// %0 = vector.broadcast %input : vector<1x1xi32> to vector<1x8xi32> +/// %1 = vector.transpose %0, [1, 0] : vector<1x8xi32> +/// to vector<8x1xi32> +/// ``` +/// can be rewritten as the equivalent +/// ``` +/// %0 = vector.broadcast %input : vector<1x1xi32> to vector<8x1xi32>. +/// ``` +/// The algorithm works by partitioning dimensions into groups that can be +/// locally permuted while preserving order, and checks that the transpose +/// only permutes within these groups. +/// +/// Groups are either contiguous sequences of 1s, or non-1s (1-element groups). +/// Consider broadcasting 4x1x1x7 to 2x3x4x5x6x7. This is equivalent to +/// broadcasting from 1x1x4x1x1x7. +/// ^^^ ^ ^^^ ^ +/// groups: 0 1 2 3 +/// Order preserving permutations for this example are ones that only permute +/// within the groups [0,1] and [3,4], like (1 0 2 4 3 5 6). 
+class FoldTransposeBroadcast : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + FoldTransposeBroadcast(MLIRContext *context, PatternBenefit benefit = 1) + : OpRewritePattern(context, benefit) {} + + LogicalResult matchAndRewrite(vector::TransposeOp transpose, + PatternRewriter &rewriter) const override { + + vector::BroadcastOp broadcast = + transpose.getVector().getDefiningOp(); + if (!broadcast) { + return rewriter.notifyMatchFailure(transpose, + "not preceded by a broadcast"); + } + + auto inputType = dyn_cast(broadcast.getSourceType()); + VectorType outputType = transpose.getResultVectorType(); + + // transpose(broadcast(scalar)) -> broadcast(scalar) is always valid + bool inputIsScalar = !inputType; + if (inputIsScalar) { + rewriter.replaceOpWithNewOp(transpose, outputType, + transpose.getVector()); + return success(); + } + + ArrayRef permutation = transpose.getPermutation(); + ArrayRef inputShape = inputType.getShape(); + int64_t inputRank = inputType.getRank(); + int64_t outputRank = transpose.getType().getRank(); + int64_t deltaRank = outputRank - inputRank; + + int low = 0; + for (int inputIndex = 0; inputIndex < inputRank; ++inputIndex) { + bool notOne = inputShape[inputIndex] != 1; + bool prevNotOne = (inputIndex != 0 && inputShape[inputIndex - 1] != 1); + bool groupEndFound = notOne || prevNotOne; + if (groupEndFound) { + int high = inputIndex + deltaRank; + // Return failure if not all permutation destinations for indices in + // [low, high) are in [low, high), i.e. the permutation is not local to + // the group. 
+ for (int i = low; i < high; ++i) { + if (permutation[i] < low || permutation[i] >= high) { + return rewriter.notifyMatchFailure( + transpose, "permutation not local to group"); + } + } + } + } + + // We don't need to check the final group [low, outputRank) because if it is + // not locally bound, there must be a preceding group that already failed + // the check (impossible to have just 1 non-locally bound group). + + // The preceding logic also ensures that at this point, the output of the + // transpose is definitely broadcastable from the input shape, assert so: + assert(vector::isBroadcastableTo(inputType, outputType) == + vector::BroadcastableToResult::Success && + "not broadcastable directly to transpose output"); + + rewriter.replaceOpWithNewOp(transpose, outputType, + transpose.getVector()); + + return success(); + } +}; + } // namespace void vector::TransposeOp::getCanonicalizationPatterns( RewritePatternSet &results, MLIRContext *context) { - results.add(context); + results.add(context); } //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/Vector/canonicalize.mlir b/mlir/test/Dialect/Vector/canonicalize.mlir index 78b0ea78849e8..733a2c67d2c0c 100644 --- a/mlir/test/Dialect/Vector/canonicalize.mlir +++ b/mlir/test/Dialect/Vector/canonicalize.mlir @@ -2218,30 +2218,6 @@ func.func @shuffle_nofold1(%v0 : vector<4xi32>, %v1 : vector<2xi32>) -> vector<5 // ----- -// CHECK-LABEL: func @transpose_scalar_broadcast1 -// CHECK-SAME: (%[[ARG:.+]]: vector<1xf32>) -// CHECK: %[[V:.+]] = vector.broadcast %[[ARG]] : vector<1xf32> to vector<1x8xf32> -// CHECK: return %[[V]] : vector<1x8xf32> -func.func @transpose_scalar_broadcast1(%value: vector<1xf32>) -> vector<1x8xf32> { - %bcast = vector.broadcast %value : vector<1xf32> to vector<8x1xf32> - %t = vector.transpose %bcast, [1, 0] : vector<8x1xf32> to vector<1x8xf32> - return %t : vector<1x8xf32> -} - -// ----- - -// CHECK-LABEL: func 
@transpose_scalar_broadcast2 -// CHECK-SAME: (%[[ARG:.+]]: f32) -// CHECK: %[[V:.+]] = vector.broadcast %[[ARG]] : f32 to vector<1x8xf32> -// CHECK: return %[[V]] : vector<1x8xf32> -func.func @transpose_scalar_broadcast2(%value: f32) -> vector<1x8xf32> { - %bcast = vector.broadcast %value : f32 to vector<8x1xf32> - %t = vector.transpose %bcast, [1, 0] : vector<8x1xf32> to vector<1x8xf32> - return %t : vector<1x8xf32> -} - -// ----- - // CHECK-LABEL: func @transpose_splat_constant // CHECK: %[[CST:.+]] = arith.constant dense<5.000000e+00> : vector<8x4xf32> // CHECK: return %[[CST]] diff --git a/mlir/test/Dialect/Vector/canonicalize/vector-transpose.mlir b/mlir/test/Dialect/Vector/canonicalize/vector-transpose.mlir new file mode 100644 index 0000000000000..e97e147459de2 --- /dev/null +++ b/mlir/test/Dialect/Vector/canonicalize/vector-transpose.mlir @@ -0,0 +1,139 @@ +// RUN: mlir-opt %s -canonicalize="test-convergence" -split-input-file -allow-unregistered-dialect | FileCheck %s + +// This file contains some canonicalizations tests involving vector.transpose. 
+ +// CHECK-LABEL: func @transpose_scalar_broadcast1 +// CHECK-SAME: (%[[ARG:.+]]: vector<1xf32>) +// CHECK: %[[V:.+]] = vector.broadcast %[[ARG]] : vector<1xf32> to vector<1x8xf32> +// CHECK: return %[[V]] : vector<1x8xf32> +func.func @transpose_scalar_broadcast1(%value: vector<1xf32>) -> vector<1x8xf32> { + %bcast = vector.broadcast %value : vector<1xf32> to vector<8x1xf32> + %t = vector.transpose %bcast, [1, 0] : vector<8x1xf32> to vector<1x8xf32> + return %t : vector<1x8xf32> +} + +// ----- + +// CHECK-LABEL: func @transpose_scalar_broadcast2 +// CHECK-SAME: (%[[ARG:.+]]: f32) +// CHECK: %[[V:.+]] = vector.broadcast %[[ARG]] : f32 to vector<1x8xf32> +// CHECK: return %[[V]] : vector<1x8xf32> +func.func @transpose_scalar_broadcast2(%value: f32) -> vector<1x8xf32> { + %bcast = vector.broadcast %value : f32 to vector<8x1xf32> + %t = vector.transpose %bcast, [1, 0] : vector<8x1xf32> to vector<1x8xf32> + return %t : vector<1x8xf32> +} + +// ----- + + +// CHECK-LABEL: broadcast_transpose_scalar_to_broadcast +// CHECK-SAME: %[[ARG:.*]]: i8) -> vector<2x3x4xi8> { +func.func @broadcast_transpose_scalar_to_broadcast(%arg0 : i8) -> vector<2x3x4xi8> { +// CHECK: %[[BC:.*]] = vector.broadcast %[[ARG]] : i8 to vector<2x3x4xi8> + %0 = vector.broadcast %arg0 : i8 to vector<3x4x2xi8> + %1 = vector.transpose %0, [2, 0, 1] : vector<3x4x2xi8> to vector<2x3x4xi8> +// CHECK: return %[[BC]] : vector<2x3x4xi8> + return %1 : vector<2x3x4xi8> +} + +// ----- + +// CHECK-LABEL: broadcast_transpose_ones_to_broadcast +// CHECK-SAME: %[[ARG:.*]]: vector<1x1x1xi8>) -> vector<2x3x4xi8> { +// CHECK: %[[RES:.*]] = vector.broadcast %[[ARG]] : vector<1x1x1xi8> to vector<2x3x4xi8> +// CHECK: return %[[RES]] : vector<2x3x4xi8> +func.func @broadcast_transpose_ones_to_broadcast(%arg0 : vector<1x1x1xi8>) -> vector<2x3x4xi8> { + %0 = vector.broadcast %arg0 : vector<1x1x1xi8> to vector<3x4x2xi8> + %1 = vector.transpose %0, [2, 0, 1] : vector<3x4x2xi8> to vector<2x3x4xi8> + return %1 : vector<2x3x4xi8> +} 
+ +// ----- + +// CHECK-LABEL: broadcast_transpose_partial_ones_to_broadcast +// CHECK-SAME: %[[ARG:.*]]: vector<1xi8>) -> vector<8x1xi8> { +// CHECK: %[[RES:.*]] = vector.broadcast %[[ARG]] : vector<1xi8> to vector<8x1xi8> +// CHECK: return %[[RES]] : vector<8x1xi8> +func.func @broadcast_transpose_partial_ones_to_broadcast(%arg0 : vector<1xi8>) -> vector<8x1xi8> { + %0 = vector.broadcast %arg0 : vector<1xi8> to vector<1x8xi8> + %1 = vector.transpose %0, [1, 0] : vector<1x8xi8> to vector<8x1xi8> + return %1 : vector<8x1xi8> +} + +// ----- + +// CHECK-LABEL: broadcast_transpose_mixed_example +// CHECK-SAME: %[[ARG:.*]]: vector<4x1x1x7xi8>) -> vector<3x2x4x5x6x7xi8> { +// CHECK: %[[RES:.*]] = vector.broadcast %[[ARG]] : vector<4x1x1x7xi8> to vector<3x2x4x5x6x7xi8> +// CHECK: return %[[RES]] : vector<3x2x4x5x6x7xi8> +func.func @broadcast_transpose_mixed_example(%arg0 : vector<4x1x1x7xi8>) -> vector<3x2x4x5x6x7xi8> { + %0 = vector.broadcast %arg0 : vector<4x1x1x7xi8> to vector<2x3x4x5x6x7xi8> + %1 = vector.transpose %0, [1, 0, 2, 3, 4, 5] : vector<2x3x4x5x6x7xi8> to vector<3x2x4x5x6x7xi8> + return %1 : vector<3x2x4x5x6x7xi8> +} + +// ----- + +// CHECK-LABEL: broadcast_transpose_final_group +// CHECK-SAME: %[[ARG:.*]]: vector<4x7x1x1xi8>) -> vector<4x7x2x3xi8> { +// CHECK: %[[RES:.*]] = vector.broadcast %[[ARG]] : vector<4x7x1x1xi8> to vector<4x7x2x3xi8> +// CHECK: return %[[RES]] : vector<4x7x2x3xi8> +func.func @broadcast_transpose_final_group(%arg0 : vector<4x7x1x1xi8>) -> vector<4x7x2x3xi8> { + %0 = vector.broadcast %arg0 : vector<4x7x1x1xi8> to vector<4x7x3x2xi8> + %1 = vector.transpose %0, [0, 1, 3, 2] : vector<4x7x3x2xi8> to vector<4x7x2x3xi8> + return %1 : vector<4x7x2x3xi8> +} + +// ----- + +// CHECK-LABEL: negative_broadcast_transpose_square +// CHECK-SAME: %[[ARG:.*]]: +// CHECK: %[[BCT:.*]] = vector.broadcast %[[ARG]] +// CHECK: %[[TRP:.*]] = vector.transpose %[[BCT]], [1, 0] +// CHECK: return %[[TRP]] : vector<4x4xi8> +func.func 
@negative_broadcast_transpose_square(%arg0 : vector<4x1xi8>) -> vector<4x4xi8> { + %0 = vector.broadcast %arg0 : vector<4x1xi8> to vector<4x4xi8> + %1 = vector.transpose %0, [1, 0] : vector<4x4xi8> to vector<4x4xi8> + return %1 : vector<4x4xi8> +} + +// ----- + +// CHECK-LABEL: negative_broadcast_transpose_hypercube +// CHECK-SAME: %[[ARG:.*]]: +// CHECK: %[[BCT:.*]] = vector.broadcast %[[ARG]] +// CHECK: %[[TRP:.*]] = vector.transpose %[[BCT]], [1, 0, 3, 2] +// CHECK: return %[[TRP]] : vector<4x4x4x4xi8> +func.func @negative_broadcast_transpose_hypercube(%arg0 : vector<1x1x4xi8>) -> vector<4x4x4x4xi8> { + %0 = vector.broadcast %arg0 : vector<1x1x4xi8> to vector<4x4x4x4xi8> + %1 = vector.transpose %0, [1, 0, 3, 2] : vector<4x4x4x4xi8> to vector<4x4x4x4xi8> + return %1 : vector<4x4x4x4xi8> +} + +// ----- + +// CHECK-LABEL: negative_broadcast_transpose_102 +// CHECK-SAME: %[[ARG:.*]]: +// CHECK: %[[BCT:.*]] = vector.broadcast %[[ARG]] +// CHECK: %[[TRP:.*]] = vector.transpose %[[BCT]], [1, 0, 2] +// CHECK: return %[[TRP]] : vector<3x3x3xi8> +func.func @negative_broadcast_transpose_102(%arg0 : vector<3x1x3xi8>) -> vector<3x3x3xi8> { + %0 = vector.broadcast %arg0 : vector<3x1x3xi8> to vector<3x3x3xi8> + %1 = vector.transpose %0, [1, 0, 2] : vector<3x3x3xi8> to vector<3x3x3xi8> + return %1 : vector<3x3x3xi8> +} + +// ----- + +// CHECK-LABEL: negative_broadcast_transpose_021 +// CHECK-SAME: %[[ARG:.*]]: +// CHECK: %[[BCT:.*]] = vector.broadcast %[[ARG]] +// CHECK: %[[TRP:.*]] = vector.transpose %[[BCT]], [0, 2, 1] +// CHECK: return %[[TRP]] : vector<3x3x3xi8> +func.func @negative_broadcast_transpose_021(%arg0 : vector<3x1x3xi8>) -> vector<3x3x3xi8> { + %0 = vector.broadcast %arg0 : vector<3x1x3xi8> to vector<3x3x3xi8> + %1 = vector.transpose %0, [0, 2, 1] : vector<3x3x3xi8> to vector<3x3x3xi8> + return %1 : vector<3x3x3xi8> +} + From d88a3a36ad26e68281873fab9a35389f6eb5c919 Mon Sep 17 00:00:00 2001 From: James Newling Date: Wed, 16 Apr 2025 10:08:57 -0700 Subject: [PATCH 
146/710] [mlir][vector] Remove redundant shape_cast(shape_cast(x)) pattern (#135447) This PR removes one OpRewritePattern `shape_cast(shape_cast(x)) -> x` that is already handled by `ShapeCastOp::fold`. Note that this might affect downstream users who indirectly call `populateShapeCastFoldingPatterns(RewritePatternSet &patterns, PatternBenefit)` and then use `patterns` with a `GreedyRewriteConfig config` that has `config.fold = false`. (only user I've checked is IREE, that never uses config.fold = false). --- .../Vector/Transforms/VectorRewritePatterns.h | 4 -- .../Transforms/VectorDropLeadUnitDim.cpp | 2 - .../Transforms/VectorTransferOpTransforms.cpp | 2 - .../Vector/Transforms/VectorTransforms.cpp | 68 +------------------ 4 files changed, 1 insertion(+), 75 deletions(-) diff --git a/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h b/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h index 7de4a6a315750..ce97847172197 100644 --- a/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h +++ b/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h @@ -306,10 +306,6 @@ void populateVectorUnrollPatterns(RewritePatternSet &patterns, const UnrollVectorOptions &options, PatternBenefit benefit = 1); -/// Collect a set of vector.shape_cast folding patterns. -void populateShapeCastFoldingPatterns(RewritePatternSet &patterns, - PatternBenefit benefit = 1); - /// Collect a set of leading one dimension removal patterns. 
/// /// These patterns insert vector.shape_cast to remove leading one dimensions diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp index fda3baf3aa390..68a44ea889470 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp @@ -8,7 +8,6 @@ #include -#include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Utils/StructuredOpsUtils.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h" @@ -577,5 +576,4 @@ void mlir::vector::populateCastAwayVectorLeadingOneDimPatterns( CastAwayConstantMaskLeadingOneDim, CastAwayTransferReadLeadingOneDim, CastAwayTransferWriteLeadingOneDim, CastAwayElementwiseLeadingOneDim, CastAwayContractionLeadingOneDim>(patterns.getContext(), benefit); - populateShapeCastFoldingPatterns(patterns, benefit); } diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp index 62dfd439b0ad1..999fb9c415886 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp @@ -976,7 +976,6 @@ void mlir::vector::populateVectorTransferDropUnitDimsPatterns( patterns .add( patterns.getContext(), benefit); - populateShapeCastFoldingPatterns(patterns); } void mlir::vector::populateFlattenVectorTransferPatterns( @@ -985,6 +984,5 @@ void mlir::vector::populateFlattenVectorTransferPatterns( patterns.add( patterns.getContext(), targetVectorBitwidth, benefit); - populateShapeCastFoldingPatterns(patterns, benefit); populateDropUnitDimWithShapeCastPatterns(patterns, benefit); } diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp index d50d5fe96f49a..89839d0440d3c 100644 --- 
a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp @@ -16,36 +16,24 @@ #include #include #include -#include -#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Arith/Utils/Utils.h" -#include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/SCF/IR/SCF.h" -#include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Dialect/Utils/IndexingUtils.h" #include "mlir/Dialect/Utils/StructuredOpsUtils.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h" #include "mlir/Dialect/Vector/Utils/VectorUtils.h" -#include "mlir/IR/BuiltinAttributeInterfaces.h" #include "mlir/IR/BuiltinTypes.h" -#include "mlir/IR/ImplicitLocOpBuilder.h" #include "mlir/IR/Location.h" #include "mlir/IR/Matchers.h" #include "mlir/IR/PatternMatch.h" #include "mlir/IR/TypeUtilities.h" -#include "mlir/Interfaces/VectorInterfaces.h" -#include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" #include "llvm/Support/FormatVariadic.h" -#include "llvm/Support/raw_ostream.h" #define DEBUG_TYPE "vector-to-vector" @@ -71,54 +59,6 @@ static std::optional getResultIndex(AffineMap map, int64_t index) { namespace { -/// ShapeCastOpFolder folds cancelling ShapeCastOps away. 
-// -// Example: -// -// The following MLIR with cancelling ShapeCastOps: -// -// %0 = source : vector<5x4x2xf32> -// %1 = shape_cast %0 : vector<5x4x2xf32> to vector<20x2xf32> -// %2 = shape_cast %1 : vector<20x2xf32> to vector<5x4x2xf32> -// %3 = user %2 : vector<5x4x2xf32> -// -// Should canonicalize to the following: -// -// %0 = source : vector<5x4x2xf32> -// %1 = user %0 : vector<5x4x2xf32> -// -struct ShapeCastOpFolder : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(vector::ShapeCastOp shapeCastOp, - PatternRewriter &rewriter) const override { - // Check if 'shapeCastOp' has vector source/result type. - auto sourceVectorType = - dyn_cast_or_null(shapeCastOp.getSource().getType()); - auto resultVectorType = - dyn_cast_or_null(shapeCastOp.getResult().getType()); - if (!sourceVectorType || !resultVectorType) - return failure(); - - // Check if shape cast op source operand is also a shape cast op. - auto sourceShapeCastOp = dyn_cast_or_null( - shapeCastOp.getSource().getDefiningOp()); - if (!sourceShapeCastOp) - return failure(); - auto operandSourceVectorType = - cast(sourceShapeCastOp.getSource().getType()); - auto operandResultVectorType = sourceShapeCastOp.getType(); - - // Check if shape cast operations invert each other. - if (operandSourceVectorType != resultVectorType || - operandResultVectorType != sourceVectorType) - return failure(); - - rewriter.replaceOp(shapeCastOp, sourceShapeCastOp.getSource()); - return success(); - } -}; - /// Convert MulIOp/MulFOp + MultiDimReductionOp into ContractionOp. 
/// Ex: /// ``` @@ -2113,11 +2053,6 @@ void mlir::vector::populateVectorMaskMaterializationPatterns( patterns.add(patterns.getContext(), benefit); } -void mlir::vector::populateShapeCastFoldingPatterns(RewritePatternSet &patterns, - PatternBenefit benefit) { - patterns.add(patterns.getContext(), benefit); -} - void mlir::vector::populateDropUnitDimWithShapeCastPatterns( RewritePatternSet &patterns, PatternBenefit benefit) { // TODO: Consider either: @@ -2126,8 +2061,7 @@ void mlir::vector::populateDropUnitDimWithShapeCastPatterns( // * better naming to distinguish this and // populateVectorTransferCollapseInnerMostContiguousDimsPatterns. patterns.add( - patterns.getContext(), benefit); + DropUnitDimsFromTransposeOp>(patterns.getContext(), benefit); } void mlir::vector::populateBubbleVectorBitCastOpPatterns( From facc57fc25d0f05f5834fed421662dbad3ec5b50 Mon Sep 17 00:00:00 2001 From: cor3ntin Date: Wed, 16 Apr 2025 19:09:45 +0200 Subject: [PATCH 147/710] [Clang][RFC] Bypass TAD during overload resolution if a perfect match exists (#133426) This implements the same overload resolution behavior as GCC, as described in https://wg21.link/p3606 (sections 1-2, not 3) If, during overload resolution, a non-template candidate is always picked because each argument is a perfect match (i.e., the source and target types are the same), we do not perform deduction for any template candidate that might exist. The goal is to be able to merge #122423 without being too disruptive. This change means that the selection of the best viable candidate and template argument deduction become interleaved. To avoid rewriting half of Clang, we store in `OverloadCandidateSet` enough information to deduce template candidates from `OverloadCandidateSet::BestViableFunction`. This means the lifetime of any object used by the template argument must outlive a call to `Add*Template*Candidate`. 
This two-phase resolution is not performed for some initialization as there are cases where template candidates are a better match per the standard. It's also bypassed for code completion. The change has a nice impact on compile times https://llvm-compile-time-tracker.com/compare.php?from=edc22c64e527171041876f26a491bb1d03d905d5&to=8170b860bd4b70917005796c05a9be013a95abb2&stat=instructions%3Au Fixes #62096 Fixes #74581 Fixes #53454 --- clang/docs/ReleaseNotes.rst | 6 + clang/include/clang/Sema/Overload.h | 224 +++++++- clang/lib/Sema/SemaCodeComplete.cpp | 6 +- clang/lib/Sema/SemaInit.cpp | 15 +- clang/lib/Sema/SemaOverload.cpp | 522 ++++++++++++++---- clang/lib/Sema/SemaTemplateDeduction.cpp | 4 +- .../constrant-satisfaction-conversions.cpp | 8 +- clang/test/SemaCUDA/function-overload.cu | 3 - .../SemaCXX/implicit-member-functions.cpp | 21 +- ...overload-resolution-deferred-templates.cpp | 185 +++++++ .../instantiate-function-params.cpp | 7 +- .../Templight/templight-empty-entries-fix.cpp | 126 ++--- 12 files changed, 910 insertions(+), 217 deletions(-) create mode 100644 clang/test/SemaCXX/overload-resolution-deferred-templates.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 0891fd058bb57..acbc9c5a6fac9 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -96,6 +96,12 @@ C++ Language Changes asm((std::string_view("nop")) ::: (std::string_view("memory"))); } +- Clang now implements the changes to overload resolution proposed by section 1 and 2 of + `P3606 `_. If a non-template candidate exists in an overload set that is + a perfect match (all conversion sequences are identity conversions) template candiates are not instantiated. + Diagnostics that would have resulted from the instantiation of these template candidates are no longer + produced. This aligns Clang closer to the behavior of GCC, and fixes (#GH62096), (#GH74581), and (#GH74581). 
+ C++2c Feature Support ^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Sema/Overload.h b/clang/include/clang/Sema/Overload.h index 6e08762dcc6d7..813811af06e89 100644 --- a/clang/include/clang/Sema/Overload.h +++ b/clang/include/clang/Sema/Overload.h @@ -407,6 +407,24 @@ class Sema; Third == ICK_Identity; } + /// A conversion sequence is perfect if it is an identity conversion and + /// the type of the source is the same as the type of the target. + bool isPerfect(const ASTContext &C) const { + if (!isIdentityConversion()) + return false; + // If we are not performing a reference binding, we can skip comparing + // the types, which has a noticeable performance impact. + if (!ReferenceBinding) { + assert(First || C.hasSameUnqualifiedType(getFromType(), getToType(2))); + return true; + } + if (!C.hasSameType(getFromType(), getToType(2))) + return false; + if (BindsToRvalue && IsLvalueReference) + return false; + return true; + } + ImplicitConversionRank getRank() const; NarrowingKind getNarrowingKind(ASTContext &Context, const Expr *Converted, @@ -743,6 +761,12 @@ class Sema; Standard.setAllToTypes(T); } + /// A conversion sequence is perfect if it is an identity conversion and + /// the type of the source is the same as the type of the target. + bool isPerfect(const ASTContext &C) const { + return isStandard() && Standard.isPerfect(C); + } + // True iff this is a conversion sequence from an initializer list to an // array or std::initializer. bool hasInitializerListContainerType() const { @@ -979,6 +1003,20 @@ class Sema; return false; } + // An overload is a perfect match if the conversion + // sequences for each argument are perfect. 
+ bool isPerfectMatch(const ASTContext &Ctx) const { + if (!Viable) + return false; + for (const auto &C : Conversions) { + if (!C.isInitialized() || !C.isPerfect(Ctx)) + return false; + } + if (isa_and_nonnull(Function)) + return FinalConversion.isPerfect(Ctx); + return true; + } + bool TryToFixBadConversion(unsigned Idx, Sema &S) { bool CanFix = Fix.tryToFixConversion( Conversions[Idx].Bad.FromExpr, @@ -1015,6 +1053,65 @@ class Sema; RewriteKind(CRK_None) {} }; + struct DeferredTemplateOverloadCandidate { + + // intrusive linked list support for allocateDeferredCandidate + DeferredTemplateOverloadCandidate *Next = nullptr; + + enum Kind { Function, Method, Conversion }; + + LLVM_PREFERRED_TYPE(Kind) + unsigned Kind : 2; + LLVM_PREFERRED_TYPE(bool) + unsigned AllowObjCConversionOnExplicit : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned AllowResultConversion : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned AllowExplicit : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned SuppressUserConversions : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned PartialOverloading : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned AggregateCandidateDeduction : 1; + }; + + struct DeferredFunctionTemplateOverloadCandidate + : public DeferredTemplateOverloadCandidate { + FunctionTemplateDecl *FunctionTemplate; + DeclAccessPair FoundDecl; + ArrayRef Args; + CallExpr::ADLCallKind IsADLCandidate; + OverloadCandidateParamOrder PO; + }; + static_assert(std::is_trivially_destructible_v< + DeferredFunctionTemplateOverloadCandidate>); + + struct DeferredMethodTemplateOverloadCandidate + : public DeferredTemplateOverloadCandidate { + FunctionTemplateDecl *FunctionTemplate; + DeclAccessPair FoundDecl; + ArrayRef Args; + CXXRecordDecl *ActingContext; + Expr::Classification ObjectClassification; + QualType ObjectType; + OverloadCandidateParamOrder PO; + }; + static_assert(std::is_trivially_destructible_v< + DeferredMethodTemplateOverloadCandidate>); + + struct DeferredConversionTemplateOverloadCandidate + : public 
DeferredTemplateOverloadCandidate { + FunctionTemplateDecl *FunctionTemplate; + DeclAccessPair FoundDecl; + CXXRecordDecl *ActingContext; + Expr *From; + QualType ToType; + }; + + static_assert(std::is_trivially_destructible_v< + DeferredConversionTemplateOverloadCandidate>); + /// OverloadCandidateSet - A set of overload candidates, used in C++ /// overload resolution (C++ 13.3). class OverloadCandidateSet { @@ -1043,6 +1140,11 @@ class Sema; /// C++ [over.match.call.general] /// Resolve a call through the address of an overload set. CSK_AddressOfOverloadSet, + + /// When doing overload resolution during code completion, + /// we want to show all viable candidates, including otherwise + /// deferred template candidates. + CSK_CodeCompletion, }; /// Information about operator rewrites to consider when adding operator @@ -1117,7 +1219,15 @@ class Sema; SmallVector Candidates; llvm::SmallPtrSet Functions; - // Allocator for ConversionSequenceLists. We store the first few of these + DeferredTemplateOverloadCandidate *FirstDeferredCandidate = nullptr; + unsigned DeferredCandidatesCount : 8 * sizeof(unsigned) - 2; + LLVM_PREFERRED_TYPE(bool) + unsigned HasDeferredTemplateConstructors : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned ResolutionByPerfectCandidateIsDisabled : 1; + + // Allocator for ConversionSequenceLists and deferred candidate args. + // We store the first few of these // inline to avoid allocation for small sets. llvm::BumpPtrAllocator SlabAllocator; @@ -1125,8 +1235,11 @@ class Sema; CandidateSetKind Kind; OperatorRewriteInfo RewriteInfo; + /// Small storage size for ImplicitConversionSequences + /// and the persisted arguments of deferred candidates. constexpr static unsigned NumInlineBytes = - 24 * sizeof(ImplicitConversionSequence); + 32 * sizeof(ImplicitConversionSequence); + unsigned NumInlineBytesUsed = 0; alignas(void *) char InlineSpace[NumInlineBytes]; @@ -1137,15 +1250,13 @@ class Sema; /// from the slab allocator. 
/// FIXME: It would probably be nice to have a SmallBumpPtrAllocator /// instead. - /// FIXME: Now that this only allocates ImplicitConversionSequences, do we - /// want to un-generalize this? template T *slabAllocate(unsigned N) { // It's simpler if this doesn't need to consider alignment. static_assert(alignof(T) == alignof(void *), "Only works for pointer-aligned types."); - static_assert(std::is_trivial::value || - std::is_same::value, + static_assert(std::is_trivially_destructible_v || + (std::is_same_v), "Add destruction logic to OverloadCandidateSet::clear()."); unsigned NBytes = sizeof(T) * N; @@ -1159,12 +1270,34 @@ class Sema; return reinterpret_cast(FreeSpaceStart); } + // Because the size of OverloadCandidateSet has a noticeable impact on + // performance, we store each deferred template candidate in the slab + // allocator such that deferred candidates are ultimately a singly-linked + // intrusive linked list. This ends up being much more efficient than a + // SmallVector that is empty in the common case. + template T *allocateDeferredCandidate() { + T *C = slabAllocate(1); + if (!FirstDeferredCandidate) + FirstDeferredCandidate = C; + else { + auto *F = FirstDeferredCandidate; + while (F->Next) + F = F->Next; + F->Next = C; + } + DeferredCandidatesCount++; + return C; + } + void destroyCandidates(); public: OverloadCandidateSet(SourceLocation Loc, CandidateSetKind CSK, OperatorRewriteInfo RewriteInfo = {}) - : Loc(Loc), Kind(CSK), RewriteInfo(RewriteInfo) {} + : FirstDeferredCandidate(nullptr), DeferredCandidatesCount(0), + HasDeferredTemplateConstructors(false), + ResolutionByPerfectCandidateIsDisabled(false), Loc(Loc), Kind(CSK), + RewriteInfo(RewriteInfo) {} OverloadCandidateSet(const OverloadCandidateSet &) = delete; OverloadCandidateSet &operator=(const OverloadCandidateSet &) = delete; ~OverloadCandidateSet() { destroyCandidates(); } @@ -1176,6 +1309,9 @@ class Sema; /// Whether diagnostics should be deferred. 
bool shouldDeferDiags(Sema &S, ArrayRef Args, SourceLocation OpLoc); + // Whether the resolution of template candidates should be deferred + bool shouldDeferTemplateArgumentDeduction(const LangOptions &Opts) const; + /// Determine when this overload candidate will be new to the /// overload set. bool isNewCandidate(Decl *F, OverloadCandidateParamOrder PO = @@ -1199,8 +1335,10 @@ class Sema; iterator begin() { return Candidates.begin(); } iterator end() { return Candidates.end(); } - size_t size() const { return Candidates.size(); } - bool empty() const { return Candidates.empty(); } + size_t size() const { return Candidates.size() + DeferredCandidatesCount; } + bool empty() const { + return Candidates.empty() && DeferredCandidatesCount == 0; + } /// Allocate storage for conversion sequences for NumConversions /// conversions. @@ -1216,6 +1354,24 @@ class Sema; return ConversionSequenceList(Conversions, NumConversions); } + /// Provide storage for any Expr* arg that must be preserved + /// until deferred template candidates are deduced. + /// Typically this should be used for reversed operator arguments + /// and any time the argument array is transformed while adding + /// a template candidate. + llvm::MutableArrayRef getPersistentArgsArray(unsigned N) { + Expr **Exprs = slabAllocate(N); + return llvm::MutableArrayRef(Exprs, N); + } + + template + llvm::MutableArrayRef getPersistentArgsArray(T *...Exprs) { + llvm::MutableArrayRef Arr = + getPersistentArgsArray(sizeof...(Exprs)); + llvm::copy(std::initializer_list{Exprs...}, Arr.data()); + return Arr; + } + /// Add a new candidate with NumConversions conversion sequence slots /// to the overload set. 
OverloadCandidate &addCandidate(unsigned NumConversions = 0, @@ -1231,6 +1387,32 @@ class Sema; return C; } + void AddDeferredTemplateCandidate( + FunctionTemplateDecl *FunctionTemplate, DeclAccessPair FoundDecl, + ArrayRef Args, bool SuppressUserConversions, + bool PartialOverloading, bool AllowExplicit, + CallExpr::ADLCallKind IsADLCandidate, OverloadCandidateParamOrder PO, + bool AggregateCandidateDeduction); + + void AddDeferredMethodTemplateCandidate( + FunctionTemplateDecl *MethodTmpl, DeclAccessPair FoundDecl, + CXXRecordDecl *ActingContext, QualType ObjectType, + Expr::Classification ObjectClassification, ArrayRef Args, + bool SuppressUserConversions, bool PartialOverloading, + OverloadCandidateParamOrder PO); + + void AddDeferredConversionTemplateCandidate( + FunctionTemplateDecl *FunctionTemplate, DeclAccessPair FoundDecl, + CXXRecordDecl *ActingContext, Expr *From, QualType ToType, + bool AllowObjCConversionOnExplicit, bool AllowExplicit, + bool AllowResultConversion); + + void InjectNonDeducedTemplateCandidates(Sema &S); + + void DisableResolutionByPerfectCandidate() { + ResolutionByPerfectCandidateIsDisabled = true; + } + /// Find the best viable function on this overload set, if it exists. OverloadingResult BestViableFunction(Sema &S, SourceLocation Loc, OverloadCandidateSet::iterator& Best); @@ -1263,6 +1445,15 @@ class Sema; DestAS = AS; } + private: + OverloadingResult ResultForBestCandidate(const iterator &Best); + void CudaExcludeWrongSideCandidates( + Sema &S, SmallVectorImpl &Candidates); + OverloadingResult + BestViableFunctionImpl(Sema &S, SourceLocation Loc, + OverloadCandidateSet::iterator &Best); + void PerfectViableFunction(Sema &S, SourceLocation Loc, + OverloadCandidateSet::iterator &Best); }; bool isBetterOverloadCandidate(Sema &S, const OverloadCandidate &Cand1, @@ -1311,6 +1502,21 @@ class Sema; // parameter. 
bool shouldEnforceArgLimit(bool PartialOverloading, FunctionDecl *Function); + inline bool OverloadCandidateSet::shouldDeferTemplateArgumentDeduction( + const LangOptions &Opts) const { + return + // For user defined conversion we need to check against different + // combination of CV qualifiers and look at any explicit specifier, so + // always deduce template candidates. + Kind != CSK_InitByUserDefinedConversion + // When doing code completion, we want to see all the + // viable candidates. + && Kind != CSK_CodeCompletion + // CUDA may prefer template candidates even when a non-candidate + // is a perfect match + && !Opts.CUDA; + } + } // namespace clang #endif // LLVM_CLANG_SEMA_OVERLOAD_H diff --git a/clang/lib/Sema/SemaCodeComplete.cpp b/clang/lib/Sema/SemaCodeComplete.cpp index f6ec4cb0f069e..45405d4709e14 100644 --- a/clang/lib/Sema/SemaCodeComplete.cpp +++ b/clang/lib/Sema/SemaCodeComplete.cpp @@ -6354,7 +6354,8 @@ SemaCodeCompletion::ProduceCallSignatureHelp(Expr *Fn, ArrayRef Args, Expr *NakedFn = Fn->IgnoreParenCasts(); // Build an overload candidate set based on the functions we find. SourceLocation Loc = Fn->getExprLoc(); - OverloadCandidateSet CandidateSet(Loc, OverloadCandidateSet::CSK_Normal); + OverloadCandidateSet CandidateSet(Loc, + OverloadCandidateSet::CSK_CodeCompletion); if (auto ULE = dyn_cast(NakedFn)) { SemaRef.AddOverloadedCallCandidates(ULE, ArgsWithoutDependentTypes, @@ -6557,7 +6558,8 @@ QualType SemaCodeCompletion::ProduceConstructorSignatureHelp( // FIXME: Provide support for variadic template constructors. 
if (CRD) { - OverloadCandidateSet CandidateSet(Loc, OverloadCandidateSet::CSK_Normal); + OverloadCandidateSet CandidateSet(Loc, + OverloadCandidateSet::CSK_CodeCompletion); for (NamedDecl *C : SemaRef.LookupConstructors(CRD)) { if (auto *FD = dyn_cast(C)) { // FIXME: we can't yet provide correct signature help for initializer diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index a1e4bb4321d53..82489847b589b 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -10029,12 +10029,19 @@ QualType Sema::DeduceTemplateSpecializationFromInitializer( // When [...] the constructor [...] is a candidate by // - [over.match.copy] (in all cases) if (TD) { - SmallVector TmpInits; - for (Expr *E : Inits) + + // As template candidates are not deduced immediately, + // persist the array in the overload set. + MutableArrayRef TmpInits = + Candidates.getPersistentArgsArray(Inits.size()); + + for (auto [I, E] : llvm::enumerate(Inits)) { if (auto *DI = dyn_cast(E)) - TmpInits.push_back(DI->getInit()); + TmpInits[I] = DI->getInit(); else - TmpInits.push_back(E); + TmpInits[I] = E; + } + AddTemplateOverloadCandidate( TD, FoundDecl, /*ExplicitArgs=*/nullptr, TmpInits, Candidates, /*SuppressUserConversions=*/false, diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 55634aa75ae25..deef01c946feb 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -1123,6 +1123,10 @@ void OverloadCandidateSet::clear(CandidateSetKind CSK) { Candidates.clear(); Functions.clear(); Kind = CSK; + FirstDeferredCandidate = nullptr; + DeferredCandidatesCount = 0; + HasDeferredTemplateConstructors = false; + ResolutionByPerfectCandidateIsDisabled = false; } namespace { @@ -7795,15 +7799,14 @@ void Sema::AddMethodCandidate( } } -void Sema::AddMethodTemplateCandidate( +static void AddMethodTemplateCandidateImmediately( + Sema &S, OverloadCandidateSet &CandidateSet, FunctionTemplateDecl *MethodTmpl, 
DeclAccessPair FoundDecl, CXXRecordDecl *ActingContext, TemplateArgumentListInfo *ExplicitTemplateArgs, QualType ObjectType, Expr::Classification ObjectClassification, ArrayRef Args, - OverloadCandidateSet &CandidateSet, bool SuppressUserConversions, - bool PartialOverloading, OverloadCandidateParamOrder PO) { - if (!CandidateSet.isNewCandidate(MethodTmpl, PO)) - return; + bool SuppressUserConversions, bool PartialOverloading, + OverloadCandidateParamOrder PO) { // C++ [over.match.funcs]p7: // In each case where a candidate is a function template, candidate @@ -7817,12 +7820,12 @@ void Sema::AddMethodTemplateCandidate( TemplateDeductionInfo Info(CandidateSet.getLocation()); FunctionDecl *Specialization = nullptr; ConversionSequenceList Conversions; - if (TemplateDeductionResult Result = DeduceTemplateArguments( + if (TemplateDeductionResult Result = S.DeduceTemplateArguments( MethodTmpl, ExplicitTemplateArgs, Args, Specialization, Info, PartialOverloading, /*AggregateDeductionCandidate=*/false, /*PartialOrdering=*/false, ObjectType, ObjectClassification, [&](ArrayRef ParamTypes) { - return CheckNonDependentConversions( + return S.CheckNonDependentConversions( MethodTmpl, ParamTypes, Args, CandidateSet, Conversions, SuppressUserConversions, ActingContext, ObjectType, ObjectClassification, PO); @@ -7844,8 +7847,8 @@ void Sema::AddMethodTemplateCandidate( Candidate.FailureKind = ovl_fail_bad_conversion; else { Candidate.FailureKind = ovl_fail_bad_deduction; - Candidate.DeductionFailure = MakeDeductionFailureInfo(Context, Result, - Info); + Candidate.DeductionFailure = + MakeDeductionFailureInfo(S.Context, Result, Info); } return; } @@ -7855,10 +7858,34 @@ void Sema::AddMethodTemplateCandidate( assert(Specialization && "Missing member function template specialization?"); assert(isa(Specialization) && "Specialization is not a member function?"); - AddMethodCandidate(cast(Specialization), FoundDecl, - ActingContext, ObjectType, ObjectClassification, Args, - CandidateSet, 
SuppressUserConversions, PartialOverloading, - Conversions, PO, Info.hasStrictPackMatch()); + S.AddMethodCandidate( + cast(Specialization), FoundDecl, ActingContext, ObjectType, + ObjectClassification, Args, CandidateSet, SuppressUserConversions, + PartialOverloading, Conversions, PO, Info.hasStrictPackMatch()); +} + +void Sema::AddMethodTemplateCandidate( + FunctionTemplateDecl *MethodTmpl, DeclAccessPair FoundDecl, + CXXRecordDecl *ActingContext, + TemplateArgumentListInfo *ExplicitTemplateArgs, QualType ObjectType, + Expr::Classification ObjectClassification, ArrayRef Args, + OverloadCandidateSet &CandidateSet, bool SuppressUserConversions, + bool PartialOverloading, OverloadCandidateParamOrder PO) { + if (!CandidateSet.isNewCandidate(MethodTmpl, PO)) + return; + + if (ExplicitTemplateArgs || + !CandidateSet.shouldDeferTemplateArgumentDeduction(getLangOpts())) { + AddMethodTemplateCandidateImmediately( + *this, CandidateSet, MethodTmpl, FoundDecl, ActingContext, + ExplicitTemplateArgs, ObjectType, ObjectClassification, Args, + SuppressUserConversions, PartialOverloading, PO); + return; + } + + CandidateSet.AddDeferredMethodTemplateCandidate( + MethodTmpl, FoundDecl, ActingContext, ObjectType, ObjectClassification, + Args, SuppressUserConversions, PartialOverloading, PO); } /// Determine whether a given function template has a simple explicit specifier @@ -7867,14 +7894,18 @@ static bool isNonDependentlyExplicit(FunctionTemplateDecl *FTD) { return ExplicitSpecifier::getFromDecl(FTD->getTemplatedDecl()).isExplicit(); } -void Sema::AddTemplateOverloadCandidate( +static bool hasDependentExplicit(FunctionTemplateDecl *FTD) { + return ExplicitSpecifier::getFromDecl(FTD->getTemplatedDecl()).getKind() == + ExplicitSpecKind::Unresolved; +} + +static void AddTemplateOverloadCandidateImmediately( + Sema &S, OverloadCandidateSet &CandidateSet, FunctionTemplateDecl *FunctionTemplate, DeclAccessPair FoundDecl, TemplateArgumentListInfo *ExplicitTemplateArgs, ArrayRef Args, - 
OverloadCandidateSet &CandidateSet, bool SuppressUserConversions, - bool PartialOverloading, bool AllowExplicit, ADLCallKind IsADLCandidate, - OverloadCandidateParamOrder PO, bool AggregateCandidateDeduction) { - if (!CandidateSet.isNewCandidate(FunctionTemplate, PO)) - return; + bool SuppressUserConversions, bool PartialOverloading, bool AllowExplicit, + Sema::ADLCallKind IsADLCandidate, OverloadCandidateParamOrder PO, + bool AggregateCandidateDeduction) { // If the function template has a non-dependent explicit specification, // exclude it now if appropriate; we are not permitted to perform deduction @@ -7901,14 +7932,14 @@ void Sema::AddTemplateOverloadCandidate( FunctionTemplate->getTemplateDepth()); FunctionDecl *Specialization = nullptr; ConversionSequenceList Conversions; - if (TemplateDeductionResult Result = DeduceTemplateArguments( + if (TemplateDeductionResult Result = S.DeduceTemplateArguments( FunctionTemplate, ExplicitTemplateArgs, Args, Specialization, Info, PartialOverloading, AggregateCandidateDeduction, /*PartialOrdering=*/false, /*ObjectType=*/QualType(), /*ObjectClassification=*/Expr::Classification(), [&](ArrayRef ParamTypes) { - return CheckNonDependentConversions( + return S.CheckNonDependentConversions( FunctionTemplate, ParamTypes, Args, CandidateSet, Conversions, SuppressUserConversions, nullptr, QualType(), {}, PO); }); @@ -7932,8 +7963,8 @@ void Sema::AddTemplateOverloadCandidate( Candidate.FailureKind = ovl_fail_bad_conversion; else { Candidate.FailureKind = ovl_fail_bad_deduction; - Candidate.DeductionFailure = MakeDeductionFailureInfo(Context, Result, - Info); + Candidate.DeductionFailure = + MakeDeductionFailureInfo(S.Context, Result, Info); } return; } @@ -7941,7 +7972,7 @@ void Sema::AddTemplateOverloadCandidate( // Add the function template specialization produced by template argument // deduction as a candidate. 
assert(Specialization && "Missing function template specialization?"); - AddOverloadCandidate( + S.AddOverloadCandidate( Specialization, FoundDecl, Args, CandidateSet, SuppressUserConversions, PartialOverloading, AllowExplicit, /*AllowExplicitConversions=*/false, IsADLCandidate, Conversions, PO, @@ -7949,6 +7980,38 @@ void Sema::AddTemplateOverloadCandidate( Info.hasStrictPackMatch()); } +void Sema::AddTemplateOverloadCandidate( + FunctionTemplateDecl *FunctionTemplate, DeclAccessPair FoundDecl, + TemplateArgumentListInfo *ExplicitTemplateArgs, ArrayRef Args, + OverloadCandidateSet &CandidateSet, bool SuppressUserConversions, + bool PartialOverloading, bool AllowExplicit, ADLCallKind IsADLCandidate, + OverloadCandidateParamOrder PO, bool AggregateCandidateDeduction) { + if (!CandidateSet.isNewCandidate(FunctionTemplate, PO)) + return; + + bool DependentExplicitSpecifier = hasDependentExplicit(FunctionTemplate); + + if (ExplicitTemplateArgs || + !CandidateSet.shouldDeferTemplateArgumentDeduction(getLangOpts()) || + (isa(FunctionTemplate->getTemplatedDecl()) && + DependentExplicitSpecifier)) { + + AddTemplateOverloadCandidateImmediately( + *this, CandidateSet, FunctionTemplate, FoundDecl, ExplicitTemplateArgs, + Args, SuppressUserConversions, PartialOverloading, AllowExplicit, + IsADLCandidate, PO, AggregateCandidateDeduction); + + if (DependentExplicitSpecifier) + CandidateSet.DisableResolutionByPerfectCandidate(); + return; + } + + CandidateSet.AddDeferredTemplateCandidate( + FunctionTemplate, FoundDecl, Args, SuppressUserConversions, + PartialOverloading, AllowExplicit, IsADLCandidate, PO, + AggregateCandidateDeduction); +} + bool Sema::CheckNonDependentConversions( FunctionTemplateDecl *FunctionTemplate, ArrayRef ParamTypes, ArrayRef Args, OverloadCandidateSet &CandidateSet, @@ -8264,16 +8327,12 @@ void Sema::AddConversionCandidate( } } -void Sema::AddTemplateConversionCandidate( +static void AddTemplateConversionCandidateImmediately( + Sema &S, 
OverloadCandidateSet &CandidateSet, FunctionTemplateDecl *FunctionTemplate, DeclAccessPair FoundDecl, - CXXRecordDecl *ActingDC, Expr *From, QualType ToType, - OverloadCandidateSet &CandidateSet, bool AllowObjCConversionOnExplicit, - bool AllowExplicit, bool AllowResultConversion) { - assert(isa(FunctionTemplate->getTemplatedDecl()) && - "Only conversion function templates permitted here"); - - if (!CandidateSet.isNewCandidate(FunctionTemplate)) - return; + CXXRecordDecl *ActingContext, Expr *From, QualType ToType, + bool AllowObjCConversionOnExplicit, bool AllowExplicit, + bool AllowResultConversion) { // If the function template has a non-dependent explicit specification, // exclude it now if appropriate; we are not permitted to perform deduction @@ -8288,11 +8347,11 @@ void Sema::AddTemplateConversionCandidate( } QualType ObjectType = From->getType(); - Expr::Classification ObjectClassification = From->Classify(getASTContext()); + Expr::Classification ObjectClassification = From->Classify(S.Context); TemplateDeductionInfo Info(CandidateSet.getLocation()); CXXConversionDecl *Specialization = nullptr; - if (TemplateDeductionResult Result = DeduceTemplateArguments( + if (TemplateDeductionResult Result = S.DeduceTemplateArguments( FunctionTemplate, ObjectType, ObjectClassification, ToType, Specialization, Info); Result != TemplateDeductionResult::Success) { @@ -8302,18 +8361,47 @@ void Sema::AddTemplateConversionCandidate( Candidate.Viable = false; Candidate.FailureKind = ovl_fail_bad_deduction; Candidate.ExplicitCallArguments = 1; - Candidate.DeductionFailure = MakeDeductionFailureInfo(Context, Result, - Info); + Candidate.DeductionFailure = + MakeDeductionFailureInfo(S.Context, Result, Info); return; } // Add the conversion function template specialization produced by // template argument deduction as a candidate. 
assert(Specialization && "Missing function template specialization?"); - AddConversionCandidate(Specialization, FoundDecl, ActingDC, From, ToType, - CandidateSet, AllowObjCConversionOnExplicit, - AllowExplicit, AllowResultConversion, - Info.hasStrictPackMatch()); + S.AddConversionCandidate(Specialization, FoundDecl, ActingContext, From, + ToType, CandidateSet, AllowObjCConversionOnExplicit, + AllowExplicit, AllowResultConversion, + Info.hasStrictPackMatch()); +} + +void Sema::AddTemplateConversionCandidate( + FunctionTemplateDecl *FunctionTemplate, DeclAccessPair FoundDecl, + CXXRecordDecl *ActingDC, Expr *From, QualType ToType, + OverloadCandidateSet &CandidateSet, bool AllowObjCConversionOnExplicit, + bool AllowExplicit, bool AllowResultConversion) { + assert(isa(FunctionTemplate->getTemplatedDecl()) && + "Only conversion function templates permitted here"); + + if (!CandidateSet.isNewCandidate(FunctionTemplate)) + return; + + if (!CandidateSet.shouldDeferTemplateArgumentDeduction(getLangOpts()) || + CandidateSet.getKind() == + OverloadCandidateSet::CSK_InitByUserDefinedConversion || + CandidateSet.getKind() == OverloadCandidateSet::CSK_InitByConstructor) { + AddTemplateConversionCandidateImmediately( + *this, CandidateSet, FunctionTemplate, FoundDecl, ActingDC, From, + ToType, AllowObjCConversionOnExplicit, AllowExplicit, + AllowResultConversion); + + CandidateSet.DisableResolutionByPerfectCandidate(); + return; + } + + CandidateSet.AddDeferredConversionTemplateCandidate( + FunctionTemplate, FoundDecl, ActingDC, From, ToType, + AllowObjCConversionOnExplicit, AllowExplicit, AllowResultConversion); } void Sema::AddSurrogateCandidate(CXXConversionDecl *Conversion, @@ -8463,11 +8551,17 @@ void Sema::AddNonMemberOperatorCandidates( if (FunTmpl) { AddTemplateOverloadCandidate(FunTmpl, F.getPair(), ExplicitTemplateArgs, FunctionArgs, CandidateSet); - if (CandidateSet.getRewriteInfo().shouldAddReversed(*this, Args, FD)) - AddTemplateOverloadCandidate( - FunTmpl, 
F.getPair(), ExplicitTemplateArgs, - {FunctionArgs[1], FunctionArgs[0]}, CandidateSet, false, false, - true, ADLCallKind::NotADL, OverloadCandidateParamOrder::Reversed); + if (CandidateSet.getRewriteInfo().shouldAddReversed(*this, Args, FD)) { + + // As template candidates are not deduced immediately, + // persist the array in the overload set. + ArrayRef Reversed = CandidateSet.getPersistentArgsArray( + FunctionArgs[1], FunctionArgs[0]); + AddTemplateOverloadCandidate(FunTmpl, F.getPair(), ExplicitTemplateArgs, + Reversed, CandidateSet, false, false, true, + ADLCallKind::NotADL, + OverloadCandidateParamOrder::Reversed); + } } else { if (ExplicitTemplateArgs) continue; @@ -10199,6 +10293,8 @@ Sema::AddArgumentDependentLookupCandidates(DeclarationName Name, // FIXME: Pass in the explicit template arguments? ArgumentDependentLookup(Name, Loc, Args, Fns); + ArrayRef ReversedArgs; + // Erase all of the candidates we already knew about. for (OverloadCandidateSet::iterator Cand = CandidateSet.begin(), CandEnd = CandidateSet.end(); @@ -10238,9 +10334,15 @@ Sema::AddArgumentDependentLookupCandidates(DeclarationName Name, /*AllowExplicit=*/true, ADLCallKind::UsesADL); if (CandidateSet.getRewriteInfo().shouldAddReversed( *this, Args, FTD->getTemplatedDecl())) { + + // As template candidates are not deduced immediately, + // persist the array in the overload set. 
+ if (ReversedArgs.empty()) + ReversedArgs = CandidateSet.getPersistentArgsArray(Args[1], Args[0]); + AddTemplateOverloadCandidate( - FTD, FoundDecl, ExplicitTemplateArgs, {Args[1], Args[0]}, - CandidateSet, /*SuppressUserConversions=*/false, PartialOverloading, + FTD, FoundDecl, ExplicitTemplateArgs, ReversedArgs, CandidateSet, + /*SuppressUserConversions=*/false, PartialOverloading, /*AllowExplicit=*/true, ADLCallKind::UsesADL, OverloadCandidateParamOrder::Reversed); } @@ -10913,23 +11015,147 @@ bool OverloadCandidate::NotValidBecauseConstraintExprHasError() const { ->Satisfaction.ContainsErrors; } -/// Computes the best viable function (C++ 13.3.3) -/// within an overload candidate set. -/// -/// \param Loc The location of the function name (or operator symbol) for -/// which overload resolution occurs. -/// -/// \param Best If overload resolution was successful or found a deleted -/// function, \p Best points to the candidate function found. -/// -/// \returns The result of overload resolution. 
+void OverloadCandidateSet::AddDeferredTemplateCandidate( + FunctionTemplateDecl *FunctionTemplate, DeclAccessPair FoundDecl, + ArrayRef Args, bool SuppressUserConversions, + bool PartialOverloading, bool AllowExplicit, + CallExpr::ADLCallKind IsADLCandidate, OverloadCandidateParamOrder PO, + bool AggregateCandidateDeduction) { + + auto *C = + allocateDeferredCandidate(); + + C = new (C) DeferredFunctionTemplateOverloadCandidate{ + {nullptr, DeferredFunctionTemplateOverloadCandidate::Function, + /*AllowObjCConversionOnExplicit=*/false, + /*AllowResultConversion=*/false, AllowExplicit, SuppressUserConversions, + PartialOverloading, AggregateCandidateDeduction}, + FunctionTemplate, + FoundDecl, + Args, + IsADLCandidate, + PO}; + + HasDeferredTemplateConstructors |= + isa(FunctionTemplate->getTemplatedDecl()); +} + +void OverloadCandidateSet::AddDeferredMethodTemplateCandidate( + FunctionTemplateDecl *MethodTmpl, DeclAccessPair FoundDecl, + CXXRecordDecl *ActingContext, QualType ObjectType, + Expr::Classification ObjectClassification, ArrayRef Args, + bool SuppressUserConversions, bool PartialOverloading, + OverloadCandidateParamOrder PO) { + + assert(!isa(MethodTmpl->getTemplatedDecl())); + + auto *C = + allocateDeferredCandidate(); + + C = new (C) DeferredMethodTemplateOverloadCandidate{ + {nullptr, DeferredFunctionTemplateOverloadCandidate::Method, + /*AllowObjCConversionOnExplicit=*/false, + /*AllowResultConversion=*/false, + /*AllowExplicit=*/false, SuppressUserConversions, PartialOverloading, + /*AggregateCandidateDeduction=*/false}, + MethodTmpl, + FoundDecl, + Args, + ActingContext, + ObjectClassification, + ObjectType, + PO}; +} + +void OverloadCandidateSet::AddDeferredConversionTemplateCandidate( + FunctionTemplateDecl *FunctionTemplate, DeclAccessPair FoundDecl, + CXXRecordDecl *ActingContext, Expr *From, QualType ToType, + bool AllowObjCConversionOnExplicit, bool AllowExplicit, + bool AllowResultConversion) { + + auto *C = + allocateDeferredCandidate(); + 
+ C = new (C) DeferredConversionTemplateOverloadCandidate{ + {nullptr, DeferredFunctionTemplateOverloadCandidate::Conversion, + AllowObjCConversionOnExplicit, AllowResultConversion, + /*AllowExplicit=*/false, + /*SuppressUserConversions=*/false, + /*PartialOverloading*/ false, + /*AggregateCandidateDeduction=*/false}, + FunctionTemplate, + FoundDecl, + ActingContext, + From, + ToType}; +} + +static void +AddTemplateOverloadCandidate(Sema &S, OverloadCandidateSet &CandidateSet, + DeferredMethodTemplateOverloadCandidate &C) { + + AddMethodTemplateCandidateImmediately( + S, CandidateSet, C.FunctionTemplate, C.FoundDecl, C.ActingContext, + /*ExplicitTemplateArgs=*/nullptr, C.ObjectType, C.ObjectClassification, + C.Args, C.SuppressUserConversions, C.PartialOverloading, C.PO); +} + +static void +AddTemplateOverloadCandidate(Sema &S, OverloadCandidateSet &CandidateSet, + DeferredFunctionTemplateOverloadCandidate &C) { + AddTemplateOverloadCandidateImmediately( + S, CandidateSet, C.FunctionTemplate, C.FoundDecl, + /*ExplicitTemplateArgs=*/nullptr, C.Args, C.SuppressUserConversions, + C.PartialOverloading, C.AllowExplicit, C.IsADLCandidate, C.PO, + C.AggregateCandidateDeduction); +} + +static void +AddTemplateOverloadCandidate(Sema &S, OverloadCandidateSet &CandidateSet, + DeferredConversionTemplateOverloadCandidate &C) { + return AddTemplateConversionCandidateImmediately( + S, CandidateSet, C.FunctionTemplate, C.FoundDecl, C.ActingContext, C.From, + C.ToType, C.AllowObjCConversionOnExplicit, C.AllowExplicit, + C.AllowResultConversion); +} + +void OverloadCandidateSet::InjectNonDeducedTemplateCandidates(Sema &S) { + Candidates.reserve(Candidates.size() + DeferredCandidatesCount); + DeferredTemplateOverloadCandidate *Cand = FirstDeferredCandidate; + while (Cand) { + switch (Cand->Kind) { + case DeferredTemplateOverloadCandidate::Function: + AddTemplateOverloadCandidate( + S, *this, + *static_cast(Cand)); + break; + case DeferredTemplateOverloadCandidate::Method: + 
AddTemplateOverloadCandidate( + S, *this, + *static_cast(Cand)); + break; + case DeferredTemplateOverloadCandidate::Conversion: + AddTemplateOverloadCandidate( + S, *this, + *static_cast(Cand)); + break; + } + Cand = Cand->Next; + } + FirstDeferredCandidate = nullptr; + DeferredCandidatesCount = 0; +} + OverloadingResult -OverloadCandidateSet::BestViableFunction(Sema &S, SourceLocation Loc, - iterator &Best) { - llvm::SmallVector Candidates; - std::transform(begin(), end(), std::back_inserter(Candidates), - [](OverloadCandidate &Cand) { return &Cand; }); +OverloadCandidateSet::ResultForBestCandidate(const iterator &Best) { + Best->Best = true; + if (Best->Function && Best->Function->isDeleted()) + return OR_Deleted; + return OR_Success; +} +void OverloadCandidateSet::CudaExcludeWrongSideCandidates( + Sema &S, SmallVectorImpl &Candidates) { // [CUDA] HD->H or HD->D calls are technically not allowed by CUDA but // are accepted by both clang and NVCC. However, during a particular // compilation mode only one call variant is viable. We need to @@ -10941,27 +11167,112 @@ OverloadCandidateSet::BestViableFunction(Sema &S, SourceLocation Loc, // -fgpu-exclude-wrong-side-overloads is off. When // -fgpu-exclude-wrong-side-overloads is on, all candidates are compared // uniformly in isBetterOverloadCandidate. - if (S.getLangOpts().CUDA && !S.getLangOpts().GPUExcludeWrongSideOverloads) { - const FunctionDecl *Caller = S.getCurFunctionDecl(/*AllowLambda=*/true); - bool ContainsSameSideCandidate = - llvm::any_of(Candidates, [&](OverloadCandidate *Cand) { - // Check viable function only. - return Cand->Viable && Cand->Function && - S.CUDA().IdentifyPreference(Caller, Cand->Function) == - SemaCUDA::CFP_SameSide; - }); - if (ContainsSameSideCandidate) { - auto IsWrongSideCandidate = [&](OverloadCandidate *Cand) { - // Check viable function only to avoid unnecessary data copying/moving. 
+ if (!S.getLangOpts().CUDA || S.getLangOpts().GPUExcludeWrongSideOverloads) + return; + const FunctionDecl *Caller = S.getCurFunctionDecl(/*AllowLambda=*/true); + + bool ContainsSameSideCandidate = + llvm::any_of(Candidates, [&](const OverloadCandidate *Cand) { + // Check viable function only. return Cand->Viable && Cand->Function && S.CUDA().IdentifyPreference(Caller, Cand->Function) == - SemaCUDA::CFP_WrongSide; - }; - llvm::erase_if(Candidates, IsWrongSideCandidate); + SemaCUDA::CFP_SameSide; + }); + + if (!ContainsSameSideCandidate) + return; + + auto IsWrongSideCandidate = [&](const OverloadCandidate *Cand) { + // Check viable function only to avoid unnecessary data copying/moving. + return Cand->Viable && Cand->Function && + S.CUDA().IdentifyPreference(Caller, Cand->Function) == + SemaCUDA::CFP_WrongSide; + }; + llvm::erase_if(Candidates, IsWrongSideCandidate); +} + +/// Computes the best viable function (C++ 13.3.3) +/// within an overload candidate set. +/// +/// \param Loc The location of the function name (or operator symbol) for +/// which overload resolution occurs. +/// +/// \param Best If overload resolution was successful or found a deleted +/// function, \p Best points to the candidate function found. +/// +/// \returns The result of overload resolution. 
+OverloadingResult OverloadCandidateSet::BestViableFunction(Sema &S, + SourceLocation Loc, + iterator &Best) { + + assert(shouldDeferTemplateArgumentDeduction(S.getLangOpts()) || + DeferredCandidatesCount == 0 && + "Unexpected deferred template candidates"); + + bool TwoPhaseResolution = + DeferredCandidatesCount != 0 && !ResolutionByPerfectCandidateIsDisabled; + + if (TwoPhaseResolution) { + + PerfectViableFunction(S, Loc, Best); + if (Best != end()) + return ResultForBestCandidate(Best); + } + + InjectNonDeducedTemplateCandidates(S); + return BestViableFunctionImpl(S, Loc, Best); +} + +void OverloadCandidateSet::PerfectViableFunction( + Sema &S, SourceLocation Loc, OverloadCandidateSet::iterator &Best) { + + Best = end(); + for (auto It = begin(); It != end(); ++It) { + + if (!It->isPerfectMatch(S.getASTContext())) + continue; + + // We found a suitable conversion function + // but if there is a template constructor in the target class + // we might prefer that instead. + if (HasDeferredTemplateConstructors && + isa_and_nonnull(It->Function)) { + Best = end(); + break; + } + + if (Best == end()) { + Best = It; + continue; + } + if (Best->Function && It->Function) { + FunctionDecl *D = + S.getMoreConstrainedFunction(Best->Function, It->Function); + if (D == nullptr) { + Best = end(); + break; + } + if (D == It->Function) + Best = It; + continue; } + // ambiguous + Best = end(); + break; } +} + +OverloadingResult OverloadCandidateSet::BestViableFunctionImpl( + Sema &S, SourceLocation Loc, OverloadCandidateSet::iterator &Best) { + + llvm::SmallVector Candidates; + Candidates.reserve(this->Candidates.size()); + std::transform(begin(), end(), std::back_inserter(Candidates), + [](OverloadCandidate &Cand) { return &Cand; }); + + if (S.getLangOpts().CUDA) + CudaExcludeWrongSideCandidates(S, Candidates); - // Find the best viable function. 
Best = end(); for (auto *Cand : Candidates) { Cand->Best = false; @@ -10983,9 +11294,8 @@ OverloadCandidateSet::BestViableFunction(Sema &S, SourceLocation Loc, if (Best == end()) return OR_No_Viable_Function; + llvm::SmallVector PendingBest; llvm::SmallVector EquivalentCands; - - llvm::SmallVector PendingBest; PendingBest.push_back(&*Best); Best->Best = true; @@ -11008,25 +11318,15 @@ OverloadCandidateSet::BestViableFunction(Sema &S, SourceLocation Loc, } } - // If we found more than one best candidate, this is ambiguous. if (Best == end()) return OR_Ambiguous; - // Best is the best viable function. - if (Best->Function && Best->Function->isDeleted()) - return OR_Deleted; - - if (auto *M = dyn_cast_or_null(Best->Function); - Kind == CSK_AddressOfOverloadSet && M && - M->isImplicitObjectMemberFunction()) { - return OR_No_Viable_Function; - } + OverloadingResult R = ResultForBestCandidate(Best); if (!EquivalentCands.empty()) S.diagnoseEquivalentInternalLinkageDeclarations(Loc, Best->Function, EquivalentCands); - - return OR_Success; + return R; } namespace { @@ -12733,6 +13033,9 @@ SmallVector OverloadCandidateSet::CompleteCandidates( Sema &S, OverloadCandidateDisplayKind OCD, ArrayRef Args, SourceLocation OpLoc, llvm::function_ref Filter) { + + InjectNonDeducedTemplateCandidates(S); + // Sort the candidates by viability and position. Sorting directly would // be prohibitive, so we make a set of pointers and sort those. SmallVector Cands; @@ -14354,10 +14657,12 @@ ExprResult Sema::BuildOverloadedCallExpr(Scope *S, Expr *Fn, Expr *ExecConfig, bool AllowTypoCorrection, bool CalleesAddressIsTaken) { - OverloadCandidateSet CandidateSet( - Fn->getExprLoc(), CalleesAddressIsTaken - ? OverloadCandidateSet::CSK_AddressOfOverloadSet - : OverloadCandidateSet::CSK_Normal); + + OverloadCandidateSet::CandidateSetKind CSK = + CalleesAddressIsTaken ? 
OverloadCandidateSet::CSK_AddressOfOverloadSet + : OverloadCandidateSet::CSK_Normal; + + OverloadCandidateSet CandidateSet(Fn->getExprLoc(), CSK); ExprResult result; if (buildOverloadedCallSet(S, Fn, ULE, Args, LParenLoc, &CandidateSet, @@ -14373,6 +14678,17 @@ ExprResult Sema::BuildOverloadedCallExpr(Scope *S, Expr *Fn, OverloadingResult OverloadResult = CandidateSet.BestViableFunction(*this, Fn->getBeginLoc(), Best); + // [C++23][over.call.func] + // if overload resolution selects a non-static member function, + // the call is ill-formed; + if (CSK == OverloadCandidateSet::CSK_AddressOfOverloadSet && + Best != CandidateSet.end()) { + if (auto *M = dyn_cast_or_null(Best->Function); + M && M->isImplicitObjectMemberFunction()) { + OverloadResult = OR_No_Viable_Function; + } + } + // Model the case with a call to a templated function whose definition // encloses the call and whose return type contains a placeholder type as if // the UnresolvedLookupExpr was type-dependent. @@ -14708,18 +15024,24 @@ void Sema::LookupOverloadedBinOp(OverloadCandidateSet &CandidateSet, // rewritten candidates using these functions if necessary. AddNonMemberOperatorCandidates(Fns, Args, CandidateSet); + // As template candidates are not deduced immediately, + // persist the array in the overload set. + ArrayRef ReversedArgs; + if (CandidateSet.getRewriteInfo().allowsReversed(Op) || + CandidateSet.getRewriteInfo().allowsReversed(ExtraOp)) + ReversedArgs = CandidateSet.getPersistentArgsArray(Args[1], Args[0]); + // Add operator candidates that are member functions. AddMemberOperatorCandidates(Op, OpLoc, Args, CandidateSet); if (CandidateSet.getRewriteInfo().allowsReversed(Op)) - AddMemberOperatorCandidates(Op, OpLoc, {Args[1], Args[0]}, CandidateSet, + AddMemberOperatorCandidates(Op, OpLoc, ReversedArgs, CandidateSet, OverloadCandidateParamOrder::Reversed); // In C++20, also add any rewritten member candidates. 
if (ExtraOp) { AddMemberOperatorCandidates(ExtraOp, OpLoc, Args, CandidateSet); if (CandidateSet.getRewriteInfo().allowsReversed(ExtraOp)) - AddMemberOperatorCandidates(ExtraOp, OpLoc, {Args[1], Args[0]}, - CandidateSet, + AddMemberOperatorCandidates(ExtraOp, OpLoc, ReversedArgs, CandidateSet, OverloadCandidateParamOrder::Reversed); } diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index 772962ac653f7..0ecdbb3ffb89f 100644 --- a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -6142,9 +6142,9 @@ FunctionDecl *Sema::getMoreConstrainedFunction(FunctionDecl *FD1, assert(!FD1->getDescribedTemplate() && !FD2->getDescribedTemplate() && "not for function templates"); assert(!FD1->isFunctionTemplateSpecialization() || - isa(FD1)); + (isa(FD1))); assert(!FD2->isFunctionTemplateSpecialization() || - isa(FD2)); + (isa(FD2))); FunctionDecl *F1 = FD1; if (FunctionDecl *P = FD1->getTemplateInstantiationPattern(false)) diff --git a/clang/test/CXX/temp/temp.constr/temp.constr.atomic/constrant-satisfaction-conversions.cpp b/clang/test/CXX/temp/temp.constr/temp.constr.atomic/constrant-satisfaction-conversions.cpp index ba8e2dc372e98..083e743818121 100644 --- a/clang/test/CXX/temp/temp.constr/temp.constr.atomic/constrant-satisfaction-conversions.cpp +++ b/clang/test/CXX/temp/temp.constr/temp.constr.atomic/constrant-satisfaction-conversions.cpp @@ -14,7 +14,7 @@ template struct S { // expected-note@#FINST{{in instantiation of function template specialization}} template requires (S{}) void f(T); -void f(int); +void f(long); // Ensure this applies to operator && as well. 
// expected-error@+3{{atomic constraint must be of type 'bool' (found 'S')}} @@ -22,7 +22,7 @@ void f(int); // expected-note@#F2INST{{in instantiation of function template specialization}} template requires (S{} && true) void f2(T); -void f2(int); +void f2(long); template requires requires { requires S{}; @@ -36,12 +36,12 @@ template requires requires { // } void f3(T); -void f3(int); +void f3(long); // Doesn't diagnose, since this is no longer a compound requirement. template requires (bool(1 && 2)) void f4(T); -void f4(int); +void f4(long); void g() { f(0); // #FINST diff --git a/clang/test/SemaCUDA/function-overload.cu b/clang/test/SemaCUDA/function-overload.cu index 4710c81763adf..3d05839af7528 100644 --- a/clang/test/SemaCUDA/function-overload.cu +++ b/clang/test/SemaCUDA/function-overload.cu @@ -1,6 +1,3 @@ -// REQUIRES: x86-registered-target -// REQUIRES: nvptx-registered-target - // RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-linux-gnu -fsyntax-only \ // RUN: -verify=host,hostdefer,devdefer,expected %s // RUN: %clang_cc1 -std=c++14 -triple nvptx64-nvidia-cuda -fsyntax-only \ diff --git a/clang/test/SemaCXX/implicit-member-functions.cpp b/clang/test/SemaCXX/implicit-member-functions.cpp index 1554b1af5d59a..8350eac5b88a0 100644 --- a/clang/test/SemaCXX/implicit-member-functions.cpp +++ b/clang/test/SemaCXX/implicit-member-functions.cpp @@ -54,31 +54,24 @@ namespace PR7594 { namespace Recursion { template struct InvokeCopyConstructor { static const T &get(); - typedef decltype(T(get())) type; // expected-error {{no matching conver}} + typedef decltype(T(get())) type; }; struct B; struct A { - // expected-note@-1 {{while substituting deduced template arguments}} typedef B type; template::type> - // expected-note@-1 {{in instantiation of template class}} A(const T &); - // expected-note@-1 {{in instantiation of default argument}} }; - struct B { // expected-note {{while declaring the implicit copy constructor for 'B'}} - // expected-note@-1 {{candidate 
constructor (the implicit move }} - B(); // expected-note {{candidate constructor not viable}} + struct B { + B(); A a; }; // Triggering the declaration of B's copy constructor causes overload - // resolution to occur for A's copying constructor, which instantiates - // InvokeCopyConstructor, which triggers the declaration of B's copy - // constructor. Notionally, this happens when we get to the end of the - // definition of 'struct B', so there is no declared copy constructor yet. - // - // This behavior is g++-compatible, but isn't exactly right; the class is - // supposed to be incomplete when we implicitly declare its special members. + // resolution to occur for A's copying constructor, which picks + // the implicit copy constructor of A. + // Because that copy constructor is always a perfect match the template + // candidate is not instantiated. B b = B(); diff --git a/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp b/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp new file mode 100644 index 0000000000000..877816ca013ec --- /dev/null +++ b/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp @@ -0,0 +1,185 @@ +// RUN: %clang_cc1 -triple=x86_64-unknown-unknown -fsyntax-only -verify -std=c++11 %s +// RUN: %clang_cc1 -triple=x86_64-unknown-unknown -fsyntax-only -verify -std=c++20 %s +// RUN: %clang_cc1 -triple=x86_64-unknown-unknown -fsyntax-only -verify -std=c++2c %s + +template +struct Invalid { static_assert(false, "instantiated Invalid"); }; // #err-invalid + +template +int f(T a, Invalid = {}); // #note-f + +// sanity check +int e1 = f(0); +//expected-error@#err-invalid {{static assertion failed: instantiated Invalid}} +//expected-note@-2 {{in instantiation of default function argument expression for 'f' required here}} +//expected-note@#note-f {{in instantiation of template class 'Invalid' requested here}} +//expected-note@#note-f {{passing argument to parameter here}} + +int f(int); +int ok1 = f(0); +int e4 = f((const 
int&)(ok1)); + +int f(int, int = 0); +int ok2 = f(0, 0); + +int e2 = f(0L); +//expected-error@#err-invalid {{static assertion failed: instantiated Invalid}} +//expected-note@-2 {{in instantiation of default function argument expression for 'f' required here}} +//expected-note@#note-f {{in instantiation of template class 'Invalid' requested here}} +//expected-note@#note-f {{passing argument to parameter here}} + +int f(long); +int ok3 = f(0L); + +template +struct Invalid2 { static_assert(false, "instantiated Invalid2"); }; // #err-qualifiers + +template +int ref(T a, Invalid2 = {}); // expected-note 2{{here}} +int ref(int&); +int ref1 = ref(ok3); +int ref2 = ref((const int&)ok3); // expected-note {{here}} +//expected-error@#err-qualifiers {{static assertion failed: instantiated Invalid2}} + + +template +int f_alias(T a, Invalid = {}); +using Alias = int; +int f_alias(Alias); +int ok4 = f_alias(0); + +#if __cplusplus >= 202002 + +struct Copyable { + template + requires __is_constructible(Copyable, T) + explicit Copyable(T op) noexcept; // #1 + Copyable(const Copyable&) noexcept = default; // #2 +}; +static_assert(__is_constructible(Copyable, const Copyable&)); + +struct ImplicitlyCopyable { + template + requires __is_constructible(ImplicitlyCopyable, T) + explicit ImplicitlyCopyable(T op) = delete; // #1 +}; +static_assert(__is_constructible(ImplicitlyCopyable, const ImplicitlyCopyable&)); + + +struct Movable { + template + requires __is_constructible(Movable, T) // #err-self-constraint-1 + explicit Movable(T op) noexcept; // #1 + Movable(Movable&&) noexcept = default; // #2 +}; +static_assert(__is_constructible(Movable, Movable&&)); +static_assert(__is_constructible(Movable, const Movable&)); +// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(Movable, const Movable &)'}} + +static_assert(__is_constructible(Movable, int)); +// expected-error@-1{{static assertion failed due to requirement '__is_constructible(Movable, int)'}} \ +// 
expected-note@-1 2{{}} +// expected-error@#err-self-constraint-1{{satisfaction of constraint '__is_constructible(Movable, T)' depends on itself}} +// expected-note@#err-self-constraint-1 4{{}} + +template +struct Members { + constexpr auto f(auto) { + static_assert(false, ""); + } + constexpr auto f(int) { return 1; } + constexpr auto f(int) requires true { return 2; } + + constexpr auto g(auto) { + static_assert(false, "instantiated member"); //#err-qualified-member + return 0; + } + constexpr auto g(int) & { return 1; } + + static constexpr auto s(auto) { + static_assert(false, ""); + } + static constexpr auto s(int) { + return 1; + } + static constexpr auto s(int) requires true { + return 2; + } +}; + +static_assert(Members{}.f(0) == 2); +static_assert(Members{}.g(0) == 0); +// expected-error@#err-qualified-member {{static assertion failed: instantiated member}} \ +// expected-note@-1{{in instantiation of function template specialization 'Members::g' }} +Members m1; +static_assert(m1.g(0) == 1); +static_assert(Members{}.s(0) == 2); + + +namespace ConstructorInit{ +struct S { + template + S(T&&) {} +}; +struct Test { + operator S() = delete; +}; + +static_assert(__is_constructible(S, Test)); +} + +namespace RefBinding { + +template struct remove_reference; +template struct remove_reference<_Tp &> { + using type = _Tp; +}; +template remove_reference<_Tp>::type move(_Tp &&); +template struct _Head_base { + _Head_base(_Head &__h) : _M_head_impl(__h) {} + template _Head_base(_UHead &&); + _Head _M_head_impl; +}; + +template void forward_as_tuple(_Elements &&) { + _Head_base<_Elements &&>(_Elements{}); +} +struct StringRef { + void operator[](const StringRef __k) { forward_as_tuple((move)(__k)); } +}; + +} + +template struct tuple {}; +struct BonkersBananas { + template operator T(); + template explicit operator tuple() = delete; +}; +static_assert(!__is_constructible(tuple, BonkersBananas)); + +namespace GH62096 { +template +struct Oops { + static_assert(sizeof(T) == 
0); // #GH62096-err + static constexpr bool value = true; +}; + +template +concept Operator = Oops::value; // #GH62096-note1 + +template void f(OP op); // // #GH62096-note2 +void f(int); + +void g(int n) { f(n); } // OK +void h(short n) { f(n); } +// expected-error@#GH62096-err {{static assertion failed due to requirement 'sizeof(short) == 0'}} \ +// expected-note@-1{{in instantiation of function template specialization}} \ +// expected-note@-1{{while checking constraint satisfaction for template}} +// expected-note@#GH62096-note1{{in instantiation}} +// expected-note@#GH62096-note1{{while substituting template arguments into constraint expression here}} +// expected-note@#GH62096-note2{{while substituting template arguments into constraint expression here}} +// expected-note@#GH62096-note2{{while checking the satisfaction of concept}} +// expected-note@#GH62096-err {{expression evaluates}} +} + +#endif diff --git a/clang/test/SemaTemplate/instantiate-function-params.cpp b/clang/test/SemaTemplate/instantiate-function-params.cpp index 7dd5595de58a3..eb2a7c5d4e8d6 100644 --- a/clang/test/SemaTemplate/instantiate-function-params.cpp +++ b/clang/test/SemaTemplate/instantiate-function-params.cpp @@ -6,13 +6,12 @@ template struct if_ { typedef if_c< static_cast(T1::value)> almost_type_; // expected-note 7{{in instantiation}} }; template struct wrap_constraints { }; -template +template inline char has_constraints_(Model* , // expected-note 3{{candidate template ignored}} - wrap_constraints* = 0); // expected-note 4{{in instantiation}} - + wrap_constraints* = 0); template struct not_satisfied { static const bool value = sizeof( has_constraints_((Model*)0) == 1); // expected-error 3{{no matching function}} \ - // expected-note 4{{while substituting deduced template arguments into function template 'has_constraints_' [with }} + // expected-note 4{{in instantiation}} }; template struct requirement_; template struct instantiate { diff --git 
a/clang/test/Templight/templight-empty-entries-fix.cpp b/clang/test/Templight/templight-empty-entries-fix.cpp index d13b748068efe..7f34b10134929 100644 --- a/clang/test/Templight/templight-empty-entries-fix.cpp +++ b/clang/test/Templight/templight-empty-entries-fix.cpp @@ -1,6 +1,6 @@ // RUN: %clang_cc1 -templight-dump -Wno-unused-value %s 2>&1 | FileCheck %s -void a() { +void a(long) { [] {}; } @@ -17,14 +17,14 @@ void a() { // CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:4:3'$}} // CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:4:3'$}} -template void a() { a(); } +template void a(long) { a(0); } // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+a$}} // CHECK: {{^kind:[ ]+DeducedTemplateArgumentSubstitution$}} // CHECK: {{^event:[ ]+Begin$}} // CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:20:25'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:20:31'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:20:35'$}} // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+unnamed template non-type parameter 0 of a$}} // CHECK: {{^kind:[ ]+DefaultTemplateArgumentInstantiation$}} @@ -42,29 +42,29 @@ template void a() { a(); } // CHECK: {{^kind:[ ]+DeducedTemplateArgumentSubstitution$}} // CHECK: {{^event:[ ]+End$}} // CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:20:25'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:20:31'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:20:35'$}} // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+'a<0>'$}} // CHECK: {{^kind:[ ]+TemplateInstantiation$}} // CHECK: {{^event:[ ]+Begin$}} // CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:20:25'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:20:31'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:20:35'$}} // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+'a<0>'$}} // CHECK: {{^kind:[ ]+TemplateInstantiation$}} // CHECK: {{^event:[ ]+End$}} // CHECK: {{^orig:[ 
]+'.*templight-empty-entries-fix.cpp:20:25'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:20:31'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:20:35'$}} template struct b { typedef int c; }; -template ::c> void a() { a(); } +template ::c> void a(long) { a(0); } // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+a$}} // CHECK: {{^kind:[ ]+DeducedTemplateArgumentSubstitution$}} // CHECK: {{^event:[ ]+Begin$}} // CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:60:57'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:60:63'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:60:67'$}} // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+d$}} // CHECK: {{^kind:[ ]+DefaultTemplateArgumentInstantiation$}} @@ -130,25 +130,25 @@ template ::c> void a() { a(); } // CHECK: {{^kind:[ ]+DeducedTemplateArgumentSubstitution$}} // CHECK: {{^event:[ ]+End$}} // CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:60:57'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:60:63'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:60:67'$}} // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+'a'$}} // CHECK: {{^kind:[ ]+TemplateInstantiation$}} // CHECK: {{^event:[ ]+Begin$}} // CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:60:57'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:60:63'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:60:67'$}} // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+'a'$}} // CHECK: {{^kind:[ ]+TemplateInstantiation$}} // CHECK: {{^event:[ ]+End$}} // CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:60:57'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:60:63'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:60:67'$}} // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+a$}} // CHECK: {{^kind:[ ]+DeducedTemplateArgumentSubstitution$}} // CHECK: {{^event:[ ]+Begin$}} // CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:20:25'$}} -// CHECK: {{^poi:[ 
]+'.*templight-empty-entries-fix.cpp:60:63'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:60:67'$}} // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+unnamed template non-type parameter 0 of a$}} // CHECK: {{^kind:[ ]+DefaultTemplateArgumentInstantiation$}} @@ -166,34 +166,10 @@ template ::c> void a() { a(); } // CHECK: {{^kind:[ ]+DeducedTemplateArgumentSubstitution$}} // CHECK: {{^event:[ ]+End$}} // CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:20:25'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:60:63'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:60:67'$}} template void d(int = 0) { d(); } -// CHECK-LABEL: {{^---$}} -// CHECK: {{^name:[ ]+a$}} -// CHECK: {{^kind:[ ]+DeducedTemplateArgumentSubstitution$}} -// CHECK: {{^event:[ ]+Begin$}} -// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:60:57'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:60:63'$}} -// CHECK-LABEL: {{^---$}} -// CHECK: {{^name:[ ]+a$}} -// CHECK: {{^kind:[ ]+DeducedTemplateArgumentSubstitution$}} -// CHECK: {{^event:[ ]+End$}} -// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:60:57'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:60:63'$}} -// CHECK-LABEL: {{^---$}} -// CHECK: {{^name:[ ]+a$}} -// CHECK: {{^kind:[ ]+DeducedTemplateArgumentSubstitution$}} -// CHECK: {{^event:[ ]+Begin$}} -// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:20:25'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:60:63'$}} -// CHECK-LABEL: {{^---$}} -// CHECK: {{^name:[ ]+a$}} -// CHECK: {{^kind:[ ]+DeducedTemplateArgumentSubstitution$}} -// CHECK: {{^event:[ ]+End$}} -// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:20:25'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:60:63'$}} // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+d$}} // CHECK: {{^kind:[ ]+DeducedTemplateArgumentSubstitution$}} @@ -249,41 +225,41 @@ void e() { } // CHECK-LABEL: {{^---$}} -// CHECK: {{^name:[ ]+'\(unnamed 
struct at .*templight-empty-entries-fix.cpp:247:3\)'$}} +// CHECK: {{^name:[ ]+'\(unnamed struct at .*templight-empty-entries-fix.cpp:223:3\)'$}} // CHECK: {{^kind:[ ]+Memoization$}} // CHECK: {{^event:[ ]+Begin$}} -// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:247:3'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:248:5'$}} +// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:223:3'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:224:5'$}} // CHECK-LABEL: {{^---$}} -// CHECK: {{^name:[ ]+'\(unnamed struct at .*templight-empty-entries-fix.cpp:247:3\)'$}} +// CHECK: {{^name:[ ]+'\(unnamed struct at .*templight-empty-entries-fix.cpp:223:3\)'$}} // CHECK: {{^kind:[ ]+Memoization$}} // CHECK: {{^event:[ ]+End$}} -// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:247:3'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:248:5'$}} +// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:223:3'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:224:5'$}} // CHECK-LABEL: {{^---$}} -// CHECK: {{^name:[ ]+'\(unnamed struct at .*templight-empty-entries-fix.cpp:247:3\)'$}} +// CHECK: {{^name:[ ]+'\(unnamed struct at .*templight-empty-entries-fix.cpp:223:3\)'$}} // CHECK: {{^kind:[ ]+Memoization$}} // CHECK: {{^event:[ ]+Begin$}} -// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:247:3'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:248:5'$}} +// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:223:3'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:224:5'$}} // CHECK-LABEL: {{^---$}} -// CHECK: {{^name:[ ]+'\(unnamed struct at .*templight-empty-entries-fix.cpp:247:3\)'$}} +// CHECK: {{^name:[ ]+'\(unnamed struct at .*templight-empty-entries-fix.cpp:223:3\)'$}} // CHECK: {{^kind:[ ]+Memoization$}} // CHECK: {{^event:[ ]+End$}} -// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:247:3'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:248:5'$}} +// CHECK: 
{{^orig:[ ]+'.*templight-empty-entries-fix.cpp:223:3'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:224:5'$}} // CHECK-LABEL: {{^---$}} -// CHECK: {{^name:[ ]+'\(unnamed struct at .*templight-empty-entries-fix.cpp:247:3\)'$}} +// CHECK: {{^name:[ ]+'\(unnamed struct at .*templight-empty-entries-fix.cpp:223:3\)'$}} // CHECK: {{^kind:[ ]+Memoization$}} // CHECK: {{^event:[ ]+Begin$}} -// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:247:3'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:247:3'$}} +// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:223:3'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:223:3'$}} // CHECK-LABEL: {{^---$}} -// CHECK: {{^name:[ ]+'\(unnamed struct at .*templight-empty-entries-fix.cpp:247:3\)'$}} +// CHECK: {{^name:[ ]+'\(unnamed struct at .*templight-empty-entries-fix.cpp:223:3\)'$}} // CHECK: {{^kind:[ ]+Memoization$}} // CHECK: {{^event:[ ]+End$}} -// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:247:3'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:247:3'$}} +// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:223:3'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:223:3'$}} template class> @@ -299,71 +275,71 @@ void foo() { // CHECK: {{^name:[ ]+d$}} // CHECK: {{^kind:[ ]+ExplicitTemplateArgumentSubstitution$}} // CHECK: {{^event:[ ]+Begin$}} -// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:290:6'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:295:3'$}} +// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:266:6'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:271:3'$}} // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+unnamed template template parameter 0 of d$}} // CHECK: {{^kind:[ ]+PriorTemplateArgumentSubstitution$}} // CHECK: {{^event:[ ]+Begin$}} -// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:289:35'$}} +// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:265:35'$}} // CHECK: {{^poi:[ 
]+''$}} // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+unnamed template template parameter 0 of d$}} // CHECK: {{^kind:[ ]+PriorTemplateArgumentSubstitution$}} // CHECK: {{^event:[ ]+End$}} -// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:289:35'$}} +// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:265:35'$}} // CHECK: {{^poi:[ ]+''$}} // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+unnamed template template parameter 0 of d$}} // CHECK: {{^kind:[ ]+PartialOrderingTTP$}} // CHECK: {{^event:[ ]+Begin$}} -// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:289:35'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:295:5'$}} +// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:265:35'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:271:5'$}} // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+unnamed template template parameter 0 of d$}} // CHECK: {{^kind:[ ]+PartialOrderingTTP$}} // CHECK: {{^event:[ ]+End$}} -// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:289:35'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:295:5'$}} +// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:265:35'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:271:5'$}} // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+d$}} // CHECK: {{^kind:[ ]+ExplicitTemplateArgumentSubstitution$}} // CHECK: {{^event:[ ]+End$}} -// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:290:6'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:295:3'$}} +// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:266:6'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:271:3'$}} // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+d$}} // CHECK: {{^kind:[ ]+DeducedTemplateArgumentSubstitution$}} // CHECK: {{^event:[ ]+Begin$}} -// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:290:6'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:295:3'$}} +// CHECK: {{^orig:[ 
]+'.*templight-empty-entries-fix.cpp:266:6'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:271:3'$}} // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+d$}} // CHECK: {{^kind:[ ]+DeducedTemplateArgumentSubstitution$}} // CHECK: {{^event:[ ]+End$}} -// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:290:6'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:295:3'$}} +// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:266:6'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:271:3'$}} // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+'d'$}} // CHECK: {{^kind:[ ]+TemplateInstantiation$}} // CHECK: {{^event:[ ]+Begin$}} -// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:290:6'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:295:3'$}} +// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:266:6'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:271:3'$}} // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+'d'$}} // CHECK: {{^kind:[ ]+TemplateInstantiation$}} // CHECK: {{^event:[ ]+End$}} -// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:290:6'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:295:3'$}} +// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:266:6'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:271:3'$}} // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+d$}} // CHECK: {{^kind:[ ]+ExplicitTemplateArgumentSubstitution$}} // CHECK: {{^event:[ ]+Begin$}} // CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:171:29'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:295:3'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:271:3'$}} // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+d$}} // CHECK: {{^kind:[ ]+ExplicitTemplateArgumentSubstitution$}} // CHECK: {{^event:[ ]+End$}} // CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:171:29'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:295:3'$}} +// CHECK: {{^poi:[ 
]+'.*templight-empty-entries-fix.cpp:271:3'$}} From 73b8750a970ddaec5da1540c100561bd5104bca6 Mon Sep 17 00:00:00 2001 From: James Newling Date: Wed, 16 Apr 2025 10:10:23 -0700 Subject: [PATCH 148/710] [mlir][vector] fold transpose(poison) -> poison (#135675) Following on from https://github.com/llvm/llvm-project/pull/133988 --------- Signed-off-by: James Newling --- mlir/lib/Dialect/Vector/IR/VectorOps.cpp | 4 ++++ mlir/test/Dialect/Vector/canonicalize.mlir | 11 +++++++++++ 2 files changed, 15 insertions(+) diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp index 504032a398fbe..3ef947c7b14a4 100644 --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -6009,6 +6009,10 @@ OpFoldResult vector::TransposeOp::fold(FoldAdaptor adaptor) { if (attr.isSplat()) return attr.reshape(getResultVectorType()); + // Eliminate poison transpose ops. + if (llvm::dyn_cast_if_present(adaptor.getVector())) + return ub::PoisonAttr::get(getContext()); + // Eliminate identity transpose ops. This happens when the dimensions of the // input vector remain in their original order after the transpose operation. 
ArrayRef perm = getPermutation(); diff --git a/mlir/test/Dialect/Vector/canonicalize.mlir b/mlir/test/Dialect/Vector/canonicalize.mlir index 733a2c67d2c0c..c28afd0dee97e 100644 --- a/mlir/test/Dialect/Vector/canonicalize.mlir +++ b/mlir/test/Dialect/Vector/canonicalize.mlir @@ -2240,6 +2240,17 @@ func.func @transpose_splat2(%arg : f32) -> vector<3x4xf32> { // ----- +// CHECK-LABEL: transpose_poison +// CHECK: %[[POISON:.*]] = ub.poison : vector<4x6xi8> +// CHECK: return %[[POISON]] : vector<4x6xi8> +func.func @transpose_poison() -> vector<4x6xi8> { + %poison = ub.poison : vector<6x4xi8> + %transpose = vector.transpose %poison, [1, 0] : vector<6x4xi8> to vector<4x6xi8> + return %transpose : vector<4x6xi8> +} + +// ----- + // CHECK-LABEL: func.func @insert_1d_constant // CHECK-DAG: %[[ACST:.*]] = arith.constant dense<[9, 1, 2]> : vector<3xi32> // CHECK-DAG: %[[BCST:.*]] = arith.constant dense<[0, 9, 2]> : vector<3xi32> From eea86489dd3df5b66d75ee2590f4824913c411d5 Mon Sep 17 00:00:00 2001 From: James Newling Date: Wed, 16 Apr 2025 10:12:38 -0700 Subject: [PATCH 149/710] [mlir][vector] Fold broadcast(poison) -> poison (#135677) In addition to the new folder, I've also a test for broadcast(splat) -> splat which I think was missing Signed-off-by: James Newling --- mlir/lib/Dialect/Vector/IR/VectorOps.cpp | 2 ++ mlir/test/Dialect/Vector/canonicalize.mlir | 22 ++++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp index 3ef947c7b14a4..4b2fba03ce551 100644 --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -2590,6 +2590,8 @@ OpFoldResult BroadcastOp::fold(FoldAdaptor adaptor) { } if (auto attr = llvm::dyn_cast(adaptor.getSource())) return DenseElementsAttr::get(vectorType, attr.getSplatValue()); + if (llvm::dyn_cast(adaptor.getSource())) + return ub::PoisonAttr::get(getContext()); return {}; } diff --git 
a/mlir/test/Dialect/Vector/canonicalize.mlir b/mlir/test/Dialect/Vector/canonicalize.mlir index c28afd0dee97e..b24cf93707d8b 100644 --- a/mlir/test/Dialect/Vector/canonicalize.mlir +++ b/mlir/test/Dialect/Vector/canonicalize.mlir @@ -1151,6 +1151,28 @@ func.func @bitcast_i8_to_i32() -> (vector<4xi32>, vector<4xi32>) { // ----- +// CHECK-LABEL: broadcast_poison +// CHECK: %[[POISON:.*]] = ub.poison : vector<4x6xi8> +// CHECK: return %[[POISON]] : vector<4x6xi8> +func.func @broadcast_poison() -> vector<4x6xi8> { + %poison = ub.poison : vector<6xi8> + %broadcast = vector.broadcast %poison : vector<6xi8> to vector<4x6xi8> + return %broadcast : vector<4x6xi8> +} + +// ----- + +// CHECK-LABEL: broadcast_splat_constant +// CHECK: %[[CONST:.*]] = arith.constant dense<1> : vector<4x6xi8> +// CHECK: return %[[CONST]] : vector<4x6xi8> +func.func @broadcast_splat_constant() -> vector<4x6xi8> { + %cst = arith.constant dense<1> : vector<6xi8> + %broadcast = vector.broadcast %cst : vector<6xi8> to vector<4x6xi8> + return %broadcast : vector<4x6xi8> +} + +// ----- + // CHECK-LABEL: broadcast_folding1 // CHECK: %[[CST:.*]] = arith.constant dense<42> : vector<4xi32> // CHECK-NOT: vector.broadcast From 52e0337ea34142f55c427493e9ca2be5fce2dd38 Mon Sep 17 00:00:00 2001 From: Chris B Date: Wed, 16 Apr 2025 12:13:19 -0500 Subject: [PATCH 150/710] [HLSL][OpenCL] Strip addrspace from implicit cast diags (#135830) The address space of a source value for an implicit cast isn't really relevant when emitting conversion warnings. Since the lvalue->rvalue cast effectively removes the address space they don't factor in, but they do create visual noise in the diagnostics. This is a small quality-of-life fixup to get in as HLSL adopts more address space annotations. 
--- clang/lib/Sema/SemaChecking.cpp | 8 ++++++++ clang/test/SemaHLSL/Language/ImpCastAddrSpace.hlsl | 12 ++++++++++++ clang/test/SemaOpenCL/cl20-device-side-enqueue.cl | 2 +- 3 files changed, 21 insertions(+), 1 deletion(-) create mode 100644 clang/test/SemaHLSL/Language/ImpCastAddrSpace.hlsl diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 42da9ba97e0d3..26c2dc655a931 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -11417,6 +11417,14 @@ static void AnalyzeAssignment(Sema &S, BinaryOperator *E) { static void DiagnoseImpCast(Sema &S, Expr *E, QualType SourceType, QualType T, SourceLocation CContext, unsigned diag, bool pruneControlFlow = false) { + // For languages like HLSL and OpenCL, implicit conversion diagnostics listing + // address space annotations isn't really useful. The warnings aren't because + // you're converting a `private int` to `unsigned int`, it is because you're + // conerting `int` to `unsigned int`. 
+ if (SourceType.hasAddressSpace()) + SourceType = S.getASTContext().removeAddrSpaceQualType(SourceType); + if (T.hasAddressSpace()) + T = S.getASTContext().removeAddrSpaceQualType(T); if (pruneControlFlow) { S.DiagRuntimeBehavior(E->getExprLoc(), E, S.PDiag(diag) diff --git a/clang/test/SemaHLSL/Language/ImpCastAddrSpace.hlsl b/clang/test/SemaHLSL/Language/ImpCastAddrSpace.hlsl new file mode 100644 index 0000000000000..61e71b219b721 --- /dev/null +++ b/clang/test/SemaHLSL/Language/ImpCastAddrSpace.hlsl @@ -0,0 +1,12 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -finclude-default-header -Wconversion -fnative-half-type %s -verify + +static double D = 2.0; +static int I = D; // expected-warning{{implicit conversion turns floating-point number into integer: 'double' to 'int'}} +groupshared float F = I; // expected-warning{{implicit conversion from 'int' to 'float' may lose precision}} + +export void fn() { + half d = I; // expected-warning{{implicit conversion from 'int' to 'half' may lose precision}} + int i = D; // expected-warning{{implicit conversion turns floating-point number into integer: 'double' to 'int'}} + int j = F; // expected-warning{{implicit conversion turns floating-point number into integer: 'float' to 'int'}} + int k = d; // expected-warning{{implicit conversion turns floating-point number into integer: 'half' to 'int'}} +} diff --git a/clang/test/SemaOpenCL/cl20-device-side-enqueue.cl b/clang/test/SemaOpenCL/cl20-device-side-enqueue.cl index 36b901fc5f29e..524de8ce2f7dc 100644 --- a/clang/test/SemaOpenCL/cl20-device-side-enqueue.cl +++ b/clang/test/SemaOpenCL/cl20-device-side-enqueue.cl @@ -97,7 +97,7 @@ kernel void enqueue_kernel_tests(void) { }, c, 1024L); #ifdef WCONV -// expected-warning-re@-2{{implicit conversion changes signedness: '__private char' to 'unsigned {{int|long}}'}} +// expected-warning-re@-2{{implicit conversion changes signedness: 'char' to 'unsigned {{int|long}}'}} #endif #define UINT_MAX 4294967295 From 
81739c39db11b7f9a4f3528c1c66b552e57b47e4 Mon Sep 17 00:00:00 2001 From: Volodymyr Sapsai Date: Wed, 16 Apr 2025 10:14:05 -0700 Subject: [PATCH 151/710] [Modules] Fix an identifier hiding a function-like macro definition. (#135471) We emit a macro definition only in a module defining it. But it means that if another module has an identifier with the same name as the macro, the users of such module won't be able to use the macro anymore. Fix by storing that an identifier has a macro definition that's not in a current module (`MacroDirectivesOffset == 0`). This way `IdentifierLookupVisitor` knows not to stop at the first module with an identifier but to keep checking included modules for the actual macro definition. Fixes issue #32040. rdar://30258278 --- clang/lib/Serialization/ASTReader.cpp | 17 ++++++++---- clang/lib/Serialization/ASTReaderInternals.h | 6 ++++ clang/lib/Serialization/ASTWriter.cpp | 16 +++++++---- clang/test/Modules/macro-identifier-hiding.c | 29 ++++++++++++++++++++ 4 files changed, 58 insertions(+), 10 deletions(-) create mode 100644 clang/test/Modules/macro-identifier-hiding.c diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 02c31dff620ec..58d3d33b31644 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -1131,7 +1131,7 @@ IdentifierInfo *ASTIdentifierLookupTrait::ReadData(const internal_key_type& k, bool HasRevertedTokenIDToIdentifier = readBit(Bits); bool Poisoned = readBit(Bits); bool ExtensionToken = readBit(Bits); - bool HadMacroDefinition = readBit(Bits); + bool HasMacroDefinition = readBit(Bits); assert(Bits == 0 && "Extra bits in the identifier?"); DataLen -= sizeof(uint16_t) * 2; @@ -1151,14 +1151,17 @@ IdentifierInfo *ASTIdentifierLookupTrait::ReadData(const internal_key_type& k, "Incorrect C++ operator keyword flag"); (void)CPlusPlusOperatorKeyword; - // If this identifier is a macro, deserialize the macro - // definition. 
- if (HadMacroDefinition) { + // If this identifier has a macro definition, deserialize it or notify the + // visitor the actual definition is in a different module. + if (HasMacroDefinition) { uint32_t MacroDirectivesOffset = endian::readNext(d); DataLen -= 4; - Reader.addPendingMacro(II, &F, MacroDirectivesOffset); + if (MacroDirectivesOffset) + Reader.addPendingMacro(II, &F, MacroDirectivesOffset); + else + hasMacroDefinitionInDependencies = true; } Reader.SetIdentifierInfo(ID, II); @@ -2419,6 +2422,10 @@ namespace { // declarations it needs. ++NumIdentifierLookupHits; Found = *Pos; + if (Trait.hasMoreInformationInDependencies()) { + // Look for the identifier in extra modules as they contain more info. + return false; + } return true; } diff --git a/clang/lib/Serialization/ASTReaderInternals.h b/clang/lib/Serialization/ASTReaderInternals.h index 353e0a53cad9b..4a7794889b039 100644 --- a/clang/lib/Serialization/ASTReaderInternals.h +++ b/clang/lib/Serialization/ASTReaderInternals.h @@ -286,6 +286,8 @@ class ASTIdentifierLookupTrait : public ASTIdentifierLookupTraitBase { // identifier that was constructed before the AST file was read. 
IdentifierInfo *KnownII; + bool hasMacroDefinitionInDependencies = false; + public: using data_type = IdentifierInfo *; @@ -300,6 +302,10 @@ class ASTIdentifierLookupTrait : public ASTIdentifierLookupTraitBase { IdentifierID ReadIdentifierID(const unsigned char *d); ASTReader &getReader() const { return Reader; } + + bool hasMoreInformationInDependencies() const { + return hasMacroDefinitionInDependencies; + } }; /// The on-disk hash table used to contain information about diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 95b5718f1d140..8c261e13d5ea4 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -3795,7 +3795,10 @@ bool IsInterestingIdentifier(const IdentifierInfo *II, uint64_t MacroOffset, II->getNotableIdentifierID() != tok::NotableIdentifierKind::not_notable || II->getBuiltinID() != Builtin::ID::NotBuiltin || II->getObjCKeywordID() != tok::ObjCKeywordKind::objc_not_keyword; - if (MacroOffset || II->isPoisoned() || (!IsModule && IsInteresting) || + if (MacroOffset || + (II->hasMacroDefinition() && + II->hasFETokenInfoChangedSinceDeserialization()) || + II->isPoisoned() || (!IsModule && IsInteresting) || II->hasRevertedTokenIDToIdentifier() || (NeedDecls && II->getFETokenInfo())) return true; @@ -3874,7 +3877,8 @@ class ASTIdentifierTableTrait { if (isInterestingIdentifier(II, MacroOffset)) { DataLen += 2; // 2 bytes for builtin ID DataLen += 2; // 2 bytes for flags - if (MacroOffset) + if (MacroOffset || (II->hasMacroDefinition() && + II->hasFETokenInfoChangedSinceDeserialization())) DataLen += 4; // MacroDirectives offset. 
if (NeedDecls && IdResolver) @@ -3905,15 +3909,17 @@ class ASTIdentifierTableTrait { assert((Bits & 0xffff) == Bits && "ObjCOrBuiltinID too big for ASTReader."); LE.write(Bits); Bits = 0; - bool HadMacroDefinition = MacroOffset != 0; - Bits = (Bits << 1) | unsigned(HadMacroDefinition); + bool HasMacroDefinition = + (MacroOffset != 0) || (II->hasMacroDefinition() && + II->hasFETokenInfoChangedSinceDeserialization()); + Bits = (Bits << 1) | unsigned(HasMacroDefinition); Bits = (Bits << 1) | unsigned(II->isExtensionToken()); Bits = (Bits << 1) | unsigned(II->isPoisoned()); Bits = (Bits << 1) | unsigned(II->hasRevertedTokenIDToIdentifier()); Bits = (Bits << 1) | unsigned(II->isCPlusPlusOperatorKeyword()); LE.write(Bits); - if (HadMacroDefinition) + if (HasMacroDefinition) LE.write(MacroOffset); if (NeedDecls && IdResolver) { diff --git a/clang/test/Modules/macro-identifier-hiding.c b/clang/test/Modules/macro-identifier-hiding.c new file mode 100644 index 0000000000000..4cd7cf0500322 --- /dev/null +++ b/clang/test/Modules/macro-identifier-hiding.c @@ -0,0 +1,29 @@ +// RUN: rm -rf %t +// RUN: split-file %s %t +// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/modules.cache \ +// RUN: -fsyntax-only %t/test.c -verify +// Test again with the populated module cache. +// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/modules.cache \ +// RUN: -fsyntax-only %t/test.c -verify + +// Test that an identifier with the same name as a macro doesn't hide this +// macro from the includers. 
+ +//--- macro-definition.h +#define __P(protos) () +#define __Q(protos) () + +//--- macro-transitive.h +#include "macro-definition.h" +void test(int __P) {} // not "interesting" identifier +struct __Q {}; // "interesting" identifier + +//--- module.modulemap +module MacroDefinition { header "macro-definition.h" export * } +module MacroTransitive { header "macro-transitive.h" export * } + +//--- test.c +// expected-no-diagnostics +#include "macro-transitive.h" +void foo __P(()); +void bar __Q(()); From 913dcf1aa36f3ea2d67a0d2b05b9d1375987e553 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 16 Apr 2025 10:03:25 -0700 Subject: [PATCH 152/710] [SLP]Fix type promotion for smax reduction with unsigned reduced operands Need to add an extra bit for sign info for unsigned reduced values to generate correct code. --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 3 ++- .../RISCV/smax-reduction-unsigned-missing-sign.ll | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 810d44343c4a9..83252bdb51ea2 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -20521,7 +20521,8 @@ void BoUpSLP::computeMinimumValueSizes() { } bool IsSignedCmp = false; if (UserIgnoreList && all_of(*UserIgnoreList, [](Value *V) { - return match(V, m_SMin(m_Value(), m_Value())); + return match(V, m_SMin(m_Value(), m_Value())) || + match(V, m_SMax(m_Value(), m_Value())); })) IsSignedCmp = true; while (NodeIdx < VectorizableTree.size()) { diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/smax-reduction-unsigned-missing-sign.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/smax-reduction-unsigned-missing-sign.ll index e6408572acf8f..a8efc2622aa80 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/smax-reduction-unsigned-missing-sign.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/RISCV/smax-reduction-unsigned-missing-sign.ll @@ -7,8 +7,9 @@ define i32 @test(i8 %0) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i8> , i8 [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <4 x i8> [[TMP1]], zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.smax.v4i1(<4 x i1> [[TMP2]]) -; CHECK-NEXT: [[TMP4:%.*]] = zext i1 [[TMP3]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i1> [[TMP2]] to <4 x i8> +; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> [[TMP3]]) +; CHECK-NEXT: [[TMP4:%.*]] = zext i8 [[TMP5]] to i32 ; CHECK-NEXT: ret i32 [[TMP4]] ; entry: From ce7466f66c8a279917cd1c9486846b6857a82fe8 Mon Sep 17 00:00:00 2001 From: AdityaK Date: Wed, 16 Apr 2025 10:30:53 -0700 Subject: [PATCH 153/710] NFC: Rewrite auto castIter -> const auto *castIter (#133521) --- mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp index d6b093c5fb86b..1c160911ce780 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp @@ -3598,7 +3598,7 @@ void MatmulOp::regionBuilder(ImplicitLocOpBuilder &b, Block &block, SmallVector yields; TypeFn castVal = TypeFn::cast_signed; - auto castIter = llvm::find_if(attrs, [&](const NamedAttribute &attr) { + const auto *castIter = llvm::find_if(attrs, [&](const NamedAttribute &attr) { return attr.getName() == "cast"; }); if (castIter != attrs.end()) { From 80c19b3b1d59294be63d8b55fedc317305abbdbe Mon Sep 17 00:00:00 2001 From: Andy Kaylor Date: Wed, 16 Apr 2025 10:33:09 -0700 Subject: [PATCH 154/710] [CIR] Upstream initial support for complete record types (#135844) This adds basic support for populating record types. In order to keep the change small, everything non-essential was deferred to a later change set. 
Only non-recursive structures are handled. Structures padding is not yet implemented. Bitfields are not supported. No attempt is made to handle ABI requirements for passing structure arguments. --- .../CIR/Dialect/Builder/CIRBaseBuilder.h | 2 + .../include/clang/CIR/Dialect/IR/CIRTypes.td | 3 + clang/include/clang/CIR/MissingFeatures.h | 8 + clang/lib/CIR/CodeGen/CIRGenRecordLayout.h | 54 +++++ .../CIR/CodeGen/CIRGenRecordLayoutBuilder.cpp | 223 ++++++++++++++++++ clang/lib/CIR/CodeGen/CIRGenTypes.cpp | 47 +++- clang/lib/CIR/CodeGen/CIRGenTypes.h | 22 +- clang/lib/CIR/CodeGen/CMakeLists.txt | 1 + clang/lib/CIR/Dialect/IR/CIRDialect.cpp | 2 +- clang/lib/CIR/Dialect/IR/CIRTypes.cpp | 6 + .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp | 29 +++ clang/test/CIR/CodeGen/struct.c | 27 +++ 12 files changed, 421 insertions(+), 3 deletions(-) create mode 100644 clang/lib/CIR/CodeGen/CIRGenRecordLayout.h create mode 100644 clang/lib/CIR/CodeGen/CIRGenRecordLayoutBuilder.cpp diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h index 68a4505ca7a5a..a787908fb9f8d 100644 --- a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h +++ b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h @@ -95,6 +95,8 @@ class CIRBaseBuilderTy : public mlir::OpBuilder { return getZeroAttr(arrTy); if (auto ptrTy = mlir::dyn_cast(ty)) return getConstNullPtrAttr(ptrTy); + if (auto recordTy = mlir::dyn_cast(ty)) + return getZeroAttr(recordTy); if (mlir::isa(ty)) { return getCIRBoolAttr(false); } diff --git a/clang/include/clang/CIR/Dialect/IR/CIRTypes.td b/clang/include/clang/CIR/Dialect/IR/CIRTypes.td index c60af47f09def..23e20755dca95 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIRTypes.td +++ b/clang/include/clang/CIR/Dialect/IR/CIRTypes.td @@ -499,6 +499,9 @@ def CIR_RecordType : CIR_Type<"Record", "record", std::string getPrefixedName() { return getKindAsStr() + "." 
+ getName().getValue().str(); } + + void complete(llvm::ArrayRef members, bool packed, + bool isPadded); }]; let hasCustomAssemblyFormat = 1; diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h index 6f2fd2cb2b3ad..5ba55c53dfc4d 100644 --- a/clang/include/clang/CIR/MissingFeatures.h +++ b/clang/include/clang/CIR/MissingFeatures.h @@ -103,6 +103,14 @@ struct MissingFeatures { // RecordType static bool recordTypeLayoutInfo() { return false; } + static bool recursiveRecordLayout() { return false; } + static bool skippedLayout() { return false; } + static bool astRecordDeclAttr() { return false; } + static bool cxxSupport() { return false; } + static bool packedRecords() { return false; } + static bool recordPadding() { return false; } + static bool recordZeroInit() { return false; } + static bool zeroSizeRecordMembers() { return false; } // Misc static bool cxxABI() { return false; } diff --git a/clang/lib/CIR/CodeGen/CIRGenRecordLayout.h b/clang/lib/CIR/CodeGen/CIRGenRecordLayout.h new file mode 100644 index 0000000000000..a51e0460d1074 --- /dev/null +++ b/clang/lib/CIR/CodeGen/CIRGenRecordLayout.h @@ -0,0 +1,54 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_LIB_CIR_CIRGENRECORDLAYOUT_H +#define LLVM_CLANG_LIB_CIR_CIRGENRECORDLAYOUT_H + +#include "clang/AST/Decl.h" +#include "clang/CIR/Dialect/IR/CIRTypes.h" + +namespace clang::CIRGen { + +/// This class handles record and union layout info while lowering AST types +/// to CIR types. +/// +/// These layout objects are only created on demand as CIR generation requires. 
+class CIRGenRecordLayout { + friend class CIRGenTypes; + + CIRGenRecordLayout(const CIRGenRecordLayout &) = delete; + void operator=(const CIRGenRecordLayout &) = delete; + +private: + /// The CIR type corresponding to this record layout; used when laying it out + /// as a complete object. + cir::RecordType completeObjectType; + + /// Map from (non-bit-field) record field to the corresponding cir record type + /// field no. This info is populated by the record builder. + llvm::DenseMap fieldInfo; + +public: + CIRGenRecordLayout(cir::RecordType completeObjectType) + : completeObjectType(completeObjectType) {} + + /// Return the "complete object" LLVM type associated with + /// this record. + cir::RecordType getCIRType() const { return completeObjectType; } + + /// Return cir::RecordType element number that corresponds to the field FD. + unsigned getCIRFieldNo(const clang::FieldDecl *FD) const { + FD = FD->getCanonicalDecl(); + assert(fieldInfo.count(FD) && "Invalid field for record!"); + return fieldInfo.lookup(FD); + } +}; + +} // namespace clang::CIRGen + +#endif diff --git a/clang/lib/CIR/CodeGen/CIRGenRecordLayoutBuilder.cpp b/clang/lib/CIR/CodeGen/CIRGenRecordLayoutBuilder.cpp new file mode 100644 index 0000000000000..4e3adeaf50187 --- /dev/null +++ b/clang/lib/CIR/CodeGen/CIRGenRecordLayoutBuilder.cpp @@ -0,0 +1,223 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This contains code to compute the layout of a record. 
+// +//===----------------------------------------------------------------------===// + +#include "CIRGenBuilder.h" +#include "CIRGenModule.h" +#include "CIRGenTypes.h" + +#include "clang/AST/ASTContext.h" +#include "clang/AST/Decl.h" +#include "clang/AST/DeclCXX.h" +#include "clang/AST/RecordLayout.h" +#include "clang/CIR/Dialect/IR/CIRAttrs.h" +#include "llvm/Support/Casting.h" + +#include + +using namespace llvm; +using namespace clang; +using namespace clang::CIRGen; + +namespace { +/// The CIRRecordLowering is responsible for lowering an ASTRecordLayout to an +/// mlir::Type. Some of the lowering is straightforward, some is not. +// TODO: Detail some of the complexities and weirdnesses? +// (See CGRecordLayoutBuilder.cpp) +struct CIRRecordLowering final { + + // MemberInfo is a helper structure that contains information about a record + // member. In addition to the standard member types, there exists a sentinel + // member type that ensures correct rounding. + struct MemberInfo final { + CharUnits offset; + enum class InfoKind { Field } kind; + mlir::Type data; + union { + const FieldDecl *fieldDecl; + // CXXRecordDecl will be used here when base types are supported. + }; + MemberInfo(CharUnits offset, InfoKind kind, mlir::Type data, + const FieldDecl *fieldDecl = nullptr) + : offset(offset), kind(kind), data(data), fieldDecl(fieldDecl) {}; + // MemberInfos are sorted so we define a < operator. + bool operator<(const MemberInfo &other) const { + return offset < other.offset; + } + }; + // The constructor. 
+ CIRRecordLowering(CIRGenTypes &cirGenTypes, const RecordDecl *recordDecl, + bool isPacked); + + void lower(); + + void accumulateFields(); + + CharUnits bitsToCharUnits(uint64_t bitOffset) { + return astContext.toCharUnitsFromBits(bitOffset); + } + + CharUnits getSize(mlir::Type Ty) { + assert(!cir::MissingFeatures::recordTypeLayoutInfo()); + return CharUnits::One(); + } + CharUnits getAlignment(mlir::Type Ty) { + assert(!cir::MissingFeatures::recordTypeLayoutInfo()); + return CharUnits::One(); + } + + mlir::Type getStorageType(const FieldDecl *fieldDecl) { + mlir::Type type = cirGenTypes.convertTypeForMem(fieldDecl->getType()); + if (fieldDecl->isBitField()) { + cirGenTypes.getCGModule().errorNYI(recordDecl->getSourceRange(), + "getStorageType for bitfields"); + } + return type; + } + + uint64_t getFieldBitOffset(const FieldDecl *fieldDecl) { + return astRecordLayout.getFieldOffset(fieldDecl->getFieldIndex()); + } + + /// Fills out the structures that are ultimately consumed. + void fillOutputFields(); + + CIRGenTypes &cirGenTypes; + CIRGenBuilderTy &builder; + const ASTContext &astContext; + const RecordDecl *recordDecl; + const ASTRecordLayout &astRecordLayout; + // Helpful intermediate data-structures + std::vector members; + // Output fields, consumed by CIRGenTypes::computeRecordLayout + llvm::SmallVector fieldTypes; + llvm::DenseMap fields; + + LLVM_PREFERRED_TYPE(bool) + unsigned zeroInitializable : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned packed : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned padded : 1; + +private: + CIRRecordLowering(const CIRRecordLowering &) = delete; + void operator=(const CIRRecordLowering &) = delete; +}; // CIRRecordLowering +} // namespace + +CIRRecordLowering::CIRRecordLowering(CIRGenTypes &cirGenTypes, + const RecordDecl *recordDecl, + bool isPacked) + : cirGenTypes(cirGenTypes), builder(cirGenTypes.getBuilder()), + astContext(cirGenTypes.getASTContext()), recordDecl(recordDecl), + astRecordLayout( + 
cirGenTypes.getASTContext().getASTRecordLayout(recordDecl)), + zeroInitializable(true), packed(isPacked), padded(false) {} + +void CIRRecordLowering::lower() { + if (recordDecl->isUnion()) { + cirGenTypes.getCGModule().errorNYI(recordDecl->getSourceRange(), + "lower: union"); + return; + } + + if (isa(recordDecl)) { + cirGenTypes.getCGModule().errorNYI(recordDecl->getSourceRange(), + "lower: class"); + return; + } + + assert(!cir::MissingFeatures::cxxSupport()); + + accumulateFields(); + + llvm::stable_sort(members); + // TODO: implement clipTailPadding once bitfields are implemented + assert(!cir::MissingFeatures::bitfields()); + // TODO: implemented packed records + assert(!cir::MissingFeatures::packedRecords()); + // TODO: implement padding + assert(!cir::MissingFeatures::recordPadding()); + // TODO: support zeroInit + assert(!cir::MissingFeatures::recordZeroInit()); + + fillOutputFields(); +} + +void CIRRecordLowering::fillOutputFields() { + for (const MemberInfo &member : members) { + if (member.data) + fieldTypes.push_back(member.data); + if (member.kind == MemberInfo::InfoKind::Field) { + if (member.fieldDecl) + fields[member.fieldDecl->getCanonicalDecl()] = fieldTypes.size() - 1; + // A field without storage must be a bitfield. + assert(!cir::MissingFeatures::bitfields()); + } + assert(!cir::MissingFeatures::cxxSupport()); + } +} + +void CIRRecordLowering::accumulateFields() { + for (const FieldDecl *field : recordDecl->fields()) { + if (field->isBitField()) { + cirGenTypes.getCGModule().errorNYI(recordDecl->getSourceRange(), + "accumulate bitfields"); + ++field; + } else if (!field->isZeroSize(astContext)) { + members.push_back(MemberInfo(bitsToCharUnits(getFieldBitOffset(field)), + MemberInfo::InfoKind::Field, + getStorageType(field), field)); + ++field; + } else { + // TODO(cir): do we want to do anything special about zero size members? 
+ assert(!cir::MissingFeatures::zeroSizeRecordMembers()); + ++field; + } + } +} + +std::unique_ptr +CIRGenTypes::computeRecordLayout(const RecordDecl *rd, cir::RecordType *ty) { + CIRRecordLowering lowering(*this, rd, /*packed=*/false); + assert(ty->isIncomplete() && "recomputing record layout?"); + lowering.lower(); + + // If we're in C++, compute the base subobject type. + if (llvm::isa(rd) && !rd->isUnion() && + !rd->hasAttr()) { + cgm.errorNYI(rd->getSourceRange(), "computeRecordLayout: CXXRecordDecl"); + } + + // Fill in the record *after* computing the base type. Filling in the body + // signifies that the type is no longer opaque and record layout is complete, + // but we may need to recursively layout rd while laying D out as a base type. + assert(!cir::MissingFeatures::astRecordDeclAttr()); + ty->complete(lowering.fieldTypes, lowering.packed, lowering.padded); + + auto rl = std::make_unique(ty ? *ty : cir::RecordType()); + + assert(!cir::MissingFeatures::recordZeroInit()); + assert(!cir::MissingFeatures::cxxSupport()); + assert(!cir::MissingFeatures::bitfields()); + + // Add all the field numbers. + rl->fieldInfo.swap(lowering.fields); + + // Dump the layout, if requested. + if (getASTContext().getLangOpts().DumpRecordLayouts) { + cgm.errorNYI(rd->getSourceRange(), "computeRecordLayout: dump layout"); + } + + // TODO: implement verification + return rl; +} diff --git a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp index f625f83257859..ec77c4428d43b 100644 --- a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp @@ -112,6 +112,18 @@ std::string CIRGenTypes::getRecordTypeName(const clang::RecordDecl *recordDecl, return builder.getUniqueRecordName(std::string(typeName)); } +// Return true if it is safe to convert the specified record decl to CIR and lay +// it out, false if doing so would cause us to get into a recursive compilation +// mess. 
+static bool isSafeToConvert(const RecordDecl *RD, CIRGenTypes &CGT) { + // If no records are being laid out, we can certainly do this one. + if (CGT.noRecordsBeingLaidOut()) + return true; + + assert(!cir::MissingFeatures::recursiveRecordLayout()); + return false; +} + /// Lay out a tagged decl type like struct or union. mlir::Type CIRGenTypes::convertRecordDeclType(const clang::RecordDecl *rd) { // TagDecl's are not necessarily unique, instead use the (clang) type @@ -132,7 +144,40 @@ mlir::Type CIRGenTypes::convertRecordDeclType(const clang::RecordDecl *rd) { if (!rd || !rd->isCompleteDefinition() || entry.isComplete()) return entry; - cgm.errorNYI(rd->getSourceRange(), "Complete record type"); + // If converting this type would cause us to infinitely loop, don't do it! + if (!isSafeToConvert(rd, *this)) { + cgm.errorNYI(rd->getSourceRange(), "recursive record layout"); + return entry; + } + + // Okay, this is a definition of a type. Compile the implementation now. + bool insertResult = recordsBeingLaidOut.insert(key).second; + (void)insertResult; + assert(insertResult && "isSafeToCovert() should have caught this."); + + // Force conversion of non-virtual base classes recursively. + if (const auto *cxxRecordDecl = dyn_cast(rd)) { + cgm.errorNYI(rd->getSourceRange(), "CXXRecordDecl"); + } + + // Layout fields. + std::unique_ptr layout = computeRecordLayout(rd, &entry); + recordDeclTypes[key] = entry; + cirGenRecordLayouts[key] = std::move(layout); + + // We're done laying out this record. + bool eraseResult = recordsBeingLaidOut.erase(key); + (void)eraseResult; + assert(eraseResult && "record not in RecordsBeingLaidOut set?"); + + // If this record blocked a FunctionType conversion, then recompute whatever + // was derived from that. + assert(!cir::MissingFeatures::skippedLayout()); + + // If we're done converting the outer-most record, then convert any deferred + // records as well. 
+ assert(!cir::MissingFeatures::recursiveRecordLayout()); + return entry; } diff --git a/clang/lib/CIR/CodeGen/CIRGenTypes.h b/clang/lib/CIR/CodeGen/CIRGenTypes.h index fd855bf958ccb..38e4bb2f688ab 100644 --- a/clang/lib/CIR/CodeGen/CIRGenTypes.h +++ b/clang/lib/CIR/CodeGen/CIRGenTypes.h @@ -14,9 +14,10 @@ #define LLVM_CLANG_LIB_CODEGEN_CODEGENTYPES_H #include "CIRGenFunctionInfo.h" -#include "clang/CIR/Dialect/IR/CIRTypes.h" +#include "CIRGenRecordLayout.h" #include "clang/AST/Type.h" +#include "clang/CIR/Dialect/IR/CIRTypes.h" #include "llvm/ADT/SmallPtrSet.h" @@ -45,12 +46,22 @@ class CIRGenTypes { clang::ASTContext &astContext; CIRGenBuilderTy &builder; + /// Contains the CIR type for any converted RecordDecl. + llvm::DenseMap> + cirGenRecordLayouts; + /// Contains the CIR type for any converted RecordDecl llvm::DenseMap recordDeclTypes; /// Hold memoized CIRGenFunctionInfo results llvm::FoldingSet functionInfos; + /// This set keeps track of records that we're currently converting to a CIR + /// type. For example, when converting: + /// struct A { struct B { int x; } } when processing 'x', the 'A' and 'B' + /// types will be in this set. + llvm::SmallPtrSet recordsBeingLaidOut; + llvm::SmallPtrSet functionsBeingProcessed; /// Heper for convertType. mlir::Type convertFunctionTypeInternal(clang::QualType ft); @@ -59,6 +70,9 @@ class CIRGenTypes { CIRGenTypes(CIRGenModule &cgm); ~CIRGenTypes(); + CIRGenBuilderTy &getBuilder() const { return builder; } + CIRGenModule &getCGModule() const { return cgm; } + /// Utility to check whether a function type can be converted to a CIR type /// (i.e. doesn't depend on an incomplete tag type). 
bool isFuncTypeConvertible(const clang::FunctionType *ft); @@ -70,12 +84,18 @@ class CIRGenTypes { TypeCacheTy typeCache; mlir::MLIRContext &getMLIRContext() const; + clang::ASTContext &getASTContext() const { return astContext; } + + bool noRecordsBeingLaidOut() const { return recordsBeingLaidOut.empty(); } /// Convert a Clang type into a mlir::Type. mlir::Type convertType(clang::QualType type); mlir::Type convertRecordDeclType(const clang::RecordDecl *recordDecl); + std::unique_ptr + computeRecordLayout(const clang::RecordDecl *rd, cir::RecordType *ty); + std::string getRecordTypeName(const clang::RecordDecl *, llvm::StringRef suffix); diff --git a/clang/lib/CIR/CodeGen/CMakeLists.txt b/clang/lib/CIR/CodeGen/CMakeLists.txt index dc18f7f2af160..418bc2db408cb 100644 --- a/clang/lib/CIR/CodeGen/CMakeLists.txt +++ b/clang/lib/CIR/CodeGen/CMakeLists.txt @@ -18,6 +18,7 @@ add_clang_library(clangCIR CIRGenExprScalar.cpp CIRGenFunction.cpp CIRGenModule.cpp + CIRGenRecordLayoutBuilder.cpp CIRGenStmt.cpp CIRGenStmtOpenACC.cpp CIRGenTypes.cpp diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp index f3e5e572653da..1f4232b9e29ec 100644 --- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp +++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp @@ -212,7 +212,7 @@ static LogicalResult checkConstantTypes(mlir::Operation *op, mlir::Type opType, } if (isa(attrType)) { - if (::mlir::isa(opType)) + if (isa(opType)) return success(); return op->emitOpError("zero expects struct or array type"); } diff --git a/clang/lib/CIR/Dialect/IR/CIRTypes.cpp b/clang/lib/CIR/Dialect/IR/CIRTypes.cpp index 9b0177f159084..160732d9c3610 100644 --- a/clang/lib/CIR/Dialect/IR/CIRTypes.cpp +++ b/clang/lib/CIR/Dialect/IR/CIRTypes.cpp @@ -214,6 +214,12 @@ cir::RecordType::RecordKind RecordType::getKind() const { return getImpl()->kind; } +void RecordType::complete(ArrayRef members, bool packed, bool padded) { + assert(!cir::MissingFeatures::astRecordDeclAttr()); + if 
(mutate(members, packed, padded).failed()) + llvm_unreachable("failed to complete record"); +} + //===----------------------------------------------------------------------===// // Data Layout information for types //===----------------------------------------------------------------------===// diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp index 7159f89c93a53..cb318c88c09c5 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp @@ -1337,6 +1337,35 @@ static void prepareTypeConverter(mlir::LLVMTypeConverter &converter, converter.addConversion([&](cir::BF16Type type) -> mlir::Type { return mlir::BFloat16Type::get(type.getContext()); }); + converter.addConversion([&](cir::RecordType type) -> mlir::Type { + // Convert struct members. + llvm::SmallVector llvmMembers; + switch (type.getKind()) { + case cir::RecordType::Struct: + for (mlir::Type ty : type.getMembers()) + llvmMembers.push_back(convertTypeForMemory(converter, dataLayout, ty)); + break; + // Unions are lowered as only the largest member. + case cir::RecordType::Union: + llvm_unreachable("Lowering of unions is NYI"); + break; + } + + // Record has a name: lower as an identified record. + mlir::LLVM::LLVMStructType llvmStruct; + if (type.getName()) { + llvmStruct = mlir::LLVM::LLVMStructType::getIdentified( + type.getContext(), type.getPrefixedName()); + assert(!cir::MissingFeatures::packedRecords()); + if (llvmStruct.setBody(llvmMembers, /*isPacked=*/true).failed()) + llvm_unreachable("Failed to set body of record"); + } else { // Record has no name: lower as literal record. 
+ llvmStruct = mlir::LLVM::LLVMStructType::getLiteral( + type.getContext(), llvmMembers, /*isPacked=*/true); + } + + return llvmStruct; + }); } // The applyPartialConversion function traverses blocks in the dominance order, diff --git a/clang/test/CIR/CodeGen/struct.c b/clang/test/CIR/CodeGen/struct.c index 4edd591e609a2..e1b01e6a8e86c 100644 --- a/clang/test/CIR/CodeGen/struct.c +++ b/clang/test/CIR/CodeGen/struct.c @@ -11,6 +11,15 @@ struct IncompleteS *p; // LLVM: @p = dso_local global ptr null // OGCG: @p = global ptr null, align 8 +struct CompleteS { + int a; + char b; +} cs; + +// CIR: cir.global external @cs = #cir.zero : !cir.record +// LLVM: @cs = dso_local global %struct.CompleteS zeroinitializer +// OGCG: @cs = global %struct.CompleteS zeroinitializer, align 4 + void f(void) { struct IncompleteS *p; } @@ -28,3 +37,21 @@ void f(void) { // OGCG-NEXT: entry: // OGCG-NEXT: %[[P:.*]] = alloca ptr, align 8 // OGCG-NEXT: ret void + +void f2(void) { + struct CompleteS s; +} + +// CIR: cir.func @f2() +// CIR-NEXT: cir.alloca !cir.record, +// CIR-SAME: !cir.ptr>, ["s"] +// CIR-NEXT: cir.return + +// LLVM: define void @f2() +// LLVM-NEXT: %[[S:.*]] = alloca %struct.CompleteS, i64 1, align 4 +// LLVM-NEXT: ret void + +// OGCG: define{{.*}} void @f2() +// OGCG-NEXT: entry: +// OGCG-NEXT: %[[S:.*]] = alloca %struct.CompleteS, align 4 +// OGCG-NEXT: ret void From 1576fa10104b9a88bef88ae851c2df479502fea9 Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Wed, 16 Apr 2025 10:39:34 -0700 Subject: [PATCH 155/710] [ctxprof] Extend the notion of "cannot return" (#135651) At the time of instrumentation (and instrumentation lowering), `noreturn` is not applied uniformously. 
Rather than running `FunctionAttrs` pass, we just need to use `llvm::canReturn` exposed in PR #135650 --- .../Instrumentation/PGOCtxProfLowering.cpp | 19 ++++++++------ .../ctx-instrumentation-invalid-roots.ll | 25 +++++++++++-------- .../PGOProfile/ctx-instrumentation.ll | 13 ++++++++++ 3 files changed, 40 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp b/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp index f99d7b9d03e02..d741695d4e53c 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp @@ -9,6 +9,7 @@ #include "llvm/Transforms/Instrumentation/PGOCtxProfLowering.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/Analysis/CFG.h" #include "llvm/Analysis/CtxProfAnalysis.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/IR/Analysis.h" @@ -105,6 +106,12 @@ std::pair getNumCountersAndCallsites(const Function &F) { } return {NumCounters, NumCallsites}; } + +void emitUnsupportedRootError(const Function &F, StringRef Reason) { + F.getContext().emitError("[ctxprof] The function " + F.getName() + + " was indicated as context root but " + Reason + + ", which is not supported."); +} } // namespace // set up tie-in with compiler-rt. @@ -164,12 +171,8 @@ CtxInstrumentationLowerer::CtxInstrumentationLowerer(Module &M, for (const auto &BB : *F) for (const auto &I : BB) if (const auto *CB = dyn_cast(&I)) - if (CB->isMustTailCall()) { - M.getContext().emitError("The function " + Fname + - " was indicated as a context root, " - "but it features musttail " - "calls, which is not supported."); - } + if (CB->isMustTailCall()) + emitUnsupportedRootError(*F, "it features musttail calls"); } } @@ -230,11 +233,13 @@ bool CtxInstrumentationLowerer::lowerFunction(Function &F) { // Probably pointless to try to do anything here, unlikely to be // performance-affecting. 
- if (F.doesNotReturn()) { + if (!llvm::canReturn(F)) { for (auto &BB : F) for (auto &I : make_early_inc_range(BB)) if (isa(&I)) I.eraseFromParent(); + if (ContextRootSet.contains(&F)) + emitUnsupportedRootError(F, "it does not return"); return true; } diff --git a/llvm/test/Transforms/PGOProfile/ctx-instrumentation-invalid-roots.ll b/llvm/test/Transforms/PGOProfile/ctx-instrumentation-invalid-roots.ll index 454780153b823..b5ceb4602c60b 100644 --- a/llvm/test/Transforms/PGOProfile/ctx-instrumentation-invalid-roots.ll +++ b/llvm/test/Transforms/PGOProfile/ctx-instrumentation-invalid-roots.ll @@ -1,17 +1,22 @@ -; RUN: not opt -passes=ctx-instr-gen,ctx-instr-lower -profile-context-root=good \ -; RUN: -profile-context-root=bad \ -; RUN: -S < %s 2>&1 | FileCheck %s +; RUN: split-file %s %t +; RUN: not opt -passes=ctx-instr-gen,ctx-instr-lower -profile-context-root=the_func -S %t/musttail.ll -o - 2>&1 | FileCheck %s +; RUN: not opt -passes=ctx-instr-gen,ctx-instr-lower -profile-context-root=the_func -S %t/unreachable.ll -o - 2>&1 | FileCheck %s +; RUN: not opt -passes=ctx-instr-gen,ctx-instr-lower -profile-context-root=the_func -S %t/noreturn.ll -o - 2>&1 | FileCheck %s +;--- musttail.ll declare void @foo() -define void @good() { - call void @foo() - ret void -} - -define void @bad() { +define void @the_func() { musttail call void @foo() ret void } +;--- unreachable.ll +define void @the_func() { + unreachable +} +;--- noreturn.ll +define void @the_func() noreturn { + unreachable +} -; CHECK: error: The function bad was indicated as a context root, but it features musttail calls, which is not supported. 
+; CHECK: error: [ctxprof] The function the_func was indicated as context root diff --git a/llvm/test/Transforms/PGOProfile/ctx-instrumentation.ll b/llvm/test/Transforms/PGOProfile/ctx-instrumentation.ll index 6b2f25a585ec3..71d54f98d26e1 100644 --- a/llvm/test/Transforms/PGOProfile/ctx-instrumentation.ll +++ b/llvm/test/Transforms/PGOProfile/ctx-instrumentation.ll @@ -323,6 +323,18 @@ define void @does_not_return() noreturn { ; unreachable } + +define void @unreachable() { +; INSTRUMENT-LABEL: define void @unreachable() { +; INSTRUMENT-NEXT: call void @llvm.instrprof.increment(ptr @unreachable, i64 742261418966908927, i32 1, i32 0) +; INSTRUMENT-NEXT: unreachable +; +; LOWERING-LABEL: define void @unreachable( +; LOWERING-SAME: ) !guid [[META9:![0-9]+]] { +; LOWERING-NEXT: unreachable +; + unreachable +} ;. ; LOWERING: attributes #[[ATTR0]] = { noreturn } ; LOWERING: attributes #[[ATTR1:[0-9]+]] = { nounwind } @@ -340,4 +352,5 @@ define void @does_not_return() noreturn { ; LOWERING: [[META6]] = !{i64 -3771893999295659109} ; LOWERING: [[META7]] = !{i64 -4680624981836544329} ; LOWERING: [[META8]] = !{i64 5519225910966780583} +; LOWERING: [[META9]] = !{i64 -565652589829076809} ;. 
From 6ccc9280ba891bbea349c12a064bf23bdf9000e7 Mon Sep 17 00:00:00 2001 From: cor3ntin Date: Wed, 16 Apr 2025 19:40:28 +0200 Subject: [PATCH 156/710] Revert "[Clang][RFC] Bypass TAD during overload resolution if a perfect match exists" (#135993) Reverts llvm/llvm-project#133426 This is failing on some bots https://lab.llvm.org/buildbot/#/builders/163/builds/17265 --- clang/docs/ReleaseNotes.rst | 6 - clang/include/clang/Sema/Overload.h | 224 +------- clang/lib/Sema/SemaCodeComplete.cpp | 6 +- clang/lib/Sema/SemaInit.cpp | 15 +- clang/lib/Sema/SemaOverload.cpp | 522 ++++-------------- clang/lib/Sema/SemaTemplateDeduction.cpp | 4 +- .../constrant-satisfaction-conversions.cpp | 8 +- clang/test/SemaCUDA/function-overload.cu | 3 + .../SemaCXX/implicit-member-functions.cpp | 21 +- ...overload-resolution-deferred-templates.cpp | 185 ------- .../instantiate-function-params.cpp | 7 +- .../Templight/templight-empty-entries-fix.cpp | 126 +++-- 12 files changed, 217 insertions(+), 910 deletions(-) delete mode 100644 clang/test/SemaCXX/overload-resolution-deferred-templates.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index acbc9c5a6fac9..0891fd058bb57 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -96,12 +96,6 @@ C++ Language Changes asm((std::string_view("nop")) ::: (std::string_view("memory"))); } -- Clang now implements the changes to overload resolution proposed by section 1 and 2 of - `P3606 `_. If a non-template candidate exists in an overload set that is - a perfect match (all conversion sequences are identity conversions) template candiates are not instantiated. - Diagnostics that would have resulted from the instantiation of these template candidates are no longer - produced. This aligns Clang closer to the behavior of GCC, and fixes (#GH62096), (#GH74581), and (#GH74581). 
- C++2c Feature Support ^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Sema/Overload.h b/clang/include/clang/Sema/Overload.h index 813811af06e89..6e08762dcc6d7 100644 --- a/clang/include/clang/Sema/Overload.h +++ b/clang/include/clang/Sema/Overload.h @@ -407,24 +407,6 @@ class Sema; Third == ICK_Identity; } - /// A conversion sequence is perfect if it is an identity conversion and - /// the type of the source is the same as the type of the target. - bool isPerfect(const ASTContext &C) const { - if (!isIdentityConversion()) - return false; - // If we are not performing a reference binding, we can skip comparing - // the types, which has a noticeable performance impact. - if (!ReferenceBinding) { - assert(First || C.hasSameUnqualifiedType(getFromType(), getToType(2))); - return true; - } - if (!C.hasSameType(getFromType(), getToType(2))) - return false; - if (BindsToRvalue && IsLvalueReference) - return false; - return true; - } - ImplicitConversionRank getRank() const; NarrowingKind getNarrowingKind(ASTContext &Context, const Expr *Converted, @@ -761,12 +743,6 @@ class Sema; Standard.setAllToTypes(T); } - /// A conversion sequence is perfect if it is an identity conversion and - /// the type of the source is the same as the type of the target. - bool isPerfect(const ASTContext &C) const { - return isStandard() && Standard.isPerfect(C); - } - // True iff this is a conversion sequence from an initializer list to an // array or std::initializer. bool hasInitializerListContainerType() const { @@ -1003,20 +979,6 @@ class Sema; return false; } - // An overload is a perfect match if the conversion - // sequences for each argument are perfect. 
- bool isPerfectMatch(const ASTContext &Ctx) const { - if (!Viable) - return false; - for (const auto &C : Conversions) { - if (!C.isInitialized() || !C.isPerfect(Ctx)) - return false; - } - if (isa_and_nonnull(Function)) - return FinalConversion.isPerfect(Ctx); - return true; - } - bool TryToFixBadConversion(unsigned Idx, Sema &S) { bool CanFix = Fix.tryToFixConversion( Conversions[Idx].Bad.FromExpr, @@ -1053,65 +1015,6 @@ class Sema; RewriteKind(CRK_None) {} }; - struct DeferredTemplateOverloadCandidate { - - // intrusive linked list support for allocateDeferredCandidate - DeferredTemplateOverloadCandidate *Next = nullptr; - - enum Kind { Function, Method, Conversion }; - - LLVM_PREFERRED_TYPE(Kind) - unsigned Kind : 2; - LLVM_PREFERRED_TYPE(bool) - unsigned AllowObjCConversionOnExplicit : 1; - LLVM_PREFERRED_TYPE(bool) - unsigned AllowResultConversion : 1; - LLVM_PREFERRED_TYPE(bool) - unsigned AllowExplicit : 1; - LLVM_PREFERRED_TYPE(bool) - unsigned SuppressUserConversions : 1; - LLVM_PREFERRED_TYPE(bool) - unsigned PartialOverloading : 1; - LLVM_PREFERRED_TYPE(bool) - unsigned AggregateCandidateDeduction : 1; - }; - - struct DeferredFunctionTemplateOverloadCandidate - : public DeferredTemplateOverloadCandidate { - FunctionTemplateDecl *FunctionTemplate; - DeclAccessPair FoundDecl; - ArrayRef Args; - CallExpr::ADLCallKind IsADLCandidate; - OverloadCandidateParamOrder PO; - }; - static_assert(std::is_trivially_destructible_v< - DeferredFunctionTemplateOverloadCandidate>); - - struct DeferredMethodTemplateOverloadCandidate - : public DeferredTemplateOverloadCandidate { - FunctionTemplateDecl *FunctionTemplate; - DeclAccessPair FoundDecl; - ArrayRef Args; - CXXRecordDecl *ActingContext; - Expr::Classification ObjectClassification; - QualType ObjectType; - OverloadCandidateParamOrder PO; - }; - static_assert(std::is_trivially_destructible_v< - DeferredMethodTemplateOverloadCandidate>); - - struct DeferredConversionTemplateOverloadCandidate - : public 
DeferredTemplateOverloadCandidate { - FunctionTemplateDecl *FunctionTemplate; - DeclAccessPair FoundDecl; - CXXRecordDecl *ActingContext; - Expr *From; - QualType ToType; - }; - - static_assert(std::is_trivially_destructible_v< - DeferredConversionTemplateOverloadCandidate>); - /// OverloadCandidateSet - A set of overload candidates, used in C++ /// overload resolution (C++ 13.3). class OverloadCandidateSet { @@ -1140,11 +1043,6 @@ class Sema; /// C++ [over.match.call.general] /// Resolve a call through the address of an overload set. CSK_AddressOfOverloadSet, - - /// When doing overload resolution during code completion, - /// we want to show all viable candidates, including otherwise - /// deferred template candidates. - CSK_CodeCompletion, }; /// Information about operator rewrites to consider when adding operator @@ -1219,15 +1117,7 @@ class Sema; SmallVector Candidates; llvm::SmallPtrSet Functions; - DeferredTemplateOverloadCandidate *FirstDeferredCandidate = nullptr; - unsigned DeferredCandidatesCount : 8 * sizeof(unsigned) - 2; - LLVM_PREFERRED_TYPE(bool) - unsigned HasDeferredTemplateConstructors : 1; - LLVM_PREFERRED_TYPE(bool) - unsigned ResolutionByPerfectCandidateIsDisabled : 1; - - // Allocator for ConversionSequenceLists and deferred candidate args. - // We store the first few of these + // Allocator for ConversionSequenceLists. We store the first few of these // inline to avoid allocation for small sets. llvm::BumpPtrAllocator SlabAllocator; @@ -1235,11 +1125,8 @@ class Sema; CandidateSetKind Kind; OperatorRewriteInfo RewriteInfo; - /// Small storage size for ImplicitConversionSequences - /// and the persisted arguments of deferred candidates. constexpr static unsigned NumInlineBytes = - 32 * sizeof(ImplicitConversionSequence); - + 24 * sizeof(ImplicitConversionSequence); unsigned NumInlineBytesUsed = 0; alignas(void *) char InlineSpace[NumInlineBytes]; @@ -1250,13 +1137,15 @@ class Sema; /// from the slab allocator. 
/// FIXME: It would probably be nice to have a SmallBumpPtrAllocator /// instead. + /// FIXME: Now that this only allocates ImplicitConversionSequences, do we + /// want to un-generalize this? template T *slabAllocate(unsigned N) { // It's simpler if this doesn't need to consider alignment. static_assert(alignof(T) == alignof(void *), "Only works for pointer-aligned types."); - static_assert(std::is_trivially_destructible_v || - (std::is_same_v), + static_assert(std::is_trivial::value || + std::is_same::value, "Add destruction logic to OverloadCandidateSet::clear()."); unsigned NBytes = sizeof(T) * N; @@ -1270,34 +1159,12 @@ class Sema; return reinterpret_cast(FreeSpaceStart); } - // Because the size of OverloadCandidateSet has a noticeable impact on - // performance, we store each deferred template candidate in the slab - // allocator such that deferred candidates are ultimately a singly-linked - // intrusive linked list. This ends up being much more efficient than a - // SmallVector that is empty in the common case. - template T *allocateDeferredCandidate() { - T *C = slabAllocate(1); - if (!FirstDeferredCandidate) - FirstDeferredCandidate = C; - else { - auto *F = FirstDeferredCandidate; - while (F->Next) - F = F->Next; - F->Next = C; - } - DeferredCandidatesCount++; - return C; - } - void destroyCandidates(); public: OverloadCandidateSet(SourceLocation Loc, CandidateSetKind CSK, OperatorRewriteInfo RewriteInfo = {}) - : FirstDeferredCandidate(nullptr), DeferredCandidatesCount(0), - HasDeferredTemplateConstructors(false), - ResolutionByPerfectCandidateIsDisabled(false), Loc(Loc), Kind(CSK), - RewriteInfo(RewriteInfo) {} + : Loc(Loc), Kind(CSK), RewriteInfo(RewriteInfo) {} OverloadCandidateSet(const OverloadCandidateSet &) = delete; OverloadCandidateSet &operator=(const OverloadCandidateSet &) = delete; ~OverloadCandidateSet() { destroyCandidates(); } @@ -1309,9 +1176,6 @@ class Sema; /// Whether diagnostics should be deferred. 
bool shouldDeferDiags(Sema &S, ArrayRef Args, SourceLocation OpLoc); - // Whether the resolution of template candidates should be deferred - bool shouldDeferTemplateArgumentDeduction(const LangOptions &Opts) const; - /// Determine when this overload candidate will be new to the /// overload set. bool isNewCandidate(Decl *F, OverloadCandidateParamOrder PO = @@ -1335,10 +1199,8 @@ class Sema; iterator begin() { return Candidates.begin(); } iterator end() { return Candidates.end(); } - size_t size() const { return Candidates.size() + DeferredCandidatesCount; } - bool empty() const { - return Candidates.empty() && DeferredCandidatesCount == 0; - } + size_t size() const { return Candidates.size(); } + bool empty() const { return Candidates.empty(); } /// Allocate storage for conversion sequences for NumConversions /// conversions. @@ -1354,24 +1216,6 @@ class Sema; return ConversionSequenceList(Conversions, NumConversions); } - /// Provide storage for any Expr* arg that must be preserved - /// until deferred template candidates are deduced. - /// Typically this should be used for reversed operator arguments - /// and any time the argument array is transformed while adding - /// a template candidate. - llvm::MutableArrayRef getPersistentArgsArray(unsigned N) { - Expr **Exprs = slabAllocate(N); - return llvm::MutableArrayRef(Exprs, N); - } - - template - llvm::MutableArrayRef getPersistentArgsArray(T *...Exprs) { - llvm::MutableArrayRef Arr = - getPersistentArgsArray(sizeof...(Exprs)); - llvm::copy(std::initializer_list{Exprs...}, Arr.data()); - return Arr; - } - /// Add a new candidate with NumConversions conversion sequence slots /// to the overload set. 
OverloadCandidate &addCandidate(unsigned NumConversions = 0, @@ -1387,32 +1231,6 @@ class Sema; return C; } - void AddDeferredTemplateCandidate( - FunctionTemplateDecl *FunctionTemplate, DeclAccessPair FoundDecl, - ArrayRef Args, bool SuppressUserConversions, - bool PartialOverloading, bool AllowExplicit, - CallExpr::ADLCallKind IsADLCandidate, OverloadCandidateParamOrder PO, - bool AggregateCandidateDeduction); - - void AddDeferredMethodTemplateCandidate( - FunctionTemplateDecl *MethodTmpl, DeclAccessPair FoundDecl, - CXXRecordDecl *ActingContext, QualType ObjectType, - Expr::Classification ObjectClassification, ArrayRef Args, - bool SuppressUserConversions, bool PartialOverloading, - OverloadCandidateParamOrder PO); - - void AddDeferredConversionTemplateCandidate( - FunctionTemplateDecl *FunctionTemplate, DeclAccessPair FoundDecl, - CXXRecordDecl *ActingContext, Expr *From, QualType ToType, - bool AllowObjCConversionOnExplicit, bool AllowExplicit, - bool AllowResultConversion); - - void InjectNonDeducedTemplateCandidates(Sema &S); - - void DisableResolutionByPerfectCandidate() { - ResolutionByPerfectCandidateIsDisabled = true; - } - /// Find the best viable function on this overload set, if it exists. OverloadingResult BestViableFunction(Sema &S, SourceLocation Loc, OverloadCandidateSet::iterator& Best); @@ -1445,15 +1263,6 @@ class Sema; DestAS = AS; } - private: - OverloadingResult ResultForBestCandidate(const iterator &Best); - void CudaExcludeWrongSideCandidates( - Sema &S, SmallVectorImpl &Candidates); - OverloadingResult - BestViableFunctionImpl(Sema &S, SourceLocation Loc, - OverloadCandidateSet::iterator &Best); - void PerfectViableFunction(Sema &S, SourceLocation Loc, - OverloadCandidateSet::iterator &Best); }; bool isBetterOverloadCandidate(Sema &S, const OverloadCandidate &Cand1, @@ -1502,21 +1311,6 @@ class Sema; // parameter. 
bool shouldEnforceArgLimit(bool PartialOverloading, FunctionDecl *Function); - inline bool OverloadCandidateSet::shouldDeferTemplateArgumentDeduction( - const LangOptions &Opts) const { - return - // For user defined conversion we need to check against different - // combination of CV qualifiers and look at any explicit specifier, so - // always deduce template candidates. - Kind != CSK_InitByUserDefinedConversion - // When doing code completion, we want to see all the - // viable candidates. - && Kind != CSK_CodeCompletion - // CUDA may prefer template candidates even when a non-candidate - // is a perfect match - && !Opts.CUDA; - } - } // namespace clang #endif // LLVM_CLANG_SEMA_OVERLOAD_H diff --git a/clang/lib/Sema/SemaCodeComplete.cpp b/clang/lib/Sema/SemaCodeComplete.cpp index 45405d4709e14..f6ec4cb0f069e 100644 --- a/clang/lib/Sema/SemaCodeComplete.cpp +++ b/clang/lib/Sema/SemaCodeComplete.cpp @@ -6354,8 +6354,7 @@ SemaCodeCompletion::ProduceCallSignatureHelp(Expr *Fn, ArrayRef Args, Expr *NakedFn = Fn->IgnoreParenCasts(); // Build an overload candidate set based on the functions we find. SourceLocation Loc = Fn->getExprLoc(); - OverloadCandidateSet CandidateSet(Loc, - OverloadCandidateSet::CSK_CodeCompletion); + OverloadCandidateSet CandidateSet(Loc, OverloadCandidateSet::CSK_Normal); if (auto ULE = dyn_cast(NakedFn)) { SemaRef.AddOverloadedCallCandidates(ULE, ArgsWithoutDependentTypes, @@ -6558,8 +6557,7 @@ QualType SemaCodeCompletion::ProduceConstructorSignatureHelp( // FIXME: Provide support for variadic template constructors. 
if (CRD) { - OverloadCandidateSet CandidateSet(Loc, - OverloadCandidateSet::CSK_CodeCompletion); + OverloadCandidateSet CandidateSet(Loc, OverloadCandidateSet::CSK_Normal); for (NamedDecl *C : SemaRef.LookupConstructors(CRD)) { if (auto *FD = dyn_cast(C)) { // FIXME: we can't yet provide correct signature help for initializer diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index 82489847b589b..a1e4bb4321d53 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -10029,19 +10029,12 @@ QualType Sema::DeduceTemplateSpecializationFromInitializer( // When [...] the constructor [...] is a candidate by // - [over.match.copy] (in all cases) if (TD) { - - // As template candidates are not deduced immediately, - // persist the array in the overload set. - MutableArrayRef TmpInits = - Candidates.getPersistentArgsArray(Inits.size()); - - for (auto [I, E] : llvm::enumerate(Inits)) { + SmallVector TmpInits; + for (Expr *E : Inits) if (auto *DI = dyn_cast(E)) - TmpInits[I] = DI->getInit(); + TmpInits.push_back(DI->getInit()); else - TmpInits[I] = E; - } - + TmpInits.push_back(E); AddTemplateOverloadCandidate( TD, FoundDecl, /*ExplicitArgs=*/nullptr, TmpInits, Candidates, /*SuppressUserConversions=*/false, diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index deef01c946feb..55634aa75ae25 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -1123,10 +1123,6 @@ void OverloadCandidateSet::clear(CandidateSetKind CSK) { Candidates.clear(); Functions.clear(); Kind = CSK; - FirstDeferredCandidate = nullptr; - DeferredCandidatesCount = 0; - HasDeferredTemplateConstructors = false; - ResolutionByPerfectCandidateIsDisabled = false; } namespace { @@ -7799,14 +7795,15 @@ void Sema::AddMethodCandidate( } } -static void AddMethodTemplateCandidateImmediately( - Sema &S, OverloadCandidateSet &CandidateSet, +void Sema::AddMethodTemplateCandidate( FunctionTemplateDecl *MethodTmpl, 
DeclAccessPair FoundDecl, CXXRecordDecl *ActingContext, TemplateArgumentListInfo *ExplicitTemplateArgs, QualType ObjectType, Expr::Classification ObjectClassification, ArrayRef Args, - bool SuppressUserConversions, bool PartialOverloading, - OverloadCandidateParamOrder PO) { + OverloadCandidateSet &CandidateSet, bool SuppressUserConversions, + bool PartialOverloading, OverloadCandidateParamOrder PO) { + if (!CandidateSet.isNewCandidate(MethodTmpl, PO)) + return; // C++ [over.match.funcs]p7: // In each case where a candidate is a function template, candidate @@ -7820,12 +7817,12 @@ static void AddMethodTemplateCandidateImmediately( TemplateDeductionInfo Info(CandidateSet.getLocation()); FunctionDecl *Specialization = nullptr; ConversionSequenceList Conversions; - if (TemplateDeductionResult Result = S.DeduceTemplateArguments( + if (TemplateDeductionResult Result = DeduceTemplateArguments( MethodTmpl, ExplicitTemplateArgs, Args, Specialization, Info, PartialOverloading, /*AggregateDeductionCandidate=*/false, /*PartialOrdering=*/false, ObjectType, ObjectClassification, [&](ArrayRef ParamTypes) { - return S.CheckNonDependentConversions( + return CheckNonDependentConversions( MethodTmpl, ParamTypes, Args, CandidateSet, Conversions, SuppressUserConversions, ActingContext, ObjectType, ObjectClassification, PO); @@ -7847,8 +7844,8 @@ static void AddMethodTemplateCandidateImmediately( Candidate.FailureKind = ovl_fail_bad_conversion; else { Candidate.FailureKind = ovl_fail_bad_deduction; - Candidate.DeductionFailure = - MakeDeductionFailureInfo(S.Context, Result, Info); + Candidate.DeductionFailure = MakeDeductionFailureInfo(Context, Result, + Info); } return; } @@ -7858,34 +7855,10 @@ static void AddMethodTemplateCandidateImmediately( assert(Specialization && "Missing member function template specialization?"); assert(isa(Specialization) && "Specialization is not a member function?"); - S.AddMethodCandidate( - cast(Specialization), FoundDecl, ActingContext, ObjectType, - 
ObjectClassification, Args, CandidateSet, SuppressUserConversions, - PartialOverloading, Conversions, PO, Info.hasStrictPackMatch()); -} - -void Sema::AddMethodTemplateCandidate( - FunctionTemplateDecl *MethodTmpl, DeclAccessPair FoundDecl, - CXXRecordDecl *ActingContext, - TemplateArgumentListInfo *ExplicitTemplateArgs, QualType ObjectType, - Expr::Classification ObjectClassification, ArrayRef Args, - OverloadCandidateSet &CandidateSet, bool SuppressUserConversions, - bool PartialOverloading, OverloadCandidateParamOrder PO) { - if (!CandidateSet.isNewCandidate(MethodTmpl, PO)) - return; - - if (ExplicitTemplateArgs || - !CandidateSet.shouldDeferTemplateArgumentDeduction(getLangOpts())) { - AddMethodTemplateCandidateImmediately( - *this, CandidateSet, MethodTmpl, FoundDecl, ActingContext, - ExplicitTemplateArgs, ObjectType, ObjectClassification, Args, - SuppressUserConversions, PartialOverloading, PO); - return; - } - - CandidateSet.AddDeferredMethodTemplateCandidate( - MethodTmpl, FoundDecl, ActingContext, ObjectType, ObjectClassification, - Args, SuppressUserConversions, PartialOverloading, PO); + AddMethodCandidate(cast(Specialization), FoundDecl, + ActingContext, ObjectType, ObjectClassification, Args, + CandidateSet, SuppressUserConversions, PartialOverloading, + Conversions, PO, Info.hasStrictPackMatch()); } /// Determine whether a given function template has a simple explicit specifier @@ -7894,18 +7867,14 @@ static bool isNonDependentlyExplicit(FunctionTemplateDecl *FTD) { return ExplicitSpecifier::getFromDecl(FTD->getTemplatedDecl()).isExplicit(); } -static bool hasDependentExplicit(FunctionTemplateDecl *FTD) { - return ExplicitSpecifier::getFromDecl(FTD->getTemplatedDecl()).getKind() == - ExplicitSpecKind::Unresolved; -} - -static void AddTemplateOverloadCandidateImmediately( - Sema &S, OverloadCandidateSet &CandidateSet, +void Sema::AddTemplateOverloadCandidate( FunctionTemplateDecl *FunctionTemplate, DeclAccessPair FoundDecl, TemplateArgumentListInfo 
*ExplicitTemplateArgs, ArrayRef Args, - bool SuppressUserConversions, bool PartialOverloading, bool AllowExplicit, - Sema::ADLCallKind IsADLCandidate, OverloadCandidateParamOrder PO, - bool AggregateCandidateDeduction) { + OverloadCandidateSet &CandidateSet, bool SuppressUserConversions, + bool PartialOverloading, bool AllowExplicit, ADLCallKind IsADLCandidate, + OverloadCandidateParamOrder PO, bool AggregateCandidateDeduction) { + if (!CandidateSet.isNewCandidate(FunctionTemplate, PO)) + return; // If the function template has a non-dependent explicit specification, // exclude it now if appropriate; we are not permitted to perform deduction @@ -7932,14 +7901,14 @@ static void AddTemplateOverloadCandidateImmediately( FunctionTemplate->getTemplateDepth()); FunctionDecl *Specialization = nullptr; ConversionSequenceList Conversions; - if (TemplateDeductionResult Result = S.DeduceTemplateArguments( + if (TemplateDeductionResult Result = DeduceTemplateArguments( FunctionTemplate, ExplicitTemplateArgs, Args, Specialization, Info, PartialOverloading, AggregateCandidateDeduction, /*PartialOrdering=*/false, /*ObjectType=*/QualType(), /*ObjectClassification=*/Expr::Classification(), [&](ArrayRef ParamTypes) { - return S.CheckNonDependentConversions( + return CheckNonDependentConversions( FunctionTemplate, ParamTypes, Args, CandidateSet, Conversions, SuppressUserConversions, nullptr, QualType(), {}, PO); }); @@ -7963,8 +7932,8 @@ static void AddTemplateOverloadCandidateImmediately( Candidate.FailureKind = ovl_fail_bad_conversion; else { Candidate.FailureKind = ovl_fail_bad_deduction; - Candidate.DeductionFailure = - MakeDeductionFailureInfo(S.Context, Result, Info); + Candidate.DeductionFailure = MakeDeductionFailureInfo(Context, Result, + Info); } return; } @@ -7972,7 +7941,7 @@ static void AddTemplateOverloadCandidateImmediately( // Add the function template specialization produced by template argument // deduction as a candidate. 
assert(Specialization && "Missing function template specialization?"); - S.AddOverloadCandidate( + AddOverloadCandidate( Specialization, FoundDecl, Args, CandidateSet, SuppressUserConversions, PartialOverloading, AllowExplicit, /*AllowExplicitConversions=*/false, IsADLCandidate, Conversions, PO, @@ -7980,38 +7949,6 @@ static void AddTemplateOverloadCandidateImmediately( Info.hasStrictPackMatch()); } -void Sema::AddTemplateOverloadCandidate( - FunctionTemplateDecl *FunctionTemplate, DeclAccessPair FoundDecl, - TemplateArgumentListInfo *ExplicitTemplateArgs, ArrayRef Args, - OverloadCandidateSet &CandidateSet, bool SuppressUserConversions, - bool PartialOverloading, bool AllowExplicit, ADLCallKind IsADLCandidate, - OverloadCandidateParamOrder PO, bool AggregateCandidateDeduction) { - if (!CandidateSet.isNewCandidate(FunctionTemplate, PO)) - return; - - bool DependentExplicitSpecifier = hasDependentExplicit(FunctionTemplate); - - if (ExplicitTemplateArgs || - !CandidateSet.shouldDeferTemplateArgumentDeduction(getLangOpts()) || - (isa(FunctionTemplate->getTemplatedDecl()) && - DependentExplicitSpecifier)) { - - AddTemplateOverloadCandidateImmediately( - *this, CandidateSet, FunctionTemplate, FoundDecl, ExplicitTemplateArgs, - Args, SuppressUserConversions, PartialOverloading, AllowExplicit, - IsADLCandidate, PO, AggregateCandidateDeduction); - - if (DependentExplicitSpecifier) - CandidateSet.DisableResolutionByPerfectCandidate(); - return; - } - - CandidateSet.AddDeferredTemplateCandidate( - FunctionTemplate, FoundDecl, Args, SuppressUserConversions, - PartialOverloading, AllowExplicit, IsADLCandidate, PO, - AggregateCandidateDeduction); -} - bool Sema::CheckNonDependentConversions( FunctionTemplateDecl *FunctionTemplate, ArrayRef ParamTypes, ArrayRef Args, OverloadCandidateSet &CandidateSet, @@ -8327,12 +8264,16 @@ void Sema::AddConversionCandidate( } } -static void AddTemplateConversionCandidateImmediately( - Sema &S, OverloadCandidateSet &CandidateSet, +void 
Sema::AddTemplateConversionCandidate( FunctionTemplateDecl *FunctionTemplate, DeclAccessPair FoundDecl, - CXXRecordDecl *ActingContext, Expr *From, QualType ToType, - bool AllowObjCConversionOnExplicit, bool AllowExplicit, - bool AllowResultConversion) { + CXXRecordDecl *ActingDC, Expr *From, QualType ToType, + OverloadCandidateSet &CandidateSet, bool AllowObjCConversionOnExplicit, + bool AllowExplicit, bool AllowResultConversion) { + assert(isa(FunctionTemplate->getTemplatedDecl()) && + "Only conversion function templates permitted here"); + + if (!CandidateSet.isNewCandidate(FunctionTemplate)) + return; // If the function template has a non-dependent explicit specification, // exclude it now if appropriate; we are not permitted to perform deduction @@ -8347,11 +8288,11 @@ static void AddTemplateConversionCandidateImmediately( } QualType ObjectType = From->getType(); - Expr::Classification ObjectClassification = From->Classify(S.Context); + Expr::Classification ObjectClassification = From->Classify(getASTContext()); TemplateDeductionInfo Info(CandidateSet.getLocation()); CXXConversionDecl *Specialization = nullptr; - if (TemplateDeductionResult Result = S.DeduceTemplateArguments( + if (TemplateDeductionResult Result = DeduceTemplateArguments( FunctionTemplate, ObjectType, ObjectClassification, ToType, Specialization, Info); Result != TemplateDeductionResult::Success) { @@ -8361,47 +8302,18 @@ static void AddTemplateConversionCandidateImmediately( Candidate.Viable = false; Candidate.FailureKind = ovl_fail_bad_deduction; Candidate.ExplicitCallArguments = 1; - Candidate.DeductionFailure = - MakeDeductionFailureInfo(S.Context, Result, Info); + Candidate.DeductionFailure = MakeDeductionFailureInfo(Context, Result, + Info); return; } // Add the conversion function template specialization produced by // template argument deduction as a candidate. 
assert(Specialization && "Missing function template specialization?"); - S.AddConversionCandidate(Specialization, FoundDecl, ActingContext, From, - ToType, CandidateSet, AllowObjCConversionOnExplicit, - AllowExplicit, AllowResultConversion, - Info.hasStrictPackMatch()); -} - -void Sema::AddTemplateConversionCandidate( - FunctionTemplateDecl *FunctionTemplate, DeclAccessPair FoundDecl, - CXXRecordDecl *ActingDC, Expr *From, QualType ToType, - OverloadCandidateSet &CandidateSet, bool AllowObjCConversionOnExplicit, - bool AllowExplicit, bool AllowResultConversion) { - assert(isa(FunctionTemplate->getTemplatedDecl()) && - "Only conversion function templates permitted here"); - - if (!CandidateSet.isNewCandidate(FunctionTemplate)) - return; - - if (!CandidateSet.shouldDeferTemplateArgumentDeduction(getLangOpts()) || - CandidateSet.getKind() == - OverloadCandidateSet::CSK_InitByUserDefinedConversion || - CandidateSet.getKind() == OverloadCandidateSet::CSK_InitByConstructor) { - AddTemplateConversionCandidateImmediately( - *this, CandidateSet, FunctionTemplate, FoundDecl, ActingDC, From, - ToType, AllowObjCConversionOnExplicit, AllowExplicit, - AllowResultConversion); - - CandidateSet.DisableResolutionByPerfectCandidate(); - return; - } - - CandidateSet.AddDeferredConversionTemplateCandidate( - FunctionTemplate, FoundDecl, ActingDC, From, ToType, - AllowObjCConversionOnExplicit, AllowExplicit, AllowResultConversion); + AddConversionCandidate(Specialization, FoundDecl, ActingDC, From, ToType, + CandidateSet, AllowObjCConversionOnExplicit, + AllowExplicit, AllowResultConversion, + Info.hasStrictPackMatch()); } void Sema::AddSurrogateCandidate(CXXConversionDecl *Conversion, @@ -8551,17 +8463,11 @@ void Sema::AddNonMemberOperatorCandidates( if (FunTmpl) { AddTemplateOverloadCandidate(FunTmpl, F.getPair(), ExplicitTemplateArgs, FunctionArgs, CandidateSet); - if (CandidateSet.getRewriteInfo().shouldAddReversed(*this, Args, FD)) { - - // As template candidates are not deduced 
immediately, - // persist the array in the overload set. - ArrayRef Reversed = CandidateSet.getPersistentArgsArray( - FunctionArgs[1], FunctionArgs[0]); - AddTemplateOverloadCandidate(FunTmpl, F.getPair(), ExplicitTemplateArgs, - Reversed, CandidateSet, false, false, true, - ADLCallKind::NotADL, - OverloadCandidateParamOrder::Reversed); - } + if (CandidateSet.getRewriteInfo().shouldAddReversed(*this, Args, FD)) + AddTemplateOverloadCandidate( + FunTmpl, F.getPair(), ExplicitTemplateArgs, + {FunctionArgs[1], FunctionArgs[0]}, CandidateSet, false, false, + true, ADLCallKind::NotADL, OverloadCandidateParamOrder::Reversed); } else { if (ExplicitTemplateArgs) continue; @@ -10293,8 +10199,6 @@ Sema::AddArgumentDependentLookupCandidates(DeclarationName Name, // FIXME: Pass in the explicit template arguments? ArgumentDependentLookup(Name, Loc, Args, Fns); - ArrayRef ReversedArgs; - // Erase all of the candidates we already knew about. for (OverloadCandidateSet::iterator Cand = CandidateSet.begin(), CandEnd = CandidateSet.end(); @@ -10334,15 +10238,9 @@ Sema::AddArgumentDependentLookupCandidates(DeclarationName Name, /*AllowExplicit=*/true, ADLCallKind::UsesADL); if (CandidateSet.getRewriteInfo().shouldAddReversed( *this, Args, FTD->getTemplatedDecl())) { - - // As template candidates are not deduced immediately, - // persist the array in the overload set. 
- if (ReversedArgs.empty()) - ReversedArgs = CandidateSet.getPersistentArgsArray(Args[1], Args[0]); - AddTemplateOverloadCandidate( - FTD, FoundDecl, ExplicitTemplateArgs, ReversedArgs, CandidateSet, - /*SuppressUserConversions=*/false, PartialOverloading, + FTD, FoundDecl, ExplicitTemplateArgs, {Args[1], Args[0]}, + CandidateSet, /*SuppressUserConversions=*/false, PartialOverloading, /*AllowExplicit=*/true, ADLCallKind::UsesADL, OverloadCandidateParamOrder::Reversed); } @@ -11015,147 +10913,23 @@ bool OverloadCandidate::NotValidBecauseConstraintExprHasError() const { ->Satisfaction.ContainsErrors; } -void OverloadCandidateSet::AddDeferredTemplateCandidate( - FunctionTemplateDecl *FunctionTemplate, DeclAccessPair FoundDecl, - ArrayRef Args, bool SuppressUserConversions, - bool PartialOverloading, bool AllowExplicit, - CallExpr::ADLCallKind IsADLCandidate, OverloadCandidateParamOrder PO, - bool AggregateCandidateDeduction) { - - auto *C = - allocateDeferredCandidate(); - - C = new (C) DeferredFunctionTemplateOverloadCandidate{ - {nullptr, DeferredFunctionTemplateOverloadCandidate::Function, - /*AllowObjCConversionOnExplicit=*/false, - /*AllowResultConversion=*/false, AllowExplicit, SuppressUserConversions, - PartialOverloading, AggregateCandidateDeduction}, - FunctionTemplate, - FoundDecl, - Args, - IsADLCandidate, - PO}; - - HasDeferredTemplateConstructors |= - isa(FunctionTemplate->getTemplatedDecl()); -} - -void OverloadCandidateSet::AddDeferredMethodTemplateCandidate( - FunctionTemplateDecl *MethodTmpl, DeclAccessPair FoundDecl, - CXXRecordDecl *ActingContext, QualType ObjectType, - Expr::Classification ObjectClassification, ArrayRef Args, - bool SuppressUserConversions, bool PartialOverloading, - OverloadCandidateParamOrder PO) { - - assert(!isa(MethodTmpl->getTemplatedDecl())); - - auto *C = - allocateDeferredCandidate(); - - C = new (C) DeferredMethodTemplateOverloadCandidate{ - {nullptr, DeferredFunctionTemplateOverloadCandidate::Method, - 
/*AllowObjCConversionOnExplicit=*/false, - /*AllowResultConversion=*/false, - /*AllowExplicit=*/false, SuppressUserConversions, PartialOverloading, - /*AggregateCandidateDeduction=*/false}, - MethodTmpl, - FoundDecl, - Args, - ActingContext, - ObjectClassification, - ObjectType, - PO}; -} - -void OverloadCandidateSet::AddDeferredConversionTemplateCandidate( - FunctionTemplateDecl *FunctionTemplate, DeclAccessPair FoundDecl, - CXXRecordDecl *ActingContext, Expr *From, QualType ToType, - bool AllowObjCConversionOnExplicit, bool AllowExplicit, - bool AllowResultConversion) { - - auto *C = - allocateDeferredCandidate(); - - C = new (C) DeferredConversionTemplateOverloadCandidate{ - {nullptr, DeferredFunctionTemplateOverloadCandidate::Conversion, - AllowObjCConversionOnExplicit, AllowResultConversion, - /*AllowExplicit=*/false, - /*SuppressUserConversions=*/false, - /*PartialOverloading*/ false, - /*AggregateCandidateDeduction=*/false}, - FunctionTemplate, - FoundDecl, - ActingContext, - From, - ToType}; -} - -static void -AddTemplateOverloadCandidate(Sema &S, OverloadCandidateSet &CandidateSet, - DeferredMethodTemplateOverloadCandidate &C) { - - AddMethodTemplateCandidateImmediately( - S, CandidateSet, C.FunctionTemplate, C.FoundDecl, C.ActingContext, - /*ExplicitTemplateArgs=*/nullptr, C.ObjectType, C.ObjectClassification, - C.Args, C.SuppressUserConversions, C.PartialOverloading, C.PO); -} - -static void -AddTemplateOverloadCandidate(Sema &S, OverloadCandidateSet &CandidateSet, - DeferredFunctionTemplateOverloadCandidate &C) { - AddTemplateOverloadCandidateImmediately( - S, CandidateSet, C.FunctionTemplate, C.FoundDecl, - /*ExplicitTemplateArgs=*/nullptr, C.Args, C.SuppressUserConversions, - C.PartialOverloading, C.AllowExplicit, C.IsADLCandidate, C.PO, - C.AggregateCandidateDeduction); -} - -static void -AddTemplateOverloadCandidate(Sema &S, OverloadCandidateSet &CandidateSet, - DeferredConversionTemplateOverloadCandidate &C) { - return 
AddTemplateConversionCandidateImmediately( - S, CandidateSet, C.FunctionTemplate, C.FoundDecl, C.ActingContext, C.From, - C.ToType, C.AllowObjCConversionOnExplicit, C.AllowExplicit, - C.AllowResultConversion); -} - -void OverloadCandidateSet::InjectNonDeducedTemplateCandidates(Sema &S) { - Candidates.reserve(Candidates.size() + DeferredCandidatesCount); - DeferredTemplateOverloadCandidate *Cand = FirstDeferredCandidate; - while (Cand) { - switch (Cand->Kind) { - case DeferredTemplateOverloadCandidate::Function: - AddTemplateOverloadCandidate( - S, *this, - *static_cast(Cand)); - break; - case DeferredTemplateOverloadCandidate::Method: - AddTemplateOverloadCandidate( - S, *this, - *static_cast(Cand)); - break; - case DeferredTemplateOverloadCandidate::Conversion: - AddTemplateOverloadCandidate( - S, *this, - *static_cast(Cand)); - break; - } - Cand = Cand->Next; - } - FirstDeferredCandidate = nullptr; - DeferredCandidatesCount = 0; -} - +/// Computes the best viable function (C++ 13.3.3) +/// within an overload candidate set. +/// +/// \param Loc The location of the function name (or operator symbol) for +/// which overload resolution occurs. +/// +/// \param Best If overload resolution was successful or found a deleted +/// function, \p Best points to the candidate function found. +/// +/// \returns The result of overload resolution. 
OverloadingResult -OverloadCandidateSet::ResultForBestCandidate(const iterator &Best) { - Best->Best = true; - if (Best->Function && Best->Function->isDeleted()) - return OR_Deleted; - return OR_Success; -} +OverloadCandidateSet::BestViableFunction(Sema &S, SourceLocation Loc, + iterator &Best) { + llvm::SmallVector Candidates; + std::transform(begin(), end(), std::back_inserter(Candidates), + [](OverloadCandidate &Cand) { return &Cand; }); -void OverloadCandidateSet::CudaExcludeWrongSideCandidates( - Sema &S, SmallVectorImpl &Candidates) { // [CUDA] HD->H or HD->D calls are technically not allowed by CUDA but // are accepted by both clang and NVCC. However, during a particular // compilation mode only one call variant is viable. We need to @@ -11167,112 +10941,27 @@ void OverloadCandidateSet::CudaExcludeWrongSideCandidates( // -fgpu-exclude-wrong-side-overloads is off. When // -fgpu-exclude-wrong-side-overloads is on, all candidates are compared // uniformly in isBetterOverloadCandidate. - if (!S.getLangOpts().CUDA || S.getLangOpts().GPUExcludeWrongSideOverloads) - return; - const FunctionDecl *Caller = S.getCurFunctionDecl(/*AllowLambda=*/true); - - bool ContainsSameSideCandidate = - llvm::any_of(Candidates, [&](const OverloadCandidate *Cand) { - // Check viable function only. + if (S.getLangOpts().CUDA && !S.getLangOpts().GPUExcludeWrongSideOverloads) { + const FunctionDecl *Caller = S.getCurFunctionDecl(/*AllowLambda=*/true); + bool ContainsSameSideCandidate = + llvm::any_of(Candidates, [&](OverloadCandidate *Cand) { + // Check viable function only. + return Cand->Viable && Cand->Function && + S.CUDA().IdentifyPreference(Caller, Cand->Function) == + SemaCUDA::CFP_SameSide; + }); + if (ContainsSameSideCandidate) { + auto IsWrongSideCandidate = [&](OverloadCandidate *Cand) { + // Check viable function only to avoid unnecessary data copying/moving. 
return Cand->Viable && Cand->Function && S.CUDA().IdentifyPreference(Caller, Cand->Function) == - SemaCUDA::CFP_SameSide; - }); - - if (!ContainsSameSideCandidate) - return; - - auto IsWrongSideCandidate = [&](const OverloadCandidate *Cand) { - // Check viable function only to avoid unnecessary data copying/moving. - return Cand->Viable && Cand->Function && - S.CUDA().IdentifyPreference(Caller, Cand->Function) == - SemaCUDA::CFP_WrongSide; - }; - llvm::erase_if(Candidates, IsWrongSideCandidate); -} - -/// Computes the best viable function (C++ 13.3.3) -/// within an overload candidate set. -/// -/// \param Loc The location of the function name (or operator symbol) for -/// which overload resolution occurs. -/// -/// \param Best If overload resolution was successful or found a deleted -/// function, \p Best points to the candidate function found. -/// -/// \returns The result of overload resolution. -OverloadingResult OverloadCandidateSet::BestViableFunction(Sema &S, - SourceLocation Loc, - iterator &Best) { - - assert(shouldDeferTemplateArgumentDeduction(S.getLangOpts()) || - DeferredCandidatesCount == 0 && - "Unexpected deferred template candidates"); - - bool TwoPhaseResolution = - DeferredCandidatesCount != 0 && !ResolutionByPerfectCandidateIsDisabled; - - if (TwoPhaseResolution) { - - PerfectViableFunction(S, Loc, Best); - if (Best != end()) - return ResultForBestCandidate(Best); - } - - InjectNonDeducedTemplateCandidates(S); - return BestViableFunctionImpl(S, Loc, Best); -} - -void OverloadCandidateSet::PerfectViableFunction( - Sema &S, SourceLocation Loc, OverloadCandidateSet::iterator &Best) { - - Best = end(); - for (auto It = begin(); It != end(); ++It) { - - if (!It->isPerfectMatch(S.getASTContext())) - continue; - - // We found a suitable conversion function - // but if there is a template constructor in the target class - // we might prefer that instead. 
- if (HasDeferredTemplateConstructors && - isa_and_nonnull(It->Function)) { - Best = end(); - break; - } - - if (Best == end()) { - Best = It; - continue; - } - if (Best->Function && It->Function) { - FunctionDecl *D = - S.getMoreConstrainedFunction(Best->Function, It->Function); - if (D == nullptr) { - Best = end(); - break; - } - if (D == It->Function) - Best = It; - continue; + SemaCUDA::CFP_WrongSide; + }; + llvm::erase_if(Candidates, IsWrongSideCandidate); } - // ambiguous - Best = end(); - break; } -} - -OverloadingResult OverloadCandidateSet::BestViableFunctionImpl( - Sema &S, SourceLocation Loc, OverloadCandidateSet::iterator &Best) { - - llvm::SmallVector Candidates; - Candidates.reserve(this->Candidates.size()); - std::transform(begin(), end(), std::back_inserter(Candidates), - [](OverloadCandidate &Cand) { return &Cand; }); - - if (S.getLangOpts().CUDA) - CudaExcludeWrongSideCandidates(S, Candidates); + // Find the best viable function. Best = end(); for (auto *Cand : Candidates) { Cand->Best = false; @@ -11294,8 +10983,9 @@ OverloadingResult OverloadCandidateSet::BestViableFunctionImpl( if (Best == end()) return OR_No_Viable_Function; - llvm::SmallVector PendingBest; llvm::SmallVector EquivalentCands; + + llvm::SmallVector PendingBest; PendingBest.push_back(&*Best); Best->Best = true; @@ -11318,15 +11008,25 @@ OverloadingResult OverloadCandidateSet::BestViableFunctionImpl( } } + // If we found more than one best candidate, this is ambiguous. if (Best == end()) return OR_Ambiguous; - OverloadingResult R = ResultForBestCandidate(Best); + // Best is the best viable function. 
+ if (Best->Function && Best->Function->isDeleted()) + return OR_Deleted; + + if (auto *M = dyn_cast_or_null(Best->Function); + Kind == CSK_AddressOfOverloadSet && M && + M->isImplicitObjectMemberFunction()) { + return OR_No_Viable_Function; + } if (!EquivalentCands.empty()) S.diagnoseEquivalentInternalLinkageDeclarations(Loc, Best->Function, EquivalentCands); - return R; + + return OR_Success; } namespace { @@ -13033,9 +12733,6 @@ SmallVector OverloadCandidateSet::CompleteCandidates( Sema &S, OverloadCandidateDisplayKind OCD, ArrayRef Args, SourceLocation OpLoc, llvm::function_ref Filter) { - - InjectNonDeducedTemplateCandidates(S); - // Sort the candidates by viability and position. Sorting directly would // be prohibitive, so we make a set of pointers and sort those. SmallVector Cands; @@ -14657,12 +14354,10 @@ ExprResult Sema::BuildOverloadedCallExpr(Scope *S, Expr *Fn, Expr *ExecConfig, bool AllowTypoCorrection, bool CalleesAddressIsTaken) { - - OverloadCandidateSet::CandidateSetKind CSK = - CalleesAddressIsTaken ? OverloadCandidateSet::CSK_AddressOfOverloadSet - : OverloadCandidateSet::CSK_Normal; - - OverloadCandidateSet CandidateSet(Fn->getExprLoc(), CSK); + OverloadCandidateSet CandidateSet( + Fn->getExprLoc(), CalleesAddressIsTaken + ? 
OverloadCandidateSet::CSK_AddressOfOverloadSet + : OverloadCandidateSet::CSK_Normal); ExprResult result; if (buildOverloadedCallSet(S, Fn, ULE, Args, LParenLoc, &CandidateSet, @@ -14678,17 +14373,6 @@ ExprResult Sema::BuildOverloadedCallExpr(Scope *S, Expr *Fn, OverloadingResult OverloadResult = CandidateSet.BestViableFunction(*this, Fn->getBeginLoc(), Best); - // [C++23][over.call.func] - // if overload resolution selects a non-static member function, - // the call is ill-formed; - if (CSK == OverloadCandidateSet::CSK_AddressOfOverloadSet && - Best != CandidateSet.end()) { - if (auto *M = dyn_cast_or_null(Best->Function); - M && M->isImplicitObjectMemberFunction()) { - OverloadResult = OR_No_Viable_Function; - } - } - // Model the case with a call to a templated function whose definition // encloses the call and whose return type contains a placeholder type as if // the UnresolvedLookupExpr was type-dependent. @@ -15024,24 +14708,18 @@ void Sema::LookupOverloadedBinOp(OverloadCandidateSet &CandidateSet, // rewritten candidates using these functions if necessary. AddNonMemberOperatorCandidates(Fns, Args, CandidateSet); - // As template candidates are not deduced immediately, - // persist the array in the overload set. - ArrayRef ReversedArgs; - if (CandidateSet.getRewriteInfo().allowsReversed(Op) || - CandidateSet.getRewriteInfo().allowsReversed(ExtraOp)) - ReversedArgs = CandidateSet.getPersistentArgsArray(Args[1], Args[0]); - // Add operator candidates that are member functions. AddMemberOperatorCandidates(Op, OpLoc, Args, CandidateSet); if (CandidateSet.getRewriteInfo().allowsReversed(Op)) - AddMemberOperatorCandidates(Op, OpLoc, ReversedArgs, CandidateSet, + AddMemberOperatorCandidates(Op, OpLoc, {Args[1], Args[0]}, CandidateSet, OverloadCandidateParamOrder::Reversed); // In C++20, also add any rewritten member candidates. 
if (ExtraOp) { AddMemberOperatorCandidates(ExtraOp, OpLoc, Args, CandidateSet); if (CandidateSet.getRewriteInfo().allowsReversed(ExtraOp)) - AddMemberOperatorCandidates(ExtraOp, OpLoc, ReversedArgs, CandidateSet, + AddMemberOperatorCandidates(ExtraOp, OpLoc, {Args[1], Args[0]}, + CandidateSet, OverloadCandidateParamOrder::Reversed); } diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index 0ecdbb3ffb89f..772962ac653f7 100644 --- a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -6142,9 +6142,9 @@ FunctionDecl *Sema::getMoreConstrainedFunction(FunctionDecl *FD1, assert(!FD1->getDescribedTemplate() && !FD2->getDescribedTemplate() && "not for function templates"); assert(!FD1->isFunctionTemplateSpecialization() || - (isa(FD1))); + isa(FD1)); assert(!FD2->isFunctionTemplateSpecialization() || - (isa(FD2))); + isa(FD2)); FunctionDecl *F1 = FD1; if (FunctionDecl *P = FD1->getTemplateInstantiationPattern(false)) diff --git a/clang/test/CXX/temp/temp.constr/temp.constr.atomic/constrant-satisfaction-conversions.cpp b/clang/test/CXX/temp/temp.constr/temp.constr.atomic/constrant-satisfaction-conversions.cpp index 083e743818121..ba8e2dc372e98 100644 --- a/clang/test/CXX/temp/temp.constr/temp.constr.atomic/constrant-satisfaction-conversions.cpp +++ b/clang/test/CXX/temp/temp.constr/temp.constr.atomic/constrant-satisfaction-conversions.cpp @@ -14,7 +14,7 @@ template struct S { // expected-note@#FINST{{in instantiation of function template specialization}} template requires (S{}) void f(T); -void f(long); +void f(int); // Ensure this applies to operator && as well. 
// expected-error@+3{{atomic constraint must be of type 'bool' (found 'S')}} @@ -22,7 +22,7 @@ void f(long); // expected-note@#F2INST{{in instantiation of function template specialization}} template requires (S{} && true) void f2(T); -void f2(long); +void f2(int); template requires requires { requires S{}; @@ -36,12 +36,12 @@ template requires requires { // } void f3(T); -void f3(long); +void f3(int); // Doesn't diagnose, since this is no longer a compound requirement. template requires (bool(1 && 2)) void f4(T); -void f4(long); +void f4(int); void g() { f(0); // #FINST diff --git a/clang/test/SemaCUDA/function-overload.cu b/clang/test/SemaCUDA/function-overload.cu index 3d05839af7528..4710c81763adf 100644 --- a/clang/test/SemaCUDA/function-overload.cu +++ b/clang/test/SemaCUDA/function-overload.cu @@ -1,3 +1,6 @@ +// REQUIRES: x86-registered-target +// REQUIRES: nvptx-registered-target + // RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-linux-gnu -fsyntax-only \ // RUN: -verify=host,hostdefer,devdefer,expected %s // RUN: %clang_cc1 -std=c++14 -triple nvptx64-nvidia-cuda -fsyntax-only \ diff --git a/clang/test/SemaCXX/implicit-member-functions.cpp b/clang/test/SemaCXX/implicit-member-functions.cpp index 8350eac5b88a0..1554b1af5d59a 100644 --- a/clang/test/SemaCXX/implicit-member-functions.cpp +++ b/clang/test/SemaCXX/implicit-member-functions.cpp @@ -54,24 +54,31 @@ namespace PR7594 { namespace Recursion { template struct InvokeCopyConstructor { static const T &get(); - typedef decltype(T(get())) type; + typedef decltype(T(get())) type; // expected-error {{no matching conver}} }; struct B; struct A { + // expected-note@-1 {{while substituting deduced template arguments}} typedef B type; template::type> + // expected-note@-1 {{in instantiation of template class}} A(const T &); + // expected-note@-1 {{in instantiation of default argument}} }; - struct B { - B(); + struct B { // expected-note {{while declaring the implicit copy constructor for 'B'}} + // 
expected-note@-1 {{candidate constructor (the implicit move }} + B(); // expected-note {{candidate constructor not viable}} A a; }; // Triggering the declaration of B's copy constructor causes overload - // resolution to occur for A's copying constructor, which picks - // the implicit copy constructor of A. - // Because that copy constructor is always a perfect match the template - // candidate is not instantiated. + // resolution to occur for A's copying constructor, which instantiates + // InvokeCopyConstructor, which triggers the declaration of B's copy + // constructor. Notionally, this happens when we get to the end of the + // definition of 'struct B', so there is no declared copy constructor yet. + // + // This behavior is g++-compatible, but isn't exactly right; the class is + // supposed to be incomplete when we implicitly declare its special members. B b = B(); diff --git a/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp b/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp deleted file mode 100644 index 877816ca013ec..0000000000000 --- a/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp +++ /dev/null @@ -1,185 +0,0 @@ -// RUN: %clang_cc1 -triple=x86_64-unknown-unknown -fsyntax-only -verify -std=c++11 %s -// RUN: %clang_cc1 -triple=x86_64-unknown-unknown -fsyntax-only -verify -std=c++20 %s -// RUN: %clang_cc1 -triple=x86_64-unknown-unknown -fsyntax-only -verify -std=c++2c %s - -template -struct Invalid { static_assert(false, "instantiated Invalid"); }; // #err-invalid - -template -int f(T a, Invalid = {}); // #note-f - -// sanity check -int e1 = f(0); -//expected-error@#err-invalid {{static assertion failed: instantiated Invalid}} -//expected-note@-2 {{in instantiation of default function argument expression for 'f' required here}} -//expected-note@#note-f {{in instantiation of template class 'Invalid' requested here}} -//expected-note@#note-f {{passing argument to parameter here}} - -int f(int); -int ok1 = f(0); -int 
e4 = f((const int&)(ok1)); - -int f(int, int = 0); -int ok2 = f(0, 0); - -int e2 = f(0L); -//expected-error@#err-invalid {{static assertion failed: instantiated Invalid}} -//expected-note@-2 {{in instantiation of default function argument expression for 'f' required here}} -//expected-note@#note-f {{in instantiation of template class 'Invalid' requested here}} -//expected-note@#note-f {{passing argument to parameter here}} - -int f(long); -int ok3 = f(0L); - -template -struct Invalid2 { static_assert(false, "instantiated Invalid2"); }; // #err-qualifiers - -template -int ref(T a, Invalid2 = {}); // expected-note 2{{here}} -int ref(int&); -int ref1 = ref(ok3); -int ref2 = ref((const int&)ok3); // expected-note {{here}} -//expected-error@#err-qualifiers {{static assertion failed: instantiated Invalid2}} - - -template -int f_alias(T a, Invalid = {}); -using Alias = int; -int f_alias(Alias); -int ok4 = f_alias(0); - -#if __cplusplus >= 202002 - -struct Copyable { - template - requires __is_constructible(Copyable, T) - explicit Copyable(T op) noexcept; // #1 - Copyable(const Copyable&) noexcept = default; // #2 -}; -static_assert(__is_constructible(Copyable, const Copyable&)); - -struct ImplicitlyCopyable { - template - requires __is_constructible(ImplicitlyCopyable, T) - explicit ImplicitlyCopyable(T op) = delete; // #1 -}; -static_assert(__is_constructible(ImplicitlyCopyable, const ImplicitlyCopyable&)); - - -struct Movable { - template - requires __is_constructible(Movable, T) // #err-self-constraint-1 - explicit Movable(T op) noexcept; // #1 - Movable(Movable&&) noexcept = default; // #2 -}; -static_assert(__is_constructible(Movable, Movable&&)); -static_assert(__is_constructible(Movable, const Movable&)); -// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(Movable, const Movable &)'}} - -static_assert(__is_constructible(Movable, int)); -// expected-error@-1{{static assertion failed due to requirement '__is_constructible(Movable, 
int)'}} \ -// expected-note@-1 2{{}} -// expected-error@#err-self-constraint-1{{satisfaction of constraint '__is_constructible(Movable, T)' depends on itself}} -// expected-note@#err-self-constraint-1 4{{}} - -template -struct Members { - constexpr auto f(auto) { - static_assert(false, ""); - } - constexpr auto f(int) { return 1; } - constexpr auto f(int) requires true { return 2; } - - constexpr auto g(auto) { - static_assert(false, "instantiated member"); //#err-qualified-member - return 0; - } - constexpr auto g(int) & { return 1; } - - static constexpr auto s(auto) { - static_assert(false, ""); - } - static constexpr auto s(int) { - return 1; - } - static constexpr auto s(int) requires true { - return 2; - } -}; - -static_assert(Members{}.f(0) == 2); -static_assert(Members{}.g(0) == 0); -// expected-error@#err-qualified-member {{static assertion failed: instantiated member}} \ -// expected-note@-1{{in instantiation of function template specialization 'Members::g' }} -Members m1; -static_assert(m1.g(0) == 1); -static_assert(Members{}.s(0) == 2); - - -namespace ConstructorInit{ -struct S { - template - S(T&&) {} -}; -struct Test { - operator S() = delete; -}; - -static_assert(__is_constructible(S, Test)); -} - -namespace RefBinding { - -template struct remove_reference; -template struct remove_reference<_Tp &> { - using type = _Tp; -}; -template remove_reference<_Tp>::type move(_Tp &&); -template struct _Head_base { - _Head_base(_Head &__h) : _M_head_impl(__h) {} - template _Head_base(_UHead &&); - _Head _M_head_impl; -}; - -template void forward_as_tuple(_Elements &&) { - _Head_base<_Elements &&>(_Elements{}); -} -struct StringRef { - void operator[](const StringRef __k) { forward_as_tuple((move)(__k)); } -}; - -} - -template struct tuple {}; -struct BonkersBananas { - template operator T(); - template explicit operator tuple() = delete; -}; -static_assert(!__is_constructible(tuple, BonkersBananas)); - -namespace GH62096 { -template -struct Oops { - 
static_assert(sizeof(T) == 0); // #GH62096-err - static constexpr bool value = true; -}; - -template -concept Operator = Oops::value; // #GH62096-note1 - -template void f(OP op); // // #GH62096-note2 -void f(int); - -void g(int n) { f(n); } // OK -void h(short n) { f(n); } -// expected-error@#GH62096-err {{static assertion failed due to requirement 'sizeof(short) == 0'}} \ -// expected-note@-1{{in instantiation of function template specialization}} \ -// expected-note@-1{{while checking constraint satisfaction for template}} -// expected-note@#GH62096-note1{{in instantiation}} -// expected-note@#GH62096-note1{{while substituting template arguments into constraint expression here}} -// expected-note@#GH62096-note2{{while substituting template arguments into constraint expression here}} -// expected-note@#GH62096-note2{{while checking the satisfaction of concept}} -// expected-note@#GH62096-err {{expression evaluates}} -} - -#endif diff --git a/clang/test/SemaTemplate/instantiate-function-params.cpp b/clang/test/SemaTemplate/instantiate-function-params.cpp index eb2a7c5d4e8d6..7dd5595de58a3 100644 --- a/clang/test/SemaTemplate/instantiate-function-params.cpp +++ b/clang/test/SemaTemplate/instantiate-function-params.cpp @@ -6,12 +6,13 @@ template struct if_ { typedef if_c< static_cast(T1::value)> almost_type_; // expected-note 7{{in instantiation}} }; template struct wrap_constraints { }; -template +template inline char has_constraints_(Model* , // expected-note 3{{candidate template ignored}} - wrap_constraints* = 0); + wrap_constraints* = 0); // expected-note 4{{in instantiation}} + template struct not_satisfied { static const bool value = sizeof( has_constraints_((Model*)0) == 1); // expected-error 3{{no matching function}} \ - // expected-note 4{{in instantiation}} + // expected-note 4{{while substituting deduced template arguments into function template 'has_constraints_' [with }} }; template struct requirement_; template struct instantiate { diff --git 
a/clang/test/Templight/templight-empty-entries-fix.cpp b/clang/test/Templight/templight-empty-entries-fix.cpp index 7f34b10134929..d13b748068efe 100644 --- a/clang/test/Templight/templight-empty-entries-fix.cpp +++ b/clang/test/Templight/templight-empty-entries-fix.cpp @@ -1,6 +1,6 @@ // RUN: %clang_cc1 -templight-dump -Wno-unused-value %s 2>&1 | FileCheck %s -void a(long) { +void a() { [] {}; } @@ -17,14 +17,14 @@ void a(long) { // CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:4:3'$}} // CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:4:3'$}} -template void a(long) { a(0); } +template void a() { a(); } // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+a$}} // CHECK: {{^kind:[ ]+DeducedTemplateArgumentSubstitution$}} // CHECK: {{^event:[ ]+Begin$}} // CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:20:25'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:20:35'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:20:31'$}} // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+unnamed template non-type parameter 0 of a$}} // CHECK: {{^kind:[ ]+DefaultTemplateArgumentInstantiation$}} @@ -42,29 +42,29 @@ template void a(long) { a(0); } // CHECK: {{^kind:[ ]+DeducedTemplateArgumentSubstitution$}} // CHECK: {{^event:[ ]+End$}} // CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:20:25'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:20:35'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:20:31'$}} // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+'a<0>'$}} // CHECK: {{^kind:[ ]+TemplateInstantiation$}} // CHECK: {{^event:[ ]+Begin$}} // CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:20:25'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:20:35'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:20:31'$}} // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+'a<0>'$}} // CHECK: {{^kind:[ ]+TemplateInstantiation$}} // CHECK: {{^event:[ ]+End$}} // CHECK: {{^orig:[ 
]+'.*templight-empty-entries-fix.cpp:20:25'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:20:35'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:20:31'$}} template struct b { typedef int c; }; -template ::c> void a(long) { a(0); } +template ::c> void a() { a(); } // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+a$}} // CHECK: {{^kind:[ ]+DeducedTemplateArgumentSubstitution$}} // CHECK: {{^event:[ ]+Begin$}} // CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:60:57'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:60:67'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:60:63'$}} // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+d$}} // CHECK: {{^kind:[ ]+DefaultTemplateArgumentInstantiation$}} @@ -130,25 +130,25 @@ template ::c> void a(long) { a(0); } // CHECK: {{^kind:[ ]+DeducedTemplateArgumentSubstitution$}} // CHECK: {{^event:[ ]+End$}} // CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:60:57'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:60:67'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:60:63'$}} // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+'a'$}} // CHECK: {{^kind:[ ]+TemplateInstantiation$}} // CHECK: {{^event:[ ]+Begin$}} // CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:60:57'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:60:67'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:60:63'$}} // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+'a'$}} // CHECK: {{^kind:[ ]+TemplateInstantiation$}} // CHECK: {{^event:[ ]+End$}} // CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:60:57'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:60:67'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:60:63'$}} // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+a$}} // CHECK: {{^kind:[ ]+DeducedTemplateArgumentSubstitution$}} // CHECK: {{^event:[ ]+Begin$}} // CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:20:25'$}} -// CHECK: 
{{^poi:[ ]+'.*templight-empty-entries-fix.cpp:60:67'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:60:63'$}} // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+unnamed template non-type parameter 0 of a$}} // CHECK: {{^kind:[ ]+DefaultTemplateArgumentInstantiation$}} @@ -166,10 +166,34 @@ template ::c> void a(long) { a(0); } // CHECK: {{^kind:[ ]+DeducedTemplateArgumentSubstitution$}} // CHECK: {{^event:[ ]+End$}} // CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:20:25'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:60:67'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:60:63'$}} template void d(int = 0) { d(); } +// CHECK-LABEL: {{^---$}} +// CHECK: {{^name:[ ]+a$}} +// CHECK: {{^kind:[ ]+DeducedTemplateArgumentSubstitution$}} +// CHECK: {{^event:[ ]+Begin$}} +// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:60:57'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:60:63'$}} +// CHECK-LABEL: {{^---$}} +// CHECK: {{^name:[ ]+a$}} +// CHECK: {{^kind:[ ]+DeducedTemplateArgumentSubstitution$}} +// CHECK: {{^event:[ ]+End$}} +// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:60:57'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:60:63'$}} +// CHECK-LABEL: {{^---$}} +// CHECK: {{^name:[ ]+a$}} +// CHECK: {{^kind:[ ]+DeducedTemplateArgumentSubstitution$}} +// CHECK: {{^event:[ ]+Begin$}} +// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:20:25'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:60:63'$}} +// CHECK-LABEL: {{^---$}} +// CHECK: {{^name:[ ]+a$}} +// CHECK: {{^kind:[ ]+DeducedTemplateArgumentSubstitution$}} +// CHECK: {{^event:[ ]+End$}} +// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:20:25'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:60:63'$}} // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+d$}} // CHECK: {{^kind:[ ]+DeducedTemplateArgumentSubstitution$}} @@ -225,41 +249,41 @@ void e() { } // CHECK-LABEL: {{^---$}} -// CHECK: {{^name:[ 
]+'\(unnamed struct at .*templight-empty-entries-fix.cpp:223:3\)'$}} +// CHECK: {{^name:[ ]+'\(unnamed struct at .*templight-empty-entries-fix.cpp:247:3\)'$}} // CHECK: {{^kind:[ ]+Memoization$}} // CHECK: {{^event:[ ]+Begin$}} -// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:223:3'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:224:5'$}} +// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:247:3'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:248:5'$}} // CHECK-LABEL: {{^---$}} -// CHECK: {{^name:[ ]+'\(unnamed struct at .*templight-empty-entries-fix.cpp:223:3\)'$}} +// CHECK: {{^name:[ ]+'\(unnamed struct at .*templight-empty-entries-fix.cpp:247:3\)'$}} // CHECK: {{^kind:[ ]+Memoization$}} // CHECK: {{^event:[ ]+End$}} -// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:223:3'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:224:5'$}} +// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:247:3'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:248:5'$}} // CHECK-LABEL: {{^---$}} -// CHECK: {{^name:[ ]+'\(unnamed struct at .*templight-empty-entries-fix.cpp:223:3\)'$}} +// CHECK: {{^name:[ ]+'\(unnamed struct at .*templight-empty-entries-fix.cpp:247:3\)'$}} // CHECK: {{^kind:[ ]+Memoization$}} // CHECK: {{^event:[ ]+Begin$}} -// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:223:3'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:224:5'$}} +// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:247:3'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:248:5'$}} // CHECK-LABEL: {{^---$}} -// CHECK: {{^name:[ ]+'\(unnamed struct at .*templight-empty-entries-fix.cpp:223:3\)'$}} +// CHECK: {{^name:[ ]+'\(unnamed struct at .*templight-empty-entries-fix.cpp:247:3\)'$}} // CHECK: {{^kind:[ ]+Memoization$}} // CHECK: {{^event:[ ]+End$}} -// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:223:3'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:224:5'$}} +// 
CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:247:3'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:248:5'$}} // CHECK-LABEL: {{^---$}} -// CHECK: {{^name:[ ]+'\(unnamed struct at .*templight-empty-entries-fix.cpp:223:3\)'$}} +// CHECK: {{^name:[ ]+'\(unnamed struct at .*templight-empty-entries-fix.cpp:247:3\)'$}} // CHECK: {{^kind:[ ]+Memoization$}} // CHECK: {{^event:[ ]+Begin$}} -// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:223:3'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:223:3'$}} +// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:247:3'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:247:3'$}} // CHECK-LABEL: {{^---$}} -// CHECK: {{^name:[ ]+'\(unnamed struct at .*templight-empty-entries-fix.cpp:223:3\)'$}} +// CHECK: {{^name:[ ]+'\(unnamed struct at .*templight-empty-entries-fix.cpp:247:3\)'$}} // CHECK: {{^kind:[ ]+Memoization$}} // CHECK: {{^event:[ ]+End$}} -// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:223:3'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:223:3'$}} +// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:247:3'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:247:3'$}} template class> @@ -275,71 +299,71 @@ void foo() { // CHECK: {{^name:[ ]+d$}} // CHECK: {{^kind:[ ]+ExplicitTemplateArgumentSubstitution$}} // CHECK: {{^event:[ ]+Begin$}} -// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:266:6'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:271:3'$}} +// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:290:6'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:295:3'$}} // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+unnamed template template parameter 0 of d$}} // CHECK: {{^kind:[ ]+PriorTemplateArgumentSubstitution$}} // CHECK: {{^event:[ ]+Begin$}} -// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:265:35'$}} +// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:289:35'$}} // CHECK: 
{{^poi:[ ]+''$}} // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+unnamed template template parameter 0 of d$}} // CHECK: {{^kind:[ ]+PriorTemplateArgumentSubstitution$}} // CHECK: {{^event:[ ]+End$}} -// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:265:35'$}} +// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:289:35'$}} // CHECK: {{^poi:[ ]+''$}} // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+unnamed template template parameter 0 of d$}} // CHECK: {{^kind:[ ]+PartialOrderingTTP$}} // CHECK: {{^event:[ ]+Begin$}} -// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:265:35'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:271:5'$}} +// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:289:35'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:295:5'$}} // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+unnamed template template parameter 0 of d$}} // CHECK: {{^kind:[ ]+PartialOrderingTTP$}} // CHECK: {{^event:[ ]+End$}} -// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:265:35'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:271:5'$}} +// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:289:35'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:295:5'$}} // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+d$}} // CHECK: {{^kind:[ ]+ExplicitTemplateArgumentSubstitution$}} // CHECK: {{^event:[ ]+End$}} -// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:266:6'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:271:3'$}} +// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:290:6'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:295:3'$}} // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+d$}} // CHECK: {{^kind:[ ]+DeducedTemplateArgumentSubstitution$}} // CHECK: {{^event:[ ]+Begin$}} -// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:266:6'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:271:3'$}} +// CHECK: {{^orig:[ 
]+'.*templight-empty-entries-fix.cpp:290:6'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:295:3'$}} // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+d$}} // CHECK: {{^kind:[ ]+DeducedTemplateArgumentSubstitution$}} // CHECK: {{^event:[ ]+End$}} -// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:266:6'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:271:3'$}} +// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:290:6'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:295:3'$}} // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+'d'$}} // CHECK: {{^kind:[ ]+TemplateInstantiation$}} // CHECK: {{^event:[ ]+Begin$}} -// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:266:6'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:271:3'$}} +// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:290:6'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:295:3'$}} // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+'d'$}} // CHECK: {{^kind:[ ]+TemplateInstantiation$}} // CHECK: {{^event:[ ]+End$}} -// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:266:6'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:271:3'$}} +// CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:290:6'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:295:3'$}} // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+d$}} // CHECK: {{^kind:[ ]+ExplicitTemplateArgumentSubstitution$}} // CHECK: {{^event:[ ]+Begin$}} // CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:171:29'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:271:3'$}} +// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:295:3'$}} // CHECK-LABEL: {{^---$}} // CHECK: {{^name:[ ]+d$}} // CHECK: {{^kind:[ ]+ExplicitTemplateArgumentSubstitution$}} // CHECK: {{^event:[ ]+End$}} // CHECK: {{^orig:[ ]+'.*templight-empty-entries-fix.cpp:171:29'$}} -// CHECK: {{^poi:[ ]+'.*templight-empty-entries-fix.cpp:271:3'$}} +// CHECK: {{^poi:[ 
]+'.*templight-empty-entries-fix.cpp:295:3'$}} From 6d03f51f0c59171f1ec3c5cc5c1fe71c30956273 Mon Sep 17 00:00:00 2001 From: Jonas Paulsson Date: Wed, 16 Apr 2025 12:02:56 -0600 Subject: [PATCH 157/710] [SystemZ] Add support for 16-bit floating point. (#109164) - _Float16 is now accepted by Clang. - The half IR type is fully handled by the backend. - These values are passed in FP registers and converted to/from float around each operation. - Compiler-rt conversion functions are now built for s390x including the missing extendhfdf2 which was added. Fixes #50374 --- clang/docs/LanguageExtensions.rst | 1 + clang/include/clang/Basic/TargetInfo.h | 2 +- clang/lib/Basic/Targets/SystemZ.h | 13 + clang/lib/CodeGen/Targets/SystemZ.cpp | 40 +- clang/test/CodeGen/SystemZ/Float16.c | 85 ++ clang/test/CodeGen/SystemZ/fp16.c | 39 + .../test/CodeGen/SystemZ/strictfp_builtins.c | 14 +- clang/test/CodeGen/SystemZ/systemz-abi.c | 53 ++ .../test/CodeGen/SystemZ/systemz-inline-asm.c | 8 + compiler-rt/cmake/builtin-config-ix.cmake | 3 +- compiler-rt/lib/builtins/CMakeLists.txt | 6 + compiler-rt/lib/builtins/clear_cache.c | 2 + compiler-rt/lib/builtins/extendhfdf2.c | 15 + compiler-rt/test/builtins/CMakeLists.txt | 2 +- .../test/builtins/Unit/extendhfdf2_test.c | 87 +++ llvm/docs/LangRef.rst | 2 +- llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 19 + .../SystemZ/AsmParser/SystemZAsmParser.cpp | 16 + .../MCTargetDesc/SystemZMCTargetDesc.cpp | 16 + .../MCTargetDesc/SystemZMCTargetDesc.h | 2 + llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp | 46 +- llvm/lib/Target/SystemZ/SystemZCallingConv.td | 4 +- llvm/lib/Target/SystemZ/SystemZFeatures.td | 2 + .../Target/SystemZ/SystemZISelDAGToDAG.cpp | 7 +- .../Target/SystemZ/SystemZISelLowering.cpp | 412 ++++++++-- llvm/lib/Target/SystemZ/SystemZISelLowering.h | 11 + llvm/lib/Target/SystemZ/SystemZInstrFP.td | 43 +- llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp | 19 +- llvm/lib/Target/SystemZ/SystemZInstrVector.td | 8 +- 
.../Target/SystemZ/SystemZRegisterInfo.cpp | 5 +- llvm/lib/Target/SystemZ/SystemZRegisterInfo.h | 2 +- .../lib/Target/SystemZ/SystemZRegisterInfo.td | 34 +- llvm/lib/Target/SystemZ/SystemZScheduleZ13.td | 22 +- llvm/lib/Target/SystemZ/SystemZScheduleZ14.td | 22 +- llvm/lib/Target/SystemZ/SystemZScheduleZ15.td | 22 +- llvm/lib/Target/SystemZ/SystemZScheduleZ16.td | 22 +- llvm/lib/Target/SystemZ/SystemZScheduleZ17.td | 22 +- .../lib/Target/SystemZ/SystemZScheduleZ196.td | 14 +- .../Target/SystemZ/SystemZScheduleZEC12.td | 14 +- llvm/test/CodeGen/SystemZ/asm-10.ll | 9 + llvm/test/CodeGen/SystemZ/asm-17.ll | 11 + llvm/test/CodeGen/SystemZ/asm-19.ll | 19 + llvm/test/CodeGen/SystemZ/atomic-load-10.ll | 22 + llvm/test/CodeGen/SystemZ/atomic-store-10.ll | 24 + .../test/CodeGen/SystemZ/atomicrmw-fadd-04.ll | 76 ++ .../CodeGen/SystemZ/fmuladd-soft-float.ll | 37 + llvm/test/CodeGen/SystemZ/fp-abs-01.ll | 12 + llvm/test/CodeGen/SystemZ/fp-abs-03.ll | 12 + llvm/test/CodeGen/SystemZ/fp-abs-04.ll | 16 + llvm/test/CodeGen/SystemZ/fp-add-01.ll | 12 + llvm/test/CodeGen/SystemZ/fp-cmp-04.ll | 68 +- llvm/test/CodeGen/SystemZ/fp-conv-05.ll | 10 + llvm/test/CodeGen/SystemZ/fp-conv-06.ll | 12 + llvm/test/CodeGen/SystemZ/fp-conv-07.ll | 10 + llvm/test/CodeGen/SystemZ/fp-conv-08.ll | 11 + llvm/test/CodeGen/SystemZ/fp-conv-09.ll | 10 + llvm/test/CodeGen/SystemZ/fp-conv-10.ll | 36 +- llvm/test/CodeGen/SystemZ/fp-conv-11.ll | 10 + llvm/test/CodeGen/SystemZ/fp-conv-12.ll | 36 +- llvm/test/CodeGen/SystemZ/fp-conv-13.ll | 32 +- llvm/test/CodeGen/SystemZ/fp-conv-14.ll | 32 +- llvm/test/CodeGen/SystemZ/fp-conv-20.ll | 78 +- llvm/test/CodeGen/SystemZ/fp-copysign-03.ll | 145 ++++ llvm/test/CodeGen/SystemZ/fp-div-01.ll | 12 + llvm/test/CodeGen/SystemZ/fp-half-cmp.ll | 161 ++++ llvm/test/CodeGen/SystemZ/fp-half-libcall.ll | 312 ++++++++ llvm/test/CodeGen/SystemZ/fp-half-mem.ll | 63 ++ llvm/test/CodeGen/SystemZ/fp-half-move.ll | 83 ++ llvm/test/CodeGen/SystemZ/fp-half-strict.ll | 207 +++++ 
llvm/test/CodeGen/SystemZ/fp-half-vector.ll | 725 ++++++++++++++++++ llvm/test/CodeGen/SystemZ/fp-half.ll | 612 +++++++++++++++ llvm/test/CodeGen/SystemZ/fp-libcall.ll | 10 + llvm/test/CodeGen/SystemZ/fp-mul-01.ll | 12 + llvm/test/CodeGen/SystemZ/fp-mul-06.ll | 14 + llvm/test/CodeGen/SystemZ/fp-mul-08.ll | 18 + llvm/test/CodeGen/SystemZ/fp-mul-10.ll | 38 +- llvm/test/CodeGen/SystemZ/fp-mul-15.ll | 20 + llvm/test/CodeGen/SystemZ/fp-neg-01.ll | 11 + llvm/test/CodeGen/SystemZ/fp-neg-02.ll | 11 + llvm/test/CodeGen/SystemZ/fp-round-01.ll | 48 ++ llvm/test/CodeGen/SystemZ/fp-round-02.ll | 36 + llvm/test/CodeGen/SystemZ/fp-round-03.ll | 15 +- llvm/test/CodeGen/SystemZ/fp-sqrt-01.ll | 12 + llvm/test/CodeGen/SystemZ/fp-strict-cmp-04.ll | 114 ++- .../test/CodeGen/SystemZ/fp-strict-cmps-01.ll | 20 + .../test/CodeGen/SystemZ/fp-strict-cmps-04.ll | 37 + .../test/CodeGen/SystemZ/fp-strict-conv-01.ll | 29 + .../test/CodeGen/SystemZ/fp-strict-conv-02.ll | 11 + .../test/CodeGen/SystemZ/fp-strict-conv-05.ll | 13 + .../test/CodeGen/SystemZ/fp-strict-conv-06.ll | 15 + .../test/CodeGen/SystemZ/fp-strict-conv-07.ll | 13 + .../test/CodeGen/SystemZ/fp-strict-conv-08.ll | 14 + .../test/CodeGen/SystemZ/fp-strict-conv-09.ll | 12 + .../test/CodeGen/SystemZ/fp-strict-conv-10.ll | 50 +- .../test/CodeGen/SystemZ/fp-strict-conv-11.ll | 12 + .../test/CodeGen/SystemZ/fp-strict-conv-12.ll | 50 +- .../test/CodeGen/SystemZ/fp-strict-conv-13.ll | 38 +- .../test/CodeGen/SystemZ/fp-strict-conv-14.ll | 36 +- .../test/CodeGen/SystemZ/fp-strict-conv-15.ll | 28 + .../test/CodeGen/SystemZ/fp-strict-conv-17.ll | 88 ++- llvm/test/CodeGen/SystemZ/fp-strict-div-01.ll | 16 + llvm/test/CodeGen/SystemZ/fp-strict-mul-06.ll | 18 + .../CodeGen/SystemZ/fp-strict-round-01.ll | 71 ++ .../CodeGen/SystemZ/fp-strict-round-02.ll | 42 + .../CodeGen/SystemZ/fp-strict-round-03.ll | 15 + .../test/CodeGen/SystemZ/fp-strict-sqrt-01.ll | 16 + llvm/test/CodeGen/SystemZ/fp-strict-sub-01.ll | 16 + llvm/test/CodeGen/SystemZ/fp-sub-01.ll 
| 12 + ...-asm-fp-int-casting-explicit-regs-zEC12.ll | 34 + ...inline-asm-fp-int-casting-explicit-regs.ll | 40 + .../inline-asm-fp-int-casting-zEC12.ll | 32 + .../SystemZ/inline-asm-fp-int-casting.ll | 52 ++ llvm/test/CodeGen/SystemZ/is_fpclass.ll | 19 + llvm/test/CodeGen/SystemZ/spill-half-01.mir | 63 ++ llvm/test/CodeGen/SystemZ/spill-half-02.mir | 27 + llvm/test/CodeGen/SystemZ/stackmap.ll | 30 +- llvm/test/CodeGen/SystemZ/tdc-01.ll | 12 + llvm/test/CodeGen/SystemZ/tdc-02.ll | 17 + llvm/test/CodeGen/SystemZ/tdc-03.ll | 26 +- llvm/test/CodeGen/SystemZ/tdc-04.ll | 18 +- llvm/test/CodeGen/SystemZ/tdc-05.ll | 25 + llvm/test/CodeGen/SystemZ/tdc-06.ll | 2 - llvm/test/CodeGen/SystemZ/twoaddr-kill.mir | 8 +- llvm/test/CodeGen/SystemZ/vec-max-05.ll | 14 + llvm/test/CodeGen/SystemZ/vec-min-05.ll | 14 + .../test/CodeGen/SystemZ/vec-strict-max-01.ll | 31 + .../test/CodeGen/SystemZ/vec-strict-min-01.ll | 31 + 127 files changed, 5301 insertions(+), 296 deletions(-) create mode 100644 clang/test/CodeGen/SystemZ/Float16.c create mode 100644 clang/test/CodeGen/SystemZ/fp16.c create mode 100644 compiler-rt/lib/builtins/extendhfdf2.c create mode 100644 compiler-rt/test/builtins/Unit/extendhfdf2_test.c create mode 100644 llvm/test/CodeGen/SystemZ/atomic-load-10.ll create mode 100644 llvm/test/CodeGen/SystemZ/atomic-store-10.ll create mode 100644 llvm/test/CodeGen/SystemZ/atomicrmw-fadd-04.ll create mode 100644 llvm/test/CodeGen/SystemZ/fp-copysign-03.ll create mode 100644 llvm/test/CodeGen/SystemZ/fp-half-cmp.ll create mode 100644 llvm/test/CodeGen/SystemZ/fp-half-libcall.ll create mode 100644 llvm/test/CodeGen/SystemZ/fp-half-mem.ll create mode 100644 llvm/test/CodeGen/SystemZ/fp-half-move.ll create mode 100644 llvm/test/CodeGen/SystemZ/fp-half-strict.ll create mode 100644 llvm/test/CodeGen/SystemZ/fp-half-vector.ll create mode 100644 llvm/test/CodeGen/SystemZ/fp-half.ll create mode 100644 llvm/test/CodeGen/SystemZ/fp-mul-15.ll create mode 100644 
llvm/test/CodeGen/SystemZ/spill-half-01.mir create mode 100644 llvm/test/CodeGen/SystemZ/spill-half-02.mir diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index 971ab50cc9a69..7835eceadf660 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -1000,6 +1000,7 @@ to ``float``; see below for more information on this emulation. * SPIR (natively) * X86 (if SSE2 is available; natively if AVX512-FP16 is also available) * RISC-V (natively if Zfh or Zhinx is available) + * SystemZ (emulated) * ``__bf16`` is supported on the following targets (currently never natively): diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h index 93cffe84e2f42..8c3dcda25bc8d 100644 --- a/clang/include/clang/Basic/TargetInfo.h +++ b/clang/include/clang/Basic/TargetInfo.h @@ -235,7 +235,7 @@ class TargetInfo : public TransferrableTargetInfo, bool NoAsmVariants; // True if {|} are normal characters. bool HasLegalHalfType; // True if the backend supports operations on the half // LLVM IR type. - bool HalfArgsAndReturns; + bool HalfArgsAndReturns; // OpenCL 6.1.1.1, NEON (IEEE 754-2008 half) type. bool HasFloat128; bool HasFloat16; bool HasBFloat16; diff --git a/clang/lib/Basic/Targets/SystemZ.h b/clang/lib/Basic/Targets/SystemZ.h index 4d1509b84e82b..cb71c5d7e75d8 100644 --- a/clang/lib/Basic/Targets/SystemZ.h +++ b/clang/lib/Basic/Targets/SystemZ.h @@ -93,11 +93,24 @@ class LLVM_LIBRARY_VISIBILITY SystemZTargetInfo : public TargetInfo { "-v128:64-a:8:16-n32:64"); } MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 128; + + // True if the backend supports operations on the half LLVM IR type. + // By setting this to false, conversions will happen for _Float16 around + // a statement by default, with operations done in float. 
However, if + // -ffloat16-excess-precision=none is given, no conversions will be made + // and instead the backend will promote each half operation to float + // individually. + HasLegalHalfType = false; + // Support _Float16. + HasFloat16 = true; + HasStrictFP = true; } unsigned getMinGlobalAlign(uint64_t Size, bool HasNonWeakDef) const override; + bool useFP16ConversionIntrinsics() const override { return false; } + void getTargetDefines(const LangOptions &Opts, MacroBuilder &Builder) const override; diff --git a/clang/lib/CodeGen/Targets/SystemZ.cpp b/clang/lib/CodeGen/Targets/SystemZ.cpp index 8a9fddace76d9..6ea6c7a546436 100644 --- a/clang/lib/CodeGen/Targets/SystemZ.cpp +++ b/clang/lib/CodeGen/Targets/SystemZ.cpp @@ -31,7 +31,7 @@ class SystemZABIInfo : public ABIInfo { bool isPromotableIntegerTypeForABI(QualType Ty) const; bool isCompoundType(QualType Ty) const; bool isVectorArgumentType(QualType Ty) const; - bool isFPArgumentType(QualType Ty) const; + llvm::Type *getFPArgumentType(QualType Ty, uint64_t Size) const; QualType GetSingleElementType(QualType Ty) const; ABIArgInfo classifyReturnType(QualType RetTy) const; @@ -107,7 +107,8 @@ class SystemZTargetCodeGenInfo : public TargetCodeGenInfo { return nullptr; llvm::Type *Ty = V->getType(); - if (Ty->isFloatTy() || Ty->isDoubleTy() || Ty->isFP128Ty()) { + if (Ty->isHalfTy() || Ty->isFloatTy() || Ty->isDoubleTy() || + Ty->isFP128Ty()) { llvm::Module &M = CGM.getModule(); auto &Ctx = M.getContext(); llvm::Function *TDCFunc = llvm::Intrinsic::getOrInsertDeclaration( @@ -179,20 +180,31 @@ bool SystemZABIInfo::isVectorArgumentType(QualType Ty) const { getContext().getTypeSize(Ty) <= 128); } -bool SystemZABIInfo::isFPArgumentType(QualType Ty) const { +// The Size argument will in case of af an overaligned single element struct +// reflect the overalignment value. In such a case the argument will be +// passed using the type matching Size. 
+llvm::Type *SystemZABIInfo::getFPArgumentType(QualType Ty, + uint64_t Size) const { if (IsSoftFloatABI) - return false; + return nullptr; if (const BuiltinType *BT = Ty->getAs()) switch (BT->getKind()) { + case BuiltinType::Float16: + if (Size == 16) + return llvm::Type::getHalfTy(getVMContext()); + LLVM_FALLTHROUGH; case BuiltinType::Float: + if (Size == 32) + return llvm::Type::getFloatTy(getVMContext()); + LLVM_FALLTHROUGH; case BuiltinType::Double: - return true; + return llvm::Type::getDoubleTy(getVMContext()); default: - return false; + return nullptr; } - return false; + return nullptr; } QualType SystemZABIInfo::GetSingleElementType(QualType Ty) const { @@ -277,7 +289,8 @@ RValue SystemZABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, } else { if (AI.getCoerceToType()) ArgTy = AI.getCoerceToType(); - InFPRs = (!IsSoftFloatABI && (ArgTy->isFloatTy() || ArgTy->isDoubleTy())); + InFPRs = (!IsSoftFloatABI && + (ArgTy->isHalfTy() || ArgTy->isFloatTy() || ArgTy->isDoubleTy())); IsVector = ArgTy->isVectorTy(); UnpaddedSize = TyInfo.Width; DirectAlign = TyInfo.Align; @@ -447,12 +460,11 @@ ABIArgInfo SystemZABIInfo::classifyArgumentType(QualType Ty) const { return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace(), /*ByVal=*/false); - // The structure is passed as an unextended integer, a float, or a double. - if (isFPArgumentType(SingleElementTy)) { - assert(Size == 32 || Size == 64); - return ABIArgInfo::getDirect( - Size == 32 ? llvm::Type::getFloatTy(getVMContext()) - : llvm::Type::getDoubleTy(getVMContext())); + // The structure is passed as an unextended integer, a half, a float, + // or a double. + if (llvm::Type *FPArgTy = getFPArgumentType(SingleElementTy, Size)) { + assert(Size == 16 || Size == 32 || Size == 64); + return ABIArgInfo::getDirect(FPArgTy); } else { llvm::IntegerType *PassTy = llvm::IntegerType::get(getVMContext(), Size); return Size <= 32 ? 
ABIArgInfo::getNoExtend(PassTy) diff --git a/clang/test/CodeGen/SystemZ/Float16.c b/clang/test/CodeGen/SystemZ/Float16.c new file mode 100644 index 0000000000000..4444dbdcc23ca --- /dev/null +++ b/clang/test/CodeGen/SystemZ/Float16.c @@ -0,0 +1,85 @@ +// RUN: %clang_cc1 -triple s390x-linux-gnu \ +// RUN: -ffloat16-excess-precision=standard -emit-llvm -o - %s \ +// RUN: | FileCheck %s -check-prefix=STANDARD + +// RUN: %clang_cc1 -triple s390x-linux-gnu \ +// RUN: -ffloat16-excess-precision=none -emit-llvm -o - %s \ +// RUN: | FileCheck %s -check-prefix=NONE + +// RUN: %clang_cc1 -triple s390x-linux-gnu \ +// RUN: -ffloat16-excess-precision=fast -emit-llvm -o - %s \ +// RUN: | FileCheck %s -check-prefix=FAST + +_Float16 f(_Float16 a, _Float16 b, _Float16 c, _Float16 d) { + return a * b + c * d; +} + +// STANDARD-LABEL: define dso_local half @f(half noundef %a, half noundef %b, half noundef %c, half noundef %d) #0 { +// STANDARD-NEXT: entry: +// STANDARD-NEXT: %a.addr = alloca half, align 2 +// STANDARD-NEXT: %b.addr = alloca half, align 2 +// STANDARD-NEXT: %c.addr = alloca half, align 2 +// STANDARD-NEXT: %d.addr = alloca half, align 2 +// STANDARD-NEXT: store half %a, ptr %a.addr, align 2 +// STANDARD-NEXT: store half %b, ptr %b.addr, align 2 +// STANDARD-NEXT: store half %c, ptr %c.addr, align 2 +// STANDARD-NEXT: store half %d, ptr %d.addr, align 2 +// STANDARD-NEXT: %0 = load half, ptr %a.addr, align 2 +// STANDARD-NEXT: %ext = fpext half %0 to float +// STANDARD-NEXT: %1 = load half, ptr %b.addr, align 2 +// STANDARD-NEXT: %ext1 = fpext half %1 to float +// STANDARD-NEXT: %mul = fmul float %ext, %ext1 +// STANDARD-NEXT: %2 = load half, ptr %c.addr, align 2 +// STANDARD-NEXT: %ext2 = fpext half %2 to float +// STANDARD-NEXT: %3 = load half, ptr %d.addr, align 2 +// STANDARD-NEXT: %ext3 = fpext half %3 to float +// STANDARD-NEXT: %mul4 = fmul float %ext2, %ext3 +// STANDARD-NEXT: %add = fadd float %mul, %mul4 +// STANDARD-NEXT: %unpromotion = fptrunc float %add 
to half +// STANDARD-NEXT: ret half %unpromotion +// STANDARD-NEXT: } + +// NONE-LABEL: define dso_local half @f(half noundef %a, half noundef %b, half noundef %c, half noundef %d) #0 { +// NONE-NEXT: entry: +// NONE-NEXT: %a.addr = alloca half, align 2 +// NONE-NEXT: %b.addr = alloca half, align 2 +// NONE-NEXT: %c.addr = alloca half, align 2 +// NONE-NEXT: %d.addr = alloca half, align 2 +// NONE-NEXT: store half %a, ptr %a.addr, align 2 +// NONE-NEXT: store half %b, ptr %b.addr, align 2 +// NONE-NEXT: store half %c, ptr %c.addr, align 2 +// NONE-NEXT: store half %d, ptr %d.addr, align 2 +// NONE-NEXT: %0 = load half, ptr %a.addr, align 2 +// NONE-NEXT: %1 = load half, ptr %b.addr, align 2 +// NONE-NEXT: %mul = fmul half %0, %1 +// NONE-NEXT: %2 = load half, ptr %c.addr, align 2 +// NONE-NEXT: %3 = load half, ptr %d.addr, align 2 +// NONE-NEXT: %mul1 = fmul half %2, %3 +// NONE-NEXT: %add = fadd half %mul, %mul1 +// NONE-NEXT: ret half %add +// NONE-NEXT: } + +// FAST-LABEL: define dso_local half @f(half noundef %a, half noundef %b, half noundef %c, half noundef %d) #0 { +// FAST-NEXT: entry: +// FAST-NEXT: %a.addr = alloca half, align 2 +// FAST-NEXT: %b.addr = alloca half, align 2 +// FAST-NEXT: %c.addr = alloca half, align 2 +// FAST-NEXT: %d.addr = alloca half, align 2 +// FAST-NEXT: store half %a, ptr %a.addr, align 2 +// FAST-NEXT: store half %b, ptr %b.addr, align 2 +// FAST-NEXT: store half %c, ptr %c.addr, align 2 +// FAST-NEXT: store half %d, ptr %d.addr, align 2 +// FAST-NEXT: %0 = load half, ptr %a.addr, align 2 +// FAST-NEXT: %ext = fpext half %0 to float +// FAST-NEXT: %1 = load half, ptr %b.addr, align 2 +// FAST-NEXT: %ext1 = fpext half %1 to float +// FAST-NEXT: %mul = fmul float %ext, %ext1 +// FAST-NEXT: %2 = load half, ptr %c.addr, align 2 +// FAST-NEXT: %ext2 = fpext half %2 to float +// FAST-NEXT: %3 = load half, ptr %d.addr, align 2 +// FAST-NEXT: %ext3 = fpext half %3 to float +// FAST-NEXT: %mul4 = fmul float %ext2, %ext3 +// FAST-NEXT: 
%add = fadd float %mul, %mul4 +// FAST-NEXT: %unpromotion = fptrunc float %add to half +// FAST-NEXT: ret half %unpromotion +// FAST-NEXT: } diff --git a/clang/test/CodeGen/SystemZ/fp16.c b/clang/test/CodeGen/SystemZ/fp16.c new file mode 100644 index 0000000000000..430958b69a177 --- /dev/null +++ b/clang/test/CodeGen/SystemZ/fp16.c @@ -0,0 +1,39 @@ +// RUN: %clang_cc1 -triple s390x-linux-gnu -emit-llvm -o - %s \ +// RUN: | FileCheck %s + +void f(__fp16 *a, __fp16 *b, __fp16 *c, __fp16 *d, __fp16 *e) { + *e = (*a) * (*b) + (*c) * (*d); +} + +// CHECK-LABEL: define dso_local void @f(ptr noundef %a, ptr noundef %b, ptr noundef %c, ptr noundef %d, ptr noundef %e) #0 { +// CHECK-NEXT: entry: +// CHECK-NEXT: %a.addr = alloca ptr, align 8 +// CHECK-NEXT: %b.addr = alloca ptr, align 8 +// CHECK-NEXT: %c.addr = alloca ptr, align 8 +// CHECK-NEXT: %d.addr = alloca ptr, align 8 +// CHECK-NEXT: %e.addr = alloca ptr, align 8 +// CHECK-NEXT: store ptr %a, ptr %a.addr, align 8 +// CHECK-NEXT: store ptr %b, ptr %b.addr, align 8 +// CHECK-NEXT: store ptr %c, ptr %c.addr, align 8 +// CHECK-NEXT: store ptr %d, ptr %d.addr, align 8 +// CHECK-NEXT: store ptr %e, ptr %e.addr, align 8 +// CHECK-NEXT: %0 = load ptr, ptr %a.addr, align 8 +// CHECK-NEXT: %1 = load half, ptr %0, align 2 +// CHECK-NEXT: %conv = fpext half %1 to float +// CHECK-NEXT: %2 = load ptr, ptr %b.addr, align 8 +// CHECK-NEXT: %3 = load half, ptr %2, align 2 +// CHECK-NEXT: %conv1 = fpext half %3 to float +// CHECK-NEXT: %mul = fmul float %conv, %conv1 +// CHECK-NEXT: %4 = load ptr, ptr %c.addr, align 8 +// CHECK-NEXT: %5 = load half, ptr %4, align 2 +// CHECK-NEXT: %conv2 = fpext half %5 to float +// CHECK-NEXT: %6 = load ptr, ptr %d.addr, align 8 +// CHECK-NEXT: %7 = load half, ptr %6, align 2 +// CHECK-NEXT: %conv3 = fpext half %7 to float +// CHECK-NEXT: %mul4 = fmul float %conv2, %conv3 +// CHECK-NEXT: %add = fadd float %mul, %mul4 +// CHECK-NEXT: %8 = fptrunc float %add to half +// CHECK-NEXT: %9 = load ptr, ptr 
%e.addr, align 8 +// CHECK-NEXT: store half %8, ptr %9, align 2 +// CHECK-NEXT: ret void +// CHECK-NEXT: } diff --git a/clang/test/CodeGen/SystemZ/strictfp_builtins.c b/clang/test/CodeGen/SystemZ/strictfp_builtins.c index 8c8f1f4cabd74..f871debde067e 100644 --- a/clang/test/CodeGen/SystemZ/strictfp_builtins.c +++ b/clang/test/CodeGen/SystemZ/strictfp_builtins.c @@ -4,12 +4,24 @@ #pragma float_control(except, on) +// CHECK-LABEL: @test_isnan__Float16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[F_ADDR:%.*]] = alloca half, align 2 +// CHECK-NEXT: store half [[F:%.*]], ptr [[F_ADDR]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[F_ADDR]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.s390.tdc.f16(half [[TMP0]], i64 15) #[[ATTR2:[0-9]+]] +// CHECK-NEXT: ret i32 [[TMP1]] +// +int test_isnan__Float16(_Float16 f) { + return __builtin_isnan(f); +} + // CHECK-LABEL: @test_isnan_float( // CHECK-NEXT: entry: // CHECK-NEXT: [[F_ADDR:%.*]] = alloca float, align 4 // CHECK-NEXT: store float [[F:%.*]], ptr [[F_ADDR]], align 4 // CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F_ADDR]], align 4 -// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.s390.tdc.f32(float [[TMP0]], i64 15) #[[ATTR2:[0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.s390.tdc.f32(float [[TMP0]], i64 15) #[[ATTR2]] // CHECK-NEXT: ret i32 [[TMP1]] // int test_isnan_float(float f) { diff --git a/clang/test/CodeGen/SystemZ/systemz-abi.c b/clang/test/CodeGen/SystemZ/systemz-abi.c index 7de425950e9fd..f26084ab44eae 100644 --- a/clang/test/CodeGen/SystemZ/systemz-abi.c +++ b/clang/test/CodeGen/SystemZ/systemz-abi.c @@ -52,6 +52,9 @@ long long pass_longlong(long long arg) { return arg; } __int128 pass_int128(__int128 arg) { return arg; } // CHECK-LABEL: define{{.*}} void @pass_int128(ptr dead_on_unwind noalias writable sret(i128) align 8 %{{.*}}, ptr %0) +_Float16 pass__Float16(_Float16 arg) { return arg; } +// CHECK-LABEL: define{{.*}} half @pass__Float16(half %{{.*}}) + float pass_float(float arg) { 
return arg; } // CHECK-LABEL: define{{.*}} float @pass_float(float %{{.*}}) @@ -79,6 +82,9 @@ _Complex long pass_complex_long(_Complex long arg) { return arg; } _Complex long long pass_complex_longlong(_Complex long long arg) { return arg; } // CHECK-LABEL: define{{.*}} void @pass_complex_longlong(ptr dead_on_unwind noalias writable sret({ i64, i64 }) align 8 %{{.*}}, ptr %{{.*}}arg) +_Complex _Float16 pass_complex__Float16(_Complex _Float16 arg) { return arg; } +// CHECK-LABEL: define{{.*}} void @pass_complex__Float16(ptr dead_on_unwind noalias writable sret({ half, half }) align 2 %{{.*}}, ptr %{{.*}}arg) + _Complex float pass_complex_float(_Complex float arg) { return arg; } // CHECK-LABEL: define{{.*}} void @pass_complex_float(ptr dead_on_unwind noalias writable sret({ float, float }) align 4 %{{.*}}, ptr %{{.*}}arg) @@ -130,6 +136,11 @@ struct agg_16byte pass_agg_16byte(struct agg_16byte arg) { return arg; } // Float-like aggregate types +struct agg__Float16 { _Float16 a; }; +struct agg__Float16 pass_agg__Float16(struct agg__Float16 arg) { return arg; } +// HARD-FLOAT-LABEL: define{{.*}} void @pass_agg__Float16(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16) align 2 %{{.*}}, half %{{.*}}) +// SOFT-FLOAT-LABEL: define{{.*}} void @pass_agg__Float16(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16) align 2 %{{.*}}, i16 noext %{{.*}}) + struct agg_float { float a; }; struct agg_float pass_agg_float(struct agg_float arg) { return arg; } // HARD-FLOAT-LABEL: define{{.*}} void @pass_agg_float(ptr dead_on_unwind noalias writable sret(%struct.agg_float) align 4 %{{.*}}, float %{{.*}}) @@ -144,6 +155,20 @@ struct agg_longdouble { long double a; }; struct agg_longdouble pass_agg_longdouble(struct agg_longdouble arg) { return arg; } // CHECK-LABEL: define{{.*}} void @pass_agg_longdouble(ptr dead_on_unwind noalias writable sret(%struct.agg_longdouble) align 8 %{{.*}}, ptr %{{.*}}) +struct agg__Float16_a4 { _Float16 a __attribute__((aligned 
(4))); }; +struct agg__Float16_a4 pass_agg__Float16_a4(struct agg__Float16_a4 arg) { return arg; } +// HARD-FLOAT-LABEL: define{{.*}} void @pass_agg__Float16_a4(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16_a4) align 4 %{{.*}}, float %{{.*}}) +// SOFT-FLOAT-LABEL: define{{.*}} void @pass_agg__Float16_a4(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16_a4) align 4 %{{.*}}, i32 noext %{{.*}}) + +struct agg__Float16_a8 { _Float16 a __attribute__((aligned (8))); }; +struct agg__Float16_a8 pass_agg__Float16_a8(struct agg__Float16_a8 arg) { return arg; } +// HARD-FLOAT-LABEL: define{{.*}} void @pass_agg__Float16_a8(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16_a8) align 8 %{{.*}}, double %{{.*}}) +// SOFT-FLOAT-LABEL: define{{.*}} void @pass_agg__Float16_a8(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16_a8) align 8 %{{.*}}, i64 %{{.*}}) + +struct agg__Float16_a16 { _Float16 a __attribute__((aligned (16))); }; +struct agg__Float16_a16 pass_agg__Float16_a16(struct agg__Float16_a16 arg) { return arg; } +// CHECK-LABEL: define{{.*}} void @pass_agg__Float16_a16(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16_a16) align 16 %{{.*}}, ptr %{{.*}}) + struct agg_float_a8 { float a __attribute__((aligned (8))); }; struct agg_float_a8 pass_agg_float_a8(struct agg_float_a8 arg) { return arg; } // HARD-FLOAT-LABEL: define{{.*}} void @pass_agg_float_a8(ptr dead_on_unwind noalias writable sret(%struct.agg_float_a8) align 8 %{{.*}}, double %{{.*}}) @@ -171,6 +196,10 @@ struct agg_nofloat3 pass_agg_nofloat3(struct agg_nofloat3 arg) { return arg; } // Union types likewise are *not* float-like aggregate types +union union__Float16 { _Float16 a; }; +union union__Float16 pass_union__Float16(union union__Float16 arg) { return arg; } +// CHECK-LABEL: define{{.*}} void @pass_union__Float16(ptr dead_on_unwind noalias writable sret(%union.union__Float16) align 2 %{{.*}}, i16 noext %{{.*}}) + union union_float { float a; }; 
union union_float pass_union_float(union union_float arg) { return arg; } // CHECK-LABEL: define{{.*}} void @pass_union_float(ptr dead_on_unwind noalias writable sret(%union.union_float) align 4 %{{.*}}, i32 noext %{{.*}}) @@ -448,6 +477,30 @@ struct agg_8byte va_agg_8byte(__builtin_va_list l) { return __builtin_va_arg(l, // CHECK: [[VA_ARG_ADDR:%[^ ]+]] = phi ptr [ [[RAW_REG_ADDR]], %{{.*}} ], [ [[RAW_MEM_ADDR]], %{{.*}} ] // CHECK: ret void +struct agg__Float16 va_agg__Float16(__builtin_va_list l) { return __builtin_va_arg(l, struct agg__Float16); } +// CHECK-LABEL: define{{.*}} void @va_agg__Float16(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16) align 2 %{{.*}}, ptr %{{.*}} +// HARD-FLOAT: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 1 +// SOFT-FLOAT: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 0 +// CHECK: [[REG_COUNT:%[^ ]+]] = load i64, ptr [[REG_COUNT_PTR]] +// HARD-FLOAT: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 4 +// SOFT-FLOAT: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 5 +// CHECK: br i1 [[FITS_IN_REGS]], +// CHECK: [[SCALED_REG_COUNT:%[^ ]+]] = mul i64 [[REG_COUNT]], 8 +// HARD-FLOAT: [[REG_OFFSET:%[^ ]+]] = add i64 [[SCALED_REG_COUNT]], 128 +// SOFT-FLOAT: [[REG_OFFSET:%[^ ]+]] = add i64 [[SCALED_REG_COUNT]], 22 +// CHECK: [[REG_SAVE_AREA_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 3 +// CHECK: [[REG_SAVE_AREA:%[^ ]+]] = load ptr, ptr [[REG_SAVE_AREA_PTR:[^ ]+]] +// CHECK: [[RAW_REG_ADDR:%[^ ]+]] = getelementptr i8, ptr [[REG_SAVE_AREA]], i64 [[REG_OFFSET]] +// CHECK: [[REG_COUNT1:%[^ ]+]] = add i64 [[REG_COUNT]], 1 +// CHECK: store i64 [[REG_COUNT1]], ptr [[REG_COUNT_PTR]] +// CHECK: [[OVERFLOW_ARG_AREA_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 2 +// CHECK: [[OVERFLOW_ARG_AREA:%[^ ]+]] = load ptr, ptr 
[[OVERFLOW_ARG_AREA_PTR]] +// CHECK: [[RAW_MEM_ADDR:%[^ ]+]] = getelementptr i8, ptr [[OVERFLOW_ARG_AREA]], i64 6 +// CHECK: [[OVERFLOW_ARG_AREA2:%[^ ]+]] = getelementptr i8, ptr [[OVERFLOW_ARG_AREA]], i64 8 +// CHECK: store ptr [[OVERFLOW_ARG_AREA2]], ptr [[OVERFLOW_ARG_AREA_PTR]] +// CHECK: [[VA_ARG_ADDR:%[^ ]+]] = phi ptr [ [[RAW_REG_ADDR]], %{{.*}} ], [ [[RAW_MEM_ADDR]], %{{.*}} ] +// CHECK: ret void + struct agg_float va_agg_float(__builtin_va_list l) { return __builtin_va_arg(l, struct agg_float); } // CHECK-LABEL: define{{.*}} void @va_agg_float(ptr dead_on_unwind noalias writable sret(%struct.agg_float) align 4 %{{.*}}, ptr %{{.*}} // HARD-FLOAT: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 1 diff --git a/clang/test/CodeGen/SystemZ/systemz-inline-asm.c b/clang/test/CodeGen/SystemZ/systemz-inline-asm.c index 9e62b8e107900..434937a66389c 100644 --- a/clang/test/CodeGen/SystemZ/systemz-inline-asm.c +++ b/clang/test/CodeGen/SystemZ/systemz-inline-asm.c @@ -106,6 +106,14 @@ void test_M(void) { // CHECK: call void asm sideeffect "#FOO $0", "M"(i32 2147483647) } +_Float16 test_f16(_Float16 a) { + _Float16 f; + asm("ler %0, %1" : "=f" (f) : "f" (a)); + return f; +// CHECK-LABEL: define{{.*}} half @test_f16(half noundef %a) +// CHECK: call half asm "ler $0, $1", "=f,f"(half %a) +} + float test_f32(float f, float g) { asm("aebr %0, %2" : "=f" (f) : "0" (f), "f" (g)); return f; diff --git a/compiler-rt/cmake/builtin-config-ix.cmake b/compiler-rt/cmake/builtin-config-ix.cmake index 7bd3269bd999d..cbb43a5958d2f 100644 --- a/compiler-rt/cmake/builtin-config-ix.cmake +++ b/compiler-rt/cmake/builtin-config-ix.cmake @@ -73,6 +73,7 @@ set(PPC32 powerpc powerpcspe) set(PPC64 powerpc64 powerpc64le) set(RISCV32 riscv32) set(RISCV64 riscv64) +set(S390X s390x) set(SPARC sparc) set(SPARCV9 sparcv9) set(WASM32 wasm32) @@ -88,7 +89,7 @@ endif() set(ALL_BUILTIN_SUPPORTED_ARCH ${X86} ${X86_64} ${AMDGPU} ${ARM32} ${ARM64} ${AVR} 
${HEXAGON} ${MIPS32} ${MIPS64} ${NVPTX} ${PPC32} ${PPC64} - ${RISCV32} ${RISCV64} ${SPARC} ${SPARCV9} + ${RISCV32} ${RISCV64} ${S390X} ${SPARC} ${SPARCV9} ${WASM32} ${WASM64} ${VE} ${LOONGARCH64}) include(CompilerRTUtils) diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index 3cdbf21ed403d..74d9627b9f102 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -104,6 +104,7 @@ set(GENERIC_SOURCES divti3.c extendsfdf2.c extendhfsf2.c + extendhfdf2.c ffsdi2.c ffssi2.c ffsti2.c @@ -768,6 +769,11 @@ set(riscv64_SOURCES set(sparc_SOURCES ${GENERIC_SOURCES} ${GENERIC_TF_SOURCES}) set(sparcv9_SOURCES ${GENERIC_SOURCES} ${GENERIC_TF_SOURCES}) +set(s390x_SOURCES + ${GENERIC_SOURCES} + ${GENERIC_TF_SOURCES} +) + set(wasm32_SOURCES ${GENERIC_TF_SOURCES} ${GENERIC_SOURCES} diff --git a/compiler-rt/lib/builtins/clear_cache.c b/compiler-rt/lib/builtins/clear_cache.c index 2ac99b25c243f..441eabd1fe922 100644 --- a/compiler-rt/lib/builtins/clear_cache.c +++ b/compiler-rt/lib/builtins/clear_cache.c @@ -62,6 +62,8 @@ void __clear_cache(void *start, void *end) { #if __i386__ || __x86_64__ || defined(_M_IX86) || defined(_M_X64) // Intel processors have a unified instruction and data cache // so there is nothing to do +#elif defined(__s390__) +// no-op #elif defined(_WIN32) && (defined(__arm__) || defined(__aarch64__)) FlushInstructionCache(GetCurrentProcess(), start, end - start); #elif defined(__arm__) && !defined(__APPLE__) diff --git a/compiler-rt/lib/builtins/extendhfdf2.c b/compiler-rt/lib/builtins/extendhfdf2.c new file mode 100644 index 0000000000000..1cfbdb82730ad --- /dev/null +++ b/compiler-rt/lib/builtins/extendhfdf2.c @@ -0,0 +1,15 @@ +//===-- lib/extendhfdf2.c - half -> single conversion -------------*- C -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#define SRC_HALF +#define DST_DOUBLE +#include "fp_extend_impl.inc" + +COMPILER_RT_ABI NOINLINE dst_t __extendhfdf2(src_t a) { + return __extendXfYf2__(a); +} diff --git a/compiler-rt/test/builtins/CMakeLists.txt b/compiler-rt/test/builtins/CMakeLists.txt index 8fdcec6029a2a..63f4c94605c90 100644 --- a/compiler-rt/test/builtins/CMakeLists.txt +++ b/compiler-rt/test/builtins/CMakeLists.txt @@ -56,7 +56,7 @@ foreach(arch ${BUILTIN_TEST_ARCH}) string(REPLACE ";" " " BUILTINS_TEST_TARGET_CFLAGS "${BUILTINS_TEST_TARGET_CFLAGS}") endif() else() - if (${arch} MATCHES "arm|armhf|aarch64|arm64|i?86|x86_64|AMD64|riscv32|riscv64" AND COMPILER_RT_HAS_${arch}_FLOAT16) + if (${arch} MATCHES "arm|armhf|aarch64|arm64|i?86|x86_64|AMD64|riscv32|riscv64|s390x" AND COMPILER_RT_HAS_${arch}_FLOAT16) list(APPEND BUILTINS_TEST_TARGET_CFLAGS -DCOMPILER_RT_HAS_FLOAT16) string(REPLACE ";" " " BUILTINS_TEST_TARGET_CFLAGS "${BUILTINS_TEST_TARGET_CFLAGS}") endif() diff --git a/compiler-rt/test/builtins/Unit/extendhfdf2_test.c b/compiler-rt/test/builtins/Unit/extendhfdf2_test.c new file mode 100644 index 0000000000000..422e272c11f77 --- /dev/null +++ b/compiler-rt/test/builtins/Unit/extendhfdf2_test.c @@ -0,0 +1,87 @@ +// RUN: %clang_builtins %s %librt -o %t && %run %t +// REQUIRES: librt_has_extendhfdf2 + +#include + +#include "fp_test.h" + +double __extendhfdf2(TYPE_FP16 a); + +int test__extendhfdf2(TYPE_FP16 a, uint64_t expected) +{ + double x = __extendhfdf2(a); + int ret = compareResultD(x, expected); + + if (ret){ + printf("error in test__extendhfdf2(%#.4x) = %f, " + "expected %f\n", toRep16(a), x, fromRep64(expected)); + } + return ret; +} + +char assumption_1[sizeof(TYPE_FP16) * CHAR_BIT == 16] = {0}; + +int main() +{ + // qNaN + if (test__extendhfdf2(makeQNaN16(), + UINT64_C(0x7ff8000000000000))) + return 1; + // NaN + if 
(test__extendhfdf2(fromRep16(0x7f80), + UINT64_C(0x7ffe000000000000))) + return 1; + // inf + if (test__extendhfdf2(makeInf16(), + UINT64_C(0x7ff0000000000000))) + return 1; + // -inf + if (test__extendhfdf2(makeNegativeInf16(), + UINT64_C(0xfff0000000000000))) + return 1; + // zero + if (test__extendhfdf2(fromRep16(0x0), + UINT64_C(0x0))) + return 1; + // -zero + if (test__extendhfdf2(fromRep16(0x8000), + UINT64_C(0x8000000000000000))) + return 1; + if (test__extendhfdf2(fromRep16(0x4248), + UINT64_C(0x4009200000000000))) + return 1; + if (test__extendhfdf2(fromRep16(0xc248), + UINT64_C(0xc009200000000000))) + return 1; + if (test__extendhfdf2(fromRep16(0x6e62), + UINT64_C(0x40b9880000000000))) + return 1; + if (test__extendhfdf2(fromRep16(0x3c00), + UINT64_C(0x3ff0000000000000))) + return 1; + if (test__extendhfdf2(fromRep16(0x0400), + UINT64_C(0x3f10000000000000))) + return 1; + // denormal + if (test__extendhfdf2(fromRep16(0x0010), + UINT64_C(0x3eb0000000000000))) + return 1; + if (test__extendhfdf2(fromRep16(0x0001), + UINT64_C(0x3e70000000000000))) + return 1; + if (test__extendhfdf2(fromRep16(0x8001), + UINT64_C(0xbe70000000000000))) + return 1; + if (test__extendhfdf2(fromRep16(0x0001), + UINT64_C(0x3e70000000000000))) + return 1; + // max (precise) + if (test__extendhfdf2(fromRep16(0x7bff), + UINT64_C(0x40effc0000000000))) + return 1; + // max (rounded) + if (test__extendhfdf2(fromRep16(0x7bff), + UINT64_C(0x40effc0000000000))) + return 1; + return 0; +} diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 769003a90f959..110c30e19220f 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -5710,7 +5710,7 @@ SystemZ: address context evaluates as zero). - ``h``: A 32-bit value in the high part of a 64bit data register (LLVM-specific) -- ``f``: A 32, 64, or 128-bit floating-point register. +- ``f``: A 16, 32, 64, or 128-bit floating-point register. 
X86: diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index b8af281e1c24b..fddb99d2f0b22 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -5466,6 +5466,25 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { DAG.getNode(ISD::FP_ROUND, dl, OVT, Tmp3, DAG.getIntPtrConstant(0, dl, /*isTarget=*/true))); break; + + case ISD::STRICT_FMINIMUM: { + case ISD::STRICT_FMAXIMUM: + SDValue InChain = Node->getOperand(0); + SDVTList VTs = DAG.getVTList(NVT, MVT::Other); + Tmp1 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, VTs, InChain, + Node->getOperand(1)); + Tmp2 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, VTs, InChain, + Node->getOperand(2)); + SmallVector Ops = {InChain, Tmp1, Tmp2}; + Tmp3 = DAG.getNode(Node->getOpcode(), dl, VTs, Ops, Node->getFlags()); + Tmp4 = DAG.getNode(ISD::STRICT_FP_ROUND, dl, DAG.getVTList(OVT, MVT::Other), + InChain, Tmp3, + DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)); + Results.push_back(Tmp4); + Results.push_back(Tmp4.getValue(1)); + break; + } + case ISD::STRICT_FADD: case ISD::STRICT_FSUB: case ISD::STRICT_FMUL: diff --git a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp index 4fa5f026602ef..6d9a7a73f72db 100644 --- a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp +++ b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp @@ -61,9 +61,11 @@ enum RegisterKind { GRH32Reg, GR64Reg, GR128Reg, + FP16Reg, FP32Reg, FP64Reg, FP128Reg, + VR16Reg, VR32Reg, VR64Reg, VR128Reg, @@ -365,9 +367,11 @@ class SystemZOperand : public MCParsedAsmOperand { bool isADDR32() const { return isReg(GR32Reg); } bool isADDR64() const { return isReg(GR64Reg); } bool isADDR128() const { return false; } + bool isFP16() const { return isReg(FP16Reg); } bool isFP32() const { return isReg(FP32Reg); } bool isFP64() const { return isReg(FP64Reg); } bool isFP128() const { return 
isReg(FP128Reg); } + bool isVR16() const { return isReg(VR16Reg); } bool isVR32() const { return isReg(VR32Reg); } bool isVR64() const { return isReg(VR64Reg); } bool isVF128() const { return false; } @@ -544,6 +548,9 @@ class SystemZAsmParser : public MCTargetAsmParser { ParseStatus parseADDR128(OperandVector &Operands) { llvm_unreachable("Shouldn't be used as an operand"); } + ParseStatus parseFP16(OperandVector &Operands) { + return parseRegister(Operands, FP16Reg); + } ParseStatus parseFP32(OperandVector &Operands) { return parseRegister(Operands, FP32Reg); } @@ -553,6 +560,9 @@ class SystemZAsmParser : public MCTargetAsmParser { ParseStatus parseFP128(OperandVector &Operands) { return parseRegister(Operands, FP128Reg); } + ParseStatus parseVR16(OperandVector &Operands) { + return parseRegister(Operands, VR16Reg); + } ParseStatus parseVR32(OperandVector &Operands) { return parseRegister(Operands, VR32Reg); } @@ -842,11 +852,13 @@ ParseStatus SystemZAsmParser::parseRegister(OperandVector &Operands, case GR128Reg: Group = RegGR; break; + case FP16Reg: case FP32Reg: case FP64Reg: case FP128Reg: Group = RegFP; break; + case VR16Reg: case VR32Reg: case VR64Reg: case VR128Reg: @@ -889,21 +901,25 @@ ParseStatus SystemZAsmParser::parseRegister(OperandVector &Operands, return ParseStatus::NoMatch; // Determine the LLVM register number according to Kind. 
+ // clang-format off const unsigned *Regs; switch (Kind) { case GR32Reg: Regs = SystemZMC::GR32Regs; break; case GRH32Reg: Regs = SystemZMC::GRH32Regs; break; case GR64Reg: Regs = SystemZMC::GR64Regs; break; case GR128Reg: Regs = SystemZMC::GR128Regs; break; + case FP16Reg: Regs = SystemZMC::FP16Regs; break; case FP32Reg: Regs = SystemZMC::FP32Regs; break; case FP64Reg: Regs = SystemZMC::FP64Regs; break; case FP128Reg: Regs = SystemZMC::FP128Regs; break; + case VR16Reg: Regs = SystemZMC::VR16Regs; break; case VR32Reg: Regs = SystemZMC::VR32Regs; break; case VR64Reg: Regs = SystemZMC::VR64Regs; break; case VR128Reg: Regs = SystemZMC::VR128Regs; break; case AR32Reg: Regs = SystemZMC::AR32Regs; break; case CR64Reg: Regs = SystemZMC::CR64Regs; break; } + // clang-format on if (Regs[Reg.Num] == 0) return Error(Reg.StartLoc, "invalid register pair"); diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp index 2bef87696a913..d2ed5cac5c576 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp @@ -64,6 +64,12 @@ const unsigned SystemZMC::GR128Regs[16] = { SystemZ::R0Q, 0, SystemZ::R2Q, 0, SystemZ::R4Q, 0, SystemZ::R6Q, 0, SystemZ::R8Q, 0, SystemZ::R10Q, 0, SystemZ::R12Q, 0, SystemZ::R14Q, 0}; +const unsigned SystemZMC::FP16Regs[16] = { + SystemZ::F0H, SystemZ::F1H, SystemZ::F2H, SystemZ::F3H, + SystemZ::F4H, SystemZ::F5H, SystemZ::F6H, SystemZ::F7H, + SystemZ::F8H, SystemZ::F9H, SystemZ::F10H, SystemZ::F11H, + SystemZ::F12H, SystemZ::F13H, SystemZ::F14H, SystemZ::F15H}; + const unsigned SystemZMC::FP32Regs[16] = { SystemZ::F0S, SystemZ::F1S, SystemZ::F2S, SystemZ::F3S, SystemZ::F4S, SystemZ::F5S, SystemZ::F6S, SystemZ::F7S, @@ -80,6 +86,15 @@ const unsigned SystemZMC::FP128Regs[16] = { SystemZ::F0Q, SystemZ::F1Q, 0, 0, SystemZ::F4Q, SystemZ::F5Q, 0, 0, SystemZ::F8Q, SystemZ::F9Q, 0, 0, SystemZ::F12Q, 
SystemZ::F13Q, 0, 0}; +const unsigned SystemZMC::VR16Regs[32] = { + SystemZ::F0H, SystemZ::F1H, SystemZ::F2H, SystemZ::F3H, SystemZ::F4H, + SystemZ::F5H, SystemZ::F6H, SystemZ::F7H, SystemZ::F8H, SystemZ::F9H, + SystemZ::F10H, SystemZ::F11H, SystemZ::F12H, SystemZ::F13H, SystemZ::F14H, + SystemZ::F15H, SystemZ::F16H, SystemZ::F17H, SystemZ::F18H, SystemZ::F19H, + SystemZ::F20H, SystemZ::F21H, SystemZ::F22H, SystemZ::F23H, SystemZ::F24H, + SystemZ::F25H, SystemZ::F26H, SystemZ::F27H, SystemZ::F28H, SystemZ::F29H, + SystemZ::F30H, SystemZ::F31H}; + const unsigned SystemZMC::VR32Regs[32] = { SystemZ::F0S, SystemZ::F1S, SystemZ::F2S, SystemZ::F3S, SystemZ::F4S, SystemZ::F5S, SystemZ::F6S, SystemZ::F7S, SystemZ::F8S, SystemZ::F9S, @@ -132,6 +147,7 @@ unsigned SystemZMC::getFirstReg(unsigned Reg) { Map[AR32Regs[I]] = I; } for (unsigned I = 0; I < 32; ++I) { + Map[VR16Regs[I]] = I; Map[VR32Regs[I]] = I; Map[VR64Regs[I]] = I; Map[VR128Regs[I]] = I; diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h index 39c1836a13700..1db1b4b9da002 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h @@ -43,9 +43,11 @@ extern const unsigned GR32Regs[16]; extern const unsigned GRH32Regs[16]; extern const unsigned GR64Regs[16]; extern const unsigned GR128Regs[16]; +extern const unsigned FP16Regs[16]; extern const unsigned FP32Regs[16]; extern const unsigned FP64Regs[16]; extern const unsigned FP128Regs[16]; +extern const unsigned VR16Regs[32]; extern const unsigned VR32Regs[32]; extern const unsigned VR64Regs[32]; extern const unsigned VR128Regs[32]; diff --git a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp index b3efa579dfe0c..c1ffc287235e5 100644 --- a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp +++ b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp @@ -136,6 +136,25 @@ static 
MCInst lowerSubvectorStore(const MachineInstr *MI, unsigned Opcode) { .addImm(0); } +// MI extracts the first element of the source vector. +static MCInst lowerVecEltExtraction(const MachineInstr *MI, unsigned Opcode) { + return MCInstBuilder(Opcode) + .addReg(SystemZMC::getRegAsGR64(MI->getOperand(0).getReg())) + .addReg(SystemZMC::getRegAsVR128(MI->getOperand(1).getReg())) + .addReg(0) + .addImm(0); +} + +// MI inserts value into the first element of the destination vector. +static MCInst lowerVecEltInsertion(const MachineInstr *MI, unsigned Opcode) { + return MCInstBuilder(Opcode) + .addReg(SystemZMC::getRegAsVR128(MI->getOperand(0).getReg())) + .addReg(SystemZMC::getRegAsVR128(MI->getOperand(0).getReg())) + .addReg(MI->getOperand(1).getReg()) + .addReg(0) + .addImm(0); +} + // The XPLINK ABI requires that a no-op encoding the call type is emitted after // each call to a subroutine. This information can be used by the called // function to determine its entry point, e.g. for generating a backtrace. 
The @@ -549,6 +568,10 @@ void SystemZAsmPrinter::emitInstruction(const MachineInstr *MI) { lowerAlignmentHint(MI, LoweredMI, SystemZ::VSTMAlign); break; + case SystemZ::VL16: + LoweredMI = lowerSubvectorLoad(MI, SystemZ::VLREPH); + break; + case SystemZ::VL32: LoweredMI = lowerSubvectorLoad(MI, SystemZ::VLREPF); break; @@ -557,6 +580,10 @@ void SystemZAsmPrinter::emitInstruction(const MachineInstr *MI) { LoweredMI = lowerSubvectorLoad(MI, SystemZ::VLREPG); break; + case SystemZ::VST16: + LoweredMI = lowerSubvectorStore(MI, SystemZ::VSTEH); + break; + case SystemZ::VST32: LoweredMI = lowerSubvectorStore(MI, SystemZ::VSTEF); break; @@ -566,18 +593,19 @@ void SystemZAsmPrinter::emitInstruction(const MachineInstr *MI) { break; case SystemZ::LFER: - LoweredMI = MCInstBuilder(SystemZ::VLGVF) - .addReg(SystemZMC::getRegAsGR64(MI->getOperand(0).getReg())) - .addReg(SystemZMC::getRegAsVR128(MI->getOperand(1).getReg())) - .addReg(0).addImm(0); + LoweredMI = lowerVecEltExtraction(MI, SystemZ::VLGVF); + break; + + case SystemZ::LFER_16: + LoweredMI = lowerVecEltExtraction(MI, SystemZ::VLGVH); break; case SystemZ::LEFR: - LoweredMI = MCInstBuilder(SystemZ::VLVGF) - .addReg(SystemZMC::getRegAsVR128(MI->getOperand(0).getReg())) - .addReg(SystemZMC::getRegAsVR128(MI->getOperand(0).getReg())) - .addReg(MI->getOperand(1).getReg()) - .addReg(0).addImm(0); + LoweredMI = lowerVecEltInsertion(MI, SystemZ::VLVGF); + break; + + case SystemZ::LEFR_16: + LoweredMI = lowerVecEltInsertion(MI, SystemZ::VLVGH); break; #define LOWER_LOW(NAME) \ diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/llvm/lib/Target/SystemZ/SystemZCallingConv.td index 99bb697ce2014..0ad872bcb63a7 100644 --- a/llvm/lib/Target/SystemZ/SystemZCallingConv.td +++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.td @@ -50,6 +50,7 @@ def RetCC_SystemZ_ELF : CallingConv<[ // other floating-point argument registers available for code that // doesn't care about the ABI. 
All floating-point argument registers // are call-clobbered, so we can use all of them here. + CCIfType<[f16], CCAssignToReg<[F0H, F2H, F4H, F6H]>>, CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>, CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>, @@ -115,6 +116,7 @@ def CC_SystemZ_ELF : CallingConv<[ CCIfType<[i64], CCAssignToReg<[R2D, R3D, R4D, R5D, R6D]>>, // The first 4 float and double arguments are passed in even registers F0-F6. + CCIfType<[f16], CCAssignToReg<[F0H, F2H, F4H, F6H]>>, CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>, CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>, @@ -138,7 +140,7 @@ def CC_SystemZ_ELF : CallingConv<[ CCAssignToStack<16, 8>>>, // Other arguments are passed in 8-byte-aligned 8-byte stack slots. - CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>> + CCIfType<[i32, i64, f16, f32, f64], CCAssignToStack<8, 8>> ]>; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/SystemZ/SystemZFeatures.td b/llvm/lib/Target/SystemZ/SystemZFeatures.td index ec1a7beeab213..2c48da8320fb9 100644 --- a/llvm/lib/Target/SystemZ/SystemZFeatures.td +++ b/llvm/lib/Target/SystemZ/SystemZFeatures.td @@ -196,6 +196,8 @@ def FeatureVector : SystemZFeature< >; def FeatureNoVector : SystemZMissingFeature<"Vector">; +def NoVecHwMode : HwMode<"-vector", [FeatureNoVector]>; + def Arch11NewFeatures : SystemZFeatureList<[ FeatureLoadAndZeroRightmostByte, FeatureLoadStoreOnCond2, diff --git a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp index caf01ccd1ef7c..6f146b67f8566 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp @@ -1204,9 +1204,10 @@ void SystemZDAGToDAGISel::loadVectorConstant( SDValue BitCast = CurDAG->getNode(ISD::BITCAST, DL, VT, Op); ReplaceNode(Node, BitCast.getNode()); SelectCode(BitCast.getNode()); - } else { // float or double - unsigned SubRegIdx = - 
(VT.getSizeInBits() == 32 ? SystemZ::subreg_h32 : SystemZ::subreg_h64); + } else { // half, float or double + unsigned SubRegIdx = (VT.getSizeInBits() == 16 ? SystemZ::subreg_h16 + : VT.getSizeInBits() == 32 ? SystemZ::subreg_h32 + : SystemZ::subreg_h64); ReplaceNode( Node, CurDAG->getTargetExtractSubreg(SubRegIdx, DL, VT, Op).getNode()); } diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 35cee7b39d143..fdbfc196e8fbc 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -103,9 +103,11 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, addRegisterClass(MVT::i64, &SystemZ::GR64BitRegClass); if (!useSoftFloat()) { if (Subtarget.hasVector()) { + addRegisterClass(MVT::f16, &SystemZ::VR16BitRegClass); addRegisterClass(MVT::f32, &SystemZ::VR32BitRegClass); addRegisterClass(MVT::f64, &SystemZ::VR64BitRegClass); } else { + addRegisterClass(MVT::f16, &SystemZ::FP16BitRegClass); addRegisterClass(MVT::f32, &SystemZ::FP32BitRegClass); addRegisterClass(MVT::f64, &SystemZ::FP64BitRegClass); } @@ -224,23 +226,20 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SMUL_LOHI, VT, Custom); setOperationAction(ISD::UMUL_LOHI, VT, Custom); - // Only z196 and above have native support for conversions to unsigned. - // On z10, promoting to i64 doesn't generate an inexact condition for - // values that are outside the i32 range but in the i64 range, so use - // the default expansion. - if (!Subtarget.hasFPExtension()) - setOperationAction(ISD::FP_TO_UINT, VT, Expand); - - // Mirror those settings for STRICT_FP_TO_[SU]INT. Note that these all - // default to Expand, so need to be modified to Legal where appropriate. 
- setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Legal); - if (Subtarget.hasFPExtension()) - setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Legal); - - // And similarly for STRICT_[SU]INT_TO_FP. - setOperationAction(ISD::STRICT_SINT_TO_FP, VT, Legal); - if (Subtarget.hasFPExtension()) - setOperationAction(ISD::STRICT_UINT_TO_FP, VT, Legal); + // The fp<=>i32/i64 conversions are all Legal except for f16 and for + // unsigned on z10 (only z196 and above have native support for + // unsigned conversions). + for (auto Op : {ISD::FP_TO_SINT, ISD::STRICT_FP_TO_SINT, + ISD::SINT_TO_FP, ISD::STRICT_SINT_TO_FP}) + setOperationAction(Op, VT, Custom); + for (auto Op : {ISD::FP_TO_UINT, ISD::STRICT_FP_TO_UINT}) + setOperationAction(Op, VT, Custom); + for (auto Op : {ISD::UINT_TO_FP, ISD::STRICT_UINT_TO_FP}) { + // Handle unsigned 32-bit input types as signed 64-bit types on z10. + auto OpAction = + (!Subtarget.hasFPExtension() && VT == MVT::i32) ? Promote : Custom; + setOperationAction(Op, VT, OpAction); + } } } @@ -289,18 +288,18 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, if (Subtarget.hasVectorEnhancements3()) { setOperationAction(ISD::ABS, MVT::i128, Legal); } - - // We have to use libcalls for these. - setOperationAction(ISD::FP_TO_UINT, MVT::i128, LibCall); - setOperationAction(ISD::FP_TO_SINT, MVT::i128, LibCall); - setOperationAction(ISD::UINT_TO_FP, MVT::i128, LibCall); - setOperationAction(ISD::SINT_TO_FP, MVT::i128, LibCall); - setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, LibCall); - setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, LibCall); - setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, LibCall); - setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, LibCall); } + // These need custom handling in order to handle the f16 conversions. 
+ setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom); + // Type legalization will convert 8- and 16-bit atomic operations into // forms that operate on i32s (but still keeping the original memory VT). // Lower them into full i32 operations. @@ -343,15 +342,6 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, // Traps are legal, as we will convert them to "j .+2". setOperationAction(ISD::TRAP, MVT::Other, Legal); - // z10 has instructions for signed but not unsigned FP conversion. - // Handle unsigned 32-bit types as signed 64-bit types. - if (!Subtarget.hasFPExtension()) { - setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote); - setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand); - setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Promote); - setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Expand); - } - // We have native support for a 64-bit CTLZ, via FLOGR. setOperationAction(ISD::CTLZ, MVT::i32, Promote); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Promote); @@ -548,11 +538,29 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, } // Handle floating-point types. + if (!useSoftFloat()) { + // Promote all f16 operations to float, with some exceptions below. 
+ for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc) + setOperationAction(Opc, MVT::f16, Promote); + setOperationAction(ISD::ConstantFP, MVT::f16, Expand); + for (MVT VT : {MVT::f32, MVT::f64, MVT::f128}) { + setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand); + setTruncStoreAction(VT, MVT::f16, Expand); + } + for (auto Op : {ISD::LOAD, ISD::ATOMIC_LOAD, ISD::STORE, ISD::ATOMIC_STORE}) + setOperationAction(Op, MVT::f16, Subtarget.hasVector() ? Legal : Custom); + setOperationAction(ISD::FP_ROUND, MVT::f16, LibCall); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, LibCall); + setOperationAction(ISD::BITCAST, MVT::i16, Custom); + setOperationAction(ISD::IS_FPCLASS, MVT::f16, Custom); + setOperationAction(ISD::FCOPYSIGN, MVT::f16, Legal); + } + for (unsigned I = MVT::FIRST_FP_VALUETYPE; I <= MVT::LAST_FP_VALUETYPE; ++I) { MVT VT = MVT::SimpleValueType(I); - if (isTypeLegal(VT)) { + if (isTypeLegal(VT) && VT != MVT::f16) { // We can use FI for FRINT. setOperationAction(ISD::FRINT, VT, Legal); @@ -585,7 +593,6 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, setOperationAction(ISD::STRICT_FSQRT, VT, Legal); setOperationAction(ISD::STRICT_FRINT, VT, Legal); setOperationAction(ISD::STRICT_FP_ROUND, VT, Legal); - setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal); if (Subtarget.hasFPExtension()) { setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal); setOperationAction(ISD::STRICT_FFLOOR, VT, Legal); @@ -594,6 +601,10 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, setOperationAction(ISD::STRICT_FROUND, VT, Legal); setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal); } + + // Extension from f16 needs libcall. 
+ setOperationAction(ISD::FP_EXTEND, VT, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, VT, Custom); } } @@ -1607,7 +1618,9 @@ SystemZTargetLowering::getRegForInlineAsmConstraint( case 'f': // Floating-point register if (!useSoftFloat()) { - if (VT.getSizeInBits() == 64) + if (VT.getSizeInBits() == 16) + return std::make_pair(0U, &SystemZ::FP16BitRegClass); + else if (VT.getSizeInBits() == 64) return std::make_pair(0U, &SystemZ::FP64BitRegClass); else if (VT.getSizeInBits() == 128) return std::make_pair(0U, &SystemZ::FP128BitRegClass); @@ -1617,6 +1630,8 @@ SystemZTargetLowering::getRegForInlineAsmConstraint( case 'v': // Vector register if (Subtarget.hasVector()) { + if (VT.getSizeInBits() == 16) + return std::make_pair(0U, &SystemZ::VR16BitRegClass); if (VT.getSizeInBits() == 32) return std::make_pair(0U, &SystemZ::VR32BitRegClass); if (VT.getSizeInBits() == 64) @@ -1652,6 +1667,9 @@ SystemZTargetLowering::getRegForInlineAsmConstraint( if (useSoftFloat()) return std::make_pair( 0u, static_cast(nullptr)); + if (getVTSizeInBits() == 16) + return parseRegisterNumber(Constraint, &SystemZ::FP16BitRegClass, + SystemZMC::FP16Regs, 16); if (getVTSizeInBits() == 32) return parseRegisterNumber(Constraint, &SystemZ::FP32BitRegClass, SystemZMC::FP32Regs, 16); @@ -1665,6 +1683,9 @@ SystemZTargetLowering::getRegForInlineAsmConstraint( if (!Subtarget.hasVector()) return std::make_pair( 0u, static_cast(nullptr)); + if (getVTSizeInBits() == 16) + return parseRegisterNumber(Constraint, &SystemZ::VR16BitRegClass, + SystemZMC::VR16Regs, 32); if (getVTSizeInBits() == 32) return parseRegisterNumber(Constraint, &SystemZ::VR32BitRegClass, SystemZMC::VR32Regs, 32); @@ -1941,6 +1962,10 @@ SDValue SystemZTargetLowering::LowerFormalArguments( NumFixedGPRs += 1; RC = &SystemZ::GR64BitRegClass; break; + case MVT::f16: + NumFixedFPRs += 1; + RC = &SystemZ::FP16BitRegClass; + break; case MVT::f32: NumFixedFPRs += 1; RC = &SystemZ::FP32BitRegClass; @@ -1985,9 +2010,12 @@ SDValue 
SystemZTargetLowering::LowerFormalArguments( // from this parameter. Unpromoted ints and floats are // passed as right-justified 8-byte values. SDValue FIN = DAG.getFrameIndex(FI, PtrVT); - if (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::f32) + if (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::f32 || + VA.getLocVT() == MVT::f16) { + unsigned SlotOffs = VA.getLocVT() == MVT::f16 ? 6 : 4; FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, - DAG.getIntPtrConstant(4, DL)); + DAG.getIntPtrConstant(SlotOffs, DL)); + } ArgValue = DAG.getLoad(LocVT, DL, Chain, FIN, MachinePointerInfo::getFixedStack(MF, FI)); } @@ -2300,6 +2328,8 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI, VA.getLocMemOffset(); if (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::f32) Offset += 4; + else if (VA.getLocVT() == MVT::f16) + Offset += 6; SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, DAG.getIntPtrConstant(Offset, DL)); @@ -2736,13 +2766,21 @@ static SDNode *emitIntrinsicWithCCAndChain(SelectionDAG &DAG, SDValue Op, static SDNode *emitIntrinsicWithCC(SelectionDAG &DAG, SDValue Op, unsigned Opcode) { // Copy all operands except the intrinsic ID. 
+ SDLoc DL(Op); unsigned NumOps = Op.getNumOperands(); SmallVector Ops; Ops.reserve(NumOps - 1); - for (unsigned I = 1; I < NumOps; ++I) - Ops.push_back(Op.getOperand(I)); + for (unsigned I = 1; I < NumOps; ++I) { + SDValue CurrOper = Op.getOperand(I); + if (CurrOper.getValueType() == MVT::f16) { + assert((Op.getConstantOperandVal(0) == Intrinsic::s390_tdc && I == 1) && + "Unhandled intrinsic with f16 operand."); + CurrOper = DAG.getFPExtendOrRound(CurrOper, DL, MVT::f32); + } + Ops.push_back(CurrOper); + } - SDValue Intr = DAG.getNode(Opcode, SDLoc(Op), Op->getVTList(), Ops); + SDValue Intr = DAG.getNode(Opcode, DL, Op->getVTList(), Ops); return Intr.getNode(); } @@ -3884,6 +3922,14 @@ SDValue SystemZTargetLowering::lowerSELECT_CC(SDValue Op, ISD::CondCode CC = cast(Op.getOperand(4))->get(); SDLoc DL(Op); + // SELECT_CC involving f16 will not have the cmp-ops promoted by the + // legalizer, as it will be handled according to the type of the resulting + // value. Extend them here if needed. 
+ if (CmpOp0.getSimpleValueType() == MVT::f16) { + CmpOp0 = DAG.getFPExtendOrRound(CmpOp0, SDLoc(CmpOp0), MVT::f32); + CmpOp1 = DAG.getFPExtendOrRound(CmpOp1, SDLoc(CmpOp1), MVT::f32); + } + Comparison C(getCmp(DAG, CmpOp0, CmpOp1, CC, DL)); // Check for absolute and negative-absolute selections, including those @@ -4973,6 +5019,22 @@ SDValue SystemZTargetLowering::lowerATOMIC_FENCE(SDValue Op, return DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Op.getOperand(0)); } +SDValue SystemZTargetLowering::lowerATOMIC_LOAD(SDValue Op, + SelectionDAG &DAG) const { + EVT RegVT = Op.getValueType(); + if (RegVT.getSizeInBits() == 128) + return lowerATOMIC_LDST_I128(Op, DAG); + return lowerLoadF16(Op, DAG); +} + +SDValue SystemZTargetLowering::lowerATOMIC_STORE(SDValue Op, + SelectionDAG &DAG) const { + auto *Node = cast(Op.getNode()); + if (Node->getMemoryVT().getSizeInBits() == 128) + return lowerATOMIC_LDST_I128(Op, DAG); + return lowerStoreF16(Op, DAG); +} + SDValue SystemZTargetLowering::lowerATOMIC_LDST_I128(SDValue Op, SelectionDAG &DAG) const { auto *Node = cast(Op.getNode()); @@ -6736,6 +6798,161 @@ static SDValue lowerAddrSpaceCast(SDValue Op, SelectionDAG &DAG) { return Op; } +SDValue SystemZTargetLowering::lowerFP_EXTEND(SDValue Op, + SelectionDAG &DAG) const { + SDValue In = Op.getOperand(Op->isStrictFPOpcode() ? 1 : 0); + if (In.getSimpleValueType() != MVT::f16) + return Op; // Legal + return SDValue(); // Let legalizer emit the libcall. +} + +SDValue SystemZTargetLowering::useLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, + MVT VT, SDValue Arg, SDLoc DL, + SDValue Chain, bool IsStrict) const { + assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!"); + MakeLibCallOptions CallOptions; + SDValue Result; + std::tie(Result, Chain) = + makeLibCall(DAG, LC, VT, Arg, CallOptions, DL, Chain); + return IsStrict ? 
DAG.getMergeValues({Result, Chain}, DL) : Result; +} + +SDValue SystemZTargetLowering::lower_FP_TO_INT(SDValue Op, + SelectionDAG &DAG) const { + bool IsSigned = (Op->getOpcode() == ISD::FP_TO_SINT || + Op->getOpcode() == ISD::STRICT_FP_TO_SINT); + bool IsStrict = Op->isStrictFPOpcode(); + SDLoc DL(Op); + MVT VT = Op.getSimpleValueType(); + SDValue InOp = Op.getOperand(IsStrict ? 1 : 0); + SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode(); + EVT InVT = InOp.getValueType(); + + // FP to unsigned is not directly supported on z10. Promoting an i32 + // result to (signed) i64 doesn't generate an inexact condition (fp + // exception) for values that are outside the i32 range but in the i64 + // range, so use the default expansion. + if (!Subtarget.hasFPExtension() && !IsSigned) + // Expand i32/i64. F16 values will be recognized to fit and extended. + return SDValue(); + + // Conversion from f16 is done via f32. + if (InOp.getSimpleValueType() == MVT::f16) { + SmallVector Results; + LowerOperationWrapper(Op.getNode(), Results, DAG); + return DAG.getMergeValues(Results, DL); + } + + if (VT == MVT::i128) { + RTLIB::Libcall LC = + IsSigned ? RTLIB::getFPTOSINT(InVT, VT) : RTLIB::getFPTOUINT(InVT, VT); + return useLibCall(DAG, LC, VT, InOp, DL, Chain, IsStrict); + } + + return Op; // Legal +} + +SDValue SystemZTargetLowering::lower_INT_TO_FP(SDValue Op, + SelectionDAG &DAG) const { + bool IsSigned = (Op->getOpcode() == ISD::SINT_TO_FP || + Op->getOpcode() == ISD::STRICT_SINT_TO_FP); + bool IsStrict = Op->isStrictFPOpcode(); + SDLoc DL(Op); + MVT VT = Op.getSimpleValueType(); + SDValue InOp = Op.getOperand(IsStrict ? 1 : 0); + SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode(); + EVT InVT = InOp.getValueType(); + + // Conversion to f16 is done via f32. 
+ if (VT == MVT::f16) { + SmallVector Results; + LowerOperationWrapper(Op.getNode(), Results, DAG); + return DAG.getMergeValues(Results, DL); + } + + // Unsigned to fp is not directly supported on z10. + if (!Subtarget.hasFPExtension() && !IsSigned) + return SDValue(); // Expand i64. + + if (InVT == MVT::i128) { + RTLIB::Libcall LC = + IsSigned ? RTLIB::getSINTTOFP(InVT, VT) : RTLIB::getUINTTOFP(InVT, VT); + return useLibCall(DAG, LC, VT, InOp, DL, Chain, IsStrict); + } + + return Op; // Legal +} + +// Shift the lower 2 bytes of Op to the left in order to insert into the +// upper 2 bytes of the FP register. +static SDValue convertToF16(SDValue Op, SelectionDAG &DAG) { + assert(Op.getSimpleValueType() == MVT::i64 && + "Expexted to convert i64 to f16."); + SDLoc DL(Op); + SDValue Shft = DAG.getNode(ISD::SHL, DL, MVT::i64, Op, + DAG.getConstant(48, DL, MVT::i64)); + SDValue BCast = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Shft); + SDValue F16Val = + DAG.getTargetExtractSubreg(SystemZ::subreg_h16, DL, MVT::f16, BCast); + return F16Val; +} + +// Extract Op into GPR and shift the 2 f16 bytes to the right. +static SDValue convertFromF16(SDValue Op, SDLoc DL, SelectionDAG &DAG) { + assert(Op.getSimpleValueType() == MVT::f16 && + "Expected to convert f16 to i64."); + SDNode *U32 = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::f64); + SDValue In64 = DAG.getTargetInsertSubreg(SystemZ::subreg_h16, DL, MVT::f64, + SDValue(U32, 0), Op); + SDValue BCast = DAG.getNode(ISD::BITCAST, DL, MVT::i64, In64); + SDValue Shft = DAG.getNode(ISD::SRL, DL, MVT::i64, BCast, + DAG.getConstant(48, DL, MVT::i32)); + return Shft; +} + +// Lower an f16 LOAD in case of no vector support. +SDValue SystemZTargetLowering::lowerLoadF16(SDValue Op, + SelectionDAG &DAG) const { + EVT RegVT = Op.getValueType(); + assert(RegVT == MVT::f16 && "Expected to lower an f16 load."); + + // Load as integer. 
+ SDLoc DL(Op); + SDValue NewLd; + if (auto *AtomicLd = dyn_cast(Op.getNode())) { + assert(EVT(RegVT) == AtomicLd->getMemoryVT() && "Unhandled f16 load"); + NewLd = DAG.getAtomic(ISD::ATOMIC_LOAD, DL, MVT::i16, MVT::i64, + AtomicLd->getChain(), AtomicLd->getBasePtr(), + AtomicLd->getMemOperand()); + cast(NewLd)->setExtensionType(ISD::EXTLOAD); + } else { + LoadSDNode *Ld = cast(Op.getNode()); + assert(EVT(RegVT) == Ld->getMemoryVT() && "Unhandled f16 load"); + NewLd = + DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i64, Ld->getChain(), + Ld->getBasePtr(), Ld->getPointerInfo(), MVT::i16, + Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags()); + } + SDValue F16Val = convertToF16(NewLd, DAG); + return DAG.getMergeValues({F16Val, NewLd.getValue(1)}, DL); +} + +// Lower an f16 STORE in case of no vector support. +SDValue SystemZTargetLowering::lowerStoreF16(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + SDValue Shft = convertFromF16(Op->getOperand(1), DL, DAG); + + if (auto *AtomicSt = dyn_cast(Op.getNode())) + return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MVT::i16, AtomicSt->getChain(), + Shft, AtomicSt->getBasePtr(), + AtomicSt->getMemOperand()); + + StoreSDNode *St = cast(Op.getNode()); + return DAG.getTruncStore(St->getChain(), DL, Shft, St->getBasePtr(), MVT::i16, + St->getMemOperand()); +} + SDValue SystemZTargetLowering::lowerIS_FPCLASS(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); @@ -6766,6 +6983,8 @@ SDValue SystemZTargetLowering::lowerIS_FPCLASS(SDValue Op, TDCMask |= SystemZ::TDCMASK_ZERO_MINUS; SDValue TDCMaskV = DAG.getConstant(TDCMask, DL, MVT::i64); + if (Arg.getSimpleValueType() == MVT::f16) + Arg = DAG.getFPExtendOrRound(Arg, SDLoc(Arg), MVT::f32); SDValue Intr = DAG.getNode(SystemZISD::TDC, DL, ResultVT, Arg, TDCMaskV); return getCCResult(DAG, Intr); } @@ -6859,8 +7078,9 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op, case ISD::ATOMIC_SWAP: return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_SWAPW); case ISD::ATOMIC_STORE: 
+ return lowerATOMIC_STORE(Op, DAG); case ISD::ATOMIC_LOAD: - return lowerATOMIC_LDST_I128(Op, DAG); + return lowerATOMIC_LOAD(Op, DAG); case ISD::ATOMIC_LOAD_ADD: return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_ADD); case ISD::ATOMIC_LOAD_SUB: @@ -6921,6 +7141,23 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op, return lowerFSHL(Op, DAG); case ISD::FSHR: return lowerFSHR(Op, DAG); + case ISD::FP_EXTEND: + case ISD::STRICT_FP_EXTEND: + return lowerFP_EXTEND(Op, DAG); + case ISD::FP_TO_UINT: + case ISD::FP_TO_SINT: + case ISD::STRICT_FP_TO_UINT: + case ISD::STRICT_FP_TO_SINT: + return lower_FP_TO_INT(Op, DAG); + case ISD::UINT_TO_FP: + case ISD::SINT_TO_FP: + case ISD::STRICT_UINT_TO_FP: + case ISD::STRICT_SINT_TO_FP: + return lower_INT_TO_FP(Op, DAG); + case ISD::LOAD: + return lowerLoadF16(Op, DAG); + case ISD::STORE: + return lowerStoreF16(Op, DAG); case ISD::IS_FPCLASS: return lowerIS_FPCLASS(Op, DAG); case ISD::GET_ROUNDING: @@ -6984,8 +7221,7 @@ static SDValue expandBitCastF128ToI128(SelectionDAG &DAG, SDValue Src, return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i128, Lo, Hi); } -// Lower operations with invalid operand or result types (currently used -// only for 128-bit integer types). +// Lower operations with invalid operand or result types. 
void SystemZTargetLowering::LowerOperationWrapper(SDNode *N, SmallVectorImpl &Results, @@ -7045,11 +7281,87 @@ SystemZTargetLowering::LowerOperationWrapper(SDNode *N, break; } case ISD::BITCAST: { + if (useSoftFloat()) + return; + SDLoc DL(N); SDValue Src = N->getOperand(0); - if (N->getValueType(0) == MVT::i128 && Src.getValueType() == MVT::f128 && - !useSoftFloat()) { - SDLoc DL(N); + EVT SrcVT = Src.getValueType(); + EVT ResVT = N->getValueType(0); + if (ResVT == MVT::i128 && SrcVT == MVT::f128) Results.push_back(expandBitCastF128ToI128(DAG, Src, DL)); + else if (SrcVT == MVT::i16 && ResVT == MVT::f16) { + if (Subtarget.hasVector()) { + SDValue In32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src); + Results.push_back(SDValue( + DAG.getMachineNode(SystemZ::LEFR_16, DL, MVT::f16, In32), 0)); + } else { + SDValue In64 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Src); + Results.push_back(convertToF16(In64, DAG)); + } + } else if (SrcVT == MVT::f16 && ResVT == MVT::i16) { + SDValue ExtractedVal = + Subtarget.hasVector() + ? SDValue(DAG.getMachineNode(SystemZ::LFER_16, DL, MVT::i32, Src), + 0) + : convertFromF16(Src, DL, DAG); + Results.push_back(DAG.getZExtOrTrunc(ExtractedVal, DL, ResVT)); + } + break; + } + case ISD::UINT_TO_FP: + case ISD::SINT_TO_FP: + case ISD::STRICT_UINT_TO_FP: + case ISD::STRICT_SINT_TO_FP: { + if (useSoftFloat()) + return; + bool IsStrict = N->isStrictFPOpcode(); + SDLoc DL(N); + SDValue InOp = N->getOperand(IsStrict ? 1 : 0); + EVT ResVT = N->getValueType(0); + SDValue Chain = IsStrict ? 
N->getOperand(0) : DAG.getEntryNode(); + if (ResVT == MVT::f16) { + if (!IsStrict) { + SDValue OpF32 = DAG.getNode(N->getOpcode(), DL, MVT::f32, InOp); + Results.push_back(DAG.getFPExtendOrRound(OpF32, DL, MVT::f16)); + } else { + SDValue OpF32 = + DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::f32, MVT::Other), + {Chain, InOp}); + SDValue F16Res; + std::tie(F16Res, Chain) = DAG.getStrictFPExtendOrRound( + OpF32, OpF32.getValue(1), DL, MVT::f16); + Results.push_back(F16Res); + Results.push_back(Chain); + } + } + break; + } + case ISD::FP_TO_UINT: + case ISD::FP_TO_SINT: + case ISD::STRICT_FP_TO_UINT: + case ISD::STRICT_FP_TO_SINT: { + if (useSoftFloat()) + return; + bool IsStrict = N->isStrictFPOpcode(); + SDLoc DL(N); + EVT ResVT = N->getValueType(0); + SDValue InOp = N->getOperand(IsStrict ? 1 : 0); + EVT InVT = InOp->getValueType(0); + SDValue Chain = IsStrict ? N->getOperand(0) : DAG.getEntryNode(); + if (InVT == MVT::f16) { + if (!IsStrict) { + SDValue InF32 = DAG.getFPExtendOrRound(InOp, DL, MVT::f32); + Results.push_back(DAG.getNode(N->getOpcode(), DL, ResVT, InF32)); + } else { + SDValue InF32; + std::tie(InF32, Chain) = + DAG.getStrictFPExtendOrRound(InOp, Chain, DL, MVT::f32); + SDValue OpF32 = + DAG.getNode(N->getOpcode(), DL, DAG.getVTList(ResVT, MVT::Other), + {Chain, InF32}); + Results.push_back(OpF32); + Results.push_back(OpF32.getValue(1)); + } } break; } diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h index 4763af75552da..f438332c2dc4f 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h @@ -627,6 +627,9 @@ class SystemZTargetLowering : public TargetLowering { bool IsSigned, SDLoc DL, bool DoesNotReturn, bool IsReturnValueUsed) const; + SDValue useLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, MVT VT, SDValue Arg, + SDLoc DL, SDValue Chain, bool IsStrict) const; + bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, 
bool isVarArg, const SmallVectorImpl &Outs, @@ -720,6 +723,8 @@ class SystemZTargetLowering : public TargetLowering { SDValue lowerCTPOP(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVECREDUCE_ADD(SDValue Op, SelectionDAG &DAG) const; SDValue lowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerATOMIC_LDST_I128(SDValue Op, SelectionDAG &DAG) const; SDValue lowerATOMIC_LOAD_OP(SDValue Op, SelectionDAG &DAG, unsigned Opcode) const; @@ -743,6 +748,12 @@ class SystemZTargetLowering : public TargetLowering { SDValue lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const; SDValue lowerFSHL(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFSHR(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; + SDValue lower_FP_TO_INT(SDValue Op, SelectionDAG &DAG) const; + SDValue lower_INT_TO_FP(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerLoadF16(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerStoreF16(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerIS_FPCLASS(SDValue Op, SelectionDAG &DAG) const; SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const; SDValue lowerREADCYCLECOUNTER(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFP.td b/llvm/lib/Target/SystemZ/SystemZInstrFP.td index bef38b9cb809b..7775f456bbdc1 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrFP.td +++ b/llvm/lib/Target/SystemZ/SystemZInstrFP.td @@ -36,6 +36,8 @@ defm CondStoreF64 : CondStores; def LZER : InherentRRE<"lzer", 0xB374, FP32, fpimm0>; def LZDR : InherentRRE<"lzdr", 0xB375, FP64, fpimm0>; def LZXR : InherentRRE<"lzxr", 0xB376, FP128, fpimm0>; @@ -47,8 +49,11 @@ let isMoveReg = 1 in { def LDR : UnaryRR <"ldr", 0x28, null_frag, FP64, FP64>; def LXR : UnaryRRE<"lxr", 0xB365, null_frag, FP128, FP128>; // For z13 we prefer LDR over 
LER to avoid partial register dependencies. - let isCodeGenOnly = 1 in + let isCodeGenOnly = 1 in { + def LER16 : UnaryRR<"ler", 0x38, null_frag, FP16, FP16>; + def LDR16 : UnaryRR<"ldr", 0x28, null_frag, FP16, FP16>; def LDR32 : UnaryRR<"ldr", 0x28, null_frag, FP32, FP32>; + } } @@ -79,8 +84,25 @@ let Predicates = [FeatureNoVectorEnhancements1] in def LGDR : UnaryRRE<"lgdr", 0xB3CD, bitconvert, GR64, FP64>; def LDGR : UnaryRRE<"ldgr", 0xB3C1, bitconvert, FP64, GR64>; +// fcopysign with an FP16 result. +let isCodeGenOnly = 1 in { + def CPSDRhh : BinaryRRFb<"cpsdr", 0xB372, fcopysign, FP16, FP16, FP16>; + def CPSDRhs : BinaryRRFb<"cpsdr", 0xB372, fcopysign, FP16, FP16, FP32>; + def CPSDRhd : BinaryRRFb<"cpsdr", 0xB372, fcopysign, FP16, FP16, FP64>; +} + +// The sign of an FP128 is in the high register. +let Predicates = [FeatureNoVectorEnhancements1] in + def : Pat<(fcopysign FP16:$src1, (f128 FP128:$src2)), + (CPSDRhd FP16:$src1, (EXTRACT_SUBREG FP128:$src2, subreg_h64))>; +let Predicates = [FeatureVectorEnhancements1] in + def : Pat<(fcopysign FP16:$src1, (f128 VR128:$src2)), + (CPSDRhd FP16:$src1, (EXTRACT_SUBREG VR128:$src2, subreg_h64))>; + + // fcopysign with an FP32 result. let isCodeGenOnly = 1 in { + def CPSDRsh : BinaryRRFb<"cpsdr", 0xB372, fcopysign, FP32, FP32, FP16>; def CPSDRss : BinaryRRFb<"cpsdr", 0xB372, fcopysign, FP32, FP32, FP32>; def CPSDRsd : BinaryRRFb<"cpsdr", 0xB372, fcopysign, FP32, FP32, FP64>; } @@ -94,8 +116,10 @@ let Predicates = [FeatureVectorEnhancements1] in (CPSDRsd FP32:$src1, (EXTRACT_SUBREG VR128:$src2, subreg_h64))>; // fcopysign with an FP64 result. -let isCodeGenOnly = 1 in +let isCodeGenOnly = 1 in { + def CPSDRdh : BinaryRRFb<"cpsdr", 0xB372, fcopysign, FP64, FP64, FP16>; def CPSDRds : BinaryRRFb<"cpsdr", 0xB372, fcopysign, FP64, FP64, FP32>; +} def CPSDRdd : BinaryRRFb<"cpsdr", 0xB372, fcopysign, FP64, FP64, FP64>; // The sign of an FP128 is in the high register. 
@@ -113,6 +137,8 @@ class CopySign128 (INSERT_SUBREG FP128:$src1, upper, subreg_h64)>; let Predicates = [FeatureNoVectorEnhancements1] in { + def : CopySign128; def : CopySign128; def : CopySign128; //===----------------------------------------------------------------------===// let canFoldAsLoad = 1, SimpleBDXLoad = 1, mayLoad = 1 in { + let isCodeGenOnly = 1 in + // Reload f16 from 4-byte spill slot. + defm LE16 : UnaryRXPair<"le", 0x78, 0xED64, z_load, FP16, 4>; defm LE : UnaryRXPair<"le", 0x78, 0xED64, z_load, FP32, 4>; defm LD : UnaryRXPair<"ld", 0x68, 0xED65, z_load, FP64, 8>; @@ -151,6 +180,9 @@ let canFoldAsLoad = 1, SimpleBDXLoad = 1, mayLoad = 1 in { //===----------------------------------------------------------------------===// let SimpleBDXStore = 1, mayStore = 1 in { + let isCodeGenOnly = 1 in + // Spill f16 to 4-byte spill slot. + defm STE16 : StoreRXPair<"ste", 0x70, 0xED66, store, FP16, 4>; defm STE : StoreRXPair<"ste", 0x70, 0xED66, store, FP32, 4>; defm STD : StoreRXPair<"std", 0x60, 0xED67, store, FP64, 8>; @@ -236,7 +268,7 @@ let Uses = [FPC], mayRaiseFPException = 1, Predicates = [FeatureFPExtension] in def CXGBRA : TernaryRRFe<"cxgbra", 0xB3A6, FP128, GR64>; } -// Convert am unsigned integer register value to a floating-point one. +// Convert an unsigned integer register value to a floating-point one. let Predicates = [FeatureFPExtension] in { let Uses = [FPC], mayRaiseFPException = 1 in { def CELFBR : TernaryRRFe<"celfbr", 0xB390, FP32, GR32>; @@ -333,8 +365,10 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in { } // Generic form, which does not set CC. def LCDFR : UnaryRRE<"lcdfr", 0xB373, fneg, FP64, FP64>; -let isCodeGenOnly = 1 in +let isCodeGenOnly = 1 in { + def LCDFR_16 : UnaryRRE<"lcdfr", 0xB373, fneg, FP16, FP16>; def LCDFR_32 : UnaryRRE<"lcdfr", 0xB373, fneg, FP32, FP32>; +} // Absolute value (Load Positive). 
let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in { @@ -600,6 +634,7 @@ let hasSideEffects = 1 in { // Peepholes //===----------------------------------------------------------------------===// +def : Pat<(f16 fpimmneg0), (LCDFR_16 (LZER_16))>; def : Pat<(f32 fpimmneg0), (LCDFR_32 (LZER))>; def : Pat<(f64 fpimmneg0), (LCDFR (LZDR))>; def : Pat<(f128 fpimmneg0), (LCXBR (LZXR))>; diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp index 91a4aa9c73010..1ae3994eb0e01 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -61,7 +61,8 @@ void SystemZInstrInfo::anchor() {} SystemZInstrInfo::SystemZInstrInfo(SystemZSubtarget &sti) : SystemZGenInstrInfo(-1, -1), - RI(sti.getSpecialRegisters()->getReturnFunctionAddressRegister()), + RI(sti.getSpecialRegisters()->getReturnFunctionAddressRegister(), + sti.getHwMode()), STI(sti) {} // MI is a 128-bit load or store. Split it into two 64-bit loads or stores, @@ -989,6 +990,8 @@ void SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB, unsigned Opcode; if (SystemZ::GR64BitRegClass.contains(DestReg, SrcReg)) Opcode = SystemZ::LGR; + else if (SystemZ::FP16BitRegClass.contains(DestReg, SrcReg)) + Opcode = STI.hasVector() ? SystemZ::LDR16 : SystemZ::LER16; else if (SystemZ::FP32BitRegClass.contains(DestReg, SrcReg)) // For z13 we prefer LDR over LER to avoid partial register dependencies. Opcode = STI.hasVector() ? 
SystemZ::LDR32 : SystemZ::LER; @@ -1259,9 +1262,10 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl( return nullptr; unsigned OpNum = Ops[0]; - assert(Size * 8 == - TRI->getRegSizeInBits(*MF.getRegInfo() - .getRegClass(MI.getOperand(OpNum).getReg())) && + const TargetRegisterClass *RC = + MF.getRegInfo().getRegClass(MI.getOperand(OpNum).getReg()); + assert((Size * 8 == TRI->getRegSizeInBits(*RC) || + (RC == &SystemZ::FP16BitRegClass && Size == 4 && !STI.hasVector())) && "Invalid size combination"); if ((Opcode == SystemZ::AHI || Opcode == SystemZ::AGHI) && OpNum == 0 && @@ -1900,6 +1904,9 @@ void SystemZInstrInfo::getLoadStoreOpcodes(const TargetRegisterClass *RC, RC == &SystemZ::ADDR128BitRegClass) { LoadOpcode = SystemZ::L128; StoreOpcode = SystemZ::ST128; + } else if (RC == &SystemZ::FP16BitRegClass && !STI.hasVector()) { + LoadOpcode = SystemZ::LE16; + StoreOpcode = SystemZ::STE16; } else if (RC == &SystemZ::FP32BitRegClass) { LoadOpcode = SystemZ::LE; StoreOpcode = SystemZ::STE; @@ -1909,6 +1916,10 @@ void SystemZInstrInfo::getLoadStoreOpcodes(const TargetRegisterClass *RC, } else if (RC == &SystemZ::FP128BitRegClass) { LoadOpcode = SystemZ::LX; StoreOpcode = SystemZ::STX; + } else if (RC == &SystemZ::FP16BitRegClass || + RC == &SystemZ::VR16BitRegClass) { + LoadOpcode = SystemZ::VL16; + StoreOpcode = SystemZ::VST16; } else if (RC == &SystemZ::VR32BitRegClass) { LoadOpcode = SystemZ::VL32; StoreOpcode = SystemZ::VST32; diff --git a/llvm/lib/Target/SystemZ/SystemZInstrVector.td b/llvm/lib/Target/SystemZ/SystemZInstrVector.td index d8c48239ac633..10de8b05cf45f 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrVector.td +++ b/llvm/lib/Target/SystemZ/SystemZInstrVector.td @@ -142,7 +142,8 @@ let Predicates = [FeatureVector] in { // LEY and LDY offer full 20-bit displacement fields. It's often better // to use those instructions rather than force a 20-bit displacement // into a GPR temporary. 
- let mayLoad = 1, canFoldAsLoad = 1 in { + let mayLoad = 1, SimpleBDXLoad = 1, canFoldAsLoad = 1 in { + def VL16 : UnaryAliasVRX; def VL32 : UnaryAliasVRX; def VL64 : UnaryAliasVRX; } @@ -239,7 +240,8 @@ let Predicates = [FeatureVector] in { // STEY and STDY offer full 20-bit displacement fields. It's often better // to use those instructions rather than force a 20-bit displacement // into a GPR temporary. - let mayStore = 1 in { + let mayStore = 1, SimpleBDXStore = 1 in { + def VST16 : StoreAliasVRX; def VST32 : StoreAliasVRX; def VST64 : StoreAliasVRX; } @@ -2208,6 +2210,8 @@ let Predicates = [FeatureVector] in { def : Pat<(f32 (bitconvert (i32 GR32:$src))), (LEFR GR32:$src)>; def : Pat<(i32 (bitconvert (f32 VR32:$src))), (EXTRACT_SUBREG (LFER VR32:$src), subreg_l32)>; + def LEFR_16 : UnaryAliasVRS; + def LFER_16 : UnaryAliasVRS; } // Floating-point values are stored in element 0 of the corresponding diff --git a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp index 1e0c043682157..177f2c50e4f01 100644 --- a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp @@ -232,8 +232,9 @@ SystemZELFRegisters::getCallPreservedMask(const MachineFunction &MF, return CSR_SystemZ_ELF_RegMask; } -SystemZRegisterInfo::SystemZRegisterInfo(unsigned int RA) - : SystemZGenRegisterInfo(RA) {} +SystemZRegisterInfo::SystemZRegisterInfo(unsigned int RA, unsigned int HwMode) + : SystemZGenRegisterInfo(RA, /*DwarfFlavour=*/0, /*EHFlavour=*/0, /*PC=*/0, + HwMode) {} const MCPhysReg * SystemZRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { diff --git a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h index 4f497f8d23d29..460be432811a4 100644 --- a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h +++ b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h @@ -129,7 +129,7 @@ class SystemZELFRegisters : public SystemZCallingConventionRegisters 
{ struct SystemZRegisterInfo : public SystemZGenRegisterInfo { public: - SystemZRegisterInfo(unsigned int RA); + SystemZRegisterInfo(unsigned int RA, unsigned int HwMode); /// getPointerRegClass - Return the register class to use to hold pointers. /// This is currently only used by LOAD_STACK_GUARD, which requires a non-%r0 diff --git a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td index 8f9bb56f2eb3b..e79f12b449a88 100644 --- a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td +++ b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td @@ -20,6 +20,7 @@ class SystemZRegWithSubregs subregs> } let Namespace = "SystemZ" in { +def subreg_h16 : SubRegIndex<16, 16>; def subreg_l32 : SubRegIndex<32, 0>; // Also acts as subreg_hl32. def subreg_h32 : SubRegIndex<32, 32>; // Also acts as subreg_hh32. def subreg_l64 : SubRegIndex<64, 0>; @@ -34,7 +35,9 @@ def subreg_ll32 : ComposedSubRegIndex; // If the user provides an alternate order list of regs, it will be used for // XPLINK. 
Otherwise, by default, XPLINK will use the regList ordering as well multiclass SystemZRegClass types, int size, - dag regList, list altRegList = [regList], bit allocatable = 1> { + dag regList, list altRegList = [regList], + bit allocatable = 1, + RegInfoByHwMode RI = RegInfoByHwMode<[], []>> { def AsmOperand : AsmOperandClass { let Name = name; let ParserMethod = "parse"#name; @@ -48,6 +51,7 @@ multiclass SystemZRegClass types, int size, const SystemZSubtarget &S = MF.getSubtarget(); return S.isTargetXPLINK64(); }]; + let RegInfos = RI; } def "" : RegisterOperand(name#"Bit")> { let ParserMatchClass = !cast(name#"AsmOperand"); @@ -201,9 +205,16 @@ def F27Dwarf : DwarfMapping<81>; def F29Dwarf : DwarfMapping<82>; def F31Dwarf : DwarfMapping<83>; +// Upper 16 bits of one of the floating-point registers +class FPR16 num, string n> : SystemZReg { + let HWEncoding = num; +} + // Upper 32 bits of one of the floating-point registers -class FPR32 num, string n> : SystemZReg { +class FPR32 num, string n, FPR16 high> + : SystemZRegWithSubregs { let HWEncoding = num; + let SubRegIndices = [subreg_h16]; } // One of the floating-point registers. @@ -223,12 +234,14 @@ class FPR128 num, string n, FPR64 low, FPR64 high> // Floating-point registers. Registers 16-31 require the vector facility. foreach I = 0-15 in { - def F#I#S : FPR32; + def F#I#H : FPR16; + def F#I#S : FPR32("F"#I#"H")>; def F#I#D : FPR64("F"#I#"S")>, DwarfRegNum<[!cast("F"#I#"Dwarf").Id]>; } foreach I = 16-31 in { - def F#I#S : FPR32; + def F#I#H : FPR16; + def F#I#S : FPR32("F"#I#"H")>; def F#I#D : FPR64("F"#I#"S")>, DwarfRegNum<[!cast("F"#I#"Dwarf").Id]>; } @@ -240,6 +253,11 @@ foreach I = [0, 1, 4, 5, 8, 9, 12, 13] in { // There's no store-multiple instruction for FPRs, so we're not fussy // about the order in which call-saved registers are allocated. +// Adjust the spill size of f16 to 32 bits in case of no vector support. 
+def FP16RI : RegInfoByHwMode<[DefaultMode, NoVecHwMode], + [RegInfo<16,16,16>, RegInfo<16,32,32>]>; +defm FP16 : SystemZRegClass<"FP16", [f16], 16, (sequence "F%uH", 0, 15), + [(sequence "F%uH", 0, 15)], 1, FP16RI>; defm FP32 : SystemZRegClass<"FP32", [f32], 32, (sequence "F%uS", 0, 15)>; defm FP64 : SystemZRegClass<"FP64", [f64], 64, (sequence "F%uD", 0, 15)>; defm FP128 : SystemZRegClass<"FP128", [f128], 128, @@ -262,6 +280,13 @@ foreach I = 0-31 in { DwarfRegNum<[!cast("F"#I#"Dwarf").Id]>; } +// Class used to store 16-bit fp values in the first element of a vector +// register. +defm VR16 : SystemZRegClass<"VR16", [f16], 16, + (add (sequence "F%uH", 0, 7), + (sequence "F%uH", 16, 31), + (sequence "F%uH", 8, 15))>; + // Class used to store 32-bit values in the first element of a vector // register. f32 scalars are used for the WLEDB and WLDEB instructions. defm VR32 : SystemZRegClass<"VR32", [f32, v4i8, v2i16], 32, @@ -298,6 +323,7 @@ class TypedReg { RegisterOperand op = opin; } +def v16hb : TypedReg; def v32f : TypedReg; def v32sb : TypedReg; def v64g : TypedReg; diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td index 5f15348654c75..84c6ca21d7e93 100644 --- a/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td +++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td @@ -773,12 +773,12 @@ def : InstRW<[], (instregex "Insn.*")>; //===----------------------------------------------------------------------===// // Load zero -def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER)$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER|ER_16)$")>; def : InstRW<[WLat2, FXb2, Cracked], (instregex "LZXR$")>; // Load -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER$")>; -def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R32|GR)$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER(16)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R16|R32|GR)$")>; def : InstRW<[WLat3, FXb, 
NormalGr], (instregex "LGDR$")>; def : InstRW<[WLat2, FXb2, GroupAlone], (instregex "LXR$")>; @@ -787,13 +787,13 @@ def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "LT(E|D)BR$")>; def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "LTXBR$")>; // Copy sign -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "CPSDR(d|s)(d|s)$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "CPSDR(d|s|h)(d|s|h)$")>; //===----------------------------------------------------------------------===// // FP: Load instructions //===----------------------------------------------------------------------===// -def : InstRW<[WLat2LSU, VecXsPm, LSU, NormalGr], (instregex "LE(Y)?$")>; +def : InstRW<[WLat2LSU, VecXsPm, LSU, NormalGr], (instregex "L(E16|E)(Y)?$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LD(Y|E32)?$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LX$")>; @@ -801,7 +801,7 @@ def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LX$")>; // FP: Store instructions //===----------------------------------------------------------------------===// -def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST(E|D)(Y)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST(E16|E|D)(Y)?$")>; def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STX$")>; //===----------------------------------------------------------------------===// @@ -840,7 +840,7 @@ def : InstRW<[WLat12, WLat12, FXb, VecDF2, Cracked], (instregex "CL(F|G)XBR$")>; // Load Complement / Negative / Positive def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "L(C|N|P)(E|D)BR$")>; -def : InstRW<[WLat1, FXb, NormalGr], (instregex "L(C|N|P)DFR(_32)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "L(C|N|P)DFR(_32|_16)?$")>; def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "L(C|N|P)XBR$")>; // Square root @@ -1191,7 +1191,7 @@ def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLEI(B|F|G|H)$")>; def : InstRW<[LSULatency, LSU, NormalGr], 
(instregex "VL(Align)?$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(L|BB)$")>; -def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(32|64)$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(16|32|64)$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLLEZ(B|F|G|H)?$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLREP(B|F|G|H)?$")>; def : InstRW<[WLat2LSU, RegReadAdv, VecXsPm, LSU, NormalGr], @@ -1205,7 +1205,7 @@ def : InstRW<[WLat4LSU, WLat4LSU, LSU5, GroupAlone], // Vector: Stores //===----------------------------------------------------------------------===// -def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(Align|L|32|64)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(Align|L|16|32|64)?$")>; def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTE(F|G)$")>; def : InstRW<[WLat1, FXb, LSU, VecXsPm, Cracked], (instregex "VSTE(B|H)$")>; def : InstRW<[WLat1, LSU2, FXb3, GroupAlone2], (instregex "VSTM(Align)?$")>; @@ -1376,8 +1376,8 @@ def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "WF(C|K)DB$")>; // Vector: Floating-point insertion and extraction //===----------------------------------------------------------------------===// -def : InstRW<[WLat1, FXb, NormalGr], (instregex "LEFR$")>; -def : InstRW<[WLat4, FXb, NormalGr], (instregex "LFER$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "LEFR(_16)?$")>; +def : InstRW<[WLat4, FXb, NormalGr], (instregex "LFER(_16)?$")>; //===----------------------------------------------------------------------===// // Vector: String instructions diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td index 336bbe2483340..b48ed08c8a189 100644 --- a/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td +++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td @@ -793,12 +793,12 @@ def : InstRW<[], (instregex "Insn.*")>; //===----------------------------------------------------------------------===// // Load 
zero -def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER)$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER|ER_16)$")>; def : InstRW<[WLat2, FXb2, Cracked], (instregex "LZXR$")>; // Load -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER$")>; -def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R32|GR)$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER(16)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R16|R32|GR)$")>; def : InstRW<[WLat3, FXb, NormalGr], (instregex "LGDR$")>; def : InstRW<[WLat2, FXb2, GroupAlone], (instregex "LXR$")>; @@ -807,13 +807,13 @@ def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "LT(E|D)BR$")>; def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "LTXBR$")>; // Copy sign -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "CPSDR(d|s)(d|s)$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "CPSDR(d|s|h)(d|s|h)$")>; //===----------------------------------------------------------------------===// // FP: Load instructions //===----------------------------------------------------------------------===// -def : InstRW<[WLat2LSU, VecXsPm, LSU, NormalGr], (instregex "LE(Y)?$")>; +def : InstRW<[WLat2LSU, VecXsPm, LSU, NormalGr], (instregex "L(E16|E)(Y)?$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LD(Y|E32)?$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LX$")>; @@ -821,7 +821,7 @@ def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LX$")>; // FP: Store instructions //===----------------------------------------------------------------------===// -def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST(E|D)(Y)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST(E16|E|D)(Y)?$")>; def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STX$")>; //===----------------------------------------------------------------------===// @@ -860,7 +860,7 @@ def : InstRW<[WLat12, WLat12, FXb, VecDF2, Cracked], (instregex "CL(F|G)XBR$")>; // Load 
Complement / Negative / Positive def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "L(C|N|P)(E|D)BR$")>; -def : InstRW<[WLat1, FXb, NormalGr], (instregex "L(C|N|P)DFR(_32)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "L(C|N|P)DFR(_32|_16)?$")>; def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "L(C|N|P)XBR$")>; // Square root @@ -1209,7 +1209,7 @@ def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLEI(B|F|G|H)$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(Align)?$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(L|BB)$")>; -def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(32|64)$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(16|32|64)$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLLEZ(B|F|G|H|LF)?$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLREP(B|F|G|H)?$")>; def : InstRW<[WLat2LSU, RegReadAdv, VecXsPm, LSU, NormalGr], @@ -1224,7 +1224,7 @@ def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLRL(R)?$")>; // Vector: Stores //===----------------------------------------------------------------------===// -def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(Align|L|32|64)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(Align|L|16|32|64)?$")>; def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTE(F|G)$")>; def : InstRW<[WLat1, FXb, LSU, VecXsPm, Cracked], (instregex "VSTE(B|H)$")>; def : InstRW<[WLat1, LSU2, FXb3, GroupAlone2], (instregex "VSTM(Align)?$")>; @@ -1448,8 +1448,8 @@ def : InstRW<[WLat3, VecDFX, NormalGr], (instregex "WF(C|K)XB$")>; // Vector: Floating-point insertion and extraction //===----------------------------------------------------------------------===// -def : InstRW<[WLat1, FXb, NormalGr], (instregex "LEFR$")>; -def : InstRW<[WLat3, FXb, NormalGr], (instregex "LFER$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "LEFR(_16)?$")>; +def : InstRW<[WLat3, FXb, NormalGr], (instregex "LFER(_16)?$")>; 
//===----------------------------------------------------------------------===// // Vector: String instructions diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td index 5f2a04c298a25..e3ec7a6994221 100644 --- a/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td +++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td @@ -811,12 +811,12 @@ def : InstRW<[], (instregex "Insn.*")>; //===----------------------------------------------------------------------===// // Load zero -def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER)$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER|ER_16)$")>; def : InstRW<[WLat2, FXb2, Cracked], (instregex "LZXR$")>; // Load -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER$")>; -def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R32|GR)$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER(16)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R16|R32|GR)$")>; def : InstRW<[WLat3, FXb, NormalGr], (instregex "LGDR$")>; def : InstRW<[WLat2, FXb2, GroupAlone], (instregex "LXR$")>; @@ -825,13 +825,13 @@ def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "LT(E|D)BR$")>; def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "LTXBR$")>; // Copy sign -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "CPSDR(d|s)(d|s)$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "CPSDR(d|s|h)(d|s|h)$")>; //===----------------------------------------------------------------------===// // FP: Load instructions //===----------------------------------------------------------------------===// -def : InstRW<[WLat2LSU, VecXsPm, LSU, NormalGr], (instregex "LE(Y)?$")>; +def : InstRW<[WLat2LSU, VecXsPm, LSU, NormalGr], (instregex "L(E16|E)(Y)?$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LD(Y|E32)?$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LX$")>; @@ -839,7 +839,7 @@ def : InstRW<[LSULatency, LSU, NormalGr], (instregex 
"LX$")>; // FP: Store instructions //===----------------------------------------------------------------------===// -def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST(E|D)(Y)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST(E16|E|D)(Y)?$")>; def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STX$")>; //===----------------------------------------------------------------------===// @@ -878,7 +878,7 @@ def : InstRW<[WLat12, WLat12, FXb, VecDF2, Cracked], (instregex "CL(F|G)XBR$")>; // Load Complement / Negative / Positive def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "L(C|N|P)(E|D)BR$")>; -def : InstRW<[WLat1, FXb, NormalGr], (instregex "L(C|N|P)DFR(_32)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "L(C|N|P)DFR(_32|_16)?$")>; def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "L(C|N|P)XBR$")>; // Square root @@ -1231,7 +1231,7 @@ def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLEI(B|F|G|H)$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(Align)?$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(L|BB)$")>; -def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(32|64)$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(16|32|64)$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLLEZ(B|F|G|H|LF)?$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLREP(B|F|G|H)?$")>; def : InstRW<[WLat2LSU, RegReadAdv, VecXsPm, LSU, NormalGr], @@ -1246,7 +1246,7 @@ def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLRL(R)?$")>; // Vector: Stores //===----------------------------------------------------------------------===// -def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(Align|L|32|64)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(Align|L|16|32|64)?$")>; def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTE(F|G)$")>; def : InstRW<[WLat1, FXb, LSU, VecXsPm, Cracked], (instregex "VSTE(B|H)$")>; def : InstRW<[WLat1, LSU2, FXb3, 
GroupAlone2], (instregex "VSTM(Align)?$")>; @@ -1491,8 +1491,8 @@ def : InstRW<[WLat3, VecDFX, NormalGr], (instregex "WF(C|K)XB$")>; // Vector: Floating-point insertion and extraction //===----------------------------------------------------------------------===// -def : InstRW<[WLat1, FXb, NormalGr], (instregex "LEFR$")>; -def : InstRW<[WLat3, FXb, NormalGr], (instregex "LFER$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "LEFR(_16)?$")>; +def : InstRW<[WLat3, FXb, NormalGr], (instregex "LFER(_16)?$")>; //===----------------------------------------------------------------------===// // Vector: String instructions diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td index 83e980940d758..4f904daec5052 100644 --- a/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td +++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td @@ -812,12 +812,12 @@ def : InstRW<[], (instregex "Insn.*")>; //===----------------------------------------------------------------------===// // Load zero -def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER)$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER|ER_16)$")>; def : InstRW<[WLat2, FXb2, Cracked], (instregex "LZXR$")>; // Load -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER$")>; -def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R32|GR)$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER(16)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R16|R32|GR)$")>; def : InstRW<[WLat3, FXb, NormalGr], (instregex "LGDR$")>; def : InstRW<[WLat2, FXb2, GroupAlone], (instregex "LXR$")>; @@ -826,13 +826,13 @@ def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "LT(E|D)BR$")>; def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "LTXBR$")>; // Copy sign -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "CPSDR(d|s)(d|s)$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "CPSDR(d|s|h)(d|s|h)$")>; 
//===----------------------------------------------------------------------===// // FP: Load instructions //===----------------------------------------------------------------------===// -def : InstRW<[WLat2LSU, VecXsPm, LSU, NormalGr], (instregex "LE(Y)?$")>; +def : InstRW<[WLat2LSU, VecXsPm, LSU, NormalGr], (instregex "L(E16|E)(Y)?$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LD(Y|E32)?$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LX$")>; @@ -840,7 +840,7 @@ def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LX$")>; // FP: Store instructions //===----------------------------------------------------------------------===// -def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST(E|D)(Y)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST(E16|E|D)(Y)?$")>; def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STX$")>; //===----------------------------------------------------------------------===// @@ -879,7 +879,7 @@ def : InstRW<[WLat12, WLat12, FXb, VecDF2, Cracked], (instregex "CL(F|G)XBR$")>; // Load Complement / Negative / Positive def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "L(C|N|P)(E|D)BR$")>; -def : InstRW<[WLat1, FXb, NormalGr], (instregex "L(C|N|P)DFR(_32)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "L(C|N|P)DFR(_32|_16)?$")>; def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "L(C|N|P)XBR$")>; // Square root @@ -1237,7 +1237,7 @@ def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLEI(B|F|G|H)$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(Align)?$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(L|BB)$")>; -def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(32|64)$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(16|32|64)$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLLEZ(B|F|G|H|LF)?$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLREP(B|F|G|H)?$")>; def : InstRW<[WLat2LSU, RegReadAdv, VecXsPm, LSU, 
NormalGr], @@ -1252,7 +1252,7 @@ def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLRL(R)?$")>; // Vector: Stores //===----------------------------------------------------------------------===// -def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(Align|L|32|64)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(Align|L|16|32|64)?$")>; def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTE(F|G)$")>; def : InstRW<[WLat1, FXb, LSU, VecXsPm, Cracked], (instregex "VSTE(B|H)$")>; def : InstRW<[WLat1, LSU2, FXb3, GroupAlone2], (instregex "VSTM(Align)?$")>; @@ -1499,8 +1499,8 @@ def : InstRW<[WLat3, VecDFX, NormalGr], (instregex "WF(C|K)XB$")>; // Vector: Floating-point insertion and extraction //===----------------------------------------------------------------------===// -def : InstRW<[WLat1, FXb, NormalGr], (instregex "LEFR$")>; -def : InstRW<[WLat3, FXb, NormalGr], (instregex "LFER$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "LEFR(_16)?$")>; +def : InstRW<[WLat3, FXb, NormalGr], (instregex "LFER(_16)?$")>; //===----------------------------------------------------------------------===// // Vector: String instructions diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ17.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ17.td index bd52627f636a7..3b5ce6c9b5a0e 100644 --- a/llvm/lib/Target/SystemZ/SystemZScheduleZ17.td +++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ17.td @@ -827,12 +827,12 @@ def : InstRW<[], (instregex "Insn.*")>; //===----------------------------------------------------------------------===// // Load zero -def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER)$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER|ER_16)$")>; def : InstRW<[WLat2, FXb2, Cracked], (instregex "LZXR$")>; // Load -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER$")>; -def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R32|GR)$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER(16)?$")>; +def : 
InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R16|R32|GR)$")>; def : InstRW<[WLat3, FXb, NormalGr], (instregex "LGDR$")>; def : InstRW<[WLat2, FXb2, GroupAlone], (instregex "LXR$")>; @@ -841,13 +841,13 @@ def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "LT(E|D)BR$")>; def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "LTXBR$")>; // Copy sign -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "CPSDR(d|s)(d|s)$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "CPSDR(d|s|h)(d|s|h)$")>; //===----------------------------------------------------------------------===// // FP: Load instructions //===----------------------------------------------------------------------===// -def : InstRW<[WLat2LSU, VecXsPm, LSU, NormalGr], (instregex "LE(Y)?$")>; +def : InstRW<[WLat2LSU, VecXsPm, LSU, NormalGr], (instregex "L(E16|E)(Y)?$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LD(Y|E32)?$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LX$")>; @@ -855,7 +855,7 @@ def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LX$")>; // FP: Store instructions //===----------------------------------------------------------------------===// -def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST(E|D)(Y)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST(E16|E|D)(Y)?$")>; def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STX$")>; //===----------------------------------------------------------------------===// @@ -894,7 +894,7 @@ def : InstRW<[WLat12, WLat12, FXb, VecDF2, Cracked], (instregex "CL(F|G)XBR$")>; // Load Complement / Negative / Positive def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "L(C|N|P)(E|D)BR$")>; -def : InstRW<[WLat1, FXb, NormalGr], (instregex "L(C|N|P)DFR(_32)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "L(C|N|P)DFR(_32|_16)?$")>; def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "L(C|N|P)XBR$")>; // Square root @@ -1252,7 +1252,7 @@ def : InstRW<[WLat2, VecXsPm, 
NormalGr], (instregex "VLEI(B|F|G|H)$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(Align)?$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(L|BB)$")>; -def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(32|64)$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(16|32|64)$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLLEZ(B|F|G|H|LF)?$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLREP(B|F|G|H)?$")>; def : InstRW<[WLat2LSU, RegReadAdv, VecXsPm, LSU, NormalGr], @@ -1267,7 +1267,7 @@ def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLRL(R)?$")>; // Vector: Stores //===----------------------------------------------------------------------===// -def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(Align|L|32|64)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(Align|L|16|32|64)?$")>; def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTE(F|G)$")>; def : InstRW<[WLat1, FXb, LSU, VecXsPm, Cracked], (instregex "VSTE(B|H)$")>; def : InstRW<[WLat1, LSU2, FXb3, GroupAlone2], (instregex "VSTM(Align)?$")>; @@ -1520,8 +1520,8 @@ def : InstRW<[WLat3, VecDFX, NormalGr], (instregex "WF(C|K)XB$")>; // Vector: Floating-point insertion and extraction //===----------------------------------------------------------------------===// -def : InstRW<[WLat1, FXb, NormalGr], (instregex "LEFR$")>; -def : InstRW<[WLat3, FXb, NormalGr], (instregex "LFER$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "LEFR(_16)?$")>; +def : InstRW<[WLat3, FXb, NormalGr], (instregex "LFER(_16)?$")>; //===----------------------------------------------------------------------===// // Vector: String instructions diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td index f8397921bf684..26433b97484da 100644 --- a/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td +++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td @@ -705,12 +705,12 @@ def : InstRW<[], 
(instregex "Insn.*")>; //===----------------------------------------------------------------------===// // Load zero -def : InstRW<[WLat1, FXU, NormalGr], (instregex "LZ(DR|ER)$")>; +def : InstRW<[WLat1, FXU, NormalGr], (instregex "LZ(DR|ER|ER_16)$")>; def : InstRW<[WLat2, FXU2, GroupAlone2], (instregex "LZXR$")>; // Load -def : InstRW<[WLat1, FXU, NormalGr], (instregex "LER$")>; -def : InstRW<[WLat1, FXU, NormalGr], (instregex "LD(R|R32|GR)$")>; +def : InstRW<[WLat1, FXU, NormalGr], (instregex "LER(16)?$")>; +def : InstRW<[WLat1, FXU, NormalGr], (instregex "LD(R|R16|R32|GR)$")>; def : InstRW<[WLat3, FXU, NormalGr], (instregex "LGDR$")>; def : InstRW<[WLat2, FXU2, GroupAlone2], (instregex "LXR$")>; @@ -719,20 +719,20 @@ def : InstRW<[WLat9, WLat9, FPU, NormalGr], (instregex "LT(E|D)BR$")>; def : InstRW<[WLat10, WLat10, FPU4, GroupAlone], (instregex "LTXBR$")>; // Copy sign -def : InstRW<[WLat5, FXU2, GroupAlone], (instregex "CPSDR(d|s)(d|s)$")>; +def : InstRW<[WLat5, FXU2, GroupAlone], (instregex "CPSDR(d|s|h)(d|s|h)$")>; //===----------------------------------------------------------------------===// // FP: Load instructions //===----------------------------------------------------------------------===// -def : InstRW<[LSULatency, LSU, NormalGr], (instregex "L(E|D)(Y|E32)?$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "L(E16|E|D)(Y|E32)?$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LX$")>; //===----------------------------------------------------------------------===// // FP: Store instructions //===----------------------------------------------------------------------===// -def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "ST(E|D)(Y)?$")>; +def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "ST(E16|E|D)(Y)?$")>; def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "STX$")>; //===----------------------------------------------------------------------===// @@ -771,7 +771,7 @@ def : InstRW<[WLat12, WLat12, FXU, FPU2, GroupAlone], 
(instregex "CL(F|G)XBR$")> // Load Complement / Negative / Positive def : InstRW<[WLat7, WLat7, FPU, NormalGr], (instregex "L(C|N|P)(E|D)BR$")>; -def : InstRW<[WLat1, FXU, NormalGr], (instregex "L(C|N|P)DFR(_32)?$")>; +def : InstRW<[WLat1, FXU, NormalGr], (instregex "L(C|N|P)DFR(_32|_16)?$")>; def : InstRW<[WLat10, WLat10, FPU4, GroupAlone], (instregex "L(C|N|P)XBR$")>; // Square root diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td b/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td index 039c8146618fe..193a793f17367 100644 --- a/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td +++ b/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td @@ -743,12 +743,12 @@ def : InstRW<[], (instregex "Insn.*")>; //===----------------------------------------------------------------------===// // Load zero -def : InstRW<[WLat1, FXU, NormalGr], (instregex "LZ(DR|ER)$")>; +def : InstRW<[WLat1, FXU, NormalGr], (instregex "LZ(DR|ER|ER_16)$")>; def : InstRW<[WLat2, FXU2, GroupAlone], (instregex "LZXR$")>; // Load -def : InstRW<[WLat1, FXU, NormalGr], (instregex "LER$")>; -def : InstRW<[WLat1, FXU, NormalGr], (instregex "LD(R|R32|GR)$")>; +def : InstRW<[WLat1, FXU, NormalGr], (instregex "LER(16)?$")>; +def : InstRW<[WLat1, FXU, NormalGr], (instregex "LD(R|R16|R32|GR)$")>; def : InstRW<[WLat3, FXU, NormalGr], (instregex "LGDR$")>; def : InstRW<[WLat2, FXU2, GroupAlone], (instregex "LXR$")>; @@ -757,20 +757,20 @@ def : InstRW<[WLat9, WLat9, FPU, NormalGr], (instregex "LT(E|D)BR$")>; def : InstRW<[WLat10, WLat10, FPU4, GroupAlone], (instregex "LTXBR$")>; // Copy sign -def : InstRW<[WLat5, FXU2, GroupAlone], (instregex "CPSDR(d|s)(d|s)$")>; +def : InstRW<[WLat5, FXU2, GroupAlone], (instregex "CPSDR(d|s|h)(d|s|h)$")>; //===----------------------------------------------------------------------===// // FP: Load instructions //===----------------------------------------------------------------------===// -def : InstRW<[LSULatency, LSU, NormalGr], (instregex "L(E|D)(Y|E32)?$")>; +def : 
InstRW<[LSULatency, LSU, NormalGr], (instregex "L(E16|E|D)(Y|E32)?$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LX$")>; //===----------------------------------------------------------------------===// // FP: Store instructions //===----------------------------------------------------------------------===// -def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "ST(E|D)(Y)?$")>; +def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "ST(E16|E|D)(Y)?$")>; def : InstRW<[WLat1, FXU, LSU, NormalGr], (instregex "STX$")>; //===----------------------------------------------------------------------===// @@ -809,7 +809,7 @@ def : InstRW<[WLat12, WLat12, FXU, FPU2, GroupAlone], (instregex "CL(F|G)XBR$")> // Load Complement / Negative / Positive def : InstRW<[WLat7, WLat7, FPU, NormalGr], (instregex "L(C|N|P)(E|D)BR$")>; -def : InstRW<[WLat1, FXU, NormalGr], (instregex "L(C|N|P)DFR(_32)?$")>; +def : InstRW<[WLat1, FXU, NormalGr], (instregex "L(C|N|P)DFR(_32|_16)?$")>; def : InstRW<[WLat10, WLat10, FPU4, GroupAlone], (instregex "L(C|N|P)XBR$")>; // Square root diff --git a/llvm/test/CodeGen/SystemZ/asm-10.ll b/llvm/test/CodeGen/SystemZ/asm-10.ll index b71db8350781d..8226b8a1a2d25 100644 --- a/llvm/test/CodeGen/SystemZ/asm-10.ll +++ b/llvm/test/CodeGen/SystemZ/asm-10.ll @@ -2,6 +2,15 @@ ; ; RUN: llc < %s -mtriple=s390x-linux-gnu -no-integrated-as | FileCheck %s +define half @f0() { +; CHECK-LABEL: f0: +; CHECK: lzer %f1 +; CHECK: blah %f0 %f1 +; CHECK: br %r14 + %val = call half asm "blah $0 $1", "=&f,f" (half 0.0) + ret half %val +} + define float @f1() { ; CHECK-LABEL: f1: ; CHECK: lzer %f1 diff --git a/llvm/test/CodeGen/SystemZ/asm-17.ll b/llvm/test/CodeGen/SystemZ/asm-17.ll index c9c4d73c66ebb..dad75d4d012d1 100644 --- a/llvm/test/CodeGen/SystemZ/asm-17.ll +++ b/llvm/test/CodeGen/SystemZ/asm-17.ll @@ -25,6 +25,17 @@ define i64 @f2() { ret i64 %ret } +; Test 16-bit FPRs. 
+define half @f3_half() { +; CHECK-LABEL: f3_half: +; CHECK: lzer %f4 +; CHECK: blah %f4 +; CHECK: ler %f0, %f4 +; CHECK: br %r14 + %ret = call half asm "blah $0", "={f4},0" (half 0.0) + ret half %ret +} + ; Test i32 FPRs. define float @f3() { ; CHECK-LABEL: f3: diff --git a/llvm/test/CodeGen/SystemZ/asm-19.ll b/llvm/test/CodeGen/SystemZ/asm-19.ll index e16fdfa13fce6..6c77fb55071ca 100644 --- a/llvm/test/CodeGen/SystemZ/asm-19.ll +++ b/llvm/test/CodeGen/SystemZ/asm-19.ll @@ -3,6 +3,15 @@ ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 -no-integrated-as | FileCheck %s ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 -no-integrated-as | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-Z14 +define half @f0() { +; CHECK-LABEL: f0: +; CHECK: lzer %f1 +; CHECK: blah %f0 %f1 +; CHECK: br %r14 + %val = call half asm "blah $0 $1", "=&v,v" (half 0.0) + ret half %val +} + define float @f1() { ; CHECK-LABEL: f1: ; CHECK: lzer %f1 @@ -86,6 +95,16 @@ define <4 x float> @f9() { ret <4 x float> %val } +define half @f10_half() { +; CHECK-LABEL: f10_half: +; CHECK: lzer %f4 +; CHECK: blah %f4 +; CHECK: ldr %f0, %f4 +; CHECK: br %r14 + %ret = call half asm "blah $0", "={v4},0" (half 0.0) + ret half %ret +} + define float @f10() { ; CHECK-LABEL: f10: ; CHECK: lzer %f4 diff --git a/llvm/test/CodeGen/SystemZ/atomic-load-10.ll b/llvm/test/CodeGen/SystemZ/atomic-load-10.ll new file mode 100644 index 0000000000000..4135a55bb6fbc --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/atomic-load-10.ll @@ -0,0 +1,22 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; Test fp16 atomic loads. 
+; +; RUN: llc < %s -mtriple=s390x-linux-gnu -verify-machineinstrs | FileCheck %s -check-prefix=CHECK +; RUN: llc < %s -mtriple=s390x-linux-gnu -verify-machineinstrs -mcpu=z16 | FileCheck %s -check-prefix=VECTOR + +define half @f1(ptr %src) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: lgh %r0, 0(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f0, %r0 +; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: f1: +; VECTOR: # %bb.0: +; VECTOR-NEXT: vlreph %v0, 0(%r2) +; VECTOR-NEXT: br %r14 + %val = load atomic half, ptr %src seq_cst, align 2 + ret half %val +} diff --git a/llvm/test/CodeGen/SystemZ/atomic-store-10.ll b/llvm/test/CodeGen/SystemZ/atomic-store-10.ll new file mode 100644 index 0000000000000..3f228d58dcd8c --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/atomic-store-10.ll @@ -0,0 +1,24 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; Test half atomic stores. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -verify-machineinstrs | FileCheck %s -check-prefix=CHECK +; RUN: llc < %s -mtriple=s390x-linux-gnu -verify-machineinstrs -mcpu=z16 | FileCheck %s -check-prefix=VECTOR + +define void @f1(ptr %src, half %val) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d +; CHECK-NEXT: lgdr %r0, %f0 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 0(%r2) +; CHECK-NEXT: bcr 15, %r0 +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: f1: +; VECTOR: # %bb.0: +; VECTOR-NEXT: vsteh %v0, 0(%r2), 0 +; VECTOR-NEXT: bcr 14, %r0 +; VECTOR-NEXT: br %r14 + store atomic half %val, ptr %src seq_cst, align 2 + ret void +} diff --git a/llvm/test/CodeGen/SystemZ/atomicrmw-fadd-04.ll b/llvm/test/CodeGen/SystemZ/atomicrmw-fadd-04.ll new file mode 100644 index 0000000000000..a0869e13a013d --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/atomicrmw-fadd-04.ll @@ -0,0 +1,76 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; Test atomic half addition. Expect a compare-and-swap loop. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s + +define half @f1(ptr %src, half %b) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r11, %r15, 88(%r15) +; CHECK-NEXT: .cfi_offset %r11, -72 +; CHECK-NEXT: .cfi_offset %r12, -64 +; CHECK-NEXT: .cfi_offset %r13, -56 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -184 +; CHECK-NEXT: .cfi_def_cfa_offset 344 +; CHECK-NEXT: std %f8, 176(%r15) # 8-byte Spill +; CHECK-NEXT: std %f9, 168(%r15) # 8-byte Spill +; CHECK-NEXT: std %f10, 160(%r15) # 8-byte Spill +; CHECK-NEXT: .cfi_offset %f8, -168 +; CHECK-NEXT: .cfi_offset %f9, -176 +; CHECK-NEXT: .cfi_offset %f10, -184 +; CHECK-NEXT: lgr %r13, %r2 +; CHECK-NEXT: lgh %r0, 0(%r2) +; CHECK-NEXT: ler %f8, %f0 +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f9, %r0 +; CHECK-NEXT: risbg %r12, %r2, 0, 189, 0 +; CHECK-NEXT: sll %r13, 3 +; CHECK-NEXT: lcr %r11, %r13 +; CHECK-NEXT: j .LBB0_2 +; CHECK-NEXT: .LBB0_1: # %atomicrmw.start +; CHECK-NEXT: # in Loop: Header=BB0_2 Depth=1 +; CHECK-NEXT: sllg %r0, %r3, 48 +; CHECK-NEXT: ldgr %f9, %r0 +; CHECK-NEXT: je .LBB0_5 +; CHECK-NEXT: .LBB0_2: # %atomicrmw.start +; CHECK-NEXT: # =>This Loop Header: Depth=1 +; CHECK-NEXT: # Child Loop BB0_3 Depth 2 +; CHECK-NEXT: ler %f0, %f8 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f10, %f0 +; CHECK-NEXT: ler %f0, %f9 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: aebr %f0, %f10 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d +; CHECK-NEXT: lgdr %r1, %f0 +; CHECK-NEXT: l %r0, 0(%r12) +; CHECK-NEXT: srlg %r1, %r1, 48 +; CHECK-NEXT: lgdr %r2, %f9 +; CHECK-NEXT: srlg %r2, %r2, 48 +; CHECK-NEXT: .LBB0_3: # %atomicrmw.start +; CHECK-NEXT: # Parent Loop BB0_2 Depth=1 +; CHECK-NEXT: # => This Inner Loop Header: Depth=2 +; 
CHECK-NEXT: rll %r3, %r0, 16(%r13) +; CHECK-NEXT: risbg %r1, %r3, 32, 47, 0 +; CHECK-NEXT: llhr %r3, %r3 +; CHECK-NEXT: cr %r3, %r2 +; CHECK-NEXT: jlh .LBB0_1 +; CHECK-NEXT: # %bb.4: # %atomicrmw.start +; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=2 +; CHECK-NEXT: rll %r4, %r1, -16(%r11) +; CHECK-NEXT: cs %r0, %r4, 0(%r12) +; CHECK-NEXT: jl .LBB0_3 +; CHECK-NEXT: j .LBB0_1 +; CHECK-NEXT: .LBB0_5: # %atomicrmw.end +; CHECK-NEXT: ler %f0, %f9 +; CHECK-NEXT: ld %f8, 176(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f9, 168(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f10, 160(%r15) # 8-byte Reload +; CHECK-NEXT: lmg %r11, %r15, 272(%r15) +; CHECK-NEXT: br %r14 + %res = atomicrmw fadd ptr %src, half %b seq_cst + ret half %res +} diff --git a/llvm/test/CodeGen/SystemZ/fmuladd-soft-float.ll b/llvm/test/CodeGen/SystemZ/fmuladd-soft-float.ll index 1447c576f48ae..a982f9af52358 100644 --- a/llvm/test/CodeGen/SystemZ/fmuladd-soft-float.ll +++ b/llvm/test/CodeGen/SystemZ/fmuladd-soft-float.ll @@ -1,6 +1,43 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=s390x-linux < %s | FileCheck %s -check-prefix=SOFT-FLOAT +define half @fmuladd_intrinsic_f16(half %a, half %b, half %c) #0 { +; SOFT-FLOAT-LABEL: fmuladd_intrinsic_f16: +; SOFT-FLOAT: # %bb.0: +; SOFT-FLOAT-NEXT: stmg %r12, %r15, 96(%r15) +; SOFT-FLOAT-NEXT: .cfi_offset %r12, -64 +; SOFT-FLOAT-NEXT: .cfi_offset %r13, -56 +; SOFT-FLOAT-NEXT: .cfi_offset %r14, -48 +; SOFT-FLOAT-NEXT: .cfi_offset %r15, -40 +; SOFT-FLOAT-NEXT: aghi %r15, -160 +; SOFT-FLOAT-NEXT: .cfi_def_cfa_offset 320 +; SOFT-FLOAT-NEXT: # kill: def $r4l killed $r4l def $r4d +; SOFT-FLOAT-NEXT: llghr %r0, %r4 +; SOFT-FLOAT-NEXT: lr %r13, %r3 +; SOFT-FLOAT-NEXT: lr %r12, %r2 +; SOFT-FLOAT-NEXT: lgr %r2, %r0 +; SOFT-FLOAT-NEXT: brasl %r14, __extendhfsf2@PLT +; SOFT-FLOAT-NEXT: llghr %r0, %r12 +; SOFT-FLOAT-NEXT: lgr %r12, %r2 +; SOFT-FLOAT-NEXT: lgr %r2, %r0 +; SOFT-FLOAT-NEXT: brasl %r14, 
__extendhfsf2@PLT +; SOFT-FLOAT-NEXT: llghr %r0, %r13 +; SOFT-FLOAT-NEXT: lgr %r13, %r2 +; SOFT-FLOAT-NEXT: lgr %r2, %r0 +; SOFT-FLOAT-NEXT: brasl %r14, __extendhfsf2@PLT +; SOFT-FLOAT-NEXT: lgr %r3, %r2 +; SOFT-FLOAT-NEXT: lgr %r2, %r13 +; SOFT-FLOAT-NEXT: brasl %r14, __mulsf3@PLT +; SOFT-FLOAT-NEXT: lgr %r3, %r12 +; SOFT-FLOAT-NEXT: brasl %r14, __addsf3@PLT +; SOFT-FLOAT-NEXT: brasl %r14, __truncsfhf2@PLT +; SOFT-FLOAT-NEXT: # kill: def $r2l killed $r2l killed $r2d +; SOFT-FLOAT-NEXT: lmg %r12, %r15, 256(%r15) +; SOFT-FLOAT-NEXT: br %r14 + %result = call half @llvm.fmuladd.f16(half %a, half %b, half %c) + ret half %result +} + define float @fmuladd_intrinsic_f32(float %a, float %b, float %c) #0 { ; SOFT-FLOAT-LABEL: fmuladd_intrinsic_f32: ; SOFT-FLOAT: # %bb.0: diff --git a/llvm/test/CodeGen/SystemZ/fp-abs-01.ll b/llvm/test/CodeGen/SystemZ/fp-abs-01.ll index bf0870a86702c..0cfdefe3bd61b 100644 --- a/llvm/test/CodeGen/SystemZ/fp-abs-01.ll +++ b/llvm/test/CodeGen/SystemZ/fp-abs-01.ll @@ -3,6 +3,18 @@ ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s +; Test f16. +declare half @llvm.fabs.f16(half %f) +define half @f0(half %f) { +; CHECK-LABEL: f0: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: lpdfr %f0, %f0 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %res = call half @llvm.fabs.f16(half %f) + ret half %res +} + ; Test f32. declare float @llvm.fabs.f32(float %f) define float @f1(float %f) { diff --git a/llvm/test/CodeGen/SystemZ/fp-abs-03.ll b/llvm/test/CodeGen/SystemZ/fp-abs-03.ll index 72786ea203df4..29f2d06e75ff9 100644 --- a/llvm/test/CodeGen/SystemZ/fp-abs-03.ll +++ b/llvm/test/CodeGen/SystemZ/fp-abs-03.ll @@ -2,6 +2,18 @@ ; ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s +; Test f16. 
+declare half @llvm.fabs.f16(half %f) +define half @f0(half %f) { +; CHECK-LABEL: f0: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: lpdfr %f0, %f0 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %res = call half @llvm.fabs.f16(half %f) + ret half %res +} + ; Test f32. declare float @llvm.fabs.f32(float %f) define float @f1(float %f) { diff --git a/llvm/test/CodeGen/SystemZ/fp-abs-04.ll b/llvm/test/CodeGen/SystemZ/fp-abs-04.ll index b02abc8443491..afaf3f6d22ac8 100644 --- a/llvm/test/CodeGen/SystemZ/fp-abs-04.ll +++ b/llvm/test/CodeGen/SystemZ/fp-abs-04.ll @@ -2,6 +2,22 @@ ; ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s +; Test f16. +declare half @llvm.fabs.f16(half %f) +define half @f0(half %f) { +; CHECK-LABEL: f0: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: lpdfr %f0, %f0 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: lcdfr %f0, %f0 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %abs = call half @llvm.fabs.f16(half %f) + %res = fneg half %abs + ret half %res +} + ; Test f32. declare float @llvm.fabs.f32(float %f) define float @f1(float %f) { diff --git a/llvm/test/CodeGen/SystemZ/fp-add-01.ll b/llvm/test/CodeGen/SystemZ/fp-add-01.ll index eb845bae9b804..b65744c4aac0c 100644 --- a/llvm/test/CodeGen/SystemZ/fp-add-01.ll +++ b/llvm/test/CodeGen/SystemZ/fp-add-01.ll @@ -6,6 +6,18 @@ declare float @foo() +; Check register addition. +define half @f0(half %f1, half %f2) { +; CHECK-LABEL: f0: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: aebr %f0, %f9 +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %res = fadd half %f1, %f2 + ret half %res +} + ; Check register addition. 
define float @f1(float %f1, float %f2) { ; CHECK-LABEL: f1: diff --git a/llvm/test/CodeGen/SystemZ/fp-cmp-04.ll b/llvm/test/CodeGen/SystemZ/fp-cmp-04.ll index c1773abe92305..d3d641357ae58 100644 --- a/llvm/test/CodeGen/SystemZ/fp-cmp-04.ll +++ b/llvm/test/CodeGen/SystemZ/fp-cmp-04.ll @@ -227,6 +227,38 @@ exit: ret float %add } +define half @f12_half(half %dummy, half %val, ptr %dest) { +; CHECK-LABEL: f12_half: +; CHECK: ler %f8, %f2 +; CHECK-NEXT: ler %f0, %f2 +; CHECK-NEXT: #APP +; CHECK-NEXT: blah %f0 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ltebr %f0, %f0 +; CHECK-NEXT: jl .LBB11_2 +; CHECK-NEXT:# %bb.1: +; CHECK-NEXT: lgdr %r0, %f8 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 0(%r13) +; CHECK-NEXT:.LBB11_2: +; CHECK-NEXT: ler %f0, %f8 +; CHECK-NEXT: ld %f8, 160(%r15) +; CHECK-NEXT: lmg %r13, %r15, 272(%r15) +; CHECK-NEXT: br %r14 +entry: + call void asm sideeffect "blah $0", "{f0}"(half %val) + %cmp = fcmp olt half %val, 0.0 + br i1 %cmp, label %exit, label %store + +store: + store half %val, ptr %dest + br label %exit + +exit: + ret half %val +} + ; %val in %f2 must be preserved during comparison and also copied to %f0. 
define float @f12(float %dummy, float %val, ptr %dest) { ; CHECK-LABEL: f12: @@ -304,6 +336,38 @@ exit: ret void } +define half @f15_half(half %val, half %dummy, ptr %dest) { +; CHECK-LABEL: f15_half: +; CHECK: ler %f8, %f0 +; CHECK-NEXT: ler %f2, %f0 +; CHECK-NEXT: #APP +; CHECK-NEXT: blah %f2 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ltebr %f0, %f0 +; CHECK-NEXT: jl .LBB15_2 +; CHECK-NEXT:# %bb.1: +; CHECK-NEXT: lgdr %r0, %f8 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 0(%r13) +; CHECK-NEXT:.LBB15_2: +; CHECK-NEXT: ler %f0, %f8 +; CHECK-NEXT: ld %f8, 160(%r15) +; CHECK-NEXT: lmg %r13, %r15, 272(%r15) +; CHECK-NEXT: br %r14 +entry: + call void asm sideeffect "blah $0", "{f2}"(half %val) + %cmp = fcmp olt half %val, 0.0 + br i1 %cmp, label %exit, label %store + +store: + store half %val, ptr %dest + br label %exit + +exit: + ret half %val +} + define float @f15(float %val, float %dummy, ptr %dest) { ; CHECK-LABEL: f15: ; CHECK: ltebr %f1, %f0 @@ -374,7 +438,7 @@ define float @f18(float %dummy, float %a, ptr %dest) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lnebr %f0, %f2 ; CHECK-NEXT: blr %r14 -; CHECK-NEXT: .LBB17_1: # %store +; CHECK-NEXT: .LBB19_1: # %store ; CHECK-NEXT: ste %f0, 0(%r2) ; CHECK-NEXT: br %r14 entry: @@ -397,7 +461,7 @@ define float @f19(float %dummy, float %a, ptr %dest) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lcebr %f0, %f2 ; CHECK-NEXT: bler %r14 -; CHECK-NEXT: .LBB18_1: # %store +; CHECK-NEXT: .LBB20_1: # %store ; CHECK-NEXT: ste %f0, 0(%r2) ; CHECK-NEXT: br %r14 entry: diff --git a/llvm/test/CodeGen/SystemZ/fp-conv-05.ll b/llvm/test/CodeGen/SystemZ/fp-conv-05.ll index 4596649d5659c..fef2b842c54aa 100644 --- a/llvm/test/CodeGen/SystemZ/fp-conv-05.ll +++ b/llvm/test/CodeGen/SystemZ/fp-conv-05.ll @@ -2,6 +2,16 @@ ; ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; Check i32->f16. 
+define half @f0(i32 %i) { +; CHECK-LABEL: f0: +; CHECK: cefbr %f0, %r2 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %conv = sitofp i32 %i to half + ret half %conv +} + ; Check i32->f32. define float @f1(i32 %i) { ; CHECK-LABEL: f1: diff --git a/llvm/test/CodeGen/SystemZ/fp-conv-06.ll b/llvm/test/CodeGen/SystemZ/fp-conv-06.ll index e754a7e161f8f..deb22ee4d19b4 100644 --- a/llvm/test/CodeGen/SystemZ/fp-conv-06.ll +++ b/llvm/test/CodeGen/SystemZ/fp-conv-06.ll @@ -2,6 +2,18 @@ ; ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s +; Check i32->f16. There is no native instruction, so we must promote +; to i64 first. +define half @f0(i32 %i) { +; CHECK-LABEL: f0: +; CHECK: llgfr [[REGISTER:%r[0-5]]], %r2 +; CHECK: cegbr %f0, [[REGISTER]] +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %conv = uitofp i32 %i to half + ret half %conv +} + ; Check i32->f32. There is no native instruction, so we must promote ; to i64 first. define float @f1(i32 %i) { diff --git a/llvm/test/CodeGen/SystemZ/fp-conv-07.ll b/llvm/test/CodeGen/SystemZ/fp-conv-07.ll index 2941e77441461..02f47e481cc6a 100644 --- a/llvm/test/CodeGen/SystemZ/fp-conv-07.ll +++ b/llvm/test/CodeGen/SystemZ/fp-conv-07.ll @@ -2,6 +2,16 @@ ; ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; Test i64->f16. +define half @f0(i64 %i) { +; CHECK-LABEL: f0: +; CHECK: cegbr %f0, %r2 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %conv = sitofp i64 %i to half + ret half %conv +} + ; Test i64->f32. define float @f1(i64 %i) { ; CHECK-LABEL: f1: diff --git a/llvm/test/CodeGen/SystemZ/fp-conv-08.ll b/llvm/test/CodeGen/SystemZ/fp-conv-08.ll index e2a5f74185216..f2590b6566a62 100644 --- a/llvm/test/CodeGen/SystemZ/fp-conv-08.ll +++ b/llvm/test/CodeGen/SystemZ/fp-conv-08.ll @@ -2,6 +2,17 @@ ; ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s +; Test i64->f16. For z10, this results in just a single a libcall. 
+define half @f0(i64 %i) { +; CHECK-LABEL: f0: +; CHECK: cegbr +; CHECK: aebr +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %conv = uitofp i64 %i to half + ret half %conv +} + ; Test i64->f32. There's no native support for unsigned i64-to-fp conversions, ; but we should be able to implement them using signed i64-to-fp conversions. define float @f1(i64 %i) { diff --git a/llvm/test/CodeGen/SystemZ/fp-conv-09.ll b/llvm/test/CodeGen/SystemZ/fp-conv-09.ll index 0e730c3705030..c29a202807c69 100644 --- a/llvm/test/CodeGen/SystemZ/fp-conv-09.ll +++ b/llvm/test/CodeGen/SystemZ/fp-conv-09.ll @@ -2,6 +2,16 @@ ; ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; Test f16->i32. +define i32 @f0(half %f) { +; CHECK-LABEL: f0: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: cfebr %r2, 5, %f0 +; CHECK: br %r14 + %conv = fptosi half %f to i32 + ret i32 %conv +} + ; Test f32->i32. define i32 @f1(float %f) { ; CHECK-LABEL: f1: diff --git a/llvm/test/CodeGen/SystemZ/fp-conv-10.ll b/llvm/test/CodeGen/SystemZ/fp-conv-10.ll index 82913265853a5..a0455a2cea2d6 100644 --- a/llvm/test/CodeGen/SystemZ/fp-conv-10.ll +++ b/llvm/test/CodeGen/SystemZ/fp-conv-10.ll @@ -8,18 +8,36 @@ ; Promoting to i64 doesn't generate an inexact condition for values that are ; outside the i32 range but in the i64 range, so use the default expansion. +; Test f16->i32. Converted to signed as the max float value is smaller than +; the signed integer range. +define i32 @f0(half %f) { +; CHECK-LABEL: f0: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: cfebr %r2, 5, %f0 +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 + %conv = fptoui half %f to i32 + ret i32 %conv +} + ; Test f32->i32. 
define i32 @f1(float %f) { ; CHECK-LABEL: f1: ; CHECK: # %bb.0: -; CHECK-NEXT: larl %r1, .LCPI0_0 +; CHECK-NEXT: larl %r1, .LCPI1_0 ; CHECK-NEXT: le %f1, 0(%r1) ; CHECK-NEXT: cebr %f0, %f1 -; CHECK-NEXT: jnl .LBB0_2 +; CHECK-NEXT: jnl .LBB1_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: cfebr %r2, 5, %f0 ; CHECK-NEXT: br %r14 -; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: .LBB1_2: ; CHECK-NEXT: sebr %f0, %f1 ; CHECK-NEXT: cfebr %r2, 5, %f0 ; CHECK-NEXT: xilf %r2, 2147483648 @@ -32,14 +50,14 @@ define i32 @f1(float %f) { define i32 @f2(double %f) { ; CHECK-LABEL: f2: ; CHECK: # %bb.0: -; CHECK-NEXT: larl %r1, .LCPI1_0 +; CHECK-NEXT: larl %r1, .LCPI2_0 ; CHECK-NEXT: ld %f1, 0(%r1) ; CHECK-NEXT: cdbr %f0, %f1 -; CHECK-NEXT: jnl .LBB1_2 +; CHECK-NEXT: jnl .LBB2_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: cfdbr %r2, 5, %f0 ; CHECK-NEXT: br %r14 -; CHECK-NEXT: .LBB1_2: +; CHECK-NEXT: .LBB2_2: ; CHECK-NEXT: sdbr %f0, %f1 ; CHECK-NEXT: cfdbr %r2, 5, %f0 ; CHECK-NEXT: xilf %r2, 2147483648 @@ -54,14 +72,14 @@ define i32 @f3(ptr %src) { ; CHECK: # %bb.0: ; CHECK-NEXT: ld %f0, 0(%r2) ; CHECK-NEXT: ld %f2, 8(%r2) -; CHECK-NEXT: larl %r1, .LCPI2_0 +; CHECK-NEXT: larl %r1, .LCPI3_0 ; CHECK-NEXT: lxeb %f1, 0(%r1) ; CHECK-NEXT: cxbr %f0, %f1 -; CHECK-NEXT: jnl .LBB2_2 +; CHECK-NEXT: jnl .LBB3_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: cfxbr %r2, 5, %f0 ; CHECK-NEXT: br %r14 -; CHECK-NEXT: .LBB2_2: +; CHECK-NEXT: .LBB3_2: ; CHECK-NEXT: sxbr %f0, %f1 ; CHECK-NEXT: cfxbr %r2, 5, %f0 ; CHECK-NEXT: xilf %r2, 2147483648 diff --git a/llvm/test/CodeGen/SystemZ/fp-conv-11.ll b/llvm/test/CodeGen/SystemZ/fp-conv-11.ll index 2dd543b5810bf..55a2f8a51a526 100644 --- a/llvm/test/CodeGen/SystemZ/fp-conv-11.ll +++ b/llvm/test/CodeGen/SystemZ/fp-conv-11.ll @@ -2,6 +2,16 @@ ; ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; Test f16->i64. 
+define i64 @f0(half %f) { +; CHECK-LABEL: f0: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: cgebr %r2, 5, %f0 +; CHECK: br %r14 + %conv = fptosi half %f to i64 + ret i64 %conv +} + ; Test f32->i64. define i64 @f1(float %f) { ; CHECK-LABEL: f1: diff --git a/llvm/test/CodeGen/SystemZ/fp-conv-12.ll b/llvm/test/CodeGen/SystemZ/fp-conv-12.ll index 27afbf4d398a0..bb83a677210ac 100644 --- a/llvm/test/CodeGen/SystemZ/fp-conv-12.ll +++ b/llvm/test/CodeGen/SystemZ/fp-conv-12.ll @@ -7,18 +7,36 @@ ; they were added in z196 as the Convert to Logical family of instructions. ; Convert via signed i64s instead. +; Test f16->i64. Converted to signed as the max float value is smaller than +; the signed integer range. +define i64 @f0(half %f) { +; CHECK-LABEL: f0: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: cgebr %r2, 5, %f0 +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 + %conv = fptoui half %f to i64 + ret i64 %conv +} + ; Test f32->i64. 
define i64 @f1(float %f) { ; CHECK-LABEL: f1: ; CHECK: # %bb.0: -; CHECK-NEXT: larl %r1, .LCPI0_0 +; CHECK-NEXT: larl %r1, .LCPI1_0 ; CHECK-NEXT: le %f1, 0(%r1) ; CHECK-NEXT: cebr %f0, %f1 -; CHECK-NEXT: jnl .LBB0_2 +; CHECK-NEXT: jnl .LBB1_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: cgebr %r2, 5, %f0 ; CHECK-NEXT: br %r14 -; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: .LBB1_2: ; CHECK-NEXT: sebr %f0, %f1 ; CHECK-NEXT: cgebr %r2, 5, %f0 ; CHECK-NEXT: xihf %r2, 2147483648 @@ -31,14 +49,14 @@ define i64 @f1(float %f) { define i64 @f2(double %f) { ; CHECK-LABEL: f2: ; CHECK: # %bb.0: -; CHECK-NEXT: larl %r1, .LCPI1_0 +; CHECK-NEXT: larl %r1, .LCPI2_0 ; CHECK-NEXT: ld %f1, 0(%r1) ; CHECK-NEXT: cdbr %f0, %f1 -; CHECK-NEXT: jnl .LBB1_2 +; CHECK-NEXT: jnl .LBB2_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: cgdbr %r2, 5, %f0 ; CHECK-NEXT: br %r14 -; CHECK-NEXT: .LBB1_2: +; CHECK-NEXT: .LBB2_2: ; CHECK-NEXT: sdbr %f0, %f1 ; CHECK-NEXT: cgdbr %r2, 5, %f0 ; CHECK-NEXT: xihf %r2, 2147483648 @@ -53,14 +71,14 @@ define i64 @f3(ptr %src) { ; CHECK: # %bb.0: ; CHECK-NEXT: ld %f0, 0(%r2) ; CHECK-NEXT: ld %f2, 8(%r2) -; CHECK-NEXT: larl %r1, .LCPI2_0 +; CHECK-NEXT: larl %r1, .LCPI3_0 ; CHECK-NEXT: lxeb %f1, 0(%r1) ; CHECK-NEXT: cxbr %f0, %f1 -; CHECK-NEXT: jnl .LBB2_2 +; CHECK-NEXT: jnl .LBB3_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: cgxbr %r2, 5, %f0 ; CHECK-NEXT: br %r14 -; CHECK-NEXT: .LBB2_2: +; CHECK-NEXT: .LBB3_2: ; CHECK-NEXT: sxbr %f0, %f1 ; CHECK-NEXT: cgxbr %r2, 5, %f0 ; CHECK-NEXT: xihf %r2, 2147483648 diff --git a/llvm/test/CodeGen/SystemZ/fp-conv-13.ll b/llvm/test/CodeGen/SystemZ/fp-conv-13.ll index 6e6c96bea2b35..4869d070b6beb 100644 --- a/llvm/test/CodeGen/SystemZ/fp-conv-13.ll +++ b/llvm/test/CodeGen/SystemZ/fp-conv-13.ll @@ -3,6 +3,16 @@ ; ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s +; Check i32->f16. 
+define half @f0(i32 %i) { +; CHECK-LABEL: f0: +; CHECK: celfbr %f0, 0, %r2, 0 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %conv = uitofp i32 %i to half + ret half %conv +} + ; Check i32->f32. define float @f1(i32 %i) { ; CHECK-LABEL: f1: @@ -33,18 +43,28 @@ define void @f3(i32 %i, ptr %dst) { ret void } -; Check i64->f32. -define float @f4(i64 %i) { +; Check i64->f16. +define half @f4(i64 %i) { ; CHECK-LABEL: f4: ; CHECK: celgbr %f0, 0, %r2, 0 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %conv = uitofp i64 %i to half + ret half %conv +} + +; Check i64->f32. +define float @f5(i64 %i) { +; CHECK-LABEL: f5: +; CHECK: celgbr %f0, 0, %r2, 0 ; CHECK: br %r14 %conv = uitofp i64 %i to float ret float %conv } ; Check i64->f64. -define double @f5(i64 %i) { -; CHECK-LABEL: f5: +define double @f6(i64 %i) { +; CHECK-LABEL: f6: ; CHECK: cdlgbr %f0, 0, %r2, 0 ; CHECK: br %r14 %conv = uitofp i64 %i to double @@ -52,8 +72,8 @@ define double @f5(i64 %i) { } ; Check i64->f128. -define void @f6(i64 %i, ptr %dst) { -; CHECK-LABEL: f6: +define void @f7(i64 %i, ptr %dst) { +; CHECK-LABEL: f7: ; CHECK: cxlgbr %f0, 0, %r2, 0 ; CHECK-DAG: std %f0, 0(%r3) ; CHECK-DAG: std %f2, 8(%r3) diff --git a/llvm/test/CodeGen/SystemZ/fp-conv-14.ll b/llvm/test/CodeGen/SystemZ/fp-conv-14.ll index 0d1f951994d27..c9448eac91fb1 100644 --- a/llvm/test/CodeGen/SystemZ/fp-conv-14.ll +++ b/llvm/test/CodeGen/SystemZ/fp-conv-14.ll @@ -2,6 +2,16 @@ ; ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s +; Test f16->i32. +define i32 @f0(half %f) { +; CHECK-LABEL: f0: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: clfebr %r2, 5, %f0, 0 +; CHECK: br %r14 + %conv = fptoui half %f to i32 + ret i32 %conv +} + ; Test f32->i32. define i32 @f1(float %f) { ; CHECK-LABEL: f1: @@ -32,9 +42,19 @@ define i32 @f3(ptr %src) { ret i32 %conv } -; Test f32->i64. -define i64 @f4(float %f) { +; Test f16->i64. 
+define i64 @f4(half %f) { ; CHECK-LABEL: f4: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: clgebr %r2, 5, %f0, 0 +; CHECK: br %r14 + %conv = fptoui half %f to i64 + ret i64 %conv +} + +; Test f32->i64. +define i64 @f5(float %f) { +; CHECK-LABEL: f5: ; CHECK: clgebr %r2, 5, %f0, 0 ; CHECK: br %r14 %conv = fptoui float %f to i64 @@ -42,8 +62,8 @@ define i64 @f4(float %f) { } ; Test f64->i64. -define i64 @f5(double %f) { -; CHECK-LABEL: f5: +define i64 @f6(double %f) { +; CHECK-LABEL: f6: ; CHECK: clgdbr %r2, 5, %f0, 0 ; CHECK: br %r14 %conv = fptoui double %f to i64 @@ -51,8 +71,8 @@ define i64 @f5(double %f) { } ; Test f128->i64. -define i64 @f6(ptr %src) { -; CHECK-LABEL: f6: +define i64 @f7(ptr %src) { +; CHECK-LABEL: f7: ; CHECK-DAG: ld %f0, 0(%r2) ; CHECK-DAG: ld %f2, 8(%r2) ; CHECK: clgxbr %r2, 5, %f0, 0 diff --git a/llvm/test/CodeGen/SystemZ/fp-conv-20.ll b/llvm/test/CodeGen/SystemZ/fp-conv-20.ll index 8006a8beb0789..58db2e10da8b2 100644 --- a/llvm/test/CodeGen/SystemZ/fp-conv-20.ll +++ b/llvm/test/CodeGen/SystemZ/fp-conv-20.ll @@ -30,9 +30,19 @@ define float @f3(i128 %i) { ret float %conv } -; Test unsigned i128->f128. -define fp128 @f4(i128 %i) { +; Test signed i128->f16. +define half @f4(i128 %i) { ; CHECK-LABEL: f4: +; CHECK: brasl %r14, __floattisf@PLT +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %conv = sitofp i128 %i to half + ret half %conv +} + +; Test unsigned i128->f128. +define fp128 @f5(i128 %i) { +; CHECK-LABEL: f5: ; CHECK: brasl %r14, __floatuntitf@PLT ; CHECK: br %r14 %conv = uitofp i128 %i to fp128 @@ -40,8 +50,8 @@ define fp128 @f4(i128 %i) { } ; Test unsigned i128->f64. -define double @f5(i128 %i) { -; CHECK-LABEL: f5: +define double @f6(i128 %i) { +; CHECK-LABEL: f6: ; CHECK: brasl %r14, __floatuntidf@PLT ; CHECK: br %r14 %conv = uitofp i128 %i to double @@ -49,17 +59,27 @@ define double @f5(i128 %i) { } ; Test unsigned i128->f32. 
-define float @f6(i128 %i) { -; CHECK-LABEL: f6: +define float @f7(i128 %i) { +; CHECK-LABEL: f7: ; CHECK: brasl %r14, __floatuntisf@PLT ; CHECK: br %r14 %conv = uitofp i128 %i to float ret float %conv } +; Test unsigned i128->f16. +define half @f8(i128 %i) { +; CHECK-LABEL: f8: +; CHECK: brasl %r14, __floatuntisf@PLT +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %conv = uitofp i128 %i to half + ret half %conv +} + ; Test signed f128->i128. -define i128 @f7(fp128 %f) { -; CHECK-LABEL: f7: +define i128 @f9(fp128 %f) { +; CHECK-LABEL: f9: ; CHECK: brasl %r14, __fixtfti@PLT ; CHECK: br %r14 %conv = fptosi fp128 %f to i128 @@ -67,26 +87,36 @@ define i128 @f7(fp128 %f) { } ; Test signed f64->i128. -define i128 @f8(double %f) { -; CHECK-LABEL: f8: +define i128 @f10(double %f) { +; CHECK-LABEL: f10: ; CHECK: brasl %r14, __fixdfti@PLT ; CHECK: br %r14 %conv = fptosi double %f to i128 ret i128 %conv } -; Test signed f9->i128. -define i128 @f9(float %f) { -; CHECK-LABEL: f9: +; Test signed f32->i128. +define i128 @f11(float %f) { +; CHECK-LABEL: f11: ; CHECK: brasl %r14, __fixsfti@PLT ; CHECK: br %r14 %conv = fptosi float %f to i128 ret i128 %conv } +; Test signed f16->i128. +define i128 @f12(half %f) { +; CHECK-LABEL: f12: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: brasl %r14, __fixsfti@PLT +; CHECK: br %r14 + %conv = fptosi half %f to i128 + ret i128 %conv +} + ; Test unsigned f128->i128. -define i128 @f10(fp128 %f) { -; CHECK-LABEL: f10: +define i128 @f13(fp128 %f) { +; CHECK-LABEL: f13: ; CHECK: brasl %r14, __fixunstfti@PLT ; CHECK: br %r14 %conv = fptoui fp128 %f to i128 @@ -94,8 +124,8 @@ define i128 @f10(fp128 %f) { } ; Test unsigned f64->i128. -define i128 @f11(double %f) { -; CHECK-LABEL: f11: +define i128 @f14(double %f) { +; CHECK-LABEL: f14: ; CHECK: brasl %r14, __fixunsdfti@PLT ; CHECK: br %r14 %conv = fptoui double %f to i128 @@ -103,10 +133,20 @@ define i128 @f11(double %f) { } ; Test unsigned f32->i128. 
-define i128 @f12(float %f) { -; CHECK-LABEL: f12: +define i128 @f15(float %f) { +; CHECK-LABEL: f15: ; CHECK: brasl %r14, __fixunssfti@PLT ; CHECK: br %r14 %conv = fptoui float %f to i128 ret i128 %conv } + +; Test unsigned f16->i128. +define i128 @f16(half %f) { +; CHECK-LABEL: f16: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: brasl %r14, __fixunssfti@PLT +; CHECK: br %r14 + %conv = fptoui half %f to i128 + ret i128 %conv +} diff --git a/llvm/test/CodeGen/SystemZ/fp-copysign-03.ll b/llvm/test/CodeGen/SystemZ/fp-copysign-03.ll new file mode 100644 index 0000000000000..909519e8ace55 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/fp-copysign-03.ll @@ -0,0 +1,145 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s --check-prefixes=CHECK,Z10 +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 \ +; RUN: | FileCheck %s --check-prefixes=CHECK,Z16 +; +; Test copysign intrinsics with half. + +declare half @llvm.copysign.f16(half, half) +declare float @llvm.copysign.f32(float, float) +declare double @llvm.copysign.f64(double, double) +declare fp128 @llvm.copysign.f128(fp128, fp128) + +; Test copysign with an f16 result and f16 sign argument. +define half @f0(half %a, half %b) { +; CHECK-LABEL: f0: +; CHECK: # %bb.0: +; CHECK-NEXT: cpsdr %f0, %f2, %f0 +; CHECK-NEXT: br %r14 + %res = call half @llvm.copysign.f16(half %a, half %b) + ret half %res +} + +; Test copysign with an f16 result and f32 sign argument. +define half @f1(half %a, float %b) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: cpsdr %f0, %f2, %f0 +; CHECK-NEXT: br %r14 + %bh = fptrunc float %b to half + %res = call half @llvm.copysign.f16(half %a, half %bh) + ret half %res +} + +; Test copysign with an f16 result and f64 sign argument. 
+define half @f2(half %a, double %b) { +; CHECK-LABEL: f2: +; CHECK: # %bb.0: +; CHECK-NEXT: cpsdr %f0, %f2, %f0 +; CHECK-NEXT: br %r14 + %bh = fptrunc double %b to half + %res = call half @llvm.copysign.f16(half %a, half %bh) + ret half %res +} + +; Test copysign with an f16 result and f128 sign argument. +; TODO: Let the DAGCombiner remove the fp_round. +define half @f3(half %a, fp128 %b) { +; Z10-LABEL: f3: +; Z10: # %bb.0: +; Z10-NEXT: stmg %r14, %r15, 112(%r15) +; Z10-NEXT: .cfi_offset %r14, -48 +; Z10-NEXT: .cfi_offset %r15, -40 +; Z10-NEXT: aghi %r15, -184 +; Z10-NEXT: .cfi_def_cfa_offset 344 +; Z10-NEXT: std %f8, 176(%r15) # 8-byte Spill +; Z10-NEXT: .cfi_offset %f8, -168 +; Z10-NEXT: ld %f1, 0(%r2) +; Z10-NEXT: ld %f3, 8(%r2) +; Z10-NEXT: ler %f8, %f0 +; Z10-NEXT: la %r2, 160(%r15) +; Z10-NEXT: std %f1, 160(%r15) +; Z10-NEXT: std %f3, 168(%r15) +; Z10-NEXT: brasl %r14, __trunctfhf2@PLT +; Z10-NEXT: cpsdr %f0, %f0, %f8 +; Z10-NEXT: ld %f8, 176(%r15) # 8-byte Reload +; Z10-NEXT: lmg %r14, %r15, 296(%r15) +; Z10-NEXT: br %r14 +; +; Z16-LABEL: f3: +; Z16: # %bb.0: +; Z16-NEXT: stmg %r14, %r15, 112(%r15) +; Z16-NEXT: .cfi_offset %r14, -48 +; Z16-NEXT: .cfi_offset %r15, -40 +; Z16-NEXT: aghi %r15, -184 +; Z16-NEXT: .cfi_def_cfa_offset 344 +; Z16-NEXT: std %f8, 176(%r15) # 8-byte Spill +; Z16-NEXT: .cfi_offset %f8, -168 +; Z16-NEXT: ldr %f8, %f0 +; Z16-NEXT: vl %v0, 0(%r2), 3 +; Z16-NEXT: la %r2, 160(%r15) +; Z16-NEXT: vst %v0, 160(%r15), 3 +; Z16-NEXT: brasl %r14, __trunctfhf2@PLT +; Z16-NEXT: cpsdr %f0, %f0, %f8 +; Z16-NEXT: ld %f8, 176(%r15) # 8-byte Reload +; Z16-NEXT: lmg %r14, %r15, 296(%r15) +; Z16-NEXT: br %r14 + %bh = fptrunc fp128 %b to half + %res = call half @llvm.copysign.f16(half %a, half %bh) + ret half %res +} + +; Test copysign with an f32 result and half sign argument. 
+define float @f4(float %a, half %b) { +; CHECK-LABEL: f4: +; CHECK: # %bb.0: +; CHECK-NEXT: cpsdr %f0, %f2, %f0 +; CHECK-NEXT: br %r14 + %bf = fpext half %b to float + %res = call float @llvm.copysign.f32(float %a, float %bf) + ret float %res +} + +; Test copysign with an f64 result and half sign argument. +define double @f5(double %a, half %b) { +; CHECK-LABEL: f5: +; CHECK: # %bb.0: +; CHECK-NEXT: cpsdr %f0, %f2, %f0 +; CHECK-NEXT: br %r14 + %bd = fpext half %b to double + %res = call double @llvm.copysign.f64(double %a, double %bd) + ret double %res +} + +; Test copysign with an f128 result and half sign argument. +define fp128 @f6(fp128 %a, half %b) { +; Z10-LABEL: f6: +; Z10: # %bb.0: +; Z10-NEXT: ld %f1, 0(%r3) +; Z10-NEXT: ld %f3, 8(%r3) +; Z10-NEXT: cpsdr %f1, %f0, %f1 +; Z10-NEXT: std %f1, 0(%r2) +; Z10-NEXT: std %f3, 8(%r2) +; Z10-NEXT: br %r14 +; +; Z16-LABEL: f6: +; Z16: # %bb.0: +; Z16-NEXT: aghi %r15, -168 +; Z16-NEXT: .cfi_def_cfa_offset 328 +; Z16-NEXT: vl %v1, 0(%r3), 3 +; Z16-NEXT: vsteh %v0, 164(%r15), 0 +; Z16-NEXT: tm 164(%r15), 128 +; Z16-NEXT: je .LBB6_2 +; Z16-NEXT: # %bb.1: +; Z16-NEXT: wflnxb %v0, %v1 +; Z16-NEXT: j .LBB6_3 +; Z16-NEXT: .LBB6_2: +; Z16-NEXT: wflpxb %v0, %v1 +; Z16-NEXT: .LBB6_3: +; Z16-NEXT: vst %v0, 0(%r2), 3 +; Z16-NEXT: aghi %r15, 168 +; Z16-NEXT: br %r14 + %bd = fpext half %b to fp128 + %res = call fp128 @llvm.copysign.f128(fp128 %a, fp128 %bd) + ret fp128 %res +} diff --git a/llvm/test/CodeGen/SystemZ/fp-div-01.ll b/llvm/test/CodeGen/SystemZ/fp-div-01.ll index d33e61bbd1eda..78df879613cb2 100644 --- a/llvm/test/CodeGen/SystemZ/fp-div-01.ll +++ b/llvm/test/CodeGen/SystemZ/fp-div-01.ll @@ -6,6 +6,18 @@ declare float @foo() +; Check register division. 
+define half @f0(half %f1, half %f2) { +; CHECK-LABEL: f0: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: debr %f0, %f9 +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %res = fdiv half %f1, %f2 + ret half %res +} + ; Check register division. define float @f1(float %f1, float %f2) { ; CHECK-LABEL: f1: diff --git a/llvm/test/CodeGen/SystemZ/fp-half-cmp.ll b/llvm/test/CodeGen/SystemZ/fp-half-cmp.ll new file mode 100644 index 0000000000000..2714d6ad9a92c --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/fp-half-cmp.ll @@ -0,0 +1,161 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 -verify-machineinstrs \ +; RUN: | FileCheck %s +; +; Some tests with comparisons and their uses involving 16-bit floating point. + +; fcmp half; select half +define half @fun0(half %Arg0, half %Arg1) { +; CHECK-LABEL: fun0: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -176 +; CHECK-NEXT: .cfi_def_cfa_offset 336 +; CHECK-NEXT: std %f8, 168(%r15) # 8-byte Spill +; CHECK-NEXT: std %f9, 160(%r15) # 8-byte Spill +; CHECK-NEXT: .cfi_offset %f8, -168 +; CHECK-NEXT: .cfi_offset %f9, -176 +; CHECK-NEXT: ldr %f8, %f2 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ldr %f9, %f0 +; CHECK-NEXT: ldr %f0, %f8 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ltebr %f1, %f9 +; CHECK-NEXT: # kill: def $f0s killed $f0s def $v0 +; CHECK-NEXT: jl .LBB0_2 +; CHECK-NEXT: # %bb.1: # %entry +; CHECK-NEXT: vgmf %v0, 2, 8 +; CHECK-NEXT: .LBB0_2: # %entry +; CHECK-NEXT: # kill: def $f0s killed $f0s killed $v0 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: ld %f8, 168(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f9, 160(%r15) # 8-byte Reload +; CHECK-NEXT: lmg %r14, %r15, 288(%r15) +; CHECK-NEXT: br %r14 +entry: + 
%cmp = fcmp olt half %Arg0, 0xH0000 + %cond = select i1 %cmp, half %Arg1, half 1.0 + ret half %cond +} + +; fcmp half; select i32 +define i32 @fun1(half %Arg0, i32 %Arg1) { +; CHECK-LABEL: fun1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r13, %r15, 104(%r15) +; CHECK-NEXT: .cfi_offset %r13, -56 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: lr %r13, %r2 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ltebr %f0, %f0 +; CHECK-NEXT: lochinl %r13, 0 +; CHECK-NEXT: lr %r2, %r13 +; CHECK-NEXT: lmg %r13, %r15, 264(%r15) +; CHECK-NEXT: br %r14 +entry: + %cmp = fcmp olt half %Arg0, 0xH0000 + %cond = select i1 %cmp, i32 %Arg1, i32 0 + ret i32 %cond +} + +; icmp i32; select half +define half @fun2(i32 %Arg0, half %Arg1) { +; CHECK-LABEL: fun2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r13, %r15, 104(%r15) +; CHECK-NEXT: .cfi_offset %r13, -56 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: lr %r13, %r2 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: # kill: def $f0s killed $f0s def $v0 +; CHECK-NEXT: cije %r13, 0, .LBB2_2 +; CHECK-NEXT: # %bb.1: # %entry +; CHECK-NEXT: vgmf %v0, 2, 8 +; CHECK-NEXT: .LBB2_2: # %entry +; CHECK-NEXT: # kill: def $f0s killed $f0s killed $v0 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: lmg %r13, %r15, 264(%r15) +; CHECK-NEXT: br %r14 +entry: + %cmp = icmp eq i32 %Arg0, 0 + %cond = select i1 %cmp, half %Arg1, half 1.0 + ret half %cond +} + +define i64 @fun3(i64 %a, i64 %b, half %f1, half %f2) #0 { +; CHECK-LABEL: fun3: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r12, %r15, 96(%r15) +; CHECK-NEXT: .cfi_offset %r12, -64 +; CHECK-NEXT: .cfi_offset %r13, -56 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -176 +; CHECK-NEXT: 
.cfi_def_cfa_offset 336 +; CHECK-NEXT: std %f8, 168(%r15) # 8-byte Spill +; CHECK-NEXT: std %f9, 160(%r15) # 8-byte Spill +; CHECK-NEXT: .cfi_offset %f8, -168 +; CHECK-NEXT: .cfi_offset %f9, -176 +; CHECK-NEXT: ldr %f8, %f0 +; CHECK-NEXT: ldr %f0, %f2 +; CHECK-NEXT: lgr %r13, %r3 +; CHECK-NEXT: lgr %r12, %r2 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ldr %f9, %f0 +; CHECK-NEXT: ldr %f0, %f8 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: cebr %f0, %f9 +; CHECK-NEXT: ld %f8, 168(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f9, 160(%r15) # 8-byte Reload +; CHECK-NEXT: selgre %r2, %r12, %r13 +; CHECK-NEXT: lmg %r12, %r15, 272(%r15) +; CHECK-NEXT: br %r14 + %cond = call i1 @llvm.experimental.constrained.fcmp.f32( + half %f1, half %f2, + metadata !"oeq", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i64 %a, i64 %b + ret i64 %res +} + +define half @fun4(half %Arg0, ptr %Dst) { +; CHECK-LABEL: fun4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r13, %r15, 104(%r15) +; CHECK-NEXT: .cfi_offset %r13, -56 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: lgr %r13, %r2 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ltebr %f0, %f0 +; CHECK-NEXT: je .LBB4_2 +; CHECK-NEXT: # %bb.1: # %store +; CHECK-NEXT: lzer %f0 +; CHECK-NEXT: vsteh %v0, 0(%r13), 0 +; CHECK-NEXT: .LBB4_2: # %exit +; CHECK-NEXT: lzer %f0 +; CHECK-NEXT: lmg %r13, %r15, 264(%r15) +; CHECK-NEXT: br %r14 +entry: + %cmp = fcmp oeq half %Arg0, 0.0 + br i1 %cmp, label %exit, label %store + +store: + store half 0.0, ptr %Dst + br label %exit + +exit: + ret half 0.0 +} + +declare i1 @llvm.experimental.constrained.fcmp.f32(half, half, metadata, metadata) diff --git a/llvm/test/CodeGen/SystemZ/fp-half-libcall.ll b/llvm/test/CodeGen/SystemZ/fp-half-libcall.ll new file mode 100644 index 0000000000000..d8db549388c46 --- /dev/null +++ 
b/llvm/test/CodeGen/SystemZ/fp-half-libcall.ll @@ -0,0 +1,312 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; Test that library calls are emitted for LLVM IR intrinsics +; +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s + +define half @f1(half %x, i16 %y) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r13, %r15, 104(%r15) +; CHECK-NEXT: .cfi_offset %r13, -56 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: lhr %r13, %r2 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: llgfr %r2, %r13 +; CHECK-NEXT: brasl %r14, __powisf2@PLT +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: lmg %r13, %r15, 264(%r15) +; CHECK-NEXT: br %r14 + %tmp = call half @llvm.powi.f16.i16(half %x, i16 %y) + ret half %tmp +} + +define half @f2(half %x, half %y) { +; CHECK-LABEL: f2: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -176 +; CHECK-NEXT: .cfi_def_cfa_offset 336 +; CHECK-NEXT: std %f8, 168(%r15) # 8-byte Spill +; CHECK-NEXT: std %f9, 160(%r15) # 8-byte Spill +; CHECK-NEXT: .cfi_offset %f8, -168 +; CHECK-NEXT: .cfi_offset %f9, -176 +; CHECK-NEXT: ler %f8, %f2 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f9, %f0 +; CHECK-NEXT: ler %f0, %f8 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f2, %f0 +; CHECK-NEXT: ler %f0, %f9 +; CHECK-NEXT: brasl %r14, powf@PLT +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: ld %f8, 168(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f9, 160(%r15) # 8-byte Reload +; CHECK-NEXT: lmg %r14, %r15, 288(%r15) +; CHECK-NEXT: br %r14 + %tmp = call half @llvm.pow.f16(half %x, half %y) + ret half %tmp +} + +define half @f3(half %x) { +; CHECK-LABEL: f3: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r14, %r15, 
112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: brasl %r14, sinf@PLT +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 + %tmp = call half @llvm.sin.f16(half %x) + ret half %tmp +} + +define half @f4(half %x) { +; CHECK-LABEL: f4: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: brasl %r14, cosf@PLT +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 + %tmp = call half @llvm.cos.f16(half %x) + ret half %tmp +} + +define half @f5(half %x) { +; CHECK-LABEL: f5: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: brasl %r14, expf@PLT +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 + %tmp = call half @llvm.exp.f16(half %x) + ret half %tmp +} + +define half @f6(half %x) { +; CHECK-LABEL: f6: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: brasl %r14, exp2f@PLT +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 + %tmp = call half @llvm.exp2.f16(half %x) + ret half %tmp +} + +define half @f7(half %x) { +; CHECK-LABEL: f7: +; CHECK: # %bb.0: +; 
CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: brasl %r14, logf@PLT +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 + %tmp = call half @llvm.log.f16(half %x) + ret half %tmp +} + +define half @f8(half %x) { +; CHECK-LABEL: f8: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: brasl %r14, log2f@PLT +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 + %tmp = call half @llvm.log2.f16(half %x) + ret half %tmp +} + +define half @f9(half %x) { +; CHECK-LABEL: f9: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: brasl %r14, log10f@PLT +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 + %tmp = call half @llvm.log10.f16(half %x) + ret half %tmp +} + +define half @f10(half %x, half %y) { +; CHECK-LABEL: f10: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -176 +; CHECK-NEXT: .cfi_def_cfa_offset 336 +; CHECK-NEXT: std %f8, 168(%r15) # 8-byte Spill +; CHECK-NEXT: std %f9, 160(%r15) # 8-byte Spill +; CHECK-NEXT: .cfi_offset %f8, -168 +; CHECK-NEXT: .cfi_offset %f9, -176 +; CHECK-NEXT: ler %f8, %f2 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f9, %f0 +; 
CHECK-NEXT: ler %f0, %f8 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f2, %f0 +; CHECK-NEXT: ler %f0, %f9 +; CHECK-NEXT: brasl %r14, fminf@PLT +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: ld %f8, 168(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f9, 160(%r15) # 8-byte Reload +; CHECK-NEXT: lmg %r14, %r15, 288(%r15) +; CHECK-NEXT: br %r14 + %tmp = call half @llvm.minnum.f16(half %x, half %y) + ret half %tmp +} + +define half @f11(half %x, half %y) { +; CHECK-LABEL: f11: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -176 +; CHECK-NEXT: .cfi_def_cfa_offset 336 +; CHECK-NEXT: std %f8, 168(%r15) # 8-byte Spill +; CHECK-NEXT: std %f9, 160(%r15) # 8-byte Spill +; CHECK-NEXT: .cfi_offset %f8, -168 +; CHECK-NEXT: .cfi_offset %f9, -176 +; CHECK-NEXT: ler %f8, %f2 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f9, %f0 +; CHECK-NEXT: ler %f0, %f8 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f2, %f0 +; CHECK-NEXT: ler %f0, %f9 +; CHECK-NEXT: brasl %r14, fmaxf@PLT +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: ld %f8, 168(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f9, 160(%r15) # 8-byte Reload +; CHECK-NEXT: lmg %r14, %r15, 288(%r15) +; CHECK-NEXT: br %r14 + %tmp = call half @llvm.maxnum.f16(half %x, half %y) + ret half %tmp +} + +; Verify that "nnan" minnum/maxnum calls are transformed to +; compare+select sequences instead of libcalls. 
+define half @f12(half %x, half %y) { +; CHECK-LABEL: f12: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -176 +; CHECK-NEXT: .cfi_def_cfa_offset 336 +; CHECK-NEXT: std %f8, 168(%r15) # 8-byte Spill +; CHECK-NEXT: std %f9, 160(%r15) # 8-byte Spill +; CHECK-NEXT: .cfi_offset %f8, -168 +; CHECK-NEXT: .cfi_offset %f9, -176 +; CHECK-NEXT: ler %f9, %f0 +; CHECK-NEXT: ler %f0, %f2 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f8, %f0 +; CHECK-NEXT: ler %f0, %f9 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: cebr %f0, %f8 +; CHECK-NEXT: jl .LBB11_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ler %f0, %f8 +; CHECK-NEXT: .LBB11_2: +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: ld %f8, 168(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f9, 160(%r15) # 8-byte Reload +; CHECK-NEXT: lmg %r14, %r15, 288(%r15) +; CHECK-NEXT: br %r14 + %tmp = call nnan half @llvm.minnum.f16(half %x, half %y) + ret half %tmp +} + +define half @f13(half %x, half %y) { +; CHECK-LABEL: f13: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -176 +; CHECK-NEXT: .cfi_def_cfa_offset 336 +; CHECK-NEXT: std %f8, 168(%r15) # 8-byte Spill +; CHECK-NEXT: std %f9, 160(%r15) # 8-byte Spill +; CHECK-NEXT: .cfi_offset %f8, -168 +; CHECK-NEXT: .cfi_offset %f9, -176 +; CHECK-NEXT: ler %f9, %f0 +; CHECK-NEXT: ler %f0, %f2 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f8, %f0 +; CHECK-NEXT: ler %f0, %f9 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: cebr %f0, %f8 +; CHECK-NEXT: jh .LBB12_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ler %f0, %f8 +; CHECK-NEXT: .LBB12_2: +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: ld %f8, 168(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f9, 160(%r15) # 8-byte Reload +; CHECK-NEXT: lmg %r14, 
%r15, 288(%r15) +; CHECK-NEXT: br %r14 + %tmp = call nnan half @llvm.maxnum.f16(half %x, half %y) + ret half %tmp +} + +declare half @llvm.powi.f16.i16(half, i16) +declare half @llvm.pow.f16(half, half) + +declare half @llvm.sin.f16(half) +declare half @llvm.cos.f16(half) + +declare half @llvm.exp.f16(half) +declare half @llvm.exp2.f16(half) + +declare half @llvm.log.f16(half) +declare half @llvm.log2.f16(half) +declare half @llvm.log10.f16(half) + +declare half @llvm.minnum.f16(half, half) +declare half @llvm.maxnum.f16(half, half) diff --git a/llvm/test/CodeGen/SystemZ/fp-half-mem.ll b/llvm/test/CodeGen/SystemZ/fp-half-mem.ll new file mode 100644 index 0000000000000..5988a379b3d9a --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/fp-half-mem.ll @@ -0,0 +1,63 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=zEC12 -verify-machineinstrs \ +; RUN: | FileCheck %s --check-prefix=NOVEC +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 -verify-machineinstrs \ +; RUN: | FileCheck %s --check-prefix=VECTOR + +declare void @foo(ptr) + +; Test an alloca. 
+define half @f1() { +; NOVEC-LABEL: f1: +; NOVEC: # %bb.0: +; NOVEC-NEXT: stmg %r14, %r15, 112(%r15) +; NOVEC-NEXT: .cfi_offset %r14, -48 +; NOVEC-NEXT: .cfi_offset %r15, -40 +; NOVEC-NEXT: aghi %r15, -168 +; NOVEC-NEXT: .cfi_def_cfa_offset 328 +; NOVEC-NEXT: la %r2, 166(%r15) +; NOVEC-NEXT: brasl %r14, foo@PLT +; NOVEC-NEXT: lgh %r0, 166(%r15) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ldgr %f0, %r0 +; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d +; NOVEC-NEXT: lmg %r14, %r15, 280(%r15) +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: f1: +; VECTOR: # %bb.0: +; VECTOR-NEXT: stmg %r14, %r15, 112(%r15) +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -168 +; VECTOR-NEXT: .cfi_def_cfa_offset 328 +; VECTOR-NEXT: la %r2, 166(%r15) +; VECTOR-NEXT: brasl %r14, foo@PLT +; VECTOR-NEXT: vlreph %v0, 166(%r15) +; VECTOR-NEXT: lmg %r14, %r15, 280(%r15) +; VECTOR-NEXT: br %r14 + %ptr = alloca half + call void @foo(ptr %ptr) + %orig = load half, ptr %ptr + ret half %orig +} + +; Test accessing a half element of an aggregate type. 
+%s.half = type { half, half, half, half, half } +define half @f2(ptr %P) { +; NOVEC-LABEL: f2: +; NOVEC: # %bb.0: +; NOVEC-NEXT: lgh %r0, 6(%r2) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ldgr %f0, %r0 +; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: f2: +; VECTOR: # %bb.0: +; VECTOR-NEXT: vlreph %v0, 6(%r2) +; VECTOR-NEXT: br %r14 + %gep = getelementptr inbounds %s.half, ptr %P, i64 0, i32 3 + %res = load half, ptr %gep + ret half %res +} diff --git a/llvm/test/CodeGen/SystemZ/fp-half-move.ll b/llvm/test/CodeGen/SystemZ/fp-half-move.ll new file mode 100644 index 0000000000000..1e761d4c70b22 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/fp-half-move.ll @@ -0,0 +1,83 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=zEC12 -verify-machineinstrs \ +; RUN: | FileCheck %s --check-prefix=NOVEC +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 -verify-machineinstrs \ +; RUN: | FileCheck %s --check-prefix=VECTOR +; +; Test moves (bitcasts) between i16 and half. 
+ +define half @f1(ptr %ptr) { +; NOVEC-LABEL: f1: +; NOVEC: # %bb.0: +; NOVEC-NEXT: lh %r0, 0(%r2) +; NOVEC-NEXT: oill %r0, 255 +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ldgr %f0, %r0 +; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: f1: +; VECTOR: # %bb.0: +; VECTOR-NEXT: lh %r0, 0(%r2) +; VECTOR-NEXT: oill %r0, 255 +; VECTOR-NEXT: vlvgh %v0, %r0, 0 +; VECTOR-NEXT: br %r14 + %L = load i16, ptr %ptr + %O = or i16 %L, 255 + %res = bitcast i16 %O to half + ret half %res +} + +define half @f2(i16 %Arg) { +; NOVEC-LABEL: f2: +; NOVEC: # %bb.0: +; NOVEC-NEXT: # kill: def $r2l killed $r2l def $r2d +; NOVEC-NEXT: sllg %r0, %r2, 48 +; NOVEC-NEXT: ldgr %f0, %r0 +; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: f2: +; VECTOR: # %bb.0: +; VECTOR-NEXT: vlvgh %v0, %r2, 0 +; VECTOR-NEXT: br %r14 + %res = bitcast i16 %Arg to half + ret half %res +} + +define void @f3(half %val, ptr %ptr) { +; NOVEC-LABEL: f3: +; NOVEC: # %bb.0: +; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d +; NOVEC-NEXT: lgdr %r0, %f0 +; NOVEC-NEXT: srlg %r0, %r0, 48 +; NOVEC-NEXT: stc %r0, 0(%r2) +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: f3: +; VECTOR: # %bb.0: +; VECTOR-NEXT: vlgvh %r0, %v0, 0 +; VECTOR-NEXT: stc %r0, 0(%r2) +; VECTOR-NEXT: br %r14 + %res = bitcast half %val to i16 + %trunc = trunc i16 %res to i8 + store i8 %trunc, ptr %ptr + ret void +} + +define i16 @f4(half %Arg) { +; NOVEC-LABEL: f4: +; NOVEC: # %bb.0: +; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d +; NOVEC-NEXT: lgdr %r0, %f0 +; NOVEC-NEXT: srlg %r2, %r0, 48 +; NOVEC-NEXT: # kill: def $r2l killed $r2l killed $r2d +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: f4: +; VECTOR: # %bb.0: +; VECTOR-NEXT: vlgvh %r2, %v0, 0 +; VECTOR-NEXT: br %r14 + %res = bitcast half %Arg to i16 + ret i16 %res +} diff --git a/llvm/test/CodeGen/SystemZ/fp-half-strict.ll b/llvm/test/CodeGen/SystemZ/fp-half-strict.ll new file mode 100644 index 
0000000000000..4f58eb4c6cb20 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/fp-half-strict.ll @@ -0,0 +1,207 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=zEC12 -verify-machineinstrs \ +; RUN: | FileCheck %s --check-prefix=NOVEC +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 -verify-machineinstrs \ +; RUN: | FileCheck %s --check-prefix=VECTOR +; +; Tests for strict 16-bit floating point (half). + +declare half @llvm.experimental.constrained.fadd.f16(half, half, metadata, metadata) +declare double @llvm.experimental.constrained.fadd.f64(double, double, metadata, metadata) +declare half @llvm.experimental.constrained.fmul.f16(half, half, metadata, metadata) + +; Test register addition. +define half @fun0(half %f1, half %f2) #0 { +; NOVEC-LABEL: fun0: +; NOVEC: # %bb.0: +; NOVEC-NEXT: stmg %r14, %r15, 112(%r15) +; NOVEC-NEXT: .cfi_offset %r14, -48 +; NOVEC-NEXT: .cfi_offset %r15, -40 +; NOVEC-NEXT: aghi %r15, -176 +; NOVEC-NEXT: .cfi_def_cfa_offset 336 +; NOVEC-NEXT: std %f8, 168(%r15) # 8-byte Spill +; NOVEC-NEXT: std %f9, 160(%r15) # 8-byte Spill +; NOVEC-NEXT: .cfi_offset %f8, -168 +; NOVEC-NEXT: .cfi_offset %f9, -176 +; NOVEC-NEXT: ler %f8, %f0 +; NOVEC-NEXT: ler %f0, %f2 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: ler %f9, %f0 +; NOVEC-NEXT: ler %f0, %f8 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: aebr %f0, %f9 +; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT +; NOVEC-NEXT: ld %f8, 168(%r15) # 8-byte Reload +; NOVEC-NEXT: ld %f9, 160(%r15) # 8-byte Reload +; NOVEC-NEXT: lmg %r14, %r15, 288(%r15) +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: fun0: +; VECTOR: # %bb.0: +; VECTOR-NEXT: stmg %r14, %r15, 112(%r15) +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -176 +; VECTOR-NEXT: .cfi_def_cfa_offset 336 +; VECTOR-NEXT: std %f8, 168(%r15) # 8-byte Spill +; VECTOR-NEXT: std %f9, 
160(%r15) # 8-byte Spill +; VECTOR-NEXT: .cfi_offset %f8, -168 +; VECTOR-NEXT: .cfi_offset %f9, -176 +; VECTOR-NEXT: ldr %f8, %f0 +; VECTOR-NEXT: ldr %f0, %f2 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f9, %f0 +; VECTOR-NEXT: ldr %f0, %f8 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f9 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: ld %f8, 168(%r15) # 8-byte Reload +; VECTOR-NEXT: ld %f9, 160(%r15) # 8-byte Reload +; VECTOR-NEXT: lmg %r14, %r15, 288(%r15) +; VECTOR-NEXT: br %r14 + %res = call half @llvm.experimental.constrained.fadd.f16( + half %f1, half %f2, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret half %res +} + +; Test atomic memory accesses and extension/truncation inside a strictfp +; function. +define void @fun1(ptr %Src, ptr %Dst) #0 { +; NOVEC-LABEL: fun1: +; NOVEC: # %bb.0: # %entry +; NOVEC-NEXT: stmg %r13, %r15, 104(%r15) +; NOVEC-NEXT: .cfi_offset %r13, -56 +; NOVEC-NEXT: .cfi_offset %r14, -48 +; NOVEC-NEXT: .cfi_offset %r15, -40 +; NOVEC-NEXT: aghi %r15, -160 +; NOVEC-NEXT: .cfi_def_cfa_offset 320 +; NOVEC-NEXT: lgh %r0, 0(%r2) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: lgr %r13, %r3 +; NOVEC-NEXT: ldgr %f0, %r0 +; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d +; NOVEC-NEXT: brasl %r14, __extendhfdf2@PLT +; NOVEC-NEXT: adbr %f0, %f0 +; NOVEC-NEXT: brasl %r14, __truncdfhf2@PLT +; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d +; NOVEC-NEXT: lgdr %r0, %f0 +; NOVEC-NEXT: srlg %r0, %r0, 48 +; NOVEC-NEXT: sth %r0, 0(%r13) +; NOVEC-NEXT: bcr 14, %r0 +; NOVEC-NEXT: lmg %r13, %r15, 264(%r15) +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: fun1: +; VECTOR: # %bb.0: # %entry +; VECTOR-NEXT: stmg %r13, %r15, 104(%r15) +; VECTOR-NEXT: .cfi_offset %r13, -56 +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -160 +; VECTOR-NEXT: .cfi_def_cfa_offset 320 +; VECTOR-NEXT: vlreph %v0, 0(%r2) +; VECTOR-NEXT: lgr %r13, 
%r3 +; VECTOR-NEXT: brasl %r14, __extendhfdf2@PLT +; VECTOR-NEXT: adbr %f0, %f0 +; VECTOR-NEXT: brasl %r14, __truncdfhf2@PLT +; VECTOR-NEXT: vsteh %v0, 0(%r13), 0 +; VECTOR-NEXT: bcr 14, %r0 +; VECTOR-NEXT: lmg %r13, %r15, 264(%r15) +; VECTOR-NEXT: br %r14 +entry: + %Op0 = load atomic half, ptr %Src seq_cst, align 2 + %E0 = fpext half %Op0 to double + %Add = call double @llvm.experimental.constrained.fadd.f64( + double %E0, double %E0, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + %Res = fptrunc double %Add to half + store atomic half %Res, ptr %Dst seq_cst, align 2 + ret void +} + +; Test a chain of half operations which should have each operation surrounded +; by conversions to/from fp32 for proper emulation. +define half @fun2(half %Op0, half %Op1, half %Op2) #0 { +; NOVEC-LABEL: fun2: +; NOVEC: # %bb.0: # %entry +; NOVEC-NEXT: stmg %r14, %r15, 112(%r15) +; NOVEC-NEXT: .cfi_offset %r14, -48 +; NOVEC-NEXT: .cfi_offset %r15, -40 +; NOVEC-NEXT: aghi %r15, -184 +; NOVEC-NEXT: .cfi_def_cfa_offset 344 +; NOVEC-NEXT: std %f8, 176(%r15) # 8-byte Spill +; NOVEC-NEXT: std %f9, 168(%r15) # 8-byte Spill +; NOVEC-NEXT: std %f10, 160(%r15) # 8-byte Spill +; NOVEC-NEXT: .cfi_offset %f8, -168 +; NOVEC-NEXT: .cfi_offset %f9, -176 +; NOVEC-NEXT: .cfi_offset %f10, -184 +; NOVEC-NEXT: ler %f9, %f0 +; NOVEC-NEXT: ler %f0, %f2 +; NOVEC-NEXT: ler %f8, %f4 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: ler %f10, %f0 +; NOVEC-NEXT: ler %f0, %f9 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: meebr %f0, %f10 +; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: ler %f9, %f0 +; NOVEC-NEXT: ler %f0, %f8 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: meebr %f0, %f9 +; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT +; NOVEC-NEXT: ld %f8, 176(%r15) # 8-byte Reload +; NOVEC-NEXT: ld %f9, 168(%r15) # 8-byte Reload +; NOVEC-NEXT: ld %f10, 160(%r15) # 8-byte Reload +; NOVEC-NEXT: lmg %r14, %r15, 
296(%r15) +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: fun2: +; VECTOR: # %bb.0: # %entry +; VECTOR-NEXT: stmg %r14, %r15, 112(%r15) +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -184 +; VECTOR-NEXT: .cfi_def_cfa_offset 344 +; VECTOR-NEXT: std %f8, 176(%r15) # 8-byte Spill +; VECTOR-NEXT: std %f9, 168(%r15) # 8-byte Spill +; VECTOR-NEXT: std %f10, 160(%r15) # 8-byte Spill +; VECTOR-NEXT: .cfi_offset %f8, -168 +; VECTOR-NEXT: .cfi_offset %f9, -176 +; VECTOR-NEXT: .cfi_offset %f10, -184 +; VECTOR-NEXT: ldr %f9, %f0 +; VECTOR-NEXT: ldr %f0, %f2 +; VECTOR-NEXT: ldr %f8, %f4 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f10, %f0 +; VECTOR-NEXT: ldr %f0, %f9 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: meebr %f0, %f10 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f9, %f0 +; VECTOR-NEXT: ldr %f0, %f8 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: wfmsb %f0, %f9, %f0 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: ld %f8, 176(%r15) # 8-byte Reload +; VECTOR-NEXT: ld %f9, 168(%r15) # 8-byte Reload +; VECTOR-NEXT: ld %f10, 160(%r15) # 8-byte Reload +; VECTOR-NEXT: lmg %r14, %r15, 296(%r15) +; VECTOR-NEXT: br %r14 +entry: + %A0 = call half @llvm.experimental.constrained.fmul.f16( + half %Op0, half %Op1, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + %Res = call half @llvm.experimental.constrained.fmul.f16( + half %A0, half %Op2, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret half %Res +} + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/fp-half-vector.ll b/llvm/test/CodeGen/SystemZ/fp-half-vector.ll new file mode 100644 index 0000000000000..4997c5b0c617d --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/fp-half-vector.ll @@ -0,0 +1,725 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 
5 +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=zEC12 -verify-machineinstrs \ +; RUN: | FileCheck %s --check-prefix=NOVEC +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 -verify-machineinstrs \ +; RUN: | FileCheck %s --check-prefix=VECTOR + +; Add the <8 x half> argument with itself and return it. +define <8 x half> @fun0(<8 x half> %Op) { +; NOVEC-LABEL: fun0: +; NOVEC: # %bb.0: # %entry +; NOVEC-NEXT: stmg %r13, %r15, 104(%r15) +; NOVEC-NEXT: .cfi_offset %r13, -56 +; NOVEC-NEXT: .cfi_offset %r14, -48 +; NOVEC-NEXT: .cfi_offset %r15, -40 +; NOVEC-NEXT: aghi %r15, -224 +; NOVEC-NEXT: .cfi_def_cfa_offset 384 +; NOVEC-NEXT: std %f8, 216(%r15) # 8-byte Spill +; NOVEC-NEXT: std %f9, 208(%r15) # 8-byte Spill +; NOVEC-NEXT: std %f10, 200(%r15) # 8-byte Spill +; NOVEC-NEXT: std %f11, 192(%r15) # 8-byte Spill +; NOVEC-NEXT: std %f12, 184(%r15) # 8-byte Spill +; NOVEC-NEXT: std %f13, 176(%r15) # 8-byte Spill +; NOVEC-NEXT: std %f14, 168(%r15) # 8-byte Spill +; NOVEC-NEXT: std %f15, 160(%r15) # 8-byte Spill +; NOVEC-NEXT: .cfi_offset %f8, -168 +; NOVEC-NEXT: .cfi_offset %f9, -176 +; NOVEC-NEXT: .cfi_offset %f10, -184 +; NOVEC-NEXT: .cfi_offset %f11, -192 +; NOVEC-NEXT: .cfi_offset %f12, -200 +; NOVEC-NEXT: .cfi_offset %f13, -208 +; NOVEC-NEXT: .cfi_offset %f14, -216 +; NOVEC-NEXT: .cfi_offset %f15, -224 +; NOVEC-NEXT: lgh %r0, 414(%r15) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ldgr %f14, %r0 +; NOVEC-NEXT: lgh %r0, 406(%r15) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ldgr %f12, %r0 +; NOVEC-NEXT: lgh %r0, 398(%r15) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ldgr %f9, %r0 +; NOVEC-NEXT: lgh %r0, 390(%r15) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ler %f10, %f6 +; NOVEC-NEXT: ler %f11, %f4 +; NOVEC-NEXT: ler %f13, %f2 +; NOVEC-NEXT: ler %f15, %f0 +; NOVEC-NEXT: lgr %r13, %r2 +; NOVEC-NEXT: ldgr %f0, %r0 +; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: aebr %f0, %f0 +; NOVEC-NEXT: 
brasl %r14, __truncsfhf2@PLT +; NOVEC-NEXT: ler %f8, %f0 +; NOVEC-NEXT: ler %f0, %f9 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: aebr %f0, %f0 +; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT +; NOVEC-NEXT: ler %f9, %f0 +; NOVEC-NEXT: ler %f0, %f12 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: aebr %f0, %f0 +; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT +; NOVEC-NEXT: ler %f12, %f0 +; NOVEC-NEXT: ler %f0, %f14 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: aebr %f0, %f0 +; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT +; NOVEC-NEXT: ler %f14, %f0 +; NOVEC-NEXT: ler %f0, %f15 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: aebr %f0, %f0 +; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT +; NOVEC-NEXT: ler %f15, %f0 +; NOVEC-NEXT: ler %f0, %f13 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: aebr %f0, %f0 +; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT +; NOVEC-NEXT: ler %f13, %f0 +; NOVEC-NEXT: ler %f0, %f11 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: aebr %f0, %f0 +; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT +; NOVEC-NEXT: ler %f11, %f0 +; NOVEC-NEXT: ler %f0, %f10 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: aebr %f0, %f0 +; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT +; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d +; NOVEC-NEXT: lgdr %r0, %f0 +; NOVEC-NEXT: srlg %r0, %r0, 48 +; NOVEC-NEXT: sth %r0, 6(%r13) +; NOVEC-NEXT: lgdr %r0, %f11 +; NOVEC-NEXT: srlg %r0, %r0, 48 +; NOVEC-NEXT: sth %r0, 4(%r13) +; NOVEC-NEXT: lgdr %r0, %f13 +; NOVEC-NEXT: srlg %r0, %r0, 48 +; NOVEC-NEXT: sth %r0, 2(%r13) +; NOVEC-NEXT: lgdr %r0, %f15 +; NOVEC-NEXT: srlg %r0, %r0, 48 +; NOVEC-NEXT: sth %r0, 0(%r13) +; NOVEC-NEXT: lgdr %r0, %f14 +; NOVEC-NEXT: srlg %r0, %r0, 48 +; NOVEC-NEXT: sth %r0, 14(%r13) +; NOVEC-NEXT: lgdr %r0, %f12 +; NOVEC-NEXT: srlg %r0, %r0, 48 +; NOVEC-NEXT: sth %r0, 12(%r13) +; NOVEC-NEXT: lgdr %r0, %f9 +; NOVEC-NEXT: srlg %r0, %r0, 48 +; NOVEC-NEXT: sth %r0, 10(%r13) +; NOVEC-NEXT: lgdr 
%r0, %f8 +; NOVEC-NEXT: srlg %r0, %r0, 48 +; NOVEC-NEXT: sth %r0, 8(%r13) +; NOVEC-NEXT: ld %f8, 216(%r15) # 8-byte Reload +; NOVEC-NEXT: ld %f9, 208(%r15) # 8-byte Reload +; NOVEC-NEXT: ld %f10, 200(%r15) # 8-byte Reload +; NOVEC-NEXT: ld %f11, 192(%r15) # 8-byte Reload +; NOVEC-NEXT: ld %f12, 184(%r15) # 8-byte Reload +; NOVEC-NEXT: ld %f13, 176(%r15) # 8-byte Reload +; NOVEC-NEXT: ld %f14, 168(%r15) # 8-byte Reload +; NOVEC-NEXT: ld %f15, 160(%r15) # 8-byte Reload +; NOVEC-NEXT: lmg %r13, %r15, 328(%r15) +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: fun0: +; VECTOR: # %bb.0: # %entry +; VECTOR-NEXT: stmg %r13, %r15, 104(%r15) +; VECTOR-NEXT: .cfi_offset %r13, -56 +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -224 +; VECTOR-NEXT: .cfi_def_cfa_offset 384 +; VECTOR-NEXT: std %f8, 216(%r15) # 8-byte Spill +; VECTOR-NEXT: std %f9, 208(%r15) # 8-byte Spill +; VECTOR-NEXT: std %f10, 200(%r15) # 8-byte Spill +; VECTOR-NEXT: std %f11, 192(%r15) # 8-byte Spill +; VECTOR-NEXT: std %f12, 184(%r15) # 8-byte Spill +; VECTOR-NEXT: std %f13, 176(%r15) # 8-byte Spill +; VECTOR-NEXT: std %f14, 168(%r15) # 8-byte Spill +; VECTOR-NEXT: std %f15, 160(%r15) # 8-byte Spill +; VECTOR-NEXT: .cfi_offset %f8, -168 +; VECTOR-NEXT: .cfi_offset %f9, -176 +; VECTOR-NEXT: .cfi_offset %f10, -184 +; VECTOR-NEXT: .cfi_offset %f11, -192 +; VECTOR-NEXT: .cfi_offset %f12, -200 +; VECTOR-NEXT: .cfi_offset %f13, -208 +; VECTOR-NEXT: .cfi_offset %f14, -216 +; VECTOR-NEXT: .cfi_offset %f15, -224 +; VECTOR-NEXT: vlreph %v11, 414(%r15) +; VECTOR-NEXT: vlreph %v12, 406(%r15) +; VECTOR-NEXT: vlreph %v13, 398(%r15) +; VECTOR-NEXT: vlreph %v14, 390(%r15) +; VECTOR-NEXT: ldr %f8, %f6 +; VECTOR-NEXT: ldr %f9, %f4 +; VECTOR-NEXT: ldr %f10, %f2 +; VECTOR-NEXT: lgr %r13, %r2 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f0 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: ldr %f15, %f0 +; VECTOR-NEXT: ldr %f0, %f10 +; 
VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f0 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: ldr %f10, %f0 +; VECTOR-NEXT: ldr %f0, %f9 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f0 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: ldr %f9, %f0 +; VECTOR-NEXT: ldr %f0, %f8 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f0 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: ldr %f8, %f0 +; VECTOR-NEXT: ldr %f0, %f14 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f0 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: ldr %f14, %f0 +; VECTOR-NEXT: ldr %f0, %f13 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f0 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: ldr %f13, %f0 +; VECTOR-NEXT: ldr %f0, %f12 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f0 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: ldr %f12, %f0 +; VECTOR-NEXT: ldr %f0, %f11 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f0 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: vsteh %v0, 14(%r13), 0 +; VECTOR-NEXT: vsteh %v12, 12(%r13), 0 +; VECTOR-NEXT: vsteh %v13, 10(%r13), 0 +; VECTOR-NEXT: vsteh %v14, 8(%r13), 0 +; VECTOR-NEXT: vsteh %v8, 6(%r13), 0 +; VECTOR-NEXT: vsteh %v9, 4(%r13), 0 +; VECTOR-NEXT: vsteh %v10, 2(%r13), 0 +; VECTOR-NEXT: vsteh %v15, 0(%r13), 0 +; VECTOR-NEXT: ld %f8, 216(%r15) # 8-byte Reload +; VECTOR-NEXT: ld %f9, 208(%r15) # 8-byte Reload +; VECTOR-NEXT: ld %f10, 200(%r15) # 8-byte Reload +; VECTOR-NEXT: ld %f11, 192(%r15) # 8-byte Reload +; VECTOR-NEXT: ld %f12, 184(%r15) # 8-byte Reload +; VECTOR-NEXT: ld %f13, 176(%r15) # 8-byte Reload +; VECTOR-NEXT: ld %f14, 168(%r15) # 8-byte Reload +; VECTOR-NEXT: ld %f15, 160(%r15) # 8-byte Reload +; VECTOR-NEXT: lmg %r13, %r15, 328(%r15) +; VECTOR-NEXT: br %r14 +entry: + %Res 
= fadd <8 x half> %Op, %Op + ret <8 x half> %Res +} + +; Same, but with partial vector values. +define <4 x half> @fun1(<4 x half> %Op) { +; NOVEC-LABEL: fun1: +; NOVEC: # %bb.0: # %entry +; NOVEC-NEXT: stmg %r14, %r15, 112(%r15) +; NOVEC-NEXT: .cfi_offset %r14, -48 +; NOVEC-NEXT: .cfi_offset %r15, -40 +; NOVEC-NEXT: aghi %r15, -192 +; NOVEC-NEXT: .cfi_def_cfa_offset 352 +; NOVEC-NEXT: std %f8, 184(%r15) # 8-byte Spill +; NOVEC-NEXT: std %f9, 176(%r15) # 8-byte Spill +; NOVEC-NEXT: std %f10, 168(%r15) # 8-byte Spill +; NOVEC-NEXT: std %f11, 160(%r15) # 8-byte Spill +; NOVEC-NEXT: .cfi_offset %f8, -168 +; NOVEC-NEXT: .cfi_offset %f9, -176 +; NOVEC-NEXT: .cfi_offset %f10, -184 +; NOVEC-NEXT: .cfi_offset %f11, -192 +; NOVEC-NEXT: ler %f8, %f6 +; NOVEC-NEXT: ler %f9, %f4 +; NOVEC-NEXT: ler %f10, %f2 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: aebr %f0, %f0 +; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT +; NOVEC-NEXT: ler %f11, %f0 +; NOVEC-NEXT: ler %f0, %f10 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: aebr %f0, %f0 +; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT +; NOVEC-NEXT: ler %f10, %f0 +; NOVEC-NEXT: ler %f0, %f9 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: aebr %f0, %f0 +; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT +; NOVEC-NEXT: ler %f9, %f0 +; NOVEC-NEXT: ler %f0, %f8 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: aebr %f0, %f0 +; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT +; NOVEC-NEXT: ler %f6, %f0 +; NOVEC-NEXT: ler %f0, %f11 +; NOVEC-NEXT: ler %f2, %f10 +; NOVEC-NEXT: ler %f4, %f9 +; NOVEC-NEXT: ld %f8, 184(%r15) # 8-byte Reload +; NOVEC-NEXT: ld %f9, 176(%r15) # 8-byte Reload +; NOVEC-NEXT: ld %f10, 168(%r15) # 8-byte Reload +; NOVEC-NEXT: ld %f11, 160(%r15) # 8-byte Reload +; NOVEC-NEXT: lmg %r14, %r15, 304(%r15) +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: fun1: +; VECTOR: # %bb.0: # %entry +; VECTOR-NEXT: stmg %r14, %r15, 112(%r15) +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 
+; VECTOR-NEXT: aghi %r15, -192 +; VECTOR-NEXT: .cfi_def_cfa_offset 352 +; VECTOR-NEXT: std %f8, 184(%r15) # 8-byte Spill +; VECTOR-NEXT: std %f9, 176(%r15) # 8-byte Spill +; VECTOR-NEXT: std %f10, 168(%r15) # 8-byte Spill +; VECTOR-NEXT: std %f11, 160(%r15) # 8-byte Spill +; VECTOR-NEXT: .cfi_offset %f8, -168 +; VECTOR-NEXT: .cfi_offset %f9, -176 +; VECTOR-NEXT: .cfi_offset %f10, -184 +; VECTOR-NEXT: .cfi_offset %f11, -192 +; VECTOR-NEXT: ldr %f8, %f6 +; VECTOR-NEXT: ldr %f9, %f4 +; VECTOR-NEXT: ldr %f10, %f2 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f0 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: ldr %f11, %f0 +; VECTOR-NEXT: ldr %f0, %f10 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f0 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: ldr %f10, %f0 +; VECTOR-NEXT: ldr %f0, %f9 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f0 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: ldr %f9, %f0 +; VECTOR-NEXT: ldr %f0, %f8 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f0 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: ldr %f6, %f0 +; VECTOR-NEXT: ldr %f0, %f11 +; VECTOR-NEXT: ldr %f2, %f10 +; VECTOR-NEXT: ldr %f4, %f9 +; VECTOR-NEXT: ld %f8, 184(%r15) # 8-byte Reload +; VECTOR-NEXT: ld %f9, 176(%r15) # 8-byte Reload +; VECTOR-NEXT: ld %f10, 168(%r15) # 8-byte Reload +; VECTOR-NEXT: ld %f11, 160(%r15) # 8-byte Reload +; VECTOR-NEXT: lmg %r14, %r15, 304(%r15) +; VECTOR-NEXT: br %r14 +entry: + %Res = fadd <4 x half> %Op, %Op + ret <4 x half> %Res +} + +; Test a vector extension. 
+define <2 x half> @fun2(<2 x half> %Op) { +; NOVEC-LABEL: fun2: +; NOVEC: # %bb.0: # %entry +; NOVEC-NEXT: stmg %r14, %r15, 112(%r15) +; NOVEC-NEXT: .cfi_offset %r14, -48 +; NOVEC-NEXT: .cfi_offset %r15, -40 +; NOVEC-NEXT: aghi %r15, -176 +; NOVEC-NEXT: .cfi_def_cfa_offset 336 +; NOVEC-NEXT: std %f8, 168(%r15) # 8-byte Spill +; NOVEC-NEXT: std %f9, 160(%r15) # 8-byte Spill +; NOVEC-NEXT: .cfi_offset %f8, -168 +; NOVEC-NEXT: .cfi_offset %f9, -176 +; NOVEC-NEXT: ler %f8, %f2 +; NOVEC-NEXT: brasl %r14, __extendhfdf2@PLT +; NOVEC-NEXT: ldr %f9, %f0 +; NOVEC-NEXT: ler %f0, %f8 +; NOVEC-NEXT: brasl %r14, __extendhfdf2@PLT +; NOVEC-NEXT: adbr %f9, %f9 +; NOVEC-NEXT: ldr %f8, %f0 +; NOVEC-NEXT: adbr %f8, %f0 +; NOVEC-NEXT: ldr %f0, %f9 +; NOVEC-NEXT: brasl %r14, __truncdfhf2@PLT +; NOVEC-NEXT: ler %f9, %f0 +; NOVEC-NEXT: ldr %f0, %f8 +; NOVEC-NEXT: brasl %r14, __truncdfhf2@PLT +; NOVEC-NEXT: ler %f2, %f0 +; NOVEC-NEXT: ler %f0, %f9 +; NOVEC-NEXT: ld %f8, 168(%r15) # 8-byte Reload +; NOVEC-NEXT: ld %f9, 160(%r15) # 8-byte Reload +; NOVEC-NEXT: lmg %r14, %r15, 288(%r15) +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: fun2: +; VECTOR: # %bb.0: # %entry +; VECTOR-NEXT: stmg %r14, %r15, 112(%r15) +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -184 +; VECTOR-NEXT: .cfi_def_cfa_offset 344 +; VECTOR-NEXT: std %f8, 176(%r15) # 8-byte Spill +; VECTOR-NEXT: .cfi_offset %f8, -168 +; VECTOR-NEXT: ldr %f8, %f0 +; VECTOR-NEXT: ldr %f0, %f2 +; VECTOR-NEXT: brasl %r14, __extendhfdf2@PLT +; VECTOR-NEXT: # kill: def $f0d killed $f0d def $v0 +; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Spill +; VECTOR-NEXT: ldr %f0, %f8 +; VECTOR-NEXT: brasl %r14, __extendhfdf2@PLT +; VECTOR-NEXT: vl %v1, 160(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: # kill: def $f0d killed $f0d def $v0 +; VECTOR-NEXT: vmrhg %v0, %v0, %v1 +; VECTOR-NEXT: vfadb %v0, %v0, %v0 +; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Spill +; VECTOR-NEXT: # kill: def $f0d killed $f0d 
killed $v0 +; VECTOR-NEXT: brasl %r14, __truncdfhf2@PLT +; VECTOR-NEXT: ldr %f8, %f0 +; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vrepg %v0, %v0, 1 +; VECTOR-NEXT: # kill: def $f0d killed $f0d killed $v0 +; VECTOR-NEXT: brasl %r14, __truncdfhf2@PLT +; VECTOR-NEXT: ldr %f2, %f0 +; VECTOR-NEXT: ldr %f0, %f8 +; VECTOR-NEXT: ld %f8, 176(%r15) # 8-byte Reload +; VECTOR-NEXT: lmg %r14, %r15, 296(%r15) +; VECTOR-NEXT: br %r14 +entry: + %E = fpext <2 x half> %Op to <2 x double> + %Add = fadd <2 x double> %E, %E + %Res = fptrunc <2 x double> %Add to <2 x half> + ret <2 x half> %Res +} + +; Load and store an <8 x half> vector. +define void @fun3(ptr %Src, ptr %Dst) { +; NOVEC-LABEL: fun3: +; NOVEC: # %bb.0: # %entry +; NOVEC-NEXT: lgh %r0, 0(%r2) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ldgr %f0, %r0 +; NOVEC-NEXT: lgh %r0, 2(%r2) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ldgr %f1, %r0 +; NOVEC-NEXT: lgh %r0, 4(%r2) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ldgr %f2, %r0 +; NOVEC-NEXT: lgh %r0, 6(%r2) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ldgr %f3, %r0 +; NOVEC-NEXT: lgh %r0, 8(%r2) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ldgr %f4, %r0 +; NOVEC-NEXT: lgh %r0, 10(%r2) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ldgr %f5, %r0 +; NOVEC-NEXT: lgh %r0, 12(%r2) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ldgr %f6, %r0 +; NOVEC-NEXT: lgh %r0, 14(%r2) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ldgr %f7, %r0 +; NOVEC-NEXT: lgdr %r0, %f7 +; NOVEC-NEXT: srlg %r0, %r0, 48 +; NOVEC-NEXT: sth %r0, 14(%r3) +; NOVEC-NEXT: lgdr %r0, %f6 +; NOVEC-NEXT: srlg %r0, %r0, 48 +; NOVEC-NEXT: sth %r0, 12(%r3) +; NOVEC-NEXT: lgdr %r0, %f5 +; NOVEC-NEXT: srlg %r0, %r0, 48 +; NOVEC-NEXT: sth %r0, 10(%r3) +; NOVEC-NEXT: lgdr %r0, %f4 +; NOVEC-NEXT: srlg %r0, %r0, 48 +; NOVEC-NEXT: sth %r0, 8(%r3) +; NOVEC-NEXT: lgdr %r0, %f3 +; NOVEC-NEXT: srlg %r0, %r0, 48 +; NOVEC-NEXT: sth %r0, 6(%r3) +; NOVEC-NEXT: lgdr %r0, %f2 +; NOVEC-NEXT: 
srlg %r0, %r0, 48 +; NOVEC-NEXT: sth %r0, 4(%r3) +; NOVEC-NEXT: lgdr %r0, %f1 +; NOVEC-NEXT: srlg %r0, %r0, 48 +; NOVEC-NEXT: sth %r0, 2(%r3) +; NOVEC-NEXT: lgdr %r0, %f0 +; NOVEC-NEXT: srlg %r0, %r0, 48 +; NOVEC-NEXT: sth %r0, 0(%r3) +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: fun3: +; VECTOR: # %bb.0: # %entry +; VECTOR-NEXT: vlreph %v0, 0(%r2) +; VECTOR-NEXT: vlreph %v1, 2(%r2) +; VECTOR-NEXT: vlreph %v2, 4(%r2) +; VECTOR-NEXT: vlreph %v3, 6(%r2) +; VECTOR-NEXT: vlreph %v4, 8(%r2) +; VECTOR-NEXT: vlreph %v5, 10(%r2) +; VECTOR-NEXT: vlreph %v6, 12(%r2) +; VECTOR-NEXT: vlreph %v7, 14(%r2) +; VECTOR-NEXT: vsteh %v7, 14(%r3), 0 +; VECTOR-NEXT: vsteh %v6, 12(%r3), 0 +; VECTOR-NEXT: vsteh %v5, 10(%r3), 0 +; VECTOR-NEXT: vsteh %v4, 8(%r3), 0 +; VECTOR-NEXT: vsteh %v3, 6(%r3), 0 +; VECTOR-NEXT: vsteh %v2, 4(%r3), 0 +; VECTOR-NEXT: vsteh %v1, 2(%r3), 0 +; VECTOR-NEXT: vsteh %v0, 0(%r3), 0 +; VECTOR-NEXT: br %r14 +entry: + %L = load <8 x half>, ptr %Src + store <8 x half> %L, ptr %Dst + ret void +} + +; Call a function with <8 x half> argument and return values. 
+declare <8 x half> @foo(<8 x half>) +define void @fun4(ptr %Src, ptr %Dst) { +; NOVEC-LABEL: fun4: +; NOVEC: # %bb.0: # %entry +; NOVEC-NEXT: stmg %r13, %r15, 104(%r15) +; NOVEC-NEXT: .cfi_offset %r13, -56 +; NOVEC-NEXT: .cfi_offset %r14, -48 +; NOVEC-NEXT: .cfi_offset %r15, -40 +; NOVEC-NEXT: aghi %r15, -208 +; NOVEC-NEXT: .cfi_def_cfa_offset 368 +; NOVEC-NEXT: lgh %r0, 0(%r2) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ldgr %f0, %r0 +; NOVEC-NEXT: lgh %r0, 2(%r2) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ldgr %f2, %r0 +; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d +; NOVEC-NEXT: # kill: def $f2h killed $f2h killed $f2d +; NOVEC-NEXT: lgh %r0, 4(%r2) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ldgr %f4, %r0 +; NOVEC-NEXT: # kill: def $f4h killed $f4h killed $f4d +; NOVEC-NEXT: lgh %r0, 6(%r2) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ldgr %f6, %r0 +; NOVEC-NEXT: # kill: def $f6h killed $f6h killed $f6d +; NOVEC-NEXT: lgh %r0, 8(%r2) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ldgr %f1, %r0 +; NOVEC-NEXT: lgh %r0, 10(%r2) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ldgr %f3, %r0 +; NOVEC-NEXT: lgh %r0, 12(%r2) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ldgr %f5, %r0 +; NOVEC-NEXT: lgh %r0, 14(%r2) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ldgr %f7, %r0 +; NOVEC-NEXT: lgdr %r0, %f7 +; NOVEC-NEXT: srlg %r0, %r0, 48 +; NOVEC-NEXT: sth %r0, 190(%r15) +; NOVEC-NEXT: lgdr %r0, %f5 +; NOVEC-NEXT: srlg %r0, %r0, 48 +; NOVEC-NEXT: sth %r0, 182(%r15) +; NOVEC-NEXT: lgdr %r0, %f3 +; NOVEC-NEXT: srlg %r0, %r0, 48 +; NOVEC-NEXT: sth %r0, 174(%r15) +; NOVEC-NEXT: lgdr %r0, %f1 +; NOVEC-NEXT: srlg %r0, %r0, 48 +; NOVEC-NEXT: la %r2, 192(%r15) +; NOVEC-NEXT: lgr %r13, %r3 +; NOVEC-NEXT: sth %r0, 166(%r15) +; NOVEC-NEXT: brasl %r14, foo@PLT +; NOVEC-NEXT: lgh %r0, 192(%r15) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ldgr %f0, %r0 +; NOVEC-NEXT: lgh %r0, 194(%r15) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ldgr %f1, %r0 +; 
NOVEC-NEXT: lgh %r0, 196(%r15) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ldgr %f2, %r0 +; NOVEC-NEXT: lgh %r0, 198(%r15) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ldgr %f3, %r0 +; NOVEC-NEXT: lgh %r0, 200(%r15) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ldgr %f4, %r0 +; NOVEC-NEXT: lgh %r0, 202(%r15) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ldgr %f5, %r0 +; NOVEC-NEXT: lgh %r0, 204(%r15) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ldgr %f6, %r0 +; NOVEC-NEXT: lgh %r0, 206(%r15) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ldgr %f7, %r0 +; NOVEC-NEXT: lgdr %r0, %f7 +; NOVEC-NEXT: srlg %r0, %r0, 48 +; NOVEC-NEXT: sth %r0, 14(%r13) +; NOVEC-NEXT: lgdr %r0, %f6 +; NOVEC-NEXT: srlg %r0, %r0, 48 +; NOVEC-NEXT: sth %r0, 12(%r13) +; NOVEC-NEXT: lgdr %r0, %f5 +; NOVEC-NEXT: srlg %r0, %r0, 48 +; NOVEC-NEXT: sth %r0, 10(%r13) +; NOVEC-NEXT: lgdr %r0, %f4 +; NOVEC-NEXT: srlg %r0, %r0, 48 +; NOVEC-NEXT: sth %r0, 8(%r13) +; NOVEC-NEXT: lgdr %r0, %f3 +; NOVEC-NEXT: srlg %r0, %r0, 48 +; NOVEC-NEXT: sth %r0, 6(%r13) +; NOVEC-NEXT: lgdr %r0, %f2 +; NOVEC-NEXT: srlg %r0, %r0, 48 +; NOVEC-NEXT: sth %r0, 4(%r13) +; NOVEC-NEXT: lgdr %r0, %f1 +; NOVEC-NEXT: srlg %r0, %r0, 48 +; NOVEC-NEXT: sth %r0, 2(%r13) +; NOVEC-NEXT: lgdr %r0, %f0 +; NOVEC-NEXT: srlg %r0, %r0, 48 +; NOVEC-NEXT: sth %r0, 0(%r13) +; NOVEC-NEXT: lmg %r13, %r15, 312(%r15) +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: fun4: +; VECTOR: # %bb.0: # %entry +; VECTOR-NEXT: stmg %r13, %r15, 104(%r15) +; VECTOR-NEXT: .cfi_offset %r13, -56 +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -208 +; VECTOR-NEXT: .cfi_def_cfa_offset 368 +; VECTOR-NEXT: vlreph %v6, 6(%r2) +; VECTOR-NEXT: vlreph %v4, 4(%r2) +; VECTOR-NEXT: vlreph %v2, 2(%r2) +; VECTOR-NEXT: vlreph %v0, 0(%r2) +; VECTOR-NEXT: vlreph %v1, 8(%r2) +; VECTOR-NEXT: vlreph %v3, 10(%r2) +; VECTOR-NEXT: vlreph %v5, 12(%r2) +; VECTOR-NEXT: vlreph %v7, 14(%r2) +; VECTOR-NEXT: la %r2, 192(%r15) +; 
VECTOR-NEXT: lgr %r13, %r3 +; VECTOR-NEXT: vsteh %v7, 190(%r15), 0 +; VECTOR-NEXT: vsteh %v5, 182(%r15), 0 +; VECTOR-NEXT: vsteh %v3, 174(%r15), 0 +; VECTOR-NEXT: vsteh %v1, 166(%r15), 0 +; VECTOR-NEXT: brasl %r14, foo@PLT +; VECTOR-NEXT: vlreph %v0, 192(%r15) +; VECTOR-NEXT: vlreph %v1, 194(%r15) +; VECTOR-NEXT: vlreph %v2, 196(%r15) +; VECTOR-NEXT: vlreph %v3, 198(%r15) +; VECTOR-NEXT: vlreph %v4, 200(%r15) +; VECTOR-NEXT: vlreph %v5, 202(%r15) +; VECTOR-NEXT: vlreph %v6, 204(%r15) +; VECTOR-NEXT: vlreph %v7, 206(%r15) +; VECTOR-NEXT: vsteh %v7, 14(%r13), 0 +; VECTOR-NEXT: vsteh %v6, 12(%r13), 0 +; VECTOR-NEXT: vsteh %v5, 10(%r13), 0 +; VECTOR-NEXT: vsteh %v4, 8(%r13), 0 +; VECTOR-NEXT: vsteh %v3, 6(%r13), 0 +; VECTOR-NEXT: vsteh %v2, 4(%r13), 0 +; VECTOR-NEXT: vsteh %v1, 2(%r13), 0 +; VECTOR-NEXT: vsteh %v0, 0(%r13), 0 +; VECTOR-NEXT: lmg %r13, %r15, 312(%r15) +; VECTOR-NEXT: br %r14 +entry: + %arg = load <8 x half>, ptr %Src + %Res = call <8 x half> @foo(<8 x half> %arg) + store <8 x half> %Res, ptr %Dst + ret void +} + +; Receive and pass argument fully on stack. 
+declare void @foo2(<4 x half> %dummy, <8 x half> %Arg5) +define void @fun5(<4 x half> %dummy, <8 x half> %Arg5) { +; NOVEC-LABEL: fun5: +; NOVEC: # %bb.0: +; NOVEC-NEXT: stmg %r14, %r15, 112(%r15) +; NOVEC-NEXT: .cfi_offset %r14, -48 +; NOVEC-NEXT: .cfi_offset %r15, -40 +; NOVEC-NEXT: aghi %r15, -256 +; NOVEC-NEXT: .cfi_def_cfa_offset 416 +; NOVEC-NEXT: std %f8, 248(%r15) # 8-byte Spill +; NOVEC-NEXT: std %f9, 240(%r15) # 8-byte Spill +; NOVEC-NEXT: std %f10, 232(%r15) # 8-byte Spill +; NOVEC-NEXT: std %f11, 224(%r15) # 8-byte Spill +; NOVEC-NEXT: .cfi_offset %f8, -168 +; NOVEC-NEXT: .cfi_offset %f9, -176 +; NOVEC-NEXT: .cfi_offset %f10, -184 +; NOVEC-NEXT: .cfi_offset %f11, -192 +; NOVEC-NEXT: lgh %r0, 422(%r15) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ldgr %f1, %r0 +; NOVEC-NEXT: lgh %r0, 430(%r15) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ldgr %f3, %r0 +; NOVEC-NEXT: lgh %r0, 438(%r15) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ldgr %f5, %r0 +; NOVEC-NEXT: lgh %r0, 446(%r15) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ldgr %f7, %r0 +; NOVEC-NEXT: lgh %r0, 454(%r15) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ldgr %f8, %r0 +; NOVEC-NEXT: lgh %r0, 462(%r15) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ldgr %f9, %r0 +; NOVEC-NEXT: lgh %r0, 470(%r15) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ldgr %f10, %r0 +; NOVEC-NEXT: lgh %r0, 478(%r15) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ldgr %f11, %r0 +; NOVEC-NEXT: lgdr %r0, %f11 +; NOVEC-NEXT: srlg %r0, %r0, 48 +; NOVEC-NEXT: sth %r0, 222(%r15) +; NOVEC-NEXT: lgdr %r0, %f10 +; NOVEC-NEXT: srlg %r0, %r0, 48 +; NOVEC-NEXT: sth %r0, 214(%r15) +; NOVEC-NEXT: lgdr %r0, %f9 +; NOVEC-NEXT: srlg %r0, %r0, 48 +; NOVEC-NEXT: sth %r0, 206(%r15) +; NOVEC-NEXT: lgdr %r0, %f8 +; NOVEC-NEXT: srlg %r0, %r0, 48 +; NOVEC-NEXT: sth %r0, 198(%r15) +; NOVEC-NEXT: lgdr %r0, %f7 +; NOVEC-NEXT: srlg %r0, %r0, 48 +; NOVEC-NEXT: sth %r0, 190(%r15) +; NOVEC-NEXT: lgdr %r0, %f5 +; NOVEC-NEXT: srlg %r0, 
%r0, 48 +; NOVEC-NEXT: sth %r0, 182(%r15) +; NOVEC-NEXT: lgdr %r0, %f3 +; NOVEC-NEXT: srlg %r0, %r0, 48 +; NOVEC-NEXT: sth %r0, 174(%r15) +; NOVEC-NEXT: lgdr %r0, %f1 +; NOVEC-NEXT: srlg %r0, %r0, 48 +; NOVEC-NEXT: sth %r0, 166(%r15) +; NOVEC-NEXT: brasl %r14, foo2@PLT +; NOVEC-NEXT: ld %f8, 248(%r15) # 8-byte Reload +; NOVEC-NEXT: ld %f9, 240(%r15) # 8-byte Reload +; NOVEC-NEXT: ld %f10, 232(%r15) # 8-byte Reload +; NOVEC-NEXT: ld %f11, 224(%r15) # 8-byte Reload +; NOVEC-NEXT: lmg %r14, %r15, 368(%r15) +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: fun5: +; VECTOR: # %bb.0: +; VECTOR-NEXT: stmg %r14, %r15, 112(%r15) +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -224 +; VECTOR-NEXT: .cfi_def_cfa_offset 384 +; VECTOR-NEXT: vlreph %v1, 390(%r15) +; VECTOR-NEXT: vlreph %v3, 398(%r15) +; VECTOR-NEXT: vlreph %v5, 406(%r15) +; VECTOR-NEXT: vlreph %v7, 414(%r15) +; VECTOR-NEXT: vlreph %v16, 422(%r15) +; VECTOR-NEXT: vlreph %v17, 430(%r15) +; VECTOR-NEXT: vlreph %v18, 438(%r15) +; VECTOR-NEXT: vlreph %v19, 446(%r15) +; VECTOR-NEXT: vsteh %v19, 222(%r15), 0 +; VECTOR-NEXT: vsteh %v18, 214(%r15), 0 +; VECTOR-NEXT: vsteh %v17, 206(%r15), 0 +; VECTOR-NEXT: vsteh %v16, 198(%r15), 0 +; VECTOR-NEXT: vsteh %v7, 190(%r15), 0 +; VECTOR-NEXT: vsteh %v5, 182(%r15), 0 +; VECTOR-NEXT: vsteh %v3, 174(%r15), 0 +; VECTOR-NEXT: vsteh %v1, 166(%r15), 0 +; VECTOR-NEXT: brasl %r14, foo2@PLT +; VECTOR-NEXT: lmg %r14, %r15, 336(%r15) +; VECTOR-NEXT: br %r14 + call void @foo2(<4 x half> %dummy, <8 x half> %Arg5) + ret void +} diff --git a/llvm/test/CodeGen/SystemZ/fp-half.ll b/llvm/test/CodeGen/SystemZ/fp-half.ll new file mode 100644 index 0000000000000..f479e405b04e9 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/fp-half.ll @@ -0,0 +1,612 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=zEC12 -verify-machineinstrs \ +; RUN: | FileCheck %s 
--check-prefix=NOVEC +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 -verify-machineinstrs \ +; RUN: | FileCheck %s --check-prefix=VECTOR +; +; Tests for 16-bit floating point (half). + +; Incoming half arguments added together and returned. +define half @fun0(half %Op0, half %Op1) { +; NOVEC-LABEL: fun0: +; NOVEC: # %bb.0: # %entry +; NOVEC-NEXT: stmg %r14, %r15, 112(%r15) +; NOVEC-NEXT: .cfi_offset %r14, -48 +; NOVEC-NEXT: .cfi_offset %r15, -40 +; NOVEC-NEXT: aghi %r15, -176 +; NOVEC-NEXT: .cfi_def_cfa_offset 336 +; NOVEC-NEXT: std %f8, 168(%r15) # 8-byte Spill +; NOVEC-NEXT: std %f9, 160(%r15) # 8-byte Spill +; NOVEC-NEXT: .cfi_offset %f8, -168 +; NOVEC-NEXT: .cfi_offset %f9, -176 +; NOVEC-NEXT: ler %f8, %f0 +; NOVEC-NEXT: ler %f0, %f2 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: ler %f9, %f0 +; NOVEC-NEXT: ler %f0, %f8 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: aebr %f0, %f9 +; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT +; NOVEC-NEXT: ld %f8, 168(%r15) # 8-byte Reload +; NOVEC-NEXT: ld %f9, 160(%r15) # 8-byte Reload +; NOVEC-NEXT: lmg %r14, %r15, 288(%r15) +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: fun0: +; VECTOR: # %bb.0: # %entry +; VECTOR-NEXT: stmg %r14, %r15, 112(%r15) +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -176 +; VECTOR-NEXT: .cfi_def_cfa_offset 336 +; VECTOR-NEXT: std %f8, 168(%r15) # 8-byte Spill +; VECTOR-NEXT: std %f9, 160(%r15) # 8-byte Spill +; VECTOR-NEXT: .cfi_offset %f8, -168 +; VECTOR-NEXT: .cfi_offset %f9, -176 +; VECTOR-NEXT: ldr %f8, %f0 +; VECTOR-NEXT: ldr %f0, %f2 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f9, %f0 +; VECTOR-NEXT: ldr %f0, %f8 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f9 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: ld %f8, 168(%r15) # 8-byte Reload +; VECTOR-NEXT: ld %f9, 160(%r15) # 8-byte Reload +; VECTOR-NEXT: lmg %r14, %r15, 288(%r15) +; VECTOR-NEXT: br 
%r14 +entry: + %Res = fadd half %Op0, %Op1 + ret half %Res +} + +define half @fun1(half %Op0, half %Op1) { +; NOVEC-LABEL: fun1: +; NOVEC: # %bb.0: # %entry +; NOVEC-NEXT: stmg %r14, %r15, 112(%r15) +; NOVEC-NEXT: .cfi_offset %r14, -48 +; NOVEC-NEXT: .cfi_offset %r15, -40 +; NOVEC-NEXT: aghi %r15, -176 +; NOVEC-NEXT: .cfi_def_cfa_offset 336 +; NOVEC-NEXT: std %f8, 168(%r15) # 8-byte Spill +; NOVEC-NEXT: std %f9, 160(%r15) # 8-byte Spill +; NOVEC-NEXT: .cfi_offset %f8, -168 +; NOVEC-NEXT: .cfi_offset %f9, -176 +; NOVEC-NEXT: ler %f8, %f2 +; NOVEC-NEXT: brasl %r14, __extendhfdf2@PLT +; NOVEC-NEXT: ldr %f9, %f0 +; NOVEC-NEXT: ler %f0, %f8 +; NOVEC-NEXT: brasl %r14, __extendhfdf2@PLT +; NOVEC-NEXT: adbr %f0, %f9 +; NOVEC-NEXT: brasl %r14, __truncdfhf2@PLT +; NOVEC-NEXT: ld %f8, 168(%r15) # 8-byte Reload +; NOVEC-NEXT: ld %f9, 160(%r15) # 8-byte Reload +; NOVEC-NEXT: lmg %r14, %r15, 288(%r15) +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: fun1: +; VECTOR: # %bb.0: # %entry +; VECTOR-NEXT: stmg %r14, %r15, 112(%r15) +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -176 +; VECTOR-NEXT: .cfi_def_cfa_offset 336 +; VECTOR-NEXT: std %f8, 168(%r15) # 8-byte Spill +; VECTOR-NEXT: std %f9, 160(%r15) # 8-byte Spill +; VECTOR-NEXT: .cfi_offset %f8, -168 +; VECTOR-NEXT: .cfi_offset %f9, -176 +; VECTOR-NEXT: ldr %f8, %f2 +; VECTOR-NEXT: brasl %r14, __extendhfdf2@PLT +; VECTOR-NEXT: ldr %f9, %f0 +; VECTOR-NEXT: ldr %f0, %f8 +; VECTOR-NEXT: brasl %r14, __extendhfdf2@PLT +; VECTOR-NEXT: wfadb %f0, %f9, %f0 +; VECTOR-NEXT: brasl %r14, __truncdfhf2@PLT +; VECTOR-NEXT: ld %f8, 168(%r15) # 8-byte Reload +; VECTOR-NEXT: ld %f9, 160(%r15) # 8-byte Reload +; VECTOR-NEXT: lmg %r14, %r15, 288(%r15) +; VECTOR-NEXT: br %r14 +entry: + %E0 = fpext half %Op0 to double + %E1 = fpext half %Op1 to double + %Add = fadd double %E0, %E1 + %Res = fptrunc double %Add to half + ret half %Res +} + +define half @fun2(half %Op0, half %Op1) { +; NOVEC-LABEL: 
fun2: +; NOVEC: # %bb.0: # %entry +; NOVEC-NEXT: stmg %r14, %r15, 112(%r15) +; NOVEC-NEXT: .cfi_offset %r14, -48 +; NOVEC-NEXT: .cfi_offset %r15, -40 +; NOVEC-NEXT: aghi %r15, -232 +; NOVEC-NEXT: .cfi_def_cfa_offset 392 +; NOVEC-NEXT: std %f8, 224(%r15) # 8-byte Spill +; NOVEC-NEXT: std %f9, 216(%r15) # 8-byte Spill +; NOVEC-NEXT: std %f11, 208(%r15) # 8-byte Spill +; NOVEC-NEXT: .cfi_offset %f8, -168 +; NOVEC-NEXT: .cfi_offset %f9, -176 +; NOVEC-NEXT: .cfi_offset %f11, -184 +; NOVEC-NEXT: la %r2, 160(%r15) +; NOVEC-NEXT: ler %f8, %f2 +; NOVEC-NEXT: brasl %r14, __extendhftf2@PLT +; NOVEC-NEXT: ld %f9, 160(%r15) +; NOVEC-NEXT: ld %f11, 168(%r15) +; NOVEC-NEXT: la %r2, 176(%r15) +; NOVEC-NEXT: ler %f0, %f8 +; NOVEC-NEXT: brasl %r14, __extendhftf2@PLT +; NOVEC-NEXT: ld %f0, 176(%r15) +; NOVEC-NEXT: ld %f2, 184(%r15) +; NOVEC-NEXT: la %r2, 192(%r15) +; NOVEC-NEXT: axbr %f0, %f9 +; NOVEC-NEXT: std %f0, 192(%r15) +; NOVEC-NEXT: std %f2, 200(%r15) +; NOVEC-NEXT: brasl %r14, __trunctfhf2@PLT +; NOVEC-NEXT: ld %f8, 224(%r15) # 8-byte Reload +; NOVEC-NEXT: ld %f9, 216(%r15) # 8-byte Reload +; NOVEC-NEXT: ld %f11, 208(%r15) # 8-byte Reload +; NOVEC-NEXT: lmg %r14, %r15, 344(%r15) +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: fun2: +; VECTOR: # %bb.0: # %entry +; VECTOR-NEXT: stmg %r14, %r15, 112(%r15) +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -232 +; VECTOR-NEXT: .cfi_def_cfa_offset 392 +; VECTOR-NEXT: std %f8, 224(%r15) # 8-byte Spill +; VECTOR-NEXT: .cfi_offset %f8, -168 +; VECTOR-NEXT: la %r2, 176(%r15) +; VECTOR-NEXT: ldr %f8, %f2 +; VECTOR-NEXT: brasl %r14, __extendhftf2@PLT +; VECTOR-NEXT: mvc 160(16,%r15), 176(%r15) +; VECTOR-NEXT: la %r2, 192(%r15) +; VECTOR-NEXT: ldr %f0, %f8 +; VECTOR-NEXT: brasl %r14, __extendhftf2@PLT +; VECTOR-NEXT: vl %v0, 192(%r15), 3 +; VECTOR-NEXT: vl %v1, 160(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: wfaxb %v0, %v1, %v0 +; VECTOR-NEXT: la %r2, 208(%r15) +; VECTOR-NEXT: vst %v0, 
208(%r15), 3 +; VECTOR-NEXT: brasl %r14, __trunctfhf2@PLT +; VECTOR-NEXT: ld %f8, 224(%r15) # 8-byte Reload +; VECTOR-NEXT: lmg %r14, %r15, 344(%r15) +; VECTOR-NEXT: br %r14 +entry: + %E0 = fpext half %Op0 to fp128 + %E1 = fpext half %Op1 to fp128 + %Add = fadd fp128 %E0, %E1 + %Res = fptrunc fp128 %Add to half + ret half %Res +} + +; Test loading and storing a half value. +define void @fun3(ptr %Src, ptr %Dst) { +; NOVEC-LABEL: fun3: +; NOVEC: # %bb.0: # %entry +; NOVEC-NEXT: lgh %r0, 0(%r2) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ldgr %f0, %r0 +; NOVEC-NEXT: lgdr %r0, %f0 +; NOVEC-NEXT: srlg %r0, %r0, 48 +; NOVEC-NEXT: sth %r0, 0(%r3) +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: fun3: +; VECTOR: # %bb.0: # %entry +; VECTOR-NEXT: vlreph %v0, 0(%r2) +; VECTOR-NEXT: vsteh %v0, 0(%r3), 0 +; VECTOR-NEXT: br %r14 +entry: + %L = load half, ptr %Src, align 2 + store half %L, ptr %Dst, align 2 + ret void +} + +define void @fun4(ptr %Src, ptr %Dst) { +; NOVEC-LABEL: fun4: +; NOVEC: # %bb.0: # %entry +; NOVEC-NEXT: stmg %r13, %r15, 104(%r15) +; NOVEC-NEXT: .cfi_offset %r13, -56 +; NOVEC-NEXT: .cfi_offset %r14, -48 +; NOVEC-NEXT: .cfi_offset %r15, -40 +; NOVEC-NEXT: aghi %r15, -160 +; NOVEC-NEXT: .cfi_def_cfa_offset 320 +; NOVEC-NEXT: lgh %r0, 0(%r2) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: lgr %r13, %r3 +; NOVEC-NEXT: ldgr %f0, %r0 +; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d +; NOVEC-NEXT: brasl %r14, __extendhfdf2@PLT +; NOVEC-NEXT: adbr %f0, %f0 +; NOVEC-NEXT: brasl %r14, __truncdfhf2@PLT +; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d +; NOVEC-NEXT: lgdr %r0, %f0 +; NOVEC-NEXT: srlg %r0, %r0, 48 +; NOVEC-NEXT: sth %r0, 0(%r13) +; NOVEC-NEXT: lmg %r13, %r15, 264(%r15) +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: fun4: +; VECTOR: # %bb.0: # %entry +; VECTOR-NEXT: stmg %r13, %r15, 104(%r15) +; VECTOR-NEXT: .cfi_offset %r13, -56 +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -160 +; 
VECTOR-NEXT: .cfi_def_cfa_offset 320 +; VECTOR-NEXT: vlreph %v0, 0(%r2) +; VECTOR-NEXT: lgr %r13, %r3 +; VECTOR-NEXT: brasl %r14, __extendhfdf2@PLT +; VECTOR-NEXT: adbr %f0, %f0 +; VECTOR-NEXT: brasl %r14, __truncdfhf2@PLT +; VECTOR-NEXT: vsteh %v0, 0(%r13), 0 +; VECTOR-NEXT: lmg %r13, %r15, 264(%r15) +; VECTOR-NEXT: br %r14 +entry: + %Op0 = load half, ptr %Src, align 2 + %E0 = fpext half %Op0 to double + %Add = fadd double %E0, %E0 + %Res = fptrunc double %Add to half + store half %Res, ptr %Dst, align 2 + ret void +} + +define void @fun5(ptr %Src, ptr %Dst) { +; NOVEC-LABEL: fun5: +; NOVEC: # %bb.0: # %entry +; NOVEC-NEXT: stmg %r13, %r15, 104(%r15) +; NOVEC-NEXT: .cfi_offset %r13, -56 +; NOVEC-NEXT: .cfi_offset %r14, -48 +; NOVEC-NEXT: .cfi_offset %r15, -40 +; NOVEC-NEXT: aghi %r15, -192 +; NOVEC-NEXT: .cfi_def_cfa_offset 352 +; NOVEC-NEXT: lgh %r0, 0(%r2) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: la %r2, 160(%r15) +; NOVEC-NEXT: lgr %r13, %r3 +; NOVEC-NEXT: ldgr %f0, %r0 +; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d +; NOVEC-NEXT: brasl %r14, __extendhftf2@PLT +; NOVEC-NEXT: ld %f0, 160(%r15) +; NOVEC-NEXT: ld %f2, 168(%r15) +; NOVEC-NEXT: la %r2, 176(%r15) +; NOVEC-NEXT: axbr %f0, %f0 +; NOVEC-NEXT: std %f0, 176(%r15) +; NOVEC-NEXT: std %f2, 184(%r15) +; NOVEC-NEXT: brasl %r14, __trunctfhf2@PLT +; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d +; NOVEC-NEXT: lgdr %r0, %f0 +; NOVEC-NEXT: srlg %r0, %r0, 48 +; NOVEC-NEXT: sth %r0, 0(%r13) +; NOVEC-NEXT: lmg %r13, %r15, 296(%r15) +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: fun5: +; VECTOR: # %bb.0: # %entry +; VECTOR-NEXT: stmg %r13, %r15, 104(%r15) +; VECTOR-NEXT: .cfi_offset %r13, -56 +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -192 +; VECTOR-NEXT: .cfi_def_cfa_offset 352 +; VECTOR-NEXT: vlreph %v0, 0(%r2) +; VECTOR-NEXT: la %r2, 160(%r15) +; VECTOR-NEXT: lgr %r13, %r3 +; VECTOR-NEXT: brasl %r14, __extendhftf2@PLT +; VECTOR-NEXT: vl 
%v0, 160(%r15), 3 +; VECTOR-NEXT: wfaxb %v0, %v0, %v0 +; VECTOR-NEXT: la %r2, 176(%r15) +; VECTOR-NEXT: vst %v0, 176(%r15), 3 +; VECTOR-NEXT: brasl %r14, __trunctfhf2@PLT +; VECTOR-NEXT: vsteh %v0, 0(%r13), 0 +; VECTOR-NEXT: lmg %r13, %r15, 296(%r15) +; VECTOR-NEXT: br %r14 +entry: + %Op0 = load half, ptr %Src, align 2 + %E0 = fpext half %Op0 to fp128 + %Add = fadd fp128 %E0, %E0 + %Res = fptrunc fp128 %Add to half + store half %Res, ptr %Dst, align 2 + ret void +} + +; Test a chain of half operations which should have each operation surrounded +; by conversions to/from fp32 for proper emulation. +define half @fun6(half %Op0, half %Op1, half %Op2) { +; NOVEC-LABEL: fun6: +; NOVEC: # %bb.0: # %entry +; NOVEC-NEXT: stmg %r14, %r15, 112(%r15) +; NOVEC-NEXT: .cfi_offset %r14, -48 +; NOVEC-NEXT: .cfi_offset %r15, -40 +; NOVEC-NEXT: aghi %r15, -184 +; NOVEC-NEXT: .cfi_def_cfa_offset 344 +; NOVEC-NEXT: std %f8, 176(%r15) # 8-byte Spill +; NOVEC-NEXT: std %f9, 168(%r15) # 8-byte Spill +; NOVEC-NEXT: std %f10, 160(%r15) # 8-byte Spill +; NOVEC-NEXT: .cfi_offset %f8, -168 +; NOVEC-NEXT: .cfi_offset %f9, -176 +; NOVEC-NEXT: .cfi_offset %f10, -184 +; NOVEC-NEXT: ler %f9, %f0 +; NOVEC-NEXT: ler %f0, %f2 +; NOVEC-NEXT: ler %f8, %f4 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: ler %f10, %f0 +; NOVEC-NEXT: ler %f0, %f9 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: aebr %f0, %f10 +; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: ler %f9, %f0 +; NOVEC-NEXT: ler %f0, %f8 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: aebr %f0, %f9 +; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT +; NOVEC-NEXT: ld %f8, 176(%r15) # 8-byte Reload +; NOVEC-NEXT: ld %f9, 168(%r15) # 8-byte Reload +; NOVEC-NEXT: ld %f10, 160(%r15) # 8-byte Reload +; NOVEC-NEXT: lmg %r14, %r15, 296(%r15) +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: fun6: +; VECTOR: # %bb.0: # %entry +; VECTOR-NEXT: stmg %r14, %r15, 112(%r15) +; 
VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -184 +; VECTOR-NEXT: .cfi_def_cfa_offset 344 +; VECTOR-NEXT: std %f8, 176(%r15) # 8-byte Spill +; VECTOR-NEXT: std %f9, 168(%r15) # 8-byte Spill +; VECTOR-NEXT: std %f10, 160(%r15) # 8-byte Spill +; VECTOR-NEXT: .cfi_offset %f8, -168 +; VECTOR-NEXT: .cfi_offset %f9, -176 +; VECTOR-NEXT: .cfi_offset %f10, -184 +; VECTOR-NEXT: ldr %f9, %f0 +; VECTOR-NEXT: ldr %f0, %f2 +; VECTOR-NEXT: ldr %f8, %f4 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f10, %f0 +; VECTOR-NEXT: ldr %f0, %f9 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f10 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f9, %f0 +; VECTOR-NEXT: ldr %f0, %f8 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: wfasb %f0, %f9, %f0 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: ld %f8, 176(%r15) # 8-byte Reload +; VECTOR-NEXT: ld %f9, 168(%r15) # 8-byte Reload +; VECTOR-NEXT: ld %f10, 160(%r15) # 8-byte Reload +; VECTOR-NEXT: lmg %r14, %r15, 296(%r15) +; VECTOR-NEXT: br %r14 +entry: + %A0 = fadd half %Op0, %Op1 + %Res = fadd half %A0, %Op2 + ret half %Res +} + +; Store an incoming half argument and return a loaded one. 
+define half @fun7(half %Op0, ptr %Dst, ptr %Src) { +; NOVEC-LABEL: fun7: +; NOVEC: # %bb.0: # %entry +; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d +; NOVEC-NEXT: lgdr %r0, %f0 +; NOVEC-NEXT: srlg %r0, %r0, 48 +; NOVEC-NEXT: sth %r0, 0(%r2) +; NOVEC-NEXT: lgh %r0, 0(%r3) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ldgr %f0, %r0 +; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: fun7: +; VECTOR: # %bb.0: # %entry +; VECTOR-NEXT: vsteh %v0, 0(%r2), 0 +; VECTOR-NEXT: vlreph %v0, 0(%r3) +; VECTOR-NEXT: br %r14 +entry: + store half %Op0, ptr %Dst + %Res = load half, ptr %Src + ret half %Res +} + +; Call a function with half argument and return values. +declare half @foo(half) +define void @fun8(ptr %Src, ptr %Dst) { +; NOVEC-LABEL: fun8: +; NOVEC: # %bb.0: # %entry +; NOVEC-NEXT: stmg %r13, %r15, 104(%r15) +; NOVEC-NEXT: .cfi_offset %r13, -56 +; NOVEC-NEXT: .cfi_offset %r14, -48 +; NOVEC-NEXT: .cfi_offset %r15, -40 +; NOVEC-NEXT: aghi %r15, -160 +; NOVEC-NEXT: .cfi_def_cfa_offset 320 +; NOVEC-NEXT: lgh %r0, 0(%r2) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: lgr %r13, %r3 +; NOVEC-NEXT: ldgr %f0, %r0 +; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d +; NOVEC-NEXT: brasl %r14, foo@PLT +; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d +; NOVEC-NEXT: lgdr %r0, %f0 +; NOVEC-NEXT: srlg %r0, %r0, 48 +; NOVEC-NEXT: sth %r0, 0(%r13) +; NOVEC-NEXT: lmg %r13, %r15, 264(%r15) +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: fun8: +; VECTOR: # %bb.0: # %entry +; VECTOR-NEXT: stmg %r13, %r15, 104(%r15) +; VECTOR-NEXT: .cfi_offset %r13, -56 +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -160 +; VECTOR-NEXT: .cfi_def_cfa_offset 320 +; VECTOR-NEXT: vlreph %v0, 0(%r2) +; VECTOR-NEXT: lgr %r13, %r3 +; VECTOR-NEXT: brasl %r14, foo@PLT +; VECTOR-NEXT: vsteh %v0, 0(%r13), 0 +; VECTOR-NEXT: lmg %r13, %r15, 264(%r15) +; VECTOR-NEXT: br %r14 +entry: + %arg = load half, ptr 
%Src + %Res = call half @foo(half %arg) + store half %Res, ptr %Dst + ret void +} + +; Receive stack argument. +define half @fun9(half %Arg0, half %Arg1, half %Arg2, half %Arg3, half %Arg4) { +; NOVEC-LABEL: fun9: +; NOVEC: # %bb.0: +; NOVEC-NEXT: stmg %r14, %r15, 112(%r15) +; NOVEC-NEXT: .cfi_offset %r14, -48 +; NOVEC-NEXT: .cfi_offset %r15, -40 +; NOVEC-NEXT: aghi %r15, -176 +; NOVEC-NEXT: .cfi_def_cfa_offset 336 +; NOVEC-NEXT: std %f8, 168(%r15) # 8-byte Spill +; NOVEC-NEXT: std %f9, 160(%r15) # 8-byte Spill +; NOVEC-NEXT: .cfi_offset %f8, -168 +; NOVEC-NEXT: .cfi_offset %f9, -176 +; NOVEC-NEXT: lgh %r0, 342(%r15) +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ler %f8, %f6 +; NOVEC-NEXT: ldgr %f0, %r0 +; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: ler %f9, %f0 +; NOVEC-NEXT: ler %f0, %f8 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: aebr %f0, %f9 +; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT +; NOVEC-NEXT: ld %f8, 168(%r15) # 8-byte Reload +; NOVEC-NEXT: ld %f9, 160(%r15) # 8-byte Reload +; NOVEC-NEXT: lmg %r14, %r15, 288(%r15) +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: fun9: +; VECTOR: # %bb.0: +; VECTOR-NEXT: stmg %r14, %r15, 112(%r15) +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -176 +; VECTOR-NEXT: .cfi_def_cfa_offset 336 +; VECTOR-NEXT: std %f8, 168(%r15) # 8-byte Spill +; VECTOR-NEXT: std %f9, 160(%r15) # 8-byte Spill +; VECTOR-NEXT: .cfi_offset %f8, -168 +; VECTOR-NEXT: .cfi_offset %f9, -176 +; VECTOR-NEXT: vlreph %v0, 342(%r15) +; VECTOR-NEXT: ldr %f8, %f6 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f9, %f0 +; VECTOR-NEXT: ldr %f0, %f8 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f9 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: ld %f8, 168(%r15) # 8-byte Reload +; VECTOR-NEXT: ld %f9, 160(%r15) # 8-byte Reload +; VECTOR-NEXT: lmg %r14, %r15, 288(%r15) 
+; VECTOR-NEXT: br %r14 + %A0 = fadd half %Arg3, %Arg4 + ret half %A0 +} + +; Pass stack argument. +define void @fun10(half %Arg0) { +; NOVEC-LABEL: fun10: +; NOVEC: # %bb.0: +; NOVEC-NEXT: stmg %r14, %r15, 112(%r15) +; NOVEC-NEXT: .cfi_offset %r14, -48 +; NOVEC-NEXT: .cfi_offset %r15, -40 +; NOVEC-NEXT: aghi %r15, -168 +; NOVEC-NEXT: .cfi_def_cfa_offset 328 +; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d +; NOVEC-NEXT: lgdr %r0, %f0 +; NOVEC-NEXT: srlg %r0, %r0, 48 +; NOVEC-NEXT: ler %f2, %f0 +; NOVEC-NEXT: ler %f4, %f0 +; NOVEC-NEXT: ler %f6, %f0 +; NOVEC-NEXT: sth %r0, 166(%r15) +; NOVEC-NEXT: brasl %r14, fun9@PLT +; NOVEC-NEXT: lmg %r14, %r15, 280(%r15) +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: fun10: +; VECTOR: # %bb.0: +; VECTOR-NEXT: stmg %r14, %r15, 112(%r15) +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -168 +; VECTOR-NEXT: .cfi_def_cfa_offset 328 +; VECTOR-NEXT: ldr %f2, %f0 +; VECTOR-NEXT: ldr %f4, %f0 +; VECTOR-NEXT: ldr %f6, %f0 +; VECTOR-NEXT: vsteh %v0, 166(%r15), 0 +; VECTOR-NEXT: brasl %r14, fun9@PLT +; VECTOR-NEXT: lmg %r14, %r15, 280(%r15) +; VECTOR-NEXT: br %r14 + call void @fun9(half %Arg0, half %Arg0, half %Arg0, half %Arg0, half %Arg0) + ret void +} + +; Test loading some immediates from the Constant Pool. 
+declare void @foo2(half, half, half, half) +define void @fun11() { +; NOVEC-LABEL: fun11: +; NOVEC: # %bb.0: # %entry +; NOVEC-NEXT: stmg %r14, %r15, 112(%r15) +; NOVEC-NEXT: .cfi_offset %r14, -48 +; NOVEC-NEXT: .cfi_offset %r15, -40 +; NOVEC-NEXT: aghi %r15, -160 +; NOVEC-NEXT: .cfi_def_cfa_offset 320 +; NOVEC-NEXT: lghrl %r0, .LCPI11_0 +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ldgr %f4, %r0 +; NOVEC-NEXT: lghrl %r0, .LCPI11_1 +; NOVEC-NEXT: lzer %f2 +; NOVEC-NEXT: lcdfr %f0, %f2 +; NOVEC-NEXT: # kill: def $f4h killed $f4h killed $f4d +; NOVEC-NEXT: sllg %r0, %r0, 48 +; NOVEC-NEXT: ldgr %f6, %r0 +; NOVEC-NEXT: # kill: def $f6h killed $f6h killed $f6d +; NOVEC-NEXT: brasl %r14, foo2@PLT +; NOVEC-NEXT: lmg %r14, %r15, 272(%r15) +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: fun11: +; VECTOR: # %bb.0: # %entry +; VECTOR-NEXT: stmg %r14, %r15, 112(%r15) +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -160 +; VECTOR-NEXT: .cfi_def_cfa_offset 320 +; VECTOR-NEXT: lzer %f2 +; VECTOR-NEXT: vrepih %v4, 13824 +; VECTOR-NEXT: vrepih %v6, 15360 +; VECTOR-NEXT: lcdfr %f0, %f2 +; VECTOR-NEXT: brasl %r14, foo2@PLT +; VECTOR-NEXT: lmg %r14, %r15, 272(%r15) +; VECTOR-NEXT: br %r14 +entry: + call void @foo2(half -0.0, half 0.0, half 0.375, half 1.0) + ret void +} + +; Test a tail call. 
+declare void @foo3(half) +define void @fun12(half %Arg0) { +; NOVEC-LABEL: fun12: +; NOVEC: # %bb.0: # %entry +; NOVEC-NEXT: jg foo3@PLT +; +; VECTOR-LABEL: fun12: +; VECTOR: # %bb.0: # %entry +; VECTOR-NEXT: jg foo3@PLT +entry: + tail call void @foo3(half %Arg0) + ret void +} diff --git a/llvm/test/CodeGen/SystemZ/fp-libcall.ll b/llvm/test/CodeGen/SystemZ/fp-libcall.ll index 60b698e34fcfe..5069b9b257b80 100644 --- a/llvm/test/CodeGen/SystemZ/fp-libcall.ll +++ b/llvm/test/CodeGen/SystemZ/fp-libcall.ll @@ -212,6 +212,16 @@ define fp128 @f30(fp128 %x, fp128 %y) { ret fp128 %tmp } +define half @f31_half(half %x, half %y) { +; CHECK-LABEL: f31_half: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: brasl %r14, fmaxf@PLT +; CHECK: brasl %r14, __truncsfhf2@PLT + %tmp = call half @llvm.maxnum.f16(half %x, half %y) + ret half %tmp +} + define float @f31(float %x, float %y) { ; CHECK-LABEL: f31: ; CHECK: brasl %r14, fmaxf@PLT diff --git a/llvm/test/CodeGen/SystemZ/fp-mul-01.ll b/llvm/test/CodeGen/SystemZ/fp-mul-01.ll index c5e66ff72c2a4..323907f03b743 100644 --- a/llvm/test/CodeGen/SystemZ/fp-mul-01.ll +++ b/llvm/test/CodeGen/SystemZ/fp-mul-01.ll @@ -6,6 +6,18 @@ declare float @foo() +; Check register multiplication. +define half @f0(half %f1, half %f2) { +; CHECK-LABEL: f0: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: meebr %f0, %f9 +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %res = fmul half %f1, %f2 + ret half %res +} + ; Check register multiplication. 
define float @f1(float %f1, float %f2) { ; CHECK-LABEL: f1: diff --git a/llvm/test/CodeGen/SystemZ/fp-mul-06.ll b/llvm/test/CodeGen/SystemZ/fp-mul-06.ll index 8fd363bc397d0..6b285a49057dc 100644 --- a/llvm/test/CodeGen/SystemZ/fp-mul-06.ll +++ b/llvm/test/CodeGen/SystemZ/fp-mul-06.ll @@ -3,8 +3,22 @@ ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 \ ; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-VECTOR %s +declare half @llvm.fma.f16(half %f1, half %f2, half %f3) declare float @llvm.fma.f32(float %f1, float %f2, float %f3) +define half @f0(half %f1, half %f2, half %acc) { +; CHECK-LABEL: f0: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK-SCALAR: maebr %f0, %f9, %f10 +; CHECK-VECTOR: wfmasb %f0, %f0, %f8, %f10 +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %res = call half @llvm.fma.f16 (half %f1, half %f2, half %acc) + ret half %res +} + define float @f1(float %f1, float %f2, float %acc) { ; CHECK-LABEL: f1: ; CHECK-SCALAR: maebr %f4, %f0, %f2 diff --git a/llvm/test/CodeGen/SystemZ/fp-mul-08.ll b/llvm/test/CodeGen/SystemZ/fp-mul-08.ll index 543ab95551690..2b18abec8d555 100644 --- a/llvm/test/CodeGen/SystemZ/fp-mul-08.ll +++ b/llvm/test/CodeGen/SystemZ/fp-mul-08.ll @@ -3,8 +3,26 @@ ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 \ ; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-VECTOR %s +declare half @llvm.fma.f16(half %f1, half %f2, half %f3) declare float @llvm.fma.f32(float %f1, float %f2, float %f3) +define half @f0(half %f1, half %f2, half %acc) { +; CHECK-LABEL: f0: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: lcdfr %f0, %f0 +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK-SCALAR: maebr %f0, %f9, %f8 +; CHECK-VECTOR: wfmasb %f0, %f0, %f8, %f10 +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %negacc = fneg 
half %acc + %res = call half @llvm.fma.f16 (half %f1, half %f2, half %negacc) + ret half %res +} + define float @f1(float %f1, float %f2, float %acc) { ; CHECK-LABEL: f1: ; CHECK-SCALAR: msebr %f4, %f0, %f2 diff --git a/llvm/test/CodeGen/SystemZ/fp-mul-10.ll b/llvm/test/CodeGen/SystemZ/fp-mul-10.ll index 669ccbacf7898..1ecf52fbde354 100644 --- a/llvm/test/CodeGen/SystemZ/fp-mul-10.ll +++ b/llvm/test/CodeGen/SystemZ/fp-mul-10.ll @@ -2,6 +2,7 @@ declare double @llvm.fma.f64(double %f1, double %f2, double %f3) declare float @llvm.fma.f32(float %f1, float %f2, float %f3) +declare half @llvm.fma.f16(half %f1, half %f2, half %f3) define double @f1(double %f1, double %f2, double %acc) { ; CHECK-LABEL: f1: @@ -22,6 +23,22 @@ define double @f2(double %f1, double %f2, double %acc) { ret double %negres } +define half @f3_half(half %f1, half %f2, half %acc) { +; CHECK-LABEL: f3_half: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: wfmasb %f0, %f0, %f8, %f10 +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: lcdfr %f0, %f0 +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %res = call half @llvm.fma.f16 (half %f1, half %f2, half %acc) + %negres = fneg half %res + ret half %negres +} + define float @f3(float %f1, float %f2, float %acc) { ; CHECK-LABEL: f3: ; CHECK: wfnmasb %f0, %f0, %f2, %f4 @@ -31,6 +48,26 @@ define float @f3(float %f1, float %f2, float %acc) { ret float %negres } +define half @f4_half(half %f1, half %f2, half %acc) { +; CHECK-LABEL: f4_half: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: lcdfr %f0, %f0 +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: wfmasb %f0, %f0, %f8, %f10 +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: lcdfr %f0, %f0 +; CHECK: brasl %r14, 
__truncsfhf2@PLT +; CHECK: br %r14 + %negacc = fneg half %acc + %res = call half @llvm.fma.f16 (half %f1, half %f2, half %negacc) + %negres = fneg half %res + ret half %negres +} + define float @f4(float %f1, float %f2, float %acc) { ; CHECK-LABEL: f4: ; CHECK: wfnmssb %f0, %f0, %f2, %f4 @@ -40,4 +77,3 @@ define float @f4(float %f1, float %f2, float %acc) { %negres = fneg float %res ret float %negres } - diff --git a/llvm/test/CodeGen/SystemZ/fp-mul-15.ll b/llvm/test/CodeGen/SystemZ/fp-mul-15.ll new file mode 100644 index 0000000000000..c897d05ab86df --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/fp-mul-15.ll @@ -0,0 +1,20 @@ +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 | FileCheck %s +; +; Check that a multiply-and-add *not* result for half. + +define half @f1(half %arg, half %A2, half %A3) { +; CHECK-LABEL: f1: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: meebr %f0, %f10 +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: wfasb %f0, %f9, %f0 +; CHECK: brasl %r14, __truncsfhf2@PLT + +bb: + %i = fmul contract half %arg, %A2 + %i4 = fadd contract half %i, %A3 + ret half %i4 +} diff --git a/llvm/test/CodeGen/SystemZ/fp-neg-01.ll b/llvm/test/CodeGen/SystemZ/fp-neg-01.ll index 875905de4948d..a8fe8d5da7c8a 100644 --- a/llvm/test/CodeGen/SystemZ/fp-neg-01.ll +++ b/llvm/test/CodeGen/SystemZ/fp-neg-01.ll @@ -3,6 +3,17 @@ ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s +; Test f16. +define half @f0(half %f) { +; CHECK-LABEL: f0: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: lcdfr %f0, %f0 +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %res = fneg half %f + ret half %res +} + ; Test f32. 
define float @f1(float %f) { ; CHECK-LABEL: f1: diff --git a/llvm/test/CodeGen/SystemZ/fp-neg-02.ll b/llvm/test/CodeGen/SystemZ/fp-neg-02.ll index 7cd66948e2fc7..848c4740d8540 100644 --- a/llvm/test/CodeGen/SystemZ/fp-neg-02.ll +++ b/llvm/test/CodeGen/SystemZ/fp-neg-02.ll @@ -2,6 +2,17 @@ ; ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s +; Test f16. +define half @f0(half %f) { +; CHECK-LABEL: f0: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: lcdfr %f0, %f0 +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %res = fneg half %f + ret half %res +} + ; Test f32. define float @f1(float %f) { ; CHECK-LABEL: f1: diff --git a/llvm/test/CodeGen/SystemZ/fp-round-01.ll b/llvm/test/CodeGen/SystemZ/fp-round-01.ll index b1db2f547a832..21b354c7a83c4 100644 --- a/llvm/test/CodeGen/SystemZ/fp-round-01.ll +++ b/llvm/test/CodeGen/SystemZ/fp-round-01.ll @@ -34,6 +34,18 @@ define void @f3(ptr %ptr) { ret void } +; Test nearbyint for f16. +declare half @llvm.nearbyint.f16(half %f) +define half @f4_half(half %f) { +; CHECK-LABEL: f4_half: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: brasl %r14, nearbyintf@PLT +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %res = call half @llvm.nearbyint.f16(half %f) + ret half %res +} + ; Test nearbyint for f32. declare float @llvm.nearbyint.f32(float %f) define float @f4(float %f) { @@ -66,6 +78,18 @@ define void @f6(ptr %ptr) { ret void } +; Test floor for f16. +declare half @llvm.floor.f16(half %f) +define half @f7_half(half %f) { +; CHECK-LABEL: f7_half: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: brasl %r14, floorf@PLT +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %res = call half @llvm.floor.f16(half %f) + ret half %res +} + ; Test floor for f32. declare float @llvm.floor.f32(float %f) define float @f7(float %f) { @@ -98,6 +122,18 @@ define void @f9(ptr %ptr) { ret void } +; Test ceil for f16. 
+declare half @llvm.ceil.f16(half %f) +define half @f10_half(half %f) { +; CHECK-LABEL: f10_half: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: brasl %r14, ceilf@PLT +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %res = call half @llvm.ceil.f16(half %f) + ret half %res +} + ; Test ceil for f32. declare float @llvm.ceil.f32(float %f) define float @f10(float %f) { @@ -162,6 +198,18 @@ define void @f15(ptr %ptr) { ret void } +; Test round for f16. +declare half @llvm.round.f16(half %f) +define half @f16_half(half %f) { +; CHECK-LABEL: f16_half: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: brasl %r14, roundf@PLT +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %res = call half @llvm.round.f16(half %f) + ret half %res +} + ; Test round for f32. declare float @llvm.round.f32(float %f) define float @f16(float %f) { diff --git a/llvm/test/CodeGen/SystemZ/fp-round-02.ll b/llvm/test/CodeGen/SystemZ/fp-round-02.ll index 2cf009ad5b856..f1a0a2847a303 100644 --- a/llvm/test/CodeGen/SystemZ/fp-round-02.ll +++ b/llvm/test/CodeGen/SystemZ/fp-round-02.ll @@ -134,6 +134,18 @@ define void @f12(ptr %ptr) { ret void } +; Test trunc for f16. +declare half @llvm.trunc.f16(half %f) +define half @f13_half(half %f) { +; CHECK-LABEL: f13_half: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: fiebra %f0, 5, %f0, 4 +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %res = call half @llvm.trunc.f16(half %f) + ret half %res +} + ; Test trunc for f32. declare float @llvm.trunc.f32(float %f) define float @f13(float %f) { @@ -166,6 +178,18 @@ define void @f15(ptr %ptr) { ret void } +; Test round for f16. +declare half @llvm.round.f16(half %f) +define half @f16_half(half %f) { +; CHECK-LABEL: f16_half: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: fiebra %f0, 1, %f0, 4 +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %res = call half @llvm.round.f16(half %f) + ret half %res +} + ; Test round for f32. 
declare float @llvm.round.f32(float %f) define float @f16(float %f) { @@ -198,6 +222,18 @@ define void @f18(ptr %ptr) { ret void } +; Test roundeven for f16. +declare half @llvm.roundeven.f16(half %f) +define half @f19_half(half %f) { +; CHECK-LABEL: f19_half: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: fiebra %f0, 4, %f0, 4 +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %res = call half @llvm.roundeven.f16(half %f) + ret half %res +} + ; Test roundeven for f32. declare float @llvm.roundeven.f32(float %f) define float @f19(float %f) { diff --git a/llvm/test/CodeGen/SystemZ/fp-round-03.ll b/llvm/test/CodeGen/SystemZ/fp-round-03.ll index 3cae74749efbe..e0c059661137c 100644 --- a/llvm/test/CodeGen/SystemZ/fp-round-03.ll +++ b/llvm/test/CodeGen/SystemZ/fp-round-03.ll @@ -1,6 +1,19 @@ ; Test rounding functions for z14 and above. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 -verify-machineinstrs \ +; RUN: | FileCheck %s + +; Test that an f16 intrinsic can be lowered with promotion to float. +declare half @llvm.rint.f16(half %f) +define half @f0(half %f) { +; CHECK-LABEL: f0: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: fiebra %f0, 0, %f0, 0 +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %res = call half @llvm.rint.f16(half %f) + ret half %res +} ; Test rint for f32. declare float @llvm.rint.f32(float %f) diff --git a/llvm/test/CodeGen/SystemZ/fp-sqrt-01.ll b/llvm/test/CodeGen/SystemZ/fp-sqrt-01.ll index 996bdc458b9de..2f7d38339eacd 100644 --- a/llvm/test/CodeGen/SystemZ/fp-sqrt-01.ll +++ b/llvm/test/CodeGen/SystemZ/fp-sqrt-01.ll @@ -4,9 +4,21 @@ ; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s +declare half @llvm.sqrt.f16(half) declare float @llvm.sqrt.f32(float) declare float @sqrtf(float) +; Check register square root. 
+define half @f0(half %val) { +; CHECK-LABEL: f0: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: sqebr %f0, %f0 +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %res = call half @llvm.sqrt.f16(half %val) + ret half %res +} + ; Check register square root. define float @f1(float %val) { ; CHECK-LABEL: f1: diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-cmp-04.ll b/llvm/test/CodeGen/SystemZ/fp-strict-cmp-04.ll index bf9ccbcd70550..dfefc43c02bed 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-cmp-04.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-cmp-04.ll @@ -298,6 +298,43 @@ exit: ret float %add } +define half @f12_half(half %dummy, half %val) #0 { +; CHECK-LABEL: f12_half: +; CHECK: ler %f9, %f2 +; CHECK-NEXT: ler %f0, %f2 +; CHECK-NEXT: #APP +; CHECK-NEXT: ler %f8, %f0 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lzer %f0 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f10, %f0 +; CHECK-NEXT: ler %f0, %f9 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: cebr %f0, %f10 +; CHECK-NEXT: jl .LBB11_2 +; CHECK-NEXT:# %bb.1: # %store +; CHECK-NEXT: #APP +; CHECK-NEXT: blah +; CHECK-NEXT: #NO_APP +; CHECK-NEXT:.LBB11_2: # %exit +; CHECK-NEXT: ler %f0, %f8 +; CHECK: br %r14 +entry: + %ret = call half asm "ler $0, $1", "=f,{f0}"(half %val) #0 + %cmp = call i1 @llvm.experimental.constrained.fcmp.f16( + half %val, half 0.0, + metadata !"olt", + metadata !"fpexcept.strict") #0 + br i1 %cmp, label %exit, label %store + +store: + call void asm sideeffect "blah", ""() #0 + br label %exit + +exit: + ret half %ret +} + ; Test that LER does not get converted to LTEBR as %f0 is live after it. 
define float @f12(float %dummy, float %val) #0 { ; CHECK-LABEL: f12: @@ -309,7 +346,7 @@ define float @f12(float %dummy, float %val) #0 { ; CHECK-NEXT: blr %r14 ; CHECK: br %r14 entry: - %ret = call float asm "blah $1", "=f,{f0}"(float %val) #0 + %ret = call float asm "$0 = blah $1", "=f,{f0}"(float %val) #0 %cmp = call i1 @llvm.experimental.constrained.fcmp.f32( float %val, float 0.0, metadata !"olt", @@ -384,6 +421,43 @@ exit: ret void } +define half @f15_half(half %val, half %dummy) #0 { +; CHECK-LABEL: f15_half: +; CHECK: ler %f9, %f0 +; CHECK-NEXT: ler %f2, %f0 +; CHECK-NEXT: #APP +; CHECK-NEXT: ler %f8, %f2 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lzer %f0 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f10, %f0 +; CHECK-NEXT: ler %f0, %f9 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: cebr %f0, %f10 +; CHECK-NEXT: jl .LBB15_2 +; CHECK-NEXT:# %bb.1: # %store +; CHECK-NEXT: #APP +; CHECK-NEXT: blah +; CHECK-NEXT: #NO_APP +; CHECK-NEXT:.LBB15_2: # %exit +; CHECK-NEXT: ler %f0, %f8 +; CHECK: br %r14 +entry: + %ret = call half asm "ler $0, $1", "=f,{f2}"(half %val) #0 + %cmp = call i1 @llvm.experimental.constrained.fcmp.f16( + half %val, half 0.0, + metadata !"olt", + metadata !"fpexcept.strict") #0 + br i1 %cmp, label %exit, label %store + +store: + call void asm sideeffect "blah", ""() #0 + br label %exit + +exit: + ret half %ret +} + ; Test a case where it is the source rather than destination of LER that ; we need, but cannot convert the LER. 
define float @f15(float %val, float %dummy) #0 { @@ -491,6 +565,43 @@ exit: ret float %res } +define half @f19_half(half %dummy, half %val) #0 { +; CHECK-LABEL: f19_half: +; CHECK: ler %f9, %f2 +; CHECK-NEXT: ler %f0, %f2 +; CHECK-NEXT: #APP +; CHECK-NEXT: ler %f8, %f0 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lzer %f0 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f10, %f0 +; CHECK-NEXT: ler %f0, %f9 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: cebr %f0, %f10 +; CHECK-NEXT: jl .LBB20_2 +; CHECK-NEXT:# %bb.1: # %store +; CHECK-NEXT: #APP +; CHECK-NEXT: blah +; CHECK-NEXT: #NO_APP +; CHECK-NEXT:.LBB20_2: # %exit +; CHECK-NEXT: ler %f0, %f8 +; CHECK: br %r14 +entry: + %ret = call half asm sideeffect "ler $0, $1", "=f,{f0}"(half %val) #0 + %cmp = call i1 @llvm.experimental.constrained.fcmp.f16( + half %val, half 0.0, + metadata !"olt", + metadata !"fpexcept.strict") #0 + br i1 %cmp, label %exit, label %store + +store: + call void asm sideeffect "blah", ""() #0 + br label %exit + +exit: + ret half %ret +} + ; Verify that we cannot convert LER to LTEBR and omit the compare if ; there may be an intervening change to the exception flags. 
define float @f19(float %dummy, float %val) #0 { @@ -524,6 +635,7 @@ declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, me declare float @llvm.experimental.constrained.fsub.f32(float, float, metadata, metadata) declare float @llvm.experimental.constrained.fmul.f32(float, float, metadata, metadata) declare float @llvm.experimental.constrained.fdiv.f32(float, float, metadata, metadata) +declare i1 @llvm.experimental.constrained.fcmp.f16(half, half, metadata, metadata) declare i1 @llvm.experimental.constrained.fcmp.f32(float, float, metadata, metadata) declare i1 @llvm.experimental.constrained.fcmp.f64(double, double, metadata, metadata) declare i1 @llvm.experimental.constrained.fcmp.f128(fp128, fp128, metadata, metadata) diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-cmps-01.ll b/llvm/test/CodeGen/SystemZ/fp-strict-cmps-01.ll index ac8894417921c..20efbf60fdbdc 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-cmps-01.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-cmps-01.ll @@ -8,6 +8,26 @@ declare float @foo() +; Check comparison with registers. +define i64 @f0(i64 %a, i64 %b, half %f1, half %f2) #0 { +; CHECK-LABEL: f0: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: kebr %f0, %f9 +; CHECK-SCALAR-NEXT: je +; CHECK-SCALAR: lgr %r13, %r12 +; CHECK-SCALAR: lgr %r2, %r13 +; CHECK-VECTOR: locgrne %r12, %r13 +; CHECK-VECTOR: lgr %r2, %r12 +; CHECK: br %r14 + %cond = call i1 @llvm.experimental.constrained.fcmps.f16( + half %f1, half %f2, + metadata !"oeq", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i64 %a, i64 %b + ret i64 %res +} + ; Check comparison with registers. 
define i64 @f1(i64 %a, i64 %b, float %f1, float %f2) #0 { ; CHECK-LABEL: f1: diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-cmps-04.ll b/llvm/test/CodeGen/SystemZ/fp-strict-cmps-04.ll index e178769f263e6..ad86df1753192 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-cmps-04.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-cmps-04.ll @@ -110,6 +110,43 @@ exit: ret float %res } +define half @f12_half(half %dummy, half %val) #0 { +; CHECK-LABEL: f12_half: +; CHECK: ler %f9, %f2 +; CHECK-NEXT: ler %f0, %f2 +; CHECK-NEXT: #APP +; CHECK-NEXT: ler %f8, %f0 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lzer %f0 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f10, %f0 +; CHECK-NEXT: ler %f0, %f9 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: kebr %f0, %f10 +; CHECK-NEXT: jl .LBB4_2 +; CHECK-NEXT:# %bb.1: # %store +; CHECK-NEXT: #APP +; CHECK-NEXT: blah +; CHECK-NEXT: #NO_APP +; CHECK-NEXT:.LBB4_2: # %exit +; CHECK-NEXT: ler %f0, %f8 +; CHECK: br %r14 +entry: + %ret = call half asm "ler $0, $1", "=f,{f0}"(half %val) #0 + %cmp = call i1 @llvm.experimental.constrained.fcmps.f16( + half %val, half 0.0, + metadata !"olt", + metadata !"fpexcept.strict") #0 + br i1 %cmp, label %exit, label %store + +store: + call void asm sideeffect "blah", ""() #0 + br label %exit + +exit: + ret half %ret +} + ; Test that LER does not get converted to LTEBR. 
define float @f12(float %dummy, float %val) #0 { ; CHECK-LABEL: f12: diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-conv-01.ll b/llvm/test/CodeGen/SystemZ/fp-strict-conv-01.ll index 6b9db1569cf8c..8df7ef5f3d7c8 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-conv-01.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-conv-01.ll @@ -5,13 +5,29 @@ ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 \ ; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-VECTOR %s + +declare half @llvm.experimental.constrained.fptrunc.f16.f64(double, metadata, metadata) + declare float @llvm.experimental.constrained.fptrunc.f32.f64(double, metadata, metadata) +declare half @llvm.experimental.constrained.fptrunc.f16.f128(fp128, metadata, metadata) declare float @llvm.experimental.constrained.fptrunc.f32.f128(fp128, metadata, metadata) declare double @llvm.experimental.constrained.fptrunc.f64.f128(fp128, metadata, metadata) declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata) declare double @llvm.experimental.constrained.fadd.f64(double, double, metadata, metadata) +; Test f64->f16. +define half @f0(double %d1, double %d2) #0 { +; CHECK-LABEL: f0: +; CHECK: brasl %r14, __truncdfhf2@PLT +; CHECK: br %r14 + %res = call half @llvm.experimental.constrained.fptrunc.f16.f64( + double %d2, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret half %res +} + ; Test f64->f32. define float @f1(double %d1, double %d2) #0 { ; CHECK-LABEL: f1: @@ -25,6 +41,19 @@ define float @f1(double %d1, double %d2) #0 { ret float %res } +; Test f128->f16. +define half @f2_half(ptr %ptr) #0 { +; CHECK-LABEL: f2_half: +; CHECK: brasl %r14, __trunctfhf2@PLT +; CHECK: br %r14 + %val = load fp128, ptr %ptr + %res = call half @llvm.experimental.constrained.fptrunc.f16.f128( + fp128 %val, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret half %res +} + ; Test f128->f32. 
define float @f2(ptr %ptr) #0 { ; CHECK-LABEL: f2: diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-conv-02.ll b/llvm/test/CodeGen/SystemZ/fp-strict-conv-02.ll index c79f51dd1ae9e..725d53cabb937 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-conv-02.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-conv-02.ll @@ -2,8 +2,19 @@ ; ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +declare double @llvm.experimental.constrained.fpext.f64.f16(half, metadata) declare double @llvm.experimental.constrained.fpext.f64.f32(float, metadata) +; Check register extension. +define double @f0(half %val) #0 { +; CHECK-LABEL: f0: +; CHECK: brasl %r14, __extendhfdf2@PLT +; CHECK: br %r14 + %res = call double @llvm.experimental.constrained.fpext.f64.f16(half %val, + metadata !"fpexcept.strict") #0 + ret double %res +} + ; Check register extension. define double @f1(float %val) #0 { ; CHECK-LABEL: f1: diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-conv-05.ll b/llvm/test/CodeGen/SystemZ/fp-strict-conv-05.ll index f2a66098d32e2..96f764fdab47a 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-conv-05.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-conv-05.ll @@ -2,10 +2,23 @@ ; ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +declare half @llvm.experimental.constrained.sitofp.f16.i32(i32, metadata, metadata) declare float @llvm.experimental.constrained.sitofp.f32.i32(i32, metadata, metadata) declare double @llvm.experimental.constrained.sitofp.f64.i32(i32, metadata, metadata) declare fp128 @llvm.experimental.constrained.sitofp.f128.i32(i32, metadata, metadata) +; Check i32->f16. +define half @f0(i32 %i) #0 { +; CHECK-LABEL: f0: +; CHECK: cefbr %f0, %r2 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %conv = call half @llvm.experimental.constrained.sitofp.f16.i32(i32 %i, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret half %conv +} + ; Check i32->f32. 
define float @f1(i32 %i) #0 { ; CHECK-LABEL: f1: diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-conv-06.ll b/llvm/test/CodeGen/SystemZ/fp-strict-conv-06.ll index e23eaf3ab359a..2bd8556edd664 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-conv-06.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-conv-06.ll @@ -2,10 +2,25 @@ ; ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s +declare half @llvm.experimental.constrained.uitofp.f16.i32(i32, metadata, metadata) declare float @llvm.experimental.constrained.uitofp.f32.i32(i32, metadata, metadata) declare double @llvm.experimental.constrained.uitofp.f64.i32(i32, metadata, metadata) declare fp128 @llvm.experimental.constrained.uitofp.f128.i32(i32, metadata, metadata) +; Check i32->f16. There is no native instruction, so we must promote +; to i64 first. +define half @f0(i32 %i) #0 { +; CHECK-LABEL: f0: +; CHECK: llgfr [[REGISTER:%r[0-5]]], %r2 +; CHECK: cegbr %f0, [[REGISTER]] +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %conv = call half @llvm.experimental.constrained.uitofp.f16.i32(i32 %i, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret half %conv +} + ; Check i32->f32. There is no native instruction, so we must promote ; to i64 first. define float @f1(i32 %i) #0 { diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-conv-07.ll b/llvm/test/CodeGen/SystemZ/fp-strict-conv-07.ll index d18aa38966009..d2a568ed19a4e 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-conv-07.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-conv-07.ll @@ -2,10 +2,23 @@ ; ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +declare half @llvm.experimental.constrained.sitofp.f16.i64(i64, metadata, metadata) declare float @llvm.experimental.constrained.sitofp.f32.i64(i64, metadata, metadata) declare double @llvm.experimental.constrained.sitofp.f64.i64(i64, metadata, metadata) declare fp128 @llvm.experimental.constrained.sitofp.f128.i64(i64, metadata, metadata) +; Test i64->f16. 
+define half @f0(i64 %i) #0 { +; CHECK-LABEL: f0: +; CHECK: cegbr %f0, %r2 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %conv = call half @llvm.experimental.constrained.sitofp.f16.i64(i64 %i, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret half %conv +} + ; Test i64->f32. define float @f1(i64 %i) #0 { ; CHECK-LABEL: f1: diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-conv-08.ll b/llvm/test/CodeGen/SystemZ/fp-strict-conv-08.ll index 2cbcf2a2ef0a3..c79f884ac4aeb 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-conv-08.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-conv-08.ll @@ -2,10 +2,24 @@ ; ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s +declare half @llvm.experimental.constrained.uitofp.f16.i64(i64, metadata, metadata) declare float @llvm.experimental.constrained.uitofp.f32.i64(i64, metadata, metadata) declare double @llvm.experimental.constrained.uitofp.f64.i64(i64, metadata, metadata) declare fp128 @llvm.experimental.constrained.uitofp.f128.i64(i64, metadata, metadata) +; Test i64->f16. For z10, this results in just a single a libcall. +define half @f0(i64 %i) #0 { +; CHECK-LABEL: f0: +; CHECK: cegbr +; CHECK: aebr +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %conv = call half @llvm.experimental.constrained.uitofp.f16.i64(i64 %i, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret half %conv +} + ; Test i64->f32. There's no native support for unsigned i64-to-fp conversions, ; but we should be able to implement them using signed i64-to-fp conversions. 
define float @f1(i64 %i) #0 { diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-conv-09.ll b/llvm/test/CodeGen/SystemZ/fp-strict-conv-09.ll index a54055120f727..40da726b7a46a 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-conv-09.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-conv-09.ll @@ -2,10 +2,22 @@ ; ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +declare i32 @llvm.experimental.constrained.fptosi.i32.f16(half, metadata) declare i32 @llvm.experimental.constrained.fptosi.i32.f32(float, metadata) declare i32 @llvm.experimental.constrained.fptosi.i32.f64(double, metadata) declare i32 @llvm.experimental.constrained.fptosi.i32.f128(fp128, metadata) +; Test f16->i32. +define i32 @f0(half %f) #0 { +; CHECK-LABEL: f0: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: cfebr %r2, 5, %f0 +; CHECK: br %r14 + %conv = call i32 @llvm.experimental.constrained.fptosi.i32.f16(half %f, + metadata !"fpexcept.strict") #0 + ret i32 %conv +} + ; Test f32->i32. define i32 @f1(float %f) #0 { ; CHECK-LABEL: f1: diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-conv-10.ll b/llvm/test/CodeGen/SystemZ/fp-strict-conv-10.ll index 7cbcfeea8cf60..d2206a40169e5 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-conv-10.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-conv-10.ll @@ -9,25 +9,45 @@ ; outside the i32 range but in the i64 range, so use the default expansion. ; Note that the strict expansion sequence must be used. +declare i32 @llvm.experimental.constrained.fptoui.i32.f16(half, metadata) declare i32 @llvm.experimental.constrained.fptoui.i32.f32(float, metadata) declare i32 @llvm.experimental.constrained.fptoui.i32.f64(double, metadata) declare i32 @llvm.experimental.constrained.fptoui.i32.f128(fp128, metadata) +; Test f16->i32. Converted to signed as the max float value is smaller than +; the signed integer range. 
+define i32 @f0(half %f) #0 { +; CHECK-LABEL: f0: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: cfebr %r2, 5, %f0 +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 + %conv = call i32 @llvm.experimental.constrained.fptoui.i32.f16(half %f, + metadata !"fpexcept.strict") #0 + ret i32 %conv +} + ; Test f32->i32. define i32 @f1(float %f) #0 { ; CHECK-LABEL: f1: ; CHECK: # %bb.0: -; CHECK-NEXT: larl %r1, .LCPI0_0 +; CHECK-NEXT: larl %r1, .LCPI1_0 ; CHECK-NEXT: le %f1, 0(%r1) ; CHECK-NEXT: kebr %f0, %f1 -; CHECK-NEXT: jnl .LBB0_2 +; CHECK-NEXT: jnl .LBB1_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lhi %r0, 0 ; CHECK-NEXT: lzer %f1 -; CHECK-NEXT: j .LBB0_3 -; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: j .LBB1_3 +; CHECK-NEXT: .LBB1_2: ; CHECK-NEXT: llilh %r0, 32768 -; CHECK-NEXT: .LBB0_3: +; CHECK-NEXT: .LBB1_3: ; CHECK-NEXT: sebr %f0, %f1 ; CHECK-NEXT: cfebr %r2, 5, %f0 ; CHECK-NEXT: xr %r2, %r0 @@ -41,17 +61,17 @@ define i32 @f1(float %f) #0 { define i32 @f2(double %f) #0 { ; CHECK-LABEL: f2: ; CHECK: # %bb.0: -; CHECK-NEXT: larl %r1, .LCPI1_0 +; CHECK-NEXT: larl %r1, .LCPI2_0 ; CHECK-NEXT: ld %f1, 0(%r1) ; CHECK-NEXT: kdbr %f0, %f1 -; CHECK-NEXT: jnl .LBB1_2 +; CHECK-NEXT: jnl .LBB2_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lhi %r0, 0 ; CHECK-NEXT: lzdr %f1 -; CHECK-NEXT: j .LBB1_3 -; CHECK-NEXT: .LBB1_2: +; CHECK-NEXT: j .LBB2_3 +; CHECK-NEXT: .LBB2_2: ; CHECK-NEXT: llilh %r0, 32768 -; CHECK-NEXT: .LBB1_3: +; CHECK-NEXT: .LBB2_3: ; CHECK-NEXT: sdbr %f0, %f1 ; CHECK-NEXT: cfdbr %r2, 5, %f0 ; CHECK-NEXT: xr %r2, %r0 @@ -67,17 +87,17 @@ define i32 @f3(ptr %src) #0 { ; CHECK: # %bb.0: ; CHECK-NEXT: ld %f0, 0(%r2) ; CHECK-NEXT: ld %f2, 8(%r2) -; CHECK-NEXT: larl %r1, .LCPI2_0 +; CHECK-NEXT: larl %r1, .LCPI3_0 ; CHECK-NEXT: lxeb %f1, 0(%r1) ; CHECK-NEXT: kxbr 
%f0, %f1 -; CHECK-NEXT: jnl .LBB2_2 +; CHECK-NEXT: jnl .LBB3_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lhi %r0, 0 ; CHECK-NEXT: lzxr %f1 -; CHECK-NEXT: j .LBB2_3 -; CHECK-NEXT: .LBB2_2: +; CHECK-NEXT: j .LBB3_3 +; CHECK-NEXT: .LBB3_2: ; CHECK-NEXT: llilh %r0, 32768 -; CHECK-NEXT: .LBB2_3: +; CHECK-NEXT: .LBB3_3: ; CHECK-NEXT: sxbr %f0, %f1 ; CHECK-NEXT: cfxbr %r2, 5, %f0 ; CHECK-NEXT: xr %r2, %r0 diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-conv-11.ll b/llvm/test/CodeGen/SystemZ/fp-strict-conv-11.ll index 27af314cff01b..dd8a708599629 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-conv-11.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-conv-11.ll @@ -2,10 +2,22 @@ ; ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +declare i64 @llvm.experimental.constrained.fptosi.i64.f16(half, metadata) declare i64 @llvm.experimental.constrained.fptosi.i64.f32(float, metadata) declare i64 @llvm.experimental.constrained.fptosi.i64.f64(double, metadata) declare i64 @llvm.experimental.constrained.fptosi.i64.f128(fp128, metadata) +; Test f16->i64. +define i64 @f0(half %f) #0 { +; CHECK-LABEL: f0: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: cgebr %r2, 5, %f0 +; CHECK: br %r14 + %conv = call i64 @llvm.experimental.constrained.fptosi.i64.f16(half %f, + metadata !"fpexcept.strict") #0 + ret i64 %conv +} + ; Test f32->i64. define i64 @f1(float %f) #0 { ; CHECK-LABEL: f1: diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-conv-12.ll b/llvm/test/CodeGen/SystemZ/fp-strict-conv-12.ll index 69bbd82e29898..76c7188641724 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-conv-12.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-conv-12.ll @@ -8,25 +8,45 @@ ; Convert via signed i64s instead. ; Note that the strict expansion sequence must be used. 
+declare i64 @llvm.experimental.constrained.fptoui.i64.f16(half, metadata) declare i64 @llvm.experimental.constrained.fptoui.i64.f32(float, metadata) declare i64 @llvm.experimental.constrained.fptoui.i64.f64(double, metadata) declare i64 @llvm.experimental.constrained.fptoui.i64.f128(fp128, metadata) +; Test f16->i64. Converted to signed as the max float value is smaller than +; the signed integer range. +define i64 @f0(half %f) #0 { +; CHECK-LABEL: f0: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: cgebr %r2, 5, %f0 +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 + %conv = call i64 @llvm.experimental.constrained.fptoui.i64.f16(half %f, + metadata !"fpexcept.strict") #0 + ret i64 %conv +} + ; Test f32->i64. define i64 @f1(float %f) #0 { ; CHECK-LABEL: f1: ; CHECK: # %bb.0: -; CHECK-NEXT: larl %r1, .LCPI0_0 +; CHECK-NEXT: larl %r1, .LCPI1_0 ; CHECK-NEXT: le %f1, 0(%r1) ; CHECK-NEXT: kebr %f0, %f1 -; CHECK-NEXT: jnl .LBB0_2 +; CHECK-NEXT: jnl .LBB1_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lghi %r0, 0 ; CHECK-NEXT: lzer %f1 -; CHECK-NEXT: j .LBB0_3 -; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: j .LBB1_3 +; CHECK-NEXT: .LBB1_2: ; CHECK-NEXT: llihh %r0, 32768 -; CHECK-NEXT: .LBB0_3: +; CHECK-NEXT: .LBB1_3: ; CHECK-NEXT: sebr %f0, %f1 ; CHECK-NEXT: cgebr %r2, 5, %f0 ; CHECK-NEXT: xgr %r2, %r0 @@ -40,17 +60,17 @@ define i64 @f1(float %f) #0 { define i64 @f2(double %f) #0 { ; CHECK-LABEL: f2: ; CHECK: # %bb.0: -; CHECK-NEXT: larl %r1, .LCPI1_0 +; CHECK-NEXT: larl %r1, .LCPI2_0 ; CHECK-NEXT: ld %f1, 0(%r1) ; CHECK-NEXT: kdbr %f0, %f1 -; CHECK-NEXT: jnl .LBB1_2 +; CHECK-NEXT: jnl .LBB2_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lghi %r0, 0 ; CHECK-NEXT: lzdr %f1 -; CHECK-NEXT: j .LBB1_3 -; CHECK-NEXT: .LBB1_2: +; CHECK-NEXT: j .LBB2_3 +; CHECK-NEXT: .LBB2_2: 
; CHECK-NEXT: llihh %r0, 32768 -; CHECK-NEXT: .LBB1_3: +; CHECK-NEXT: .LBB2_3: ; CHECK-NEXT: sdbr %f0, %f1 ; CHECK-NEXT: cgdbr %r2, 5, %f0 ; CHECK-NEXT: xgr %r2, %r0 @@ -66,17 +86,17 @@ define i64 @f3(ptr %src) #0 { ; CHECK: # %bb.0: ; CHECK-NEXT: ld %f0, 0(%r2) ; CHECK-NEXT: ld %f2, 8(%r2) -; CHECK-NEXT: larl %r1, .LCPI2_0 +; CHECK-NEXT: larl %r1, .LCPI3_0 ; CHECK-NEXT: lxeb %f1, 0(%r1) ; CHECK-NEXT: kxbr %f0, %f1 -; CHECK-NEXT: jnl .LBB2_2 +; CHECK-NEXT: jnl .LBB3_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lghi %r0, 0 ; CHECK-NEXT: lzxr %f1 -; CHECK-NEXT: j .LBB2_3 -; CHECK-NEXT: .LBB2_2: +; CHECK-NEXT: j .LBB3_3 +; CHECK-NEXT: .LBB3_2: ; CHECK-NEXT: llihh %r0, 32768 -; CHECK-NEXT: .LBB2_3: +; CHECK-NEXT: .LBB3_3: ; CHECK-NEXT: sxbr %f0, %f1 ; CHECK-NEXT: cgxbr %r2, 5, %f0 ; CHECK-NEXT: xgr %r2, %r0 diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-conv-13.ll b/llvm/test/CodeGen/SystemZ/fp-strict-conv-13.ll index 41913106f5340..2b1c47d0d91e4 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-conv-13.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-conv-13.ll @@ -3,14 +3,28 @@ ; ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s +declare half @llvm.experimental.constrained.uitofp.f16.i32(i32, metadata, metadata) declare float @llvm.experimental.constrained.uitofp.f32.i32(i32, metadata, metadata) declare double @llvm.experimental.constrained.uitofp.f64.i32(i32, metadata, metadata) declare fp128 @llvm.experimental.constrained.uitofp.f128.i32(i32, metadata, metadata) +declare half @llvm.experimental.constrained.uitofp.f16.i64(i64, metadata, metadata) declare float @llvm.experimental.constrained.uitofp.f32.i64(i64, metadata, metadata) declare double @llvm.experimental.constrained.uitofp.f64.i64(i64, metadata, metadata) declare fp128 @llvm.experimental.constrained.uitofp.f128.i64(i64, metadata, metadata) +; Check i32->f16. 
+define half @f0(i32 %i) #0 { +; CHECK-LABEL: f0: +; CHECK: celfbr %f0, 0, %r2, 0 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %conv = call half @llvm.experimental.constrained.uitofp.f16.i32(i32 %i, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret half %conv +} + ; Check i32->f32. define float @f1(i32 %i) #0 { ; CHECK-LABEL: f1: @@ -47,10 +61,22 @@ define void @f3(i32 %i, ptr %dst) #0 { ret void } -; Check i64->f32. -define float @f4(i64 %i) #0 { +; Check i64->f16. +define half @f4(i64 %i) #0 { ; CHECK-LABEL: f4: ; CHECK: celgbr %f0, 0, %r2, 0 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %conv = call half @llvm.experimental.constrained.uitofp.f16.i64(i64 %i, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret half %conv +} + +; Check i64->f32. +define float @f5(i64 %i) #0 { +; CHECK-LABEL: f5: +; CHECK: celgbr %f0, 0, %r2, 0 ; CHECK: br %r14 %conv = call float @llvm.experimental.constrained.uitofp.f32.i64(i64 %i, metadata !"round.dynamic", @@ -59,8 +85,8 @@ define float @f4(i64 %i) #0 { } ; Check i64->f64. -define double @f5(i64 %i) #0 { -; CHECK-LABEL: f5: +define double @f6(i64 %i) #0 { +; CHECK-LABEL: f6: ; CHECK: cdlgbr %f0, 0, %r2, 0 ; CHECK: br %r14 %conv = call double @llvm.experimental.constrained.uitofp.f64.i64(i64 %i, @@ -70,8 +96,8 @@ define double @f5(i64 %i) #0 { } ; Check i64->f128. 
-define void @f6(i64 %i, ptr %dst) #0 { -; CHECK-LABEL: f6: +define void @f7(i64 %i, ptr %dst) #0 { +; CHECK-LABEL: f7: ; CHECK: cxlgbr %f0, 0, %r2, 0 ; CHECK-DAG: std %f0, 0(%r3) ; CHECK-DAG: std %f2, 8(%r3) diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-conv-14.ll b/llvm/test/CodeGen/SystemZ/fp-strict-conv-14.ll index aa82a1d91c4a8..1d3387f610d72 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-conv-14.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-conv-14.ll @@ -2,14 +2,27 @@ ; ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s +declare i32 @llvm.experimental.constrained.fptoui.i32.f16(half, metadata) declare i32 @llvm.experimental.constrained.fptoui.i32.f32(float, metadata) declare i32 @llvm.experimental.constrained.fptoui.i32.f64(double, metadata) declare i32 @llvm.experimental.constrained.fptoui.i32.f128(fp128, metadata) +declare i64 @llvm.experimental.constrained.fptoui.i64.f16(half, metadata) declare i64 @llvm.experimental.constrained.fptoui.i64.f32(float, metadata) declare i64 @llvm.experimental.constrained.fptoui.i64.f64(double, metadata) declare i64 @llvm.experimental.constrained.fptoui.i64.f128(fp128, metadata) +; Test f16->i32. +define i32 @f0(half %f) #0 { +; CHECK-LABEL: f0: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: clfebr %r2, 5, %f0, 0 +; CHECK: br %r14 + %conv = call i32 @llvm.experimental.constrained.fptoui.i32.f16(half %f, + metadata !"fpexcept.strict") #0 + ret i32 %conv +} + ; Test f32->i32. define i32 @f1(float %f) #0 { ; CHECK-LABEL: f1: @@ -43,9 +56,20 @@ define i32 @f3(ptr %src) #0 { ret i32 %conv } -; Test f32->i64. -define i64 @f4(float %f) #0 { +; Test f16->i64. +define i64 @f4(half %f) #0 { ; CHECK-LABEL: f4: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: clgebr %r2, 5, %f0, 0 +; CHECK: br %r14 + %conv = call i64 @llvm.experimental.constrained.fptoui.i64.f16(half %f, + metadata !"fpexcept.strict") #0 + ret i64 %conv +} + +; Test f32->i64. 
+define i64 @f5(float %f) #0 { +; CHECK-LABEL: f5: ; CHECK: clgebr %r2, 5, %f0, 0 ; CHECK: br %r14 %conv = call i64 @llvm.experimental.constrained.fptoui.i64.f32(float %f, @@ -54,8 +78,8 @@ define i64 @f4(float %f) #0 { } ; Test f64->i64. -define i64 @f5(double %f) #0 { -; CHECK-LABEL: f5: +define i64 @f6(double %f) #0 { +; CHECK-LABEL: f6: ; CHECK: clgdbr %r2, 5, %f0, 0 ; CHECK: br %r14 %conv = call i64 @llvm.experimental.constrained.fptoui.i64.f64(double %f, @@ -64,8 +88,8 @@ define i64 @f5(double %f) #0 { } ; Test f128->i64. -define i64 @f6(ptr %src) #0 { -; CHECK-LABEL: f6: +define i64 @f7(ptr %src) #0 { +; CHECK-LABEL: f7: ; CHECK-DAG: ld %f0, 0(%r2) ; CHECK-DAG: ld %f2, 8(%r2) ; CHECK: clgxbr %r2, 5, %f0, 0 diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-conv-15.ll b/llvm/test/CodeGen/SystemZ/fp-strict-conv-15.ll index de93192b5f305..a53a3537a7390 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-conv-15.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-conv-15.ll @@ -2,9 +2,11 @@ ; ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s +declare half @llvm.experimental.constrained.fptrunc.f16.f128(fp128, metadata, metadata) declare float @llvm.experimental.constrained.fptrunc.f32.f128(fp128, metadata, metadata) declare double @llvm.experimental.constrained.fptrunc.f64.f128(fp128, metadata, metadata) +declare fp128 @llvm.experimental.constrained.fpext.f128.f16(half, metadata) declare fp128 @llvm.experimental.constrained.fpext.f128.f32(float, metadata) declare fp128 @llvm.experimental.constrained.fpext.f128.f64(double, metadata) @@ -22,6 +24,21 @@ define double @f1(ptr %ptr) #0 { ret double %res } +; Test f128->f16. 
+define half @f2_half(ptr %ptr) #0 { +; CHECK-LABEL: f2_half: +; CHECK: vl [[REG:%v[0-9]+]], 0(%r2) +; CHECK: vst %v0, 160(%r15), 3 +; CHECK: brasl %r14, __trunctfhf2@PLT +; CHECK: br %r14 + %val = load fp128, ptr %ptr + %res = call half @llvm.experimental.constrained.fptrunc.f16.f128( + fp128 %val, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret half %res +} + ; Test f128->f32. define float @f2(ptr %ptr) #0 { ; CHECK-LABEL: f2: @@ -62,4 +79,15 @@ define void @f4(ptr %dst, float %val) #0 { ret void } +; Test f16->f128. +define void @f5(ptr %dst, half %val) #0 { +; CHECK-LABEL: f5: +; CHECK: brasl %r14, __extendhftf2@PLT +; CHECK: br %r14 + %res = call fp128 @llvm.experimental.constrained.fpext.f128.f16(half %val, + metadata !"fpexcept.strict") #0 + store fp128 %res, ptr %dst + ret void +} + attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-conv-17.ll b/llvm/test/CodeGen/SystemZ/fp-strict-conv-17.ll index 3ff63242a6d82..2becd18277e2a 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-conv-17.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-conv-17.ll @@ -6,18 +6,22 @@ declare fp128 @llvm.experimental.constrained.sitofp.f128.i128(i128, metadata, metadata) declare double @llvm.experimental.constrained.sitofp.f64.i128(i128, metadata, metadata) declare float @llvm.experimental.constrained.sitofp.f32.i128(i128, metadata, metadata) +declare half @llvm.experimental.constrained.sitofp.f16.i128(i128, metadata, metadata) declare fp128 @llvm.experimental.constrained.uitofp.f128.i128(i128, metadata, metadata) declare double @llvm.experimental.constrained.uitofp.f64.i128(i128, metadata, metadata) declare float @llvm.experimental.constrained.uitofp.f32.i128(i128, metadata, metadata) +declare half @llvm.experimental.constrained.uitofp.f16.i128(i128, metadata, metadata) declare i128 @llvm.experimental.constrained.fptosi.i128.f128(fp128, metadata) declare i128 @llvm.experimental.constrained.fptosi.i128.f64(double, metadata) declare i128 
@llvm.experimental.constrained.fptosi.i128.f32(float, metadata) +declare i128 @llvm.experimental.constrained.fptosi.i128.f16(half, metadata) declare i128 @llvm.experimental.constrained.fptoui.i128.f128(fp128, metadata) declare i128 @llvm.experimental.constrained.fptoui.i128.f64(double, metadata) declare i128 @llvm.experimental.constrained.fptoui.i128.f32(float, metadata) +declare i128 @llvm.experimental.constrained.fptoui.i128.f16(half, metadata) ; Test signed i128->f128. define fp128 @f1(i128 %i) #0 { @@ -52,9 +56,21 @@ define float @f3(i128 %i) #0 { ret float %conv } -; Test unsigned i128->f128. -define fp128 @f4(i128 %i) #0 { +; Test signed i128->f16. +define half @f4(i128 %i) #0 { ; CHECK-LABEL: f4: +; CHECK: %r14, __floattisf@PLT +; CHECK: %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %conv = call half @llvm.experimental.constrained.sitofp.f16.i128(i128 %i, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret half %conv +} + +; Test unsigned i128->f128. +define fp128 @f5(i128 %i) #0 { +; CHECK-LABEL: f5: ; CHECK: brasl %r14, __floatuntitf@PLT ; CHECK: br %r14 %conv = call fp128 @llvm.experimental.constrained.uitofp.f128.i128(i128 %i, @@ -64,8 +80,8 @@ define fp128 @f4(i128 %i) #0 { } ; Test unsigned i128->f64. -define double @f5(i128 %i) #0 { -; CHECK-LABEL: f5: +define double @f6(i128 %i) #0 { +; CHECK-LABEL: f6: ; CHECK: brasl %r14, __floatuntidf@PLT ; CHECK: br %r14 %conv = call double @llvm.experimental.constrained.uitofp.f64.i128(i128 %i, @@ -75,8 +91,8 @@ define double @f5(i128 %i) #0 { } ; Test unsigned i128->f32. -define float @f6(i128 %i) #0 { -; CHECK-LABEL: f6: +define float @f7(i128 %i) #0 { +; CHECK-LABEL: f7: ; CHECK: brasl %r14, __floatuntisf@PLT ; CHECK: br %r14 %conv = call float @llvm.experimental.constrained.uitofp.f32.i128(i128 %i, @@ -85,9 +101,21 @@ define float @f6(i128 %i) #0 { ret float %conv } +; Test unsigned i128->f16. 
+define half @f8(i128 %i) #0 { +; CHECK-LABEL: f8: +; CHECK: brasl %r14, __floatuntisf@PLT +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %conv = call half @llvm.experimental.constrained.uitofp.f16.i128(i128 %i, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret half %conv +} + ; Test signed f128->i128. -define i128 @f7(fp128 %f) #0 { -; CHECK-LABEL: f7: +define i128 @f9(fp128 %f) #0 { +; CHECK-LABEL: f9: ; CHECK: brasl %r14, __fixtfti@PLT ; CHECK: br %r14 %conv = call i128 @llvm.experimental.constrained.fptosi.i128.f128(fp128 %f, @@ -96,8 +124,8 @@ define i128 @f7(fp128 %f) #0 { } ; Test signed f64->i128. -define i128 @f8(double %f) #0 { -; CHECK-LABEL: f8: +define i128 @f10(double %f) #0 { +; CHECK-LABEL: f10: ; CHECK: brasl %r14, __fixdfti@PLT ; CHECK: br %r14 %conv = call i128 @llvm.experimental.constrained.fptosi.i128.f64(double %f, @@ -105,9 +133,9 @@ define i128 @f8(double %f) #0 { ret i128 %conv } -; Test signed f9->i128. -define i128 @f9(float %f) #0 { -; CHECK-LABEL: f9: +; Test signed f32->i128. +define i128 @f11(float %f) #0 { +; CHECK-LABEL: f11: ; CHECK: brasl %r14, __fixsfti@PLT ; CHECK: br %r14 %conv = call i128 @llvm.experimental.constrained.fptosi.i128.f32(float %f, @@ -115,9 +143,20 @@ define i128 @f9(float %f) #0 { ret i128 %conv } +; Test signed f16->i128. +define i128 @f12(half %f) #0 { +; CHECK-LABEL: f12: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: brasl %r14, __fixsfti@PLT +; CHECK: br %r14 + %conv = call i128 @llvm.experimental.constrained.fptosi.i128.f16(half %f, + metadata !"fpexcept.strict") #0 + ret i128 %conv +} + ; Test unsigned f128->i128. -define i128 @f10(fp128 %f) #0 { -; CHECK-LABEL: f10: +define i128 @f13(fp128 %f) #0 { +; CHECK-LABEL: f13: ; CHECK: brasl %r14, __fixunstfti@PLT ; CHECK: br %r14 %conv = call i128 @llvm.experimental.constrained.fptoui.i128.f128(fp128 %f, @@ -126,8 +165,8 @@ define i128 @f10(fp128 %f) #0 { } ; Test unsigned f64->i128. 
-define i128 @f11(double %f) #0 { -; CHECK-LABEL: f11: +define i128 @f14(double %f) #0 { +; CHECK-LABEL: f14: ; CHECK: brasl %r14, __fixunsdfti@PLT ; CHECK: br %r14 %conv = call i128 @llvm.experimental.constrained.fptoui.i128.f64(double %f, @@ -136,8 +175,8 @@ define i128 @f11(double %f) #0 { } ; Test unsigned f32->i128. -define i128 @f12(float %f) #0 { -; CHECK-LABEL: f12: +define i128 @f15(float %f) #0 { +; CHECK-LABEL: f15: ; CHECK: brasl %r14, __fixunssfti@PLT ; CHECK: br %r14 %conv = call i128 @llvm.experimental.constrained.fptoui.i128.f32(float %f, @@ -145,4 +184,15 @@ define i128 @f12(float %f) #0 { ret i128 %conv } +; Test unsigned f16->i128. +define i128 @f16(half %f) #0 { +; CHECK-LABEL: f16: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: brasl %r14, __fixunssfti@PLT +; CHECK: br %r14 + %conv = call i128 @llvm.experimental.constrained.fptoui.i128.f16(half %f, + metadata !"fpexcept.strict") #0 + ret i128 %conv +} + attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-div-01.ll b/llvm/test/CodeGen/SystemZ/fp-strict-div-01.ll index 980df79481936..8b9dbbe9c9e6e 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-div-01.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-div-01.ll @@ -5,8 +5,24 @@ ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s declare float @foo() +declare half @llvm.experimental.constrained.fdiv.f16(half, half, metadata, metadata) declare float @llvm.experimental.constrained.fdiv.f32(float, float, metadata, metadata) +; Check register division. +define half @f0(half %f1, half %f2) #0 { +; CHECK-LABEL: f0: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: debr %f0, %f9 +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %res = call half @llvm.experimental.constrained.fdiv.f16( + half %f1, half %f2, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret half %res +} + ; Check register division. 
define float @f1(float %f1, float %f2) #0 { ; CHECK-LABEL: f1: diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-mul-06.ll b/llvm/test/CodeGen/SystemZ/fp-strict-mul-06.ll index 4971375789407..c951c79aeb7c6 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-mul-06.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-mul-06.ll @@ -3,8 +3,26 @@ ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 \ ; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-VECTOR %s +declare half @llvm.experimental.constrained.fma.f16(half, half, half, metadata, metadata) declare float @llvm.experimental.constrained.fma.f32(float, float, float, metadata, metadata) +define half @f0(half %f1, half %f2, half %acc) #0 { +; CHECK-LABEL: f0: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK-SCALAR: maebr %f10, %f0, %f8 +; CHECK-SCALAR: ler %f0, %f10 +; CHECK-VECTOR: wfmasb %f0, %f0, %f8, %f10 +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %res = call half @llvm.experimental.constrained.fma.f16 ( + half %f1, half %f2, half %acc, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret half %res +} + define float @f1(float %f1, float %f2, float %acc) #0 { ; CHECK-LABEL: f1: ; CHECK-SCALAR: maebr %f4, %f0, %f2 diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-round-01.ll b/llvm/test/CodeGen/SystemZ/fp-strict-round-01.ll index 964f16d605db6..95a5fa1af832b 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-round-01.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-round-01.ll @@ -43,6 +43,21 @@ define void @f3(ptr %ptr) #0 { ret void } +; Test nearbyint for f16. 
+declare half @llvm.experimental.constrained.nearbyint.f16(half, metadata, metadata) +define half @f4_half(half %f) #0 { +; CHECK-LABEL: f4_half: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: brasl %r14, nearbyintf@PLT +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %res = call half @llvm.experimental.constrained.nearbyint.f16( + half %f, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret half %res +} + ; Test nearbyint for f32. declare float @llvm.experimental.constrained.nearbyint.f32(float, metadata, metadata) define float @f4(float %f) #0 { @@ -84,6 +99,20 @@ define void @f6(ptr %ptr) #0 { ret void } +; Test floor for f16. +declare half @llvm.experimental.constrained.floor.f16(half, metadata) +define half @f7_half(half %f) #0 { +; CHECK-LABEL: f7_half: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: brasl %r14, floorf@PLT +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %res = call half @llvm.experimental.constrained.floor.f16( + half %f, + metadata !"fpexcept.strict") #0 + ret half %res +} + ; Test floor for f32. declare float @llvm.experimental.constrained.floor.f32(float, metadata) define float @f7(float %f) #0 { @@ -122,6 +151,20 @@ define void @f9(ptr %ptr) #0 { ret void } +; Test ceil for f16. +declare half @llvm.experimental.constrained.ceil.f16(half, metadata) +define half @f10_half(half %f) #0 { +; CHECK-LABEL: f10_half: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: brasl %r14, ceilf@PLT +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %res = call half @llvm.experimental.constrained.ceil.f16( + half %f, + metadata !"fpexcept.strict") #0 + ret half %res +} + ; Test ceil for f32. declare float @llvm.experimental.constrained.ceil.f32(float, metadata) define float @f10(float %f) #0 { @@ -160,6 +203,20 @@ define void @f12(ptr %ptr) #0 { ret void } +; Test trunc for f16. 
+declare half @llvm.experimental.constrained.trunc.f16(half, metadata) +define half @f13_half(half %f) #0 { +; CHECK-LABEL: f13_half: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: brasl %r14, truncf@PLT +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %res = call half @llvm.experimental.constrained.trunc.f16( + half %f, + metadata !"fpexcept.strict") #0 + ret half %res +} + ; Test trunc for f32. declare float @llvm.experimental.constrained.trunc.f32(float, metadata) define float @f13(float %f) #0 { @@ -198,6 +255,20 @@ define void @f15(ptr %ptr) #0 { ret void } +; Test round for f16. +declare half @llvm.experimental.constrained.round.f16(half, metadata) +define half @f16_half(half %f) #0 { +; CHECK-LABEL: f16_half: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: brasl %r14, roundf@PLT +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %res = call half @llvm.experimental.constrained.round.f16( + half %f, + metadata !"fpexcept.strict") #0 + ret half %res +} + ; Test round for f32. declare float @llvm.experimental.constrained.round.f32(float, metadata) define float @f16(float %f) #0 { diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-round-02.ll b/llvm/test/CodeGen/SystemZ/fp-strict-round-02.ll index c7b721e3770e5..bdfd9adf2b400 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-round-02.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-round-02.ll @@ -164,6 +164,20 @@ define void @f12(ptr %ptr) #0 { ret void } +; Test trunc for f16. +declare half @llvm.experimental.constrained.trunc.f16(half, metadata) +define half @f13_half(half %f) #0 { +; CHECK-LABEL: f13_half: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: fiebra %f0, 5, %f0, 4 +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %res = call half @llvm.experimental.constrained.trunc.f16( + half %f, + metadata !"fpexcept.strict") #0 + ret half %res +} + ; Test trunc for f32. 
declare float @llvm.experimental.constrained.trunc.f32(float, metadata) define float @f13(float %f) #0 { @@ -202,6 +216,20 @@ define void @f15(ptr %ptr) #0 { ret void } +; Test round for f16. +declare half @llvm.experimental.constrained.round.f16(half, metadata) +define half @f16_half(half %f) #0 { +; CHECK-LABEL: f16_half: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: fiebra %f0, 1, %f0, 4 +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %res = call half @llvm.experimental.constrained.round.f16( + half %f, + metadata !"fpexcept.strict") #0 + ret half %res +} + ; Test round for f32. declare float @llvm.experimental.constrained.round.f32(float, metadata) define float @f16(float %f) #0 { @@ -240,6 +268,20 @@ define void @f18(ptr %ptr) #0 { ret void } +; Test roundeven for f16. +declare half @llvm.experimental.constrained.roundeven.f16(half, metadata) +define half @f19_half(half %f) #0 { +; CHECK-LABEL: f19_half: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: fiebra %f0, 4, %f0, 4 +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %res = call half @llvm.experimental.constrained.roundeven.f16( + half %f, + metadata !"fpexcept.strict") #0 + ret half %res +} + ; Test roundeven for f32. declare float @llvm.experimental.constrained.roundeven.f32(float, metadata) define float @f19(float %f) #0 { diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-round-03.ll b/llvm/test/CodeGen/SystemZ/fp-strict-round-03.ll index e99d8b0f01650..5d9ee28ae8ea2 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-round-03.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-round-03.ll @@ -2,6 +2,21 @@ ; ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s +; Test rint for f16. 
+declare half @llvm.experimental.constrained.rint.f16(half, metadata, metadata) +define half @f0(half %f) #0 { +; CHECK-LABEL: f0: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: fiebra %f0, 0, %f0, 0 +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %res = call half @llvm.experimental.constrained.rint.f16( + half %f, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret half %res +} + ; Test rint for f32. declare float @llvm.experimental.constrained.rint.f32(float, metadata, metadata) define float @f1(float %f) #0 { diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-sqrt-01.ll b/llvm/test/CodeGen/SystemZ/fp-strict-sqrt-01.ll index 88cdb71ff7d12..2db86d2de7f66 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-sqrt-01.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-sqrt-01.ll @@ -6,8 +6,24 @@ ; Test strict 32-bit square root. ; +declare half @llvm.experimental.constrained.sqrt.f16(half, metadata, metadata) declare float @llvm.experimental.constrained.sqrt.f32(float, metadata, metadata) +; Check register square root. +define half @f0(half %val) #0 { +; CHECK-LABEL: f0: +; CHECK: # %bb.0: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: sqebr %f0, %f0 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %res = call half @llvm.experimental.constrained.sqrt.f16( + half %val, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret half %res +} + ; Check register square root. 
define float @f1(float %val) #0 { ; CHECK-LABEL: f1: diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-sub-01.ll b/llvm/test/CodeGen/SystemZ/fp-strict-sub-01.ll index a677d471397f7..da91b6e69fd5f 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-sub-01.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-sub-01.ll @@ -5,8 +5,24 @@ ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s declare float @foo() +declare half @llvm.experimental.constrained.fsub.f16(half, half, metadata, metadata) declare float @llvm.experimental.constrained.fsub.f32(float, float, metadata, metadata) +; Check register subtraction. +define half @f0(half %f1, half %f2) #0 { +; CHECK-LABEL: f0: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: sebr %f0, %f9 +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %res = call half @llvm.experimental.constrained.fsub.f16( + half %f1, half %f2, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret half %res +} + ; Check register subtraction. define float @f1(float %f1, float %f2) #0 { ; CHECK-LABEL: f1: diff --git a/llvm/test/CodeGen/SystemZ/fp-sub-01.ll b/llvm/test/CodeGen/SystemZ/fp-sub-01.ll index e875fa3be735b..7359f10f92852 100644 --- a/llvm/test/CodeGen/SystemZ/fp-sub-01.ll +++ b/llvm/test/CodeGen/SystemZ/fp-sub-01.ll @@ -6,6 +6,18 @@ declare float @foo() +; Check register subtraction. +define half @f0(half %f1, half %f2) { +; CHECK-LABEL: f0: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: sebr %f0, %f9 +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %res = fsub half %f1, %f2 + ret half %res +} + ; Check register subtraction. 
define float @f1(float %f1, float %f2) { ; CHECK-LABEL: f1: diff --git a/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting-explicit-regs-zEC12.ll b/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting-explicit-regs-zEC12.ll index 44175f924f7fc..6228ffaa35fa2 100644 --- a/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting-explicit-regs-zEC12.ll +++ b/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting-explicit-regs-zEC12.ll @@ -3,6 +3,23 @@ ; ; Test inline assembly where the operand is bitcasted. +define signext i16 @short_and_f(i16 signext %cc_dep1) { +; CHECK-LABEL: short_and_f: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sllg %r0, %r2, 48 +; CHECK-NEXT: ldgr %f1, %r0 +; CHECK-NEXT: # kill: def $f1h killed $f1h killed $f1d +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: # kill: def $f1h killed $f1h def $f1d +; CHECK-NEXT: lgdr %r0, %f1 +; CHECK-NEXT: srag %r2, %r0, 48 +; CHECK-NEXT: br %r14 +entry: + %0 = tail call i16 asm sideeffect "", "={f1},0"(i16 %cc_dep1) + ret i16 %0 +} + define signext i32 @int_and_f(i32 signext %cc_dep1) { ; CHECK-LABEL: int_and_f: ; CHECK: # %bb.0: # %entry @@ -51,6 +68,23 @@ entry: ret void } +define half @half_and_r(half %cc_dep1) { +; CHECK-LABEL: half_and_r: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d +; CHECK-NEXT: lgdr %r0, %f0 +; CHECK-NEXT: srlg %r2, %r0, 48 +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: sllg %r0, %r2, 48 +; CHECK-NEXT: ldgr %f0, %r0 +; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d +; CHECK-NEXT: br %r14 +entry: + %0 = tail call half asm sideeffect "", "={r2},0"(half %cc_dep1) + ret half %0 +} + define float @float_and_r(float %cc_dep1) { ; CHECK-LABEL: float_and_r: ; CHECK: # %bb.0: # %entry diff --git a/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting-explicit-regs.ll b/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting-explicit-regs.ll index 3cbf3d21dec5a..cf4dbbff8bec0 100644 --- 
a/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting-explicit-regs.ll +++ b/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting-explicit-regs.ll @@ -4,6 +4,20 @@ ; ; Test inline assembly where the operand is bitcasted. +define signext i16 @short_and_f(i16 signext %cc_dep1) { +; CHECK-LABEL: short_and_f: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vlvgh %v0, %r2, 0 +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vlgvh %r0, %v0, 0 +; CHECK-NEXT: lghr %r2, %r0 +; CHECK-NEXT: br %r14 +entry: + %0 = tail call i16 asm sideeffect "", "={f0},0"(i16 %cc_dep1) + ret i16 %0 +} + define signext i32 @int_and_f(i32 signext %cc_dep1) { ; CHECK-LABEL: int_and_f: ; CHECK: # %bb.0: # %entry @@ -101,6 +115,19 @@ entry: ret void } +define half @half_and_r(half %cc_dep1) { +; CHECK-LABEL: half_and_r: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vlgvh %r0, %v0, 0 +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vlvgh %v0, %r0, 0 +; CHECK-NEXT: br %r14 +entry: + %0 = tail call half asm sideeffect "", "={r0},0"(half %cc_dep1) + ret half %0 +} + define float @float_and_r(float %cc_dep1) { ; CHECK-LABEL: float_and_r: ; CHECK: # %bb.0: # %entry @@ -145,6 +172,19 @@ entry: ret void } +define half @half_and_v(half %cc_dep1) { +; CHECK-LABEL: half_and_v: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: ldr %f3, %f0 +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: ldr %f0, %f3 +; CHECK-NEXT: br %r14 +entry: + %0 = tail call half asm sideeffect "", "={v3},0"(half %cc_dep1) + ret half %0 +} + define float @float_and_v(float %cc_dep1) { ; CHECK-LABEL: float_and_v: ; CHECK: # %bb.0: # %entry diff --git a/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting-zEC12.ll b/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting-zEC12.ll index 1ef6eece80acb..19969ccf4e297 100644 --- a/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting-zEC12.ll +++ b/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting-zEC12.ll @@ -3,6 +3,21 @@ ; ; Test inline assembly where the operand is bitcasted. 
+define signext i16 @short_and_f(i16 signext %cc_dep1) { +; CHECK-LABEL: short_and_f: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sllg %r0, %r2, 48 +; CHECK-NEXT: ldgr %f0, %r0 +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lgdr %r0, %f0 +; CHECK-NEXT: srag %r2, %r0, 48 +; CHECK-NEXT: br %r14 +entry: + %0 = tail call i16 asm sideeffect "", "=f,0"(i16 %cc_dep1) + ret i16 %0 +} + define signext i32 @int_and_f(i32 signext %cc_dep1) { ; CHECK-LABEL: int_and_f: ; CHECK: # %bb.0: # %entry @@ -49,6 +64,23 @@ entry: ret void } +define half @half_and_r(half %cc_dep1) { +; CHECK-LABEL: half_and_r: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d +; CHECK-NEXT: lgdr %r0, %f0 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f0, %r0 +; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d +; CHECK-NEXT: br %r14 +entry: + %0 = tail call half asm sideeffect "", "=r,0"(half %cc_dep1) + ret half %0 +} + define float @float_and_r(float %cc_dep1) { ; CHECK-LABEL: float_and_r: ; CHECK: # %bb.0: # %entry diff --git a/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting.ll b/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting.ll index 23d78a9315b40..b23b40e0f0e90 100644 --- a/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting.ll +++ b/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting.ll @@ -4,6 +4,20 @@ ; ; Test inline assembly where the operand is bitcasted. 
+define signext i16 @short_and_f(i16 signext %cc_dep1) { +; CHECK-LABEL: short_and_f: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vlvgh %v0, %r2, 0 +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vlgvh %r0, %v0, 0 +; CHECK-NEXT: lghr %r2, %r0 +; CHECK-NEXT: br %r14 +entry: + %0 = tail call i16 asm sideeffect "", "=f,0"(i16 %cc_dep1) + ret i16 %0 +} + define signext i32 @int_and_f(i32 signext %cc_dep1) { ; CHECK-LABEL: int_and_f: ; CHECK: # %bb.0: # %entry @@ -58,6 +72,20 @@ entry: ret void } +define signext i16 @short_and_v(i16 signext %cc_dep1) { +; CHECK-LABEL: short_and_v: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vlvgh %v0, %r2, 0 +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vlgvh %r0, %v0, 0 +; CHECK-NEXT: lghr %r2, %r0 +; CHECK-NEXT: br %r14 +entry: + %0 = tail call i16 asm sideeffect "", "=v,0"(i16 %cc_dep1) + ret i16 %0 +} + define signext i32 @int_and_v(i32 signext %cc_dep1) { ; CHECK-LABEL: int_and_v: ; CHECK: # %bb.0: # %entry @@ -100,6 +128,19 @@ entry: ret void } +define half @half_and_r(half %cc_dep1) { +; CHECK-LABEL: half_and_r: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vlgvh %r0, %v0, 0 +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vlvgh %v0, %r0, 0 +; CHECK-NEXT: br %r14 +entry: + %0 = tail call half asm sideeffect "", "=r,0"(half %cc_dep1) + ret half %0 +} + define float @float_and_r(float %cc_dep1) { ; CHECK-LABEL: float_and_r: ; CHECK: # %bb.0: # %entry @@ -143,6 +184,17 @@ entry: ret void } +define half @half_and_v(half %cc_dep1) { +; CHECK-LABEL: half_and_v: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: br %r14 +entry: + %0 = tail call half asm sideeffect "", "=v,0"(half %cc_dep1) + ret half %0 +} + define float @float_and_v(float %cc_dep1) { ; CHECK-LABEL: float_and_v: ; CHECK: # %bb.0: # %entry diff --git a/llvm/test/CodeGen/SystemZ/is_fpclass.ll b/llvm/test/CodeGen/SystemZ/is_fpclass.ll index 7a02730047d20..98b856c5737ed 100644 --- 
a/llvm/test/CodeGen/SystemZ/is_fpclass.ll +++ b/llvm/test/CodeGen/SystemZ/is_fpclass.ll @@ -3,11 +3,30 @@ ; ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +declare i1 @llvm.is.fpclass.f16(half, i32) declare i1 @llvm.is.fpclass.f32(float, i32) declare i1 @llvm.is.fpclass.f64(double, i32) declare i1 @llvm.is.fpclass.f128(fp128, i32) +define i1 @isnan_h(half %x) { +; CHECK-LABEL: isnan_h: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: tceb %f0, 15 +; CHECK-NEXT: ipm %r2 +; CHECK-NEXT: srl %r2, 28 +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 + %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 3) ; nan + ret i1 %1 +} + define i1 @isnan_f(float %x) { ; CHECK-LABEL: isnan_f: ; CHECK: # %bb.0: diff --git a/llvm/test/CodeGen/SystemZ/spill-half-01.mir b/llvm/test/CodeGen/SystemZ/spill-half-01.mir new file mode 100644 index 0000000000000..2680d0225459b --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/spill-half-01.mir @@ -0,0 +1,63 @@ +# RUN: llc -o - %s -mtriple=s390x-linux-gnu -mcpu=zEC12 -verify-machineinstrs \ +# RUN: -start-before=greedy | FileCheck %s -check-prefix=CHECK +# RUN: llc -o - %s -mtriple=s390x-linux-gnu -mcpu=z16 -verify-machineinstrs \ +# RUN: -start-before=greedy | FileCheck %s -check-prefix=VECTOR + +# Test spilling / reloading fp16bit virtual registers. 
+ +--- +name: fun0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $f0h, $f2h, $f4h + + ; CHECK-LABEL: fun0: + ; CHECK: aghi %r15, -240 + ; CHECK: ste %f4, 172(%r15) # 4-byte Spill + ; CHECK-NEXT: ste %f2, 164(%r15) # 4-byte Spill + ; CHECK-NEXT: ste %f0, 168(%r15) # 4-byte Spill + ; CHECK-NEXT: #APP + ; CHECK-NEXT: #NO_APP + ; CHECK-NEXT: le %f0, 164(%r15) # 4-byte Reload + ; CHECK: le %f0, 168(%r15) # 4-byte Reload + ; CHECK: le %f0, 172(%r15) # 4-byte Reload + + ; VECTOR-LABEL: fun0: + ; VECTOR: aghi %r15, -232 + ; VECTOR: vsteh %v4, 166(%r15), 0 # 2-byte Spill + ; VECTOR-NEXT: vsteh %v2, 162(%r15), 0 # 2-byte Spill + ; VECTOR-NEXT: vsteh %v0, 164(%r15), 0 # 2-byte Spill + ; VECTOR-NEXT: #APP + ; VECTOR-NEXT: #NO_APP + ; VECTOR-NEXT: vlreph %v0, 162(%r15) # 2-byte Reload + ; VECTOR: vlreph %v0, 164(%r15) # 2-byte Reload + ; VECTOR: vlreph %v0, 166(%r15) # 2-byte Reload + + %2:fp16bit = COPY $f4h + %1:fp16bit = COPY $f2h + %0:fp16bit = COPY $f0h + INLINEASM &"", 1, 12, implicit-def dead early-clobber $f0d, 12, implicit-def dead early-clobber $f1d, 12, implicit-def dead early-clobber $f2d, 12, implicit-def dead early-clobber $f3d, 12, implicit-def dead early-clobber $f4d, 12, implicit-def dead early-clobber $f5d, 12, implicit-def dead early-clobber $f6d, 12, implicit-def dead early-clobber $f7d, 12, implicit-def dead early-clobber $f8d, 12, implicit-def dead early-clobber $f9d, 12, implicit-def dead early-clobber $f10d, 12, implicit-def dead early-clobber $f11d, 12, implicit-def dead early-clobber $f12d, 12, implicit-def dead early-clobber $f13d, 12, implicit-def dead early-clobber $f14d, 12, implicit-def dead early-clobber $f14d, 12, implicit-def dead early-clobber $f15d + $f0h = COPY %1 + CallBRASL &__extendhfsf2, $f0h, csr_systemz_elf, implicit-def dead $r14d, implicit-def dead $cc, implicit $fpc, implicit-def $f0s + %3:fp32bit = COPY $f0s + $f0h = COPY %0 + CallBRASL &__extendhfsf2, $f0h, csr_systemz_elf, implicit-def dead $r14d, implicit-def dead $cc, 
implicit $fpc, implicit-def $f0s + %5:fp32bit = COPY $f0s + %5:fp32bit = nofpexcept AEBR %5, %3, implicit-def dead $cc, implicit $fpc + $f0s = COPY %5 + CallBRASL &__truncsfhf2, $f0s, csr_systemz_elf, implicit-def dead $r14d, implicit-def dead $cc, implicit $fpc, implicit-def $f0h + %6:fp16bit = COPY $f0h + $f0h = COPY %6 + CallBRASL &__extendhfsf2, $f0h, csr_systemz_elf, implicit-def dead $r14d, implicit-def dead $cc, implicit $fpc, implicit-def $f0s + %7:fp32bit = COPY $f0s + $f0h = COPY %2 + CallBRASL &__extendhfsf2, $f0h, csr_systemz_elf, implicit-def dead $r14d, implicit-def dead $cc, implicit $fpc, implicit-def $f0s + %9:fp32bit = COPY $f0s + %9:fp32bit = nofpexcept AEBR %9, %7, implicit-def dead $cc, implicit $fpc + $f0s = COPY %9 + CallBRASL &__truncsfhf2, $f0s, csr_systemz_elf, implicit-def dead $r14d, implicit-def dead $cc, implicit $fpc, implicit-def $f0h + %10:fp16bit = COPY $f0h + $f0h = COPY %10 + Return implicit $f0h +... diff --git a/llvm/test/CodeGen/SystemZ/spill-half-02.mir b/llvm/test/CodeGen/SystemZ/spill-half-02.mir new file mode 100644 index 0000000000000..724b5d352b298 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/spill-half-02.mir @@ -0,0 +1,27 @@ +# RUN: llc -o - %s -mtriple=s390x-linux-gnu -mcpu=z16 -verify-machineinstrs \ +# RUN: -start-before=greedy | FileCheck %s + +# Test spilling / reloading of a vr16bit virtual register.
+ +--- +name: fun0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $r2d, $r3d + + ; CHECK-LABEL: fun0: + ; CHECK: vlreph %v0, 0(%r2) + ; CHECK-NEXT: vsteh %v0, 166(%r15), 0 # 2-byte Spill + ; CHECK-NEXT: #APP + ; CHECK-NEXT: #NO_APP + ; CHECK-NEXT: vlreph %v0, 166(%r15) # 2-byte Reload + ; CHECK-NEXT: vsteh %v0, 0(%r3), 0 + + %1:addr64bit = COPY $r3d + %0:addr64bit = COPY $r2d + %2:vr16bit = VL16 %0, 0, $noreg + INLINEASM &"", 1, 12, implicit-def dead early-clobber $f0d, 12, implicit-def dead early-clobber $f1d, 12, implicit-def dead early-clobber $f2d, 12, implicit-def dead early-clobber $f3d, 12, implicit-def dead early-clobber $f4d, 12, implicit-def dead early-clobber $f5d, 12, implicit-def dead early-clobber $f6d, 12, implicit-def dead early-clobber $f7d, 12, implicit-def dead early-clobber $f8d, 12, implicit-def dead early-clobber $f9d, 12, implicit-def dead early-clobber $f10d, 12, implicit-def dead early-clobber $f11d, 12, implicit-def dead early-clobber $f12d, 12, implicit-def dead early-clobber $f13d, 12, implicit-def dead early-clobber $f14d, 12, implicit-def dead early-clobber $f14d, 12, implicit-def dead early-clobber $f15d, 12, implicit-def dead early-clobber $f16d, 12, implicit-def dead early-clobber $f17d, 12, implicit-def dead early-clobber $f18d, 12, implicit-def dead early-clobber $f19d, 12, implicit-def dead early-clobber $f20d, 12, implicit-def dead early-clobber $f21d, 12, implicit-def dead early-clobber $f22d, 12, implicit-def dead early-clobber $f23d, 12, implicit-def dead early-clobber $f24d, 12, implicit-def dead early-clobber $f25d, 12, implicit-def dead early-clobber $f26d, 12, implicit-def dead early-clobber $f27d, 12, implicit-def dead early-clobber $f28d, 12, implicit-def dead early-clobber $f29d, 12, implicit-def dead early-clobber $f30d, 12, implicit-def dead early-clobber $f31d + VST16 %2, %1, 0, $noreg + Return +... 
diff --git a/llvm/test/CodeGen/SystemZ/stackmap.ll b/llvm/test/CodeGen/SystemZ/stackmap.ll index 6156b7f2fc5a1..05b8de756c032 100644 --- a/llvm/test/CodeGen/SystemZ/stackmap.ll +++ b/llvm/test/CodeGen/SystemZ/stackmap.ll @@ -553,7 +553,14 @@ declare void @escape_values(...) ; CHECK-LABEL: .long .L{{.*}}-floats ; CHECK-NEXT: .short 0 ; Num Locations -; CHECK-NEXT: .short 6 +; CHECK-NEXT: .short 9 +; Loc 0: constant half stored to FP register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .short 4 +; CHECK-NEXT: .short {{.*}} +; CHECK-NEXT: .short 0 +; CHECK-NEXT: .long 16 ; Loc 0: constant float stored to FP register ; CHECK-NEXT: .byte 1 ; CHECK-NEXT: .byte 0 @@ -568,6 +575,13 @@ declare void @escape_values(...) ; CHECK-NEXT: .short {{.*}} ; CHECK-NEXT: .short 0 ; CHECK-NEXT: .long 0 +; Loc 1: half value in FP register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .short 4 +; CHECK-NEXT: .short {{.*}} +; CHECK-NEXT: .short 0 +; CHECK-NEXT: .long 16 ; Loc 1: float value in FP register ; CHECK-NEXT: .byte 1 ; CHECK-NEXT: .byte 0 @@ -582,6 +596,13 @@ declare void @escape_values(...) ; CHECK-NEXT: .short {{.*}} ; CHECK-NEXT: .short 0 ; CHECK-NEXT: .long 0 +; Loc 3: half on stack +; CHECK-NEXT: .byte 2 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .short 8 +; CHECK-NEXT: .short {{.*}} +; CHECK-NEXT: .short 0 +; CHECK-NEXT: .long {{.*}} ; Loc 3: float on stack ; CHECK-NEXT: .byte 2 ; CHECK-NEXT: .byte 0 @@ -596,11 +617,12 @@ declare void @escape_values(...) ; CHECK-NEXT: .short {{.*}} ; CHECK-NEXT: .short 0 ; CHECK-NEXT: .long {{.*}} -define void @floats(float %f, double %g) { +define void @floats(half %e, float %f, double %g) { + %hh = alloca half %ff = alloca float %gg = alloca double - call void (i64, i32, ...) @llvm.experimental.stackmap(i64 888, i32 0, float 1.25, - double 1.5, float %f, double %g, ptr %ff, ptr %gg) + call void (i64, i32, ...) 
@llvm.experimental.stackmap(i64 888, i32 0, half 1.125, + float 1.25, double 1.5, half %e, float %f, double %g, ptr %hh, ptr %ff, ptr %gg) ret void } diff --git a/llvm/test/CodeGen/SystemZ/tdc-01.ll b/llvm/test/CodeGen/SystemZ/tdc-01.ll index 052d895b798f6..a0c090f463a2c 100644 --- a/llvm/test/CodeGen/SystemZ/tdc-01.ll +++ b/llvm/test/CodeGen/SystemZ/tdc-01.ll @@ -2,10 +2,22 @@ ; ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +declare i32 @llvm.s390.tdc.f16(half, i64) declare i32 @llvm.s390.tdc.f32(float, i64) declare i32 @llvm.s390.tdc.f64(double, i64) declare i32 @llvm.s390.tdc.f128(fp128, i64) +; Check using as i32 - f16 +define i32 @f0(half %x) { +; CHECK-LABEL: f0 +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: tceb %f0, 123 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 + %res = call i32 @llvm.s390.tdc.f16(half %x, i64 123) + ret i32 %res +} + ; Check using as i32 - f32 define i32 @f1(float %x) { ; CHECK-LABEL: f1 diff --git a/llvm/test/CodeGen/SystemZ/tdc-02.ll b/llvm/test/CodeGen/SystemZ/tdc-02.ll index c0c4ac84349e3..ceb397c6cb9cb 100644 --- a/llvm/test/CodeGen/SystemZ/tdc-02.ll +++ b/llvm/test/CodeGen/SystemZ/tdc-02.ll @@ -2,10 +2,27 @@ ; ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +declare i32 @llvm.s390.tdc.f16(half, i64) declare i32 @llvm.s390.tdc.f32(float, i64) declare i32 @llvm.s390.tdc.f64(double, i64) declare i32 @llvm.s390.tdc.f128(fp128, i64) +; Check using or i1 +define i32 @f0(half %x) { +; CHECK-LABEL: f0 +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: tceb %f0, 7 +; CHECK-NEXT: ipm [[REG1:%r[0-9]+]] +; CHECK-NEXT: risbg %r2, [[REG1]], 63, 191, 36 + %a = call i32 @llvm.s390.tdc.f16(half %x, i64 3) + %b = call i32 @llvm.s390.tdc.f16(half %x, i64 6) + %a1 = icmp ne i32 %a, 0 + %b1 = icmp ne i32 %b, 0 + %res = or i1 %a1, %b1 + %xres = zext i1 %res to i32 + ret i32 %xres +} + ; Check using or i1 define i32 @f1(float %x) { ; CHECK-LABEL: f1 diff --git a/llvm/test/CodeGen/SystemZ/tdc-03.ll b/llvm/test/CodeGen/SystemZ/tdc-03.ll 
index 95708f1effc6b..b6c12caef72fd 100644 --- a/llvm/test/CodeGen/SystemZ/tdc-03.ll +++ b/llvm/test/CodeGen/SystemZ/tdc-03.ll @@ -3,10 +3,23 @@ ; ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +declare half @llvm.fabs.f16(half) declare float @llvm.fabs.f32(float) declare double @llvm.fabs.f64(double) declare fp128 @llvm.fabs.f128(fp128) +; Compare with 0 (unworthy) +define i32 @f0(half %x) { +; CHECK-LABEL: f0 +; CHECK-NOT: tceb +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: ltebr {{%f[0-9]+}}, %f0 +; CHECK-NOT: tceb + %res = fcmp ugt half %x, 0.0 + %xres = zext i1 %res to i32 + ret i32 %xres +} + ; Compare with 0 (unworthy) define i32 @f1(float %x) { ; CHECK-LABEL: f1 @@ -41,9 +54,20 @@ define i32 @f3(float %x) { ret i32 %xres } +; Compare fabs with inf +define i32 @f4_half(half %x) { +; CHECK-LABEL: f4_half: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: tceb %f0, 4047 + %y = call half @llvm.fabs.f16(half %x) + %res = fcmp ult half %y, 0x7ff0000000000000 + %xres = zext i1 %res to i32 + ret i32 %xres +} + ; Compare fabs with inf define i32 @f4(float %x) { -; CHECK-LABEL: f4 +; CHECK-LABEL: f4: ; CHECK: tceb %f0, 4047 %y = call float @llvm.fabs.f32(float %x) %res = fcmp ult float %y, 0x7ff0000000000000 diff --git a/llvm/test/CodeGen/SystemZ/tdc-04.ll b/llvm/test/CodeGen/SystemZ/tdc-04.ll index 8cc78f3de7522..bc719640e630d 100644 --- a/llvm/test/CodeGen/SystemZ/tdc-04.ll +++ b/llvm/test/CodeGen/SystemZ/tdc-04.ll @@ -1,10 +1,24 @@ ; Test the Test Data Class instruction logic operation conversion from ; signbit extraction. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s --check-prefixes=CHECK,Z10 +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s \ +; RUN: --check-prefixes=CHECK,Z13 ; +; Extract sign bit. 
+define i32 @f0(half %x) { +; CHECK-LABEL: f0 +; Z10: lgdr %r0, %f0 +; Z10: srlg %r2, %r0, 63 +; Z13: vlgvh %r0, %v0, 0 +; Z13: risblg %r2, %r0, 31, 159, 49 + %cast = bitcast half %x to i16 + %res = icmp slt i16 %cast, 0 + %xres = zext i1 %res to i32 + ret i32 %xres +} + ; Extract sign bit. define i32 @f1(float %x) { ; CHECK-LABEL: f1 diff --git a/llvm/test/CodeGen/SystemZ/tdc-05.ll b/llvm/test/CodeGen/SystemZ/tdc-05.ll index c639a9b7b4757..becf293c21f1f 100644 --- a/llvm/test/CodeGen/SystemZ/tdc-05.ll +++ b/llvm/test/CodeGen/SystemZ/tdc-05.ll @@ -8,6 +8,31 @@ declare float @llvm.fabs.f32(float) declare double @llvm.fabs.f64(double) declare fp128 @llvm.fabs.f128(fp128) +; Compare with 0, extract sign bit +define i32 @f0(half %x) { +; CHECK-LABEL: f0 +; CHECK: lgdr %r0, %f0 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: lhr %r0, %r0 +; CHECK-NEXT: chi %r0, 0 +; CHECK-NEXT: ipm %r0 +; CHECK-NEXT: risbg %r13, %r0, 63, 191, 36 +; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ltebr %f0, %f0 +; CHECK-NEXT: ipm %r0 +; CHECK-NEXT: rosbg %r13, %r0, 63, 63, 35 +; CHECK-NEXT: lr %r2, %r13 +; CHECK-NEXT: lmg %r13, %r15, 264(%r15) +; CHECK-NEXT: br %r14 + %cast = bitcast half %x to i16 + %sign = icmp slt i16 %cast, 0 + %fcmp = fcmp ugt half %x, 0.0 + %res = or i1 %sign, %fcmp + %xres = zext i1 %res to i32 + ret i32 %xres +} + ; Compare with 0, extract sign bit define i32 @f1(float %x) { ; CHECK-LABEL: f1 diff --git a/llvm/test/CodeGen/SystemZ/tdc-06.ll b/llvm/test/CodeGen/SystemZ/tdc-06.ll index 4ebf020c973da..19536b26eb5ae 100644 --- a/llvm/test/CodeGen/SystemZ/tdc-06.ll +++ b/llvm/test/CodeGen/SystemZ/tdc-06.ll @@ -3,9 +3,7 @@ ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s ; -declare float @llvm.fabs.f32(float) declare double @llvm.fabs.f64(double) -declare fp128 @llvm.fabs.f128(fp128) define i32 @fpc(double %x) { entry: diff --git a/llvm/test/CodeGen/SystemZ/twoaddr-kill.mir 
b/llvm/test/CodeGen/SystemZ/twoaddr-kill.mir index 7fc7bd3e347bb..95ba0b4bf3466 100644 --- a/llvm/test/CodeGen/SystemZ/twoaddr-kill.mir +++ b/llvm/test/CodeGen/SystemZ/twoaddr-kill.mir @@ -18,19 +18,19 @@ body: | ; CHECK-NEXT: $r2l = COPY [[COPY]] ; CHECK-NEXT: $r3l = COPY killed [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]]:grh32bit = COPY killed [[COPY1]] - ; CHECK-NEXT: INLINEASM &"stepa $1, $2, $3", 0 /* attdialect */, 393226 /* regdef:GRH32Bit */, def [[COPY2]], 2147483657 /* reguse tiedto:$0 */, [[COPY2]](tied-def 3), 9 /* reguse */, killed $r2l, 9 /* reguse */, killed $r3l + ; CHECK-NEXT: INLINEASM &"stepa $1, $2, $3", 0 /* attdialect */, 524298 /* regdef:GRH32Bit */, def [[COPY2]], 2147483657 /* reguse tiedto:$0 */, [[COPY2]](tied-def 3), 9 /* reguse */, killed $r2l, 9 /* reguse */, killed $r3l ; CHECK-NEXT: [[COPY3:%[0-9]+]]:grh32bit = COPY killed [[COPY2]] ; CHECK-NEXT: [[COPY4:%[0-9]+]]:grh32bit = COPY [[COPY3]] - ; CHECK-NEXT: INLINEASM &"stepb $1, $2", 0 /* attdialect */, 393227 /* regdef-ec:GRH32Bit */, def early-clobber [[COPY4]], 2147483657 /* reguse tiedto:$0 */, [[COPY4]](tied-def 3), 393225 /* reguse:GRH32Bit */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"stepb $1, $2", 0 /* attdialect */, 524299 /* regdef-ec:GRH32Bit */, def early-clobber [[COPY4]], 2147483657 /* reguse tiedto:$0 */, [[COPY4]](tied-def 3), 524297 /* reguse:GRH32Bit */, [[COPY3]] ; CHECK-NEXT: $r2l = COPY killed [[COPY4]] ; CHECK-NEXT: Return implicit killed $r2l %0:gr32bit = COPY killed $r2l %2:grh32bit = COPY %0 $r2l = COPY %0 $r3l = COPY killed %0 - INLINEASM &"stepa $1, $2, $3", 0 /* attdialect */, 393226 /* regdef:GRH32Bit */, def %1:grh32bit, 2147483657 /* reguse tiedto:$0 */, killed %2(tied-def 3), 9 /* reguse */, killed $r2l, 9 /* reguse */, killed $r3l + INLINEASM &"stepa $1, $2, $3", 0 /* attdialect */, 524298 /* regdef:GRH32Bit */, def %1:grh32bit, 2147483657 /* reguse tiedto:$0 */, killed %2(tied-def 3), 9 /* reguse */, killed $r2l, 9 /* reguse */, killed $r3l %4:grh32bit = 
COPY killed %1 - INLINEASM &"stepb $1, $2", 0 /* attdialect */, 393227 /* regdef-ec:GRH32Bit */, def early-clobber %3:grh32bit, 2147483657 /* reguse tiedto:$0 */, killed %4(tied-def 3), 393225 /* reguse:GRH32Bit */, %4 + INLINEASM &"stepb $1, $2", 0 /* attdialect */, 524299 /* regdef-ec:GRH32Bit */, def early-clobber %3:grh32bit, 2147483657 /* reguse tiedto:$0 */, killed %4(tied-def 3), 524297 /* reguse:GRH32Bit */, %4 $r2l = COPY killed %3 Return implicit killed $r2l ... diff --git a/llvm/test/CodeGen/SystemZ/vec-max-05.ll b/llvm/test/CodeGen/SystemZ/vec-max-05.ll index 7bdf4e06029d2..09d40c77a1fb9 100644 --- a/llvm/test/CodeGen/SystemZ/vec-max-05.ll +++ b/llvm/test/CodeGen/SystemZ/vec-max-05.ll @@ -14,6 +14,8 @@ declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) declare float @llvm.maximum.f32(float, float) declare <4 x float> @llvm.maximum.v4f32(<4 x float>, <4 x float>) +declare half @llvm.maxnum.f16(half, half) + declare fp128 @fmaxl(fp128, fp128) declare fp128 @llvm.maxnum.f128(fp128, fp128) declare fp128 @llvm.maximum.f128(fp128, fp128) @@ -96,6 +98,18 @@ define float @f11(float %dummy, float %val1, float %val2) { ret float %ret } +; Test the f16 maxnum intrinsic. +define half @f12_half(half %dummy, half %val1, half %val2) { +; CHECK-LABEL: f12_half: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: wfmaxsb %f0, %f0, %f9, 4 +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %ret = call half @llvm.maxnum.f16(half %val1, half %val2) + ret half %ret +} + ; Test the f32 maxnum intrinsic. 
define float @f12(float %dummy, float %val1, float %val2) { ; CHECK-LABEL: f12: diff --git a/llvm/test/CodeGen/SystemZ/vec-min-05.ll b/llvm/test/CodeGen/SystemZ/vec-min-05.ll index bf27eb3e56036..b7b288f531041 100644 --- a/llvm/test/CodeGen/SystemZ/vec-min-05.ll +++ b/llvm/test/CodeGen/SystemZ/vec-min-05.ll @@ -14,6 +14,8 @@ declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) declare float @llvm.minimum.f32(float, float) declare <4 x float> @llvm.minimum.v4f32(<4 x float>, <4 x float>) +declare half @llvm.minnum.f16(half, half) + declare fp128 @fminl(fp128, fp128) declare fp128 @llvm.minnum.f128(fp128, fp128) declare fp128 @llvm.minimum.f128(fp128, fp128) @@ -96,6 +98,18 @@ define float @f11(float %dummy, float %val1, float %val2) { ret float %ret } +; Test the f16 minnum intrinsic. +define half @f12_half(half %dummy, half %val1, half %val2) { +; CHECK-LABEL: f12_half: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: wfminsb %f0, %f0, %f9, 4 +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %ret = call half @llvm.minnum.f16(half %val1, half %val2) + ret half %ret +} + ; Test the f32 minnum intrinsic. 
define float @f12(float %dummy, float %val1, float %val2) { ; CHECK-LABEL: f12: diff --git a/llvm/test/CodeGen/SystemZ/vec-strict-max-01.ll b/llvm/test/CodeGen/SystemZ/vec-strict-max-01.ll index 66870a797a7a5..ff1875a731fbf 100644 --- a/llvm/test/CodeGen/SystemZ/vec-strict-max-01.ll +++ b/llvm/test/CodeGen/SystemZ/vec-strict-max-01.ll @@ -12,6 +12,9 @@ declare <4 x float> @llvm.experimental.constrained.maxnum.v4f32(<4 x float>, <4 declare float @llvm.experimental.constrained.maximum.f32(float, float, metadata) declare <4 x float> @llvm.experimental.constrained.maximum.v4f32(<4 x float>, <4 x float>, metadata) +declare half @llvm.experimental.constrained.maxnum.f16(half, half, metadata) +declare half @llvm.experimental.constrained.maximum.f16(half, half, metadata) + declare fp128 @llvm.experimental.constrained.maxnum.f128(fp128, fp128, metadata) declare fp128 @llvm.experimental.constrained.maximum.f128(fp128, fp128, metadata) @@ -38,6 +41,20 @@ define <2 x double> @f2(<2 x double> %dummy, <2 x double> %val1, ret <2 x double> %ret } +; Test the f16 maxnum intrinsic. +define half @f3_half(half %dummy, half %val1, half %val2) #0 { +; CHECK-LABEL: f3_half: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: wfmaxsb %f0, %f0, %f9, 4 +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %ret = call half @llvm.experimental.constrained.maxnum.f16( + half %val1, half %val2, + metadata !"fpexcept.strict") #0 + ret half %ret +} + ; Test the f32 maxnum intrinsic. define float @f3(float %dummy, float %val1, float %val2) #0 { ; CHECK-LABEL: f3: @@ -101,6 +118,20 @@ define <2 x double> @f12(<2 x double> %dummy, <2 x double> %val1, ret <2 x double> %ret } +; Test the f16 maximum intrinsic. 
+define half @f13_half(half %dummy, half %val1, half %val2) #0 { +; CHECK-LABEL: f13_half: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: wfmaxsb %f0, %f0, %f9, 1 +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %ret = call half @llvm.experimental.constrained.maximum.f16( + half %val1, half %val2, + metadata !"fpexcept.strict") #0 + ret half %ret +} + ; Test the f32 maximum intrinsic. define float @f13(float %dummy, float %val1, float %val2) #0 { ; CHECK-LABEL: f13: diff --git a/llvm/test/CodeGen/SystemZ/vec-strict-min-01.ll b/llvm/test/CodeGen/SystemZ/vec-strict-min-01.ll index cf5332ff4f1d1..ddbffd735f699 100644 --- a/llvm/test/CodeGen/SystemZ/vec-strict-min-01.ll +++ b/llvm/test/CodeGen/SystemZ/vec-strict-min-01.ll @@ -12,6 +12,9 @@ declare <4 x float> @llvm.experimental.constrained.minnum.v4f32(<4 x float>, <4 declare float @llvm.experimental.constrained.minimum.f32(float, float, metadata) declare <4 x float> @llvm.experimental.constrained.minimum.v4f32(<4 x float>, <4 x float>, metadata) +declare half @llvm.experimental.constrained.minnum.f16(half, half, metadata) +declare half @llvm.experimental.constrained.minimum.f16(half, half, metadata) + declare fp128 @llvm.experimental.constrained.minnum.f128(fp128, fp128, metadata) declare fp128 @llvm.experimental.constrained.minimum.f128(fp128, fp128, metadata) @@ -38,6 +41,20 @@ define <2 x double> @f2(<2 x double> %dummy, <2 x double> %val1, ret <2 x double> %ret } +; Test the f16 minnum intrinsic. +define half @f3_half(half %dummy, half %val1, half %val2) #0 { +; CHECK-LABEL: f3_half: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: wfminsb %f0, %f0, %f9, 4 +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %ret = call half @llvm.experimental.constrained.minnum.f16( + half %val1, half %val2, + metadata !"fpexcept.strict") #0 + ret half %ret +} + ; Test the f32 minnum intrinsic. 
define float @f3(float %dummy, float %val1, float %val2) #0 { ; CHECK-LABEL: f3: @@ -101,6 +118,20 @@ define <2 x double> @f12(<2 x double> %dummy, <2 x double> %val1, ret <2 x double> %ret } +; Test the f16 minimum intrinsic. +define half @f13_half(half %dummy, half %val1, half %val2) #0 { +; CHECK-LABEL: f13_half: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: wfminsb %f0, %f0, %f9, 1 +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %ret = call half @llvm.experimental.constrained.minimum.f16( + half %val1, half %val2, + metadata !"fpexcept.strict") #0 + ret half %ret +} + ; Test the f32 minimum intrinsic. define float @f13(float %dummy, float %val1, float %val2) #0 { ; CHECK-LABEL: f13: From ad12323fbf8f34fcb3bd3a75ed410d3d5b0ca42c Mon Sep 17 00:00:00 2001 From: Justin Bogner Date: Wed, 16 Apr 2025 11:15:54 -0700 Subject: [PATCH 158/710] [HLSL] Don't invoke `dxv` from `clang-dxc` for text output (#135876) Running `clang-dxc` with textual output was emitting various spurious warnings (if `dxv` wasn't on your path) or errors (if it was). Avoid these by not attempting to run this tool when it doesn't make sense to do so. Fixes #135874.
--- clang/lib/Driver/ToolChains/HLSL.cpp | 3 +++ clang/test/Driver/HLSL/metal-converter.hlsl | 10 ++++++---- clang/test/Driver/dxc_D.hlsl | 3 --- clang/test/Driver/dxc_dxv_path.hlsl | 6 +++--- clang/test/Driver/dxc_options.hlsl | 3 +-- clang/test/Driver/hlsl-lang-targets.hlsl | 8 ++++---- 6 files changed, 17 insertions(+), 16 deletions(-) diff --git a/clang/lib/Driver/ToolChains/HLSL.cpp b/clang/lib/Driver/ToolChains/HLSL.cpp index 22498bff1f251..59e9050af8a76 100644 --- a/clang/lib/Driver/ToolChains/HLSL.cpp +++ b/clang/lib/Driver/ToolChains/HLSL.cpp @@ -309,6 +309,9 @@ HLSLToolChain::TranslateArgs(const DerivedArgList &Args, StringRef BoundArch, } bool HLSLToolChain::requiresValidation(DerivedArgList &Args) const { + if (!Args.hasArg(options::OPT_dxc_Fo)) + return false; + if (Args.getLastArg(options::OPT_dxc_disable_validation)) return false; diff --git a/clang/test/Driver/HLSL/metal-converter.hlsl b/clang/test/Driver/HLSL/metal-converter.hlsl index 536f24be6e73b..3c4257b3fbb28 100644 --- a/clang/test/Driver/HLSL/metal-converter.hlsl +++ b/clang/test/Driver/HLSL/metal-converter.hlsl @@ -1,11 +1,13 @@ -// RUN: %clang_dxc -T cs_6_0 %s -metal -Fo %t.mtl -### 2>&1 | FileCheck %s -// RUN: %clang_dxc -T cs_6_0 %s -metal -Vd -Fo %t.mtl -### 2>&1 | FileCheck %s -// CHECK: "{{.*}}metal-shaderconverter{{(.exe)?}}" "{{.*}}.obj" "-o" "{{.*}}.mtl" +// RUN: echo "dxv" > %T/dxv && chmod 754 %T/dxv + +// RUN: env PATH="" %clang_dxc -T cs_6_0 %s -metal -Fo %t.mtl -### 2>&1 | FileCheck --check-prefix=NO_DXV %s +// RUN: env PATH="" %clang_dxc -T cs_6_0 %s -metal -Vd -Fo %t.mtl -### 2>&1 | FileCheck --check-prefix=NO_DXV %s +// RUN: env PATH="" %clang_dxc -T cs_6_0 %s --dxv-path=%T -metal -Vd -Fo %t.mtl -### 2>&1 | FileCheck --check-prefix=NO_DXV %s +// NO_DXV: "{{.*}}metal-shaderconverter{{(.exe)?}}" "{{.*}}.obj" "-o" "{{.*}}.mtl" // RUN: %clang_dxc -T cs_6_0 %s -metal -### 2>&1 | FileCheck --check-prefix=NO_MTL %s // NO_MTL-NOT: metal-shaderconverter -// RUN: echo "dxv" > %T/dxv 
&& chmod 754 %T/dxv // RUN: %clang_dxc -T cs_6_0 %s --dxv-path=%T -metal -Fo %t.mtl -### 2>&1 | FileCheck --check-prefix=DXV %s // DXV: "{{.*}}dxv{{(.exe)?}}" "{{.*}}.obj" "-o" "{{.*}}.dxo" // DXV: "{{.*}}metal-shaderconverter{{(.exe)?}}" "{{.*}}.dxo" "-o" "{{.*}}.mtl" diff --git a/clang/test/Driver/dxc_D.hlsl b/clang/test/Driver/dxc_D.hlsl index f32a22c503327..f941974c61093 100644 --- a/clang/test/Driver/dxc_D.hlsl +++ b/clang/test/Driver/dxc_D.hlsl @@ -1,5 +1,4 @@ // RUN: %clang_dxc -DTEST=2 -Tlib_6_7 -### %s 2>&1 | FileCheck %s -// RUN: %clang_dxc -DTEST=2 -Tlib_6_7 %s -fcgl -Fo - | FileCheck %s --check-prefix=ERROR // Make sure -D send to cc1. // CHECK:"-D" "TEST=2" @@ -9,5 +8,3 @@ #elif TEST != 2 #error "TEST defined to wrong value" #endif - -// ERROR-NOT: error: diff --git a/clang/test/Driver/dxc_dxv_path.hlsl b/clang/test/Driver/dxc_dxv_path.hlsl index 55a07f34a648e..65e386f2f35ab 100644 --- a/clang/test/Driver/dxc_dxv_path.hlsl +++ b/clang/test/Driver/dxc_dxv_path.hlsl @@ -1,10 +1,10 @@ -// RUN: %clang_dxc -I test -Tlib_6_3 -### %s 2>&1 | FileCheck %s +// RUN: env PATH="" %clang_dxc -I test -Tlib_6_3 -Fo %T/a.dxo -### %s 2>&1 | FileCheck %s // Make sure report warning. 
// CHECK:dxv not found -// RUN: echo "dxv" > %T/dxv && chmod 754 %T/dxv && %clang_dxc --dxv-path=%T %s -Tlib_6_3 -### 2>&1 | FileCheck %s --check-prefix=DXV_PATH -// DXV_PATH:dxv{{(.exe)?}}" "-" "-o" "{{.*}}.dxo" +// RUN: echo "dxv" > %T/dxv && chmod 754 %T/dxv && %clang_dxc --dxv-path=%T %s -Tlib_6_3 -Fo %T/a.dxo -### 2>&1 | FileCheck %s --check-prefix=DXV_PATH +// DXV_PATH:dxv{{(.exe)?}}" "{{.*}}.obj" "-o" "{{.*}}/a.dxo" // RUN: %clang_dxc -I test -Vd -Tlib_6_3 -### %s 2>&1 | FileCheck %s --check-prefix=VD // VD:"-cc1"{{.*}}"-triple" "dxilv1.3-unknown-shadermodel6.3-library" diff --git a/clang/test/Driver/dxc_options.hlsl b/clang/test/Driver/dxc_options.hlsl index 09fdba1c3dd5f..5026b75e52688 100644 --- a/clang/test/Driver/dxc_options.hlsl +++ b/clang/test/Driver/dxc_options.hlsl @@ -4,5 +4,4 @@ // RUN: -fdiagnostics-color \ // RUN: -fno-diagnostics-color \ // RUN: -fdiagnostics-color=auto \ -// RUN: -Tlib_6_7 -Vd -fdriver-only -- %s 2>&1 |count 0 - +// RUN: -Tlib_6_7 -fdriver-only -- %s 2>&1 |count 0 diff --git a/clang/test/Driver/hlsl-lang-targets.hlsl b/clang/test/Driver/hlsl-lang-targets.hlsl index 7ce490a66df5f..b321b00548e9b 100644 --- a/clang/test/Driver/hlsl-lang-targets.hlsl +++ b/clang/test/Driver/hlsl-lang-targets.hlsl @@ -2,10 +2,10 @@ // Supported targets // -// RUN: %clang -target dxil--shadermodel6.2-pixel %s -S -o /dev/null 2>&1 | FileCheck --check-prefix=CHECK-VALID %s -// RUN: %clang -target dxil-unknown-shadermodel6.2-pixel %s -S -o /dev/null 2>&1 | FileCheck --check-prefix=CHECK-VALID %s -// RUN: %clang -target dxil--shadermodel6.2-library %s -S -o /dev/null 2>&1 | FileCheck --check-prefix=CHECK-VALID %s -// RUN: %clang -target dxil-unknown-shadermodel6.2-library %s -S -o /dev/null 2>&1 | FileCheck --check-prefix=CHECK-VALID %s +// RUN: %clang -target dxil--shadermodel6.2-pixel %s -S -o /dev/null 2>&1 | FileCheck --allow-empty --check-prefix=CHECK-VALID %s +// RUN: %clang -target dxil-unknown-shadermodel6.2-pixel %s -S -o /dev/null 2>&1 | 
FileCheck --allow-empty --check-prefix=CHECK-VALID %s +// RUN: %clang -target dxil--shadermodel6.2-library %s -S -o /dev/null 2>&1 | FileCheck --allow-empty --check-prefix=CHECK-VALID %s +// RUN: %clang -target dxil-unknown-shadermodel6.2-library %s -S -o /dev/null 2>&1 | FileCheck --allow-empty --check-prefix=CHECK-VALID %s // Empty shader model // From e19fcb72d7fbda6a1e67c45b85b399fe69d212ad Mon Sep 17 00:00:00 2001 From: Jonas Paulsson Date: Wed, 16 Apr 2025 12:25:10 -0600 Subject: [PATCH 159/710] Fix 'unannotated fall-through between switch labels' warning. (#136000) --- llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index fddb99d2f0b22..b596f6b8d6ce2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -5467,8 +5467,8 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { DAG.getIntPtrConstant(0, dl, /*isTarget=*/true))); break; - case ISD::STRICT_FMINIMUM: { - case ISD::STRICT_FMAXIMUM: + case ISD::STRICT_FMINIMUM: + case ISD::STRICT_FMAXIMUM: { SDValue InChain = Node->getOperand(0); SDVTList VTs = DAG.getVTList(NVT, MVT::Other); Tmp1 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, VTs, InChain, From 8a00efd26db21ef73df58b465b7741d1f889a681 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 16 Apr 2025 11:25:55 -0700 Subject: [PATCH 160/710] [SystemZ] Fix warnings This patch fixes: llvm/lib/Target/SystemZ/SystemZISelLowering.cpp:6916:7: error: unused variable 'RegVT' [-Werror,-Wunused-variable] llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp:1265:30: error: unused variable 'RC' [-Werror,-Wunused-variable] --- llvm/lib/Target/SystemZ/SystemZISelLowering.cpp | 1 + llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp | 1 + 2 files changed, 2 insertions(+) diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp 
b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index fdbfc196e8fbc..75cd5a319557d 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -6915,6 +6915,7 @@ SDValue SystemZTargetLowering::lowerLoadF16(SDValue Op, SelectionDAG &DAG) const { EVT RegVT = Op.getValueType(); assert(RegVT == MVT::f16 && "Expected to lower an f16 load."); + (void)RegVT; // Load as integer. SDLoc DL(Op); diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp index 1ae3994eb0e01..ae6ca55a36092 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -1267,6 +1267,7 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl( assert((Size * 8 == TRI->getRegSizeInBits(*RC) || (RC == &SystemZ::FP16BitRegClass && Size == 4 && !STI.hasVector())) && "Invalid size combination"); + (void)RC; if ((Opcode == SystemZ::AHI || Opcode == SystemZ::AGHI) && OpNum == 0 && isInt<8>(MI.getOperand(2).getImm())) { From e77ef7b291a0024ae34eaa76dafb62aef06d3c95 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Wed, 16 Apr 2025 11:29:34 -0700 Subject: [PATCH 161/710] [NFC][CFI] Dump test output to debug llvm-clang-win-x-aarch64 failure (#136002) --- clang/test/Driver/sanitizer-ld.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/clang/test/Driver/sanitizer-ld.c b/clang/test/Driver/sanitizer-ld.c index a00ec029d3d46..c04831171bba9 100644 --- a/clang/test/Driver/sanitizer-ld.c +++ b/clang/test/Driver/sanitizer-ld.c @@ -839,6 +839,14 @@ // CHECK-CFI-PREREQ-LINUX: '-fsanitize=cfi' only allowed with '-flto' // CHECK-CFI-PREREQ-LINUX: '-fsanitize=cfi' only allowed with '-fvisibility=' +// CFI by itself does not link runtime libraries. 
+// RUN: %clang -fsanitize=cfi \ +// RUN: -flto -fvisibility=hidden \ +// RUN: --target=x86_64-unknown-linux -fuse-ld=ld -rtlib=platform \ +// RUN: -resource-dir=%S/Inputs/resource_dir \ +// RUN: --sysroot=%S/Inputs/basic_linux_tree \ +// RUN: -### %s + // CFI by itself does not link runtime libraries. // RUN: %clang -fsanitize=cfi \ // RUN: -flto -fvisibility=hidden \ From 4aca20c8b6dcf86696db03d860e635112601a7f9 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 16 Apr 2025 11:42:56 -0700 Subject: [PATCH 162/710] [SLP]Pre-cache the last instruction for all entries before vectorization Need to pre-cache last instruction to avoid unexpected changes in the last instruction detection during the vectorization, caused by adding the new vector instructions, which add new uses and may affect the analysis. --- .../Transforms/Vectorize/SLPVectorizer.cpp | 8 +- .../entry-no-bundle-but-extra-use-on-vec.ll | 91 +++++++++++++++++++ 2 files changed, 98 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 83252bdb51ea2..41ad43012d337 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -18414,8 +18414,14 @@ Value *BoUpSLP::vectorizeTree( // need to rebuild it. EntryToLastInstruction.clear(); // All blocks must be scheduled before any instructions are inserted. - for (auto &BSIter : BlocksSchedules) { + for (auto &BSIter : BlocksSchedules) scheduleBlock(BSIter.second.get()); + // Cache last instructions for the nodes to avoid side effects, which may + // appear during vectorization, like extra uses, etc. 
+ for (const std::unique_ptr &TE : VectorizableTree) { + if (TE->isGather()) + continue; + (void)getLastInstructionInBundle(TE.get()); } if (ReductionRoot) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll b/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll new file mode 100644 index 0000000000000..9d48e7f8a787a --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll @@ -0,0 +1,91 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-generic-linux-gnu < %s | FileCheck %s + +define void @test(ptr %nExp, float %0, i1 %cmp, float %1) { +; CHECK-LABEL: define void @test( +; CHECK-SAME: ptr [[NEXP:%.*]], float [[TMP0:%.*]], i1 [[CMP:%.*]], float [[TMP1:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> , float [[TMP1]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP0]], i32 3 +; CHECK-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[NEXP]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x float> [[TMP6]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x float> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> , float [[TMP1]], i32 3 +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x i32> +; CHECK-NEXT: br label %[[IF_END]] +; CHECK: [[IF_END]]: +; CHECK-NEXT: [[TMP12:%.*]] = phi <4 x float> [ [[TMP11]], %[[IF_THEN]] ], [ [[TMP3]], 
%[[ENTRY]] ] +; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x float> [ [[TMP8]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = phi <2 x float> [ zeroinitializer, %[[IF_THEN]] ], [ , %[[ENTRY]] ] +; CHECK-NEXT: [[TMP15:%.*]] = phi <2 x float> [ [[TMP7]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ] +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x float> [[TMP14]], <2 x float> , <2 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = fmul <2 x float> [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = fmul <2 x float> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP19:%.*]] = fmul <4 x float> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[CALL25:%.*]] = load volatile ptr, ptr null, align 8 +; CHECK-NEXT: [[TMP20:%.*]] = fadd <2 x float> [[TMP18]], [[TMP17]] +; CHECK-NEXT: [[TMP21:%.*]] = fmul <2 x float> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = fadd <2 x float> [[TMP21]], zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = fmul <4 x float> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = fadd <4 x float> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x float> [[TMP23]], <4 x float> [[TMP24]], <4 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> , <2 x float> [[TMP22]], i64 2) +; CHECK-NEXT: [[TMP27:%.*]] = fadd <4 x float> [[TMP25]], [[TMP26]] +; CHECK-NEXT: store <4 x float> [[TMP27]], ptr [[CALL25]], align 4 +; CHECK-NEXT: ret void +; +entry: + br i1 %cmp, label %if.then, label %if.end + +if.then: + %div.i41 = fmul float %0, 0.000000e+00 + %2 = load float, ptr %nExp, align 4 + %div.1.i.i = fmul float %2, 0.000000e+00 + %div.2.i.i = fmul float %0, 0.000000e+00 + br label %if.end + +if.end: + %3 = phi float [ %1, %if.then ], [ %0, %entry ] + %4 = phi float [ 0.000000e+00, %if.then ], [ %1, %entry ] + %5 = phi float [ 0.000000e+00, %if.then ], [ 0x7FF8000000000000, %entry ] + %6 = phi float [ 0.000000e+00, %if.then ], [ 1.000000e+00, %entry ] + %fa.sroa.9.0 = phi float 
[ %div.2.i.i, %if.then ], [ 0.000000e+00, %entry ] + %fa.sroa.7.0 = phi float [ %div.1.i.i, %if.then ], [ 0.000000e+00, %entry ] + %fa.sroa.0.0 = phi float [ %div.i41, %if.then ], [ 0.000000e+00, %entry ] + %mul.1.i.i58 = fmul float %fa.sroa.7.0, %6 + %mul.2.i.i60 = fmul float %fa.sroa.9.0, %6 + %mul.1.i.i.i63 = fmul float %fa.sroa.0.0, %5 + %mul.2.i.i.i65 = fmul float %fa.sroa.0.0, 0.000000e+00 + %mul.i66 = fmul float %fa.sroa.0.0, 0.000000e+00 + %add.1.i.i = fadd float %mul.1.i.i58, %mul.1.i.i.i63 + %add.2.i.i = fadd float %mul.2.i.i60, %mul.2.i.i.i65 + %mul.1.i.i74 = fmul float %add.1.i.i, 0.000000e+00 + %mul.2.i.i76 = fmul float %add.2.i.i, 0.000000e+00 + %mul.i.i.i78 = fmul float %mul.i66, 0.000000e+00 + %add.1.i.i85 = fadd float %mul.1.i.i74, 0.000000e+00 + %add.2.i.i86 = fadd float %mul.2.i.i76, 0.000000e+00 + %mul.i.i.i97 = fmul float %5, 0.000000e+00 + %mul.1.i.i.i99 = fmul float %4, 0.000000e+00 + %mul.2.i.i.i101 = fmul float %3, 0.000000e+00 + %add.i.i103 = fadd float %mul.i.i.i97, 0.000000e+00 + %add.1.i.i104 = fadd float %mul.1.i.i.i99, 0.000000e+00 + %add.2.i.i105 = fadd float %mul.2.i.i.i101, 0.000000e+00 + %add = fadd float %mul.i.i.i78, 0.000000e+00 + %add.i = fadd float %add.i.i103, 1.000000e+00 + %add.1.i = fadd float %add.1.i.i104, %add.1.i.i85 + %add.2.i = fadd float %add.2.i.i105, %add.2.i.i86 + %call25 = load volatile ptr, ptr null, align 8 + store float %add, ptr %call25, align 4 + %__trans_tmp_29.sroa.5.0.call25.sroa_idx = getelementptr i8, ptr %call25, i64 4 + store float %add.i, ptr %__trans_tmp_29.sroa.5.0.call25.sroa_idx, align 4 + %__trans_tmp_29.sroa.6.0.call25.sroa_idx = getelementptr i8, ptr %call25, i64 8 + store float %add.1.i, ptr %__trans_tmp_29.sroa.6.0.call25.sroa_idx, align 4 + %__trans_tmp_29.sroa.7.0.call25.sroa_idx = getelementptr i8, ptr %call25, i64 12 + store float %add.2.i, ptr %__trans_tmp_29.sroa.7.0.call25.sroa_idx, align 4 + ret void +} From 4903a7b77b56c7d9a650205b6e7dca46581c7134 Mon Sep 17 00:00:00 2001 From: 
Mircea Trofin Date: Wed, 16 Apr 2025 12:10:08 -0700 Subject: [PATCH 163/710] [ctxprof][nfc] Move profile annotator to Analysis (#135871) This moves the utility that propagates counter values such that we can reuse it elsewhere. Specifically, in a subsequent patch, it'll be used to guide ICP: we need to prioritize promoting indirect calls that dominate larger portions of the dynamic instruction count. We can compare them based on the dynamic count of IR instructions, and we can get that early with this counter propagation logic. The patch is mostly a move of the existing logic, with a pimpl - style implementation to hide all the current complexity. --- llvm/include/llvm/Analysis/CtxProfAnalysis.h | 28 ++ llvm/lib/Analysis/CtxProfAnalysis.cpp | 372 +++++++++++++++++ .../Instrumentation/PGOCtxProfFlattening.cpp | 375 ++---------------- 3 files changed, 424 insertions(+), 351 deletions(-) diff --git a/llvm/include/llvm/Analysis/CtxProfAnalysis.h b/llvm/include/llvm/Analysis/CtxProfAnalysis.h index 6f1c3696ca78c..aa582cfef1ad1 100644 --- a/llvm/include/llvm/Analysis/CtxProfAnalysis.h +++ b/llvm/include/llvm/Analysis/CtxProfAnalysis.h @@ -157,6 +157,34 @@ class CtxProfAnalysisPrinterPass const PrintMode Mode; }; +/// Utility that propagates counter values to each basic block and to each edge +/// when a basic block has more than one outgoing edge, using an adaptation of +/// PGOUseFunc::populateCounters. +// FIXME(mtrofin): look into factoring the code to share one implementation. +class ProfileAnnotatorImpl; +class ProfileAnnotator { + std::unique_ptr PImpl; + +public: + ProfileAnnotator(const Function &F, ArrayRef RawCounters); + uint64_t getBBCount(const BasicBlock &BB) const; + + // Finds the true and false counts for the given select instruction. Returns + // false if the select doesn't have instrumentation or if the count of the + // parent BB is 0. 
+ bool getSelectInstrProfile(SelectInst &SI, uint64_t &TrueCount, + uint64_t &FalseCount) const; + // Clears Profile and populates it with the edge weights, in the same order as + // they need to appear in the MD_prof metadata. Also computes the max of those + // weights and returns it in MaxCount. Returns false if: + // - the BB has less than 2 successors + // - the counts are 0 + bool getOutgoingBranchWeights(BasicBlock &BB, + SmallVectorImpl &Profile, + uint64_t &MaxCount) const; + ~ProfileAnnotator(); +}; + /// Assign a GUID to functions as metadata. GUID calculation takes linkage into /// account, which may change especially through and after thinlto. By /// pre-computing and assigning as metadata, this mechanism is resilient to such diff --git a/llvm/lib/Analysis/CtxProfAnalysis.cpp b/llvm/lib/Analysis/CtxProfAnalysis.cpp index d203e277546ea..391631e15aa89 100644 --- a/llvm/lib/Analysis/CtxProfAnalysis.cpp +++ b/llvm/lib/Analysis/CtxProfAnalysis.cpp @@ -14,7 +14,9 @@ #include "llvm/Analysis/CtxProfAnalysis.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/Analysis/CFG.h" #include "llvm/IR/Analysis.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" @@ -22,6 +24,8 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" +#include +#include #define DEBUG_TYPE "ctx_prof" @@ -46,6 +50,374 @@ static cl::opt ForceIsInSpecializedModule( const char *AssignGUIDPass::GUIDMetadataName = "guid"; +namespace llvm { +class ProfileAnnotatorImpl final { + friend class ProfileAnnotator; + class BBInfo; + struct EdgeInfo { + BBInfo *const Src; + BBInfo *const Dest; + std::optional Count; + + explicit EdgeInfo(BBInfo &Src, BBInfo &Dest) : Src(&Src), Dest(&Dest) {} + }; + + class BBInfo { + std::optional Count; + // OutEdges is dimensioned to match the number of terminator operands. 
+ // Entries in the vector match the index in the terminator operand list. In + // some cases - see `shouldExcludeEdge` and its implementation - an entry + // will be nullptr. + // InEdges doesn't have the above constraint. + SmallVector OutEdges; + SmallVector InEdges; + size_t UnknownCountOutEdges = 0; + size_t UnknownCountInEdges = 0; + + // Pass AssumeAllKnown when we try to propagate counts from edges to BBs - + // because all the edge counters must be known. + // Return std::nullopt if there were no edges to sum. The user can decide + // how to interpret that. + std::optional getEdgeSum(const SmallVector &Edges, + bool AssumeAllKnown) const { + std::optional Sum; + for (const auto *E : Edges) { + // `Edges` may be `OutEdges`, case in which `E` could be nullptr. + if (E) { + if (!Sum.has_value()) + Sum = 0; + *Sum += (AssumeAllKnown ? *E->Count : E->Count.value_or(0U)); + } + } + return Sum; + } + + bool computeCountFrom(const SmallVector &Edges) { + assert(!Count.has_value()); + Count = getEdgeSum(Edges, true); + return Count.has_value(); + } + + void setSingleUnknownEdgeCount(SmallVector &Edges) { + uint64_t KnownSum = getEdgeSum(Edges, false).value_or(0U); + uint64_t EdgeVal = *Count > KnownSum ? *Count - KnownSum : 0U; + EdgeInfo *E = nullptr; + for (auto *I : Edges) + if (I && !I->Count.has_value()) { + E = I; +#ifdef NDEBUG + break; +#else + assert((!E || E == I) && + "Expected exactly one edge to have an unknown count, " + "found a second one"); + continue; +#endif + } + assert(E && "Expected exactly one edge to have an unknown count"); + assert(!E->Count.has_value()); + E->Count = EdgeVal; + assert(E->Src->UnknownCountOutEdges > 0); + assert(E->Dest->UnknownCountInEdges > 0); + --E->Src->UnknownCountOutEdges; + --E->Dest->UnknownCountInEdges; + } + + public: + BBInfo(size_t NumInEdges, size_t NumOutEdges, std::optional Count) + : Count(Count) { + // For in edges, we just want to pre-allocate enough space, since we know + // it at this stage. 
For out edges, we will insert edges at the indices + // corresponding to positions in this BB's terminator instruction, so we + // construct a default (nullptr values)-initialized vector. A nullptr edge + // corresponds to those that are excluded (see shouldExcludeEdge). + InEdges.reserve(NumInEdges); + OutEdges.resize(NumOutEdges); + } + + bool tryTakeCountFromKnownOutEdges(const BasicBlock &BB) { + if (!UnknownCountOutEdges) { + return computeCountFrom(OutEdges); + } + return false; + } + + bool tryTakeCountFromKnownInEdges(const BasicBlock &BB) { + if (!UnknownCountInEdges) { + return computeCountFrom(InEdges); + } + return false; + } + + void addInEdge(EdgeInfo &Info) { + InEdges.push_back(&Info); + ++UnknownCountInEdges; + } + + // For the out edges, we care about the position we place them in, which is + // the position in terminator instruction's list (at construction). Later, + // we build branch_weights metadata with edge frequency values matching + // these positions. + void addOutEdge(size_t Index, EdgeInfo &Info) { + OutEdges[Index] = &Info; + ++UnknownCountOutEdges; + } + + bool hasCount() const { return Count.has_value(); } + + uint64_t getCount() const { return *Count; } + + bool trySetSingleUnknownInEdgeCount() { + if (UnknownCountInEdges == 1) { + setSingleUnknownEdgeCount(InEdges); + return true; + } + return false; + } + + bool trySetSingleUnknownOutEdgeCount() { + if (UnknownCountOutEdges == 1) { + setSingleUnknownEdgeCount(OutEdges); + return true; + } + return false; + } + size_t getNumOutEdges() const { return OutEdges.size(); } + + uint64_t getEdgeCount(size_t Index) const { + if (auto *E = OutEdges[Index]) + return *E->Count; + return 0U; + } + }; + + const Function &F; + ArrayRef Counters; + // To be accessed through getBBInfo() after construction. + std::map BBInfos; + std::vector EdgeInfos; + + // The only criteria for exclusion is faux suspend -> exit edges in presplit + // coroutines. The API serves for readability, currently. 
+ bool shouldExcludeEdge(const BasicBlock &Src, const BasicBlock &Dest) const { + return llvm::isPresplitCoroSuspendExitEdge(Src, Dest); + } + + BBInfo &getBBInfo(const BasicBlock &BB) { return BBInfos.find(&BB)->second; } + + const BBInfo &getBBInfo(const BasicBlock &BB) const { + return BBInfos.find(&BB)->second; + } + + // validation function after we propagate the counters: all BBs and edges' + // counters must have a value. + bool allCountersAreAssigned() const { + for (const auto &BBInfo : BBInfos) + if (!BBInfo.second.hasCount()) + return false; + for (const auto &EdgeInfo : EdgeInfos) + if (!EdgeInfo.Count.has_value()) + return false; + return true; + } + + /// Check that all paths from the entry basic block that use edges with + /// non-zero counts arrive at a basic block with no successors (i.e. "exit") + bool allTakenPathsExit() const { + std::deque Worklist; + DenseSet Visited; + Worklist.push_back(&F.getEntryBlock()); + bool HitExit = false; + while (!Worklist.empty()) { + const auto *BB = Worklist.front(); + Worklist.pop_front(); + if (!Visited.insert(BB).second) + continue; + if (succ_size(BB) == 0) { + if (isa(BB->getTerminator())) + return false; + HitExit = true; + continue; + } + if (succ_size(BB) == 1) { + Worklist.push_back(BB->getUniqueSuccessor()); + continue; + } + const auto &BBInfo = getBBInfo(*BB); + bool HasAWayOut = false; + for (auto I = 0U; I < BB->getTerminator()->getNumSuccessors(); ++I) { + const auto *Succ = BB->getTerminator()->getSuccessor(I); + if (!shouldExcludeEdge(*BB, *Succ)) { + if (BBInfo.getEdgeCount(I) > 0) { + HasAWayOut = true; + Worklist.push_back(Succ); + } + } + } + if (!HasAWayOut) + return false; + } + return HitExit; + } + + bool allNonColdSelectsHaveProfile() const { + for (const auto &BB : F) { + if (getBBInfo(BB).getCount() > 0) { + for (const auto &I : BB) { + if (const auto *SI = dyn_cast(&I)) { + if (const auto *Inst = CtxProfAnalysis::getSelectInstrumentation( + *const_cast(SI))) { + auto Index = 
Inst->getIndex()->getZExtValue(); + assert(Index < Counters.size()); + if (Counters[Index] == 0) + return false; + } + } + } + } + } + return true; + } + + // This is an adaptation of PGOUseFunc::populateCounters. + // FIXME(mtrofin): look into factoring the code to share one implementation. + void propagateCounterValues() { + bool KeepGoing = true; + while (KeepGoing) { + KeepGoing = false; + for (const auto &BB : F) { + auto &Info = getBBInfo(BB); + if (!Info.hasCount()) + KeepGoing |= Info.tryTakeCountFromKnownOutEdges(BB) || + Info.tryTakeCountFromKnownInEdges(BB); + if (Info.hasCount()) { + KeepGoing |= Info.trySetSingleUnknownOutEdgeCount(); + KeepGoing |= Info.trySetSingleUnknownInEdgeCount(); + } + } + } + assert(allCountersAreAssigned() && + "[ctx-prof] Expected all counters have been assigned."); + assert(allTakenPathsExit() && + "[ctx-prof] Encountered a BB with more than one successor, where " + "all outgoing edges have a 0 count. This occurs in non-exiting " + "functions (message pumps, usually) which are not supported in the " + "contextual profiling case"); + assert(allNonColdSelectsHaveProfile() && + "[ctx-prof] All non-cold select instructions were expected to have " + "a profile."); + } + +public: + ProfileAnnotatorImpl(const Function &F, ArrayRef Counters) + : F(F), Counters(Counters) { + assert(!F.isDeclaration()); + assert(!Counters.empty()); + size_t NrEdges = 0; + for (const auto &BB : F) { + std::optional Count; + if (auto *Ins = CtxProfAnalysis::getBBInstrumentation( + const_cast(BB))) { + auto Index = Ins->getIndex()->getZExtValue(); + assert(Index < Counters.size() && + "The index must be inside the counters vector by construction - " + "tripping this assertion indicates a bug in how the contextual " + "profile is managed by IPO transforms"); + (void)Index; + Count = Counters[Ins->getIndex()->getZExtValue()]; + } else if (isa(BB.getTerminator())) { + // The program presumably didn't crash. 
+ Count = 0; + } + auto [It, Ins] = + BBInfos.insert({&BB, {pred_size(&BB), succ_size(&BB), Count}}); + (void)Ins; + assert(Ins && "We iterate through the function's BBs, no reason to " + "insert one more than once"); + NrEdges += llvm::count_if(successors(&BB), [&](const auto *Succ) { + return !shouldExcludeEdge(BB, *Succ); + }); + } + // Pre-allocate the vector, we want references to its contents to be stable. + EdgeInfos.reserve(NrEdges); + for (const auto &BB : F) { + auto &Info = getBBInfo(BB); + for (auto I = 0U; I < BB.getTerminator()->getNumSuccessors(); ++I) { + const auto *Succ = BB.getTerminator()->getSuccessor(I); + if (!shouldExcludeEdge(BB, *Succ)) { + auto &EI = EdgeInfos.emplace_back(getBBInfo(BB), getBBInfo(*Succ)); + Info.addOutEdge(I, EI); + getBBInfo(*Succ).addInEdge(EI); + } + } + } + assert(EdgeInfos.capacity() == NrEdges && + "The capacity of EdgeInfos should have stayed unchanged it was " + "populated, because we need pointers to its contents to be stable"); + propagateCounterValues(); + } + + uint64_t getBBCount(const BasicBlock &BB) { return getBBInfo(BB).getCount(); } +}; + +} // namespace llvm + +ProfileAnnotator::ProfileAnnotator(const Function &F, + ArrayRef RawCounters) + : PImpl(std::make_unique(F, RawCounters)) {} + +ProfileAnnotator::~ProfileAnnotator() = default; + +uint64_t ProfileAnnotator::getBBCount(const BasicBlock &BB) const { + return PImpl->getBBCount(BB); +} + +bool ProfileAnnotator::getSelectInstrProfile(SelectInst &SI, + uint64_t &TrueCount, + uint64_t &FalseCount) const { + const auto &BBInfo = PImpl->getBBInfo(*SI.getParent()); + TrueCount = FalseCount = 0; + if (BBInfo.getCount() == 0) + return false; + + auto *Step = CtxProfAnalysis::getSelectInstrumentation(SI); + if (!Step) + return false; + auto Index = Step->getIndex()->getZExtValue(); + assert(Index < PImpl->Counters.size() && + "The index of the step instruction must be inside the " + "counters vector by " + "construction - tripping this assertion indicates a 
bug in " + "how the contextual profile is managed by IPO transforms"); + auto TotalCount = BBInfo.getCount(); + TrueCount = PImpl->Counters[Index]; + FalseCount = (TotalCount > TrueCount ? TotalCount - TrueCount : 0U); + return true; +} + +bool ProfileAnnotator::getOutgoingBranchWeights( + BasicBlock &BB, SmallVectorImpl &Profile, + uint64_t &MaxCount) const { + Profile.clear(); + + if (succ_size(&BB) < 2) + return false; + + auto *Term = BB.getTerminator(); + Profile.resize(Term->getNumSuccessors()); + + const auto &BBInfo = PImpl->getBBInfo(BB); + MaxCount = 0; + for (unsigned SuccIdx = 0, Size = BBInfo.getNumOutEdges(); SuccIdx < Size; + ++SuccIdx) { + uint64_t EdgeCount = BBInfo.getEdgeCount(SuccIdx); + if (EdgeCount > MaxCount) + MaxCount = EdgeCount; + Profile[SuccIdx] = EdgeCount; + } + return MaxCount > 0; +} + PreservedAnalyses AssignGUIDPass::run(Module &M, ModuleAnalysisManager &MAM) { for (auto &F : M.functions()) { if (F.isDeclaration()) diff --git a/llvm/lib/Transforms/Instrumentation/PGOCtxProfFlattening.cpp b/llvm/lib/Transforms/Instrumentation/PGOCtxProfFlattening.cpp index 508a41684ed20..e47c9ab75ffe1 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOCtxProfFlattening.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOCtxProfFlattening.cpp @@ -45,358 +45,33 @@ using namespace llvm; namespace { -class ProfileAnnotator final { - class BBInfo; - struct EdgeInfo { - BBInfo *const Src; - BBInfo *const Dest; - std::optional Count; +/// Assign branch weights and function entry count. Also update the PSI +/// builder. +void assignProfileData(Function &F, ArrayRef RawCounters) { + assert(!RawCounters.empty()); + ProfileAnnotator PA(F, RawCounters); - explicit EdgeInfo(BBInfo &Src, BBInfo &Dest) : Src(&Src), Dest(&Dest) {} - }; + F.setEntryCount(RawCounters[0]); + SmallVector ProfileHolder; - class BBInfo { - std::optional Count; - // OutEdges is dimensioned to match the number of terminator operands. 
- // Entries in the vector match the index in the terminator operand list. In - // some cases - see `shouldExcludeEdge` and its implementation - an entry - // will be nullptr. - // InEdges doesn't have the above constraint. - SmallVector OutEdges; - SmallVector InEdges; - size_t UnknownCountOutEdges = 0; - size_t UnknownCountInEdges = 0; - - // Pass AssumeAllKnown when we try to propagate counts from edges to BBs - - // because all the edge counters must be known. - // Return std::nullopt if there were no edges to sum. The user can decide - // how to interpret that. - std::optional getEdgeSum(const SmallVector &Edges, - bool AssumeAllKnown) const { - std::optional Sum; - for (const auto *E : Edges) { - // `Edges` may be `OutEdges`, case in which `E` could be nullptr. - if (E) { - if (!Sum.has_value()) - Sum = 0; - *Sum += (AssumeAllKnown ? *E->Count : E->Count.value_or(0U)); - } - } - return Sum; - } - - bool computeCountFrom(const SmallVector &Edges) { - assert(!Count.has_value()); - Count = getEdgeSum(Edges, true); - return Count.has_value(); - } - - void setSingleUnknownEdgeCount(SmallVector &Edges) { - uint64_t KnownSum = getEdgeSum(Edges, false).value_or(0U); - uint64_t EdgeVal = *Count > KnownSum ? *Count - KnownSum : 0U; - EdgeInfo *E = nullptr; - for (auto *I : Edges) - if (I && !I->Count.has_value()) { - E = I; -#ifdef NDEBUG - break; -#else - assert((!E || E == I) && - "Expected exactly one edge to have an unknown count, " - "found a second one"); - continue; -#endif - } - assert(E && "Expected exactly one edge to have an unknown count"); - assert(!E->Count.has_value()); - E->Count = EdgeVal; - assert(E->Src->UnknownCountOutEdges > 0); - assert(E->Dest->UnknownCountInEdges > 0); - --E->Src->UnknownCountOutEdges; - --E->Dest->UnknownCountInEdges; - } - - public: - BBInfo(size_t NumInEdges, size_t NumOutEdges, std::optional Count) - : Count(Count) { - // For in edges, we just want to pre-allocate enough space, since we know - // it at this stage. 
For out edges, we will insert edges at the indices - // corresponding to positions in this BB's terminator instruction, so we - // construct a default (nullptr values)-initialized vector. A nullptr edge - // corresponds to those that are excluded (see shouldExcludeEdge). - InEdges.reserve(NumInEdges); - OutEdges.resize(NumOutEdges); - } - - bool tryTakeCountFromKnownOutEdges(const BasicBlock &BB) { - if (!UnknownCountOutEdges) { - return computeCountFrom(OutEdges); - } - return false; - } - - bool tryTakeCountFromKnownInEdges(const BasicBlock &BB) { - if (!UnknownCountInEdges) { - return computeCountFrom(InEdges); - } - return false; - } - - void addInEdge(EdgeInfo &Info) { - InEdges.push_back(&Info); - ++UnknownCountInEdges; - } - - // For the out edges, we care about the position we place them in, which is - // the position in terminator instruction's list (at construction). Later, - // we build branch_weights metadata with edge frequency values matching - // these positions. - void addOutEdge(size_t Index, EdgeInfo &Info) { - OutEdges[Index] = &Info; - ++UnknownCountOutEdges; - } - - bool hasCount() const { return Count.has_value(); } - - uint64_t getCount() const { return *Count; } - - bool trySetSingleUnknownInEdgeCount() { - if (UnknownCountInEdges == 1) { - setSingleUnknownEdgeCount(InEdges); - return true; - } - return false; - } - - bool trySetSingleUnknownOutEdgeCount() { - if (UnknownCountOutEdges == 1) { - setSingleUnknownEdgeCount(OutEdges); - return true; - } - return false; - } - size_t getNumOutEdges() const { return OutEdges.size(); } - - uint64_t getEdgeCount(size_t Index) const { - if (auto *E = OutEdges[Index]) - return *E->Count; - return 0U; - } - }; - - Function &F; - const SmallVectorImpl &Counters; - // To be accessed through getBBInfo() after construction. - std::map BBInfos; - std::vector EdgeInfos; - - // This is an adaptation of PGOUseFunc::populateCounters. - // FIXME(mtrofin): look into factoring the code to share one implementation. 
- void propagateCounterValues(const SmallVectorImpl &Counters) { - bool KeepGoing = true; - while (KeepGoing) { - KeepGoing = false; - for (const auto &BB : F) { - auto &Info = getBBInfo(BB); - if (!Info.hasCount()) - KeepGoing |= Info.tryTakeCountFromKnownOutEdges(BB) || - Info.tryTakeCountFromKnownInEdges(BB); - if (Info.hasCount()) { - KeepGoing |= Info.trySetSingleUnknownOutEdgeCount(); - KeepGoing |= Info.trySetSingleUnknownInEdgeCount(); - } - } - } - } - // The only criteria for exclusion is faux suspend -> exit edges in presplit - // coroutines. The API serves for readability, currently. - bool shouldExcludeEdge(const BasicBlock &Src, const BasicBlock &Dest) const { - return llvm::isPresplitCoroSuspendExitEdge(Src, Dest); - } - - BBInfo &getBBInfo(const BasicBlock &BB) { return BBInfos.find(&BB)->second; } - - const BBInfo &getBBInfo(const BasicBlock &BB) const { - return BBInfos.find(&BB)->second; - } - - // validation function after we propagate the counters: all BBs and edges' - // counters must have a value. - bool allCountersAreAssigned() const { - for (const auto &BBInfo : BBInfos) - if (!BBInfo.second.hasCount()) - return false; - for (const auto &EdgeInfo : EdgeInfos) - if (!EdgeInfo.Count.has_value()) - return false; - return true; - } - - /// Check that all paths from the entry basic block that use edges with - /// non-zero counts arrive at a basic block with no successors (i.e. 
"exit") - bool allTakenPathsExit() const { - std::deque Worklist; - DenseSet Visited; - Worklist.push_back(&F.getEntryBlock()); - bool HitExit = false; - while (!Worklist.empty()) { - const auto *BB = Worklist.front(); - Worklist.pop_front(); - if (!Visited.insert(BB).second) - continue; - if (succ_size(BB) == 0) { - if (isa(BB->getTerminator())) - return false; - HitExit = true; - continue; - } - if (succ_size(BB) == 1) { - Worklist.push_back(BB->getUniqueSuccessor()); - continue; - } - const auto &BBInfo = getBBInfo(*BB); - bool HasAWayOut = false; - for (auto I = 0U; I < BB->getTerminator()->getNumSuccessors(); ++I) { - const auto *Succ = BB->getTerminator()->getSuccessor(I); - if (!shouldExcludeEdge(*BB, *Succ)) { - if (BBInfo.getEdgeCount(I) > 0) { - HasAWayOut = true; - Worklist.push_back(Succ); - } - } - } - if (!HasAWayOut) - return false; - } - return HitExit; - } - - bool allNonColdSelectsHaveProfile() const { - for (const auto &BB : F) { - if (getBBInfo(BB).getCount() > 0) { - for (const auto &I : BB) { - if (const auto *SI = dyn_cast(&I)) { - if (!SI->getMetadata(LLVMContext::MD_prof)) { - return false; - } - } - } - } - } - return true; - } - -public: - ProfileAnnotator(Function &F, const SmallVectorImpl &Counters) - : F(F), Counters(Counters) { - assert(!F.isDeclaration()); - assert(!Counters.empty()); - size_t NrEdges = 0; - for (const auto &BB : F) { - std::optional Count; - if (auto *Ins = CtxProfAnalysis::getBBInstrumentation( - const_cast(BB))) { - auto Index = Ins->getIndex()->getZExtValue(); - assert(Index < Counters.size() && - "The index must be inside the counters vector by construction - " - "tripping this assertion indicates a bug in how the contextual " - "profile is managed by IPO transforms"); - (void)Index; - Count = Counters[Ins->getIndex()->getZExtValue()]; - } else if (isa(BB.getTerminator())) { - // The program presumably didn't crash. 
- Count = 0; - } - auto [It, Ins] = - BBInfos.insert({&BB, {pred_size(&BB), succ_size(&BB), Count}}); - (void)Ins; - assert(Ins && "We iterate through the function's BBs, no reason to " - "insert one more than once"); - NrEdges += llvm::count_if(successors(&BB), [&](const auto *Succ) { - return !shouldExcludeEdge(BB, *Succ); - }); - } - // Pre-allocate the vector, we want references to its contents to be stable. - EdgeInfos.reserve(NrEdges); - for (const auto &BB : F) { - auto &Info = getBBInfo(BB); - for (auto I = 0U; I < BB.getTerminator()->getNumSuccessors(); ++I) { - const auto *Succ = BB.getTerminator()->getSuccessor(I); - if (!shouldExcludeEdge(BB, *Succ)) { - auto &EI = EdgeInfos.emplace_back(getBBInfo(BB), getBBInfo(*Succ)); - Info.addOutEdge(I, EI); - getBBInfo(*Succ).addInEdge(EI); - } - } - } - assert(EdgeInfos.capacity() == NrEdges && - "The capacity of EdgeInfos should have stayed unchanged it was " - "populated, because we need pointers to its contents to be stable"); - } - - void setProfileForSelectInstructions(BasicBlock &BB, const BBInfo &BBInfo) { - if (BBInfo.getCount() == 0) - return; - - for (auto &I : BB) { + for (auto &BB : F) { + for (auto &I : BB) if (auto *SI = dyn_cast(&I)) { - if (auto *Step = CtxProfAnalysis::getSelectInstrumentation(*SI)) { - auto Index = Step->getIndex()->getZExtValue(); - assert(Index < Counters.size() && - "The index of the step instruction must be inside the " - "counters vector by " - "construction - tripping this assertion indicates a bug in " - "how the contextual profile is managed by IPO transforms"); - auto TotalCount = BBInfo.getCount(); - auto TrueCount = Counters[Index]; - auto FalseCount = - (TotalCount > TrueCount ? TotalCount - TrueCount : 0U); - setProfMetadata(F.getParent(), SI, {TrueCount, FalseCount}, - std::max(TrueCount, FalseCount)); - } - } - } - } - - /// Assign branch weights and function entry count. Also update the PSI - /// builder. 
- void assignProfileData() { - assert(!Counters.empty()); - propagateCounterValues(Counters); - F.setEntryCount(Counters[0]); - - for (auto &BB : F) { - const auto &BBInfo = getBBInfo(BB); - setProfileForSelectInstructions(BB, BBInfo); - if (succ_size(&BB) < 2) - continue; - auto *Term = BB.getTerminator(); - SmallVector EdgeCounts(Term->getNumSuccessors(), 0); - uint64_t MaxCount = 0; - - for (unsigned SuccIdx = 0, Size = BBInfo.getNumOutEdges(); SuccIdx < Size; - ++SuccIdx) { - uint64_t EdgeCount = BBInfo.getEdgeCount(SuccIdx); - if (EdgeCount > MaxCount) - MaxCount = EdgeCount; - EdgeCounts[SuccIdx] = EdgeCount; + uint64_t TrueCount, FalseCount = 0; + if (!PA.getSelectInstrProfile(*SI, TrueCount, FalseCount)) + continue; + setProfMetadata(F.getParent(), SI, {TrueCount, FalseCount}, + std::max(TrueCount, FalseCount)); } - - if (MaxCount != 0) - setProfMetadata(F.getParent(), Term, EdgeCounts, MaxCount); - } - assert(allCountersAreAssigned() && - "[ctx-prof] Expected all counters have been assigned."); - assert(allTakenPathsExit() && - "[ctx-prof] Encountered a BB with more than one successor, where " - "all outgoing edges have a 0 count. This occurs in non-exiting " - "functions (message pumps, usually) which are not supported in the " - "contextual profiling case"); - assert(allNonColdSelectsHaveProfile() && - "[ctx-prof] All non-cold select instructions were expected to have " - "a profile."); + if (succ_size(&BB) < 2) + continue; + uint64_t MaxCount = 0; + if (!PA.getOutgoingBranchWeights(BB, ProfileHolder, MaxCount)) + continue; + assert(MaxCount > 0); + setProfMetadata(F.getParent(), BB.getTerminator(), ProfileHolder, MaxCount); } -}; +} [[maybe_unused]] bool areAllBBsReachable(const Function &F, FunctionAnalysisManager &FAM) { @@ -510,10 +185,8 @@ PreservedAnalyses PGOCtxProfFlatteningPass::run(Module &M, // If this function didn't appear in the contextual profile, it's cold. 
if (It == FlattenedProfile.end()) clearColdFunctionProfile(F); - else { - ProfileAnnotator S(F, It->second); - S.assignProfileData(); - } + else + assignProfileData(F, It->second); } InstrProfSummaryBuilder PB(ProfileSummaryBuilder::DefaultCutoffs); // use here the flat profiles just so the importer doesn't complain about From fe94f11407453c2d166597ef6e58d31f5b27d46e Mon Sep 17 00:00:00 2001 From: Matheus Izvekov Date: Wed, 16 Apr 2025 16:27:24 -0300 Subject: [PATCH 164/710] [clang] Fix elaborated keyword canonicalization (#135916) --- .../modernize/UseConstraintsCheck.cpp | 8 ++- clang/docs/ReleaseNotes.rst | 4 ++ clang/include/clang/AST/Type.h | 14 +++++ clang/lib/AST/ASTContext.cpp | 49 +++++++++++---- clang/lib/AST/Type.cpp | 8 +++ clang/lib/AST/TypeLoc.cpp | 44 ++++++++----- clang/lib/Sema/SemaDecl.cpp | 6 +- clang/lib/Sema/SemaDeclCXX.cpp | 60 +++++++++++------- clang/lib/Sema/SemaInit.cpp | 7 ++- clang/lib/Sema/SemaTemplate.cpp | 18 +++--- clang/test/Analysis/anonymous-decls.cpp | 4 +- clang/test/CXX/drs/cwg23xx.cpp | 4 +- .../dependent-template-recover.cpp | 4 +- .../elaborated-type-specifier.cpp | 63 ++++++++++++++++++- .../SemaTemplate/typename-specifier-3.cpp | 18 ++++++ 15 files changed, 234 insertions(+), 77 deletions(-) diff --git a/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp index fb82efb4dd211..6040cddf0e52a 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp @@ -54,8 +54,10 @@ static std::optional matchEnableIfSpecializationImplTypename(TypeLoc TheType) { if (const auto Dep = TheType.getAs()) { const IdentifierInfo *Identifier = Dep.getTypePtr()->getIdentifier(); + ElaboratedTypeKeyword Keyword = Dep.getTypePtr()->getKeyword(); if (!Identifier || Identifier->getName() != "type" || - Dep.getTypePtr()->getKeyword() != ElaboratedTypeKeyword::Typename) { + (Keyword != 
ElaboratedTypeKeyword::Typename && + Keyword != ElaboratedTypeKeyword::None)) { return std::nullopt; } TheType = Dep.getQualifierLoc().getTypeLoc(); @@ -108,8 +110,10 @@ matchEnableIfSpecializationImplTrait(TypeLoc TheType) { if (const auto *AliasedType = dyn_cast(Specialization->getAliasedType())) { + ElaboratedTypeKeyword Keyword = AliasedType->getKeyword(); if (AliasedType->getIdentifier()->getName() != "type" || - AliasedType->getKeyword() != ElaboratedTypeKeyword::Typename) { + (Keyword != ElaboratedTypeKeyword::Typename && + Keyword != ElaboratedTypeKeyword::None)) { return std::nullopt; } } else { diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 0891fd058bb57..07ff1251fc1ad 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -486,6 +486,10 @@ Bug Fixes to C++ Support - Fixes matching of nested template template parameters. (#GH130362) - Correctly diagnoses template template paramters which have a pack parameter not in the last position. +- Disallow overloading on struct vs class on dependent types, which is IFNDR, as + this makes the problem diagnosable. +- Improved preservation of the presence or absence of typename specifier when + printing types in diagnostics. - Clang now correctly parses ``if constexpr`` expressions in immediate function context. (#GH123524) - Fixed an assertion failure affecting code that uses C++23 "deducing this". (#GH130272) - Clang now properly instantiates destructors for initialized members within non-delegating constructors. (#GH93251) diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index 5bf036e3347eb..1ecd64539e2de 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -2838,6 +2838,20 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase { /// immediately following this class. 
template const T *getAs() const; + /// Look through sugar for an instance of TemplateSpecializationType which + /// is not a type alias, or null if there is no such type. + /// This is used when you want as-written template arguments or the template + /// name for a class template specialization. + const TemplateSpecializationType * + getAsNonAliasTemplateSpecializationType() const; + + const TemplateSpecializationType * + castAsNonAliasTemplateSpecializationType() const { + const auto *TST = getAsNonAliasTemplateSpecializationType(); + assert(TST && "not a TemplateSpecializationType"); + return TST; + } + /// Member-template getAsAdjusted. Look through specific kinds /// of sugar (parens, attributes, etc) for an instance of \. /// This is used when you need to walk over sugar nodes that represent some diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index c6ffe7bbf5257..bf24704e48eaa 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -5747,6 +5747,30 @@ ASTContext::getMacroQualifiedType(QualType UnderlyingTy, return QualType(newType, 0); } +static ElaboratedTypeKeyword +getCanonicalElaboratedTypeKeyword(ElaboratedTypeKeyword Keyword) { + switch (Keyword) { + // These are just themselves. + case ElaboratedTypeKeyword::None: + case ElaboratedTypeKeyword::Struct: + case ElaboratedTypeKeyword::Union: + case ElaboratedTypeKeyword::Enum: + case ElaboratedTypeKeyword::Interface: + return Keyword; + + // These are equivalent. + case ElaboratedTypeKeyword::Typename: + return ElaboratedTypeKeyword::None; + + // These are functionally equivalent, so relying on their equivalence is + // IFNDR. By making them equivalent, we disallow overloading, which at least + // can produce a diagnostic. 
+ case ElaboratedTypeKeyword::Class: + return ElaboratedTypeKeyword::Struct; + } + llvm_unreachable("unexpected keyword kind"); +} + QualType ASTContext::getDependentNameType(ElaboratedTypeKeyword Keyword, NestedNameSpecifier *NNS, const IdentifierInfo *Name) const { @@ -5758,10 +5782,13 @@ QualType ASTContext::getDependentNameType(ElaboratedTypeKeyword Keyword, DependentNameTypes.FindNodeOrInsertPos(ID, InsertPos)) return QualType(T, 0); + ElaboratedTypeKeyword CanonKeyword = + getCanonicalElaboratedTypeKeyword(Keyword); + NestedNameSpecifier *CanonNNS = getCanonicalNestedNameSpecifier(NNS); + QualType Canon; - if (NestedNameSpecifier *CanonNNS = getCanonicalNestedNameSpecifier(NNS); - CanonNNS != NNS) { - Canon = getDependentNameType(Keyword, CanonNNS, Name); + if (CanonKeyword != Keyword || CanonNNS != NNS) { + Canon = getDependentNameType(CanonKeyword, CanonNNS, Name); [[maybe_unused]] DependentNameType *T = DependentNameTypes.FindNodeOrInsertPos(ID, InsertPos); assert(!T && "broken canonicalization"); @@ -5800,19 +5827,19 @@ QualType ASTContext::getDependentTemplateSpecializationType( QualType Canon; if (!IsCanonical) { - ElaboratedTypeKeyword CanonKeyword = Keyword != ElaboratedTypeKeyword::None - ? Keyword - : ElaboratedTypeKeyword::Typename; + ElaboratedTypeKeyword CanonKeyword = + getCanonicalElaboratedTypeKeyword(Keyword); NestedNameSpecifier *CanonNNS = getCanonicalNestedNameSpecifier(NNS); bool AnyNonCanonArgs = false; auto CanonArgs = ::getCanonicalTemplateArguments(*this, Args, AnyNonCanonArgs); - if (AnyNonCanonArgs || CanonNNS != NNS || !Name.hasTemplateKeyword() || - CanonKeyword != Keyword) { + if (CanonKeyword != Keyword || AnyNonCanonArgs || CanonNNS != NNS || + !Name.hasTemplateKeyword()) { Canon = getDependentTemplateSpecializationType( CanonKeyword, {CanonNNS, Name.getName(), /*HasTemplateKeyword=*/true}, - CanonArgs, /*IsCanonical=*/true); + CanonArgs, + /*IsCanonical=*/true); // Find the insert position again. 
[[maybe_unused]] auto *Nothing = DependentTemplateSpecializationTypes.FindNodeOrInsertPos(ID, @@ -5820,7 +5847,7 @@ QualType ASTContext::getDependentTemplateSpecializationType( assert(!Nothing && "canonical type broken"); } } else { - assert(Keyword != ElaboratedTypeKeyword::None); + assert(Keyword == getCanonicalElaboratedTypeKeyword(Keyword)); assert(Name.hasTemplateKeyword()); assert(NNS == getCanonicalNestedNameSpecifier(NNS)); #ifndef NDEBUG @@ -7657,7 +7684,7 @@ ASTContext::getCanonicalNestedNameSpecifier(NestedNameSpecifier *NNS) const { if (const auto *DTST = T->getAs()) { const DependentTemplateStorage &DTN = DTST->getDependentTemplateName(); QualType NewT = getDependentTemplateSpecializationType( - ElaboratedTypeKeyword::Typename, + ElaboratedTypeKeyword::None, {/*NNS=*/nullptr, DTN.getName(), /*HasTemplateKeyword=*/true}, DTST->template_arguments(), /*IsCanonical=*/true); assert(NewT.isCanonical()); diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index 53620003c9655..42e94d66d1a13 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -1938,6 +1938,14 @@ TagDecl *Type::getAsTagDecl() const { return nullptr; } +const TemplateSpecializationType * +Type::getAsNonAliasTemplateSpecializationType() const { + const auto *TST = getAs(); + while (TST && TST->isTypeAlias()) + TST = TST->desugar()->getAs(); + return TST; +} + bool Type::hasAttr(attr::Kind AK) const { const Type *Cur = this; while (const auto *AT = Cur->getAs()) { diff --git a/clang/lib/AST/TypeLoc.cpp b/clang/lib/AST/TypeLoc.cpp index 24726901b8f55..3d1b5ca966b66 100644 --- a/clang/lib/AST/TypeLoc.cpp +++ b/clang/lib/AST/TypeLoc.cpp @@ -546,37 +546,47 @@ void UnaryTransformTypeLoc::initializeLocal(ASTContext &Context, Context.getTrivialTypeSourceInfo(getTypePtr()->getBaseType(), Loc)); } +template +static void initializeElaboratedKeyword(TL T, SourceLocation Loc) { + T.setElaboratedKeywordLoc(T.getTypePtr()->getKeyword() != + ElaboratedTypeKeyword::None + ? 
Loc + : SourceLocation()); +} + +static NestedNameSpecifierLoc +initializeQualifier(ASTContext &Context, NestedNameSpecifier *Qualifier, + SourceLocation Loc) { + if (!Qualifier) + return NestedNameSpecifierLoc(); + NestedNameSpecifierLocBuilder Builder; + Builder.MakeTrivial(Context, Qualifier, Loc); + return Builder.getWithLocInContext(Context); +} + void ElaboratedTypeLoc::initializeLocal(ASTContext &Context, SourceLocation Loc) { if (isEmpty()) return; - setElaboratedKeywordLoc(Loc); - NestedNameSpecifierLocBuilder Builder; - Builder.MakeTrivial(Context, getTypePtr()->getQualifier(), Loc); - setQualifierLoc(Builder.getWithLocInContext(Context)); + initializeElaboratedKeyword(*this, Loc); + setQualifierLoc( + initializeQualifier(Context, getTypePtr()->getQualifier(), Loc)); } void DependentNameTypeLoc::initializeLocal(ASTContext &Context, SourceLocation Loc) { - setElaboratedKeywordLoc(Loc); - NestedNameSpecifierLocBuilder Builder; - Builder.MakeTrivial(Context, getTypePtr()->getQualifier(), Loc); - setQualifierLoc(Builder.getWithLocInContext(Context)); + initializeElaboratedKeyword(*this, Loc); + setQualifierLoc( + initializeQualifier(Context, getTypePtr()->getQualifier(), Loc)); setNameLoc(Loc); } void DependentTemplateSpecializationTypeLoc::initializeLocal(ASTContext &Context, SourceLocation Loc) { - setElaboratedKeywordLoc(Loc); - if (NestedNameSpecifier *Qualifier = - getTypePtr()->getDependentTemplateName().getQualifier()) { - NestedNameSpecifierLocBuilder Builder; - Builder.MakeTrivial(Context, Qualifier, Loc); - setQualifierLoc(Builder.getWithLocInContext(Context)); - } else { - setQualifierLoc(NestedNameSpecifierLoc()); - } + initializeElaboratedKeyword(*this, Loc); + setQualifierLoc(initializeQualifier( + Context, getTypePtr()->getDependentTemplateName().getQualifier(), Loc)); setTemplateKeywordLoc(Loc); setTemplateNameLoc(Loc); setLAngleLoc(Loc); diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 127c0a4500a43..46933c5c43168 
100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -247,15 +247,15 @@ static ParsedType recoverFromTypeInKnownDependentBase(Sema &S, return nullptr; // We found some types in dependent base classes. Recover as if the user - // wrote 'typename MyClass::II' instead of 'II'. We'll fully resolve the - // lookup during template instantiation. + // wrote 'MyClass::II' instead of 'II', and this implicit typename was + // allowed. We'll fully resolve the lookup during template instantiation. S.Diag(NameLoc, diag::ext_found_in_dependent_base) << &II; ASTContext &Context = S.Context; auto *NNS = NestedNameSpecifier::Create( Context, nullptr, cast(Context.getRecordType(RD))); QualType T = - Context.getDependentNameType(ElaboratedTypeKeyword::Typename, NNS, &II); + Context.getDependentNameType(ElaboratedTypeKeyword::None, NNS, &II); CXXScopeSpec SS; SS.MakeTrivial(Context, NNS, SourceRange(NameLoc)); diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index 05991228dbfc2..d4e48a14d13c2 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -12154,33 +12154,31 @@ static bool isStdClassTemplate(Sema &S, QualType SugaredType, QualType *TypeArg, }; ClassTemplateDecl *Template = nullptr; - const TemplateArgument *Arguments = nullptr; - - QualType Ty = S.Context.getCanonicalType(SugaredType); - if (const RecordType *RT = Ty->getAs()) { - ClassTemplateSpecializationDecl *Specialization = - dyn_cast(RT->getDecl()); - if (!Specialization) { - ReportMatchingNameAsMalformed(RT->getDecl()); - return false; - } - - Template = Specialization->getSpecializedTemplate(); - Arguments = Specialization->getTemplateArgs().data(); - } else { - const TemplateSpecializationType *TST = nullptr; - if (auto *ICN = Ty->getAs()) - TST = ICN->getInjectedTST(); - else - TST = Ty->getAs(); + ArrayRef Arguments; + { + const TemplateSpecializationType *TST = + SugaredType->getAsNonAliasTemplateSpecializationType(); + if (!TST) + 
if (const auto *ICN = SugaredType->getAs()) + TST = ICN->getInjectedTST(); if (TST) { Template = dyn_cast_or_null( TST->getTemplateName().getAsTemplateDecl()); - Arguments = TST->template_arguments().begin(); + Arguments = TST->template_arguments(); + } else if (const RecordType *RT = SugaredType->getAs()) { + ClassTemplateSpecializationDecl *Specialization = + dyn_cast(RT->getDecl()); + if (!Specialization) { + ReportMatchingNameAsMalformed(RT->getDecl()); + return false; + } + Template = Specialization->getSpecializedTemplate(); + Arguments = Specialization->getTemplateArgs().asArray(); } } + if (!Template) { - ReportMatchingNameAsMalformed(Ty->getAsTagDecl()); + ReportMatchingNameAsMalformed(SugaredType->getAsTagDecl()); return false; } @@ -12200,7 +12198,8 @@ static bool isStdClassTemplate(Sema &S, QualType SugaredType, QualType *TypeArg, // template? TemplateParameterList *Params = Template->getTemplateParameters(); if (Params->getMinRequiredArguments() != 1 || - !isa(Params->getParam(0))) { + !isa(Params->getParam(0)) || + Params->getParam(0)->isTemplateParameterPack()) { if (MalformedDecl) *MalformedDecl = TemplateClass; return false; @@ -12214,8 +12213,21 @@ static bool isStdClassTemplate(Sema &S, QualType SugaredType, QualType *TypeArg, return false; // This is an instance of std::{ClassName}. Find the argument type. - if (TypeArg) - *TypeArg = Arguments[0].getAsType(); + if (TypeArg) { + QualType ArgType = Arguments[0].getAsType(); + // FIXME: Since TST only has as-written arguments, we have to perform the + // only kind of conversion applicable to type arguments; in Objective-C ARC: + // - If an explicitly-specified template argument type is a lifetime type + // with no lifetime qualifier, the __strong lifetime qualifier is + // inferred. 
+ if (S.getLangOpts().ObjCAutoRefCount && ArgType->isObjCLifetimeType() && + !ArgType.getObjCLifetime()) { + Qualifiers Qs; + Qs.setObjCLifetime(Qualifiers::OCL_Strong); + ArgType = S.Context.getQualifiedType(ArgType, Qs); + } + *TypeArg = ArgType; + } return true; } diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index a1e4bb4321d53..77d7f821f2011 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -9897,7 +9897,7 @@ QualType Sema::DeduceTemplateSpecializationFromInitializer( auto TemplateName = DeducedTST->getTemplateName(); if (TemplateName.isDependent()) - return SubstAutoTypeDependent(TSInfo->getType()); + return SubstAutoTypeSourceInfoDependent(TSInfo)->getType(); // We can only perform deduction for class templates or alias templates. auto *Template = @@ -9942,7 +9942,7 @@ QualType Sema::DeduceTemplateSpecializationFromInitializer( Diag(TSInfo->getTypeLoc().getBeginLoc(), diag::warn_cxx14_compat_class_template_argument_deduction) << TSInfo->getTypeLoc().getSourceRange() << 0; - return SubstAutoTypeDependent(TSInfo->getType()); + return SubstAutoTypeSourceInfoDependent(TSInfo)->getType(); } // FIXME: Perform "exact type" matching first, per CWG discussion? @@ -10253,7 +10253,8 @@ QualType Sema::DeduceTemplateSpecializationFromInitializer( // The placeholder is replaced by the return type of the function selected // by overload resolution for class template deduction. 
QualType DeducedType = - SubstAutoType(TSInfo->getType(), Best->Function->getReturnType()); + SubstAutoTypeSourceInfo(TSInfo, Best->Function->getReturnType()) + ->getType(); Diag(TSInfo->getTypeLoc().getBeginLoc(), diag::warn_cxx14_compat_class_template_argument_deduction) << TSInfo->getTypeLoc().getSourceRange() << 1 << DeducedType; diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index 6b7892fa30989..894f072d84989 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -4892,7 +4892,7 @@ bool Sema::CheckTemplateTypeArgument( // Recover by synthesizing a type using the location information that we // already have. - ArgType = Context.getDependentNameType(ElaboratedTypeKeyword::Typename, + ArgType = Context.getDependentNameType(ElaboratedTypeKeyword::None, SS.getScopeRep(), II); TypeLocBuilder TLB; DependentNameTypeLoc TL = TLB.push(ArgType); @@ -10672,10 +10672,8 @@ TypeResult Sema::ActOnTypenameType(Scope *S, SourceLocation TypenameLoc, NestedNameSpecifierLoc QualifierLoc = SS.getWithLocInContext(Context); TypeSourceInfo *TSI = nullptr; QualType T = - CheckTypenameType((TypenameLoc.isValid() || - IsImplicitTypename == ImplicitTypenameContext::Yes) - ? ElaboratedTypeKeyword::Typename - : ElaboratedTypeKeyword::None, + CheckTypenameType(TypenameLoc.isValid() ? ElaboratedTypeKeyword::Typename + : ElaboratedTypeKeyword::None, TypenameLoc, QualifierLoc, II, IdLoc, &TSI, /*DeducedTSTContext=*/true); if (T.isNull()) @@ -10713,6 +10711,9 @@ Sema::ActOnTypenameType(Scope *S, SourceLocation TypenameLoc, TemplateArgumentListInfo TemplateArgs(LAngleLoc, RAngleLoc); translateTemplateArguments(TemplateArgsIn, TemplateArgs); + auto Keyword = TypenameLoc.isValid() ? ElaboratedTypeKeyword::Typename + : ElaboratedTypeKeyword::None; + TemplateName Template = TemplateIn.get(); if (DependentTemplateName *DTN = Template.getAsDependentTemplateName()) { // Construct a dependent template specialization type. 
@@ -10726,7 +10727,7 @@ Sema::ActOnTypenameType(Scope *S, SourceLocation TypenameLoc, } QualType T = Context.getDependentTemplateSpecializationType( - ElaboratedTypeKeyword::Typename, *DTN, TemplateArgs.arguments()); + Keyword, *DTN, TemplateArgs.arguments()); // Create source-location information for this type. TypeLocBuilder Builder; @@ -10758,8 +10759,7 @@ Sema::ActOnTypenameType(Scope *S, SourceLocation TypenameLoc, for (unsigned I = 0, N = TemplateArgs.size(); I != N; ++I) SpecTL.setArgLocInfo(I, TemplateArgs[I].getLocInfo()); - T = Context.getElaboratedType(ElaboratedTypeKeyword::Typename, - SS.getScopeRep(), T); + T = Context.getElaboratedType(Keyword, SS.getScopeRep(), T); ElaboratedTypeLoc TL = Builder.push(T); TL.setElaboratedKeywordLoc(TypenameLoc); TL.setQualifierLoc(SS.getWithLocInContext(Context)); @@ -10853,6 +10853,8 @@ Sema::CheckTypenameType(ElaboratedTypeKeyword Keyword, NestedNameSpecifierLoc QualifierLoc, const IdentifierInfo &II, SourceLocation IILoc, bool DeducedTSTContext) { + assert((Keyword != ElaboratedTypeKeyword::None) == KeywordLoc.isValid()); + CXXScopeSpec SS; SS.Adopt(QualifierLoc); diff --git a/clang/test/Analysis/anonymous-decls.cpp b/clang/test/Analysis/anonymous-decls.cpp index 211184523aa51..85449caa46972 100644 --- a/clang/test/Analysis/anonymous-decls.cpp +++ b/clang/test/Analysis/anonymous-decls.cpp @@ -74,13 +74,13 @@ int main() { // CHECK-NEXT: 4: * [B3.3] (OperatorCall) // CHECK-NEXT: 5: auto &; // CHECK-NEXT: 6: get<0UL> -// CHECK-NEXT: 7: [B3.6] (ImplicitCastExpr, FunctionToPointerDecay, typename tuple_element<0L, pair >::type (*)(pair &)) +// CHECK-NEXT: 7: [B3.6] (ImplicitCastExpr, FunctionToPointerDecay, tuple_element<0L, pair >::type (*)(pair &)) // CHECK-NEXT: 8: decomposition-a-b // CHECK-NEXT: 9: [B3.7]([B3.8]) // CHECK-NEXT: 10: [B3.9] // CHECK-NEXT: 11: std::tuple_element<0, std::pair>::type a = get<0UL>(decomposition-a-b); // CHECK-NEXT: 12: get<1UL> -// CHECK-NEXT: 13: [B3.12] (ImplicitCastExpr, 
FunctionToPointerDecay, typename tuple_element<1L, pair >::type (*)(pair &)) +// CHECK-NEXT: 13: [B3.12] (ImplicitCastExpr, FunctionToPointerDecay, tuple_element<1L, pair >::type (*)(pair &)) // CHECK-NEXT: 14: decomposition-a-b // CHECK-NEXT: 15: [B3.13]([B3.14]) // CHECK-NEXT: 16: [B3.15] diff --git a/clang/test/CXX/drs/cwg23xx.cpp b/clang/test/CXX/drs/cwg23xx.cpp index 78cecb8b71bca..74e72f2371e2a 100644 --- a/clang/test/CXX/drs/cwg23xx.cpp +++ b/clang/test/CXX/drs/cwg23xx.cpp @@ -91,7 +91,7 @@ struct Y {}; struct Z : W, X, check_derived_from, // #cwg2310-X check_derived_from, Y // #cwg2310-Y -{ +{ // FIXME: It was properly rejected before, but we're crashing since Clang 11 in C++11 and C++14 modes. // See https://github.com/llvm/llvm-project/issues/59920 #if __cplusplus >= 201703L @@ -188,7 +188,7 @@ struct InitListCtor { std::initializer_list i; auto j = std::initializer_list{ i }; -// since-cxx17-error@-1 {{conversion function from 'std::initializer_list' to 'const cwg2311::InitListCtor' invokes a deleted function}} +// since-cxx17-error@-1 {{conversion function from 'std::initializer_list' to 'const InitListCtor' invokes a deleted function}} // since-cxx17-note@#cwg2311-InitListCtor {{'InitListCtor' has been explicitly marked deleted here}} #endif } // namespace cwg2311 diff --git a/clang/test/SemaTemplate/dependent-template-recover.cpp b/clang/test/SemaTemplate/dependent-template-recover.cpp index 251a8f9816417..21e6a963719bd 100644 --- a/clang/test/SemaTemplate/dependent-template-recover.cpp +++ b/clang/test/SemaTemplate/dependent-template-recover.cpp @@ -146,9 +146,9 @@ namespace templ_spec { // FIXME: Why error recovery for the non-typename case is so bad? 
A> t3; // expected-error {{did you forget 'typename'}} - // expected-error@-1 {{'A' (aka 'void')}} + // expected-error@-1 {{'A' (aka 'void')}} A> t4; // expected-error {{use 'template' keyword}} expected-error {{did you forget 'typename'}} - // expected-error@-1 {{'A' (aka 'void')}} + // expected-error@-1 {{'A' (aka 'void')}} }; } // namespace templ_spec diff --git a/clang/test/SemaTemplate/elaborated-type-specifier.cpp b/clang/test/SemaTemplate/elaborated-type-specifier.cpp index 27b3f36ee14dd..95c2aa9f60a39 100644 --- a/clang/test/SemaTemplate/elaborated-type-specifier.cpp +++ b/clang/test/SemaTemplate/elaborated-type-specifier.cpp @@ -10,7 +10,7 @@ namespace PR6915 { struct D1 { enum X { value }; }; - struct D2 { + struct D2 { class X { }; // expected-note{{previous use is here}} }; struct D3 { }; @@ -25,12 +25,12 @@ struct DeclOrDef { enum T::foo; // expected-error{{nested name specifier for a declaration cannot depend on a template parameter}} // expected-error@-1{{forward declaration of enum cannot have a nested name specifier}} enum T::bar { // expected-error{{nested name specifier for a declaration cannot depend on a template parameter}} - value + value }; }; namespace PR6649 { - template struct foo { + template struct foo { class T::bar; // expected-error{{nested name specifier for a declaration cannot depend on a template parameter}} // expected-error@-1{{forward declaration of class cannot have a nested name specifier}} class T::bar { int x; }; // expected-error{{nested name specifier for a declaration cannot depend on a template parameter}} @@ -40,3 +40,60 @@ namespace PR6649 { namespace rdar8568507 { template struct A *makeA(T t); } + +namespace canon { + template void t1(struct T::X) {} + // expected-note@-1 {{previous definition is here}} + template void t1(class T::X) {} + // expected-error@-1 {{redefinition of 't1'}} + + template void t2(struct T::template X) {} + // expected-note@-1 {{previous definition is here}} + template void t2(class 
T::template X) {} + // expected-error@-1 {{redefinition of 't2'}} + + template constexpr int t3(typename T::X* = 0) { return 0; } // #canon-t3-0 + template constexpr int t3(struct T::X* = 0) { return 1; } // #canon-t3-1 + template constexpr int t3(union T::X* = 0) { return 2; } // #canon-t3-2 + template constexpr int t3(enum T::X* = 0) { return 3; } // #canon-t3-3 + + struct A { using X = int; }; + static_assert(t3() == 0); + + struct B { struct X {}; }; + static_assert(t3() == 1); + // expected-error@-1 {{call to 't3' is ambiguous}} + // expected-note@#canon-t3-0 {{candidate function}} + // expected-note@#canon-t3-1 {{candidate function}} + + struct C { union X {}; }; + static_assert(t3() == 2); + // expected-error@-1 {{call to 't3' is ambiguous}} + // expected-note@#canon-t3-0 {{candidate function}} + // expected-note@#canon-t3-2 {{candidate function}} + + struct D { enum X {}; }; + static_assert(t3() == 3); + // expected-error@-1 {{call to 't3' is ambiguous}} + // expected-note@#canon-t3-0 {{candidate function}} + // expected-note@#canon-t3-3 {{candidate function}} + + template constexpr int t4(typename T::template X* = 0) { return 0; } + // expected-note@-1 3{{candidate function}} + template constexpr int t4(struct T::template X* = 0) { return 1; } + // expected-note@-1 3{{candidate function}} + template constexpr int t4(union T::template X* = 0) { return 2; } + // expected-note@-1 3{{candidate function}} + + // FIXME: This should work. + struct E { template using X = T; }; + static_assert(t4() == 0); // expected-error {{call to 't4' is ambiguous}} + + // FIXME: Should not match the union overload. + struct F { template struct X {}; }; + static_assert(t4() == 1); // expected-error {{call to 't4' is ambiguous}} + + // FIXME: Should not match the struct overload. 
+ struct G { template union X {}; }; + static_assert(t4() == 2); // expected-error {{call to 't4' is ambiguous}} +} // namespace canon diff --git a/clang/test/SemaTemplate/typename-specifier-3.cpp b/clang/test/SemaTemplate/typename-specifier-3.cpp index 0140b1a479c2d..cdd065c98bb0a 100644 --- a/clang/test/SemaTemplate/typename-specifier-3.cpp +++ b/clang/test/SemaTemplate/typename-specifier-3.cpp @@ -75,3 +75,21 @@ namespace PR12884_fixed { A::C::x a; // ok } + +namespace preserve_keyword { + template struct A { + using type = T; + }; + + template using B = A::type*; // precxx17-warning {{missing 'typename'}} + void *t1 = *B(); // expected-error {{lvalue of type 'A::type' (aka 'int')}} + + template using C = typename A::type*; + void *t2 = *C(); // expected-error {{lvalue of type 'typename A::type' (aka 'int')}} + + using D = A::type*; + void *t3 = *D(); // expected-error {{lvalue of type 'A::type' (aka 'int')}} + + using D = typename A::type*; + void *t4 = *D(); // expected-error {{lvalue of type 'typename A::type' (aka 'int')}} +} // namespace preserve_keyword From c4e9901b5b660f7c64570c3440080436c8e8b32e Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 16 Apr 2025 12:28:47 -0700 Subject: [PATCH 165/710] [llvm] Use llvm::append_range (NFC) (#135931) --- llvm/include/llvm/IR/ModuleSummaryIndex.h | 6 ++++-- llvm/lib/Analysis/MemoryProfileInfo.cpp | 6 ++---- llvm/lib/Analysis/ScalarEvolution.cpp | 6 ++---- llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 2 +- llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp | 4 +--- llvm/lib/DebugInfo/LogicalView/Core/LVCompare.cpp | 2 +- llvm/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp | 2 +- .../ExecutionEngine/Orc/Debugging/VTuneSupportPlugin.cpp | 2 +- llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp | 4 ++-- llvm/lib/IR/DebugInfoMetadata.cpp | 2 +- llvm/lib/MC/DXContainerPSVInfo.cpp | 3 +-- llvm/lib/MC/MCParser/MasmParser.cpp | 5 ++--- llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp | 4 ++-- llvm/lib/ProfileData/InstrProfReader.cpp | 2 
+- llvm/lib/TargetParser/SubtargetFeature.cpp | 2 +- llvm/unittests/DebugInfo/DWARF/DwarfGenerator.cpp | 4 ++-- llvm/unittests/DebugInfo/PDB/HashTableTest.cpp | 2 +- llvm/unittests/Transforms/IPO/LowerTypeTests.cpp | 2 +- llvm/utils/TableGen/Common/CodeGenRegisters.cpp | 6 +++--- llvm/utils/TableGen/RegisterInfoEmitter.cpp | 2 +- 20 files changed, 31 insertions(+), 37 deletions(-) diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h index 7aa36345268cd..3f20d40722ca2 100644 --- a/llvm/include/llvm/IR/ModuleSummaryIndex.h +++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h @@ -1319,8 +1319,10 @@ class CfiFunctionIndex { std::vector symbols() const { std::vector Symbols; - for (auto &[GUID, Syms] : Index) - Symbols.insert(Symbols.end(), Syms.begin(), Syms.end()); + for (auto &[GUID, Syms] : Index) { + (void)GUID; + llvm::append_range(Symbols, Syms); + } return Symbols; } diff --git a/llvm/lib/Analysis/MemoryProfileInfo.cpp b/llvm/lib/Analysis/MemoryProfileInfo.cpp index 95138de592290..6ca5b5e492723 100644 --- a/llvm/lib/Analysis/MemoryProfileInfo.cpp +++ b/llvm/lib/Analysis/MemoryProfileInfo.cpp @@ -181,8 +181,7 @@ void CallStackTrie::addCallStack( Curr = New; } assert(Curr); - Curr->ContextSizeInfo.insert(Curr->ContextSizeInfo.end(), - ContextSizeInfo.begin(), ContextSizeInfo.end()); + llvm::append_range(Curr->ContextSizeInfo, ContextSizeInfo); } void CallStackTrie::addCallStack(MDNode *MIB) { @@ -235,8 +234,7 @@ static MDNode *createMIBNode(LLVMContext &Ctx, ArrayRef MIBCallStack, void CallStackTrie::collectContextSizeInfo( CallStackTrieNode *Node, std::vector &ContextSizeInfo) { - ContextSizeInfo.insert(ContextSizeInfo.end(), Node->ContextSizeInfo.begin(), - Node->ContextSizeInfo.end()); + llvm::append_range(ContextSizeInfo, Node->ContextSizeInfo); for (auto &Caller : Node->Callers) collectContextSizeInfo(Caller.second, ContextSizeInfo); } diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp 
b/llvm/lib/Analysis/ScalarEvolution.cpp index d193c9e3210ea..5132ee13a9632 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -8503,10 +8503,8 @@ void ScalarEvolution::forgetLoop(const Loop *L) { } auto LoopUsersItr = LoopUsers.find(CurrL); - if (LoopUsersItr != LoopUsers.end()) { - ToForget.insert(ToForget.end(), LoopUsersItr->second.begin(), - LoopUsersItr->second.end()); - } + if (LoopUsersItr != LoopUsers.end()) + llvm::append_range(ToForget, LoopUsersItr->second); // Drop information about expressions based on loop-header PHIs. PushLoopPHIs(CurrL, Worklist, Visited); diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index ad15f13902e63..73bed85c65b3d 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -5098,7 +5098,7 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { return; for (GlobalValue::GUID GUID : DefOrUseGUIDs) { auto Defs = CfiIndex.forGuid(GUID); - Functions.insert(Functions.end(), Defs.begin(), Defs.end()); + llvm::append_range(Functions, Defs); } if (Functions.empty()) return; diff --git a/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp b/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp index 642ab61756ea5..22137ea172240 100644 --- a/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp +++ b/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp @@ -621,9 +621,7 @@ void DwarfTransformer::parseCallSiteInfoFromDwarf(CUInfo &CUI, DWARFDie Die, if (!FI.CallSites) FI.CallSites = CallSiteInfoCollection(); // Append parsed DWARF callsites: - FI.CallSites->CallSites.insert(FI.CallSites->CallSites.end(), - CSIC.CallSites.begin(), - CSIC.CallSites.end()); + llvm::append_range(FI.CallSites->CallSites, CSIC.CallSites); } } diff --git a/llvm/lib/DebugInfo/LogicalView/Core/LVCompare.cpp b/llvm/lib/DebugInfo/LogicalView/Core/LVCompare.cpp index 5673ea7c2cd23..3cb2662f2f313 100644 --- 
a/llvm/lib/DebugInfo/LogicalView/Core/LVCompare.cpp +++ b/llvm/lib/DebugInfo/LogicalView/Core/LVCompare.cpp @@ -230,7 +230,7 @@ Error LVCompare::execute(LVReader *ReferenceReader, LVReader *TargetReader) { } if (Pass == LVComparePass::Added) // Record all the current missing elements for this category. - Set.insert(Set.end(), Elements.begin(), Elements.end()); + llvm::append_range(Set, Elements); if (options().getReportList()) { if (Elements.size()) { OS << "\n(" << Elements.size() << ") " diff --git a/llvm/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp b/llvm/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp index 4f8f883a75f32..87675be1fc8e1 100644 --- a/llvm/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp +++ b/llvm/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp @@ -485,7 +485,7 @@ static GenericValue lle_X_fprintf(FunctionType *FT, char Buffer[10000]; std::vector NewArgs; NewArgs.push_back(PTOGV(Buffer)); - NewArgs.insert(NewArgs.end(), Args.begin()+1, Args.end()); + llvm::append_range(NewArgs, llvm::drop_begin(Args)); GenericValue GV = lle_X_sprintf(FT, NewArgs); fputs(Buffer, (FILE *) GVTOP(Args[0])); diff --git a/llvm/lib/ExecutionEngine/Orc/Debugging/VTuneSupportPlugin.cpp b/llvm/lib/ExecutionEngine/Orc/Debugging/VTuneSupportPlugin.cpp index 6a00b87dd0a6b..8793d6f8ab90b 100644 --- a/llvm/lib/ExecutionEngine/Orc/Debugging/VTuneSupportPlugin.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Debugging/VTuneSupportPlugin.cpp @@ -162,7 +162,7 @@ void VTuneSupportPlugin::notifyTransferringResources(JITDylib &JD, return; auto &Dest = LoadedMethodIDs[DstKey]; - Dest.insert(Dest.end(), I->second.begin(), I->second.end()); + llvm::append_range(Dest, I->second); LoadedMethodIDs.erase(SrcKey); } diff --git a/llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp b/llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp index 80f2a1304dde7..48b096f62ff29 100644 --- a/llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp +++ b/llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp @@ 
-323,7 +323,7 @@ void LazyReexportsManager::handleTransferResources(JITDylib &JD, } else { auto &SrcAddrs = I->second; auto &DstAddrs = J->second; - DstAddrs.insert(DstAddrs.end(), SrcAddrs.begin(), SrcAddrs.end()); + llvm::append_range(DstAddrs, SrcAddrs); KeyToReentryAddrs.erase(I); } if (L) @@ -503,7 +503,7 @@ void SimpleLazyReexportsSpeculator::onLazyReexportsTransfered( } else { auto &SrcNames = J->second; auto &DstNames = K->second; - DstNames.insert(DstNames.end(), SrcNames.begin(), SrcNames.end()); + llvm::append_range(DstNames, SrcNames); MapForJD.erase(J); } } diff --git a/llvm/lib/IR/DebugInfoMetadata.cpp b/llvm/lib/IR/DebugInfoMetadata.cpp index 12aba7d2bd123..b8b824aed7178 100644 --- a/llvm/lib/IR/DebugInfoMetadata.cpp +++ b/llvm/lib/IR/DebugInfoMetadata.cpp @@ -1990,7 +1990,7 @@ DIExpression *DIExpression::appendOpsToArg(const DIExpression *Expr, } Op.appendToVector(NewOps); if (Op.getOp() == dwarf::DW_OP_LLVM_arg && Op.getArg(0) == ArgNo) - NewOps.insert(NewOps.end(), Ops.begin(), Ops.end()); + llvm::append_range(NewOps, Ops); } if (StackValue) NewOps.push_back(dwarf::DW_OP_stack_value); diff --git a/llvm/lib/MC/DXContainerPSVInfo.cpp b/llvm/lib/MC/DXContainerPSVInfo.cpp index aeff693801397..f70c8b1af01b3 100644 --- a/llvm/lib/MC/DXContainerPSVInfo.cpp +++ b/llvm/lib/MC/DXContainerPSVInfo.cpp @@ -58,8 +58,7 @@ ProcessElementList(StringTableBuilder &StrTabBuilder, size_t Idx = FindSequence(IndexBuffer, El.Indices); if (Idx == npos) { FinalElement.IndicesOffset = static_cast(IndexBuffer.size()); - IndexBuffer.insert(IndexBuffer.end(), El.Indices.begin(), - El.Indices.end()); + llvm::append_range(IndexBuffer, El.Indices); } else FinalElement.IndicesOffset = static_cast(Idx); FinalElements.push_back(FinalElement); diff --git a/llvm/lib/MC/MCParser/MasmParser.cpp b/llvm/lib/MC/MCParser/MasmParser.cpp index bbcdffd4d4fa8..f758020566465 100644 --- a/llvm/lib/MC/MCParser/MasmParser.cpp +++ b/llvm/lib/MC/MCParser/MasmParser.cpp @@ -3647,9 +3647,8 @@ bool 
MasmParser::parseFieldInitializer(const FieldInfo &Field, std::to_string(Initializers.size())); } // Default-initialize all remaining values. - Initializers.insert(Initializers.end(), - Contents.Initializers.begin() + Initializers.size(), - Contents.Initializers.end()); + llvm::append_range(Initializers, llvm::drop_begin(Contents.Initializers, + Initializers.size())); Initializer = FieldInitializer(std::move(Initializers), Contents.Structure); return false; diff --git a/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp index b0ec215aec203..935f89ad76440 100644 --- a/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp +++ b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp @@ -662,13 +662,13 @@ RemoveNoteDetail::updateData(ArrayRef OldData, for (const DeletedRange &RemRange : ToRemove) { if (CurPos < RemRange.OldFrom) { auto Slice = OldData.slice(CurPos, RemRange.OldFrom - CurPos); - NewData.insert(NewData.end(), Slice.begin(), Slice.end()); + llvm::append_range(NewData, Slice); } CurPos = RemRange.OldTo; } if (CurPos < OldData.size()) { auto Slice = OldData.slice(CurPos); - NewData.insert(NewData.end(), Slice.begin(), Slice.end()); + llvm::append_range(NewData, Slice); } return NewData; } diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp index cac1760d3ef80..4075b513c218d 100644 --- a/llvm/lib/ProfileData/InstrProfReader.cpp +++ b/llvm/lib/ProfileData/InstrProfReader.cpp @@ -1096,7 +1096,7 @@ class llvm::InstrProfReaderItaniumRemapper SmallVectorImpl &Out) { Out.reserve(OrigName.size() + Replacement.size() - ExtractedName.size()); Out.insert(Out.end(), OrigName.begin(), ExtractedName.begin()); - Out.insert(Out.end(), Replacement.begin(), Replacement.end()); + llvm::append_range(Out, Replacement); Out.insert(Out.end(), ExtractedName.end(), OrigName.end()); } diff --git a/llvm/lib/TargetParser/SubtargetFeature.cpp b/llvm/lib/TargetParser/SubtargetFeature.cpp index be42a42967332..36c67f661d9a5 100644 --- 
a/llvm/lib/TargetParser/SubtargetFeature.cpp +++ b/llvm/lib/TargetParser/SubtargetFeature.cpp @@ -43,7 +43,7 @@ void SubtargetFeatures::AddFeature(StringRef String, bool Enable) { void SubtargetFeatures::addFeaturesVector( const ArrayRef OtherFeatures) { - Features.insert(Features.cend(), OtherFeatures.begin(), OtherFeatures.end()); + llvm::append_range(Features, OtherFeatures); } SubtargetFeatures::SubtargetFeatures(StringRef Initial) { diff --git a/llvm/unittests/DebugInfo/DWARF/DwarfGenerator.cpp b/llvm/unittests/DebugInfo/DWARF/DwarfGenerator.cpp index cf4a5f27585d0..f8d161d8c50b6 100644 --- a/llvm/unittests/DebugInfo/DWARF/DwarfGenerator.cpp +++ b/llvm/unittests/DebugInfo/DWARF/DwarfGenerator.cpp @@ -243,7 +243,7 @@ void dwarfgen::LineTable::addByte(uint8_t Value) { void dwarfgen::LineTable::addStandardOpcode(uint8_t Opcode, ArrayRef Operands) { Contents.push_back({Opcode, Byte}); - Contents.insert(Contents.end(), Operands.begin(), Operands.end()); + llvm::append_range(Contents, Operands); } void dwarfgen::LineTable::addExtendedOpcode(uint64_t Length, uint8_t Opcode, @@ -251,7 +251,7 @@ void dwarfgen::LineTable::addExtendedOpcode(uint64_t Length, uint8_t Opcode, Contents.push_back({0, Byte}); Contents.push_back({Length, ULEB}); Contents.push_back({Opcode, Byte}); - Contents.insert(Contents.end(), Operands.begin(), Operands.end()); + llvm::append_range(Contents, Operands); } void dwarfgen::LineTable::generate(MCContext &MC, AsmPrinter &Asm) const { diff --git a/llvm/unittests/DebugInfo/PDB/HashTableTest.cpp b/llvm/unittests/DebugInfo/PDB/HashTableTest.cpp index 6d17332f49079..94e82ed02c398 100644 --- a/llvm/unittests/DebugInfo/PDB/HashTableTest.cpp +++ b/llvm/unittests/DebugInfo/PDB/HashTableTest.cpp @@ -233,7 +233,7 @@ struct FooBarHashTraits { uint32_t lookupKeyToStorageKey(StringRef S) { uint32_t N = Buffer.size(); - Buffer.insert(Buffer.end(), S.begin(), S.end()); + llvm::append_range(Buffer, S); Buffer.push_back('\0'); return N; } diff --git 
a/llvm/unittests/Transforms/IPO/LowerTypeTests.cpp b/llvm/unittests/Transforms/IPO/LowerTypeTests.cpp index ba13378099ecb..1602826b7252c 100644 --- a/llvm/unittests/Transforms/IPO/LowerTypeTests.cpp +++ b/llvm/unittests/Transforms/IPO/LowerTypeTests.cpp @@ -100,7 +100,7 @@ TEST(LowerTypeTests, GlobalLayoutBuilder) { std::vector ComputedLayout; for (auto &&F : GLB.Fragments) - ComputedLayout.insert(ComputedLayout.end(), F.begin(), F.end()); + llvm::append_range(ComputedLayout, F); EXPECT_EQ(T.WantLayout, ComputedLayout); } diff --git a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp index 3a6e828a99f2d..eb142e66faf2f 100644 --- a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp +++ b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp @@ -678,7 +678,7 @@ struct TupleExpander : SetTheory::Expander { // Take the cost list of the first register in the tuple. const ListInit *CostList = Proto->getValueAsListInit("CostPerUse"); SmallVector CostPerUse; - CostPerUse.insert(CostPerUse.end(), CostList->begin(), CostList->end()); + llvm::append_range(CostPerUse, *CostList); const StringInit *AsmName = StringInit::get(RK, ""); if (!RegNames.empty()) { @@ -1186,7 +1186,7 @@ void CodeGenRegisterClass::extendSuperRegClasses(CodeGenSubRegIndex *SubIdx) { return; SmallVector MidRCs; - MidRCs.insert(MidRCs.end(), It->second.begin(), It->second.end()); + llvm::append_range(MidRCs, It->second); for (CodeGenRegisterClass *MidRC : MidRCs) { for (auto &Pair : MidRC->SuperRegClasses) { @@ -1244,7 +1244,7 @@ CodeGenRegBank::CodeGenRegBank(const RecordKeeper &Records, for (const Record *R : Records.getAllDerivedDefinitions("RegisterTuples")) { // Expand tuples and merge the vectors std::vector TupRegs = *Sets.expand(R); - Regs.insert(Regs.end(), TupRegs.begin(), TupRegs.end()); + llvm::append_range(Regs, TupRegs); } llvm::sort(Regs, LessRecordRegister()); diff --git a/llvm/utils/TableGen/RegisterInfoEmitter.cpp 
b/llvm/utils/TableGen/RegisterInfoEmitter.cpp index 45c6db94023b7..98f0d7eaaff38 100644 --- a/llvm/utils/TableGen/RegisterInfoEmitter.cpp +++ b/llvm/utils/TableGen/RegisterInfoEmitter.cpp @@ -1487,7 +1487,7 @@ void RegisterInfoEmitter::runTargetDesc(raw_ostream &OS) { // each register. Fill with zero for values which are not explicitly given. for (const auto &Reg : Regs) { auto Costs = Reg.CostPerUse; - AllRegCostPerUse.insert(AllRegCostPerUse.end(), Costs.begin(), Costs.end()); + llvm::append_range(AllRegCostPerUse, Costs); if (NumRegCosts > Costs.size()) AllRegCostPerUse.insert(AllRegCostPerUse.end(), NumRegCosts - Costs.size(), 0); From 0f97cd87dee1ddc6aadc04369262739da2f55b5a Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 16 Apr 2025 12:29:04 -0700 Subject: [PATCH 166/710] [Frontend] Use StringRef::ends_with (NFC) (#135988) --- clang/lib/Frontend/InitPreprocessor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index 659af42478991..1f297f228fc1b 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -32,7 +32,7 @@ using namespace clang; static bool MacroBodyEndsInBackslash(StringRef MacroBody) { while (!MacroBody.empty() && isWhitespace(MacroBody.back())) MacroBody = MacroBody.drop_back(); - return !MacroBody.empty() && MacroBody.back() == '\\'; + return MacroBody.ends_with('\\'); } // Append a #define line to Buf for Macro. 
Macro should be of the form XXX, From fa46d522fe66b77bdf3156be9255ce3b83010433 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 16 Apr 2025 12:29:33 -0700 Subject: [PATCH 167/710] [llvm-xray] Use llvm::make_second_range (NFC) (#135989) --- llvm/tools/llvm-xray/xray-stacks.cpp | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/llvm/tools/llvm-xray/xray-stacks.cpp b/llvm/tools/llvm-xray/xray-stacks.cpp index cbf6faeb32960..b11d732a4fcc0 100644 --- a/llvm/tools/llvm-xray/xray-stacks.cpp +++ b/llvm/tools/llvm-xray/xray-stacks.cpp @@ -495,15 +495,8 @@ class StackTrie { void printIgnoringThreads(raw_ostream &OS, FuncIdConversionHelper &FN) { RootVector RootValues; - // Function to pull the values out of a map iterator. - using RootsType = decltype(Roots.begin())::value_type; - auto MapValueFn = [](const RootsType &Value) { return Value.second; }; - - for (const auto &RootNodeRange : - make_range(map_iterator(Roots.begin(), MapValueFn), - map_iterator(Roots.end(), MapValueFn))) { + for (const auto &RootNodeRange : make_second_range(Roots)) llvm::append_range(RootValues, RootNodeRange); - } print(OS, FN, RootValues); } From 4863d1ffbde77b8a3c580b4f7905ec21b1aba7e0 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 16 Apr 2025 12:29:58 -0700 Subject: [PATCH 168/710] [Serialization] Use llvm::map_range (NFC) (#135990) --- clang/lib/Serialization/MultiOnDiskHashTable.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/clang/lib/Serialization/MultiOnDiskHashTable.h b/clang/lib/Serialization/MultiOnDiskHashTable.h index fa18a29a9a98e..996e9b94287f4 100644 --- a/clang/lib/Serialization/MultiOnDiskHashTable.h +++ b/clang/lib/Serialization/MultiOnDiskHashTable.h @@ -103,11 +103,9 @@ template class MultiOnDiskHashTable { /// The current set of on-disk tables. 
table_range tables() { - auto Begin = Tables.begin(), End = Tables.end(); - if (getMergedTable()) - ++Begin; - return llvm::make_range(llvm::map_iterator(Begin, AsOnDiskTable()), - llvm::map_iterator(End, AsOnDiskTable())); + unsigned DropBegin = getMergedTable() ? 1 : 0; + return llvm::map_range(llvm::drop_begin(Tables, DropBegin), + AsOnDiskTable()); } MergedTable *getMergedTable() const { From d338bdc9fe72aef1d9d2043d569fd94eb57db111 Mon Sep 17 00:00:00 2001 From: Adam Siemieniuk Date: Wed, 16 Apr 2025 22:02:52 +0200 Subject: [PATCH 169/710] [mlir][linalg][NFC] Update elementwise docs to match op name (#135999) Updates linalg.elementwise op description to replace older abbreviated mnemonic with its current form. --- .../mlir/Dialect/Linalg/IR/LinalgStructuredOps.td | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td index 308e39a9a51e1..b9edcc92e81a9 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td @@ -565,18 +565,18 @@ def ElementwiseOp : LinalgStructuredBase_Op<"elementwise", [ Example: - Defining a unary linalg.elemwise with default indexing-map: + Defining a unary linalg.elementwise with default indexing-map: ```mlir - %exp = linalg.elemwise - kind=#linalg.elemwise_kind + %exp = linalg.elementwise + kind=#linalg.elementwise_kind ins(%x : tensor<4x16x8xf32>) outs(%y: tensor<4x16x8xf32>) -> tensor<4x16x8xf32> ``` - Defining a binary linalg.elemwise with user-defined indexing-map: + Defining a binary linalg.elementwise with user-defined indexing-map: ```mlir - %add = linalg.elemwise - kind=#linalg.elemwise_kind + %add = linalg.elementwise + kind=#linalg.elementwise_kind indexing_maps = [#transpose, #broadcast, #identity] ins(%exp, %arg1 : tensor<4x16x8xf32>, tensor<4x16xf32>) outs(%arg2: tensor<4x8x16xf32>) -> tensor<4x8x16xf32> 
From 598ec8ce2d1e5e20b45c56de8972f58a0caeb697 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 16 Apr 2025 13:07:58 -0700 Subject: [PATCH 170/710] [LLVM][TableGen] Parameterize NumToSkip in DecoderEmitter (#135882) - Add command line option `num-to-skip-size` to parameterize the size of `NumToSkip` bytes in the decoder table. Default value will be 2, and targets that need larger size can use 3. - Keep all existing targets, except AArch64, to use size 2, and change AArch64 to use size 3 since it run into the "disassembler decoding table too large" error with size 2. - Following is a rough reduction in size for the decoder tables by switching to size 2. ``` Target Old Size New Size % Reduction ================================================ AArch64 153254 153254 0.00 AMDGPU 471566 412805 12.46 ARC 5724 5061 11.58 ARM 84936 73831 13.07 AVR 1497 1306 12.76 BPF 2172 1927 11.28 CSKY 10064 8692 13.63 Hexagon 47967 41965 12.51 Lanai 1108 982 11.37 LoongArch 24446 21621 11.56 MSP430 4200 3716 11.52 Mips 36330 31415 13.53 PPC 31897 28098 11.91 RISCV 37979 32790 13.66 Sparc 8331 7252 12.95 SystemZ 36722 32248 12.18 VE 48296 42873 11.23 XCore 2590 2316 10.58 Xtensa 3827 3316 13.35 ``` --- llvm/lib/Target/AArch64/CMakeLists.txt | 2 +- llvm/test/TableGen/VarLenDecoder.td | 4 +- llvm/test/TableGen/trydecode-emission.td | 10 +- llvm/test/TableGen/trydecode-emission2.td | 16 +-- llvm/test/TableGen/trydecode-emission3.td | 2 +- llvm/test/TableGen/trydecode-emission4.td | 2 +- llvm/utils/TableGen/DecoderEmitter.cpp | 115 ++++++++++++---------- 7 files changed, 83 insertions(+), 68 deletions(-) diff --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt index 2300e479bc110..ba1d1605ec104 100644 --- a/llvm/lib/Target/AArch64/CMakeLists.txt +++ b/llvm/lib/Target/AArch64/CMakeLists.txt @@ -7,7 +7,7 @@ tablegen(LLVM AArch64GenAsmWriter.inc -gen-asm-writer) tablegen(LLVM AArch64GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1) tablegen(LLVM 
AArch64GenCallingConv.inc -gen-callingconv) tablegen(LLVM AArch64GenDAGISel.inc -gen-dag-isel) -tablegen(LLVM AArch64GenDisassemblerTables.inc -gen-disassembler) +tablegen(LLVM AArch64GenDisassemblerTables.inc -gen-disassembler --num-to-skip-size=3) tablegen(LLVM AArch64GenFastISel.inc -gen-fast-isel) tablegen(LLVM AArch64GenGlobalISel.inc -gen-global-isel) tablegen(LLVM AArch64GenO0PreLegalizeGICombiner.inc -gen-global-isel-combiner diff --git a/llvm/test/TableGen/VarLenDecoder.td b/llvm/test/TableGen/VarLenDecoder.td index 5cf0bf8911859..b77702ff7c5c1 100644 --- a/llvm/test/TableGen/VarLenDecoder.td +++ b/llvm/test/TableGen/VarLenDecoder.td @@ -47,9 +47,9 @@ def FOO32 : MyVarInst { } // CHECK: MCD::OPC_ExtractField, 3, 5, // Inst{7-3} ... -// CHECK-NEXT: MCD::OPC_FilterValue, 8, 4, 0, 0, // Skip to: 12 +// CHECK-NEXT: MCD::OPC_FilterValue, 8, 4, 0, // Skip to: 11 // CHECK-NEXT: MCD::OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 0, // Opcode: FOO16 -// CHECK-NEXT: MCD::OPC_FilterValue, 9, 4, 0, 0, // Skip to: 21 +// CHECK-NEXT: MCD::OPC_FilterValue, 9, 4, 0, // Skip to: 19 // CHECK-NEXT: MCD::OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 1, // Opcode: FOO32 // CHECK-NEXT: MCD::OPC_Fail, diff --git a/llvm/test/TableGen/trydecode-emission.td b/llvm/test/TableGen/trydecode-emission.td index 20d2446eeac7f..2b4239f4fbe65 100644 --- a/llvm/test/TableGen/trydecode-emission.td +++ b/llvm/test/TableGen/trydecode-emission.td @@ -34,10 +34,10 @@ def InstB : TestInstruction { } // CHECK: /* 0 */ MCD::OPC_ExtractField, 4, 4, // Inst{7-4} ... 
-// CHECK-NEXT: /* 3 */ MCD::OPC_FilterValue, 0, 18, 0, 0, // Skip to: 26 -// CHECK-NEXT: /* 8 */ MCD::OPC_CheckField, 2, 2, 0, 7, 0, 0, // Skip to: 22 -// CHECK-NEXT: /* 15 */ MCD::OPC_TryDecode, {{[0-9]+}}, {{[0-9]+}}, 0, 0, 0, 0, // Opcode: InstB, skip to: 22 -// CHECK-NEXT: /* 22 */ MCD::OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 1, // Opcode: InstA -// CHECK-NEXT: /* 26 */ MCD::OPC_Fail, +// CHECK-NEXT: /* 3 */ MCD::OPC_FilterValue, 0, 16, 0, // Skip to: 23 +// CHECK-NEXT: /* 7 */ MCD::OPC_CheckField, 2, 2, 0, 6, 0, // Skip to: 19 +// CHECK-NEXT: /* 13 */ MCD::OPC_TryDecode, {{[0-9]+}}, {{[0-9]+}}, 0, 0, 0, // Opcode: InstB, skip to: 19 +// CHECK-NEXT: /* 19 */ MCD::OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 1, // Opcode: InstA +// CHECK-NEXT: /* 23 */ MCD::OPC_Fail, // CHECK: if (!Check(S, DecodeInstB(MI, insn, Address, Decoder))) { DecodeComplete = false; return MCDisassembler::Fail; } diff --git a/llvm/test/TableGen/trydecode-emission2.td b/llvm/test/TableGen/trydecode-emission2.td index 0584034e41233..7d30474058f73 100644 --- a/llvm/test/TableGen/trydecode-emission2.td +++ b/llvm/test/TableGen/trydecode-emission2.td @@ -31,14 +31,14 @@ def InstB : TestInstruction { } // CHECK: /* 0 */ MCD::OPC_ExtractField, 2, 1, // Inst{2} ... -// CHECK-NEXT: /* 3 */ MCD::OPC_FilterValue, 0, 36, 0, 0, // Skip to: 44 -// CHECK-NEXT: /* 8 */ MCD::OPC_ExtractField, 5, 3, // Inst{7-5} ... 
-// CHECK-NEXT: /* 11 */ MCD::OPC_FilterValue, 0, 28, 0, 0, // Skip to: 44 -// CHECK-NEXT: /* 16 */ MCD::OPC_CheckField, 0, 2, 3, 7, 0, 0, // Skip to: 30 -// CHECK-NEXT: /* 23 */ MCD::OPC_TryDecode, {{[0-9]+}}, {{[0-9]+}}, 0, 0, 0, 0, // Opcode: InstB, skip to: 30 -// CHECK-NEXT: /* 30 */ MCD::OPC_CheckField, 3, 2, 0, 7, 0, 0, // Skip to: 44 -// CHECK-NEXT: /* 37 */ MCD::OPC_TryDecode, {{[0-9]+}}, {{[0-9]+}}, 1, 0, 0, 0, // Opcode: InstA, skip to: 44 -// CHECK-NEXT: /* 44 */ MCD::OPC_Fail, +// CHECK-NEXT: /* 3 */ MCD::OPC_FilterValue, 0, 31, 0, // Skip to: 38 +// CHECK-NEXT: /* 7 */ MCD::OPC_ExtractField, 5, 3, // Inst{7-5} ... +// CHECK-NEXT: /* 10 */ MCD::OPC_FilterValue, 0, 24, 0, // Skip to: 38 +// CHECK-NEXT: /* 14 */ MCD::OPC_CheckField, 0, 2, 3, 6, 0, // Skip to: 26 +// CHECK-NEXT: /* 20 */ MCD::OPC_TryDecode, {{[0-9]+}}, {{[0-9]+}}, 0, 0, 0, // Opcode: InstB, skip to: 26 +// CHECK-NEXT: /* 26 */ MCD::OPC_CheckField, 3, 2, 0, 6, 0, // Skip to: 38 +// CHECK-NEXT: /* 32 */ MCD::OPC_TryDecode, {{[0-9]+}}, {{[0-9]+}}, 1, 0, 0, // Opcode: InstA, skip to: 38 +// CHECK-NEXT: /* 38 */ MCD::OPC_Fail, // CHECK: if (!Check(S, DecodeInstB(MI, insn, Address, Decoder))) { DecodeComplete = false; return MCDisassembler::Fail; } // CHECK: if (!Check(S, DecodeInstA(MI, insn, Address, Decoder))) { DecodeComplete = false; return MCDisassembler::Fail; } diff --git a/llvm/test/TableGen/trydecode-emission3.td b/llvm/test/TableGen/trydecode-emission3.td index 4c5be7e1af229..0abbe62fe337e 100644 --- a/llvm/test/TableGen/trydecode-emission3.td +++ b/llvm/test/TableGen/trydecode-emission3.td @@ -1,4 +1,4 @@ -// RUN: llvm-tblgen -gen-disassembler -I %p/../../include %s | FileCheck %s + // RUN: llvm-tblgen -gen-disassembler --num-to-skip-size=3 -I %p/../../include %s | FileCheck %s include "llvm/Target/Target.td" diff --git a/llvm/test/TableGen/trydecode-emission4.td b/llvm/test/TableGen/trydecode-emission4.td index 1e51ba5e40768..413e4a0d1275a 100644 --- 
a/llvm/test/TableGen/trydecode-emission4.td +++ b/llvm/test/TableGen/trydecode-emission4.td @@ -1,4 +1,4 @@ -// RUN: llvm-tblgen -gen-disassembler -I %p/../../include %s | FileCheck %s +// RUN: llvm-tblgen -gen-disassembler --num-to-skip-size=3 -I %p/../../include %s | FileCheck %s // Test for OPC_ExtractField/OPC_CheckField with start bit > 255. // These large start values may arise for architectures with long instruction diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp index 9c6015cc24576..0735702abf339 100644 --- a/llvm/utils/TableGen/DecoderEmitter.cpp +++ b/llvm/utils/TableGen/DecoderEmitter.cpp @@ -32,8 +32,10 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/LEB128.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" @@ -76,6 +78,12 @@ static cl::opt DecoderEmitterSuppressDuplicates( "significantly reducing Table Duplications")), cl::init(SUPPRESSION_DISABLE), cl::cat(DisassemblerEmitterCat)); +static cl::opt + NumToSkipSizeInBytes("num-to-skip-size", + cl::desc("number of bytes to use for num-to-skip " + "entries in the decoder table (2 or 3)"), + cl::init(2), cl::cat(DisassemblerEmitterCat)); + STATISTIC(NumEncodings, "Number of encodings considered"); STATISTIC(NumEncodingsLackingDisasm, "Number of encodings without disassembler info"); @@ -130,10 +138,29 @@ struct DecoderTable : public std::vector { // in the table for patching. size_t insertNumToSkip() { size_t Size = size(); - insert(end(), 3, 0); + insert(end(), NumToSkipSizeInBytes, 0); return Size; } + + void patchNumToSkip(size_t FixupIdx, uint32_t DestIdx) { + // Calculate the distance from the byte following the fixup entry byte + // to the destination. 
The Target is calculated from after the + // `NumToSkipSizeInBytes`-byte NumToSkip entry itself, so subtract + // `NumToSkipSizeInBytes` from the displacement here to account for that. + assert(DestIdx > FixupIdx + NumToSkipSizeInBytes && + "Expecting a forward jump in the decoding table"); + uint32_t Delta = DestIdx - FixupIdx - NumToSkipSizeInBytes; + if (!isUIntN(8 * NumToSkipSizeInBytes, Delta)) + PrintFatalError( + "disassembler decoding table too large, try --num-to-skip-size=3"); + + (*this)[FixupIdx] = static_cast(Delta); + (*this)[FixupIdx + 1] = static_cast(Delta >> 8); + if (NumToSkipSizeInBytes == 3) + (*this)[FixupIdx + 2] = static_cast(Delta >> 16); + } }; + struct DecoderTableInfo { DecoderTable Table; FixupScopeList FixupStack; @@ -690,19 +717,8 @@ static void resolveTableFixups(DecoderTable &Table, const FixupList &Fixups, uint32_t DestIdx) { // Any NumToSkip fixups in the current scope can resolve to the // current location. - for (uint32_t FixupIdx : reverse(Fixups)) { - // Calculate the distance from the byte following the fixup entry byte - // to the destination. The Target is calculated from after the 24-bit - // NumToSkip entry itself, so subtract three from the displacement here - // to account for that. - uint32_t Delta = DestIdx - FixupIdx - 3; - // Our NumToSkip entries are 24-bits. Make sure our table isn't too - // big. - assert(isUInt<24>(Delta)); - Table[FixupIdx] = (uint8_t)Delta; - Table[FixupIdx + 1] = (uint8_t)(Delta >> 8); - Table[FixupIdx + 2] = (uint8_t)(Delta >> 16); - } + for (uint32_t FixupIdx : Fixups) + Table.patchNumToSkip(FixupIdx, DestIdx); } // Emit table entries to decode instructions given a segment or segments @@ -759,15 +775,9 @@ void Filter::emitTableEntry(DecoderTableInfo &TableInfo) const { Delegate->emitTableEntries(TableInfo); // Now that we've emitted the body of the handler, update the NumToSkip - // of the filter itself to be able to skip forward when false. 
Subtract - // three as to account for the width of the NumToSkip field itself. - if (PrevFilter) { - uint32_t NumToSkip = Table.size() - PrevFilter - 3; - assert(isUInt<24>(NumToSkip) && "disassembler decoding table too large!"); - Table[PrevFilter] = (uint8_t)NumToSkip; - Table[PrevFilter + 1] = (uint8_t)(NumToSkip >> 8); - Table[PrevFilter + 2] = (uint8_t)(NumToSkip >> 16); - } + // of the filter itself to be able to skip forward when false. + if (PrevFilter) + Table.patchNumToSkip(PrevFilter, Table.size()); } // If there is no fallthrough, then the final filter should get fixed @@ -814,7 +824,8 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table, OS << (unsigned)*I++ << ", "; }; - // Emit 24-bit numtoskip value to OS, returning the NumToSkip value. + // Emit `NumToSkipSizeInBytes`-byte numtoskip value to OS, returning the + // NumToSkip value. auto emitNumToSkip = [](DecoderTable::const_iterator &I, formatted_raw_ostream &OS) { uint8_t Byte = *I++; @@ -823,9 +834,11 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table, Byte = *I++; OS << (unsigned)Byte << ", "; NumToSkip |= Byte << 8; - Byte = *I++; - OS << (unsigned)(Byte) << ", "; - NumToSkip |= Byte << 16; + if (NumToSkipSizeInBytes == 3) { + Byte = *I++; + OS << (unsigned)(Byte) << ", "; + NumToSkip |= Byte << 16; + } return NumToSkip; }; @@ -867,7 +880,7 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table, // The filter value is ULEB128 encoded. emitULEB128(I, OS); - // 24-bit numtoskip value. + // numtoskip value. uint32_t NumToSkip = emitNumToSkip(I, OS); OS << "// Skip to: " << ((I - Table.begin()) + NumToSkip) << "\n"; break; @@ -883,7 +896,7 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table, // ULEB128 encoded field value. emitULEB128(I, OS); - // 24-bit numtoskip value. + // numtoskip value. 
uint32_t NumToSkip = emitNumToSkip(I, OS); OS << "// Skip to: " << ((I - Table.begin()) + NumToSkip) << "\n"; break; @@ -893,7 +906,7 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table, OS << Indent << "MCD::OPC_CheckPredicate, "; emitULEB128(I, OS); - // 24-bit numtoskip value. + // numtoskip value. uint32_t NumToSkip = emitNumToSkip(I, OS); OS << "// Skip to: " << ((I - Table.begin()) + NumToSkip) << "\n"; break; @@ -925,7 +938,7 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table, // Fallthrough for OPC_TryDecode. - // 24-bit numtoskip value. + // numtoskip value. uint32_t NumToSkip = emitNumToSkip(I, OS); OS << "// Opcode: " << NumberedEncodings[EncodingID] @@ -1411,9 +1424,9 @@ void FilterChooser::emitSingletonTableEntry(DecoderTableInfo &TableInfo, TableInfo.Table.push_back(NumBits); TableInfo.Table.insertULEB128(Ilnd.FieldVal); - // The fixup is always 24-bits, so go ahead and allocate the space - // in the table so all our relative position calculations work OK even - // before we fully resolve the real value here. + // Allocate space in the table for fixup (NumToSkipSizeInBytes) so all + // our relative position calculations work OK even before we fully + // resolve the real value here. // Push location for NumToSkip backpatching. TableInfo.FixupStack.back().push_back(TableInfo.Table.insertNumToSkip()); @@ -2157,7 +2170,18 @@ insertBits(InsnType &field, uint64_t bits, unsigned startBit, unsigned numBits) // decodeInstruction(). 
static void emitDecodeInstruction(formatted_raw_ostream &OS, bool IsVarLenInst) { + OS << formatv("\nconstexpr unsigned NumToSkipSizeInBytes = {};\n", + NumToSkipSizeInBytes); + OS << R"( +inline unsigned decodeNumToSkip(const uint8_t *&Ptr) { + unsigned NumToSkip = *Ptr++; + NumToSkip |= (*Ptr++) << 8; + if constexpr (NumToSkipSizeInBytes == 3) + NumToSkip |= (*Ptr++) << 16; + return NumToSkip; +} + template static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI, InsnType insn, uint64_t Address, @@ -2195,10 +2219,7 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI, // Decode the field value. uint64_t Val = decodeULEB128AndIncUnsafe(++Ptr); bool Failed = Val != CurFieldValue; - // NumToSkip is a plain 24-bit integer. - unsigned NumToSkip = *Ptr++; - NumToSkip |= (*Ptr++) << 8; - NumToSkip |= (*Ptr++) << 16; + unsigned NumToSkip = decodeNumToSkip(Ptr); // Perform the filter operation. if (Failed) @@ -2222,10 +2243,7 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI, uint64_t ExpectedValue = decodeULEB128(++Ptr, &PtrLen); Ptr += PtrLen; bool Failed = ExpectedValue != FieldValue; - // NumToSkip is a plain 24-bit integer. - unsigned NumToSkip = *Ptr++; - NumToSkip |= (*Ptr++) << 8; - NumToSkip |= (*Ptr++) << 16; + unsigned NumToSkip = decodeNumToSkip(Ptr); // If the actual and expected values don't match, skip. if (Failed) @@ -2240,10 +2258,7 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI, case MCD::OPC_CheckPredicate: { // Decode the Predicate Index value. unsigned PIdx = decodeULEB128AndIncUnsafe(++Ptr); - // NumToSkip is a plain 24-bit integer. - unsigned NumToSkip = *Ptr++; - NumToSkip |= (*Ptr++) << 8; - NumToSkip |= (*Ptr++) << 16; + unsigned NumToSkip = decodeNumToSkip(Ptr); // Check the predicate. 
bool Failed = !checkDecoderPredicate(PIdx, Bits); if (Failed) @@ -2278,10 +2293,7 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI, // Decode the Opcode value. unsigned Opc = decodeULEB128AndIncUnsafe(++Ptr); unsigned DecodeIdx = decodeULEB128AndIncUnsafe(Ptr); - // NumToSkip is a plain 24-bit integer. - unsigned NumToSkip = *Ptr++; - NumToSkip |= (*Ptr++) << 8; - NumToSkip |= (*Ptr++) << 16; + unsigned NumToSkip = decodeNumToSkip(Ptr); // Perform the decode operation. MCInst TmpMI; @@ -2406,6 +2418,9 @@ handleHwModesUnrelatedEncodings(const CodeGenInstruction *Instr, // Emits disassembler code for instruction decoding. void DecoderEmitter::run(raw_ostream &o) { + if (NumToSkipSizeInBytes != 2 && NumToSkipSizeInBytes != 3) + PrintFatalError("Invalid value for num-to-skip-size, must be 2 or 3") + formatted_raw_ostream OS(o); OS << R"( #include "llvm/MC/MCInst.h" From 80855eb6f11b06c194939eb305761eb2b62822f9 Mon Sep 17 00:00:00 2001 From: Lei Wang Date: Wed, 16 Apr 2025 13:11:55 -0700 Subject: [PATCH 171/710] [SampleFDO] Extend the function base name max size (#135863) The function base name could be very long, which overflows and leads to a crash. Update to extend the max size. Also changed to use heap allocation (`std::vector`) to avoid stack overflow.
--- llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp b/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp index d6d1b7c51d4c0..963c321772d6e 100644 --- a/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp @@ -737,11 +737,11 @@ bool SampleProfileMatcher::functionMatchesProfileHelper( auto FunctionName = FName.str(); if (Demangler.partialDemangle(FunctionName.c_str())) return std::string(); - constexpr size_t MaxBaseNameSize = 4096; - char BaseNameBuf[MaxBaseNameSize] = {}; + constexpr size_t MaxBaseNameSize = 65536; + std::vector BaseNameBuf(MaxBaseNameSize, 0); size_t BaseNameSize = MaxBaseNameSize; char *BaseNamePtr = - Demangler.getFunctionBaseName(BaseNameBuf, &BaseNameSize); + Demangler.getFunctionBaseName(BaseNameBuf.data(), &BaseNameSize); return (BaseNamePtr && BaseNameSize) ? std::string(BaseNamePtr, BaseNameSize) : std::string(); From 05aa98955c697e52209d775327013e9cc2be6321 Mon Sep 17 00:00:00 2001 From: calebwat <107081575+calebwat@users.noreply.github.com> Date: Wed, 16 Apr 2025 13:12:25 -0700 Subject: [PATCH 172/710] [NFCI] Explicitly delete unused copy constructor and assign copy for VPInterleavedAccessInfo (#134755) VPInterleavedAccessInfo has a defined destructor freeing memory, but no explicitly defined copy constructor or copy assignment op. These are not used, so this patch marks them as deleted to avoid usage of the implicitly defined implementations. 
--- llvm/lib/Transforms/Vectorize/VPlanSLP.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/lib/Transforms/Vectorize/VPlanSLP.h b/llvm/lib/Transforms/Vectorize/VPlanSLP.h index 93f04e6e30a6f..7f123689170ad 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanSLP.h +++ b/llvm/lib/Transforms/Vectorize/VPlanSLP.h @@ -48,6 +48,8 @@ class VPInterleavedAccessInfo { public: VPInterleavedAccessInfo(VPlan &Plan, InterleavedAccessInfo &IAI); + VPInterleavedAccessInfo(const VPInterleavedAccessInfo &) = delete; + VPInterleavedAccessInfo &operator=(const VPInterleavedAccessInfo &) = delete; ~VPInterleavedAccessInfo() { // Avoid releasing a pointer twice. From 7fd0c8acd4659ccd0aef5486afe32c8ddf0f2957 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 16 Apr 2025 13:16:32 -0700 Subject: [PATCH 173/710] Revert "[LLVM][TableGen] Parameterize NumToSkip in DecoderEmitter" (#136017) Reverts llvm/llvm-project#135882 Causing assert failures for AArch64 backend --- llvm/lib/Target/AArch64/CMakeLists.txt | 2 +- llvm/test/TableGen/VarLenDecoder.td | 4 +- llvm/test/TableGen/trydecode-emission.td | 10 +- llvm/test/TableGen/trydecode-emission2.td | 16 +-- llvm/test/TableGen/trydecode-emission3.td | 2 +- llvm/test/TableGen/trydecode-emission4.td | 2 +- llvm/utils/TableGen/DecoderEmitter.cpp | 115 ++++++++++------------ 7 files changed, 68 insertions(+), 83 deletions(-) diff --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt index ba1d1605ec104..2300e479bc110 100644 --- a/llvm/lib/Target/AArch64/CMakeLists.txt +++ b/llvm/lib/Target/AArch64/CMakeLists.txt @@ -7,7 +7,7 @@ tablegen(LLVM AArch64GenAsmWriter.inc -gen-asm-writer) tablegen(LLVM AArch64GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1) tablegen(LLVM AArch64GenCallingConv.inc -gen-callingconv) tablegen(LLVM AArch64GenDAGISel.inc -gen-dag-isel) -tablegen(LLVM AArch64GenDisassemblerTables.inc -gen-disassembler --num-to-skip-size=3) +tablegen(LLVM AArch64GenDisassemblerTables.inc 
-gen-disassembler) tablegen(LLVM AArch64GenFastISel.inc -gen-fast-isel) tablegen(LLVM AArch64GenGlobalISel.inc -gen-global-isel) tablegen(LLVM AArch64GenO0PreLegalizeGICombiner.inc -gen-global-isel-combiner diff --git a/llvm/test/TableGen/VarLenDecoder.td b/llvm/test/TableGen/VarLenDecoder.td index b77702ff7c5c1..5cf0bf8911859 100644 --- a/llvm/test/TableGen/VarLenDecoder.td +++ b/llvm/test/TableGen/VarLenDecoder.td @@ -47,9 +47,9 @@ def FOO32 : MyVarInst { } // CHECK: MCD::OPC_ExtractField, 3, 5, // Inst{7-3} ... -// CHECK-NEXT: MCD::OPC_FilterValue, 8, 4, 0, // Skip to: 11 +// CHECK-NEXT: MCD::OPC_FilterValue, 8, 4, 0, 0, // Skip to: 12 // CHECK-NEXT: MCD::OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 0, // Opcode: FOO16 -// CHECK-NEXT: MCD::OPC_FilterValue, 9, 4, 0, // Skip to: 19 +// CHECK-NEXT: MCD::OPC_FilterValue, 9, 4, 0, 0, // Skip to: 21 // CHECK-NEXT: MCD::OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 1, // Opcode: FOO32 // CHECK-NEXT: MCD::OPC_Fail, diff --git a/llvm/test/TableGen/trydecode-emission.td b/llvm/test/TableGen/trydecode-emission.td index 2b4239f4fbe65..20d2446eeac7f 100644 --- a/llvm/test/TableGen/trydecode-emission.td +++ b/llvm/test/TableGen/trydecode-emission.td @@ -34,10 +34,10 @@ def InstB : TestInstruction { } // CHECK: /* 0 */ MCD::OPC_ExtractField, 4, 4, // Inst{7-4} ... 
-// CHECK-NEXT: /* 3 */ MCD::OPC_FilterValue, 0, 16, 0, // Skip to: 23 -// CHECK-NEXT: /* 7 */ MCD::OPC_CheckField, 2, 2, 0, 6, 0, // Skip to: 19 -// CHECK-NEXT: /* 13 */ MCD::OPC_TryDecode, {{[0-9]+}}, {{[0-9]+}}, 0, 0, 0, // Opcode: InstB, skip to: 19 -// CHECK-NEXT: /* 19 */ MCD::OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 1, // Opcode: InstA -// CHECK-NEXT: /* 23 */ MCD::OPC_Fail, +// CHECK-NEXT: /* 3 */ MCD::OPC_FilterValue, 0, 18, 0, 0, // Skip to: 26 +// CHECK-NEXT: /* 8 */ MCD::OPC_CheckField, 2, 2, 0, 7, 0, 0, // Skip to: 22 +// CHECK-NEXT: /* 15 */ MCD::OPC_TryDecode, {{[0-9]+}}, {{[0-9]+}}, 0, 0, 0, 0, // Opcode: InstB, skip to: 22 +// CHECK-NEXT: /* 22 */ MCD::OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 1, // Opcode: InstA +// CHECK-NEXT: /* 26 */ MCD::OPC_Fail, // CHECK: if (!Check(S, DecodeInstB(MI, insn, Address, Decoder))) { DecodeComplete = false; return MCDisassembler::Fail; } diff --git a/llvm/test/TableGen/trydecode-emission2.td b/llvm/test/TableGen/trydecode-emission2.td index 7d30474058f73..0584034e41233 100644 --- a/llvm/test/TableGen/trydecode-emission2.td +++ b/llvm/test/TableGen/trydecode-emission2.td @@ -31,14 +31,14 @@ def InstB : TestInstruction { } // CHECK: /* 0 */ MCD::OPC_ExtractField, 2, 1, // Inst{2} ... -// CHECK-NEXT: /* 3 */ MCD::OPC_FilterValue, 0, 31, 0, // Skip to: 38 -// CHECK-NEXT: /* 7 */ MCD::OPC_ExtractField, 5, 3, // Inst{7-5} ... 
-// CHECK-NEXT: /* 10 */ MCD::OPC_FilterValue, 0, 24, 0, // Skip to: 38 -// CHECK-NEXT: /* 14 */ MCD::OPC_CheckField, 0, 2, 3, 6, 0, // Skip to: 26 -// CHECK-NEXT: /* 20 */ MCD::OPC_TryDecode, {{[0-9]+}}, {{[0-9]+}}, 0, 0, 0, // Opcode: InstB, skip to: 26 -// CHECK-NEXT: /* 26 */ MCD::OPC_CheckField, 3, 2, 0, 6, 0, // Skip to: 38 -// CHECK-NEXT: /* 32 */ MCD::OPC_TryDecode, {{[0-9]+}}, {{[0-9]+}}, 1, 0, 0, // Opcode: InstA, skip to: 38 -// CHECK-NEXT: /* 38 */ MCD::OPC_Fail, +// CHECK-NEXT: /* 3 */ MCD::OPC_FilterValue, 0, 36, 0, 0, // Skip to: 44 +// CHECK-NEXT: /* 8 */ MCD::OPC_ExtractField, 5, 3, // Inst{7-5} ... +// CHECK-NEXT: /* 11 */ MCD::OPC_FilterValue, 0, 28, 0, 0, // Skip to: 44 +// CHECK-NEXT: /* 16 */ MCD::OPC_CheckField, 0, 2, 3, 7, 0, 0, // Skip to: 30 +// CHECK-NEXT: /* 23 */ MCD::OPC_TryDecode, {{[0-9]+}}, {{[0-9]+}}, 0, 0, 0, 0, // Opcode: InstB, skip to: 30 +// CHECK-NEXT: /* 30 */ MCD::OPC_CheckField, 3, 2, 0, 7, 0, 0, // Skip to: 44 +// CHECK-NEXT: /* 37 */ MCD::OPC_TryDecode, {{[0-9]+}}, {{[0-9]+}}, 1, 0, 0, 0, // Opcode: InstA, skip to: 44 +// CHECK-NEXT: /* 44 */ MCD::OPC_Fail, // CHECK: if (!Check(S, DecodeInstB(MI, insn, Address, Decoder))) { DecodeComplete = false; return MCDisassembler::Fail; } // CHECK: if (!Check(S, DecodeInstA(MI, insn, Address, Decoder))) { DecodeComplete = false; return MCDisassembler::Fail; } diff --git a/llvm/test/TableGen/trydecode-emission3.td b/llvm/test/TableGen/trydecode-emission3.td index 0abbe62fe337e..4c5be7e1af229 100644 --- a/llvm/test/TableGen/trydecode-emission3.td +++ b/llvm/test/TableGen/trydecode-emission3.td @@ -1,4 +1,4 @@ - // RUN: llvm-tblgen -gen-disassembler --num-to-skip-size=3 -I %p/../../include %s | FileCheck %s +// RUN: llvm-tblgen -gen-disassembler -I %p/../../include %s | FileCheck %s include "llvm/Target/Target.td" diff --git a/llvm/test/TableGen/trydecode-emission4.td b/llvm/test/TableGen/trydecode-emission4.td index 413e4a0d1275a..1e51ba5e40768 100644 --- 
a/llvm/test/TableGen/trydecode-emission4.td +++ b/llvm/test/TableGen/trydecode-emission4.td @@ -1,4 +1,4 @@ -// RUN: llvm-tblgen -gen-disassembler --num-to-skip-size=3 -I %p/../../include %s | FileCheck %s +// RUN: llvm-tblgen -gen-disassembler -I %p/../../include %s | FileCheck %s // Test for OPC_ExtractField/OPC_CheckField with start bit > 255. // These large start values may arise for architectures with long instruction diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp index 0735702abf339..9c6015cc24576 100644 --- a/llvm/utils/TableGen/DecoderEmitter.cpp +++ b/llvm/utils/TableGen/DecoderEmitter.cpp @@ -32,10 +32,8 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/LEB128.h" -#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" @@ -78,12 +76,6 @@ static cl::opt DecoderEmitterSuppressDuplicates( "significantly reducing Table Duplications")), cl::init(SUPPRESSION_DISABLE), cl::cat(DisassemblerEmitterCat)); -static cl::opt - NumToSkipSizeInBytes("num-to-skip-size", - cl::desc("number of bytes to use for num-to-skip " - "entries in the decoder table (2 or 3)"), - cl::init(2), cl::cat(DisassemblerEmitterCat)); - STATISTIC(NumEncodings, "Number of encodings considered"); STATISTIC(NumEncodingsLackingDisasm, "Number of encodings without disassembler info"); @@ -138,29 +130,10 @@ struct DecoderTable : public std::vector { // in the table for patching. size_t insertNumToSkip() { size_t Size = size(); - insert(end(), NumToSkipSizeInBytes, 0); + insert(end(), 3, 0); return Size; } - - void patchNumToSkip(size_t FixupIdx, uint32_t DestIdx) { - // Calculate the distance from the byte following the fixup entry byte - // to the destination. 
The Target is calculated from after the - // `NumToSkipSizeInBytes`-byte NumToSkip entry itself, so subtract - // `NumToSkipSizeInBytes` from the displacement here to account for that. - assert(DestIdx > FixupIdx + NumToSkipSizeInBytes && - "Expecting a forward jump in the decoding table"); - uint32_t Delta = DestIdx - FixupIdx - NumToSkipSizeInBytes; - if (!isUIntN(8 * NumToSkipSizeInBytes, Delta)) - PrintFatalError( - "disassembler decoding table too large, try --num-to-skip-size=3"); - - (*this)[FixupIdx] = static_cast(Delta); - (*this)[FixupIdx + 1] = static_cast(Delta >> 8); - if (NumToSkipSizeInBytes == 3) - (*this)[FixupIdx + 2] = static_cast(Delta >> 16); - } }; - struct DecoderTableInfo { DecoderTable Table; FixupScopeList FixupStack; @@ -717,8 +690,19 @@ static void resolveTableFixups(DecoderTable &Table, const FixupList &Fixups, uint32_t DestIdx) { // Any NumToSkip fixups in the current scope can resolve to the // current location. - for (uint32_t FixupIdx : Fixups) - Table.patchNumToSkip(FixupIdx, DestIdx); + for (uint32_t FixupIdx : reverse(Fixups)) { + // Calculate the distance from the byte following the fixup entry byte + // to the destination. The Target is calculated from after the 24-bit + // NumToSkip entry itself, so subtract three from the displacement here + // to account for that. + uint32_t Delta = DestIdx - FixupIdx - 3; + // Our NumToSkip entries are 24-bits. Make sure our table isn't too + // big. + assert(isUInt<24>(Delta)); + Table[FixupIdx] = (uint8_t)Delta; + Table[FixupIdx + 1] = (uint8_t)(Delta >> 8); + Table[FixupIdx + 2] = (uint8_t)(Delta >> 16); + } } // Emit table entries to decode instructions given a segment or segments @@ -775,9 +759,15 @@ void Filter::emitTableEntry(DecoderTableInfo &TableInfo) const { Delegate->emitTableEntries(TableInfo); // Now that we've emitted the body of the handler, update the NumToSkip - // of the filter itself to be able to skip forward when false. 
- if (PrevFilter) - Table.patchNumToSkip(PrevFilter, Table.size()); + // of the filter itself to be able to skip forward when false. Subtract + // three as to account for the width of the NumToSkip field itself. + if (PrevFilter) { + uint32_t NumToSkip = Table.size() - PrevFilter - 3; + assert(isUInt<24>(NumToSkip) && "disassembler decoding table too large!"); + Table[PrevFilter] = (uint8_t)NumToSkip; + Table[PrevFilter + 1] = (uint8_t)(NumToSkip >> 8); + Table[PrevFilter + 2] = (uint8_t)(NumToSkip >> 16); + } } // If there is no fallthrough, then the final filter should get fixed @@ -824,8 +814,7 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table, OS << (unsigned)*I++ << ", "; }; - // Emit `NumToSkipSizeInBytes`-byte numtoskip value to OS, returning the - // NumToSkip value. + // Emit 24-bit numtoskip value to OS, returning the NumToSkip value. auto emitNumToSkip = [](DecoderTable::const_iterator &I, formatted_raw_ostream &OS) { uint8_t Byte = *I++; @@ -834,11 +823,9 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table, Byte = *I++; OS << (unsigned)Byte << ", "; NumToSkip |= Byte << 8; - if (NumToSkipSizeInBytes == 3) { - Byte = *I++; - OS << (unsigned)(Byte) << ", "; - NumToSkip |= Byte << 16; - } + Byte = *I++; + OS << (unsigned)(Byte) << ", "; + NumToSkip |= Byte << 16; return NumToSkip; }; @@ -880,7 +867,7 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table, // The filter value is ULEB128 encoded. emitULEB128(I, OS); - // numtoskip value. + // 24-bit numtoskip value. uint32_t NumToSkip = emitNumToSkip(I, OS); OS << "// Skip to: " << ((I - Table.begin()) + NumToSkip) << "\n"; break; @@ -896,7 +883,7 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table, // ULEB128 encoded field value. emitULEB128(I, OS); - // numtoskip value. + // 24-bit numtoskip value. 
uint32_t NumToSkip = emitNumToSkip(I, OS); OS << "// Skip to: " << ((I - Table.begin()) + NumToSkip) << "\n"; break; @@ -906,7 +893,7 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table, OS << Indent << "MCD::OPC_CheckPredicate, "; emitULEB128(I, OS); - // numtoskip value. + // 24-bit numtoskip value. uint32_t NumToSkip = emitNumToSkip(I, OS); OS << "// Skip to: " << ((I - Table.begin()) + NumToSkip) << "\n"; break; @@ -938,7 +925,7 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table, // Fallthrough for OPC_TryDecode. - // numtoskip value. + // 24-bit numtoskip value. uint32_t NumToSkip = emitNumToSkip(I, OS); OS << "// Opcode: " << NumberedEncodings[EncodingID] @@ -1424,9 +1411,9 @@ void FilterChooser::emitSingletonTableEntry(DecoderTableInfo &TableInfo, TableInfo.Table.push_back(NumBits); TableInfo.Table.insertULEB128(Ilnd.FieldVal); - // Allocate space in the table for fixup (NumToSkipSizeInBytes) so all - // our relative position calculations work OK even before we fully - // resolve the real value here. + // The fixup is always 24-bits, so go ahead and allocate the space + // in the table so all our relative position calculations work OK even + // before we fully resolve the real value here. // Push location for NumToSkip backpatching. TableInfo.FixupStack.back().push_back(TableInfo.Table.insertNumToSkip()); @@ -2170,18 +2157,7 @@ insertBits(InsnType &field, uint64_t bits, unsigned startBit, unsigned numBits) // decodeInstruction(). 
static void emitDecodeInstruction(formatted_raw_ostream &OS, bool IsVarLenInst) { - OS << formatv("\nconstexpr unsigned NumToSkipSizeInBytes = {};\n", - NumToSkipSizeInBytes); - OS << R"( -inline unsigned decodeNumToSkip(const uint8_t *&Ptr) { - unsigned NumToSkip = *Ptr++; - NumToSkip |= (*Ptr++) << 8; - if constexpr (NumToSkipSizeInBytes == 3) - NumToSkip |= (*Ptr++) << 16; - return NumToSkip; -} - template static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI, InsnType insn, uint64_t Address, @@ -2219,7 +2195,10 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI, // Decode the field value. uint64_t Val = decodeULEB128AndIncUnsafe(++Ptr); bool Failed = Val != CurFieldValue; - unsigned NumToSkip = decodeNumToSkip(Ptr); + // NumToSkip is a plain 24-bit integer. + unsigned NumToSkip = *Ptr++; + NumToSkip |= (*Ptr++) << 8; + NumToSkip |= (*Ptr++) << 16; // Perform the filter operation. if (Failed) @@ -2243,7 +2222,10 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI, uint64_t ExpectedValue = decodeULEB128(++Ptr, &PtrLen); Ptr += PtrLen; bool Failed = ExpectedValue != FieldValue; - unsigned NumToSkip = decodeNumToSkip(Ptr); + // NumToSkip is a plain 24-bit integer. + unsigned NumToSkip = *Ptr++; + NumToSkip |= (*Ptr++) << 8; + NumToSkip |= (*Ptr++) << 16; // If the actual and expected values don't match, skip. if (Failed) @@ -2258,7 +2240,10 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI, case MCD::OPC_CheckPredicate: { // Decode the Predicate Index value. unsigned PIdx = decodeULEB128AndIncUnsafe(++Ptr); - unsigned NumToSkip = decodeNumToSkip(Ptr); + // NumToSkip is a plain 24-bit integer. + unsigned NumToSkip = *Ptr++; + NumToSkip |= (*Ptr++) << 8; + NumToSkip |= (*Ptr++) << 16; // Check the predicate. 
bool Failed = !checkDecoderPredicate(PIdx, Bits); if (Failed) @@ -2293,7 +2278,10 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI, // Decode the Opcode value. unsigned Opc = decodeULEB128AndIncUnsafe(++Ptr); unsigned DecodeIdx = decodeULEB128AndIncUnsafe(Ptr); - unsigned NumToSkip = decodeNumToSkip(Ptr); + // NumToSkip is a plain 24-bit integer. + unsigned NumToSkip = *Ptr++; + NumToSkip |= (*Ptr++) << 8; + NumToSkip |= (*Ptr++) << 16; // Perform the decode operation. MCInst TmpMI; @@ -2418,9 +2406,6 @@ handleHwModesUnrelatedEncodings(const CodeGenInstruction *Instr, // Emits disassembler code for instruction decoding. void DecoderEmitter::run(raw_ostream &o) { - if (NumToSkipSizeInBytes != 2 && NumToSkipSizeInBytes != 3) - PrintFatalError("Invalid value for num-to-skip-size, must be 2 or 3"); - formatted_raw_ostream OS(o); OS << R"( #include "llvm/MC/MCInst.h" From c7fae59ac540ced666f664c88c2a49e06352a8dc Mon Sep 17 00:00:00 2001 From: James Newling Date: Wed, 16 Apr 2025 13:20:05 -0700 Subject: [PATCH 174/710] [mlir][vector] Move extract_strided_slice canonicalization to folding (#135676) Folders are preferred: https://mlir.llvm.org/docs/Canonicalization/#when-to-use-the-fold-method-vs-rewriterpatterns-for-canonicalizations Included here : some missing `-----` between lit test file with mlir-opt with `-split-input-file` flag --- mlir/lib/Dialect/Vector/IR/VectorOps.cpp | 165 ++++++++------------- mlir/test/Dialect/Vector/canonicalize.mlir | 7 + 2 files changed, 69 insertions(+), 103 deletions(-) diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp index 4b2fba03ce551..cea31276d10a8 100644 --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -3714,12 +3714,67 @@ foldExtractStridedOpFromInsertChain(ExtractStridedSliceOp op) { return failure(); } +// ExtractStridedSliceOp(non-splat ConstantOp) -> ConstantOp. 
+static OpFoldResult +foldExtractStridedSliceNonSplatConstant(ExtractStridedSliceOp op, + Attribute foldInput) { + + auto dense = llvm::dyn_cast_if_present(foldInput); + if (!dense) + return {}; + + // TODO: Handle non-unit strides when they become available. + if (op.hasNonUnitStrides()) + return {}; + + VectorType sourceVecTy = op.getSourceVectorType(); + ArrayRef sourceShape = sourceVecTy.getShape(); + SmallVector sourceStrides = computeStrides(sourceShape); + + VectorType sliceVecTy = op.getType(); + ArrayRef sliceShape = sliceVecTy.getShape(); + int64_t rank = sliceVecTy.getRank(); + + // Expand offsets and sizes to match the vector rank. + SmallVector offsets(rank, 0); + copy(getI64SubArray(op.getOffsets()), offsets.begin()); + + SmallVector sizes(sourceShape); + copy(getI64SubArray(op.getSizes()), sizes.begin()); + + // Calculate the slice elements by enumerating all slice positions and + // linearizing them. The enumeration order is lexicographic which yields a + // sequence of monotonically increasing linearized position indices. 
+ const auto denseValuesBegin = dense.value_begin(); + SmallVector sliceValues; + sliceValues.reserve(sliceVecTy.getNumElements()); + SmallVector currSlicePosition(offsets.begin(), offsets.end()); + do { + int64_t linearizedPosition = linearize(currSlicePosition, sourceStrides); + assert(linearizedPosition < sourceVecTy.getNumElements() && + "Invalid index"); + sliceValues.push_back(*(denseValuesBegin + linearizedPosition)); + } while (succeeded(incSlicePosition(currSlicePosition, sliceShape, offsets))); + + assert(static_cast(sliceValues.size()) == + sliceVecTy.getNumElements() && + "Invalid number of slice elements"); + return DenseElementsAttr::get(sliceVecTy, sliceValues); +} + OpFoldResult ExtractStridedSliceOp::fold(FoldAdaptor adaptor) { if (getSourceVectorType() == getResult().getType()) return getVector(); if (succeeded(foldExtractStridedOpFromInsertChain(*this))) return getResult(); - return {}; + + // ExtractStridedSliceOp(splat ConstantOp) -> ConstantOp. + if (auto splat = + llvm::dyn_cast_if_present(adaptor.getVector())) + DenseElementsAttr::get(getType(), splat.getSplatValue()); + + // ExtractStridedSliceOp(non-splat ConstantOp) -> ConstantOp. + return foldExtractStridedSliceNonSplatConstant(*this, adaptor.getVector()); } void ExtractStridedSliceOp::getOffsets(SmallVectorImpl &results) { @@ -3783,98 +3838,6 @@ class StridedSliceConstantMaskFolder final } }; -// Pattern to rewrite a ExtractStridedSliceOp(splat ConstantOp) -> ConstantOp. -class StridedSliceSplatConstantFolder final - : public OpRewritePattern { -public: - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(ExtractStridedSliceOp extractStridedSliceOp, - PatternRewriter &rewriter) const override { - // Return if 'ExtractStridedSliceOp' operand is not defined by a splat - // ConstantOp. 
- Value sourceVector = extractStridedSliceOp.getVector(); - Attribute vectorCst; - if (!matchPattern(sourceVector, m_Constant(&vectorCst))) - return failure(); - - auto splat = llvm::dyn_cast(vectorCst); - if (!splat) - return failure(); - - auto newAttr = SplatElementsAttr::get(extractStridedSliceOp.getType(), - splat.getSplatValue()); - rewriter.replaceOpWithNewOp(extractStridedSliceOp, - newAttr); - return success(); - } -}; - -// Pattern to rewrite a ExtractStridedSliceOp(non-splat ConstantOp) -> -// ConstantOp. -class StridedSliceNonSplatConstantFolder final - : public OpRewritePattern { -public: - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(ExtractStridedSliceOp extractStridedSliceOp, - PatternRewriter &rewriter) const override { - // Return if 'ExtractStridedSliceOp' operand is not defined by a non-splat - // ConstantOp. - Value sourceVector = extractStridedSliceOp.getVector(); - Attribute vectorCst; - if (!matchPattern(sourceVector, m_Constant(&vectorCst))) - return failure(); - - // The splat case is handled by `StridedSliceSplatConstantFolder`. - auto dense = llvm::dyn_cast(vectorCst); - if (!dense || dense.isSplat()) - return failure(); - - // TODO: Handle non-unit strides when they become available. - if (extractStridedSliceOp.hasNonUnitStrides()) - return failure(); - - auto sourceVecTy = llvm::cast(sourceVector.getType()); - ArrayRef sourceShape = sourceVecTy.getShape(); - SmallVector sourceStrides = computeStrides(sourceShape); - - VectorType sliceVecTy = extractStridedSliceOp.getType(); - ArrayRef sliceShape = sliceVecTy.getShape(); - int64_t sliceRank = sliceVecTy.getRank(); - - // Expand offsets and sizes to match the vector rank. 
- SmallVector offsets(sliceRank, 0); - copy(getI64SubArray(extractStridedSliceOp.getOffsets()), offsets.begin()); - - SmallVector sizes(sourceShape); - copy(getI64SubArray(extractStridedSliceOp.getSizes()), sizes.begin()); - - // Calculate the slice elements by enumerating all slice positions and - // linearizing them. The enumeration order is lexicographic which yields a - // sequence of monotonically increasing linearized position indices. - auto denseValuesBegin = dense.value_begin(); - SmallVector sliceValues; - sliceValues.reserve(sliceVecTy.getNumElements()); - SmallVector currSlicePosition(offsets.begin(), offsets.end()); - do { - int64_t linearizedPosition = linearize(currSlicePosition, sourceStrides); - assert(linearizedPosition < sourceVecTy.getNumElements() && - "Invalid index"); - sliceValues.push_back(*(denseValuesBegin + linearizedPosition)); - } while ( - succeeded(incSlicePosition(currSlicePosition, sliceShape, offsets))); - - assert(static_cast(sliceValues.size()) == - sliceVecTy.getNumElements() && - "Invalid number of slice elements"); - auto newAttr = DenseElementsAttr::get(sliceVecTy, sliceValues); - rewriter.replaceOpWithNewOp(extractStridedSliceOp, - newAttr); - return success(); - } -}; - // Pattern to rewrite an ExtractStridedSliceOp(BroadcastOp) to // BroadcastOp(ExtractStrideSliceOp). class StridedSliceBroadcast final @@ -4018,8 +3981,7 @@ void ExtractStridedSliceOp::getCanonicalizationPatterns( RewritePatternSet &results, MLIRContext *context) { // Pattern to rewrite a ExtractStridedSliceOp(ConstantMaskOp) -> // ConstantMaskOp and ExtractStridedSliceOp(ConstantOp) -> ConstantOp. 
- results.add( context); } @@ -5659,10 +5621,8 @@ OpFoldResult ShapeCastOp::fold(FoldAdaptor adaptor) { // shape_cast(constant) -> constant if (auto splatAttr = - llvm::dyn_cast_if_present(adaptor.getSource())) { - return DenseElementsAttr::get(resultType, - splatAttr.getSplatValue()); - } + llvm::dyn_cast_if_present(adaptor.getSource())) + return splatAttr.reshape(getType()); // shape_cast(poison) -> poison if (llvm::dyn_cast_if_present(adaptor.getSource())) { @@ -6006,10 +5966,9 @@ void vector::TransposeOp::build(OpBuilder &builder, OperationState &result, OpFoldResult vector::TransposeOp::fold(FoldAdaptor adaptor) { // Eliminate splat constant transpose ops. - if (auto attr = - llvm::dyn_cast_if_present(adaptor.getVector())) - if (attr.isSplat()) - return attr.reshape(getResultVectorType()); + if (auto splat = + llvm::dyn_cast_if_present(adaptor.getVector())) + return splat.reshape(getResultVectorType()); // Eliminate poison transpose ops. if (llvm::dyn_cast_if_present(adaptor.getVector())) diff --git a/mlir/test/Dialect/Vector/canonicalize.mlir b/mlir/test/Dialect/Vector/canonicalize.mlir index b24cf93707d8b..f50d92fc9c8b1 100644 --- a/mlir/test/Dialect/Vector/canonicalize.mlir +++ b/mlir/test/Dialect/Vector/canonicalize.mlir @@ -1121,6 +1121,8 @@ func.func @bitcast_folding(%I1: vector<4x8xf32>, %I2: vector<2xi32>) -> (vector< return %0, %2 : vector<4x8xf32>, vector<2xi32> } +// ----- + // CHECK-LABEL: func @bitcast_f16_to_f32 // bit pattern: 0x40004000 // CHECK-DAG: %[[CST1:.+]] = arith.constant dense<2.00390625> : vector<4xf32> @@ -1135,6 +1137,8 @@ func.func @bitcast_f16_to_f32() -> (vector<4xf32>, vector<4xf32>) { return %cast0, %cast1: vector<4xf32>, vector<4xf32> } +// ----- + // CHECK-LABEL: func @bitcast_i8_to_i32 // bit pattern: 0xA0A0A0A0 // CHECK-DAG: %[[CST1:.+]] = arith.constant dense<-1600085856> : vector<4xi32> @@ -1732,6 +1736,7 @@ func.func @vector_multi_reduction_unit_dimensions(%source: vector<5x1x4x1x20xf32 } // ----- + // CHECK-LABEL: 
func.func @vector_multi_reduction_scalable( // CHECK-SAME: %[[VAL_0:.*]]: vector<1x[4]x1xf32>, // CHECK-SAME: %[[VAL_1:.*]]: vector<1x[4]xf32>, @@ -2249,6 +2254,8 @@ func.func @transpose_splat_constant() -> vector<8x4xf32> { return %0 : vector<8x4xf32> } +// ----- + // CHECK-LABEL: func @transpose_splat2( // CHECK-SAME: %[[VAL_0:.*]]: f32) -> vector<3x4xf32> { // CHECK: %[[VAL_1:.*]] = vector.splat %[[VAL_0]] : vector<3x4xf32> From cbbf6b487360a0926c88d512c986b3640136f91e Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Wed, 16 Apr 2025 13:55:01 -0700 Subject: [PATCH 175/710] Revert "[NFC][CFI] Dump test output to debug llvm-clang-win-x-aarch64 failure" (#136029) Reverts llvm/llvm-project#136002 Not needed any more. --- clang/test/Driver/sanitizer-ld.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/clang/test/Driver/sanitizer-ld.c b/clang/test/Driver/sanitizer-ld.c index c04831171bba9..a00ec029d3d46 100644 --- a/clang/test/Driver/sanitizer-ld.c +++ b/clang/test/Driver/sanitizer-ld.c @@ -839,14 +839,6 @@ // CHECK-CFI-PREREQ-LINUX: '-fsanitize=cfi' only allowed with '-flto' // CHECK-CFI-PREREQ-LINUX: '-fsanitize=cfi' only allowed with '-fvisibility=' -// CFI by itself does not link runtime libraries. -// RUN: %clang -fsanitize=cfi \ -// RUN: -flto -fvisibility=hidden \ -// RUN: --target=x86_64-unknown-linux -fuse-ld=ld -rtlib=platform \ -// RUN: -resource-dir=%S/Inputs/resource_dir \ -// RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: -### %s - // CFI by itself does not link runtime libraries. // RUN: %clang -fsanitize=cfi \ // RUN: -flto -fvisibility=hidden \ From 6b0c8c4acd31eba83bf20ab1cf8729b2153e383c Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Wed, 16 Apr 2025 13:56:38 -0700 Subject: [PATCH 176/710] Revert "[NFC][CFI] Avoid clang error in CFI tests" (#136030) Reverts llvm/llvm-project#135981 Fails with 'clang: error: --rtlib=libgcc requires --unwindlib=libgcc' on some bots. 
--- clang/test/Driver/sanitizer-ld.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/clang/test/Driver/sanitizer-ld.c b/clang/test/Driver/sanitizer-ld.c index a00ec029d3d46..67ca33d676d20 100644 --- a/clang/test/Driver/sanitizer-ld.c +++ b/clang/test/Driver/sanitizer-ld.c @@ -840,8 +840,7 @@ // CHECK-CFI-PREREQ-LINUX: '-fsanitize=cfi' only allowed with '-fvisibility=' // CFI by itself does not link runtime libraries. -// RUN: %clang -fsanitize=cfi \ -// RUN: -flto -fvisibility=hidden \ +// RUN: not %clang -fsanitize=cfi \ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld -rtlib=platform \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ @@ -850,8 +849,7 @@ // CHECK-CFI-LINUX: "{{.*}}ld{{(.exe)?}}" // CFI with diagnostics links the UBSan runtime. -// RUN: %clang -fsanitize=cfi -fno-sanitize-trap=cfi -fsanitize-recover=cfi \ -// RUN: -flto -fvisibility=hidden \ +// RUN: not %clang -fsanitize=cfi -fno-sanitize-trap=cfi -fsanitize-recover=cfi \ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ @@ -861,8 +859,7 @@ // CHECK-CFI-DIAG-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.ubsan_standalone.a" "--no-whole-archive" // Cross-DSO CFI links the CFI runtime. -// RUN: %clang -fsanitize=cfi -fsanitize-cfi-cross-dso \ -// RUN: -flto -fvisibility=hidden \ +// RUN: not %clang -fsanitize=cfi -fsanitize-cfi-cross-dso \ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ @@ -873,8 +870,7 @@ // CHECK-CFI-CROSS-DSO-LINUX: -export-dynamic // Cross-DSO CFI with diagnostics links just the CFI runtime. 
-// RUN: %clang -fsanitize=cfi -fsanitize-cfi-cross-dso \ -// RUN: -flto -fvisibility=hidden \ +// RUN: not %clang -fsanitize=cfi -fsanitize-cfi-cross-dso \ // RUN: -fno-sanitize-trap=cfi -fsanitize-recover=cfi \ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ @@ -886,8 +882,7 @@ // CHECK-CFI-CROSS-DSO-DIAG-LINUX: -export-dynamic // Cross-DSO CFI on Android does not link runtime libraries. -// RUN: %clang -fsanitize=cfi -fsanitize-cfi-cross-dso \ -// RUN: -flto -fvisibility=hidden \ +// RUN: not %clang -fsanitize=cfi -fsanitize-cfi-cross-dso \ // RUN: --target=aarch64-linux-android -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_android_tree \ @@ -896,8 +891,7 @@ // CHECK-CFI-CROSS-DSO-ANDROID: "{{.*}}ld{{(.exe)?}}" // Cross-DSO CFI with diagnostics on Android links just the UBSAN runtime. -// RUN: %clang -fsanitize=cfi -fsanitize-cfi-cross-dso \ -// RUN: -flto -fvisibility=hidden \ +// RUN: not %clang -fsanitize=cfi -fsanitize-cfi-cross-dso \ // RUN: -fno-sanitize-trap=cfi -fsanitize-recover=cfi \ // RUN: --target=aarch64-linux-android -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ From 41c1a7be3f1a2556e407e761acb766a5d103d691 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 16 Apr 2025 22:58:09 +0200 Subject: [PATCH 177/710] [LV] Don't add fixed-order recurrence phis to forced scalars. Fixed-order recurrence phis cannot be forced to be scalar, they will always be widened at the moment. Make sure we don't add them to ForcedScalars, otherwise the legacy cost model will compute incorrect costs. This fixes an assertion reported with https://github.com/llvm/llvm-project/pull/129645. 
--- .../Transforms/Vectorize/LoopVectorize.cpp | 7 +- .../X86/fixed-order-recurrence.ll | 100 +++++++++++++++++- 2 files changed, 105 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index dd7f05465a50b..e9e1d0ac6c196 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -6464,10 +6464,15 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); } } - } else + } else { + // Cannot scalarize fixed-order recurrence phis at the moment. + if (isa(I) && Legal->isFixedOrderRecurrence(cast(I))) + continue; + // Make sure I gets scalarized and a cost estimate without // scalarization overhead. ForcedScalars[VF].insert(I); + } } } diff --git a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll index bc57aeb775fb1..838df4d0caf09 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll @@ -91,7 +91,7 @@ for.body: ; preds = %for.body.preheader, br i1 %exitcond.not, label %for.cond.cleanup, label %for.body } -define void @thirdorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapture noundef writeonly %y, i32 noundef %n) #0 { +define void @thirdorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapture noundef writeonly %y, i32 noundef %n) { ; CHECK-LABEL: @thirdorderrec( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP38:%.*]] = icmp sgt i32 [[N:%.*]], 3 @@ -352,3 +352,101 @@ loop: exit: ret void } + +define void @test_for_tried_to_force_scalar(ptr noalias %A, ptr noalias %B, ptr noalias %C, i64 %n) #0 { +; CHECK-LABEL: @test_for_tried_to_force_scalar( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 +; CHECK-NEXT: 
[[CONFLICT_RDX20:%.*]] = icmp ule i64 [[TMP0]], 8 +; CHECK-NEXT: br i1 [[CONFLICT_RDX20]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 8, i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[TMP4]] +; CHECK-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x ptr> poison, ptr [[A:%.*]], i32 3 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x ptr> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 5 +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 6 +; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 7 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP13]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x ptr> [[TMP17]], ptr [[TMP14]], i32 1 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x ptr> [[TMP18]], ptr [[TMP15]], i32 2 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x ptr> [[TMP19]], ptr [[TMP16]], i32 3 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP9]] +; 
CHECK-NEXT: [[TMP22:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP21]], i32 0 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x ptr> [[TMP25]], ptr [[TMP22]], i32 1 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x ptr> [[TMP26]], ptr [[TMP23]], i32 2 +; CHECK-NEXT: [[TMP28]] = insertelement <4 x ptr> [[TMP27]], ptr [[TMP24]], i32 3 +; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x ptr> [[TMP20]], <4 x ptr> [[TMP28]], <4 x i32> +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x float>, ptr [[TMP21]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x float> [[WIDE_VEC]], <12 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x float> [[STRIDED_VEC]], i32 3 +; CHECK-NEXT: store float [[TMP30]], ptr [[C:%.*]], align 4 +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 0 +; CHECK-NEXT: [[TMP38:%.*]] = load float, ptr [[TMP31]], align 4 +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 1 +; CHECK-NEXT: [[TMP32:%.*]] = load float, ptr [[TMP33]], align 4 +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 2 +; CHECK-NEXT: [[TMP34:%.*]] = load float, ptr [[TMP35]], align 4 +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 3 +; CHECK-NEXT: [[TMP36:%.*]] = load float, ptr [[TMP37]], align 4 +; CHECK-NEXT: store float [[TMP36]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP39:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP39]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: 
[[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi ptr [ [[TMP24]], [[MIDDLE_BLOCK]] ], [ [[A]], [[ENTRY]] ] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[PREV:%.*]] = phi ptr [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[NEXT]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[IV]] +; CHECK-NEXT: [[TMP40:%.*]] = load float, ptr [[NEXT]], align 4 +; CHECK-NEXT: store float [[TMP40]], ptr [[C]], align 4 +; CHECK-NEXT: [[TMP41:%.*]] = load float, ptr [[PREV]], align 4 +; CHECK-NEXT: store float [[TMP41]], ptr [[B]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %prev = phi ptr [ %A, %entry ], [ %next, %loop ] + %next = getelementptr nusw [3 x float], ptr %A, i64 %iv + %0 = load float, ptr %next, align 4 + store float %0, ptr %C, align 4 + %1 = load float, ptr %prev, align 4 + store float %1, ptr %B, align 4 + %iv.next = add nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv, %n + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} + +attributes #0 = { "target-cpu"="znver3" } From 42ad82bb059f1ba454c6f7d882984f38d6099d88 Mon Sep 17 00:00:00 2001 From: Tai Ly Date: Wed, 16 Apr 2025 15:58:42 -0500 Subject: [PATCH 178/710] [mlir][tosa] Add verifier check for Slice Op (#135853) Add verifier check for Slice Op to make sure input1 and output have same ranks. 
Added test in verifier.mlir Also moved existing slice verifier tests in invalid.mlir to verfier.mlir Signed-off-by: Tai Ly --- mlir/lib/Dialect/Tosa/IR/TosaOps.cpp | 29 ++++++++++++------- mlir/test/Dialect/Tosa/invalid.mlir | 32 --------------------- mlir/test/Dialect/Tosa/verifier.mlir | 43 ++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+), 43 deletions(-) diff --git a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp index 9a1dfd8e17b85..8b4f6ef0d0980 100644 --- a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp +++ b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp @@ -1446,19 +1446,26 @@ LogicalResult tosa::SliceOp::verify() { /* outType = */ getOutput().getType()) .failed()) return failure(); - auto inputType = llvm::dyn_cast(getInput1().getType()); - if (!inputType) - return success(); - auto startShapeRank = - llvm::cast(getStart().getType()).getRank(); - if (inputType.getRank() != startShapeRank) - return emitOpError("length of start is not equal to rank of input shape"); + const ShapeAdaptor inputShape(getInput1().getType()); + if (inputShape.hasRank()) { + const auto inputRank = inputShape.getRank(); + const ShapeAdaptor outputShape(getOutput().getType()); + if (outputShape.hasRank() && inputRank != outputShape.getRank()) + return emitOpError( + "expect input1 and output to have the same ranks, got ") + << inputRank << " and " << outputShape.getRank(); + + const auto startShapeRank = + llvm::cast(getStart().getType()).getRank(); + if (inputRank != startShapeRank) + return emitOpError("length of start is not equal to rank of input shape"); - auto sizeShapeRank = - llvm::cast(getSize().getType()).getRank(); - if (inputType.getRank() != sizeShapeRank) - return emitOpError("length of size is not equal to rank of input shape"); + const auto sizeShapeRank = + llvm::cast(getSize().getType()).getRank(); + if (inputRank != sizeShapeRank) + return emitOpError("length of size is not equal to rank of input shape"); + } return success(); } diff --git 
a/mlir/test/Dialect/Tosa/invalid.mlir b/mlir/test/Dialect/Tosa/invalid.mlir index c0b251885de5c..fc98aa95ed5b3 100644 --- a/mlir/test/Dialect/Tosa/invalid.mlir +++ b/mlir/test/Dialect/Tosa/invalid.mlir @@ -660,28 +660,6 @@ func.func @test_variable_write_shape(%arg0: tensor<1x4x8xi8>) -> () { // ----- -func.func @test_slice_invalid_start() { - %0 = tensor.empty() : tensor<4x31x31xf32> - %start = tosa.const_shape {values = dense<[1, 1]> : tensor<2xindex>} : () -> !tosa.shape<2> - %size = tosa.const_shape {values = dense<[1, 1, 1]> : tensor<3xindex>} : () -> !tosa.shape<3> - // expected-error@+1 {{'tosa.slice' op length of start is not equal to rank of input shape}} - %3 = tosa.slice %0, %start, %size : (tensor<4x31x31xf32>, !tosa.shape<2>, !tosa.shape<3>) -> tensor<*xf32> - return -} - -// ----- - -func.func @test_slice_invalid_size() { - %0 = tensor.empty() : tensor<4x31x31xf32> - %start = tosa.const_shape {values = dense<[1, 1, 1]> : tensor<3xindex>} : () -> !tosa.shape<3> - %size = tosa.const_shape {values = dense<[1]> : tensor<1xindex>} : () -> !tosa.shape<1> - // expected-error@+1 {{'tosa.slice' op length of size is not equal to rank of input shape}} - %3 = tosa.slice %0, %start, %size : (tensor<4x31x31xf32>, !tosa.shape<3>, !tosa.shape<1>) -> tensor<*xf32> - return -} - -// ----- - func.func @test_tile_invalid_multiples() { %0 = tensor.empty() : tensor<4x31x31xf32> %cst = tosa.const_shape { values = dense<1> : tensor<1xindex> } : () -> !tosa.shape<1> @@ -1938,16 +1916,6 @@ func.func @test_scalar_reverse(%arg0: tensor) -> tensor { // ----- -func.func @test_scalar_slice(%arg0: tensor) -> tensor { - %0 = tosa.const_shape {values = dense<[]> : tensor<0xindex>} : () -> !tosa.shape<0> - %1 = tosa.const_shape {values = dense<[]> : tensor<0xindex>} : () -> !tosa.shape<0> - // expected-error@+1 {{'tosa.slice' op operand #0 must be tosa-conformant tensor of at least rank 1, but got 'tensor'}} - %2 = tosa.slice %arg0, %0, %1 : (tensor, !tosa.shape<0>, !tosa.shape<0>) -> 
tensor - return %2 : tensor -} - -// ----- - func.func @test_scalar_tile(%arg0: tensor) -> tensor<*xf32> { %cst = tosa.const_shape { values = dense<[]> : tensor<0xindex> } : () -> !tosa.shape<0> // expected-error@+1 {{'tosa.tile' op operand #0 must be tosa-conformant tensor of at least rank 1, but got 'tensor'}} diff --git a/mlir/test/Dialect/Tosa/verifier.mlir b/mlir/test/Dialect/Tosa/verifier.mlir index c49cbecd25c78..efdd26a9346fb 100644 --- a/mlir/test/Dialect/Tosa/verifier.mlir +++ b/mlir/test/Dialect/Tosa/verifier.mlir @@ -124,3 +124,46 @@ func.func @test_scalar_output_transpose(%arg0: tensor<*xf32>) -> tensor { %1 = tosa.transpose %arg0 {perms = array} : (tensor<*xf32>) -> tensor return %1 : tensor } + +// ----- + +func.func @test_slice_invalid_output_rank() { + %0 = tensor.empty() : tensor<4x31x31xf32> + %start = tosa.const_shape {values = dense<[1, 1]> : tensor<2xindex>} : () -> !tosa.shape<2> + %size = tosa.const_shape {values = dense<[1, 1, 1]> : tensor<3xindex>} : () -> !tosa.shape<3> + // expected-error@+1 {{'tosa.slice' op expect input1 and output to have the same ranks, got 3 and 4}} + %3 = tosa.slice %0, %start, %size : (tensor<4x31x31xf32>, !tosa.shape<2>, !tosa.shape<3>) -> tensor + return +} + +// ----- + +func.func @test_slice_invalid_start() { + %0 = tensor.empty() : tensor<4x31x31xf32> + %start = tosa.const_shape {values = dense<[1, 1]> : tensor<2xindex>} : () -> !tosa.shape<2> + %size = tosa.const_shape {values = dense<[1, 1, 1]> : tensor<3xindex>} : () -> !tosa.shape<3> + // expected-error@+1 {{'tosa.slice' op length of start is not equal to rank of input shape}} + %3 = tosa.slice %0, %start, %size : (tensor<4x31x31xf32>, !tosa.shape<2>, !tosa.shape<3>) -> tensor<*xf32> + return +} + +// ----- + +func.func @test_slice_invalid_size() { + %0 = tensor.empty() : tensor<4x31x31xf32> + %start = tosa.const_shape {values = dense<[1, 1, 1]> : tensor<3xindex>} : () -> !tosa.shape<3> + %size = tosa.const_shape {values = dense<[1]> : tensor<1xindex>} : 
() -> !tosa.shape<1> + // expected-error@+1 {{'tosa.slice' op length of size is not equal to rank of input shape}} + %3 = tosa.slice %0, %start, %size : (tensor<4x31x31xf32>, !tosa.shape<3>, !tosa.shape<1>) -> tensor<*xf32> + return +} + +// ----- + +func.func @test_scalar_slice(%arg0: tensor) -> tensor { + %0 = tosa.const_shape {values = dense<[]> : tensor<0xindex>} : () -> !tosa.shape<0> + %1 = tosa.const_shape {values = dense<[]> : tensor<0xindex>} : () -> !tosa.shape<0> + // expected-error@+1 {{'tosa.slice' op operand #0 must be tosa-conformant tensor of at least rank 1, but got 'tensor'}} + %2 = tosa.slice %arg0, %0, %1 : (tensor, !tosa.shape<0>, !tosa.shape<0>) -> tensor + return %2 : tensor +} From 2afef58e40ba953c0848577106dee51819b9be8f Mon Sep 17 00:00:00 2001 From: Sergei Barannikov Date: Thu, 17 Apr 2025 00:03:34 +0300 Subject: [PATCH 179/710] [ARM] Use helper class for emitting CFI instructions into MIR (#135994) Similar to #135845. PR: https://github.com/llvm/llvm-project/pull/135994 --- llvm/include/llvm/CodeGen/CFIInstBuilder.h | 20 ++++ llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp | 89 +++-------------- llvm/lib/Target/ARM/ARMBaseInstrInfo.h | 10 -- llvm/lib/Target/ARM/ARMFrameLowering.cpp | 104 ++++++-------------- llvm/lib/Target/ARM/Thumb1FrameLowering.cpp | 81 ++++----------- 5 files changed, 83 insertions(+), 221 deletions(-) diff --git a/llvm/include/llvm/CodeGen/CFIInstBuilder.h b/llvm/include/llvm/CodeGen/CFIInstBuilder.h index e799b47a0c974..9025624c0d8ab 100644 --- a/llvm/include/llvm/CodeGen/CFIInstBuilder.h +++ b/llvm/include/llvm/CodeGen/CFIInstBuilder.h @@ -45,6 +45,10 @@ class CFIInstBuilder { setInsertPoint(InsertPt); } + CFIInstBuilder(MachineBasicBlock *MBB, MachineInstr::MIFlag MIFlag, + bool IsEH = true) + : CFIInstBuilder(*MBB, MBB->end(), MIFlag, IsEH) {} + void setInsertPoint(MachineBasicBlock::iterator IP) { InsertPt = IP; } void insertCFIInst(const MCCFIInstruction &CFIInst) const { @@ -72,11 +76,27 @@ class CFIInstBuilder { 
nullptr, TRI.getDwarfRegNum(Reg, IsEH), Offset)); } + void buildRegister(MCRegister Reg1, MCRegister Reg2) const { + insertCFIInst(MCCFIInstruction::createRegister( + nullptr, TRI.getDwarfRegNum(Reg1, IsEH), + TRI.getDwarfRegNum(Reg2, IsEH))); + } + void buildRestore(MCRegister Reg) const { insertCFIInst(MCCFIInstruction::createRestore( nullptr, TRI.getDwarfRegNum(Reg, IsEH))); } + void buildUndefined(MCRegister Reg) const { + insertCFIInst(MCCFIInstruction::createUndefined( + nullptr, TRI.getDwarfRegNum(Reg, IsEH))); + } + + void buildSameValue(MCRegister Reg) const { + insertCFIInst(MCCFIInstruction::createSameValue( + nullptr, TRI.getDwarfRegNum(Reg, IsEH))); + } + void buildEscape(StringRef Bytes, StringRef Comment = "") const { insertCFIInst( MCCFIInstruction::createEscape(nullptr, Bytes, SMLoc(), Comment)); diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index 6843ec895e69c..69bc84a6733c0 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -24,6 +24,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/CFIInstBuilder.h" #include "llvm/CodeGen/DFAPacketizer.h" #include "llvm/CodeGen/LiveVariables.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -6485,51 +6486,20 @@ void ARMBaseInstrInfo::saveLROnStack(MachineBasicBlock &MBB, if (!CFI) return; - MachineFunction &MF = *MBB.getParent(); - // Add a CFI, saying CFA is offset by Align bytes from SP. - int64_t StackPosEntry = - MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, Align)); - BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION)) - .addCFIIndex(StackPosEntry) - .setMIFlags(MachineInstr::FrameSetup); + CFIInstBuilder CFIBuilder(MBB, It, MachineInstr::FrameSetup); + CFIBuilder.buildDefCFAOffset(Align); // Add a CFI saying that the LR that we want to find is now higher than // before. int LROffset = Auth ? 
Align - 4 : Align; - const MCRegisterInfo *MRI = Subtarget.getRegisterInfo(); - unsigned DwarfLR = MRI->getDwarfRegNum(ARM::LR, true); - int64_t LRPosEntry = MF.addFrameInst( - MCCFIInstruction::createOffset(nullptr, DwarfLR, -LROffset)); - BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION)) - .addCFIIndex(LRPosEntry) - .setMIFlags(MachineInstr::FrameSetup); + CFIBuilder.buildOffset(ARM::LR, -LROffset); if (Auth) { // Add a CFI for the location of the return adddress PAC. - unsigned DwarfRAC = MRI->getDwarfRegNum(ARM::RA_AUTH_CODE, true); - int64_t RACPosEntry = MF.addFrameInst( - MCCFIInstruction::createOffset(nullptr, DwarfRAC, -Align)); - BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION)) - .addCFIIndex(RACPosEntry) - .setMIFlags(MachineInstr::FrameSetup); + CFIBuilder.buildOffset(ARM::RA_AUTH_CODE, -Align); } } -void ARMBaseInstrInfo::emitCFIForLRSaveToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator It, - Register Reg) const { - MachineFunction &MF = *MBB.getParent(); - const MCRegisterInfo *MRI = Subtarget.getRegisterInfo(); - unsigned DwarfLR = MRI->getDwarfRegNum(ARM::LR, true); - unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); - - int64_t LRPosEntry = MF.addFrameInst( - MCCFIInstruction::createRegister(nullptr, DwarfLR, DwarfReg)); - BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION)) - .addCFIIndex(LRPosEntry) - .setMIFlags(MachineInstr::FrameSetup); -} - void ARMBaseInstrInfo::restoreLRFromStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator It, bool CFI, bool Auth) const { @@ -6560,50 +6530,18 @@ void ARMBaseInstrInfo::restoreLRFromStack(MachineBasicBlock &MBB, } if (CFI) { - // Now stack has moved back up... 
- MachineFunction &MF = *MBB.getParent(); - const MCRegisterInfo *MRI = Subtarget.getRegisterInfo(); - unsigned DwarfLR = MRI->getDwarfRegNum(ARM::LR, true); - int64_t StackPosEntry = - MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 0)); - BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION)) - .addCFIIndex(StackPosEntry) - .setMIFlags(MachineInstr::FrameDestroy); - - // ... and we have restored LR. - int64_t LRPosEntry = - MF.addFrameInst(MCCFIInstruction::createRestore(nullptr, DwarfLR)); - BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION)) - .addCFIIndex(LRPosEntry) - .setMIFlags(MachineInstr::FrameDestroy); - - if (Auth) { - unsigned DwarfRAC = MRI->getDwarfRegNum(ARM::RA_AUTH_CODE, true); - int64_t Entry = - MF.addFrameInst(MCCFIInstruction::createUndefined(nullptr, DwarfRAC)); - BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION)) - .addCFIIndex(Entry) - .setMIFlags(MachineInstr::FrameDestroy); - } + // Now stack has moved back up and we have restored LR. + CFIInstBuilder CFIBuilder(MBB, It, MachineInstr::FrameDestroy); + CFIBuilder.buildDefCFAOffset(0); + CFIBuilder.buildRestore(ARM::LR); + if (Auth) + CFIBuilder.buildUndefined(ARM::RA_AUTH_CODE); } if (Auth) BuildMI(MBB, It, DebugLoc(), get(ARM::t2AUT)); } -void ARMBaseInstrInfo::emitCFIForLRRestoreFromReg( - MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const { - MachineFunction &MF = *MBB.getParent(); - const MCRegisterInfo *MRI = Subtarget.getRegisterInfo(); - unsigned DwarfLR = MRI->getDwarfRegNum(ARM::LR, true); - - int64_t LRPosEntry = - MF.addFrameInst(MCCFIInstruction::createRestore(nullptr, DwarfLR)); - BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION)) - .addCFIIndex(LRPosEntry) - .setMIFlags(MachineInstr::FrameDestroy); -} - void ARMBaseInstrInfo::buildOutlinedFrame( MachineBasicBlock &MBB, MachineFunction &MF, const outliner::OutlinedFunction &OF) const { @@ -6722,11 +6660,12 @@ MachineBasicBlock::iterator ARMBaseInstrInfo::insertOutlinedCall( // Save and 
restore LR from that register. copyPhysReg(MBB, It, DebugLoc(), Reg, ARM::LR, true); if (!AFI.isLRSpilled()) - emitCFIForLRSaveToReg(MBB, It, Reg); + CFIInstBuilder(MBB, It, MachineInstr::FrameSetup) + .buildRegister(ARM::LR, Reg); CallPt = MBB.insert(It, CallMIB); copyPhysReg(MBB, It, DebugLoc(), ARM::LR, Reg, true); if (!AFI.isLRSpilled()) - emitCFIForLRRestoreFromReg(MBB, It); + CFIInstBuilder(MBB, It, MachineInstr::FrameDestroy).buildRestore(ARM::LR); It--; return CallPt; } diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h index 35edd5bf003ef..987f5a0e3d824 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -409,16 +409,6 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo { MachineBasicBlock::iterator It, bool CFI, bool Auth) const; - /// Emit CFI instructions into the MachineBasicBlock \p MBB at position \p It, - /// for the case when the LR is saved in the register \p Reg. - void emitCFIForLRSaveToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator It, - Register Reg) const; - - /// Emit CFI instructions into the MachineBasicBlock \p MBB at position \p It, - /// after the LR is was restored from a register. - void emitCFIForLRRestoreFromReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator It) const; /// \brief Sets the offsets on outlined instructions in \p MBB which use SP /// so that they will be valid post-outlining. 
/// diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp index 475f53fc03399..d3a6504c9100e 100644 --- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp +++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp @@ -120,6 +120,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/CFIInstBuilder.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -140,10 +141,7 @@ #include "llvm/IR/DebugLoc.h" #include "llvm/IR/Function.h" #include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCInstrDesc.h" -#include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" @@ -760,21 +758,16 @@ struct StackAdjustingInsts { Info->SPAdjust += ExtraBytes; } - void emitDefCFAOffsets(MachineBasicBlock &MBB, const DebugLoc &dl, - const ARMBaseInstrInfo &TII, bool HasFP) { - MachineFunction &MF = *MBB.getParent(); + void emitDefCFAOffsets(MachineBasicBlock &MBB, bool HasFP) { + CFIInstBuilder CFIBuilder(MBB, MBB.end(), MachineInstr::FrameSetup); unsigned CFAOffset = 0; for (auto &Info : Insts) { if (HasFP && !Info.BeforeFPSet) return; CFAOffset += Info.SPAdjust; - unsigned CFIIndex = MF.addFrameInst( - MCCFIInstruction::cfiDefCfaOffset(nullptr, CFAOffset)); - BuildMI(MBB, std::next(Info.I), dl, - TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameSetup); + CFIBuilder.setInsertPoint(std::next(Info.I)); + CFIBuilder.buildDefCFAOffset(CFAOffset); } } @@ -890,9 +883,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock::iterator MBBI = MBB.begin(); MachineFrameInfo &MFI = MF.getFrameInfo(); ARMFunctionInfo *AFI = MF.getInfo(); - MCContext &Context = MF.getContext(); const TargetMachine &TM = MF.getTarget(); 
- const MCRegisterInfo *MRI = Context.getRegisterInfo(); const ARMBaseRegisterInfo *RegInfo = STI.getRegisterInfo(); const ARMBaseInstrInfo &TII = *STI.getInstrInfo(); assert(!AFI->isThumb1OnlyFunction() && @@ -938,7 +929,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, DefCFAOffsetCandidates.addInst(std::prev(MBBI), NumBytes, true); } if (!NeedsWinCFI) - DefCFAOffsetCandidates.emitDefCFAOffsets(MBB, dl, TII, HasFP); + DefCFAOffsetCandidates.emitDefCFAOffsets(MBB, HasFP); if (NeedsWinCFI && MBBI != MBB.begin()) { insertSEHRange(MBB, {}, MBBI, TII, MachineInstr::FrameSetup); BuildMI(MBB, MBBI, dl, TII.get(ARM::SEH_PrologEnd)) @@ -1245,21 +1236,11 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, if (!NeedsWinCFI) { // Emit DWARF info to find the CFA using the frame pointer from this // point onward. - if (FPOffsetAfterPush != 0) { - unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa( - nullptr, MRI->getDwarfRegNum(FramePtr, true), - -MFI.getObjectOffset(FramePtrSpillFI))); - BuildMI(MBB, AfterPush, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameSetup); - } else { - unsigned CFIIndex = - MF.addFrameInst(MCCFIInstruction::createDefCfaRegister( - nullptr, MRI->getDwarfRegNum(FramePtr, true))); - BuildMI(MBB, AfterPush, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameSetup); - } + CFIInstBuilder CFIBuilder(MBB, AfterPush, MachineInstr::FrameSetup); + if (FPOffsetAfterPush != 0) + CFIBuilder.buildDefCFA(FramePtr, -MFI.getObjectOffset(FramePtrSpillFI)); + else + CFIBuilder.buildDefCFARegister(FramePtr); } } @@ -1304,14 +1285,9 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, } if (CFIPos.isValid()) { - int CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset( - nullptr, - MRI->getDwarfRegNum(Reg == ARM::R12 ? 
ARM::RA_AUTH_CODE : Reg, - true), - MFI.getObjectOffset(FI))); - BuildMI(MBB, CFIPos, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameSetup); + CFIInstBuilder(MBB, CFIPos, MachineInstr::FrameSetup) + .buildOffset(Reg == ARM::R12 ? ARM::RA_AUTH_CODE : Reg, + MFI.getObjectOffset(FI)); } } } @@ -1322,7 +1298,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, // actually get emitted. if (!NeedsWinCFI) { LLVM_DEBUG(DefCFAOffsetCandidates.dump()); - DefCFAOffsetCandidates.emitDefCFAOffsets(MBB, dl, TII, HasFP); + DefCFAOffsetCandidates.emitDefCFAOffsets(MBB, HasFP); } if (STI.isTargetELF() && hasFP(MF)) @@ -3155,7 +3131,6 @@ static const uint64_t kSplitStackAvailable = 256; void ARMFrameLowering::adjustForSegmentedStacks( MachineFunction &MF, MachineBasicBlock &PrologueMBB) const { unsigned Opcode; - unsigned CFIIndex; const ARMSubtarget *ST = &MF.getSubtarget(); bool Thumb = ST->isThumb(); bool Thumb2 = ST->isThumb2(); @@ -3168,8 +3143,6 @@ void ARMFrameLowering::adjustForSegmentedStacks( report_fatal_error("Segmented stacks not supported on this platform."); MachineFrameInfo &MFI = MF.getFrameInfo(); - MCContext &Context = MF.getContext(); - const MCRegisterInfo *MRI = Context.getRegisterInfo(); const ARMBaseInstrInfo &TII = *static_cast(MF.getSubtarget().getInstrInfo()); ARMFunctionInfo *ARMFI = MF.getInfo(); @@ -3267,17 +3240,10 @@ void ARMFrameLowering::adjustForSegmentedStacks( // Emit the relevant DWARF information about the change in stack pointer as // well as where to find both r4 and r5 (the callee-save registers) if (!MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) { - CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 8)); - BuildMI(PrevStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset( - nullptr, MRI->getDwarfRegNum(ScratchReg1, true), -4)); - BuildMI(PrevStackMBB, DL, 
TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset( - nullptr, MRI->getDwarfRegNum(ScratchReg0, true), -8)); - BuildMI(PrevStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); + CFIInstBuilder CFIBuilder(PrevStackMBB, MachineInstr::NoFlags); + CFIBuilder.buildDefCFAOffset(8); + CFIBuilder.buildOffset(ScratchReg1, -4); + CFIBuilder.buildOffset(ScratchReg0, -8); } // mov SR1, sp @@ -3486,13 +3452,9 @@ void ARMFrameLowering::adjustForSegmentedStacks( // Emit the DWARF info about the change in stack as well as where to find the // previous link register if (!MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) { - CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 12)); - BuildMI(AllocMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset( - nullptr, MRI->getDwarfRegNum(ARM::LR, true), -12)); - BuildMI(AllocMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); + CFIInstBuilder CFIBuilder(AllocMBB, MachineInstr::NoFlags); + CFIBuilder.buildDefCFAOffset(12); + CFIBuilder.buildOffset(ARM::LR, -12); } // Call __morestack(). @@ -3549,11 +3511,8 @@ void ARMFrameLowering::adjustForSegmentedStacks( } // Update the CFA offset now that we've popped - if (!MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) { - CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 0)); - BuildMI(AllocMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - } + if (!MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) + CFIInstBuilder(AllocMBB, MachineInstr::NoFlags).buildDefCFAOffset(0); // Return from this function. 
BuildMI(AllocMBB, DL, TII.get(ST->getReturnOpcode())).add(predOps(ARMCC::AL)); @@ -3576,20 +3535,13 @@ void ARMFrameLowering::adjustForSegmentedStacks( // Update the CFA offset now that we've popped if (!MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) { - CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 0)); - BuildMI(PostStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); + CFIInstBuilder CFIBuilder(PostStackMBB, MachineInstr::NoFlags); + CFIBuilder.buildDefCFAOffset(0); // Tell debuggers that r4 and r5 are now the same as they were in the // previous function, that they're the "Same Value". - CFIIndex = MF.addFrameInst(MCCFIInstruction::createSameValue( - nullptr, MRI->getDwarfRegNum(ScratchReg0, true))); - BuildMI(PostStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - CFIIndex = MF.addFrameInst(MCCFIInstruction::createSameValue( - nullptr, MRI->getDwarfRegNum(ScratchReg1, true))); - BuildMI(PostStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); + CFIBuilder.buildSameValue(ScratchReg0); + CFIBuilder.buildSameValue(ScratchReg1); } // Organizing MBB lists diff --git a/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp b/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp index a69e307a5da20..b04e20a0b6709 100644 --- a/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -21,6 +21,7 @@ #include "llvm/ADT/BitVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/CFIInstBuilder.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -34,9 +35,6 @@ #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DebugLoc.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCDwarf.h" -#include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/Compiler.h" #include 
"llvm/Support/ErrorHandling.h" #include @@ -150,7 +148,6 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock::iterator MBBI = MBB.begin(); MachineFrameInfo &MFI = MF.getFrameInfo(); ARMFunctionInfo *AFI = MF.getInfo(); - const MCRegisterInfo *MRI = MF.getContext().getRegisterInfo(); const ThumbRegisterInfo *RegInfo = static_cast(STI.getRegisterInfo()); const Thumb1InstrInfo &TII = @@ -180,16 +177,13 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, // belongs to which callee-save spill areas. unsigned FRSize = 0, GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0; int FramePtrSpillFI = 0; + CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameSetup); if (ArgRegsSaveSize) { emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -ArgRegsSaveSize, ARM::NoRegister, MachineInstr::FrameSetup); CFAOffset += ArgRegsSaveSize; - unsigned CFIIndex = - MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, CFAOffset)); - BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameSetup); + CFIBuilder.buildDefCFAOffset(CFAOffset); } if (!AFI->hasStackFrame()) { @@ -198,11 +192,7 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, -(NumBytes - ArgRegsSaveSize), ARM::NoRegister, MachineInstr::FrameSetup); CFAOffset += NumBytes - ArgRegsSaveSize; - unsigned CFIIndex = MF.addFrameInst( - MCCFIInstruction::cfiDefCfaOffset(nullptr, CFAOffset)); - BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameSetup); + CFIBuilder.buildDefCFAOffset(CFAOffset); } return; } @@ -340,20 +330,11 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, .add(predOps(ARMCC::AL)); } - if(FramePtrOffsetInBlock) { - unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa( - nullptr, MRI->getDwarfRegNum(FramePtr, true), (CFAOffset - FramePtrOffsetInBlock))); - BuildMI(MBB, AfterPush, dl, 
TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameSetup); - } else { - unsigned CFIIndex = - MF.addFrameInst(MCCFIInstruction::createDefCfaRegister( - nullptr, MRI->getDwarfRegNum(FramePtr, true))); - BuildMI(MBB, AfterPush, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameSetup); - } + CFIBuilder.setInsertPoint(AfterPush); + if (FramePtrOffsetInBlock) + CFIBuilder.buildDefCFA(FramePtr, CFAOffset - FramePtrOffsetInBlock); + else + CFIBuilder.buildDefCFARegister(FramePtr); if (NumBytes > 508) // If offset is > 508 then sp cannot be adjusted in a single instruction, // try restoring from fp instead. @@ -362,18 +343,11 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, // Emit call frame information for the callee-saved low registers. if (GPRCS1Size > 0) { - MachineBasicBlock::iterator Pos = std::next(GPRCS1Push); - if (adjustedGPRCS1Size) { - unsigned CFIIndex = - MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, CFAOffset)); - BuildMI(MBB, Pos, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameSetup); - } + CFIBuilder.setInsertPoint(std::next(GPRCS1Push)); + if (adjustedGPRCS1Size) + CFIBuilder.buildDefCFAOffset(CFAOffset); for (const CalleeSavedInfo &I : CSI) { - MCRegister Reg = I.getReg(); - int FI = I.getFrameIdx(); - switch (Reg) { + switch (I.getReg()) { case ARM::R8: case ARM::R9: case ARM::R10: @@ -389,11 +363,8 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, case ARM::R6: case ARM::R7: case ARM::LR: - unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset( - nullptr, MRI->getDwarfRegNum(Reg, true), MFI.getObjectOffset(FI))); - BuildMI(MBB, Pos, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameSetup); + CFIBuilder.buildOffset(I.getReg(), + MFI.getObjectOffset(I.getFrameIdx())); break; } } @@ -401,23 +372,17 
@@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, // Emit call frame information for the callee-saved high registers. if (GPRCS2Size > 0) { - MachineBasicBlock::iterator Pos = std::next(GPRCS2Push); + CFIBuilder.setInsertPoint(std::next(GPRCS2Push)); for (auto &I : CSI) { - MCRegister Reg = I.getReg(); - int FI = I.getFrameIdx(); - switch (Reg) { + switch (I.getReg()) { case ARM::R8: case ARM::R9: case ARM::R10: case ARM::R11: - case ARM::R12: { - unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset( - nullptr, MRI->getDwarfRegNum(Reg, true), MFI.getObjectOffset(FI))); - BuildMI(MBB, Pos, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameSetup); + case ARM::R12: + CFIBuilder.buildOffset(I.getReg(), + MFI.getObjectOffset(I.getFrameIdx())); break; - } default: break; } @@ -442,11 +407,7 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, ScratchRegister, MachineInstr::FrameSetup); if (!HasFP) { CFAOffset += NumBytes; - unsigned CFIIndex = MF.addFrameInst( - MCCFIInstruction::cfiDefCfaOffset(nullptr, CFAOffset)); - BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameSetup); + CFIBuilder.buildDefCFAOffset(CFAOffset); } } From a9827fbc86b3d2973b9eef7bfb8f726dd75f17a5 Mon Sep 17 00:00:00 2001 From: Prabhu Rajasekaran Date: Wed, 16 Apr 2025 14:14:43 -0700 Subject: [PATCH 180/710] [llvm] Handle CPI symbols for UEFI (#135652) UEFI targets besides MSVC environment must support constant pool symbols. 
--- llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 1d72a1e116ecd..4b05ad398f053 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -4113,7 +4113,8 @@ const MCExpr *AsmPrinter::lowerBlockAddressConstant(const BlockAddress &BA) { /// GetCPISymbol - Return the symbol for the specified constant pool entry. MCSymbol *AsmPrinter::GetCPISymbol(unsigned CPID) const { - if (getSubtargetInfo().getTargetTriple().isWindowsMSVCEnvironment()) { + if (getSubtargetInfo().getTargetTriple().isWindowsMSVCEnvironment() || + getSubtargetInfo().getTargetTriple().isUEFI()) { const MachineConstantPoolEntry &CPE = MF->getConstantPool()->getConstants()[CPID]; if (!CPE.isMachineConstantPoolEntry()) { From 9dbe107219c7ab1c422300f9eeb9ca3f7fc87c53 Mon Sep 17 00:00:00 2001 From: Shubham Sandeep Rastogi Date: Wed, 16 Apr 2025 14:41:20 -0700 Subject: [PATCH 181/710] disable test on older compilers (#136037) --- .../TestDataFormatterLibcxxInvalidVectorSimulator.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/invalid-vector/TestDataFormatterLibcxxInvalidVectorSimulator.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/invalid-vector/TestDataFormatterLibcxxInvalidVectorSimulator.py index 8788ea7be882d..3f58018b0fbd9 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/invalid-vector/TestDataFormatterLibcxxInvalidVectorSimulator.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/invalid-vector/TestDataFormatterLibcxxInvalidVectorSimulator.py @@ -13,6 +13,8 @@ class LibcxxInvalidVectorDataFormatterSimulatorTestCase(TestBase): NO_DEBUG_INFO_TESTCASE = True + + @skipIf(compiler="clang", 
compiler_version=['<', '15.0.1']) def test(self): self.build() lldbutil.run_to_source_breakpoint(self, "return 0", lldb.SBFileSpec("main.cpp")) From c409da2223ad910d2a0fb491c19623c679e8aae3 Mon Sep 17 00:00:00 2001 From: Muzammil <55665739+Muzammiluddin-Syed-ECE@users.noreply.github.com> Date: Wed, 16 Apr 2025 17:53:17 -0400 Subject: [PATCH 182/710] [mlir][ROCDL] Add permlanex16 op to allow subgroup reductions on gfx10+ (#135983) Adding Permlanex16Op to ROCDL dialect to enable subgroup reduce lowering to DPP ops for gfx 10+ devices. See [this PR](https://github.com/llvm/llvm-project/pull/133204). --------- Signed-off-by: Muzammiluddin Syed --- mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 16 ++++++++++++++++ mlir/test/Dialect/LLVMIR/rocdl.mlir | 10 ++++++++++ mlir/test/Target/LLVMIR/rocdl.mlir | 14 ++++++++++++++ 3 files changed, 40 insertions(+) diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td index 900155c274b4d..186a4f53f93cb 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -668,6 +668,22 @@ def ROCDL_DPPUpdateOp : ROCDL_IntrOp<"update.dpp", [], [0], }]; } +// PermLaneX16 intrinsic operation +def ROCDL_PermlaneX16Op : ROCDL_IntrOp<"permlanex16", [], [0], + [AllTypesMatch<["res", "old", "src0"]>, AllTypesMatch<["src1", "src2"]>], 1, 0, 0, + [4, 5], ["fi", "boundControl"]>, + Arguments<(ins LLVM_Type:$old, LLVM_Type:$src0, LLVM_Type:$src1, LLVM_Type:$src2, + I1Attr:$fi, I1Attr:$boundControl)> { + let results = (outs LLVM_Type:$res); + let assemblyFormat = [{ + attr-dict $old `,` $src0 `,` $src1 `,` $src2 `,` $fi `,` $boundControl `:` type($src0) `,` type($src1) + }]; + let description = [{ + Performs a `permlanex16` operation with the given operands, applying the + permutation specified by $fi to the provided inputs. 
+ }]; +} + def ROCDL_V2I16Type : FixedVectorOfLengthAndType<[2], [I16]>, BuildableType<"::mlir::VectorType::get(" "{2},$_builder.getI16Type())">; diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir index 828fd58544597..cda1a9ca5f1f6 100644 --- a/mlir/test/Dialect/LLVMIR/rocdl.mlir +++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir @@ -889,6 +889,16 @@ llvm.func @rocdl.readlane(%src : f32) -> f32 { // ----- +llvm.func @rocdl.permlanex16(%src : f32) -> f32 { + %cst0 = llvm.mlir.constant(-1 : i32) : i32 + // CHECK-LABEL: rocdl.permlanex16 + // CHECK: rocdl.permlanex16 %{{.*}} %{{.*}} + %ret = rocdl.permlanex16 %src, %src, %cst0, %cst0, 0, -1 : f32, i32 + llvm.return %ret : f32 +} + +// ----- + // expected-error@below {{attribute attached to unexpected op}} func.func private @expected_llvm_func() attributes { rocdl.kernel } diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir index e70617bfff99e..3db1f7b2b6427 100644 --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -872,6 +872,20 @@ llvm.func @rocdl.make.buffer.rsrc.p7.p1(%ptr : !llvm.ptr<1>, llvm.return %rsrc : !llvm.ptr<7> } +llvm.func @rocdl.permlanex16(%src0 : f32, %src1 : i32, %src2 : vector<2 x f32>, %src3 : vector<2 x i32>) -> f32 { + %cst0 = llvm.mlir.constant(-1 : i32) : i32 + // CHECK-LABEL: rocdl.permlanex16 + // CHECK: call float @llvm.amdgcn.permlanex16.f32(float %{{.*}}, float %{{.*}}, i32 -1, i32 -1, i1 false, i1 true) + %ret0 = rocdl.permlanex16 %src0, %src0, %cst0, %cst0, 0, -1 : f32, i32 + // CHECK: call i32 @llvm.amdgcn.permlanex16.i32(i32 %{{.*}}, i32 %{{.*}}, i32 -1, i32 -1, i1 false, i1 true) + %ret1 = rocdl.permlanex16 %src1, %src1, %cst0, %cst0, 0, -1 : i32, i32 + // CHECK: call <2 x float> @llvm.amdgcn.permlanex16.v2f32(<2 x float> %{{.*}}, <2 x float> %{{.*}}, i32 -1, i32 -1, i1 false, i1 true) + %ret2 = rocdl.permlanex16 %src2, %src2, %cst0, %cst0, 0, -1 : vector<2 x f32>, i32 + // CHECK: call <2 x 
i32> @llvm.amdgcn.permlanex16.v2i32(<2 x i32> %{{.*}}, <2 x i32> %{{.*}}, i32 -1, i32 -1, i1 false, i1 true) + %ret3 = rocdl.permlanex16 %src3, %src3, %cst0, %cst0, 0, -1 : vector<2 x i32>, i32 + llvm.return %ret0 : f32 +} + llvm.func @rocdl.wmma.fp8(%arg0 : vector<2 x i32>, %arg1 : vector<8xf32>) -> vector<8xf32> { // CHECK: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> %{{.*}}, <2 x i32> %{{.*}}, <8 x float> %{{.*}}) %r0 = rocdl.wmma.f32.16x16x16.fp8_fp8 %arg0, %arg0, %arg1: (vector<2xi32>, vector<2xi32>, vector<8xf32>) -> vector<8xf32> From 6727d588919a49ae14d0857d2a642099098c9194 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Wed, 16 Apr 2025 14:59:18 -0700 Subject: [PATCH 183/710] [NFC][CFI] Remove unnecessary -rtlib=platform from tests (#136032) Should allow to reland https://github.com/llvm/llvm-project/pull/135981 --- clang/test/Driver/sanitizer-ld.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/test/Driver/sanitizer-ld.c b/clang/test/Driver/sanitizer-ld.c index 67ca33d676d20..0ace06bac3b31 100644 --- a/clang/test/Driver/sanitizer-ld.c +++ b/clang/test/Driver/sanitizer-ld.c @@ -831,7 +831,7 @@ // CFI requirements. // RUN: not %clang -fsanitize=cfi \ -// RUN: --target=x86_64-unknown-linux -fuse-ld=ld -rtlib=platform \ +// RUN: --target=x86_64-unknown-linux -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: -### %s 2>&1 \ @@ -841,7 +841,7 @@ // CFI by itself does not link runtime libraries. 
// RUN: not %clang -fsanitize=cfi \ -// RUN: --target=x86_64-unknown-linux -fuse-ld=ld -rtlib=platform \ +// RUN: --target=x86_64-unknown-linux -fuse-ld=ld \ // RUN: -resource-dir=%S/Inputs/resource_dir \ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: -### %s 2>&1 \ From 7623501c056a38c665ccf718ad318fd16451e4cc Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Wed, 16 Apr 2025 15:04:06 -0700 Subject: [PATCH 184/710] [asan] Fix build on fuchsia (#136042) Does not link after #131756 --- compiler-rt/lib/asan/asan_report.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/compiler-rt/lib/asan/asan_report.cpp b/compiler-rt/lib/asan/asan_report.cpp index e515f20548c00..8b22aebe7cafc 100644 --- a/compiler-rt/lib/asan/asan_report.cpp +++ b/compiler-rt/lib/asan/asan_report.cpp @@ -149,7 +149,9 @@ class ScopedInErrorReport { // 2. Subsequent error reporting avoids nested lock acquisition patterns. // 3. Eliminates the lock order inversion risk between libdl and ASan's // thread registry. +#if !SANITIZER_SYMBOLIZER_MARKUP Symbolizer::GetOrInit()->GetRefreshedListOfModules(); +#endif // Make sure the registry and sanitizer report mutexes are locked while // we're printing an error report. From 697aa9995c24a977425e672d76a4a434384b16e3 Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Wed, 16 Apr 2025 18:17:09 -0400 Subject: [PATCH 185/710] [mlir][SMT] add python bindings (#135674) This PR adds "rich" python bindings to SMT dialect. 
--- mlir/include/mlir-c/Dialect/SMT.h | 69 ++++++++-------- mlir/include/mlir-c/Target/ExportSMTLIB.h | 10 ++- mlir/lib/Bindings/Python/DialectSMT.cpp | 83 +++++++++++++++++++ mlir/lib/CAPI/Dialect/SMT.cpp | 52 ++++++------ mlir/lib/CAPI/Target/ExportSMTLIB.cpp | 21 ++++- mlir/python/CMakeLists.txt | 24 ++++++ mlir/python/mlir/dialects/SMTOps.td | 14 ++++ mlir/python/mlir/dialects/smt.py | 33 ++++++++ mlir/test/CAPI/smt.c | 97 ++++++++++++----------- mlir/test/python/dialects/smt.py | 87 ++++++++++++++++++++ 10 files changed, 378 insertions(+), 112 deletions(-) create mode 100644 mlir/lib/Bindings/Python/DialectSMT.cpp create mode 100644 mlir/python/mlir/dialects/SMTOps.td create mode 100644 mlir/python/mlir/dialects/smt.py create mode 100644 mlir/test/python/dialects/smt.py diff --git a/mlir/include/mlir-c/Dialect/SMT.h b/mlir/include/mlir-c/Dialect/SMT.h index d076dccce1b06..0ad64746f148b 100644 --- a/mlir/include/mlir-c/Dialect/SMT.h +++ b/mlir/include/mlir-c/Dialect/SMT.h @@ -26,82 +26,83 @@ MLIR_DECLARE_CAPI_DIALECT_REGISTRATION(SMT, smt); //===----------------------------------------------------------------------===// /// Checks if the given type is any non-func SMT value type. -MLIR_CAPI_EXPORTED bool smtTypeIsAnyNonFuncSMTValueType(MlirType type); +MLIR_CAPI_EXPORTED bool mlirSMTTypeIsAnyNonFuncSMTValueType(MlirType type); /// Checks if the given type is any SMT value type. -MLIR_CAPI_EXPORTED bool smtTypeIsAnySMTValueType(MlirType type); +MLIR_CAPI_EXPORTED bool mlirSMTTypeIsAnySMTValueType(MlirType type); /// Checks if the given type is a smt::ArrayType. -MLIR_CAPI_EXPORTED bool smtTypeIsAArray(MlirType type); +MLIR_CAPI_EXPORTED bool mlirSMTTypeIsAArray(MlirType type); /// Creates an array type with the given domain and range types. 
-MLIR_CAPI_EXPORTED MlirType smtTypeGetArray(MlirContext ctx, - MlirType domainType, - MlirType rangeType); +MLIR_CAPI_EXPORTED MlirType mlirSMTTypeGetArray(MlirContext ctx, + MlirType domainType, + MlirType rangeType); /// Checks if the given type is a smt::BitVectorType. -MLIR_CAPI_EXPORTED bool smtTypeIsABitVector(MlirType type); +MLIR_CAPI_EXPORTED bool mlirSMTTypeIsABitVector(MlirType type); /// Creates a smt::BitVectorType with the given width. -MLIR_CAPI_EXPORTED MlirType smtTypeGetBitVector(MlirContext ctx, int32_t width); +MLIR_CAPI_EXPORTED MlirType mlirSMTTypeGetBitVector(MlirContext ctx, + int32_t width); /// Checks if the given type is a smt::BoolType. -MLIR_CAPI_EXPORTED bool smtTypeIsABool(MlirType type); +MLIR_CAPI_EXPORTED bool mlirSMTTypeIsABool(MlirType type); /// Creates a smt::BoolType. -MLIR_CAPI_EXPORTED MlirType smtTypeGetBool(MlirContext ctx); +MLIR_CAPI_EXPORTED MlirType mlirSMTTypeGetBool(MlirContext ctx); /// Checks if the given type is a smt::IntType. -MLIR_CAPI_EXPORTED bool smtTypeIsAInt(MlirType type); +MLIR_CAPI_EXPORTED bool mlirSMTTypeIsAInt(MlirType type); /// Creates a smt::IntType. -MLIR_CAPI_EXPORTED MlirType smtTypeGetInt(MlirContext ctx); +MLIR_CAPI_EXPORTED MlirType mlirSMTTypeGetInt(MlirContext ctx); /// Checks if the given type is a smt::FuncType. -MLIR_CAPI_EXPORTED bool smtTypeIsASMTFunc(MlirType type); +MLIR_CAPI_EXPORTED bool mlirSMTTypeIsASMTFunc(MlirType type); /// Creates a smt::FuncType with the given domain and range types. -MLIR_CAPI_EXPORTED MlirType smtTypeGetSMTFunc(MlirContext ctx, - size_t numberOfDomainTypes, - const MlirType *domainTypes, - MlirType rangeType); +MLIR_CAPI_EXPORTED MlirType mlirSMTTypeGetSMTFunc(MlirContext ctx, + size_t numberOfDomainTypes, + const MlirType *domainTypes, + MlirType rangeType); /// Checks if the given type is a smt::SortType. 
-MLIR_CAPI_EXPORTED bool smtTypeIsASort(MlirType type); +MLIR_CAPI_EXPORTED bool mlirSMTTypeIsASort(MlirType type); /// Creates a smt::SortType with the given identifier and sort parameters. -MLIR_CAPI_EXPORTED MlirType smtTypeGetSort(MlirContext ctx, - MlirIdentifier identifier, - size_t numberOfSortParams, - const MlirType *sortParams); +MLIR_CAPI_EXPORTED MlirType mlirSMTTypeGetSort(MlirContext ctx, + MlirIdentifier identifier, + size_t numberOfSortParams, + const MlirType *sortParams); //===----------------------------------------------------------------------===// // Attribute API. //===----------------------------------------------------------------------===// /// Checks if the given string is a valid smt::BVCmpPredicate. -MLIR_CAPI_EXPORTED bool smtAttrCheckBVCmpPredicate(MlirContext ctx, - MlirStringRef str); +MLIR_CAPI_EXPORTED bool mlirSMTAttrCheckBVCmpPredicate(MlirContext ctx, + MlirStringRef str); /// Checks if the given string is a valid smt::IntPredicate. -MLIR_CAPI_EXPORTED bool smtAttrCheckIntPredicate(MlirContext ctx, - MlirStringRef str); +MLIR_CAPI_EXPORTED bool mlirSMTAttrCheckIntPredicate(MlirContext ctx, + MlirStringRef str); /// Checks if the given attribute is a smt::SMTAttribute. -MLIR_CAPI_EXPORTED bool smtAttrIsASMTAttribute(MlirAttribute attr); +MLIR_CAPI_EXPORTED bool mlirSMTAttrIsASMTAttribute(MlirAttribute attr); /// Creates a smt::BitVectorAttr with the given value and width. -MLIR_CAPI_EXPORTED MlirAttribute smtAttrGetBitVector(MlirContext ctx, - uint64_t value, - unsigned width); +MLIR_CAPI_EXPORTED MlirAttribute mlirSMTAttrGetBitVector(MlirContext ctx, + uint64_t value, + unsigned width); /// Creates a smt::BVCmpPredicateAttr with the given string. -MLIR_CAPI_EXPORTED MlirAttribute smtAttrGetBVCmpPredicate(MlirContext ctx, - MlirStringRef str); +MLIR_CAPI_EXPORTED MlirAttribute +mlirSMTAttrGetBVCmpPredicate(MlirContext ctx, MlirStringRef str); /// Creates a smt::IntPredicateAttr with the given string. 
-MLIR_CAPI_EXPORTED MlirAttribute smtAttrGetIntPredicate(MlirContext ctx, - MlirStringRef str); +MLIR_CAPI_EXPORTED MlirAttribute mlirSMTAttrGetIntPredicate(MlirContext ctx, + MlirStringRef str); #ifdef __cplusplus } diff --git a/mlir/include/mlir-c/Target/ExportSMTLIB.h b/mlir/include/mlir-c/Target/ExportSMTLIB.h index 31f411c4a89c2..59beda54d289b 100644 --- a/mlir/include/mlir-c/Target/ExportSMTLIB.h +++ b/mlir/include/mlir-c/Target/ExportSMTLIB.h @@ -21,9 +21,13 @@ extern "C" { /// Emits SMTLIB for the specified module using the provided callback and user /// data -MLIR_CAPI_EXPORTED MlirLogicalResult mlirExportSMTLIB(MlirModule, - MlirStringCallback, - void *userData); +MLIR_CAPI_EXPORTED MlirLogicalResult +mlirTranslateModuleToSMTLIB(MlirModule, MlirStringCallback, void *userData, + bool inlineSingleUseValues, bool indentLetBody); + +MLIR_CAPI_EXPORTED MlirLogicalResult mlirTranslateOperationToSMTLIB( + MlirOperation, MlirStringCallback, void *userData, + bool inlineSingleUseValues, bool indentLetBody); #ifdef __cplusplus } diff --git a/mlir/lib/Bindings/Python/DialectSMT.cpp b/mlir/lib/Bindings/Python/DialectSMT.cpp new file mode 100644 index 0000000000000..4e7647729fb0a --- /dev/null +++ b/mlir/lib/Bindings/Python/DialectSMT.cpp @@ -0,0 +1,83 @@ +//===- DialectSMT.cpp - Pybind module for SMT dialect API support ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "NanobindUtils.h" + +#include "mlir-c/Dialect/SMT.h" +#include "mlir-c/IR.h" +#include "mlir-c/Support.h" +#include "mlir-c/Target/ExportSMTLIB.h" +#include "mlir/Bindings/Python/Diagnostics.h" +#include "mlir/Bindings/Python/Nanobind.h" +#include "mlir/Bindings/Python/NanobindAdaptors.h" + +namespace nb = nanobind; + +using namespace nanobind::literals; + +using namespace mlir; +using namespace mlir::python; +using namespace mlir::python::nanobind_adaptors; + +void populateDialectSMTSubmodule(nanobind::module_ &m) { + + auto smtBoolType = mlir_type_subclass(m, "BoolType", mlirSMTTypeIsABool) + .def_classmethod( + "get", + [](const nb::object &, MlirContext context) { + return mlirSMTTypeGetBool(context); + }, + "cls"_a, "context"_a.none() = nb::none()); + auto smtBitVectorType = + mlir_type_subclass(m, "BitVectorType", mlirSMTTypeIsABitVector) + .def_classmethod( + "get", + [](const nb::object &, int32_t width, MlirContext context) { + return mlirSMTTypeGetBitVector(context, width); + }, + "cls"_a, "width"_a, "context"_a.none() = nb::none()); + + auto exportSMTLIB = [](MlirOperation module, bool inlineSingleUseValues, + bool indentLetBody) { + mlir::python::CollectDiagnosticsToStringScope scope( + mlirOperationGetContext(module)); + PyPrintAccumulator printAccum; + MlirLogicalResult result = mlirTranslateOperationToSMTLIB( + module, printAccum.getCallback(), printAccum.getUserData(), + inlineSingleUseValues, indentLetBody); + if (mlirLogicalResultIsSuccess(result)) + return printAccum.join(); + throw nb::value_error( + ("Failed to export smtlib.\nDiagnostic message " + scope.takeMessage()) + .c_str()); + }; + + m.def( + "export_smtlib", + [&exportSMTLIB](MlirOperation module, bool inlineSingleUseValues, + bool indentLetBody) { + return exportSMTLIB(module, inlineSingleUseValues, indentLetBody); + }, + 
"module"_a, "inline_single_use_values"_a = false, + "indent_let_body"_a = false); + m.def( + "export_smtlib", + [&exportSMTLIB](MlirModule module, bool inlineSingleUseValues, + bool indentLetBody) { + return exportSMTLIB(mlirModuleGetOperation(module), + inlineSingleUseValues, indentLetBody); + }, + "module"_a, "inline_single_use_values"_a = false, + "indent_let_body"_a = false); +} + +NB_MODULE(_mlirDialectsSMT, m) { + m.doc() = "MLIR SMT Dialect"; + + populateDialectSMTSubmodule(m); +} diff --git a/mlir/lib/CAPI/Dialect/SMT.cpp b/mlir/lib/CAPI/Dialect/SMT.cpp index 3a4620df8ccdf..7e96bbb071533 100644 --- a/mlir/lib/CAPI/Dialect/SMT.cpp +++ b/mlir/lib/CAPI/Dialect/SMT.cpp @@ -25,46 +25,49 @@ MLIR_DEFINE_CAPI_DIALECT_REGISTRATION(SMT, smt, mlir::smt::SMTDialect) // Type API. //===----------------------------------------------------------------------===// -bool smtTypeIsAnyNonFuncSMTValueType(MlirType type) { +bool mlirSMTTypeIsAnyNonFuncSMTValueType(MlirType type) { return isAnyNonFuncSMTValueType(unwrap(type)); } -bool smtTypeIsAnySMTValueType(MlirType type) { +bool mlirSMTTypeIsAnySMTValueType(MlirType type) { return isAnySMTValueType(unwrap(type)); } -bool smtTypeIsAArray(MlirType type) { return isa(unwrap(type)); } +bool mlirSMTTypeIsAArray(MlirType type) { return isa(unwrap(type)); } -MlirType smtTypeGetArray(MlirContext ctx, MlirType domainType, - MlirType rangeType) { +MlirType mlirSMTTypeGetArray(MlirContext ctx, MlirType domainType, + MlirType rangeType) { return wrap( ArrayType::get(unwrap(ctx), unwrap(domainType), unwrap(rangeType))); } -bool smtTypeIsABitVector(MlirType type) { +bool mlirSMTTypeIsABitVector(MlirType type) { return isa(unwrap(type)); } -MlirType smtTypeGetBitVector(MlirContext ctx, int32_t width) { +MlirType mlirSMTTypeGetBitVector(MlirContext ctx, int32_t width) { return wrap(BitVectorType::get(unwrap(ctx), width)); } -bool smtTypeIsABool(MlirType type) { return isa(unwrap(type)); } +bool mlirSMTTypeIsABool(MlirType type) { return 
isa(unwrap(type)); } -MlirType smtTypeGetBool(MlirContext ctx) { +MlirType mlirSMTTypeGetBool(MlirContext ctx) { return wrap(BoolType::get(unwrap(ctx))); } -bool smtTypeIsAInt(MlirType type) { return isa(unwrap(type)); } +bool mlirSMTTypeIsAInt(MlirType type) { return isa(unwrap(type)); } -MlirType smtTypeGetInt(MlirContext ctx) { +MlirType mlirSMTTypeGetInt(MlirContext ctx) { return wrap(IntType::get(unwrap(ctx))); } -bool smtTypeIsASMTFunc(MlirType type) { return isa(unwrap(type)); } +bool mlirSMTTypeIsASMTFunc(MlirType type) { + return isa(unwrap(type)); +} -MlirType smtTypeGetSMTFunc(MlirContext ctx, size_t numberOfDomainTypes, - const MlirType *domainTypes, MlirType rangeType) { +MlirType mlirSMTTypeGetSMTFunc(MlirContext ctx, size_t numberOfDomainTypes, + const MlirType *domainTypes, + MlirType rangeType) { SmallVector domainTypesVec; domainTypesVec.reserve(numberOfDomainTypes); @@ -74,10 +77,11 @@ MlirType smtTypeGetSMTFunc(MlirContext ctx, size_t numberOfDomainTypes, return wrap(SMTFuncType::get(unwrap(ctx), domainTypesVec, unwrap(rangeType))); } -bool smtTypeIsASort(MlirType type) { return isa(unwrap(type)); } +bool mlirSMTTypeIsASort(MlirType type) { return isa(unwrap(type)); } -MlirType smtTypeGetSort(MlirContext ctx, MlirIdentifier identifier, - size_t numberOfSortParams, const MlirType *sortParams) { +MlirType mlirSMTTypeGetSort(MlirContext ctx, MlirIdentifier identifier, + size_t numberOfSortParams, + const MlirType *sortParams) { SmallVector sortParamsVec; sortParamsVec.reserve(numberOfSortParams); @@ -91,31 +95,31 @@ MlirType smtTypeGetSort(MlirContext ctx, MlirIdentifier identifier, // Attribute API. 
//===----------------------------------------------------------------------===// -bool smtAttrCheckBVCmpPredicate(MlirContext ctx, MlirStringRef str) { +bool mlirSMTAttrCheckBVCmpPredicate(MlirContext ctx, MlirStringRef str) { return symbolizeBVCmpPredicate(unwrap(str)).has_value(); } -bool smtAttrCheckIntPredicate(MlirContext ctx, MlirStringRef str) { +bool mlirSMTAttrCheckIntPredicate(MlirContext ctx, MlirStringRef str) { return symbolizeIntPredicate(unwrap(str)).has_value(); } -bool smtAttrIsASMTAttribute(MlirAttribute attr) { +bool mlirSMTAttrIsASMTAttribute(MlirAttribute attr) { return isa(unwrap(attr)); } -MlirAttribute smtAttrGetBitVector(MlirContext ctx, uint64_t value, - unsigned width) { +MlirAttribute mlirSMTAttrGetBitVector(MlirContext ctx, uint64_t value, + unsigned width) { return wrap(BitVectorAttr::get(unwrap(ctx), value, width)); } -MlirAttribute smtAttrGetBVCmpPredicate(MlirContext ctx, MlirStringRef str) { +MlirAttribute mlirSMTAttrGetBVCmpPredicate(MlirContext ctx, MlirStringRef str) { auto predicate = symbolizeBVCmpPredicate(unwrap(str)); assert(predicate.has_value() && "invalid predicate"); return wrap(BVCmpPredicateAttr::get(unwrap(ctx), predicate.value())); } -MlirAttribute smtAttrGetIntPredicate(MlirContext ctx, MlirStringRef str) { +MlirAttribute mlirSMTAttrGetIntPredicate(MlirContext ctx, MlirStringRef str) { auto predicate = symbolizeIntPredicate(unwrap(str)); assert(predicate.has_value() && "invalid predicate"); diff --git a/mlir/lib/CAPI/Target/ExportSMTLIB.cpp b/mlir/lib/CAPI/Target/ExportSMTLIB.cpp index c9ac7ce704af8..4326f967281e1 100644 --- a/mlir/lib/CAPI/Target/ExportSMTLIB.cpp +++ b/mlir/lib/CAPI/Target/ExportSMTLIB.cpp @@ -19,9 +19,24 @@ using namespace mlir; -MlirLogicalResult mlirExportSMTLIB(MlirModule module, - MlirStringCallback callback, - void *userData) { +MlirLogicalResult mlirTranslateOperationToSMTLIB(MlirOperation module, + MlirStringCallback callback, + void *userData, + bool inlineSingleUseValues, + bool 
indentLetBody) { mlir::detail::CallbackOstream stream(callback, userData); + smt::SMTEmissionOptions options; + options.inlineSingleUseValues = inlineSingleUseValues; + options.indentLetBody = indentLetBody; return wrap(smt::exportSMTLIB(unwrap(module), stream)); } + +MlirLogicalResult mlirTranslateModuleToSMTLIB(MlirModule module, + MlirStringCallback callback, + void *userData, + bool inlineSingleUseValues, + bool indentLetBody) { + return mlirTranslateOperationToSMTLIB(mlirModuleGetOperation(module), + callback, userData, + inlineSingleUseValues, indentLetBody); +} diff --git a/mlir/python/CMakeLists.txt b/mlir/python/CMakeLists.txt index fb115a5f43423..bbf6819608bb9 100644 --- a/mlir/python/CMakeLists.txt +++ b/mlir/python/CMakeLists.txt @@ -403,6 +403,15 @@ declare_mlir_dialect_python_bindings( "../../include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td" ) +declare_mlir_dialect_python_bindings( + ADD_TO_PARENT MLIRPythonSources.Dialects + ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/mlir" + TD_FILE dialects/SMTOps.td + GEN_ENUM_BINDINGS + SOURCES + dialects/smt.py + DIALECT_NAME smt) + declare_mlir_dialect_python_bindings( ADD_TO_PARENT MLIRPythonSources.Dialects ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/mlir" @@ -664,6 +673,21 @@ declare_mlir_python_extension(MLIRPythonExtension.LinalgPasses MLIRCAPILinalg ) +declare_mlir_python_extension(MLIRPythonExtension.Dialects.SMT.Pybind + MODULE_NAME _mlirDialectsSMT + ADD_TO_PARENT MLIRPythonSources.Dialects.smt + ROOT_DIR "${PYTHON_SOURCE_DIR}" + PYTHON_BINDINGS_LIBRARY nanobind + SOURCES + DialectSMT.cpp + PRIVATE_LINK_LIBS + LLVMSupport + EMBED_CAPI_LINK_LIBS + MLIRCAPIIR + MLIRCAPISMT + MLIRCAPIExportSMTLIB +) + declare_mlir_python_extension(MLIRPythonExtension.SparseTensorDialectPasses MODULE_NAME _mlirSparseTensorPasses ADD_TO_PARENT MLIRPythonSources.Dialects.sparse_tensor diff --git a/mlir/python/mlir/dialects/SMTOps.td b/mlir/python/mlir/dialects/SMTOps.td new file mode 100644 index 0000000000000..e143f071eb658 
--- /dev/null +++ b/mlir/python/mlir/dialects/SMTOps.td @@ -0,0 +1,14 @@ +//===- SMTOps.td - Entry point for SMT bindings ------------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef BINDINGS_PYTHON_SMT_OPS +#define BINDINGS_PYTHON_SMT_OPS + +include "mlir/Dialect/SMT/IR/SMT.td" + +#endif // BINDINGS_PYTHON_SMT_OPS diff --git a/mlir/python/mlir/dialects/smt.py b/mlir/python/mlir/dialects/smt.py new file mode 100644 index 0000000000000..ae7a4c41cbc3a --- /dev/null +++ b/mlir/python/mlir/dialects/smt.py @@ -0,0 +1,33 @@ +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +from ._smt_ops_gen import * + +from .._mlir_libs._mlirDialectsSMT import * +from ..extras.meta import region_op + + +def bool_t(): + return BoolType.get() + + +def bv_t(width): + return BitVectorType.get(width) + + +def _solver( + inputs=None, + results=None, + loc=None, + ip=None, +): + if inputs is None: + inputs = [] + if results is None: + results = [] + + return SolverOp(results, inputs, loc=loc, ip=ip) + + +solver = region_op(_solver, terminator=YieldOp) diff --git a/mlir/test/CAPI/smt.c b/mlir/test/CAPI/smt.c index 77815d4f79657..95a9b55e3209b 100644 --- a/mlir/test/CAPI/smt.c +++ b/mlir/test/CAPI/smt.c @@ -34,7 +34,8 @@ void testExportSMTLIB(MlirContext ctx) { MlirModule module = mlirModuleCreateParse(ctx, mlirStringRefCreateFromCString(testSMT)); - MlirLogicalResult result = mlirExportSMTLIB(module, dumpCallback, NULL); + MlirLogicalResult result = + mlirTranslateModuleToSMTLIB(module, dumpCallback, NULL, false, false); (void)result; 
assert(mlirLogicalResultIsSuccess(result)); @@ -44,13 +45,13 @@ void testExportSMTLIB(MlirContext ctx) { } void testSMTType(MlirContext ctx) { - MlirType boolType = smtTypeGetBool(ctx); - MlirType intType = smtTypeGetInt(ctx); - MlirType arrayType = smtTypeGetArray(ctx, intType, boolType); - MlirType bvType = smtTypeGetBitVector(ctx, 32); + MlirType boolType = mlirSMTTypeGetBool(ctx); + MlirType intType = mlirSMTTypeGetInt(ctx); + MlirType arrayType = mlirSMTTypeGetArray(ctx, intType, boolType); + MlirType bvType = mlirSMTTypeGetBitVector(ctx, 32); MlirType funcType = - smtTypeGetSMTFunc(ctx, 2, (MlirType[]){intType, boolType}, boolType); - MlirType sortType = smtTypeGetSort( + mlirSMTTypeGetSMTFunc(ctx, 2, (MlirType[]){intType, boolType}, boolType); + MlirType sortType = mlirSMTTypeGetSort( ctx, mlirIdentifierGet(ctx, mlirStringRefCreateFromCString("sort")), 0, NULL); @@ -68,107 +69,107 @@ void testSMTType(MlirContext ctx) { mlirTypeDump(sortType); // CHECK: bool_is_any_non_func_smt_value_type - fprintf(stderr, smtTypeIsAnyNonFuncSMTValueType(boolType) + fprintf(stderr, mlirSMTTypeIsAnyNonFuncSMTValueType(boolType) ? "bool_is_any_non_func_smt_value_type\n" : "bool_is_func_smt_value_type\n"); // CHECK: int_is_any_non_func_smt_value_type - fprintf(stderr, smtTypeIsAnyNonFuncSMTValueType(intType) + fprintf(stderr, mlirSMTTypeIsAnyNonFuncSMTValueType(intType) ? "int_is_any_non_func_smt_value_type\n" : "int_is_func_smt_value_type\n"); // CHECK: array_is_any_non_func_smt_value_type - fprintf(stderr, smtTypeIsAnyNonFuncSMTValueType(arrayType) + fprintf(stderr, mlirSMTTypeIsAnyNonFuncSMTValueType(arrayType) ? "array_is_any_non_func_smt_value_type\n" : "array_is_func_smt_value_type\n"); // CHECK: bit_vector_is_any_non_func_smt_value_type - fprintf(stderr, smtTypeIsAnyNonFuncSMTValueType(bvType) + fprintf(stderr, mlirSMTTypeIsAnyNonFuncSMTValueType(bvType) ? 
"bit_vector_is_any_non_func_smt_value_type\n" : "bit_vector_is_func_smt_value_type\n"); // CHECK: sort_is_any_non_func_smt_value_type - fprintf(stderr, smtTypeIsAnyNonFuncSMTValueType(sortType) + fprintf(stderr, mlirSMTTypeIsAnyNonFuncSMTValueType(sortType) ? "sort_is_any_non_func_smt_value_type\n" : "sort_is_func_smt_value_type\n"); // CHECK: smt_func_is_func_smt_value_type - fprintf(stderr, smtTypeIsAnyNonFuncSMTValueType(funcType) + fprintf(stderr, mlirSMTTypeIsAnyNonFuncSMTValueType(funcType) ? "smt_func_is_any_non_func_smt_value_type\n" : "smt_func_is_func_smt_value_type\n"); // CHECK: bool_is_any_smt_value_type - fprintf(stderr, smtTypeIsAnySMTValueType(boolType) + fprintf(stderr, mlirSMTTypeIsAnySMTValueType(boolType) ? "bool_is_any_smt_value_type\n" : "bool_is_not_any_smt_value_type\n"); // CHECK: int_is_any_smt_value_type - fprintf(stderr, smtTypeIsAnySMTValueType(intType) + fprintf(stderr, mlirSMTTypeIsAnySMTValueType(intType) ? "int_is_any_smt_value_type\n" : "int_is_not_any_smt_value_type\n"); // CHECK: array_is_any_smt_value_type - fprintf(stderr, smtTypeIsAnySMTValueType(arrayType) + fprintf(stderr, mlirSMTTypeIsAnySMTValueType(arrayType) ? "array_is_any_smt_value_type\n" : "array_is_not_any_smt_value_type\n"); // CHECK: array_is_any_smt_value_type - fprintf(stderr, smtTypeIsAnySMTValueType(bvType) + fprintf(stderr, mlirSMTTypeIsAnySMTValueType(bvType) ? "array_is_any_smt_value_type\n" : "array_is_not_any_smt_value_type\n"); // CHECK: smt_func_is_any_smt_value_type - fprintf(stderr, smtTypeIsAnySMTValueType(funcType) + fprintf(stderr, mlirSMTTypeIsAnySMTValueType(funcType) ? "smt_func_is_any_smt_value_type\n" : "smt_func_is_not_any_smt_value_type\n"); // CHECK: sort_is_any_smt_value_type - fprintf(stderr, smtTypeIsAnySMTValueType(sortType) + fprintf(stderr, mlirSMTTypeIsAnySMTValueType(sortType) ? "sort_is_any_smt_value_type\n" : "sort_is_not_any_smt_value_type\n"); // CHECK: int_type_is_not_a_bool - fprintf(stderr, smtTypeIsABool(intType) ? 
"int_type_is_a_bool\n" - : "int_type_is_not_a_bool\n"); + fprintf(stderr, mlirSMTTypeIsABool(intType) ? "int_type_is_a_bool\n" + : "int_type_is_not_a_bool\n"); // CHECK: bool_type_is_not_a_int - fprintf(stderr, smtTypeIsAInt(boolType) ? "bool_type_is_a_int\n" - : "bool_type_is_not_a_int\n"); + fprintf(stderr, mlirSMTTypeIsAInt(boolType) ? "bool_type_is_a_int\n" + : "bool_type_is_not_a_int\n"); // CHECK: bv_type_is_not_a_array - fprintf(stderr, smtTypeIsAArray(bvType) ? "bv_type_is_a_array\n" - : "bv_type_is_not_a_array\n"); + fprintf(stderr, mlirSMTTypeIsAArray(bvType) ? "bv_type_is_a_array\n" + : "bv_type_is_not_a_array\n"); // CHECK: array_type_is_not_a_bit_vector - fprintf(stderr, smtTypeIsABitVector(arrayType) + fprintf(stderr, mlirSMTTypeIsABitVector(arrayType) ? "array_type_is_a_bit_vector\n" : "array_type_is_not_a_bit_vector\n"); // CHECK: sort_type_is_not_a_smt_func - fprintf(stderr, smtTypeIsASMTFunc(sortType) + fprintf(stderr, mlirSMTTypeIsASMTFunc(sortType) ? "sort_type_is_a_smt_func\n" : "sort_type_is_not_a_smt_func\n"); // CHECK: func_type_is_not_a_sort - fprintf(stderr, smtTypeIsASort(funcType) ? "func_type_is_a_sort\n" - : "func_type_is_not_a_sort\n"); + fprintf(stderr, mlirSMTTypeIsASort(funcType) ? "func_type_is_a_sort\n" + : "func_type_is_not_a_sort\n"); } void testSMTAttribute(MlirContext ctx) { // CHECK: slt_is_BVCmpPredicate - fprintf(stderr, - smtAttrCheckBVCmpPredicate(ctx, mlirStringRefCreateFromCString("slt")) - ? "slt_is_BVCmpPredicate\n" - : "slt_is_not_BVCmpPredicate\n"); + fprintf(stderr, mlirSMTAttrCheckBVCmpPredicate( + ctx, mlirStringRefCreateFromCString("slt")) + ? "slt_is_BVCmpPredicate\n" + : "slt_is_not_BVCmpPredicate\n"); // CHECK: lt_is_not_BVCmpPredicate - fprintf(stderr, - smtAttrCheckBVCmpPredicate(ctx, mlirStringRefCreateFromCString("lt")) - ? "lt_is_BVCmpPredicate\n" - : "lt_is_not_BVCmpPredicate\n"); + fprintf(stderr, mlirSMTAttrCheckBVCmpPredicate( + ctx, mlirStringRefCreateFromCString("lt")) + ? 
"lt_is_BVCmpPredicate\n" + : "lt_is_not_BVCmpPredicate\n"); // CHECK: slt_is_not_IntPredicate - fprintf(stderr, - smtAttrCheckIntPredicate(ctx, mlirStringRefCreateFromCString("slt")) - ? "slt_is_IntPredicate\n" - : "slt_is_not_IntPredicate\n"); + fprintf(stderr, mlirSMTAttrCheckIntPredicate( + ctx, mlirStringRefCreateFromCString("slt")) + ? "slt_is_IntPredicate\n" + : "slt_is_not_IntPredicate\n"); // CHECK: lt_is_IntPredicate - fprintf(stderr, - smtAttrCheckIntPredicate(ctx, mlirStringRefCreateFromCString("lt")) - ? "lt_is_IntPredicate\n" - : "lt_is_not_IntPredicate\n"); + fprintf(stderr, mlirSMTAttrCheckIntPredicate( + ctx, mlirStringRefCreateFromCString("lt")) + ? "lt_is_IntPredicate\n" + : "lt_is_not_IntPredicate\n"); // CHECK: #smt.bv<5> : !smt.bv<32> - mlirAttributeDump(smtAttrGetBitVector(ctx, 5, 32)); + mlirAttributeDump(mlirSMTAttrGetBitVector(ctx, 5, 32)); // CHECK: 0 : i64 mlirAttributeDump( - smtAttrGetBVCmpPredicate(ctx, mlirStringRefCreateFromCString("slt"))); + mlirSMTAttrGetBVCmpPredicate(ctx, mlirStringRefCreateFromCString("slt"))); // CHECK: 0 : i64 mlirAttributeDump( - smtAttrGetIntPredicate(ctx, mlirStringRefCreateFromCString("lt"))); + mlirSMTAttrGetIntPredicate(ctx, mlirStringRefCreateFromCString("lt"))); } int main(void) { diff --git a/mlir/test/python/dialects/smt.py b/mlir/test/python/dialects/smt.py new file mode 100644 index 0000000000000..6f0cd8835b65b --- /dev/null +++ b/mlir/test/python/dialects/smt.py @@ -0,0 +1,87 @@ +# RUN: %PYTHON %s | FileCheck %s + +from mlir.dialects import smt, arith +from mlir.ir import Context, Location, Module, InsertionPoint, F32Type + + +def run(f): + print("\nTEST:", f.__name__) + with Context(), Location.unknown(): + module = Module.create() + with InsertionPoint(module.body): + f(module) + print(module) + assert module.operation.verify() + + +# CHECK-LABEL: TEST: test_smoke +@run +def test_smoke(_module): + true = smt.constant(True) + false = smt.constant(False) + # CHECK: smt.constant true + # CHECK: 
smt.constant false + + +# CHECK-LABEL: TEST: test_types +@run +def test_types(_module): + bool_t = smt.bool_t() + bitvector_t = smt.bv_t(5) + # CHECK: !smt.bool + print(bool_t) + # CHECK: !smt.bv<5> + print(bitvector_t) + + +# CHECK-LABEL: TEST: test_solver_op +@run +def test_solver_op(_module): + @smt.solver + def foo1(): + true = smt.constant(True) + false = smt.constant(False) + + # CHECK: smt.solver() : () -> () { + # CHECK: %true = smt.constant true + # CHECK: %false = smt.constant false + # CHECK: } + + f32 = F32Type.get() + + @smt.solver(results=[f32]) + def foo2(): + return arith.ConstantOp(f32, 1.0) + + # CHECK: %{{.*}} = smt.solver() : () -> f32 { + # CHECK: %[[CST1:.*]] = arith.constant 1.000000e+00 : f32 + # CHECK: smt.yield %[[CST1]] : f32 + # CHECK: } + + two = arith.ConstantOp(f32, 2.0) + # CHECK: %[[CST2:.*]] = arith.constant 2.000000e+00 : f32 + print(two) + + @smt.solver(inputs=[two], results=[f32]) + def foo3(z: f32): + return z + + # CHECK: %{{.*}} = smt.solver(%[[CST2]]) : (f32) -> f32 { + # CHECK: ^bb0(%[[ARG0:.*]]: f32): + # CHECK: smt.yield %[[ARG0]] : f32 + # CHECK: } + + +# CHECK-LABEL: TEST: test_export_smtlib +@run +def test_export_smtlib(module): + @smt.solver + def foo1(): + true = smt.constant(True) + smt.assert_(true) + + query = smt.export_smtlib(module.operation) + # CHECK: ; solver scope 0 + # CHECK: (assert true) + # CHECK: (reset) + print(query) From 9c98a9801dff79ea7e440f2a45f669be2cdc965b Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Wed, 16 Apr 2025 15:17:47 -0700 Subject: [PATCH 186/710] [NFC][Asan] CRLF to LF in a test --- .../asan/TestCases/asan_lsan_deadlock.cpp | 144 +++++++++--------- 1 file changed, 72 insertions(+), 72 deletions(-) diff --git a/compiler-rt/test/asan/TestCases/asan_lsan_deadlock.cpp b/compiler-rt/test/asan/TestCases/asan_lsan_deadlock.cpp index 4e1a2415ad013..28ce4e3e81b23 100644 --- a/compiler-rt/test/asan/TestCases/asan_lsan_deadlock.cpp +++ b/compiler-rt/test/asan/TestCases/asan_lsan_deadlock.cpp 
@@ -1,72 +1,72 @@ -// Test for potential deadlock in LeakSanitizer+AddressSanitizer. -// REQUIRES: leak-detection -// -// RUN: %clangxx_asan -O0 %s -o %t -// RUN: %env_asan_opts=detect_leaks=1 not %run %t 2>&1 | FileCheck %s - -/* - * Purpose: Verify deadlock prevention between ASan error reporting and LSan leak checking. - * - * Test Design: - * 1. Creates contention scenario between: - * - ASan's error reporting (requires lock B -> lock A ordering) - * - LSan's leak check (requires lock A -> lock B ordering) - * 2. Thread timing: - * - Main thread: Holds 'in' mutex -> Triggers LSan check (lock A then B) - * - Worker thread: Triggers ASan OOB error (lock B then A via symbolization) - * - * Deadlock Condition (if unfixed): - * Circular lock dependency forms when: - * [Main Thread] LSan: lock A -> requests lock B - * [Worker Thread] ASan: lock B -> requests lock A - * - * Success Criteria: - * With proper lock ordering enforcement, watchdog should NOT trigger - test exits normally. - * If deadlock occurs, watchdog terminates via _exit(1) after 10s timeout. - */ - -#include -#include -#include -#include -#include - -void Watchdog() { - // Safety mechanism: Turn infinite deadlock into finite test failure - usleep(10000000); - // CHECK-NOT: Timeout! Deadlock detected. - puts("Timeout! Deadlock detected."); - fflush(stdout); - _exit(1); -} - -int main(int argc, char **argv) { - int arr[1] = {0}; - std::mutex in; - in.lock(); - - std::thread w(Watchdog); - w.detach(); - - std::thread t([&]() { - in.unlock(); - /* - * Provoke ASan error: ASan's error reporting acquires: - * 1. ASan's thread registry lock (B) during the reporting - * 2. dl_iterate_phdr lock (A) during symbolization - */ - // CHECK: SUMMARY: AddressSanitizer: stack-buffer-overflow - arr[argc] = 1; // Deliberate OOB access - }); - - in.lock(); - /* - * Critical section: LSan's check acquires: - * 1. dl_iterate_phdr lock (A) - * 2. ASan's thread registry lock (B) - * before Stop The World. 
- */ - __lsan_do_leak_check(); - t.join(); - return 0; -} +// Test for potential deadlock in LeakSanitizer+AddressSanitizer. +// REQUIRES: leak-detection +// +// RUN: %clangxx_asan -O0 %s -o %t +// RUN: %env_asan_opts=detect_leaks=1 not %run %t 2>&1 | FileCheck %s + +/* + * Purpose: Verify deadlock prevention between ASan error reporting and LSan leak checking. + * + * Test Design: + * 1. Creates contention scenario between: + * - ASan's error reporting (requires lock B -> lock A ordering) + * - LSan's leak check (requires lock A -> lock B ordering) + * 2. Thread timing: + * - Main thread: Holds 'in' mutex -> Triggers LSan check (lock A then B) + * - Worker thread: Triggers ASan OOB error (lock B then A via symbolization) + * + * Deadlock Condition (if unfixed): + * Circular lock dependency forms when: + * [Main Thread] LSan: lock A -> requests lock B + * [Worker Thread] ASan: lock B -> requests lock A + * + * Success Criteria: + * With proper lock ordering enforcement, watchdog should NOT trigger - test exits normally. + * If deadlock occurs, watchdog terminates via _exit(1) after 10s timeout. + */ + +#include +#include +#include +#include +#include + +void Watchdog() { + // Safety mechanism: Turn infinite deadlock into finite test failure + usleep(10000000); + // CHECK-NOT: Timeout! Deadlock detected. + puts("Timeout! Deadlock detected."); + fflush(stdout); + _exit(1); +} + +int main(int argc, char **argv) { + int arr[1] = {0}; + std::mutex in; + in.lock(); + + std::thread w(Watchdog); + w.detach(); + + std::thread t([&]() { + in.unlock(); + /* + * Provoke ASan error: ASan's error reporting acquires: + * 1. ASan's thread registry lock (B) during the reporting + * 2. dl_iterate_phdr lock (A) during symbolization + */ + // CHECK: SUMMARY: AddressSanitizer: stack-buffer-overflow + arr[argc] = 1; // Deliberate OOB access + }); + + in.lock(); + /* + * Critical section: LSan's check acquires: + * 1. dl_iterate_phdr lock (A) + * 2. 
ASan's thread registry lock (B) + * before Stop The World. + */ + __lsan_do_leak_check(); + t.join(); + return 0; +} From 91df4cce44ac33d2d169614c532868c1dde5df51 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Wed, 16 Apr 2025 15:19:54 -0700 Subject: [PATCH 187/710] [NFC][Asan] Disabled test dead-locking on Darwin After #131756. --- compiler-rt/test/asan/TestCases/asan_lsan_deadlock.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/compiler-rt/test/asan/TestCases/asan_lsan_deadlock.cpp b/compiler-rt/test/asan/TestCases/asan_lsan_deadlock.cpp index 28ce4e3e81b23..7ca07edbaf18c 100644 --- a/compiler-rt/test/asan/TestCases/asan_lsan_deadlock.cpp +++ b/compiler-rt/test/asan/TestCases/asan_lsan_deadlock.cpp @@ -4,6 +4,9 @@ // RUN: %clangxx_asan -O0 %s -o %t // RUN: %env_asan_opts=detect_leaks=1 not %run %t 2>&1 | FileCheck %s +// Hangs for unknown reasons. +// UNSUPPORTED: darwin + /* * Purpose: Verify deadlock prevention between ASan error reporting and LSan leak checking. * From e64305096a3d1ec122953be6e7008fbf05467d73 Mon Sep 17 00:00:00 2001 From: David Truby Date: Wed, 16 Apr 2025 23:26:20 +0100 Subject: [PATCH 188/710] [flang] Complete alignment of -x language modes with gfortran (#133775) --- clang/include/clang/Driver/Types.def | 4 ++-- clang/test/Driver/fortran.f95 | 4 ++-- .../Driver/input-from-stdin/input-from-stdin.f90 | 2 +- flang/test/Driver/phases.f90 | 12 ++++++------ flang/test/Driver/pp-fixed-form.f90 | 16 ++++++++-------- 5 files changed, 19 insertions(+), 19 deletions(-) diff --git a/clang/include/clang/Driver/Types.def b/clang/include/clang/Driver/Types.def index 214c5e7a789f9..76944ec656917 100644 --- a/clang/include/clang/Driver/Types.def +++ b/clang/include/clang/Driver/Types.def @@ -88,8 +88,8 @@ TYPE("assembler-with-cpp", Asm, PP_Asm, "S", phases // modules when Flang needs to emit pre-processed files. Therefore, the // `PP_TYPE` is set to `PP_Fortran` so that the driver is fine with // "pre-processing a pre-processed file". 
-TYPE("f95", PP_Fortran, PP_Fortran, "i", phases::Preprocess, phases::Compile, phases::Backend, phases::Assemble, phases::Link) -TYPE("f95-cpp-input", Fortran, PP_Fortran, nullptr, phases::Preprocess, phases::Compile, phases::Backend, phases::Assemble, phases::Link) +TYPE("f95", Fortran, PP_Fortran, nullptr, phases::Preprocess, phases::Compile, phases::Backend, phases::Assemble, phases::Link) +TYPE("f95-cpp-input", PP_Fortran, PP_Fortran, "i", phases::Preprocess, phases::Compile, phases::Backend, phases::Assemble, phases::Link) TYPE("java", Java, INVALID, nullptr, phases::Compile, phases::Backend, phases::Assemble, phases::Link) // LLVM IR/LTO types. We define separate types for IR and LTO because LTO diff --git a/clang/test/Driver/fortran.f95 b/clang/test/Driver/fortran.f95 index 275b1886b2fda..1447312b8bb58 100644 --- a/clang/test/Driver/fortran.f95 +++ b/clang/test/Driver/fortran.f95 @@ -5,14 +5,14 @@ ! RUN: | FileCheck --check-prefix=CHECK-OBJECT %s ! CHECK-OBJECT: gcc ! CHECK-OBJECT: "-c" -! CHECK-OBJECT: "-x" "f95" +! CHECK-OBJECT: "-x" "f95-cpp-input" ! CHECK-OBJECT-NOT: "-cc1as" ! RUN: %clang --target=x86_64-unknown-linux-gnu -integrated-as -S %s -### 2>&1 \ ! RUN: | FileCheck --check-prefix=CHECK-ASM %s ! CHECK-ASM: gcc ! CHECK-ASM: "-S" -! CHECK-ASM: "-x" "f95" +! CHECK-ASM: "-x" "f95-cpp-input" ! CHECK-ASM-NOT: "-cc1" ! RUN: %clang -Wall --target=x86_64-unknown-linux-gnu -integrated-as %s -### 2>&1 | FileCheck --check-prefix=CHECK-WARN %s diff --git a/flang/test/Driver/input-from-stdin/input-from-stdin.f90 b/flang/test/Driver/input-from-stdin/input-from-stdin.f90 index 1fcc0340a64ba..285f0751b35d8 100644 --- a/flang/test/Driver/input-from-stdin/input-from-stdin.f90 +++ b/flang/test/Driver/input-from-stdin/input-from-stdin.f90 @@ -6,7 +6,7 @@ ! Input type is implicit ! RUN: cat %s | %flang -E -cpp - | FileCheck %s --check-prefix=PP-NOT-DEFINED ! RUN: cat %s | %flang -DNEW -E -cpp - | FileCheck %s --check-prefix=PP-DEFINED -! 
RUN: cat %s | %flang -DNEW -E - | FileCheck %s --check-prefix=PP-DEFINED +! RUN: cat %s | %flang -DNEW -E - | FileCheck %s --check-prefix=PP-NOT-DEFINED ! RUN: cat %s | %flang -DNEW -E -nocpp - | FileCheck %s --check-prefix=PP-NOT-DEFINED ! Input type is explicit diff --git a/flang/test/Driver/phases.f90 b/flang/test/Driver/phases.f90 index b688600dae035..9346773c883ef 100644 --- a/flang/test/Driver/phases.f90 +++ b/flang/test/Driver/phases.f90 @@ -4,15 +4,15 @@ ! RUN: %flang -fsyntax-only -ccc-print-phases %s 2>&1 | FileCheck %s --check-prefix=COMPILE ! RUN: %flang -c -ccc-print-phases %s 2>&1 | FileCheck %s --check-prefix=EMIT_OBJ -! PP: +- 0: input, "{{.*}}phases.f90", f95-cpp-input -! PP-NEXT: 1: preprocessor, {0}, f95 +! PP: +- 0: input, "{{.*}}phases.f90", f95 +! PP-NEXT: 1: preprocessor, {0}, f95-cpp-input -! COMPILE: +- 0: input, "{{.*}}phases.f90", f95-cpp-input -! COMPILE-NEXT: 1: preprocessor, {0}, f95 +! COMPILE: +- 0: input, "{{.*}}phases.f90", f95 +! COMPILE-NEXT: 1: preprocessor, {0}, f95-cpp-input ! COMPILE-NEXT: 2: compiler, {1}, none -! EMIT_OBJ: +- 0: input, "{{.*}}phases.f90", f95-cpp-input -! EMIT_OBJ-NEXT: 1: preprocessor, {0}, f95 +! EMIT_OBJ: +- 0: input, "{{.*}}phases.f90", f95 +! EMIT_OBJ-NEXT: 1: preprocessor, {0}, f95-cpp-input ! EMIT_OBJ-NEXT: 2: compiler, {1}, ir ! EMIT_OBJ-NEXT: +- 3: backend, {2}, assembler ! 
EMIT_OBJ-NEXT: 4: assembler, {3}, object diff --git a/flang/test/Driver/pp-fixed-form.f90 b/flang/test/Driver/pp-fixed-form.f90 index 4695da78763ae..bb869cd3341a7 100644 --- a/flang/test/Driver/pp-fixed-form.f90 +++ b/flang/test/Driver/pp-fixed-form.f90 @@ -1,19 +1,19 @@ !RUN: %flang -save-temps -### %S/Inputs/free-form-test.f90 2>&1 | FileCheck %s --check-prefix=FREE -FREE: "-fc1" {{.*}} "-o" "free-form-test.i" {{.*}} "-x" "f95-cpp-input" "{{.*}}/free-form-test.f90" -FREE-NEXT: "-fc1" {{.*}} "-ffixed-form" {{.*}} "-x" "f95" "free-form-test.i" +FREE: "-fc1" {{.*}} "-o" "free-form-test.i" {{.*}} "-x" "f95" "{{.*}}/free-form-test.f90" +FREE-NEXT: "-fc1" {{.*}} "-ffixed-form" {{.*}} "-x" "f95-cpp-input" "free-form-test.i" !RUN: %flang -save-temps -### %S/Inputs/fixed-form-test.f 2>&1 | FileCheck %s --check-prefix=FIXED -FIXED: "-fc1" {{.*}} "-o" "fixed-form-test.i" {{.*}} "-x" "f95-cpp-input" "{{.*}}/fixed-form-test.f" -FIXED-NEXT: "-fc1" {{.*}} "-ffixed-form" {{.*}} "-x" "f95" "fixed-form-test.i" +FIXED: "-fc1" {{.*}} "-o" "fixed-form-test.i" {{.*}} "-x" "f95" "{{.*}}/fixed-form-test.f" +FIXED-NEXT: "-fc1" {{.*}} "-ffixed-form" {{.*}} "-x" "f95-cpp-input" "fixed-form-test.i" !RUN: %flang -save-temps -### -ffree-form %S/Inputs/free-form-test.f90 2>&1 | FileCheck %s --check-prefix=FREE-FLAG -FREE-FLAG: "-fc1" {{.*}} "-o" "free-form-test.i" {{.*}} "-x" "f95-cpp-input" "{{.*}}/free-form-test.f90" +FREE-FLAG: "-fc1" {{.*}} "-o" "free-form-test.i" {{.*}} "-x" "f95" "{{.*}}/free-form-test.f90" FREE-FLAG-NEXT: "-fc1" {{.*}} "-emit-llvm-bc" "-ffree-form" FREE-FLAG-NOT: "-ffixed-form" -FREE-FLAG-SAME: "-x" "f95" "free-form-test.i" +FREE-FLAG-SAME: "-x" "f95-cpp-input" "free-form-test.i" !RUN: %flang -save-temps -### -ffixed-form %S/Inputs/fixed-form-test.f 2>&1 | FileCheck %s --check-prefix=FIXED-FLAG -FIXED-FLAG: "-fc1" {{.*}} "-o" "fixed-form-test.i" {{.*}} "-x" "f95-cpp-input" "{{.*}}/fixed-form-test.f" +FIXED-FLAG: "-fc1" {{.*}} "-o" "fixed-form-test.i" {{.*}} "-x" "f95" 
"{{.*}}/fixed-form-test.f" FIXED-FLAG-NEXT: "-fc1" {{.*}} "-emit-llvm-bc" "-ffixed-form" FIXED-FLAG-NOT: "-ffixed-form" -FIXED-FLAG-SAME: "-x" "f95" "fixed-form-test.i" +FIXED-FLAG-SAME: "-x" "f95-cpp-input" "fixed-form-test.i" From 8ebdd9d8a19543992195f197de215c53d506fb72 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 16 Apr 2025 15:40:34 -0700 Subject: [PATCH 189/710] Reapply "[LLVM][TableGen] Parameterize NumToSkip in DecoderEmitter" (#136017) (#136019) This reverts commit 7fd0c8acd4659ccd0aef5486afe32c8ddf0f2957, and fixes the assert condition in `patchNumToSkip`. --- llvm/lib/Target/AArch64/CMakeLists.txt | 2 +- llvm/test/TableGen/VarLenDecoder.td | 4 +- llvm/test/TableGen/trydecode-emission.td | 10 +- llvm/test/TableGen/trydecode-emission2.td | 16 +-- llvm/test/TableGen/trydecode-emission3.td | 2 +- llvm/test/TableGen/trydecode-emission4.td | 2 +- llvm/utils/TableGen/DecoderEmitter.cpp | 115 ++++++++++++---------- 7 files changed, 83 insertions(+), 68 deletions(-) diff --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt index 2300e479bc110..ba1d1605ec104 100644 --- a/llvm/lib/Target/AArch64/CMakeLists.txt +++ b/llvm/lib/Target/AArch64/CMakeLists.txt @@ -7,7 +7,7 @@ tablegen(LLVM AArch64GenAsmWriter.inc -gen-asm-writer) tablegen(LLVM AArch64GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1) tablegen(LLVM AArch64GenCallingConv.inc -gen-callingconv) tablegen(LLVM AArch64GenDAGISel.inc -gen-dag-isel) -tablegen(LLVM AArch64GenDisassemblerTables.inc -gen-disassembler) +tablegen(LLVM AArch64GenDisassemblerTables.inc -gen-disassembler --num-to-skip-size=3) tablegen(LLVM AArch64GenFastISel.inc -gen-fast-isel) tablegen(LLVM AArch64GenGlobalISel.inc -gen-global-isel) tablegen(LLVM AArch64GenO0PreLegalizeGICombiner.inc -gen-global-isel-combiner diff --git a/llvm/test/TableGen/VarLenDecoder.td b/llvm/test/TableGen/VarLenDecoder.td index 5cf0bf8911859..b77702ff7c5c1 100644 --- a/llvm/test/TableGen/VarLenDecoder.td +++ 
b/llvm/test/TableGen/VarLenDecoder.td @@ -47,9 +47,9 @@ def FOO32 : MyVarInst { } // CHECK: MCD::OPC_ExtractField, 3, 5, // Inst{7-3} ... -// CHECK-NEXT: MCD::OPC_FilterValue, 8, 4, 0, 0, // Skip to: 12 +// CHECK-NEXT: MCD::OPC_FilterValue, 8, 4, 0, // Skip to: 11 // CHECK-NEXT: MCD::OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 0, // Opcode: FOO16 -// CHECK-NEXT: MCD::OPC_FilterValue, 9, 4, 0, 0, // Skip to: 21 +// CHECK-NEXT: MCD::OPC_FilterValue, 9, 4, 0, // Skip to: 19 // CHECK-NEXT: MCD::OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 1, // Opcode: FOO32 // CHECK-NEXT: MCD::OPC_Fail, diff --git a/llvm/test/TableGen/trydecode-emission.td b/llvm/test/TableGen/trydecode-emission.td index 20d2446eeac7f..2b4239f4fbe65 100644 --- a/llvm/test/TableGen/trydecode-emission.td +++ b/llvm/test/TableGen/trydecode-emission.td @@ -34,10 +34,10 @@ def InstB : TestInstruction { } // CHECK: /* 0 */ MCD::OPC_ExtractField, 4, 4, // Inst{7-4} ... -// CHECK-NEXT: /* 3 */ MCD::OPC_FilterValue, 0, 18, 0, 0, // Skip to: 26 -// CHECK-NEXT: /* 8 */ MCD::OPC_CheckField, 2, 2, 0, 7, 0, 0, // Skip to: 22 -// CHECK-NEXT: /* 15 */ MCD::OPC_TryDecode, {{[0-9]+}}, {{[0-9]+}}, 0, 0, 0, 0, // Opcode: InstB, skip to: 22 -// CHECK-NEXT: /* 22 */ MCD::OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 1, // Opcode: InstA -// CHECK-NEXT: /* 26 */ MCD::OPC_Fail, +// CHECK-NEXT: /* 3 */ MCD::OPC_FilterValue, 0, 16, 0, // Skip to: 23 +// CHECK-NEXT: /* 7 */ MCD::OPC_CheckField, 2, 2, 0, 6, 0, // Skip to: 19 +// CHECK-NEXT: /* 13 */ MCD::OPC_TryDecode, {{[0-9]+}}, {{[0-9]+}}, 0, 0, 0, // Opcode: InstB, skip to: 19 +// CHECK-NEXT: /* 19 */ MCD::OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 1, // Opcode: InstA +// CHECK-NEXT: /* 23 */ MCD::OPC_Fail, // CHECK: if (!Check(S, DecodeInstB(MI, insn, Address, Decoder))) { DecodeComplete = false; return MCDisassembler::Fail; } diff --git a/llvm/test/TableGen/trydecode-emission2.td b/llvm/test/TableGen/trydecode-emission2.td index 0584034e41233..7d30474058f73 100644 --- 
a/llvm/test/TableGen/trydecode-emission2.td +++ b/llvm/test/TableGen/trydecode-emission2.td @@ -31,14 +31,14 @@ def InstB : TestInstruction { } // CHECK: /* 0 */ MCD::OPC_ExtractField, 2, 1, // Inst{2} ... -// CHECK-NEXT: /* 3 */ MCD::OPC_FilterValue, 0, 36, 0, 0, // Skip to: 44 -// CHECK-NEXT: /* 8 */ MCD::OPC_ExtractField, 5, 3, // Inst{7-5} ... -// CHECK-NEXT: /* 11 */ MCD::OPC_FilterValue, 0, 28, 0, 0, // Skip to: 44 -// CHECK-NEXT: /* 16 */ MCD::OPC_CheckField, 0, 2, 3, 7, 0, 0, // Skip to: 30 -// CHECK-NEXT: /* 23 */ MCD::OPC_TryDecode, {{[0-9]+}}, {{[0-9]+}}, 0, 0, 0, 0, // Opcode: InstB, skip to: 30 -// CHECK-NEXT: /* 30 */ MCD::OPC_CheckField, 3, 2, 0, 7, 0, 0, // Skip to: 44 -// CHECK-NEXT: /* 37 */ MCD::OPC_TryDecode, {{[0-9]+}}, {{[0-9]+}}, 1, 0, 0, 0, // Opcode: InstA, skip to: 44 -// CHECK-NEXT: /* 44 */ MCD::OPC_Fail, +// CHECK-NEXT: /* 3 */ MCD::OPC_FilterValue, 0, 31, 0, // Skip to: 38 +// CHECK-NEXT: /* 7 */ MCD::OPC_ExtractField, 5, 3, // Inst{7-5} ... +// CHECK-NEXT: /* 10 */ MCD::OPC_FilterValue, 0, 24, 0, // Skip to: 38 +// CHECK-NEXT: /* 14 */ MCD::OPC_CheckField, 0, 2, 3, 6, 0, // Skip to: 26 +// CHECK-NEXT: /* 20 */ MCD::OPC_TryDecode, {{[0-9]+}}, {{[0-9]+}}, 0, 0, 0, // Opcode: InstB, skip to: 26 +// CHECK-NEXT: /* 26 */ MCD::OPC_CheckField, 3, 2, 0, 6, 0, // Skip to: 38 +// CHECK-NEXT: /* 32 */ MCD::OPC_TryDecode, {{[0-9]+}}, {{[0-9]+}}, 1, 0, 0, // Opcode: InstA, skip to: 38 +// CHECK-NEXT: /* 38 */ MCD::OPC_Fail, // CHECK: if (!Check(S, DecodeInstB(MI, insn, Address, Decoder))) { DecodeComplete = false; return MCDisassembler::Fail; } // CHECK: if (!Check(S, DecodeInstA(MI, insn, Address, Decoder))) { DecodeComplete = false; return MCDisassembler::Fail; } diff --git a/llvm/test/TableGen/trydecode-emission3.td b/llvm/test/TableGen/trydecode-emission3.td index 4c5be7e1af229..0abbe62fe337e 100644 --- a/llvm/test/TableGen/trydecode-emission3.td +++ b/llvm/test/TableGen/trydecode-emission3.td @@ -1,4 +1,4 @@ -// RUN: llvm-tblgen 
-gen-disassembler -I %p/../../include %s | FileCheck %s + // RUN: llvm-tblgen -gen-disassembler --num-to-skip-size=3 -I %p/../../include %s | FileCheck %s include "llvm/Target/Target.td" diff --git a/llvm/test/TableGen/trydecode-emission4.td b/llvm/test/TableGen/trydecode-emission4.td index 1e51ba5e40768..413e4a0d1275a 100644 --- a/llvm/test/TableGen/trydecode-emission4.td +++ b/llvm/test/TableGen/trydecode-emission4.td @@ -1,4 +1,4 @@ -// RUN: llvm-tblgen -gen-disassembler -I %p/../../include %s | FileCheck %s +// RUN: llvm-tblgen -gen-disassembler --num-to-skip-size=3 -I %p/../../include %s | FileCheck %s // Test for OPC_ExtractField/OPC_CheckField with start bit > 255. // These large start values may arise for architectures with long instruction diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp index 9c6015cc24576..eff63c6b45bb3 100644 --- a/llvm/utils/TableGen/DecoderEmitter.cpp +++ b/llvm/utils/TableGen/DecoderEmitter.cpp @@ -32,8 +32,10 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/LEB128.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" @@ -76,6 +78,12 @@ static cl::opt DecoderEmitterSuppressDuplicates( "significantly reducing Table Duplications")), cl::init(SUPPRESSION_DISABLE), cl::cat(DisassemblerEmitterCat)); +static cl::opt + NumToSkipSizeInBytes("num-to-skip-size", + cl::desc("number of bytes to use for num-to-skip " + "entries in the decoder table (2 or 3)"), + cl::init(2), cl::cat(DisassemblerEmitterCat)); + STATISTIC(NumEncodings, "Number of encodings considered"); STATISTIC(NumEncodingsLackingDisasm, "Number of encodings without disassembler info"); @@ -130,10 +138,29 @@ struct DecoderTable : public std::vector { // in the table for 
patching. size_t insertNumToSkip() { size_t Size = size(); - insert(end(), 3, 0); + insert(end(), NumToSkipSizeInBytes, 0); return Size; } + + void patchNumToSkip(size_t FixupIdx, uint32_t DestIdx) { + // Calculate the distance from the byte following the fixup entry byte + // to the destination. The Target is calculated from after the + // `NumToSkipSizeInBytes`-byte NumToSkip entry itself, so subtract + // `NumToSkipSizeInBytes` from the displacement here to account for that. + assert(DestIdx >= FixupIdx + NumToSkipSizeInBytes && + "Expecting a forward jump in the decoding table"); + uint32_t Delta = DestIdx - FixupIdx - NumToSkipSizeInBytes; + if (!isUIntN(8 * NumToSkipSizeInBytes, Delta)) + PrintFatalError( + "disassembler decoding table too large, try --num-to-skip-size=3"); + + (*this)[FixupIdx] = static_cast(Delta); + (*this)[FixupIdx + 1] = static_cast(Delta >> 8); + if (NumToSkipSizeInBytes == 3) + (*this)[FixupIdx + 2] = static_cast(Delta >> 16); + } }; + struct DecoderTableInfo { DecoderTable Table; FixupScopeList FixupStack; @@ -690,19 +717,8 @@ static void resolveTableFixups(DecoderTable &Table, const FixupList &Fixups, uint32_t DestIdx) { // Any NumToSkip fixups in the current scope can resolve to the // current location. - for (uint32_t FixupIdx : reverse(Fixups)) { - // Calculate the distance from the byte following the fixup entry byte - // to the destination. The Target is calculated from after the 24-bit - // NumToSkip entry itself, so subtract three from the displacement here - // to account for that. - uint32_t Delta = DestIdx - FixupIdx - 3; - // Our NumToSkip entries are 24-bits. Make sure our table isn't too - // big. 
- assert(isUInt<24>(Delta)); - Table[FixupIdx] = (uint8_t)Delta; - Table[FixupIdx + 1] = (uint8_t)(Delta >> 8); - Table[FixupIdx + 2] = (uint8_t)(Delta >> 16); - } + for (uint32_t FixupIdx : Fixups) + Table.patchNumToSkip(FixupIdx, DestIdx); } // Emit table entries to decode instructions given a segment or segments @@ -759,15 +775,9 @@ void Filter::emitTableEntry(DecoderTableInfo &TableInfo) const { Delegate->emitTableEntries(TableInfo); // Now that we've emitted the body of the handler, update the NumToSkip - // of the filter itself to be able to skip forward when false. Subtract - // three as to account for the width of the NumToSkip field itself. - if (PrevFilter) { - uint32_t NumToSkip = Table.size() - PrevFilter - 3; - assert(isUInt<24>(NumToSkip) && "disassembler decoding table too large!"); - Table[PrevFilter] = (uint8_t)NumToSkip; - Table[PrevFilter + 1] = (uint8_t)(NumToSkip >> 8); - Table[PrevFilter + 2] = (uint8_t)(NumToSkip >> 16); - } + // of the filter itself to be able to skip forward when false. + if (PrevFilter) + Table.patchNumToSkip(PrevFilter, Table.size()); } // If there is no fallthrough, then the final filter should get fixed @@ -814,7 +824,8 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table, OS << (unsigned)*I++ << ", "; }; - // Emit 24-bit numtoskip value to OS, returning the NumToSkip value. + // Emit `NumToSkipSizeInBytes`-byte numtoskip value to OS, returning the + // NumToSkip value. 
auto emitNumToSkip = [](DecoderTable::const_iterator &I, formatted_raw_ostream &OS) { uint8_t Byte = *I++; @@ -823,9 +834,11 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table, Byte = *I++; OS << (unsigned)Byte << ", "; NumToSkip |= Byte << 8; - Byte = *I++; - OS << (unsigned)(Byte) << ", "; - NumToSkip |= Byte << 16; + if (NumToSkipSizeInBytes == 3) { + Byte = *I++; + OS << (unsigned)(Byte) << ", "; + NumToSkip |= Byte << 16; + } return NumToSkip; }; @@ -867,7 +880,7 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table, // The filter value is ULEB128 encoded. emitULEB128(I, OS); - // 24-bit numtoskip value. + // numtoskip value. uint32_t NumToSkip = emitNumToSkip(I, OS); OS << "// Skip to: " << ((I - Table.begin()) + NumToSkip) << "\n"; break; @@ -883,7 +896,7 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table, // ULEB128 encoded field value. emitULEB128(I, OS); - // 24-bit numtoskip value. + // numtoskip value. uint32_t NumToSkip = emitNumToSkip(I, OS); OS << "// Skip to: " << ((I - Table.begin()) + NumToSkip) << "\n"; break; @@ -893,7 +906,7 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table, OS << Indent << "MCD::OPC_CheckPredicate, "; emitULEB128(I, OS); - // 24-bit numtoskip value. + // numtoskip value. uint32_t NumToSkip = emitNumToSkip(I, OS); OS << "// Skip to: " << ((I - Table.begin()) + NumToSkip) << "\n"; break; @@ -925,7 +938,7 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table, // Fallthrough for OPC_TryDecode. - // 24-bit numtoskip value. + // numtoskip value. 
uint32_t NumToSkip = emitNumToSkip(I, OS); OS << "// Opcode: " << NumberedEncodings[EncodingID] @@ -1411,9 +1424,9 @@ void FilterChooser::emitSingletonTableEntry(DecoderTableInfo &TableInfo, TableInfo.Table.push_back(NumBits); TableInfo.Table.insertULEB128(Ilnd.FieldVal); - // The fixup is always 24-bits, so go ahead and allocate the space - // in the table so all our relative position calculations work OK even - // before we fully resolve the real value here. + // Allocate space in the table for fixup (NumToSkipSizeInBytes) so all + // our relative position calculations work OK even before we fully + // resolve the real value here. // Push location for NumToSkip backpatching. TableInfo.FixupStack.back().push_back(TableInfo.Table.insertNumToSkip()); @@ -2157,7 +2170,18 @@ insertBits(InsnType &field, uint64_t bits, unsigned startBit, unsigned numBits) // decodeInstruction(). static void emitDecodeInstruction(formatted_raw_ostream &OS, bool IsVarLenInst) { + OS << formatv("\nconstexpr unsigned NumToSkipSizeInBytes = {};\n", + NumToSkipSizeInBytes); + OS << R"( +inline unsigned decodeNumToSkip(const uint8_t *&Ptr) { + unsigned NumToSkip = *Ptr++; + NumToSkip |= (*Ptr++) << 8; + if constexpr (NumToSkipSizeInBytes == 3) + NumToSkip |= (*Ptr++) << 16; + return NumToSkip; +} + template static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI, InsnType insn, uint64_t Address, @@ -2195,10 +2219,7 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI, // Decode the field value. uint64_t Val = decodeULEB128AndIncUnsafe(++Ptr); bool Failed = Val != CurFieldValue; - // NumToSkip is a plain 24-bit integer. - unsigned NumToSkip = *Ptr++; - NumToSkip |= (*Ptr++) << 8; - NumToSkip |= (*Ptr++) << 16; + unsigned NumToSkip = decodeNumToSkip(Ptr); // Perform the filter operation. 
if (Failed) @@ -2222,10 +2243,7 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI, uint64_t ExpectedValue = decodeULEB128(++Ptr, &PtrLen); Ptr += PtrLen; bool Failed = ExpectedValue != FieldValue; - // NumToSkip is a plain 24-bit integer. - unsigned NumToSkip = *Ptr++; - NumToSkip |= (*Ptr++) << 8; - NumToSkip |= (*Ptr++) << 16; + unsigned NumToSkip = decodeNumToSkip(Ptr); // If the actual and expected values don't match, skip. if (Failed) @@ -2240,10 +2258,7 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI, case MCD::OPC_CheckPredicate: { // Decode the Predicate Index value. unsigned PIdx = decodeULEB128AndIncUnsafe(++Ptr); - // NumToSkip is a plain 24-bit integer. - unsigned NumToSkip = *Ptr++; - NumToSkip |= (*Ptr++) << 8; - NumToSkip |= (*Ptr++) << 16; + unsigned NumToSkip = decodeNumToSkip(Ptr); // Check the predicate. bool Failed = !checkDecoderPredicate(PIdx, Bits); if (Failed) @@ -2278,10 +2293,7 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI, // Decode the Opcode value. unsigned Opc = decodeULEB128AndIncUnsafe(++Ptr); unsigned DecodeIdx = decodeULEB128AndIncUnsafe(Ptr); - // NumToSkip is a plain 24-bit integer. - unsigned NumToSkip = *Ptr++; - NumToSkip |= (*Ptr++) << 8; - NumToSkip |= (*Ptr++) << 16; + unsigned NumToSkip = decodeNumToSkip(Ptr); // Perform the decode operation. MCInst TmpMI; @@ -2406,6 +2418,9 @@ handleHwModesUnrelatedEncodings(const CodeGenInstruction *Instr, // Emits disassembler code for instruction decoding. 
void DecoderEmitter::run(raw_ostream &o) { + if (NumToSkipSizeInBytes != 2 && NumToSkipSizeInBytes != 3) + PrintFatalError("Invalid value for num-to-skip-size, must be 2 or 3"); + formatted_raw_ostream OS(o); OS << R"( #include "llvm/MC/MCInst.h" From d35bf17e8a0cf37959149257c0eda7b9f912390e Mon Sep 17 00:00:00 2001 From: Justin Bogner Date: Wed, 16 Apr 2025 15:45:18 -0700 Subject: [PATCH 190/710] [HLSL] Add a warning for implicit bindings (#135909) Implicit bindings will cause very confusing crashes in the backend at present, so this is intended at least partially as a stop gap until we get them implemented (see #110722). However, I do think that this is useful in the longer term as well as an off-by-default warning, as it is quite easy to miss a binding or two when using explicit bindings and the results of that can be surprisingly hard to debug. I've filed #135907 to track turning this into an off-by-default warning or removing it eventually as we see fit. --- clang/include/clang/Basic/DiagnosticGroups.td | 3 ++ .../clang/Basic/DiagnosticSemaKinds.td | 1 + clang/lib/Parse/ParseHLSL.cpp | 2 +- clang/lib/Sema/SemaHLSL.cpp | 13 ++++++ .../test/AST/HLSL/ByteAddressBuffers-AST.hlsl | 12 +++--- clang/test/AST/HLSL/OutArgExpr.hlsl | 6 +-- .../test/AST/HLSL/StructuredBuffers-AST.hlsl | 20 ++++----- clang/test/AST/HLSL/TypedBuffers-AST.hlsl | 10 ++--- .../AST/HLSL/ast-dump-comment-cbuffer.hlsl | 4 +- clang/test/AST/HLSL/cbuffer.hlsl | 8 ++-- .../test/AST/HLSL/cbuffer_and_namespaces.hlsl | 8 ++-- clang/test/AST/HLSL/default_cbuffer.hlsl | 4 +- ...d_resource_element_compatible_concept.hlsl | 2 +- ...d_resource_element_compatible_concept.hlsl | 2 +- clang/test/AST/HLSL/packoffset.hlsl | 2 +- clang/test/AST/HLSL/pch.hlsl | 9 ++-- clang/test/AST/HLSL/pch_hlsl_buffer.hlsl | 13 +++--- clang/test/AST/HLSL/pch_with_buf.hlsl | 7 ++-- clang/test/AST/HLSL/private.hlsl | 2 +- .../CodeGenHLSL/GlobalConstructorLib.hlsl | 4 +- .../test/CodeGenHLSL/GlobalConstructors.hlsl | 2 +- 
.../AppendStructuredBuffer-elementtype.hlsl | 4 +- .../ConsumeStructuredBuffer-elementtype.hlsl | 4 +- .../builtins/RWBuffer-elementtype.hlsl | 4 +- .../builtins/RWBuffer-subscript.hlsl | 4 +- .../RWStructuredBuffer-elementtype.hlsl | 2 +- ...erOrderedStructuredBuffer-elementtype.hlsl | 2 +- .../StructuredBuffer-elementtype.hlsl | 2 +- .../StructuredBuffers-methods-ps.hlsl | 4 +- .../StructuredBuffers-subscripts.hlsl | 2 +- clang/test/CodeGenHLSL/cbuffer.hlsl | 3 +- clang/test/CodeGenHLSL/cbuffer_align.hlsl | 4 +- .../CodeGenHLSL/cbuffer_and_namespaces.hlsl | 4 +- ...uffer_with_static_global_and_function.hlsl | 3 +- clang/test/CodeGenHLSL/default_cbuffer.hlsl | 5 +-- .../default_cbuffer_with_layout.hlsl | 7 ++-- .../implicit-norecurse-attrib.hlsl | 4 +- clang/test/CodeGenHLSL/inline-functions.hlsl | 12 +++--- clang/test/CodeGenHLSL/static-local-ctor.hlsl | 3 +- clang/test/ParserHLSL/cb_error.hlsl | 2 +- .../ParserHLSL/hlsl_is_rov_attr_error.hlsl | 2 +- .../hlsl_raw_buffer_attr_error.hlsl | 2 +- .../ParserHLSL/hlsl_resource_class_attr.hlsl | 2 +- .../hlsl_resource_handle_attrs.hlsl | 2 +- clang/test/ParserHLSL/invalid_inside_cb.hlsl | 2 +- clang/test/SemaHLSL/BuiltIns/RWBuffers.hlsl | 2 +- .../SemaHLSL/BuiltIns/StructuredBuffers.hlsl | 2 +- .../SemaHLSL/BuiltIns/hlsl_resource_t.hlsl | 2 +- .../Language/AggregateSplatCast-errors.hlsl | 4 +- clang/test/SemaHLSL/cb_error.hlsl | 2 +- clang/test/SemaHLSL/export.hlsl | 2 +- clang/test/SemaHLSL/packoffset-invalid.hlsl | 2 +- .../SemaHLSL/resource_binding_attr_error.hlsl | 4 +- .../resource_binding_attr_error_basic.hlsl | 2 +- .../resource_binding_attr_error_resource.hlsl | 2 +- ...urce_binding_attr_error_silence_diags.hlsl | 4 +- .../resource_binding_attr_error_space.hlsl | 4 +- .../resource_binding_attr_error_udt.hlsl | 2 +- .../SemaHLSL/resource_binding_implicit.hlsl | 41 +++++++++++++++++++ 59 files changed, 168 insertions(+), 126 deletions(-) create mode 100644 clang/test/SemaHLSL/resource_binding_implicit.hlsl diff 
--git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index d97bbfee2e4d5..b234d60fee8fc 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -1612,6 +1612,9 @@ def HLSLExtension : DiagGroup<"hlsl-extensions", [HLSL202y]>; // Warning for mix packoffset and non-packoffset. def HLSLMixPackOffset : DiagGroup<"mix-packoffset">; +// Warning for implicit resource bindings. +def HLSLImplicitBinding : DiagGroup<"hlsl-implicit-binding">; + // Warnings for DXIL validation def DXILValidation : DiagGroup<"dxil-validation">; diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 3f7499d8656bd..6cbe8b60fe9bf 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -12734,6 +12734,7 @@ def warn_hlsl_deprecated_register_type_i: Warning<"binding type 'i' ignored. The def err_hlsl_unsupported_register_number : Error<"register number should be an integer">; def err_hlsl_expected_space : Error<"invalid space specifier '%0' used; expected 'space' followed by an integer, like space1">; def err_hlsl_space_on_global_constant : Error<"register space cannot be specified on global constants">; +def warn_hlsl_implicit_binding : Warning<"resource has implicit register binding">, InGroup, DefaultError; def warn_hlsl_packoffset_mix : Warning<"cannot mix packoffset elements with nonpackoffset elements in a cbuffer">, InGroup; def err_hlsl_packoffset_overlap : Error<"packoffset overlap between %0, %1">; diff --git a/clang/lib/Parse/ParseHLSL.cpp b/clang/lib/Parse/ParseHLSL.cpp index f4c109f9a81a2..b832f7b6792d9 100644 --- a/clang/lib/Parse/ParseHLSL.cpp +++ b/clang/lib/Parse/ParseHLSL.cpp @@ -75,6 +75,7 @@ Decl *Parser::ParseHLSLBuffer(SourceLocation &DeclEnd) { Decl *D = Actions.HLSL().ActOnStartBuffer(getCurScope(), IsCBuffer, BufferLoc, Identifier, IdentifierLoc, 
T.getOpenLocation()); + Actions.ProcessDeclAttributeList(Actions.CurScope, D, Attrs); while (Tok.isNot(tok::r_brace) && Tok.isNot(tok::eof)) { // FIXME: support attribute on constants inside cbuffer/tbuffer. @@ -98,7 +99,6 @@ Decl *Parser::ParseHLSLBuffer(SourceLocation &DeclEnd) { BufferScope.Exit(); Actions.HLSL().ActOnFinishBuffer(D, DeclEnd); - Actions.ProcessDeclAttributeList(Actions.CurScope, D, Attrs); return D; } diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 0b442b75d174d..76fb81d39d67c 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -305,6 +305,10 @@ static bool isResourceRecordTypeOrArrayOf(const Type *Ty) { return HLSLAttributedResourceType::findHandleTypeOnResource(Ty) != nullptr; } +static bool isResourceRecordTypeOrArrayOf(VarDecl *VD) { + return isResourceRecordTypeOrArrayOf(VD->getType().getTypePtr()); +} + // Returns true if the type is a leaf element type that is not valid to be // included in HLSL Buffer, such as a resource class, empty struct, zero-sized // array, or a builtin intangible type. 
Returns false it is a valid leaf element @@ -541,6 +545,10 @@ void SemaHLSL::ActOnFinishBuffer(Decl *Dcl, SourceLocation RBrace) { // create buffer layout struct createHostLayoutStructForBuffer(SemaRef, BufDecl); + if (std::none_of(Dcl->attr_begin(), Dcl->attr_end(), + [](Attr *A) { return isa(A); })) + SemaRef.Diag(Dcl->getLocation(), diag::warn_hlsl_implicit_binding); + SemaRef.PopDeclContext(); } @@ -3248,10 +3256,12 @@ void SemaHLSL::collectResourceBindingsOnVarDecl(VarDecl *VD) { void SemaHLSL::processExplicitBindingsOnDecl(VarDecl *VD) { assert(VD->hasGlobalStorage() && "expected global variable"); + bool HasBinding = false; for (Attr *A : VD->attrs()) { HLSLResourceBindingAttr *RBA = dyn_cast(A); if (!RBA) continue; + HasBinding = true; RegisterType RT = RBA->getRegisterType(); assert(RT != RegisterType::I && "invalid or obsolete register type should " @@ -3278,6 +3288,9 @@ void SemaHLSL::processExplicitBindingsOnDecl(VarDecl *VD) { << static_cast(RT); } } + + if (!HasBinding && isResourceRecordTypeOrArrayOf(VD)) + SemaRef.Diag(VD->getLocation(), diag::warn_hlsl_implicit_binding); } static bool CastInitializer(Sema &S, ASTContext &Ctx, Expr *E, diff --git a/clang/test/AST/HLSL/ByteAddressBuffers-AST.hlsl b/clang/test/AST/HLSL/ByteAddressBuffers-AST.hlsl index 38e5b6281c42e..3a11aff5f651a 100644 --- a/clang/test/AST/HLSL/ByteAddressBuffers-AST.hlsl +++ b/clang/test/AST/HLSL/ByteAddressBuffers-AST.hlsl @@ -1,24 +1,24 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump -DEMPTY \ +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump -DEMPTY \ // RUN: -DRESOURCE=ByteAddressBuffer %s | FileCheck -DRESOURCE=ByteAddressBuffer \ // RUN: -check-prefix=EMPTY %s // -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump \ +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump \ // RUN: -DRESOURCE=ByteAddressBuffer %s | 
FileCheck -DRESOURCE=ByteAddressBuffer \ // RUN: -check-prefixes=CHECK,CHECK-SRV,CHECK-NOSUBSCRIPT %s // -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump -DEMPTY \ +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump -DEMPTY \ // RUN: -DRESOURCE=RWByteAddressBuffer %s | FileCheck -DRESOURCE=RWByteAddressBuffer \ // RUN: -check-prefix=EMPTY %s // -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump \ +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump \ // RUN: -DRESOURCE=RWByteAddressBuffer %s | FileCheck -DRESOURCE=RWByteAddressBuffer \ // RUN: -check-prefixes=CHECK,CHECK-UAV,CHECK-NOSUBSCRIPT %s // -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump -DEMPTY \ +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump -DEMPTY \ // RUN: -DRESOURCE=RasterizerOrderedByteAddressBuffer %s | FileCheck -DRESOURCE=RasterizerOrderedByteAddressBuffer \ // RUN: -check-prefix=EMPTY %s // -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump \ +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump \ // RUN: -DRESOURCE=RasterizerOrderedByteAddressBuffer %s | FileCheck -DRESOURCE=RasterizerOrderedByteAddressBuffer \ // RUN: -check-prefixes=CHECK,CHECK-UAV,CHECK-NOSUBSCRIPT %s diff --git a/clang/test/AST/HLSL/OutArgExpr.hlsl b/clang/test/AST/HLSL/OutArgExpr.hlsl index b07c2efadbf4a..1cdb05418a3dd 100644 --- a/clang/test/AST/HLSL/OutArgExpr.hlsl +++ b/clang/test/AST/HLSL/OutArgExpr.hlsl @@ -1,7 +1,7 @@ // RUN: rm -f %t.pch -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -emit-pch -finclude-default-header -o %t.pch %s -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -finclude-default-header -include-pch %t.pch %s -ast-dump | FileCheck --check-prefix=AST %s -// RUN: 
%clang_cc1 -triple dxil-pc-shadermodel6.0-compute -finclude-default-header -include-pch %t.pch %s -ast-print | FileCheck %s +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.0-compute -emit-pch -finclude-default-header -o %t.pch %s +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.0-compute -finclude-default-header -include-pch %t.pch %s -ast-dump | FileCheck --check-prefix=AST %s +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.0-compute -finclude-default-header -include-pch %t.pch %s -ast-print | FileCheck %s #ifndef TEST_HLSL diff --git a/clang/test/AST/HLSL/StructuredBuffers-AST.hlsl b/clang/test/AST/HLSL/StructuredBuffers-AST.hlsl index dcead068f481e..5c4716a6b6ad0 100644 --- a/clang/test/AST/HLSL/StructuredBuffers-AST.hlsl +++ b/clang/test/AST/HLSL/StructuredBuffers-AST.hlsl @@ -1,40 +1,40 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump -DEMPTY \ +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump -DEMPTY \ // RUN: -DRESOURCE=StructuredBuffer %s | FileCheck -DRESOURCE=StructuredBuffer \ // RUN: -check-prefix=EMPTY %s // -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump \ +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump \ // RUN: -DRESOURCE=StructuredBuffer %s | FileCheck -DRESOURCE=StructuredBuffer \ // RUN: -check-prefixes=CHECK,CHECK-SRV,CHECK-SUBSCRIPT,CHECK-LOAD %s // -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump -DEMPTY \ +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump -DEMPTY \ // RUN: -DRESOURCE=RWStructuredBuffer %s | FileCheck -DRESOURCE=RWStructuredBuffer \ // RUN: -check-prefix=EMPTY %s // -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump \ +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple 
dxil-pc-shadermodel6.0-library -x hlsl -ast-dump \ // RUN: -DRESOURCE=RWStructuredBuffer %s | FileCheck -DRESOURCE=RWStructuredBuffer \ // RUN: -check-prefixes=CHECK,CHECK-UAV,CHECK-SUBSCRIPT,CHECK-COUNTER,CHECK-LOAD %s // -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump -DEMPTY \ +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump -DEMPTY \ // RUN: -DRESOURCE=AppendStructuredBuffer %s | FileCheck -DRESOURCE=AppendStructuredBuffer \ // RUN: -check-prefix=EMPTY %s // -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump \ +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump \ // RUN: -DRESOURCE=AppendStructuredBuffer %s | FileCheck -DRESOURCE=AppendStructuredBuffer \ // RUN: -check-prefixes=CHECK,CHECK-UAV,CHECK-NOSUBSCRIPT,CHECK-APPEND %s // -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump -DEMPTY \ +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump -DEMPTY \ // RUN: -DRESOURCE=ConsumeStructuredBuffer %s | FileCheck -DRESOURCE=ConsumeStructuredBuffer \ // RUN: -check-prefix=EMPTY %s // -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump \ +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump \ // RUN: -DRESOURCE=ConsumeStructuredBuffer %s | FileCheck -DRESOURCE=ConsumeStructuredBuffer \ // RUN: -check-prefixes=CHECK,CHECK-UAV,CHECK-NOSUBSCRIPT,CHECK-CONSUME %s // -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump -DEMPTY \ +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump -DEMPTY \ // RUN: -DRESOURCE=RasterizerOrderedStructuredBuffer %s | FileCheck -DRESOURCE=RasterizerOrderedStructuredBuffer \ // RUN: -check-prefix=EMPTY %s // -// RUN: %clang_cc1 -triple 
dxil-pc-shadermodel6.0-library -x hlsl -ast-dump \ +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump \ // RUN: -DRESOURCE=RasterizerOrderedStructuredBuffer %s | FileCheck -DRESOURCE=RasterizerOrderedStructuredBuffer \ // RUN: -check-prefixes=CHECK,CHECK-UAV,CHECK-ROV,CHECK-SUBSCRIPT,CHECK-LOAD %s diff --git a/clang/test/AST/HLSL/TypedBuffers-AST.hlsl b/clang/test/AST/HLSL/TypedBuffers-AST.hlsl index f665b06d691e8..fc345b79e4aa0 100644 --- a/clang/test/AST/HLSL/TypedBuffers-AST.hlsl +++ b/clang/test/AST/HLSL/TypedBuffers-AST.hlsl @@ -1,7 +1,7 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump -DEMPTY \ +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump -DEMPTY \ // RUN: -DRESOURCE=RWBuffer %s | FileCheck -DRESOURCE=RWBuffer -check-prefix=EMPTY %s // -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump \ +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump \ // RUN: -DRESOURCE=RWBuffer %s | FileCheck -DRESOURCE=RWBuffer \ // RUN: -check-prefixes=CHECK,CHECK-UAV %s @@ -66,7 +66,7 @@ RESOURCE Buffer; // CHECK-NEXT: MemberExpr {{.*}} '__hlsl_resource_t // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] // CHECK-SAME{LITERAL}: [[hlsl::contained_type(element_type)]] -// CHECK-SAME: ' lvalue .__handle {{.*}} +// CHECK-SAME: ' lvalue .__handle {{.*}} // CHECK-NEXT: CXXThisExpr {{.*}} 'const [[RESOURCE]]' lvalue implicit this // CHECK-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'Index' 'unsigned int' // CHECK-NEXT: AlwaysInlineAttr {{.*}} Implicit always_inline @@ -81,7 +81,7 @@ RESOURCE Buffer; // CHECK-NEXT: MemberExpr {{.*}} '__hlsl_resource_t // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] // CHECK-SAME{LITERAL}: [[hlsl::contained_type(element_type)]] -// CHECK-SAME: ' lvalue .__handle {{.*}} +// CHECK-SAME: ' lvalue .__handle {{.*}} // 
CHECK-NEXT: CXXThisExpr {{.*}} '[[RESOURCE]]' lvalue implicit this // CHECK-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'Index' 'unsigned int' // CHECK-NEXT: AlwaysInlineAttr {{.*}} Implicit always_inline @@ -96,7 +96,7 @@ RESOURCE Buffer; // CHECK-NEXT: MemberExpr {{.*}} '__hlsl_resource_t // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] // CHECK-SAME{LITERAL}: [[hlsl::contained_type(element_type)]] -// CHECK-SAME: ' lvalue .__handle {{.*}} +// CHECK-SAME: ' lvalue .__handle {{.*}} // CHECK-NEXT: CXXThisExpr {{.*}} '[[RESOURCE]]' lvalue implicit this // CHECK-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'Index' 'unsigned int' // CHECK-NEXT: AlwaysInlineAttr {{.*}} Implicit always_inline diff --git a/clang/test/AST/HLSL/ast-dump-comment-cbuffer.hlsl b/clang/test/AST/HLSL/ast-dump-comment-cbuffer.hlsl index 37946b7cedea2..4cca9cc742c07 100644 --- a/clang/test/AST/HLSL/ast-dump-comment-cbuffer.hlsl +++ b/clang/test/AST/HLSL/ast-dump-comment-cbuffer.hlsl @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -Wdocumentation -ast-dump=json -x hlsl -triple dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefix=JSON -// RUN: %clang_cc1 -Wdocumentation -ast-dump -x hlsl -triple dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefix=AST +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -Wdocumentation -ast-dump=json -x hlsl -triple dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefix=JSON +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -Wdocumentation -ast-dump -x hlsl -triple dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefix=AST // JSON:"kind": "HLSLBufferDecl", // JSON:"name": "A", diff --git a/clang/test/AST/HLSL/cbuffer.hlsl b/clang/test/AST/HLSL/cbuffer.hlsl index 5c5aa6fc5ab10..726183821e57f 100644 --- a/clang/test/AST/HLSL/cbuffer.hlsl +++ b/clang/test/AST/HLSL/cbuffer.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -ast-dump -o - %s | FileCheck %s +// RUN: %clang_cc1 
-Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.3-library -ast-dump -o - %s | FileCheck %s struct EmptyStruct { }; @@ -55,14 +55,14 @@ cbuffer CB { } _Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(OneFloat, __cblayout_CB), ""); -// Check that buffer layout struct does not include resources or empty types +// Check that buffer layout struct does not include resources or empty types // CHECK: HLSLBufferDecl {{.*}} line:[[# @LINE + 2]]:9 cbuffer CB // CHECK: HLSLResourceClassAttr {{.*}} Implicit CBuffer cbuffer CB { // CHECK: VarDecl {{.*}} used a2 'hlsl_constant float' float a2; // CHECK: VarDecl {{.*}} b2 'RWBuffer':'hlsl::RWBuffer' - RWBuffer b2; + RWBuffer b2; // CHECK: VarDecl {{.*}} c2 'EmptyStruct' EmptyStruct c2; // CHECK: VarDecl {{.*}} d2 'float[0]' @@ -123,7 +123,7 @@ _Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(TwoFloats, __cblay // check that layout struct is created for E because because its base struct // is empty and should be eliminated, and BTypedef should reuse the previously -// defined '__cblayout_B' +// defined '__cblayout_B' // CHECK: HLSLBufferDecl {{.*}} line:[[# @LINE + 2]]:9 cbuffer CB // CHECK: HLSLResourceClassAttr {{.*}} Implicit CBuffer cbuffer CB { diff --git a/clang/test/AST/HLSL/cbuffer_and_namespaces.hlsl b/clang/test/AST/HLSL/cbuffer_and_namespaces.hlsl index b2860fe07216e..3589535923241 100644 --- a/clang/test/AST/HLSL/cbuffer_and_namespaces.hlsl +++ b/clang/test/AST/HLSL/cbuffer_and_namespaces.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -ast-dump -o - %s | FileCheck %s +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.3-library -ast-dump -o - %s | FileCheck %s // CHECK: CXXRecordDecl {{.*}} struct EmptyStruct definition struct EmptyStruct { @@ -7,11 +7,11 @@ struct EmptyStruct { // CHECK: NamespaceDecl {{.*}} NS1 namespace NS1 { // CHECK: CXXRecordDecl {{.*}} struct Foo definition - struct Foo { + struct Foo { float a; EmptyStruct 
es; }; - + // CHECK: CXXRecordDecl {{.*}} struct Bar definition struct Bar { // CHECK: CXXRecordDecl {{.*}} struct Foo definition @@ -56,7 +56,7 @@ struct CB1ExpectedShape { _Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(CB1ExpectedShape, __cblayout_CB1), ""); namespace NS2 { - struct Foo { + struct Foo { float d[4]; EmptyStruct es; }; diff --git a/clang/test/AST/HLSL/default_cbuffer.hlsl b/clang/test/AST/HLSL/default_cbuffer.hlsl index 9e0fce7cc53cf..03b3ebd0d9f2b 100644 --- a/clang/test/AST/HLSL/default_cbuffer.hlsl +++ b/clang/test/AST/HLSL/default_cbuffer.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -ast-dump -o - %s | FileCheck %s +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.3-library -ast-dump -o - %s | FileCheck %s struct EmptyStruct { }; @@ -14,7 +14,7 @@ struct S { float a; // CHECK: VarDecl {{.*}} b 'RWBuffer':'hlsl::RWBuffer' -RWBuffer b; +RWBuffer b; // CHECK: VarDecl {{.*}} c 'EmptyStruct' EmptyStruct c; diff --git a/clang/test/AST/HLSL/is_structured_resource_element_compatible_concept.hlsl b/clang/test/AST/HLSL/is_structured_resource_element_compatible_concept.hlsl index a4f6e6c44794e..38c28b8e65394 100644 --- a/clang/test/AST/HLSL/is_structured_resource_element_compatible_concept.hlsl +++ b/clang/test/AST/HLSL/is_structured_resource_element_compatible_concept.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump -ast-dump-filter=__is_structured_resource_element_compatible %s | FileCheck %s +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump -ast-dump-filter=__is_structured_resource_element_compatible %s | FileCheck %s // CHECK: ConceptDecl {{.*}} __is_structured_resource_element_compatible // CHECK: |-TemplateTypeParmDecl {{.*}} referenced typename depth 0 index 0 element_type diff --git a/clang/test/AST/HLSL/is_typed_resource_element_compatible_concept.hlsl 
b/clang/test/AST/HLSL/is_typed_resource_element_compatible_concept.hlsl index 24a57624e2e9d..e4aefd5ac6aca 100644 --- a/clang/test/AST/HLSL/is_typed_resource_element_compatible_concept.hlsl +++ b/clang/test/AST/HLSL/is_typed_resource_element_compatible_concept.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump -ast-dump-filter=__is_typed_resource_element_compatible %s | FileCheck %s +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump -ast-dump-filter=__is_typed_resource_element_compatible %s | FileCheck %s // CHECK: ConceptDecl {{.*}} __is_typed_resource_element_compatible // CHECK: |-TemplateTypeParmDecl {{.*}} referenced typename depth 0 index 0 element_type diff --git a/clang/test/AST/HLSL/packoffset.hlsl b/clang/test/AST/HLSL/packoffset.hlsl index 1f57540aeac2f..4fe8aed5cb31a 100644 --- a/clang/test/AST/HLSL/packoffset.hlsl +++ b/clang/test/AST/HLSL/packoffset.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple dxil-unknown-shadermodel6.3-library -S -finclude-default-header -fnative-half-type -ast-dump -x hlsl %s | FileCheck %s +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-unknown-shadermodel6.3-library -S -finclude-default-header -fnative-half-type -ast-dump -x hlsl %s | FileCheck %s // CHECK: HLSLBufferDecl {{.*}} cbuffer A diff --git a/clang/test/AST/HLSL/pch.hlsl b/clang/test/AST/HLSL/pch.hlsl index 483af0f5b4c79..353f894b8bc25 100644 --- a/clang/test/AST/HLSL/pch.hlsl +++ b/clang/test/AST/HLSL/pch.hlsl @@ -1,12 +1,9 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl \ -// RUN: -finclude-default-header -emit-pch -o %t %S/Inputs/pch.hlsl -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl \ -// RUN: -finclude-default-header -include-pch %t -ast-dump-all %s \ -// RUN: | FileCheck %s +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.3-library -x hlsl -finclude-default-header -emit-pch -o %t 
%S/Inputs/pch.hlsl +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.3-library -x hlsl -finclude-default-header -include-pch %t -ast-dump-all %s | FileCheck %s // Make sure PCH works by using function declared in PCH header and declare a RWBuffer in current file. // CHECK:FunctionDecl 0x[[FOO:[0-9a-f]+]] <{{.*}}:2:1, line:4:1> line:2:8 imported used foo 'float2 (float2, float2)' -// CHECK:VarDecl 0x{{[0-9a-f]+}} <{{.*}}:10:1, col:23> col:23 Buffer 'hlsl::RWBuffer' +// CHECK:VarDecl 0x{{[0-9a-f]+}} <{{.*}}:{{[0-9]+}}:1, col:23> col:23 Buffer 'hlsl::RWBuffer' hlsl::RWBuffer Buffer; float2 bar(float2 a, float2 b) { diff --git a/clang/test/AST/HLSL/pch_hlsl_buffer.hlsl b/clang/test/AST/HLSL/pch_hlsl_buffer.hlsl index bfb89b4833677..9c73f587b7210 100644 --- a/clang/test/AST/HLSL/pch_hlsl_buffer.hlsl +++ b/clang/test/AST/HLSL/pch_hlsl_buffer.hlsl @@ -1,8 +1,5 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl \ -// RUN: -emit-pch -o %t %s -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl \ -// RUN: -include-pch %t -ast-dump-all %S/Inputs/empty.hlsl \ -// RUN: | FileCheck %s +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-pch -o %t %s +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.3-library -x hlsl -include-pch %t -ast-dump-all %S/Inputs/empty.hlsl | FileCheck %s cbuffer A { float a; @@ -17,19 +14,19 @@ float foo() { } // Make sure cbuffer/tbuffer works for PCH. 
-// CHECK: HLSLBufferDecl {{.*}} line:7:9 imported cbuffer A +// CHECK: HLSLBufferDecl {{.*}} line:{{[0-9]+}}:9 imported cbuffer A // CHECK-NEXT: HLSLResourceClassAttr {{.*}} Implicit CBuffer // CHECK-NEXT: VarDecl 0x[[A:[0-9a-f]+]] {{.*}} imported used a 'hlsl_constant float' // CHECK-NEXT: CXXRecordDecl {{.*}} imported implicit struct __cblayout_A definition // CHECK: FieldDecl {{.*}} imported a 'float' -// CHECK: HLSLBufferDecl {{.*}} line:11:9 imported tbuffer B +// CHECK: HLSLBufferDecl {{.*}} line:{{[0-9]+}}:9 imported tbuffer B // CHECK-NEXT: HLSLResourceClassAttr {{.*}} Implicit SRV // CHECK-NEXT: VarDecl 0x[[B:[0-9a-f]+]] {{.*}} imported used b 'hlsl_constant float' // CHECK-NEXT: CXXRecordDecl 0x{{[0-9a-f]+}} {{.*}} imported implicit struct __cblayout_B definition // CHECK: FieldDecl 0x{{[0-9a-f]+}} {{.*}} imported b 'float' -// CHECK-NEXT: FunctionDecl {{.*}} line:15:7 imported foo 'float ()' +// CHECK-NEXT: FunctionDecl {{.*}} line:{{[0-9]+}}:7 imported foo 'float ()' // CHECK-NEXT: CompoundStmt {{.*}} // CHECK-NEXT: ReturnStmt {{.*}} // CHECK-NEXT: BinaryOperator {{.*}} 'float' '+' diff --git a/clang/test/AST/HLSL/pch_with_buf.hlsl b/clang/test/AST/HLSL/pch_with_buf.hlsl index 7fb5e2a3812ea..4a0bced6bbc33 100644 --- a/clang/test/AST/HLSL/pch_with_buf.hlsl +++ b/clang/test/AST/HLSL/pch_with_buf.hlsl @@ -1,13 +1,12 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -finclude-default-header -emit-pch -o %t %S/Inputs/pch_with_buf.hlsl -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl \ -// RUN: -finclude-default-header -include-pch %t -ast-dump-all %s | FileCheck %s +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.3-library -x hlsl -finclude-default-header -emit-pch -o %t %S/Inputs/pch_with_buf.hlsl +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.3-library -x hlsl -finclude-default-header -include-pch %t -ast-dump-all %s | FileCheck %s // Make sure PCH works by using 
function declared in PCH header. // CHECK:FunctionDecl 0x[[FOO:[0-9a-f]+]] <{{.*}}:2:1, line:4:1> line:2:8 imported used foo 'float2 (float2, float2)' // Make sure buffer defined in PCH works. // CHECK:VarDecl 0x{{[0-9a-f]+}} col:17 imported Buf 'RWBuffer' // Make sure declare a RWBuffer in current file works. -// CHECK:VarDecl 0x{{[0-9a-f]+}} <{{.*}}:11:1, col:23> col:23 Buf2 'hlsl::RWBuffer' +// CHECK:VarDecl 0x{{[0-9a-f]+}} <{{.*}}:{{[0-9]+}}:1, col:23> col:23 Buf2 'hlsl::RWBuffer' hlsl::RWBuffer Buf2; float2 bar(float2 a, float2 b) { diff --git a/clang/test/AST/HLSL/private.hlsl b/clang/test/AST/HLSL/private.hlsl index e00afb8f5cbd8..4fd04792eaec0 100644 --- a/clang/test/AST/HLSL/private.hlsl +++ b/clang/test/AST/HLSL/private.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -ast-dump -o - %s | FileCheck %s +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.3-library -ast-dump -o - %s | FileCheck %s // CHECK: VarDecl {{.*}} global_scalar 'hlsl_private int' static cinit static int global_scalar = 0; diff --git a/clang/test/CodeGenHLSL/GlobalConstructorLib.hlsl b/clang/test/CodeGenHLSL/GlobalConstructorLib.hlsl index 09c44f6242c53..b798c2a6d6c4b 100644 --- a/clang/test/CodeGenHLSL/GlobalConstructorLib.hlsl +++ b/clang/test/CodeGenHLSL/GlobalConstructorLib.hlsl @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s --check-prefixes=CHECK,NOINLINE -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -emit-llvm -O0 %s -o - | FileCheck %s --check-prefixes=CHECK,INLINE +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.3-library -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s --check-prefixes=CHECK,NOINLINE +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.3-library -emit-llvm -O0 %s -o - | FileCheck %s --check-prefixes=CHECK,INLINE // Make sure global variable for ctors exist 
for lib profile. // CHECK:@llvm.global_ctors diff --git a/clang/test/CodeGenHLSL/GlobalConstructors.hlsl b/clang/test/CodeGenHLSL/GlobalConstructors.hlsl index 7b26dba0d1901..9d0d9d32aaea0 100644 --- a/clang/test/CodeGenHLSL/GlobalConstructors.hlsl +++ b/clang/test/CodeGenHLSL/GlobalConstructors.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.0-compute -x hlsl -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s RWBuffer Buffer; diff --git a/clang/test/CodeGenHLSL/builtins/AppendStructuredBuffer-elementtype.hlsl b/clang/test/CodeGenHLSL/builtins/AppendStructuredBuffer-elementtype.hlsl index 85face8eaeb6c..7fc01e59d3f8c 100644 --- a/clang/test/CodeGenHLSL/builtins/AppendStructuredBuffer-elementtype.hlsl +++ b/clang/test/CodeGenHLSL/builtins/AppendStructuredBuffer-elementtype.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -emit-llvm -o - %s | FileCheck %s -check-prefixes=DXIL +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -emit-llvm -o - %s | FileCheck %s -check-prefixes=DXIL struct MyStruct { float4 a; @@ -11,7 +11,7 @@ struct MyStruct { // DXIL: %"class.hlsl::AppendStructuredBuffer.2" = type { target("dx.RawBuffer", i32, 1, 0) // DXIL: %"class.hlsl::AppendStructuredBuffer.3" = type { target("dx.RawBuffer", i64, 1, 0) // DXIL: %"class.hlsl::AppendStructuredBuffer.4" = type { target("dx.RawBuffer", i64, 1, 0) -// DXIL: %"class.hlsl::AppendStructuredBuffer.5" = type { target("dx.RawBuffer", half, 1, 0) +// DXIL: %"class.hlsl::AppendStructuredBuffer.5" = type { target("dx.RawBuffer", half, 1, 0) // DXIL: %"class.hlsl::AppendStructuredBuffer.6" = type { target("dx.RawBuffer", float, 1, 0) // DXIL: %"class.hlsl::AppendStructuredBuffer.7" = type 
{ target("dx.RawBuffer", double, 1, 0) // DXIL: %"class.hlsl::AppendStructuredBuffer.8" = type { target("dx.RawBuffer", <4 x i16>, 1, 0) diff --git a/clang/test/CodeGenHLSL/builtins/ConsumeStructuredBuffer-elementtype.hlsl b/clang/test/CodeGenHLSL/builtins/ConsumeStructuredBuffer-elementtype.hlsl index 5ed9e9ad8160f..338d73e9a3890 100644 --- a/clang/test/CodeGenHLSL/builtins/ConsumeStructuredBuffer-elementtype.hlsl +++ b/clang/test/CodeGenHLSL/builtins/ConsumeStructuredBuffer-elementtype.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -emit-llvm -o - %s | FileCheck %s -check-prefixes=DXIL +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -emit-llvm -o - %s | FileCheck %s -check-prefixes=DXIL struct MyStruct { float4 a; @@ -11,7 +11,7 @@ struct MyStruct { // DXIL: %"class.hlsl::ConsumeStructuredBuffer.2" = type { target("dx.RawBuffer", i32, 1, 0) // DXIL: %"class.hlsl::ConsumeStructuredBuffer.3" = type { target("dx.RawBuffer", i64, 1, 0) // DXIL: %"class.hlsl::ConsumeStructuredBuffer.4" = type { target("dx.RawBuffer", i64, 1, 0) -// DXIL: %"class.hlsl::ConsumeStructuredBuffer.5" = type { target("dx.RawBuffer", half, 1, 0) +// DXIL: %"class.hlsl::ConsumeStructuredBuffer.5" = type { target("dx.RawBuffer", half, 1, 0) // DXIL: %"class.hlsl::ConsumeStructuredBuffer.6" = type { target("dx.RawBuffer", float, 1, 0) // DXIL: %"class.hlsl::ConsumeStructuredBuffer.7" = type { target("dx.RawBuffer", double, 1, 0) // DXIL: %"class.hlsl::ConsumeStructuredBuffer.8" = type { target("dx.RawBuffer", <4 x i16>, 1, 0) diff --git a/clang/test/CodeGenHLSL/builtins/RWBuffer-elementtype.hlsl b/clang/test/CodeGenHLSL/builtins/RWBuffer-elementtype.hlsl index 0944ad59d5fb5..a8b6e543d82bd 100644 --- a/clang/test/CodeGenHLSL/builtins/RWBuffer-elementtype.hlsl +++ b/clang/test/CodeGenHLSL/builtins/RWBuffer-elementtype.hlsl @@ -1,5 +1,5 @@ 
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -emit-llvm -o - %s | FileCheck %s -check-prefixes=DXIL -// RUN: %clang_cc1 -triple spirv-pc-vulkan-compute -finclude-default-header -fnative-half-type -emit-llvm -o - %s | FileCheck %s -check-prefixes=SPIRV +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -emit-llvm -o - %s | FileCheck %s -check-prefixes=DXIL +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple spirv-pc-vulkan-compute -finclude-default-header -fnative-half-type -emit-llvm -o - %s | FileCheck %s -check-prefixes=SPIRV // DXIL: %"class.hlsl::RWBuffer" = type { target("dx.TypedBuffer", i16, 1, 0, 1) } // DXIL: %"class.hlsl::RWBuffer.0" = type { target("dx.TypedBuffer", i16, 1, 0, 0) } diff --git a/clang/test/CodeGenHLSL/builtins/RWBuffer-subscript.hlsl b/clang/test/CodeGenHLSL/builtins/RWBuffer-subscript.hlsl index 2ad5b82a02912..6f296b5609c67 100644 --- a/clang/test/CodeGenHLSL/builtins/RWBuffer-subscript.hlsl +++ b/clang/test/CodeGenHLSL/builtins/RWBuffer-subscript.hlsl @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -emit-llvm -o - -O0 %s | FileCheck %s --check-prefixes=DXC,CHECK -// RUN: %clang_cc1 -triple spirv1.6-pc-vulkan1.3-compute -emit-llvm -o - -O0 %s | FileCheck %s --check-prefixes=SPIRV,CHECK +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.0-compute -emit-llvm -o - -O0 %s | FileCheck %s --check-prefixes=DXC,CHECK +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple spirv1.6-pc-vulkan1.3-compute -emit-llvm -o - -O0 %s | FileCheck %s --check-prefixes=SPIRV,CHECK RWBuffer In; RWBuffer Out; diff --git a/clang/test/CodeGenHLSL/builtins/RWStructuredBuffer-elementtype.hlsl b/clang/test/CodeGenHLSL/builtins/RWStructuredBuffer-elementtype.hlsl index f2f6956ce1541..2fdeca2dda6fb 100644 --- a/clang/test/CodeGenHLSL/builtins/RWStructuredBuffer-elementtype.hlsl 
+++ b/clang/test/CodeGenHLSL/builtins/RWStructuredBuffer-elementtype.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -emit-llvm -o - %s | FileCheck %s // CHECK: %"class.hlsl::RWStructuredBuffer" = type { target("dx.RawBuffer", i16, 1, 0) } // CHECK: %"class.hlsl::RWStructuredBuffer.0" = type { target("dx.RawBuffer", i16, 1, 0) } diff --git a/clang/test/CodeGenHLSL/builtins/RasterizerOrderedStructuredBuffer-elementtype.hlsl b/clang/test/CodeGenHLSL/builtins/RasterizerOrderedStructuredBuffer-elementtype.hlsl index 68d626de689f2..e50d97b119058 100644 --- a/clang/test/CodeGenHLSL/builtins/RasterizerOrderedStructuredBuffer-elementtype.hlsl +++ b/clang/test/CodeGenHLSL/builtins/RasterizerOrderedStructuredBuffer-elementtype.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -emit-llvm -o - %s | FileCheck %s -check-prefixes=DXIL +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -emit-llvm -o - %s | FileCheck %s -check-prefixes=DXIL struct MyStruct { float4 a; diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl index d322cdc0d0645..0986c9b3330f0 100644 --- a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl +++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -emit-llvm -o - %s | 
FileCheck %s // CHECK: %"class.hlsl::StructuredBuffer" = type { target("dx.RawBuffer", i16, 0, 0) } // CHECK: %"class.hlsl::StructuredBuffer.0" = type { target("dx.RawBuffer", i16, 0, 0) } diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffers-methods-ps.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffers-methods-ps.hlsl index 5b1d8e3052eae..f2aea4e376b03 100644 --- a/clang/test/CodeGenHLSL/builtins/StructuredBuffers-methods-ps.hlsl +++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffers-methods-ps.hlsl @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-pixel -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL -// RUN-DISABLED: %clang_cc1 -triple spirv-vulkan-pixel -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.3-pixel -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL +// RUN-DISABLED: %clang_cc1 -Wno-hlsl-implicit-binding -triple spirv-vulkan-pixel -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV // NOTE: SPIRV codegen for resource methods is not yet implemented diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffers-subscripts.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffers-subscripts.hlsl index 2af7c3ed3219f..c86aca6f342e0 100644 --- a/clang/test/CodeGenHLSL/builtins/StructuredBuffers-subscripts.hlsl +++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffers-subscripts.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -emit-llvm -o - -O0 %s | FileCheck %s +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.0-compute -emit-llvm -o - -O0 %s | FileCheck %s StructuredBuffer In; RWStructuredBuffer Out1; diff --git a/clang/test/CodeGenHLSL/cbuffer.hlsl b/clang/test/CodeGenHLSL/cbuffer.hlsl index 
0a0465cc44e91..e13999a9982e2 100644 --- a/clang/test/CodeGenHLSL/cbuffer.hlsl +++ b/clang/test/CodeGenHLSL/cbuffer.hlsl @@ -1,5 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-compute \ -// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -finclude-default-header -triple dxil-pc-shadermodel6.3-compute -fnative-half-type -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s // CHECK: %__cblayout_CBScalars = type <{ float, double, half, i64, i32, i16, i32, i64 }> // CHECK: %__cblayout_CBVectors = type <{ <3 x float>, <3 x double>, <2 x half>, <3 x i64>, <4 x i32>, <3 x i16>, <3 x i64> }> diff --git a/clang/test/CodeGenHLSL/cbuffer_align.hlsl b/clang/test/CodeGenHLSL/cbuffer_align.hlsl index 25fe20da7a230..42abe4f3421d1 100644 --- a/clang/test/CodeGenHLSL/cbuffer_align.hlsl +++ b/clang/test/CodeGenHLSL/cbuffer_align.hlsl @@ -1,6 +1,4 @@ -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -fsyntax-only -verify -verify-ignore-unexpected=warning +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -std=hlsl2021 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fsyntax-only -verify -verify-ignore-unexpected=warning struct S0 { half a; diff --git a/clang/test/CodeGenHLSL/cbuffer_and_namespaces.hlsl b/clang/test/CodeGenHLSL/cbuffer_and_namespaces.hlsl index 7cbde19b67d1f..188c9831044e6 100644 --- a/clang/test/CodeGenHLSL/cbuffer_and_namespaces.hlsl +++ b/clang/test/CodeGenHLSL/cbuffer_and_namespaces.hlsl @@ -1,6 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -emit-llvm 
-disable-llvm-passes -o - | FileCheck %s // Make sure cbuffer inside namespace works. diff --git a/clang/test/CodeGenHLSL/cbuffer_with_static_global_and_function.hlsl b/clang/test/CodeGenHLSL/cbuffer_with_static_global_and_function.hlsl index 5eecfc41579f2..611e041f0df63 100644 --- a/clang/test/CodeGenHLSL/cbuffer_with_static_global_and_function.hlsl +++ b/clang/test/CodeGenHLSL/cbuffer_with_static_global_and_function.hlsl @@ -1,5 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s // CHECK: %__cblayout_A = type <{ float }> diff --git a/clang/test/CodeGenHLSL/default_cbuffer.hlsl b/clang/test/CodeGenHLSL/default_cbuffer.hlsl index 82dc01eb09be2..bec048a80d82b 100644 --- a/clang/test/CodeGenHLSL/default_cbuffer.hlsl +++ b/clang/test/CodeGenHLSL/default_cbuffer.hlsl @@ -1,5 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-compute \ -// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -finclude-default-header -triple dxil-pc-shadermodel6.3-compute -fnative-half-type -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s // CHECK: %"__cblayout_$Globals" = type <{ float, float, target("dx.Layout", %__cblayout_S, 4, 0) }> // CHECK: %__cblayout_S = type <{ float }> @@ -20,7 +19,7 @@ struct S { }; float a; -RWBuffer b; +RWBuffer b; EmptyStruct c; float d[0]; RWBuffer e[2]; diff --git a/clang/test/CodeGenHLSL/default_cbuffer_with_layout.hlsl b/clang/test/CodeGenHLSL/default_cbuffer_with_layout.hlsl index 1400288ba7699..7e69e1c040935 100644 --- a/clang/test/CodeGenHLSL/default_cbuffer_with_layout.hlsl +++ b/clang/test/CodeGenHLSL/default_cbuffer_with_layout.hlsl @@ -1,5 +1,4 @@ -// RUN: %clang_cc1 
-finclude-default-header -triple dxil-pc-shadermodel6.3-compute \ -// RUN: -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -finclude-default-header -triple dxil-pc-shadermodel6.3-compute -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s // CHECK: %"__cblayout_$Globals" = type <{ i32, float, [4 x double], <4 x i32>, <4 x float>, // CHECK-SAME: target("dx.Layout", %S, 8, 0) }> @@ -16,7 +15,7 @@ struct S { float2 v; -}; +}; int a; float b : register(c1); @@ -34,4 +33,4 @@ void main() { // CHECK: !hlsl.cbs = !{![[CB:.*]]} // CHECK: ![[CB]] = !{ptr @"$Globals.cb", ptr addrspace(2) @a, ptr addrspace(2) @b, ptr addrspace(2) @c, -// CHECK-SAME: ptr addrspace(2) @d, ptr addrspace(2) @e, ptr addrspace(2) @s} \ No newline at end of file +// CHECK-SAME: ptr addrspace(2) @d, ptr addrspace(2) @e, ptr addrspace(2) @s} diff --git a/clang/test/CodeGenHLSL/implicit-norecurse-attrib.hlsl b/clang/test/CodeGenHLSL/implicit-norecurse-attrib.hlsl index a8ab6ce98ae7e..ab5001a0496de 100644 --- a/clang/test/CodeGenHLSL/implicit-norecurse-attrib.hlsl +++ b/clang/test/CodeGenHLSL/implicit-norecurse-attrib.hlsl @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -x hlsl -triple dxil-pc-shadermodel6.3-library -finclude-default-header %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s -// RUN: %clang_cc1 -x hlsl -triple dxil-pc-shadermodel6.0-compute -finclude-default-header %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -x hlsl -triple dxil-pc-shadermodel6.3-library -finclude-default-header %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -x hlsl -triple dxil-pc-shadermodel6.0-compute -finclude-default-header %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s // Verify that a few different function types all get the NoRecurse attribute diff --git a/clang/test/CodeGenHLSL/inline-functions.hlsl b/clang/test/CodeGenHLSL/inline-functions.hlsl 
index 4748eeee7475f..c69fee902e305 100644 --- a/clang/test/CodeGenHLSL/inline-functions.hlsl +++ b/clang/test/CodeGenHLSL/inline-functions.hlsl @@ -1,9 +1,9 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s --check-prefixes=CHECK,NOINLINE,OPT_ATTR -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library %s -emit-llvm -O0 -o - | FileCheck %s --check-prefixes=CHECK,INLINE,OPT_ATTR -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library %s -emit-llvm -O1 -o - | FileCheck %s --check-prefixes=CHECK,INLINE,NOOPT_ATTR -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s --check-prefixes=CHECK,NOINLINE -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute %s -emit-llvm -O0 -o - | FileCheck %s --check-prefixes=CHECK,INLINE,OPT_ATTR -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute %s -emit-llvm -O1 -o - | FileCheck %s --check-prefixes=CHECK,INLINE,NOOPT_ATTR +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s --check-prefixes=CHECK,NOINLINE,OPT_ATTR +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.3-library %s -emit-llvm -O0 -o - | FileCheck %s --check-prefixes=CHECK,INLINE,OPT_ATTR +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.3-library %s -emit-llvm -O1 -o - | FileCheck %s --check-prefixes=CHECK,INLINE,NOOPT_ATTR +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.0-compute %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s --check-prefixes=CHECK,NOINLINE +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.0-compute %s -emit-llvm -O0 -o - | FileCheck %s --check-prefixes=CHECK,INLINE,OPT_ATTR +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.0-compute %s -emit-llvm -O1 -o - | FileCheck %s 
--check-prefixes=CHECK,INLINE,NOOPT_ATTR // Tests that user functions will always be inlined. // This includes exported functions and mangled entry point implementation functions. diff --git a/clang/test/CodeGenHLSL/static-local-ctor.hlsl b/clang/test/CodeGenHLSL/static-local-ctor.hlsl index eba37e3f4c6b8..7aeb5e987d6b2 100644 --- a/clang/test/CodeGenHLSL/static-local-ctor.hlsl +++ b/clang/test/CodeGenHLSL/static-local-ctor.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -emit-llvm -o - -disable-llvm-passes %s | FileCheck %s +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.0-compute -emit-llvm -o - -disable-llvm-passes %s | FileCheck %s // Verify that no per variable _Init_thread instructions are emitted for non-trivial static locals // These would normally be emitted by the MicrosoftCXXABI, but the DirectX backend should exlude them @@ -34,4 +34,3 @@ void main() { mybuf = buf[0]; InitBuf(mybuf); } - diff --git a/clang/test/ParserHLSL/cb_error.hlsl b/clang/test/ParserHLSL/cb_error.hlsl index 245bf0a03fac9..f6c2dd9fd26ab 100644 --- a/clang/test/ParserHLSL/cb_error.hlsl +++ b/clang/test/ParserHLSL/cb_error.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s -verify +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s -verify // expected-error@+2 {{expected identifier}} // expected-error@+1 {{expected unqualified-id}} diff --git a/clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl b/clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl index 3b2c12e7a96c5..9920b65561527 100644 --- a/clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl +++ b/clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl @@ -8,7 +8,7 @@ __hlsl_resource_t [[hlsl::is_rov]] res1; // expected-error@+1{{'is_rov' attribute takes no arguments}} __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov(3)]] res2; - + // 
expected-error@+1{{use of undeclared identifier 'gibberish'}} __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov(gibberish)]] res3; diff --git a/clang/test/ParserHLSL/hlsl_raw_buffer_attr_error.hlsl b/clang/test/ParserHLSL/hlsl_raw_buffer_attr_error.hlsl index 77530cbf9e4d9..a638d1d3e156c 100644 --- a/clang/test/ParserHLSL/hlsl_raw_buffer_attr_error.hlsl +++ b/clang/test/ParserHLSL/hlsl_raw_buffer_attr_error.hlsl @@ -5,7 +5,7 @@ // expected-error@+1{{'raw_buffer' attribute takes no arguments}} __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::raw_buffer(3)]] res2; - + // expected-error@+1{{use of undeclared identifier 'gibberish'}} __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::raw_buffer(gibberish)]] res3; diff --git a/clang/test/ParserHLSL/hlsl_resource_class_attr.hlsl b/clang/test/ParserHLSL/hlsl_resource_class_attr.hlsl index fbada8b4b99f7..36995e28b3b85 100644 --- a/clang/test/ParserHLSL/hlsl_resource_class_attr.hlsl +++ b/clang/test/ParserHLSL/hlsl_resource_class_attr.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s // CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} {{.*}} struct MyBuffer definition // CHECK: FieldDecl 0x{{[0-9a-f]+}} col:51 h '__hlsl_resource_t diff --git a/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl b/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl index b5737f5dac8a9..f2cc8918c0ff1 100644 --- a/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl +++ b/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -ast-dump -o - %s | FileCheck %s +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.3-library -x hlsl -ast-dump -o - %s | FileCheck %s // CHECK: ClassTemplateSpecializationDecl {{.*}} class RWBuffer 
definition implicit_instantiation // CHECK: TemplateArgument type 'float' diff --git a/clang/test/ParserHLSL/invalid_inside_cb.hlsl b/clang/test/ParserHLSL/invalid_inside_cb.hlsl index b74021fd22422..926f53e58bc7f 100644 --- a/clang/test/ParserHLSL/invalid_inside_cb.hlsl +++ b/clang/test/ParserHLSL/invalid_inside_cb.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -std=hlsl202x -o - -fsyntax-only %s -verify +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.3-library -std=hlsl202x -o - -fsyntax-only %s -verify // template not allowed inside cbuffer. cbuffer A { diff --git a/clang/test/SemaHLSL/BuiltIns/RWBuffers.hlsl b/clang/test/SemaHLSL/BuiltIns/RWBuffers.hlsl index 34930d8963688..91e96b995585f 100644 --- a/clang/test/SemaHLSL/BuiltIns/RWBuffers.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/RWBuffers.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -fsyntax-only -verify %s +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.0-compute -x hlsl -fsyntax-only -verify %s typedef vector float3; typedef vector double2; diff --git a/clang/test/SemaHLSL/BuiltIns/StructuredBuffers.hlsl b/clang/test/SemaHLSL/BuiltIns/StructuredBuffers.hlsl index fb14429025d5a..991b04c80ac86 100644 --- a/clang/test/SemaHLSL/BuiltIns/StructuredBuffers.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/StructuredBuffers.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -fsyntax-only -verify %s +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.0-compute -x hlsl -fsyntax-only -verify %s typedef vector float3; diff --git a/clang/test/SemaHLSL/BuiltIns/hlsl_resource_t.hlsl b/clang/test/SemaHLSL/BuiltIns/hlsl_resource_t.hlsl index a17ec327ba9e7..83fad4b8deb7c 100644 --- a/clang/test/SemaHLSL/BuiltIns/hlsl_resource_t.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/hlsl_resource_t.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsyntax-only -verify 
-triple dxil-unknown-shadermodel6.3-library %s +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -fsyntax-only -verify -triple dxil-unknown-shadermodel6.3-library %s // Note: As HLSL resource type are sizeless type, we don't exhaustively // test for cases covered by sizeless-1.c and similar tests. diff --git a/clang/test/SemaHLSL/Language/AggregateSplatCast-errors.hlsl b/clang/test/SemaHLSL/Language/AggregateSplatCast-errors.hlsl index 662dae27e8200..7dc3c3e1c3e6c 100644 --- a/clang/test/SemaHLSL/Language/AggregateSplatCast-errors.hlsl +++ b/clang/test/SemaHLSL/Language/AggregateSplatCast-errors.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -verify -verify-ignore-unexpected=note +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -verify -verify-ignore-unexpected=note struct S { int A : 8; @@ -46,4 +46,4 @@ struct X { export void cantCast5() { X x = (X)1; // expected-error@-1 {{no matching conversion for C-style cast from 'int' to 'X'}} -} \ No newline at end of file +} diff --git a/clang/test/SemaHLSL/cb_error.hlsl b/clang/test/SemaHLSL/cb_error.hlsl index 95c917a9bb9ee..9640ab252b942 100644 --- a/clang/test/SemaHLSL/cb_error.hlsl +++ b/clang/test/SemaHLSL/cb_error.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s -verify +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s -verify // expected-note@+1 {{declared here}} cbuffer a { diff --git a/clang/test/SemaHLSL/export.hlsl b/clang/test/SemaHLSL/export.hlsl index 2d19fa561fa0a..bda68db09287c 100644 --- a/clang/test/SemaHLSL/export.hlsl +++ b/clang/test/SemaHLSL/export.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - %s -verify +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.3-library -x hlsl -o - %s 
-verify export void f1(); diff --git a/clang/test/SemaHLSL/packoffset-invalid.hlsl b/clang/test/SemaHLSL/packoffset-invalid.hlsl index 526a511edf1f2..0b06d55ccd688 100644 --- a/clang/test/SemaHLSL/packoffset-invalid.hlsl +++ b/clang/test/SemaHLSL/packoffset-invalid.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library -fnative-half-type -verify %s +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -finclude-default-header -triple dxil-pc-shadermodel6.3-library -fnative-half-type -verify %s // expected-warning@+1{{cannot mix packoffset elements with nonpackoffset elements in a cbuffer}} cbuffer Mix diff --git a/clang/test/SemaHLSL/resource_binding_attr_error.hlsl b/clang/test/SemaHLSL/resource_binding_attr_error.hlsl index 74aff79f0e37f..a3a91c3ddddb8 100644 --- a/clang/test/SemaHLSL/resource_binding_attr_error.hlsl +++ b/clang/test/SemaHLSL/resource_binding_attr_error.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s -verify +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s -verify template struct MyTemplatedSRV { @@ -58,7 +58,7 @@ void bar(MyTemplatedSRV U : register(u3)) { } -struct S { +struct S { // expected-error@+1 {{'register' attribute only applies to cbuffer/tbuffer and external global variables}} MyTemplatedSRV U : register(u3); }; diff --git a/clang/test/SemaHLSL/resource_binding_attr_error_basic.hlsl b/clang/test/SemaHLSL/resource_binding_attr_error_basic.hlsl index 760c057630a7f..5d4059f9a9cf4 100644 --- a/clang/test/SemaHLSL/resource_binding_attr_error_basic.hlsl +++ b/clang/test/SemaHLSL/resource_binding_attr_error_basic.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s -verify +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s -verify // 
expected-error@+1{{binding type 't' only applies to SRV resources}} float f1 : register(t0); diff --git a/clang/test/SemaHLSL/resource_binding_attr_error_resource.hlsl b/clang/test/SemaHLSL/resource_binding_attr_error_resource.hlsl index 4b6af47c0ab72..a003a11765105 100644 --- a/clang/test/SemaHLSL/resource_binding_attr_error_resource.hlsl +++ b/clang/test/SemaHLSL/resource_binding_attr_error_resource.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s -verify +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s -verify // This test validates the diagnostics that are emitted when a variable with a "resource" type // is bound to a register using the register annotation diff --git a/clang/test/SemaHLSL/resource_binding_attr_error_silence_diags.hlsl b/clang/test/SemaHLSL/resource_binding_attr_error_silence_diags.hlsl index e63f264452da7..70b53524d3faa 100644 --- a/clang/test/SemaHLSL/resource_binding_attr_error_silence_diags.hlsl +++ b/clang/test/SemaHLSL/resource_binding_attr_error_silence_diags.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only -Wno-legacy-constant-register-binding %s -verify +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only -Wno-legacy-constant-register-binding %s -verify // expected-no-diagnostics float f2 : register(b9); @@ -11,7 +11,7 @@ cbuffer g_cbuffer1 { struct Eg12{ - RWBuffer a; + RWBuffer a; }; Eg12 e12 : register(c9); diff --git a/clang/test/SemaHLSL/resource_binding_attr_error_space.hlsl b/clang/test/SemaHLSL/resource_binding_attr_error_space.hlsl index 70e64e6ca7528..e66b1114ac320 100644 --- a/clang/test/SemaHLSL/resource_binding_attr_error_space.hlsl +++ b/clang/test/SemaHLSL/resource_binding_attr_error_space.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o 
- -fsyntax-only %s -verify +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s -verify // valid cbuffer cbuf { @@ -30,7 +30,7 @@ cbuffer cbuf4 { // this test validates that no diagnostic is emitted on the space parameter, because // this register annotation is not in the global scope. // expected-error@+1 {{binding type 'u' only applies to UAV resources}} - float a : register(u2, space3); + float a : register(u2, space3); } // expected-error@+1 {{invalid space specifier 's2' used; expected 'space' followed by an integer, like space1}} diff --git a/clang/test/SemaHLSL/resource_binding_attr_error_udt.hlsl b/clang/test/SemaHLSL/resource_binding_attr_error_udt.hlsl index 40517f393e128..3472ea762487b 100644 --- a/clang/test/SemaHLSL/resource_binding_attr_error_udt.hlsl +++ b/clang/test/SemaHLSL/resource_binding_attr_error_udt.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s -verify +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s -verify template struct MyTemplatedUAV { diff --git a/clang/test/SemaHLSL/resource_binding_implicit.hlsl b/clang/test/SemaHLSL/resource_binding_implicit.hlsl new file mode 100644 index 0000000000000..8f0e721c7153f --- /dev/null +++ b/clang/test/SemaHLSL/resource_binding_implicit.hlsl @@ -0,0 +1,41 @@ +// RUN: %clang_cc1 -Wno-error=hlsl-implicit-binding -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s -verify + +// expected-warning@+1 {{resource has implicit register binding}} +cbuffer cb0 { + int a; +} + +// No warning - this is an element of the $Globals buffer not it's own binding. +float b; + +// expected-warning@+1 {{resource has implicit register binding}} +RWBuffer c; + +// No warning - explicit binding. +RWBuffer d : register(u0); + +// TODO: Add this test once #135287 lands +// TODO: ... 
@+1 {{resource has implicit register binding}} +// TODO: RWBuffer dd : register(space1); + +// No warning - explicit binding. +RWBuffer ddd : register(u3, space4); + +struct S { int x; }; +// expected-warning@+1 {{resource has implicit register binding}} +StructuredBuffer e; + +// No warning - __hlsl_resource_t isn't itself a resource object. +__hlsl_resource_t [[hlsl::resource_class(SRV)]] f; + +struct CustomSRV { + __hlsl_resource_t [[hlsl::resource_class(SRV)]] x; +}; +// expected-warning@+1 {{resource has implicit register binding}} +CustomSRV g; + +// expected-warning@+1 {{resource has implicit register binding}} +RWBuffer h[10]; + +// No warning - explicit binding. +RWBuffer hh[100] : register(u4); From b9f1de04f65b062559d01c83dfd3948601924ee1 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 16 Apr 2025 15:47:16 -0700 Subject: [PATCH 191/710] [TableGen] Combine the two separate OperandMapping loops in PseudoLoweringEmitter. (#136007) Previously we had one loop over the DAG for immediates and registers and another loop over the destination operands for mapping from the source. Now we have a single loop over the destination operands that handles immediates, registers, and named operands. A helper method is added so we can handle operands and sub-operands specified by a sub-dag. My goal is to allow a named operand to appear in a sub-dag which wasn't supported before. This will allow the destination instruction to have an operand with sub-operands when the source does not have sub operands. For RISC-V, I'm looking into using an operand with sub-operands to represent an reg+offset memory address. I need to be able to lower a pseudo instruction that only has a register operand to an instruction that has a reg+offset operand. The offset will be filled in with 0 during expansion and the register will be copied from the source. 
The expansion would look like this: def PseudoCALLIndirect : Pseudo<(outs), (ins GPRJALR:$rs1), [(riscv_call GPRJALR:$rs1)]>, PseudoInstExpansion<(JALR X1, (ops GPR:$rs1, 0))>; --- llvm/utils/TableGen/PseudoLoweringEmitter.cpp | 194 +++++++++--------- 1 file changed, 98 insertions(+), 96 deletions(-) diff --git a/llvm/utils/TableGen/PseudoLoweringEmitter.cpp b/llvm/utils/TableGen/PseudoLoweringEmitter.cpp index 96325eac95004..44a17a3906fe6 100644 --- a/llvm/utils/TableGen/PseudoLoweringEmitter.cpp +++ b/llvm/utils/TableGen/PseudoLoweringEmitter.cpp @@ -51,10 +51,11 @@ class PseudoLoweringEmitter { SmallVector Expansions; - unsigned addDagOperandMapping(const Record *Rec, const DagInit *Dag, - const CodeGenInstruction &Insn, - IndexedMap &OperandMap, - unsigned BaseIdx); + void addOperandMapping(unsigned MIOpNo, unsigned NumOps, const Record *Rec, + const DagInit *Dag, unsigned DagIdx, + const Record *OpRec, IndexedMap &OperandMap, + const StringMap &SourceOperands, + const CodeGenInstruction &SourceInsn); void evaluateExpansion(const Record *Pseudo); void emitLoweringEmitter(raw_ostream &o); @@ -66,64 +67,67 @@ class PseudoLoweringEmitter { }; } // End anonymous namespace -// FIXME: This pass currently can only expand a pseudo to a single instruction. -// The pseudo expansion really should take a list of dags, not just -// a single dag, so we can do fancier things. -unsigned PseudoLoweringEmitter::addDagOperandMapping( - const Record *Rec, const DagInit *Dag, const CodeGenInstruction &Insn, - IndexedMap &OperandMap, unsigned BaseIdx) { - unsigned OpsAdded = 0; - for (unsigned i = 0, e = Dag->getNumArgs(); i != e; ++i) { - if (const DefInit *DI = dyn_cast(Dag->getArg(i))) { - // Physical register reference. Explicit check for the special case - // "zero_reg" definition. 
- if (DI->getDef()->isSubClassOf("Register") || - DI->getDef()->getName() == "zero_reg") { - auto &Entry = OperandMap[BaseIdx + i]; - Entry.Kind = OpData::Reg; - Entry.Data.Reg = DI->getDef(); - ++OpsAdded; - continue; - } +void PseudoLoweringEmitter::addOperandMapping( + unsigned MIOpNo, unsigned NumOps, const Record *Rec, const DagInit *Dag, + unsigned DagIdx, const Record *OpRec, IndexedMap &OperandMap, + const StringMap &SourceOperands, + const CodeGenInstruction &SourceInsn) { + const Init *DagArg = Dag->getArg(DagIdx); + if (const DefInit *DI = dyn_cast(DagArg)) { + // Physical register reference. Explicit check for the special case + // "zero_reg" definition. + if (DI->getDef()->isSubClassOf("Register") || + DI->getDef()->getName() == "zero_reg") { + auto &Entry = OperandMap[MIOpNo]; + Entry.Kind = OpData::Reg; + Entry.Data.Reg = DI->getDef(); + return; + } - // Normal operands should always have the same type, or we have a - // problem. - // FIXME: We probably shouldn't ever get a non-zero BaseIdx here. - assert(BaseIdx == 0 && "Named subargument in pseudo expansion?!"); - if (DI->getDef() != Insn.Operands[BaseIdx + i].Rec) - PrintFatalError(Rec, "In pseudo instruction '" + Rec->getName() + - "', operand type '" + DI->getDef()->getName() + - "' does not match expansion operand type '" + - Insn.Operands[BaseIdx + i].Rec->getName() + - "'"); - // Source operand maps to destination operand. The Data element - // will be filled in later, just set the Kind for now. Do it - // for each corresponding MachineInstr operand, not just the first. 
- for (unsigned I = 0, E = Insn.Operands[i].MINumOperands; I != E; ++I) - OperandMap[BaseIdx + i + I].Kind = OpData::Operand; - OpsAdded += Insn.Operands[i].MINumOperands; - } else if (const IntInit *II = dyn_cast(Dag->getArg(i))) { - auto &Entry = OperandMap[BaseIdx + i]; - Entry.Kind = OpData::Imm; - Entry.Data.Imm = II->getValue(); - ++OpsAdded; - } else if (const auto *BI = dyn_cast(Dag->getArg(i))) { - auto &Entry = OperandMap[BaseIdx + i]; - Entry.Kind = OpData::Imm; - Entry.Data.Imm = *BI->convertInitializerToInt(); - ++OpsAdded; - } else if (const DagInit *SubDag = dyn_cast(Dag->getArg(i))) { - // Just add the operands recursively. This is almost certainly - // a constant value for a complex operand (> 1 MI operand). - unsigned NewOps = - addDagOperandMapping(Rec, SubDag, Insn, OperandMap, BaseIdx + i); - OpsAdded += NewOps; - // Since we added more than one, we also need to adjust the base. - BaseIdx += NewOps - 1; - } else - llvm_unreachable("Unhandled pseudo-expansion argument type!"); - } - return OpsAdded; + if (DI->getDef() != OpRec) + PrintFatalError(Rec, "In pseudo instruction '" + Rec->getName() + + "', operand type '" + DI->getDef()->getName() + + "' does not match expansion operand type '" + + OpRec->getName() + "'"); + + StringMap::const_iterator SourceOp = + SourceOperands.find(Dag->getArgNameStr(DagIdx)); + if (SourceOp == SourceOperands.end()) + PrintFatalError(Rec, "In pseudo instruction '" + Rec->getName() + + "', output operand '" + + Dag->getArgNameStr(DagIdx) + + "' has no matching source operand"); + const auto &SrcOpnd = SourceInsn.Operands[SourceOp->getValue()]; + if (NumOps != SrcOpnd.MINumOperands) + PrintFatalError( + Rec, + "In pseudo instruction '" + Rec->getName() + "', output operand '" + + OpRec->getName() + + "' has a different number of sub operands than source operand '" + + SrcOpnd.Rec->getName() + "'"); + + // Source operand maps to destination operand. 
Do it for each corresponding + // MachineInstr operand, not just the first. + for (unsigned I = 0, E = NumOps; I != E; ++I) { + auto &Entry = OperandMap[MIOpNo + I]; + Entry.Kind = OpData::Operand; + Entry.Data.Operand = SrcOpnd.MIOperandNo + I; + } + + LLVM_DEBUG(dbgs() << " " << SourceOp->getValue() << " ==> " << DagIdx + << "\n"); + } else if (const auto *II = dyn_cast(DagArg)) { + assert(NumOps == 1); + auto &Entry = OperandMap[MIOpNo]; + Entry.Kind = OpData::Imm; + Entry.Data.Imm = II->getValue(); + } else if (const auto *BI = dyn_cast(DagArg)) { + assert(NumOps == 1); + auto &Entry = OperandMap[MIOpNo]; + Entry.Kind = OpData::Imm; + Entry.Data.Imm = *BI->convertInitializerToInt(); + } else + llvm_unreachable("Unhandled pseudo-expansion argument type!"); } void PseudoLoweringEmitter::evaluateExpansion(const Record *Rec) { @@ -157,14 +161,6 @@ void PseudoLoweringEmitter::evaluateExpansion(const Record *Rec) { "', result operator '" + Operator->getName() + "' has the wrong number of operands"); - unsigned NumMIOperands = 0; - for (const auto &Op : Insn.Operands) - NumMIOperands += Op.MINumOperands; - IndexedMap OperandMap; - OperandMap.grow(NumMIOperands); - - addDagOperandMapping(Rec, Dag, Insn, OperandMap, 0); - // If there are more operands that weren't in the DAG, they have to // be operands that have default values, or we have an error. Currently, // Operands that are a subclass of OperandWithDefaultOp have default values. @@ -180,37 +176,43 @@ void PseudoLoweringEmitter::evaluateExpansion(const Record *Rec) { for (const auto &[Idx, SrcOp] : enumerate(SourceInsn.Operands)) SourceOperands[SrcOp.Name] = Idx; - LLVM_DEBUG(dbgs() << " Operand mapping:\n"); - for (const auto &[Idx, Opnd] : enumerate(Insn.Operands)) { - // We've already handled constant values. Just map instruction operands - // here. 
- if (OperandMap[Opnd.MIOperandNo].Kind != OpData::Operand) - continue; - StringMap::iterator SourceOp = - SourceOperands.find(Dag->getArgNameStr(Idx)); - if (SourceOp == SourceOperands.end()) - PrintFatalError(Rec, "In pseudo instruction '" + Rec->getName() + - "', output operand '" + Dag->getArgNameStr(Idx) + - "' has no matching source operand"); - const auto &SrcOpnd = SourceInsn.Operands[SourceOp->getValue()]; - if (Opnd.MINumOperands != SrcOpnd.MINumOperands) - PrintFatalError( - Rec, - "In pseudo instruction '" + Rec->getName() + "', output operand '" + - Opnd.Rec->getName() + - "' has a different number of sub operands than source operand '" + - SrcOpnd.Rec->getName() + "'"); + unsigned NumMIOperands = 0; + for (const auto &Op : Insn.Operands) + NumMIOperands += Op.MINumOperands; + IndexedMap OperandMap; + OperandMap.grow(NumMIOperands); - // Map the source operand to the destination operand index for each - // MachineInstr operand. - for (unsigned I = 0, E = Opnd.MINumOperands; I != E; ++I) - OperandMap[Opnd.MIOperandNo + I].Data.Operand = SrcOpnd.MIOperandNo + I; + // FIXME: This pass currently can only expand a pseudo to a single + // instruction. The pseudo expansion really should take a list of dags, not + // just a single dag, so we can do fancier things. 
+ LLVM_DEBUG(dbgs() << " Operand mapping:\n"); + for (const auto &[Idx, DstOp] : enumerate(Insn.Operands)) { + unsigned MIOpNo = DstOp.MIOperandNo; - LLVM_DEBUG(dbgs() << " " << SourceOp->getValue() << " ==> " << Idx - << "\n"); + if (const auto *SubDag = dyn_cast(Dag->getArg(Idx))) { + if (!DstOp.MIOperandInfo || DstOp.MIOperandInfo->getNumArgs() == 0) + PrintFatalError(Rec, "In pseudo instruction '" + Rec->getName() + + "', operand '" + DstOp.Rec->getName() + + "' does not have suboperands"); + if (DstOp.MINumOperands != SubDag->getNumArgs()) { + PrintFatalError( + Rec, "In pseudo instruction '" + Rec->getName() + "', '" + + SubDag->getAsString() + + "' has wrong number of operands for operand type '" + + DstOp.Rec->getName() + "'"); + } + for (unsigned I = 0, E = DstOp.MINumOperands; I != E; ++I) { + auto *OpndRec = cast(DstOp.MIOperandInfo->getArg(I))->getDef(); + addOperandMapping(MIOpNo + I, 1, Rec, SubDag, I, OpndRec, OperandMap, + SourceOperands, SourceInsn); + } + } else { + addOperandMapping(MIOpNo, DstOp.MINumOperands, Rec, Dag, Idx, DstOp.Rec, + OperandMap, SourceOperands, SourceInsn); + } } - Expansions.push_back(PseudoExpansion(SourceInsn, Insn, OperandMap)); + Expansions.emplace_back(SourceInsn, Insn, OperandMap); } void PseudoLoweringEmitter::emitLoweringEmitter(raw_ostream &o) { From f7bdf30cb9b8a10847c29b9cad34e5e8abc1ecb8 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Wed, 16 Apr 2025 15:48:30 -0700 Subject: [PATCH 192/710] Add empty top level .clang-format-ignore (#136022) Otherwise if the source tree is embedded in another project with a .clang-format-ignore, some clang-format tests fail because they use that .clang-format-ignore. 
--- .clang-format-ignore | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 .clang-format-ignore diff --git a/.clang-format-ignore b/.clang-format-ignore new file mode 100644 index 0000000000000..e69de29bb2d1d From f8ea2ed59820a0bef3f23638ce7a5d10165f7109 Mon Sep 17 00:00:00 2001 From: Jorge Gorbe Moya Date: Wed, 16 Apr 2025 16:18:34 -0700 Subject: [PATCH 193/710] [bazel] Fix build for 8ebdd9d8a19543992195f197de215c53d506fb72 --- utils/bazel/llvm-project-overlay/llvm/BUILD.bazel | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index b77ddf634eec6..143304c24c241 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -2148,7 +2148,10 @@ llvm_target_lib_list = [lib for lib in [ "lib/Target/AArch64/AArch64GenSubtargetInfo.inc", ), ( - ["-gen-disassembler"], + [ + "-gen-disassembler", + "--num-to-skip-size=3", + ], "lib/Target/AArch64/AArch64GenDisassemblerTables.inc", ), ( From f3bf844d2ff0a2984ca9bf976014decc0241d2b4 Mon Sep 17 00:00:00 2001 From: Asher Mancinelli Date: Wed, 16 Apr 2025 17:15:15 -0700 Subject: [PATCH 194/710] [flang] Unwrap sequence types when checking for descriptor members (#136039) The TBAA generation gives conservative TBAA metadata when handling an access of a record type with a descriptor member, since the access may be a regular data access OR another descriptor. Array members were being incorrectly identified as non-descriptor-members, and were giving incorrect TBAA metadata which led to bugs showing up in the optimizer when LLVM encountered mismatching TBAA. `fir::isRecordWithDescriptorMember` now unwraps sequence types before checking for descriptor members. 
--- flang/lib/Optimizer/Dialect/FIRType.cpp | 1 + flang/test/Fir/tbaa-codegen-records.fir | 30 +++++++++++++++++++++++++ 2 files changed, 31 insertions(+) create mode 100644 flang/test/Fir/tbaa-codegen-records.fir diff --git a/flang/lib/Optimizer/Dialect/FIRType.cpp b/flang/lib/Optimizer/Dialect/FIRType.cpp index 4d40c2618a0d0..b76856d72a017 100644 --- a/flang/lib/Optimizer/Dialect/FIRType.cpp +++ b/flang/lib/Optimizer/Dialect/FIRType.cpp @@ -435,6 +435,7 @@ bool isRecordWithDescriptorMember(mlir::Type ty) { ty = unwrapSequenceType(ty); if (auto recTy = mlir::dyn_cast(ty)) for (auto [field, memTy] : recTy.getTypeList()) { + memTy = unwrapSequenceType(memTy); if (mlir::isa(memTy)) return true; if (mlir::isa(memTy) && diff --git a/flang/test/Fir/tbaa-codegen-records.fir b/flang/test/Fir/tbaa-codegen-records.fir new file mode 100644 index 0000000000000..336354098f0f8 --- /dev/null +++ b/flang/test/Fir/tbaa-codegen-records.fir @@ -0,0 +1,30 @@ +// RUN: fir-opt --split-input-file --pass-pipeline="builtin.module(fir-to-llvm-ir{apply-tbaa=true})" %s | FileCheck %s + +// Ensure that records with array members are identified as having descriptor members, +// as reflected by the TBAA metadata. 
+ +func.func @record_array_member(%arg0 : !fir.ref>>}>>}>>) { + %c0_i64 = arith.constant 0 : i64 + %c1 = arith.constant 1 : index + %1 = fir.alloca !fir.type<_QFTt2{y:!fir.array<1x!fir.type<_QFTt{x:!fir.box>>}>>}> {bindc_name = "z", uniq_name = "_QFB1Ez"} + fir.copy %arg0 to %1 no_overlap : !fir.ref>>}>>}>>, !fir.ref>>}>>}>> + %3 = fir.coordinate_of %1, y : (!fir.ref>>}>>}>>) -> !fir.ref>>}>>> + %4 = fircg.ext_array_coor %3(%c1)<%c1> : (!fir.ref>>}>>>, index, index) -> !fir.ref>>}>> + %5 = fir.coordinate_of %4, x : (!fir.ref>>}>>) -> !fir.ref>>> + %6 = fir.load %5 : !fir.ref>>> + return +} + +// CHECK: #[[$ATTR_0:.+]] = #llvm.tbaa_root +// CHECK: #[[$ATTR_1:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_2:.+]] = #llvm.tbaa_tag +// CHECK: #[[$ATTR_3:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_4:.+]] = #llvm.tbaa_tag + +// CHECK-LABEL: llvm.func @record_array_member( +// CHECK-SAME: %[[ARG0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !llvm.ptr) { +// CHECK: %[[X_VAL:.*]] = llvm.alloca %{{.+}} x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr +// CHECK: %[[Z_VAL:.*]] = llvm.alloca %{{.+}} x !llvm.struct<"_QFTt2", (array<1 x struct<"_QFTt", (struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>)>>)> {bindc_name = "z"} : (i64) -> !llvm.ptr +// CHECK: "llvm.intr.memcpy"(%[[Z_VAL]], %[[ARG0]], %{{.+}}) <{isVolatile = false, tbaa = [#[[$ATTR_2]]]}> : (!llvm.ptr, !llvm.ptr, i64) -> () + +// CHECK: "llvm.intr.memcpy"(%[[X_VAL]], %{{.+}}, %{{.+}}) <{isVolatile = false, tbaa = [#[[$ATTR_4]]]}> : (!llvm.ptr, !llvm.ptr, i32) -> () From eef978290ccb847c67bf0431e5fdd1dc4c7d408d Mon Sep 17 00:00:00 2001 From: Mohamed Emad <73320969+hulxv@users.noreply.github.com> Date: Thu, 17 Apr 2025 02:16:12 +0200 Subject: [PATCH 195/710] [clang-doc][NFC] clean unused variable in HTML generator (#135505) While reading the code, I found some dead variables that are not used anymore but it still declared without 
removing them. --- clang-tools-extra/clang-doc/HTMLGenerator.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/clang-tools-extra/clang-doc/HTMLGenerator.cpp b/clang-tools-extra/clang-doc/HTMLGenerator.cpp index cb10f16804024..aceb83e8c4c57 100644 --- a/clang-tools-extra/clang-doc/HTMLGenerator.cpp +++ b/clang-tools-extra/clang-doc/HTMLGenerator.cpp @@ -716,7 +716,6 @@ genHTML(const EnumInfo &I, const ClangDocContext &CDCtx) { maybeWriteSourceFileRef(Out, CDCtx, I.DefLoc); - std::string Description; if (!I.Description.empty()) Out.emplace_back(genHTML(I.Description)); @@ -759,7 +758,6 @@ genHTML(const FunctionInfo &I, const ClangDocContext &CDCtx, maybeWriteSourceFileRef(Out, CDCtx, I.DefLoc); - std::string Description; if (!I.Description.empty()) Out.emplace_back(genHTML(I.Description)); @@ -777,7 +775,6 @@ genHTML(const NamespaceInfo &I, Index &InfoIndex, const ClangDocContext &CDCtx, Out.emplace_back(std::make_unique(HTMLTag::TAG_H1, InfoTitle)); - std::string Description; if (!I.Description.empty()) Out.emplace_back(genHTML(I.Description)); @@ -820,7 +817,6 @@ genHTML(const RecordInfo &I, Index &InfoIndex, const ClangDocContext &CDCtx, maybeWriteSourceFileRef(Out, CDCtx, I.DefLoc); - std::string Description; if (!I.Description.empty()) Out.emplace_back(genHTML(I.Description)); From 218531821a6b6f1132a245a1f52e125610dc7f6a Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Wed, 16 Apr 2025 17:19:50 -0700 Subject: [PATCH 196/710] LowerTypeTests: Fix quadratic complexity (try 2). Currently we have quadratic complexity in LowerTypeTests because ScopedSaveAliaseesAndUsed loops over all aliases for each disjoint set, and the number of aliases and number of disjoint sets is roughly proportional to the program size. Fix that by moving ScopedSaveAliaseesAndUsed to LowerTypeTestsModule::lower() so that we do this only once. Reland of #135875 with fix for bug that caused check-lld test failures. 
The fix is to only remove functions from llvm.used/llvm.compiler.used because buildBitSetsFromGlobalVariables, which now runs while ScopedSaveAliaseesAndUsed is in scope, will delete global variables, which would otherwise lead to a use-after-free when they are added back to llvm.used or llvm.compiler.used. Reviewers: fmayer, vitalybuka Reviewed By: fmayer Pull Request: https://github.com/llvm/llvm-project/pull/136053 --- llvm/lib/Transforms/IPO/LowerTypeTests.cpp | 194 ++++++++++++--------- 1 file changed, 107 insertions(+), 87 deletions(-) diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp index 7cf7d74acfcfa..38da7329bbd58 100644 --- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp @@ -352,6 +352,30 @@ struct ScopedSaveAliaseesAndUsed { std::vector> FunctionAliases; std::vector> ResolverIFuncs; + // This function only removes functions from llvm.used and llvm.compiler.used. + // We cannot remove global variables because they need to follow RAUW, as + // they may be deleted by buildBitSetsFromGlobalVariables. + void collectAndEraseUsedFunctions(Module &M, + SmallVectorImpl &Vec, + bool CompilerUsed) { + auto *GV = collectUsedGlobalVariables(M, Vec, CompilerUsed); + if (!GV) + return; + // There's no API to only remove certain array elements from + // llvm.used/llvm.compiler.used, so we remove all of them and add back only + // the non-functions. 
+ GV->eraseFromParent(); + auto NonFuncBegin = + std::stable_partition(Vec.begin(), Vec.end(), [](GlobalValue *GV) { + return isa(GV); + }); + if (CompilerUsed) + appendToCompilerUsed(M, {NonFuncBegin, Vec.end()}); + else + appendToUsed(M, {NonFuncBegin, Vec.end()}); + Vec.resize(NonFuncBegin - Vec.begin()); + } + ScopedSaveAliaseesAndUsed(Module &M) : M(M) { // The users of this class want to replace all function references except // for aliases and llvm.used/llvm.compiler.used with references to a jump @@ -365,10 +389,8 @@ struct ScopedSaveAliaseesAndUsed { // llvm.used/llvm.compiler.used and aliases, erase the used lists, let RAUW // replace the aliasees and then set them back to their original values at // the end. - if (GlobalVariable *GV = collectUsedGlobalVariables(M, Used, false)) - GV->eraseFromParent(); - if (GlobalVariable *GV = collectUsedGlobalVariables(M, CompilerUsed, true)) - GV->eraseFromParent(); + collectAndEraseUsedFunctions(M, Used, false); + collectAndEraseUsedFunctions(M, CompilerUsed, true); for (auto &GA : M.aliases()) { // FIXME: This should look past all aliases not just interposable ones, @@ -1669,61 +1691,55 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative( lowerTypeTestCalls(TypeIds, JumpTable, GlobalLayout); - { - ScopedSaveAliaseesAndUsed S(M); + // Build aliases pointing to offsets into the jump table, and replace + // references to the original functions with references to the aliases. + for (unsigned I = 0; I != Functions.size(); ++I) { + Function *F = cast(Functions[I]->getGlobal()); + bool IsJumpTableCanonical = Functions[I]->isJumpTableCanonical(); - // Build aliases pointing to offsets into the jump table, and replace - // references to the original functions with references to the aliases. 
- for (unsigned I = 0; I != Functions.size(); ++I) { - Function *F = cast(Functions[I]->getGlobal()); - bool IsJumpTableCanonical = Functions[I]->isJumpTableCanonical(); - - Constant *CombinedGlobalElemPtr = ConstantExpr::getInBoundsGetElementPtr( - JumpTableType, JumpTable, - ArrayRef{ConstantInt::get(IntPtrTy, 0), - ConstantInt::get(IntPtrTy, I)}); - - const bool IsExported = Functions[I]->isExported(); - if (!IsJumpTableCanonical) { - GlobalValue::LinkageTypes LT = IsExported - ? GlobalValue::ExternalLinkage - : GlobalValue::InternalLinkage; - GlobalAlias *JtAlias = GlobalAlias::create(F->getValueType(), 0, LT, - F->getName() + ".cfi_jt", - CombinedGlobalElemPtr, &M); - if (IsExported) - JtAlias->setVisibility(GlobalValue::HiddenVisibility); - else - appendToUsed(M, {JtAlias}); - } + Constant *CombinedGlobalElemPtr = ConstantExpr::getInBoundsGetElementPtr( + JumpTableType, JumpTable, + ArrayRef{ConstantInt::get(IntPtrTy, 0), + ConstantInt::get(IntPtrTy, I)}); + + const bool IsExported = Functions[I]->isExported(); + if (!IsJumpTableCanonical) { + GlobalValue::LinkageTypes LT = IsExported ? 
GlobalValue::ExternalLinkage + : GlobalValue::InternalLinkage; + GlobalAlias *JtAlias = GlobalAlias::create(F->getValueType(), 0, LT, + F->getName() + ".cfi_jt", + CombinedGlobalElemPtr, &M); + if (IsExported) + JtAlias->setVisibility(GlobalValue::HiddenVisibility); + else + appendToUsed(M, {JtAlias}); + } - if (IsExported) { - if (IsJumpTableCanonical) - ExportSummary->cfiFunctionDefs().emplace(F->getName()); - else - ExportSummary->cfiFunctionDecls().emplace(F->getName()); - } + if (IsExported) { + if (IsJumpTableCanonical) + ExportSummary->cfiFunctionDefs().emplace(F->getName()); + else + ExportSummary->cfiFunctionDecls().emplace(F->getName()); + } - if (!IsJumpTableCanonical) { - if (F->hasExternalWeakLinkage()) - replaceWeakDeclarationWithJumpTablePtr(F, CombinedGlobalElemPtr, - IsJumpTableCanonical); - else - replaceCfiUses(F, CombinedGlobalElemPtr, IsJumpTableCanonical); - } else { - assert(F->getType()->getAddressSpace() == 0); - - GlobalAlias *FAlias = - GlobalAlias::create(F->getValueType(), 0, F->getLinkage(), "", - CombinedGlobalElemPtr, &M); - FAlias->setVisibility(F->getVisibility()); - FAlias->takeName(F); - if (FAlias->hasName()) - F->setName(FAlias->getName() + ".cfi"); - replaceCfiUses(F, FAlias, IsJumpTableCanonical); - if (!F->hasLocalLinkage()) - F->setVisibility(GlobalVariable::HiddenVisibility); - } + if (!IsJumpTableCanonical) { + if (F->hasExternalWeakLinkage()) + replaceWeakDeclarationWithJumpTablePtr(F, CombinedGlobalElemPtr, + IsJumpTableCanonical); + else + replaceCfiUses(F, CombinedGlobalElemPtr, IsJumpTableCanonical); + } else { + assert(F->getType()->getAddressSpace() == 0); + + GlobalAlias *FAlias = GlobalAlias::create( + F->getValueType(), 0, F->getLinkage(), "", CombinedGlobalElemPtr, &M); + FAlias->setVisibility(F->getVisibility()); + FAlias->takeName(F); + if (FAlias->hasName()) + F->setName(FAlias->getName() + ".cfi"); + replaceCfiUses(F, FAlias, IsJumpTableCanonical); + if (!F->hasLocalLinkage()) + 
F->setVisibility(GlobalVariable::HiddenVisibility); } } @@ -2339,39 +2355,43 @@ bool LowerTypeTestsModule::lower() { if (GlobalClasses.empty()) return false; - // For each disjoint set we found... - for (const auto &C : GlobalClasses) { - if (!C->isLeader()) - continue; - - ++NumTypeIdDisjointSets; - // Build the list of type identifiers in this disjoint set. - std::vector TypeIds; - std::vector Globals; - std::vector ICallBranchFunnels; - for (auto M : GlobalClasses.members(*C)) { - if (isa(M)) - TypeIds.push_back(cast(M)); - else if (isa(M)) - Globals.push_back(cast(M)); - else - ICallBranchFunnels.push_back(cast(M)); - } - - // Order type identifiers by unique ID for determinism. This ordering is - // stable as there is a one-to-one mapping between metadata and unique IDs. - llvm::sort(TypeIds, [&](Metadata *M1, Metadata *M2) { - return TypeIdInfo[M1].UniqueId < TypeIdInfo[M2].UniqueId; - }); + { + ScopedSaveAliaseesAndUsed S(M); + // For each disjoint set we found... + for (const auto &C : GlobalClasses) { + if (!C->isLeader()) + continue; - // Same for the branch funnels. - llvm::sort(ICallBranchFunnels, - [&](ICallBranchFunnel *F1, ICallBranchFunnel *F2) { - return F1->UniqueId < F2->UniqueId; - }); + ++NumTypeIdDisjointSets; + // Build the list of type identifiers in this disjoint set. + std::vector TypeIds; + std::vector Globals; + std::vector ICallBranchFunnels; + for (auto M : GlobalClasses.members(*C)) { + if (isa(M)) + TypeIds.push_back(cast(M)); + else if (isa(M)) + Globals.push_back(cast(M)); + else + ICallBranchFunnels.push_back(cast(M)); + } - // Build bitsets for this disjoint set. - buildBitSetsFromDisjointSet(TypeIds, Globals, ICallBranchFunnels); + // Order type identifiers by unique ID for determinism. This ordering is + // stable as there is a one-to-one mapping between metadata and unique + // IDs. 
+ llvm::sort(TypeIds, [&](Metadata *M1, Metadata *M2) { + return TypeIdInfo[M1].UniqueId < TypeIdInfo[M2].UniqueId; + }); + + // Same for the branch funnels. + llvm::sort(ICallBranchFunnels, + [&](ICallBranchFunnel *F1, ICallBranchFunnel *F2) { + return F1->UniqueId < F2->UniqueId; + }); + + // Build bitsets for this disjoint set. + buildBitSetsFromDisjointSet(TypeIds, Globals, ICallBranchFunnels); + } } allocateByteArrays(); From 18855ece3c34a0d76a2126538d60760ddeee2de8 Mon Sep 17 00:00:00 2001 From: Dave Lee Date: Wed, 16 Apr 2025 17:31:01 -0700 Subject: [PATCH 197/710] [lldb] Add summary for NSIndirectTaggedPointerString (#136025) rdar://143164164 --- .../source/Plugins/Language/ObjC/NSString.cpp | 51 ++++++++++++++++--- lldb/source/Plugins/Language/ObjC/NSString.h | 4 ++ .../Plugins/Language/ObjC/ObjCLanguage.cpp | 4 ++ .../objc/foundation/tagged/strings/Makefile | 3 ++ .../tagged/strings/TestObjCTaggedStrings.py | 12 +++++ .../objc/foundation/tagged/strings/main.m | 17 +++++++ 6 files changed, 85 insertions(+), 6 deletions(-) create mode 100644 lldb/test/API/lang/objc/foundation/tagged/strings/Makefile create mode 100644 lldb/test/API/lang/objc/foundation/tagged/strings/TestObjCTaggedStrings.py create mode 100644 lldb/test/API/lang/objc/foundation/tagged/strings/main.m diff --git a/lldb/source/Plugins/Language/ObjC/NSString.cpp b/lldb/source/Plugins/Language/ObjC/NSString.cpp index a99d042572bfe..2626b9a3f7b8a 100644 --- a/lldb/source/Plugins/Language/ObjC/NSString.cpp +++ b/lldb/source/Plugins/Language/ObjC/NSString.cpp @@ -63,12 +63,17 @@ bool lldb_private::formatters::NSStringSummaryProvider( if (class_name.empty()) return false; - bool is_tagged_ptr = class_name == "NSTaggedPointerString" && - descriptor->GetTaggedPointerInfo(); - // for a tagged pointer, the descriptor has everything we need - if (is_tagged_ptr) - return NSTaggedString_SummaryProvider(valobj, descriptor, stream, - summary_options); + // For tagged pointers, the descriptor has everything 
needed. + bool is_tagged = descriptor->GetTaggedPointerInfo(); + if (is_tagged) { + if (class_name == "NSTaggedPointerString") + return NSTaggedString_SummaryProvider(valobj, descriptor, stream, + summary_options); + + if (class_name == "NSIndirectTaggedPointerString") + return NSIndirectTaggedString_SummaryProvider(valobj, descriptor, stream, + summary_options); + } auto &additionals_map(NSString_Additionals::GetAdditionalSummaries()); auto iter = additionals_map.find(class_name_cs), end = additionals_map.end(); @@ -368,3 +373,37 @@ bool lldb_private::formatters::NSTaggedString_SummaryProvider( stream << suffix; return true; } + +bool lldb_private::formatters::NSIndirectTaggedString_SummaryProvider( + ValueObject &valobj, ObjCLanguageRuntime::ClassDescriptorSP descriptor, + Stream &stream, const TypeSummaryOptions &summary_options) { + if (!descriptor) + return false; + + uint64_t payload = 0; + if (!descriptor->GetTaggedPointerInfo(nullptr, nullptr, &payload)) + return false; + + // First 47 bits are the address of the contents. + addr_t ptr = payload & 0x7fffffffffffULL; + // Next 13 bits are the string's length. 
+ size_t size = (payload >> 47) & 0x1fff; + + Status status; + std::vector buf(size); + if (auto process_sp = valobj.GetProcessSP()) + if (process_sp->ReadMemory(ptr, buf.data(), size, status)) { + llvm::StringRef prefix, suffix; + if (auto *language = Language::FindPlugin(summary_options.GetLanguage())) + std::tie(prefix, suffix) = + language->GetFormatterPrefixSuffix("NSString"); + stream << prefix << '"'; + stream.PutCString({buf.data(), size}); + stream << '"' << suffix; + return true; + } + + if (status.Fail()) + stream.Format("<{0}>", status); + return false; +} diff --git a/lldb/source/Plugins/Language/ObjC/NSString.h b/lldb/source/Plugins/Language/ObjC/NSString.h index 8c9fcf955f1f8..5d405b30b6817 100644 --- a/lldb/source/Plugins/Language/ObjC/NSString.h +++ b/lldb/source/Plugins/Language/ObjC/NSString.h @@ -25,6 +25,10 @@ bool NSTaggedString_SummaryProvider( ValueObject &valobj, ObjCLanguageRuntime::ClassDescriptorSP descriptor, Stream &stream, const TypeSummaryOptions &summary_options); +bool NSIndirectTaggedString_SummaryProvider( + ValueObject &valobj, ObjCLanguageRuntime::ClassDescriptorSP descriptor, + Stream &stream, const TypeSummaryOptions &summary_options); + bool NSAttributedStringSummaryProvider(ValueObject &valobj, Stream &stream, const TypeSummaryOptions &options); diff --git a/lldb/source/Plugins/Language/ObjC/ObjCLanguage.cpp b/lldb/source/Plugins/Language/ObjC/ObjCLanguage.cpp index c835b439a64dd..3b8e21cbb9269 100644 --- a/lldb/source/Plugins/Language/ObjC/ObjCLanguage.cpp +++ b/lldb/source/Plugins/Language/ObjC/ObjCLanguage.cpp @@ -691,6 +691,10 @@ static void LoadObjCFormatters(TypeCategoryImplSP objc_category_sp) { AddCXXSummary( objc_category_sp, lldb_private::formatters::NSStringSummaryProvider, "NSString summary provider", "NSTaggedPointerString", appkit_flags); + AddCXXSummary(objc_category_sp, + lldb_private::formatters::NSStringSummaryProvider, + "NSString summary provider", "NSIndirectTaggedPointerString", + appkit_flags); 
AddCXXSummary(objc_category_sp, lldb_private::formatters::NSAttributedStringSummaryProvider, diff --git a/lldb/test/API/lang/objc/foundation/tagged/strings/Makefile b/lldb/test/API/lang/objc/foundation/tagged/strings/Makefile new file mode 100644 index 0000000000000..a3198db9e8e88 --- /dev/null +++ b/lldb/test/API/lang/objc/foundation/tagged/strings/Makefile @@ -0,0 +1,3 @@ +OBJC_SOURCES := main.m +LD_EXTRAS := -framework Foundation +include Makefile.rules diff --git a/lldb/test/API/lang/objc/foundation/tagged/strings/TestObjCTaggedStrings.py b/lldb/test/API/lang/objc/foundation/tagged/strings/TestObjCTaggedStrings.py new file mode 100644 index 0000000000000..66dc895b6a9ed --- /dev/null +++ b/lldb/test/API/lang/objc/foundation/tagged/strings/TestObjCTaggedStrings.py @@ -0,0 +1,12 @@ +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class TestCase(TestBase): + def test(self): + """Verify summary formatter for tagged strings.""" + self.build() + lldbutil.run_to_source_breakpoint(self, "break here", lldb.SBFileSpec("main.m")) + self.expect("v str1 str2", patterns=['@"nineDigit"', '@"tenDigitXX"']) diff --git a/lldb/test/API/lang/objc/foundation/tagged/strings/main.m b/lldb/test/API/lang/objc/foundation/tagged/strings/main.m new file mode 100644 index 0000000000000..c2b5575986a99 --- /dev/null +++ b/lldb/test/API/lang/objc/foundation/tagged/strings/main.m @@ -0,0 +1,17 @@ +#import + +@interface NSObject (Fake) +// 9 digit selector +- (void)nineDigit; +// 10 digit selector +- (void)tenDigitXX; +@end + +int main() { + SEL sel1 = @selector(nineDigit); + NSString *str1 = NSStringFromSelector(sel1); + SEL sel2 = @selector(tenDigitXX); + NSString *str2 = NSStringFromSelector(sel2); + NSLog(@"break here %@, %@", str1, str2); + return 0; +} From 78671dbb26628f072cb94de784349c50201c1e0f Mon Sep 17 00:00:00 2001 From: Sergei Barannikov Date: Thu, 17 Apr 2025 03:53:08 +0300 Subject: [PATCH 
198/710] [Sparc] Use helper class for emitting CFI instructions into MIR (#136027) Also, guard emission by `needsFrameMoves()` check. There are no changes in tests because cfi instructions are currently ignored by AsmPrinter when they don't need to be printed/encoded. PR: https://github.com/llvm/llvm-project/pull/136027 --- llvm/include/llvm/CodeGen/CFIInstBuilder.h | 4 +++ llvm/lib/Target/Sparc/SparcFrameLowering.cpp | 34 ++++---------------- 2 files changed, 11 insertions(+), 27 deletions(-) diff --git a/llvm/include/llvm/CodeGen/CFIInstBuilder.h b/llvm/include/llvm/CodeGen/CFIInstBuilder.h index 9025624c0d8ab..bf7102eeea497 100644 --- a/llvm/include/llvm/CodeGen/CFIInstBuilder.h +++ b/llvm/include/llvm/CodeGen/CFIInstBuilder.h @@ -82,6 +82,10 @@ class CFIInstBuilder { TRI.getDwarfRegNum(Reg2, IsEH))); } + void buildWindowSave() const { + insertCFIInst(MCCFIInstruction::createWindowSave(nullptr)); + } + void buildRestore(MCRegister Reg) const { insertCFIInst(MCCFIInstruction::createRestore( nullptr, TRI.getDwarfRegNum(Reg, IsEH))); diff --git a/llvm/lib/Target/Sparc/SparcFrameLowering.cpp b/llvm/lib/Target/Sparc/SparcFrameLowering.cpp index 14233a526eec1..2934c88b6bffc 100644 --- a/llvm/lib/Target/Sparc/SparcFrameLowering.cpp +++ b/llvm/lib/Target/Sparc/SparcFrameLowering.cpp @@ -14,6 +14,7 @@ #include "SparcInstrInfo.h" #include "SparcMachineFunctionInfo.h" #include "SparcSubtarget.h" +#include "llvm/CodeGen/CFIInstBuilder.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -88,14 +89,7 @@ void SparcFrameLowering::emitPrologue(MachineFunction &MF, assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported"); MachineFrameInfo &MFI = MF.getFrameInfo(); const SparcSubtarget &Subtarget = MF.getSubtarget(); - const SparcInstrInfo &TII = - *static_cast(Subtarget.getInstrInfo()); - const SparcRegisterInfo &RegInfo = - *static_cast(Subtarget.getRegisterInfo()); 
MachineBasicBlock::iterator MBBI = MBB.begin(); - // Debug location must be unknown since the first debug location is used - // to determine the end of the prologue. - DebugLoc dl; // Get the number of bytes to allocate from the FrameInfo int NumBytes = (int) MFI.getStackSize(); @@ -141,26 +135,12 @@ void SparcFrameLowering::emitPrologue(MachineFunction &MF, emitSPAdjustment(MF, MBB, MBBI, -NumBytes, SAVErr, SAVEri); - unsigned regFP = RegInfo.getDwarfRegNum(SP::I6, true); - - // Emit ".cfi_def_cfa_register 30". - unsigned CFIIndex = - MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, regFP)); - BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - - // Emit ".cfi_window_save". - CFIIndex = MF.addFrameInst(MCCFIInstruction::createWindowSave(nullptr)); - BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - - unsigned regInRA = RegInfo.getDwarfRegNum(SP::I7, true); - unsigned regOutRA = RegInfo.getDwarfRegNum(SP::O7, true); - // Emit ".cfi_register 15, 31". - CFIIndex = MF.addFrameInst( - MCCFIInstruction::createRegister(nullptr, regOutRA, regInRA)); - BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); + if (MF.needsFrameMoves()) { + CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::NoFlags); + CFIBuilder.buildDefCFARegister(SP::I6); + CFIBuilder.buildWindowSave(); + CFIBuilder.buildRegister(SP::O7, SP::I7); + } } MachineBasicBlock::iterator SparcFrameLowering:: From 6d8bf3cf3dcc5d85bec7b1e70a59a02cdfdaa1b4 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 16 Apr 2025 18:24:10 -0700 Subject: [PATCH 199/710] Revert "Reapply "[LLVM][TableGen] Parameterize NumToSkip in DecoderEmitter" (#136017)" (#136068) Reverts llvm/llvm-project#136019 Expensive checks tests are failing, so reverting. 
--- llvm/lib/Target/AArch64/CMakeLists.txt | 2 +- llvm/test/TableGen/VarLenDecoder.td | 4 +- llvm/test/TableGen/trydecode-emission.td | 10 +- llvm/test/TableGen/trydecode-emission2.td | 16 +-- llvm/test/TableGen/trydecode-emission3.td | 2 +- llvm/test/TableGen/trydecode-emission4.td | 2 +- llvm/utils/TableGen/DecoderEmitter.cpp | 115 ++++++++++------------ 7 files changed, 68 insertions(+), 83 deletions(-) diff --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt index ba1d1605ec104..2300e479bc110 100644 --- a/llvm/lib/Target/AArch64/CMakeLists.txt +++ b/llvm/lib/Target/AArch64/CMakeLists.txt @@ -7,7 +7,7 @@ tablegen(LLVM AArch64GenAsmWriter.inc -gen-asm-writer) tablegen(LLVM AArch64GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1) tablegen(LLVM AArch64GenCallingConv.inc -gen-callingconv) tablegen(LLVM AArch64GenDAGISel.inc -gen-dag-isel) -tablegen(LLVM AArch64GenDisassemblerTables.inc -gen-disassembler --num-to-skip-size=3) +tablegen(LLVM AArch64GenDisassemblerTables.inc -gen-disassembler) tablegen(LLVM AArch64GenFastISel.inc -gen-fast-isel) tablegen(LLVM AArch64GenGlobalISel.inc -gen-global-isel) tablegen(LLVM AArch64GenO0PreLegalizeGICombiner.inc -gen-global-isel-combiner diff --git a/llvm/test/TableGen/VarLenDecoder.td b/llvm/test/TableGen/VarLenDecoder.td index b77702ff7c5c1..5cf0bf8911859 100644 --- a/llvm/test/TableGen/VarLenDecoder.td +++ b/llvm/test/TableGen/VarLenDecoder.td @@ -47,9 +47,9 @@ def FOO32 : MyVarInst { } // CHECK: MCD::OPC_ExtractField, 3, 5, // Inst{7-3} ... 
-// CHECK-NEXT: MCD::OPC_FilterValue, 8, 4, 0, // Skip to: 11 +// CHECK-NEXT: MCD::OPC_FilterValue, 8, 4, 0, 0, // Skip to: 12 // CHECK-NEXT: MCD::OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 0, // Opcode: FOO16 -// CHECK-NEXT: MCD::OPC_FilterValue, 9, 4, 0, // Skip to: 19 +// CHECK-NEXT: MCD::OPC_FilterValue, 9, 4, 0, 0, // Skip to: 21 // CHECK-NEXT: MCD::OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 1, // Opcode: FOO32 // CHECK-NEXT: MCD::OPC_Fail, diff --git a/llvm/test/TableGen/trydecode-emission.td b/llvm/test/TableGen/trydecode-emission.td index 2b4239f4fbe65..20d2446eeac7f 100644 --- a/llvm/test/TableGen/trydecode-emission.td +++ b/llvm/test/TableGen/trydecode-emission.td @@ -34,10 +34,10 @@ def InstB : TestInstruction { } // CHECK: /* 0 */ MCD::OPC_ExtractField, 4, 4, // Inst{7-4} ... -// CHECK-NEXT: /* 3 */ MCD::OPC_FilterValue, 0, 16, 0, // Skip to: 23 -// CHECK-NEXT: /* 7 */ MCD::OPC_CheckField, 2, 2, 0, 6, 0, // Skip to: 19 -// CHECK-NEXT: /* 13 */ MCD::OPC_TryDecode, {{[0-9]+}}, {{[0-9]+}}, 0, 0, 0, // Opcode: InstB, skip to: 19 -// CHECK-NEXT: /* 19 */ MCD::OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 1, // Opcode: InstA -// CHECK-NEXT: /* 23 */ MCD::OPC_Fail, +// CHECK-NEXT: /* 3 */ MCD::OPC_FilterValue, 0, 18, 0, 0, // Skip to: 26 +// CHECK-NEXT: /* 8 */ MCD::OPC_CheckField, 2, 2, 0, 7, 0, 0, // Skip to: 22 +// CHECK-NEXT: /* 15 */ MCD::OPC_TryDecode, {{[0-9]+}}, {{[0-9]+}}, 0, 0, 0, 0, // Opcode: InstB, skip to: 22 +// CHECK-NEXT: /* 22 */ MCD::OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 1, // Opcode: InstA +// CHECK-NEXT: /* 26 */ MCD::OPC_Fail, // CHECK: if (!Check(S, DecodeInstB(MI, insn, Address, Decoder))) { DecodeComplete = false; return MCDisassembler::Fail; } diff --git a/llvm/test/TableGen/trydecode-emission2.td b/llvm/test/TableGen/trydecode-emission2.td index 7d30474058f73..0584034e41233 100644 --- a/llvm/test/TableGen/trydecode-emission2.td +++ b/llvm/test/TableGen/trydecode-emission2.td @@ -31,14 +31,14 @@ def InstB : TestInstruction { } // CHECK: /* 0 */ 
MCD::OPC_ExtractField, 2, 1, // Inst{2} ... -// CHECK-NEXT: /* 3 */ MCD::OPC_FilterValue, 0, 31, 0, // Skip to: 38 -// CHECK-NEXT: /* 7 */ MCD::OPC_ExtractField, 5, 3, // Inst{7-5} ... -// CHECK-NEXT: /* 10 */ MCD::OPC_FilterValue, 0, 24, 0, // Skip to: 38 -// CHECK-NEXT: /* 14 */ MCD::OPC_CheckField, 0, 2, 3, 6, 0, // Skip to: 26 -// CHECK-NEXT: /* 20 */ MCD::OPC_TryDecode, {{[0-9]+}}, {{[0-9]+}}, 0, 0, 0, // Opcode: InstB, skip to: 26 -// CHECK-NEXT: /* 26 */ MCD::OPC_CheckField, 3, 2, 0, 6, 0, // Skip to: 38 -// CHECK-NEXT: /* 32 */ MCD::OPC_TryDecode, {{[0-9]+}}, {{[0-9]+}}, 1, 0, 0, // Opcode: InstA, skip to: 38 -// CHECK-NEXT: /* 38 */ MCD::OPC_Fail, +// CHECK-NEXT: /* 3 */ MCD::OPC_FilterValue, 0, 36, 0, 0, // Skip to: 44 +// CHECK-NEXT: /* 8 */ MCD::OPC_ExtractField, 5, 3, // Inst{7-5} ... +// CHECK-NEXT: /* 11 */ MCD::OPC_FilterValue, 0, 28, 0, 0, // Skip to: 44 +// CHECK-NEXT: /* 16 */ MCD::OPC_CheckField, 0, 2, 3, 7, 0, 0, // Skip to: 30 +// CHECK-NEXT: /* 23 */ MCD::OPC_TryDecode, {{[0-9]+}}, {{[0-9]+}}, 0, 0, 0, 0, // Opcode: InstB, skip to: 30 +// CHECK-NEXT: /* 30 */ MCD::OPC_CheckField, 3, 2, 0, 7, 0, 0, // Skip to: 44 +// CHECK-NEXT: /* 37 */ MCD::OPC_TryDecode, {{[0-9]+}}, {{[0-9]+}}, 1, 0, 0, 0, // Opcode: InstA, skip to: 44 +// CHECK-NEXT: /* 44 */ MCD::OPC_Fail, // CHECK: if (!Check(S, DecodeInstB(MI, insn, Address, Decoder))) { DecodeComplete = false; return MCDisassembler::Fail; } // CHECK: if (!Check(S, DecodeInstA(MI, insn, Address, Decoder))) { DecodeComplete = false; return MCDisassembler::Fail; } diff --git a/llvm/test/TableGen/trydecode-emission3.td b/llvm/test/TableGen/trydecode-emission3.td index 0abbe62fe337e..4c5be7e1af229 100644 --- a/llvm/test/TableGen/trydecode-emission3.td +++ b/llvm/test/TableGen/trydecode-emission3.td @@ -1,4 +1,4 @@ - // RUN: llvm-tblgen -gen-disassembler --num-to-skip-size=3 -I %p/../../include %s | FileCheck %s +// RUN: llvm-tblgen -gen-disassembler -I %p/../../include %s | FileCheck %s include 
"llvm/Target/Target.td" diff --git a/llvm/test/TableGen/trydecode-emission4.td b/llvm/test/TableGen/trydecode-emission4.td index 413e4a0d1275a..1e51ba5e40768 100644 --- a/llvm/test/TableGen/trydecode-emission4.td +++ b/llvm/test/TableGen/trydecode-emission4.td @@ -1,4 +1,4 @@ -// RUN: llvm-tblgen -gen-disassembler --num-to-skip-size=3 -I %p/../../include %s | FileCheck %s +// RUN: llvm-tblgen -gen-disassembler -I %p/../../include %s | FileCheck %s // Test for OPC_ExtractField/OPC_CheckField with start bit > 255. // These large start values may arise for architectures with long instruction diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp index eff63c6b45bb3..9c6015cc24576 100644 --- a/llvm/utils/TableGen/DecoderEmitter.cpp +++ b/llvm/utils/TableGen/DecoderEmitter.cpp @@ -32,10 +32,8 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/LEB128.h" -#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" @@ -78,12 +76,6 @@ static cl::opt DecoderEmitterSuppressDuplicates( "significantly reducing Table Duplications")), cl::init(SUPPRESSION_DISABLE), cl::cat(DisassemblerEmitterCat)); -static cl::opt - NumToSkipSizeInBytes("num-to-skip-size", - cl::desc("number of bytes to use for num-to-skip " - "entries in the decoder table (2 or 3)"), - cl::init(2), cl::cat(DisassemblerEmitterCat)); - STATISTIC(NumEncodings, "Number of encodings considered"); STATISTIC(NumEncodingsLackingDisasm, "Number of encodings without disassembler info"); @@ -138,29 +130,10 @@ struct DecoderTable : public std::vector { // in the table for patching. 
size_t insertNumToSkip() { size_t Size = size(); - insert(end(), NumToSkipSizeInBytes, 0); + insert(end(), 3, 0); return Size; } - - void patchNumToSkip(size_t FixupIdx, uint32_t DestIdx) { - // Calculate the distance from the byte following the fixup entry byte - // to the destination. The Target is calculated from after the - // `NumToSkipSizeInBytes`-byte NumToSkip entry itself, so subtract - // `NumToSkipSizeInBytes` from the displacement here to account for that. - assert(DestIdx >= FixupIdx + NumToSkipSizeInBytes && - "Expecting a forward jump in the decoding table"); - uint32_t Delta = DestIdx - FixupIdx - NumToSkipSizeInBytes; - if (!isUIntN(8 * NumToSkipSizeInBytes, Delta)) - PrintFatalError( - "disassembler decoding table too large, try --num-to-skip-size=3"); - - (*this)[FixupIdx] = static_cast(Delta); - (*this)[FixupIdx + 1] = static_cast(Delta >> 8); - if (NumToSkipSizeInBytes == 3) - (*this)[FixupIdx + 2] = static_cast(Delta >> 16); - } }; - struct DecoderTableInfo { DecoderTable Table; FixupScopeList FixupStack; @@ -717,8 +690,19 @@ static void resolveTableFixups(DecoderTable &Table, const FixupList &Fixups, uint32_t DestIdx) { // Any NumToSkip fixups in the current scope can resolve to the // current location. - for (uint32_t FixupIdx : Fixups) - Table.patchNumToSkip(FixupIdx, DestIdx); + for (uint32_t FixupIdx : reverse(Fixups)) { + // Calculate the distance from the byte following the fixup entry byte + // to the destination. The Target is calculated from after the 24-bit + // NumToSkip entry itself, so subtract three from the displacement here + // to account for that. + uint32_t Delta = DestIdx - FixupIdx - 3; + // Our NumToSkip entries are 24-bits. Make sure our table isn't too + // big. 
+ assert(isUInt<24>(Delta)); + Table[FixupIdx] = (uint8_t)Delta; + Table[FixupIdx + 1] = (uint8_t)(Delta >> 8); + Table[FixupIdx + 2] = (uint8_t)(Delta >> 16); + } } // Emit table entries to decode instructions given a segment or segments @@ -775,9 +759,15 @@ void Filter::emitTableEntry(DecoderTableInfo &TableInfo) const { Delegate->emitTableEntries(TableInfo); // Now that we've emitted the body of the handler, update the NumToSkip - // of the filter itself to be able to skip forward when false. - if (PrevFilter) - Table.patchNumToSkip(PrevFilter, Table.size()); + // of the filter itself to be able to skip forward when false. Subtract + // three as to account for the width of the NumToSkip field itself. + if (PrevFilter) { + uint32_t NumToSkip = Table.size() - PrevFilter - 3; + assert(isUInt<24>(NumToSkip) && "disassembler decoding table too large!"); + Table[PrevFilter] = (uint8_t)NumToSkip; + Table[PrevFilter + 1] = (uint8_t)(NumToSkip >> 8); + Table[PrevFilter + 2] = (uint8_t)(NumToSkip >> 16); + } } // If there is no fallthrough, then the final filter should get fixed @@ -824,8 +814,7 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table, OS << (unsigned)*I++ << ", "; }; - // Emit `NumToSkipSizeInBytes`-byte numtoskip value to OS, returning the - // NumToSkip value. + // Emit 24-bit numtoskip value to OS, returning the NumToSkip value. 
auto emitNumToSkip = [](DecoderTable::const_iterator &I, formatted_raw_ostream &OS) { uint8_t Byte = *I++; @@ -834,11 +823,9 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table, Byte = *I++; OS << (unsigned)Byte << ", "; NumToSkip |= Byte << 8; - if (NumToSkipSizeInBytes == 3) { - Byte = *I++; - OS << (unsigned)(Byte) << ", "; - NumToSkip |= Byte << 16; - } + Byte = *I++; + OS << (unsigned)(Byte) << ", "; + NumToSkip |= Byte << 16; return NumToSkip; }; @@ -880,7 +867,7 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table, // The filter value is ULEB128 encoded. emitULEB128(I, OS); - // numtoskip value. + // 24-bit numtoskip value. uint32_t NumToSkip = emitNumToSkip(I, OS); OS << "// Skip to: " << ((I - Table.begin()) + NumToSkip) << "\n"; break; @@ -896,7 +883,7 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table, // ULEB128 encoded field value. emitULEB128(I, OS); - // numtoskip value. + // 24-bit numtoskip value. uint32_t NumToSkip = emitNumToSkip(I, OS); OS << "// Skip to: " << ((I - Table.begin()) + NumToSkip) << "\n"; break; @@ -906,7 +893,7 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table, OS << Indent << "MCD::OPC_CheckPredicate, "; emitULEB128(I, OS); - // numtoskip value. + // 24-bit numtoskip value. uint32_t NumToSkip = emitNumToSkip(I, OS); OS << "// Skip to: " << ((I - Table.begin()) + NumToSkip) << "\n"; break; @@ -938,7 +925,7 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table, // Fallthrough for OPC_TryDecode. - // numtoskip value. + // 24-bit numtoskip value. 
uint32_t NumToSkip = emitNumToSkip(I, OS); OS << "// Opcode: " << NumberedEncodings[EncodingID] @@ -1424,9 +1411,9 @@ void FilterChooser::emitSingletonTableEntry(DecoderTableInfo &TableInfo, TableInfo.Table.push_back(NumBits); TableInfo.Table.insertULEB128(Ilnd.FieldVal); - // Allocate space in the table for fixup (NumToSkipSizeInBytes) so all - // our relative position calculations work OK even before we fully - // resolve the real value here. + // The fixup is always 24-bits, so go ahead and allocate the space + // in the table so all our relative position calculations work OK even + // before we fully resolve the real value here. // Push location for NumToSkip backpatching. TableInfo.FixupStack.back().push_back(TableInfo.Table.insertNumToSkip()); @@ -2170,18 +2157,7 @@ insertBits(InsnType &field, uint64_t bits, unsigned startBit, unsigned numBits) // decodeInstruction(). static void emitDecodeInstruction(formatted_raw_ostream &OS, bool IsVarLenInst) { - OS << formatv("\nconstexpr unsigned NumToSkipSizeInBytes = {};\n", - NumToSkipSizeInBytes); - OS << R"( -inline unsigned decodeNumToSkip(const uint8_t *&Ptr) { - unsigned NumToSkip = *Ptr++; - NumToSkip |= (*Ptr++) << 8; - if constexpr (NumToSkipSizeInBytes == 3) - NumToSkip |= (*Ptr++) << 16; - return NumToSkip; -} - template static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI, InsnType insn, uint64_t Address, @@ -2219,7 +2195,10 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI, // Decode the field value. uint64_t Val = decodeULEB128AndIncUnsafe(++Ptr); bool Failed = Val != CurFieldValue; - unsigned NumToSkip = decodeNumToSkip(Ptr); + // NumToSkip is a plain 24-bit integer. + unsigned NumToSkip = *Ptr++; + NumToSkip |= (*Ptr++) << 8; + NumToSkip |= (*Ptr++) << 16; // Perform the filter operation. 
if (Failed) @@ -2243,7 +2222,10 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI, uint64_t ExpectedValue = decodeULEB128(++Ptr, &PtrLen); Ptr += PtrLen; bool Failed = ExpectedValue != FieldValue; - unsigned NumToSkip = decodeNumToSkip(Ptr); + // NumToSkip is a plain 24-bit integer. + unsigned NumToSkip = *Ptr++; + NumToSkip |= (*Ptr++) << 8; + NumToSkip |= (*Ptr++) << 16; // If the actual and expected values don't match, skip. if (Failed) @@ -2258,7 +2240,10 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI, case MCD::OPC_CheckPredicate: { // Decode the Predicate Index value. unsigned PIdx = decodeULEB128AndIncUnsafe(++Ptr); - unsigned NumToSkip = decodeNumToSkip(Ptr); + // NumToSkip is a plain 24-bit integer. + unsigned NumToSkip = *Ptr++; + NumToSkip |= (*Ptr++) << 8; + NumToSkip |= (*Ptr++) << 16; // Check the predicate. bool Failed = !checkDecoderPredicate(PIdx, Bits); if (Failed) @@ -2293,7 +2278,10 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI, // Decode the Opcode value. unsigned Opc = decodeULEB128AndIncUnsafe(++Ptr); unsigned DecodeIdx = decodeULEB128AndIncUnsafe(Ptr); - unsigned NumToSkip = decodeNumToSkip(Ptr); + // NumToSkip is a plain 24-bit integer. + unsigned NumToSkip = *Ptr++; + NumToSkip |= (*Ptr++) << 8; + NumToSkip |= (*Ptr++) << 16; // Perform the decode operation. MCInst TmpMI; @@ -2418,9 +2406,6 @@ handleHwModesUnrelatedEncodings(const CodeGenInstruction *Instr, // Emits disassembler code for instruction decoding. 
void DecoderEmitter::run(raw_ostream &o) { - if (NumToSkipSizeInBytes != 2 && NumToSkipSizeInBytes != 3) - PrintFatalError("Invalid value for num-to-skip-size, must be 2 or 3"); - formatted_raw_ostream OS(o); OS << R"( #include "llvm/MC/MCInst.h" From 6b8d072cfd41f647f2c241f0a1a0843a279d049b Mon Sep 17 00:00:00 2001 From: Wu Yingcong Date: Thu, 17 Apr 2025 09:33:48 +0800 Subject: [PATCH 200/710] [libc] Fix incorrect unsigned comparison (#135595) There is a problem with such unsigned comparison pattern: ``` if(unsigned_a - unsigned_b > 0) { /* only NOT go here when unsigned_a==unsigned_b */ } ``` When `unsigned_a` < `unsigned_b`, the result will still be `>0` due to underflow. This patch fixes two of the occurrences I found. Also remove two redundant `if` where its condition is guaranteed by outer `if`. --- libc/src/stdio/printf_core/float_dec_converter.h | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/libc/src/stdio/printf_core/float_dec_converter.h b/libc/src/stdio/printf_core/float_dec_converter.h index ee5549825a6f2..ed004f9a26a13 100644 --- a/libc/src/stdio/printf_core/float_dec_converter.h +++ b/libc/src/stdio/printf_core/float_dec_converter.h @@ -186,13 +186,12 @@ template class FloatWriter { if (total_digits_written < digits_before_decimal && total_digits_written + buffered_digits >= digits_before_decimal && has_decimal_point) { + // digits_to_write > 0 guaranteed by outer if size_t digits_to_write = digits_before_decimal - total_digits_written; - if (digits_to_write > 0) { - // Write the digits before the decimal point. - RET_IF_RESULT_NEGATIVE(writer->write({block_buffer, digits_to_write})); - } + // Write the digits before the decimal point. + RET_IF_RESULT_NEGATIVE(writer->write({block_buffer, digits_to_write})); RET_IF_RESULT_NEGATIVE(writer->write(DECIMAL_POINT)); - if (buffered_digits - digits_to_write > 0) { + if (buffered_digits > digits_to_write) { // Write the digits after the decimal point. 
RET_IF_RESULT_NEGATIVE( writer->write({block_buffer + digits_to_write, @@ -217,12 +216,11 @@ template class FloatWriter { total_digits_written + BLOCK_SIZE * max_block_count >= digits_before_decimal && has_decimal_point) { + // digits_to_write > 0 guaranteed by outer if size_t digits_to_write = digits_before_decimal - total_digits_written; - if (digits_to_write > 0) { - RET_IF_RESULT_NEGATIVE(writer->write(MAX_BLOCK_DIGIT, digits_to_write)); - } + RET_IF_RESULT_NEGATIVE(writer->write(MAX_BLOCK_DIGIT, digits_to_write)); RET_IF_RESULT_NEGATIVE(writer->write(DECIMAL_POINT)); - if ((BLOCK_SIZE * max_block_count) - digits_to_write > 0) { + if ((BLOCK_SIZE * max_block_count) > digits_to_write) { RET_IF_RESULT_NEGATIVE(writer->write( MAX_BLOCK_DIGIT, (BLOCK_SIZE * max_block_count) - digits_to_write)); } From 53eae22e228532fe3349890b6c7fc10b9c10dbee Mon Sep 17 00:00:00 2001 From: joaosaffran <126493771+joaosaffran@users.noreply.github.com> Date: Wed, 16 Apr 2025 18:38:53 -0700 Subject: [PATCH 201/710] [DirectX] adding support in obj2yaml and yaml2obj to root constants (#127840) Adding support for Root Constant in MC, Object and obj2yaml and yaml2obj, this PR adds: - new structures to dxbc definition. - serialize and deserialize logic from dxcontainer to yaml - tests validating against dxc - adding support to multiple parts.
Closes: https://github.com/llvm/llvm-project/issues/126633 --------- Co-authored-by: joaosaffran --- llvm/include/llvm/BinaryFormat/DXContainer.h | 77 ++++++++++- .../BinaryFormat/DXContainerConstants.def | 18 +++ .../llvm/MC/DXContainerRootSignature.h | 24 +++- llvm/include/llvm/Object/DXContainer.h | 72 +++++++++- .../include/llvm/ObjectYAML/DXContainerYAML.h | 38 +++++- llvm/lib/BinaryFormat/DXContainer.cpp | 20 +++ llvm/lib/MC/DXContainerRootSignature.cpp | 76 ++++++++++- llvm/lib/Object/DXContainer.cpp | 31 ++--- llvm/lib/ObjectYAML/DXContainerEmitter.cpp | 19 ++- llvm/lib/ObjectYAML/DXContainerYAML.cpp | 84 ++++++++++-- llvm/lib/Target/DirectX/DXILRootSignature.cpp | 17 ++- .../ContainerData/RootSignature-Flags.ll | 3 +- .../RootSignature-MultipleEntryFunctions.ll | 8 +- .../DXContainer/RootSignature-Flags.yaml | 18 +-- .../RootSignature-InvalidType.yaml | 29 +++++ .../RootSignature-InvalidVisibility.yaml | 33 +++++ .../RootSignature-MultipleParameters.yaml | 59 +++++++++ llvm/tools/obj2yaml/dxcontainer2yaml.cpp | 9 +- llvm/unittests/Object/DXContainerTest.cpp | 123 ++++++++++++++---- .../ObjectYAML/DXContainerYAMLTest.cpp | 111 +++++++++++++++- 20 files changed, 764 insertions(+), 105 deletions(-) create mode 100644 llvm/test/ObjectYAML/DXContainer/RootSignature-InvalidType.yaml create mode 100644 llvm/test/ObjectYAML/DXContainer/RootSignature-InvalidVisibility.yaml create mode 100644 llvm/test/ObjectYAML/DXContainer/RootSignature-MultipleParameters.yaml diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h index 28905e27837a7..455657980bf40 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainer.h +++ b/llvm/include/llvm/BinaryFormat/DXContainer.h @@ -14,6 +14,7 @@ #define LLVM_BINARYFORMAT_DXCONTAINER_H #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Error.h" #include "llvm/Support/SwapByteOrder.h" #include "llvm/TargetParser/Triple.h" @@ -157,6 +158,40 @@ enum class RootElementFlag : uint32_t { 
#include "DXContainerConstants.def" }; +#define ROOT_PARAMETER(Val, Enum) Enum = Val, +enum class RootParameterType : uint32_t { +#include "DXContainerConstants.def" +}; + +ArrayRef> getRootParameterTypes(); + +#define ROOT_PARAMETER(Val, Enum) \ + case Val: \ + return true; +inline bool isValidParameterType(uint32_t V) { + switch (V) { +#include "DXContainerConstants.def" + } + return false; +} + +#define SHADER_VISIBILITY(Val, Enum) Enum = Val, +enum class ShaderVisibility : uint32_t { +#include "DXContainerConstants.def" +}; + +ArrayRef> getShaderVisibility(); + +#define SHADER_VISIBILITY(Val, Enum) \ + case Val: \ + return true; +inline bool isValidShaderVisibility(uint32_t V) { + switch (V) { +#include "DXContainerConstants.def" + } + return false; +} + PartType parsePartType(StringRef S); struct VertexPSVInfo { @@ -546,15 +581,49 @@ struct ProgramSignatureElement { static_assert(sizeof(ProgramSignatureElement) == 32, "ProgramSignatureElement is misaligned"); -struct RootSignatureValidations { +// following dx12 naming +// https://learn.microsoft.com/en-us/windows/win32/api/d3d12/ns-d3d12-d3d12_root_constants +struct RootConstants { + uint32_t ShaderRegister; + uint32_t RegisterSpace; + uint32_t Num32BitValues; + + void swapBytes() { + sys::swapByteOrder(ShaderRegister); + sys::swapByteOrder(RegisterSpace); + sys::swapByteOrder(Num32BitValues); + } +}; - static bool isValidRootFlag(uint32_t Flags) { return (Flags & ~0xfff) == 0; } +struct RootParameterHeader { + uint32_t ParameterType; + uint32_t ShaderVisibility; + uint32_t ParameterOffset; - static bool isValidVersion(uint32_t Version) { - return (Version == 1 || Version == 2); + void swapBytes() { + sys::swapByteOrder(ParameterType); + sys::swapByteOrder(ShaderVisibility); + sys::swapByteOrder(ParameterOffset); } }; +struct RootSignatureHeader { + uint32_t Version; + uint32_t NumParameters; + uint32_t ParametersOffset; + uint32_t NumStaticSamplers; + uint32_t StaticSamplerOffset; + uint32_t Flags; + + void 
swapBytes() { + sys::swapByteOrder(Version); + sys::swapByteOrder(NumParameters); + sys::swapByteOrder(ParametersOffset); + sys::swapByteOrder(NumStaticSamplers); + sys::swapByteOrder(StaticSamplerOffset); + sys::swapByteOrder(Flags); + } +}; } // namespace dxbc } // namespace llvm diff --git a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def index 6d44ea14df444..590ded5e8c899 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def +++ b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def @@ -72,6 +72,24 @@ ROOT_ELEMENT_FLAG(11, SamplerHeapDirectlyIndexed) #undef ROOT_ELEMENT_FLAG #endif // ROOT_ELEMENT_FLAG +#ifdef ROOT_PARAMETER + +ROOT_PARAMETER(1, Constants32Bit) +#undef ROOT_PARAMETER +#endif // ROOT_PARAMETER + +#ifdef SHADER_VISIBILITY + +SHADER_VISIBILITY(0, All) +SHADER_VISIBILITY(1, Vertex) +SHADER_VISIBILITY(2, Hull) +SHADER_VISIBILITY(3, Domain) +SHADER_VISIBILITY(4, Geometry) +SHADER_VISIBILITY(5, Pixel) +SHADER_VISIBILITY(6, Amplification) +SHADER_VISIBILITY(7, Mesh) +#undef SHADER_VISIBILITY +#endif // SHADER_VISIBILITY #ifdef DXIL_MODULE_FLAG diff --git a/llvm/include/llvm/MC/DXContainerRootSignature.h b/llvm/include/llvm/MC/DXContainerRootSignature.h index e414112498798..fee799249b255 100644 --- a/llvm/include/llvm/MC/DXContainerRootSignature.h +++ b/llvm/include/llvm/MC/DXContainerRootSignature.h @@ -6,23 +6,33 @@ // //===----------------------------------------------------------------------===// +#include "llvm/BinaryFormat/DXContainer.h" #include #include namespace llvm { class raw_ostream; - namespace mcdxbc { + +struct RootParameter { + dxbc::RootParameterHeader Header; + union { + dxbc::RootConstants Constants; + }; +}; struct RootSignatureDesc { - uint32_t Version = 2; - uint32_t NumParameters = 0; - uint32_t RootParametersOffset = 0; - uint32_t NumStaticSamplers = 0; - uint32_t StaticSamplersOffset = 0; - uint32_t Flags = 0; + + uint32_t Version = 2U; + 
uint32_t Flags = 0U; + uint32_t RootParameterOffset = 0U; + uint32_t StaticSamplersOffset = 0u; + uint32_t NumStaticSamplers = 0u; + SmallVector Parameters; void write(raw_ostream &OS) const; + + size_t getSize() const; }; } // namespace mcdxbc } // namespace llvm diff --git a/llvm/include/llvm/Object/DXContainer.h b/llvm/include/llvm/Object/DXContainer.h index c3a2f756bd683..e8287ce078365 100644 --- a/llvm/include/llvm/Object/DXContainer.h +++ b/llvm/include/llvm/Object/DXContainer.h @@ -18,10 +18,12 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/BinaryFormat/DXContainer.h" +#include "llvm/Object/Error.h" #include "llvm/Support/Error.h" #include "llvm/Support/MemoryBufferRef.h" #include "llvm/TargetParser/Triple.h" #include +#include #include namespace llvm { @@ -116,6 +118,40 @@ template struct ViewArray { }; namespace DirectX { +struct RootParameterView { + const dxbc::RootParameterHeader &Header; + StringRef ParamData; + RootParameterView(const dxbc::RootParameterHeader &H, StringRef P) + : Header(H), ParamData(P) {} + + template Expected readParameter() { + T Struct; + if (sizeof(T) != ParamData.size()) + return make_error( + "Reading structure out of file bounds", object_error::parse_failed); + + memcpy(&Struct, ParamData.data(), sizeof(T)); + // DXContainer is always little endian + if (sys::IsBigEndianHost) + Struct.swapBytes(); + return Struct; + } +}; + +struct RootConstantView : RootParameterView { + static bool classof(const RootParameterView *V) { + return V->Header.ParameterType == + (uint32_t)dxbc::RootParameterType::Constants32Bit; + } + + llvm::Expected read() { + return readParameter(); + } +}; + +static Error parseFailed(const Twine &Msg) { + return make_error(Msg.str(), object_error::parse_failed); +} class RootSignature { private: @@ -125,17 +161,49 @@ class RootSignature { uint32_t NumStaticSamplers; uint32_t StaticSamplersOffset; uint32_t Flags; + ViewArray ParametersHeaders; + StringRef PartData; + + 
using param_header_iterator = ViewArray::iterator; public: - RootSignature() {} + RootSignature(StringRef PD) : PartData(PD) {} - Error parse(StringRef Data); + Error parse(); uint32_t getVersion() const { return Version; } uint32_t getNumParameters() const { return NumParameters; } uint32_t getRootParametersOffset() const { return RootParametersOffset; } uint32_t getNumStaticSamplers() const { return NumStaticSamplers; } uint32_t getStaticSamplersOffset() const { return StaticSamplersOffset; } + uint32_t getNumRootParameters() const { return ParametersHeaders.size(); } + llvm::iterator_range param_headers() const { + return llvm::make_range(ParametersHeaders.begin(), ParametersHeaders.end()); + } uint32_t getFlags() const { return Flags; } + + llvm::Expected + getParameter(const dxbc::RootParameterHeader &Header) const { + size_t DataSize; + + if (!dxbc::isValidParameterType(Header.ParameterType)) + return parseFailed("invalid parameter type"); + + switch (static_cast(Header.ParameterType)) { + case dxbc::RootParameterType::Constants32Bit: + DataSize = sizeof(dxbc::RootConstants); + break; + } + size_t EndOfSectionByte = getNumStaticSamplers() == 0 + ? 
PartData.size() + : getStaticSamplersOffset(); + + if (Header.ParameterOffset + DataSize > EndOfSectionByte) + return parseFailed("Reading structure out of file bounds"); + + StringRef Buff = PartData.substr(Header.ParameterOffset, DataSize); + RootParameterView View = RootParameterView(Header, Buff); + return View; + } }; class PSVRuntimeInfo { diff --git a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h index ecad35e82b155..393bba9c79bf8 100644 --- a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h +++ b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h @@ -74,18 +74,43 @@ struct ShaderHash { }; #define ROOT_ELEMENT_FLAG(Num, Val) bool Val = false; + +struct RootConstantsYaml { + uint32_t ShaderRegister; + uint32_t RegisterSpace; + uint32_t Num32BitValues; +}; + +struct RootParameterYamlDesc { + uint32_t Type; + uint32_t Visibility; + uint32_t Offset; + + union { + RootConstantsYaml Constants; + }; +}; + struct RootSignatureYamlDesc { RootSignatureYamlDesc() = default; - RootSignatureYamlDesc(const object::DirectX::RootSignature &Data); uint32_t Version; - uint32_t NumParameters; + uint32_t NumRootParameters; uint32_t RootParametersOffset; uint32_t NumStaticSamplers; uint32_t StaticSamplersOffset; + SmallVector Parameters; + uint32_t getEncodedFlags(); + iterator_range params() { + return make_range(Parameters.begin(), Parameters.end()); + } + + static llvm::Expected + create(const object::DirectX::RootSignature &Data); + #include "llvm/BinaryFormat/DXContainerConstants.def" }; @@ -192,6 +217,7 @@ LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::DXContainerYAML::ResourceBindInfo) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::DXContainerYAML::SignatureElement) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::DXContainerYAML::PSVInfo::MaskVector) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::DXContainerYAML::SignatureParameter) +LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::DXContainerYAML::RootParameterYamlDesc) LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::PSV::SemanticKind) 
LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::PSV::ComponentType) LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::PSV::InterpolationMode) @@ -264,6 +290,14 @@ template <> struct MappingTraits { DXContainerYAML::RootSignatureYamlDesc &RootSignature); }; +template <> struct MappingTraits { + static void mapping(IO &IO, llvm::DXContainerYAML::RootParameterYamlDesc &P); +}; + +template <> struct MappingTraits { + static void mapping(IO &IO, llvm::DXContainerYAML::RootConstantsYaml &C); +}; + } // namespace yaml } // namespace llvm diff --git a/llvm/lib/BinaryFormat/DXContainer.cpp b/llvm/lib/BinaryFormat/DXContainer.cpp index 97ceb16ccf53f..8e7b7d313706a 100644 --- a/llvm/lib/BinaryFormat/DXContainer.cpp +++ b/llvm/lib/BinaryFormat/DXContainer.cpp @@ -60,6 +60,26 @@ ArrayRef> dxbc::getSigComponentTypes() { return ArrayRef(SigComponentTypes); } +#define SHADER_VISIBILITY(Val, Enum) {#Enum, ShaderVisibility::Enum}, + +static const EnumEntry ShaderVisibilityValues[] = { +#include "llvm/BinaryFormat/DXContainerConstants.def" +}; + +ArrayRef> dxbc::getShaderVisibility() { + return ArrayRef(ShaderVisibilityValues); +} + +#define ROOT_PARAMETER(Val, Enum) {#Enum, RootParameterType::Enum}, + +static const EnumEntry RootParameterTypes[] = { +#include "llvm/BinaryFormat/DXContainerConstants.def" +}; + +ArrayRef> dxbc::getRootParameterTypes() { + return ArrayRef(RootParameterTypes); +} + #define SEMANTIC_KIND(Val, Enum) {#Enum, PSV::SemanticKind::Enum}, static const EnumEntry SemanticKindNames[] = { diff --git a/llvm/lib/MC/DXContainerRootSignature.cpp b/llvm/lib/MC/DXContainerRootSignature.cpp index b6f2b85bac74e..c2731d95c955e 100644 --- a/llvm/lib/MC/DXContainerRootSignature.cpp +++ b/llvm/lib/MC/DXContainerRootSignature.cpp @@ -7,17 +7,81 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/DXContainerRootSignature.h" +#include "llvm/ADT/SmallString.h" #include "llvm/Support/EndianStream.h" using namespace llvm; using namespace 
llvm::mcdxbc; +static uint32_t writePlaceholder(raw_svector_ostream &Stream) { + const uint32_t DummyValue = std::numeric_limits::max(); + uint32_t Offset = Stream.tell(); + support::endian::write(Stream, DummyValue, llvm::endianness::little); + return Offset; +} + +static void rewriteOffsetToCurrentByte(raw_svector_ostream &Stream, + uint32_t Offset) { + uint32_t Value = + support::endian::byte_swap( + Stream.tell()); + Stream.pwrite(reinterpret_cast(&Value), sizeof(Value), Offset); +} + +size_t RootSignatureDesc::getSize() const { + size_t Size = sizeof(dxbc::RootSignatureHeader) + + Parameters.size() * sizeof(dxbc::RootParameterHeader); + + for (const mcdxbc::RootParameter &P : Parameters) { + switch (P.Header.ParameterType) { + case llvm::to_underlying(dxbc::RootParameterType::Constants32Bit): + Size += sizeof(dxbc::RootConstants); + break; + } + } + return Size; +} + void RootSignatureDesc::write(raw_ostream &OS) const { + SmallString<256> Storage; + raw_svector_ostream BOS(Storage); + BOS.reserveExtraSpace(getSize()); + + const uint32_t NumParameters = Parameters.size(); + + support::endian::write(BOS, Version, llvm::endianness::little); + support::endian::write(BOS, NumParameters, llvm::endianness::little); + support::endian::write(BOS, RootParameterOffset, llvm::endianness::little); + support::endian::write(BOS, NumStaticSamplers, llvm::endianness::little); + support::endian::write(BOS, StaticSamplersOffset, llvm::endianness::little); + support::endian::write(BOS, Flags, llvm::endianness::little); + + SmallVector ParamsOffsets; + for (const mcdxbc::RootParameter &P : Parameters) { + support::endian::write(BOS, P.Header.ParameterType, + llvm::endianness::little); + support::endian::write(BOS, P.Header.ShaderVisibility, + llvm::endianness::little); + + ParamsOffsets.push_back(writePlaceholder(BOS)); + } + + assert(NumParameters == ParamsOffsets.size()); + for (size_t I = 0; I < NumParameters; ++I) { + rewriteOffsetToCurrentByte(BOS, ParamsOffsets[I]); + const 
mcdxbc::RootParameter &P = Parameters[I]; - support::endian::write(OS, Version, llvm::endianness::little); - support::endian::write(OS, NumParameters, llvm::endianness::little); - support::endian::write(OS, RootParametersOffset, llvm::endianness::little); - support::endian::write(OS, NumStaticSamplers, llvm::endianness::little); - support::endian::write(OS, StaticSamplersOffset, llvm::endianness::little); - support::endian::write(OS, Flags, llvm::endianness::little); + switch (P.Header.ParameterType) { + case llvm::to_underlying(dxbc::RootParameterType::Constants32Bit): + support::endian::write(BOS, P.Constants.ShaderRegister, + llvm::endianness::little); + support::endian::write(BOS, P.Constants.RegisterSpace, + llvm::endianness::little); + support::endian::write(BOS, P.Constants.Num32BitValues, + llvm::endianness::little); + break; + } + } + assert(Storage.size() == getSize()); + OS.write(Storage.data(), Storage.size()); } diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp index 1eb1453c65147..95f6788e75aa6 100644 --- a/llvm/lib/Object/DXContainer.cpp +++ b/llvm/lib/Object/DXContainer.cpp @@ -20,10 +20,6 @@ static Error parseFailed(const Twine &Msg) { return make_error(Msg.str(), object_error::parse_failed); } -static Error validationFailed(const Twine &Msg) { - return make_error(Msg.str(), inconvertibleErrorCode()); -} - template static Error readStruct(StringRef Buffer, const char *Src, T &Struct) { // Don't read before the beginning or past the end of the file @@ -100,8 +96,8 @@ Error DXContainer::parseHash(StringRef Part) { Error DXContainer::parseRootSignature(StringRef Part) { if (RootSignature) return parseFailed("More than one RTS0 part is present in the file"); - RootSignature = DirectX::RootSignature(); - if (Error Err = RootSignature->parse(Part)) + RootSignature = DirectX::RootSignature(Part); + if (Error Err = RootSignature->parse()) return Err; return Error::success(); } @@ -246,23 +242,17 @@ void 
DXContainer::PartIterator::updateIteratorImpl(const uint32_t Offset) { IteratorState.Offset = Offset; } -Error DirectX::RootSignature::parse(StringRef Data) { - const char *Current = Data.begin(); +Error DirectX::RootSignature::parse() { + const char *Current = PartData.begin(); // Root Signature headers expects 6 integers to be present. - if (Data.size() < 6 * sizeof(uint32_t)) + if (PartData.size() < 6 * sizeof(uint32_t)) return parseFailed( "Invalid root signature, insufficient space for header."); - uint32_t VValue = - support::endian::read(Current); + Version = support::endian::read(Current); Current += sizeof(uint32_t); - if (!dxbc::RootSignatureValidations::isValidVersion(VValue)) - return validationFailed("unsupported root signature version read: " + - llvm::Twine(VValue)); - Version = VValue; - NumParameters = support::endian::read(Current); Current += sizeof(uint32_t); @@ -279,14 +269,11 @@ Error DirectX::RootSignature::parse(StringRef Data) { support::endian::read(Current); Current += sizeof(uint32_t); - uint32_t FValue = - support::endian::read(Current); + Flags = support::endian::read(Current); Current += sizeof(uint32_t); - if (!dxbc::RootSignatureValidations::isValidRootFlag(FValue)) - return validationFailed("unsupported root signature flag value read: " + - llvm::Twine(FValue)); - Flags = FValue; + ParametersHeaders.Data = PartData.substr( + RootParametersOffset, NumParameters * sizeof(dxbc::RootParameterHeader)); return Error::success(); } diff --git a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp index f6ed09c857bb7..86e24eae4abc6 100644 --- a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp +++ b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp @@ -269,11 +269,26 @@ void DXContainerWriter::writeParts(raw_ostream &OS) { mcdxbc::RootSignatureDesc RS; RS.Flags = P.RootSignature->getEncodedFlags(); RS.Version = P.RootSignature->Version; - RS.NumParameters = P.RootSignature->NumParameters; - RS.RootParametersOffset = 
P.RootSignature->RootParametersOffset; + RS.RootParameterOffset = P.RootSignature->RootParametersOffset; RS.NumStaticSamplers = P.RootSignature->NumStaticSamplers; RS.StaticSamplersOffset = P.RootSignature->StaticSamplersOffset; + for (const auto &Param : P.RootSignature->Parameters) { + mcdxbc::RootParameter NewParam; + NewParam.Header = dxbc::RootParameterHeader{ + Param.Type, Param.Visibility, Param.Offset}; + + switch (Param.Type) { + case llvm::to_underlying(dxbc::RootParameterType::Constants32Bit): + NewParam.Constants.Num32BitValues = Param.Constants.Num32BitValues; + NewParam.Constants.RegisterSpace = Param.Constants.RegisterSpace; + NewParam.Constants.ShaderRegister = Param.Constants.ShaderRegister; + break; + } + + RS.Parameters.push_back(NewParam); + } + RS.write(OS); break; } diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp b/llvm/lib/ObjectYAML/DXContainerYAML.cpp index f03c7da65999d..59914fe30082d 100644 --- a/llvm/lib/ObjectYAML/DXContainerYAML.cpp +++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp @@ -12,9 +12,13 @@ //===----------------------------------------------------------------------===// #include "llvm/ObjectYAML/DXContainerYAML.h" +#include "llvm/ADT/STLForwardCompat.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/BinaryFormat/DXContainer.h" +#include "llvm/Support/Error.h" #include "llvm/Support/ScopedPrinter.h" +#include +#include namespace llvm { @@ -29,16 +33,60 @@ DXContainerYAML::ShaderFeatureFlags::ShaderFeatureFlags(uint64_t FlagData) { #include "llvm/BinaryFormat/DXContainerConstants.def" } -DXContainerYAML::RootSignatureYamlDesc::RootSignatureYamlDesc( - const object::DirectX::RootSignature &Data) - : Version(Data.getVersion()), NumParameters(Data.getNumParameters()), - RootParametersOffset(Data.getRootParametersOffset()), - NumStaticSamplers(Data.getNumStaticSamplers()), - StaticSamplersOffset(Data.getStaticSamplersOffset()) { +llvm::Expected +DXContainerYAML::RootSignatureYamlDesc::create( + const object::DirectX::RootSignature 
&Data) { + + RootSignatureYamlDesc RootSigDesc; + + RootSigDesc.Version = Data.getVersion(); + RootSigDesc.NumStaticSamplers = Data.getNumStaticSamplers(); + RootSigDesc.StaticSamplersOffset = Data.getStaticSamplersOffset(); + RootSigDesc.NumRootParameters = Data.getNumRootParameters(); + RootSigDesc.RootParametersOffset = Data.getRootParametersOffset(); + uint32_t Flags = Data.getFlags(); + for (const dxbc::RootParameterHeader &PH : Data.param_headers()) { + + RootParameterYamlDesc NewP; + NewP.Offset = PH.ParameterOffset; + + if (!dxbc::isValidParameterType(PH.ParameterType)) + return createStringError(std::errc::invalid_argument, + "Invalid value for parameter type"); + + NewP.Type = PH.ParameterType; + + if (!dxbc::isValidShaderVisibility(PH.ShaderVisibility)) + return createStringError(std::errc::invalid_argument, + "Invalid value for shader visibility"); + + NewP.Visibility = PH.ShaderVisibility; + + llvm::Expected ParamViewOrErr = + Data.getParameter(PH); + if (Error E = ParamViewOrErr.takeError()) + return std::move(E); + object::DirectX::RootParameterView ParamView = ParamViewOrErr.get(); + + if (auto *RCV = dyn_cast(&ParamView)) { + llvm::Expected ConstantsOrErr = RCV->read(); + if (Error E = ConstantsOrErr.takeError()) + return std::move(E); + + auto Constants = *ConstantsOrErr; + + NewP.Constants.Num32BitValues = Constants.Num32BitValues; + NewP.Constants.ShaderRegister = Constants.ShaderRegister; + NewP.Constants.RegisterSpace = Constants.RegisterSpace; + } + RootSigDesc.Parameters.push_back(NewP); + } #define ROOT_ELEMENT_FLAG(Num, Val) \ - Val = (Flags & (uint32_t)dxbc::RootElementFlag::Val) > 0; + RootSigDesc.Val = \ + (Flags & llvm::to_underlying(dxbc::RootElementFlag::Val)) > 0; #include "llvm/BinaryFormat/DXContainerConstants.def" + return RootSigDesc; } uint32_t DXContainerYAML::RootSignatureYamlDesc::getEncodedFlags() { @@ -212,14 +260,34 @@ void MappingTraits::mapping( void MappingTraits::mapping( IO &IO, DXContainerYAML::RootSignatureYamlDesc 
&S) { IO.mapRequired("Version", S.Version); - IO.mapRequired("NumParameters", S.NumParameters); + IO.mapRequired("NumRootParameters", S.NumRootParameters); IO.mapRequired("RootParametersOffset", S.RootParametersOffset); IO.mapRequired("NumStaticSamplers", S.NumStaticSamplers); IO.mapRequired("StaticSamplersOffset", S.StaticSamplersOffset); + IO.mapRequired("Parameters", S.Parameters); #define ROOT_ELEMENT_FLAG(Num, Val) IO.mapOptional(#Val, S.Val, false); #include "llvm/BinaryFormat/DXContainerConstants.def" } +void MappingTraits::mapping( + IO &IO, llvm::DXContainerYAML::RootConstantsYaml &C) { + IO.mapRequired("Num32BitValues", C.Num32BitValues); + IO.mapRequired("RegisterSpace", C.RegisterSpace); + IO.mapRequired("ShaderRegister", C.ShaderRegister); +} + +void MappingTraits::mapping( + IO &IO, llvm::DXContainerYAML::RootParameterYamlDesc &P) { + IO.mapRequired("ParameterType", P.Type); + IO.mapRequired("ShaderVisibility", P.Visibility); + + switch (P.Type) { + case llvm::to_underlying(dxbc::RootParameterType::Constants32Bit): + IO.mapRequired("Constants", P.Constants); + break; + } +} + void MappingTraits::mapping(IO &IO, DXContainerYAML::Part &P) { IO.mapRequired("Name", P.Name); diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp index fd390cdbf9057..3ba0535e0114b 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp +++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp @@ -92,8 +92,10 @@ static bool parse(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD, return HasError; } +static bool verifyRootFlag(uint32_t Flags) { return (Flags & ~0xfff) == 0; } + static bool validate(LLVMContext *Ctx, const mcdxbc::RootSignatureDesc &RSD) { - if (!dxbc::RootSignatureValidations::isValidRootFlag(RSD.Flags)) { + if (!verifyRootFlag(RSD.Flags)) { return reportError(Ctx, "Invalid Root Signature flag value"); } return false; @@ -189,6 +191,8 @@ PreservedAnalyses RootSignatureAnalysisPrinter::run(Module &M, 
SmallDenseMap &RSDMap = AM.getResult(M); + + const size_t RSHSize = sizeof(dxbc::RootSignatureHeader); OS << "Root Signature Definitions" << "\n"; uint8_t Space = 0; @@ -203,12 +207,11 @@ PreservedAnalyses RootSignatureAnalysisPrinter::run(Module &M, Space++; OS << indent(Space) << "Flags: " << format_hex(RS.Flags, 8) << ":\n"; OS << indent(Space) << "Version: " << RS.Version << ":\n"; - OS << indent(Space) << "NumParameters: " << RS.NumParameters << ":\n"; - OS << indent(Space) << "RootParametersOffset: " << RS.RootParametersOffset - << ":\n"; - OS << indent(Space) << "NumStaticSamplers: " << RS.NumStaticSamplers - << ":\n"; - OS << indent(Space) << "StaticSamplersOffset: " << RS.StaticSamplersOffset + OS << indent(Space) << "NumParameters: " << RS.Parameters.size() << ":\n"; + OS << indent(Space) << "RootParametersOffset: " << RSHSize << ":\n"; + OS << indent(Space) << "NumStaticSamplers: " << 0 << ":\n"; + OS << indent(Space) + << "StaticSamplersOffset: " << RSHSize + RS.Parameters.size_in_bytes() << ":\n"; Space--; // end root signature header diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll index 3f5bb166ad0e5..ef2b97860bfae 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll @@ -22,8 +22,9 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } ; DXC-NEXT: Size: 24 ; DXC-NEXT: RootSignature: ; DXC-NEXT: Version: 2 -; DXC-NEXT: NumParameters: 0 +; DXC-NEXT: NumRootParameters: 0 ; DXC-NEXT: RootParametersOffset: 0 ; DXC-NEXT: NumStaticSamplers: 0 ; DXC-NEXT: StaticSamplersOffset: 0 +; DXC-NEXT: Parameters: [] ; DXC-NEXT: AllowInputAssemblerInputLayout: true diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions.ll index 7adb17d0b022f..581ac9aaec110 
100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions.ll @@ -27,14 +27,14 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } ; CHECK-NEXT: Flags: 0x000001 ; CHECK-NEXT: Version: 2 ; CHECK-NEXT: NumParameters: 0 -; CHECK-NEXT: RootParametersOffset: 0 +; CHECK-NEXT: RootParametersOffset: 24 ; CHECK-NEXT: NumStaticSamplers: 0 -; CHECK-NEXT: StaticSamplersOffset: 0 +; CHECK-NEXT: StaticSamplersOffset: 24 ; CHECK-LABEL: Definition for 'anotherMain': ; CHECK-NEXT: Flags: 0x000002 ; CHECK-NEXT: Version: 2 ; CHECK-NEXT: NumParameters: 0 -; CHECK-NEXT: RootParametersOffset: 0 +; CHECK-NEXT: RootParametersOffset: 24 ; CHECK-NEXT: NumStaticSamplers: 0 -; CHECK-NEXT: StaticSamplersOffset: 0 +; CHECK-NEXT: StaticSamplersOffset: 24 diff --git a/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml b/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml index b0a3e6945f454..74816d403183a 100644 --- a/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml +++ b/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml @@ -14,10 +14,11 @@ Parts: Size: 24 RootSignature: Version: 2 - NumParameters: 1 - RootParametersOffset: 3 - NumStaticSamplers: 4 - StaticSamplersOffset: 5 + NumRootParameters: 0 + RootParametersOffset: 24 + NumStaticSamplers: 0 + StaticSamplersOffset: 60 + Parameters: [] AllowInputAssemblerInputLayout: true DenyGeometryShaderRootAccess: true @@ -25,9 +26,10 @@ Parts: # CHECK-NEXT: Size: 24 # CHECK-NEXT: RootSignature: # CHECK-NEXT: Version: 2 -# CHECK-NEXT: NumParameters: 1 -# CHECK-NEXT: RootParametersOffset: 3 -# CHECK-NEXT: NumStaticSamplers: 4 -# CHECK-NEXT: StaticSamplersOffset: 5 +# CHECK-NEXT: NumRootParameters: 0 +# CHECK-NEXT: RootParametersOffset: 24 +# CHECK-NEXT: NumStaticSamplers: 0 +# CHECK-NEXT: StaticSamplersOffset: 60 +# CHECK-NEXT: Parameters: [] # CHECK-NEXT: AllowInputAssemblerInputLayout: true 
# CHECK-NEXT: DenyGeometryShaderRootAccess: true diff --git a/llvm/test/ObjectYAML/DXContainer/RootSignature-InvalidType.yaml b/llvm/test/ObjectYAML/DXContainer/RootSignature-InvalidType.yaml new file mode 100644 index 0000000000000..091e70789d956 --- /dev/null +++ b/llvm/test/ObjectYAML/DXContainer/RootSignature-InvalidType.yaml @@ -0,0 +1,29 @@ +# RUN: yaml2obj %s -o %t +# RUN: not obj2yaml 2>&1 %t | FileCheck %s -DFILE=%t + +# CHECK: Error reading file: [[FILE]]: Invalid value for parameter type + + +--- !dxcontainer +Header: + Hash: [ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ] + Version: + Major: 1 + Minor: 0 + PartCount: 1 + PartOffsets: [ 60 ] +Parts: + - Name: RTS0 + Size: 80 + RootSignature: + Version: 2 + NumRootParameters: 2 + RootParametersOffset: 24 + NumStaticSamplers: 0 + StaticSamplersOffset: 64 + Parameters: + - ParameterType: 255 # INVALID + ShaderVisibility: 2 # Hull + AllowInputAssemblerInputLayout: true + DenyGeometryShaderRootAccess: true diff --git a/llvm/test/ObjectYAML/DXContainer/RootSignature-InvalidVisibility.yaml b/llvm/test/ObjectYAML/DXContainer/RootSignature-InvalidVisibility.yaml new file mode 100644 index 0000000000000..1acaf6e4e08a4 --- /dev/null +++ b/llvm/test/ObjectYAML/DXContainer/RootSignature-InvalidVisibility.yaml @@ -0,0 +1,33 @@ +# RUN: yaml2obj %s -o %t +# RUN: not obj2yaml 2>&1 %t | FileCheck %s -DFILE=%t + +# CHECK: Error reading file: [[FILE]]: Invalid value for shader visibility + + +--- !dxcontainer +Header: + Hash: [ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ] + Version: + Major: 1 + Minor: 0 + PartCount: 1 + PartOffsets: [ 60 ] +Parts: + - Name: RTS0 + Size: 80 + RootSignature: + Version: 2 + NumRootParameters: 2 + RootParametersOffset: 24 + NumStaticSamplers: 0 + StaticSamplersOffset: 64 + Parameters: + - ParameterType: 1 # Constants32Bit + ShaderVisibility: 255 # INVALID + Constants: + Num32BitValues: 21 + ShaderRegister: 22 + 
RegisterSpace: 23 + AllowInputAssemblerInputLayout: true + DenyGeometryShaderRootAccess: true diff --git a/llvm/test/ObjectYAML/DXContainer/RootSignature-MultipleParameters.yaml b/llvm/test/ObjectYAML/DXContainer/RootSignature-MultipleParameters.yaml new file mode 100644 index 0000000000000..f366d71714359 --- /dev/null +++ b/llvm/test/ObjectYAML/DXContainer/RootSignature-MultipleParameters.yaml @@ -0,0 +1,59 @@ +# RUN: yaml2obj %s | obj2yaml | FileCheck %s + +--- !dxcontainer +Header: + Hash: [ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ] + Version: + Major: 1 + Minor: 0 + PartCount: 1 + PartOffsets: [ 60 ] +Parts: + - Name: RTS0 + Size: 80 + RootSignature: + Version: 2 + NumRootParameters: 2 + RootParametersOffset: 24 + NumStaticSamplers: 0 + StaticSamplersOffset: 60 + Parameters: + - ParameterType: 1 # Constants32Bit + ShaderVisibility: 2 # Hull + Constants: + Num32BitValues: 16 + ShaderRegister: 15 + RegisterSpace: 14 + - ParameterType: 1 # Constants32Bit + ShaderVisibility: 4 # Geometry + Constants: + Num32BitValues: 21 + ShaderRegister: 22 + RegisterSpace: 23 + AllowInputAssemblerInputLayout: true + DenyGeometryShaderRootAccess: true + +# CHECK: - Name: RTS0 +# CHECK-NEXT: Size: 80 +# CHECK-NEXT: RootSignature: +# CHECK-NEXT: Version: 2 +# CHECK-NEXT: NumRootParameters: 2 +# CHECK-NEXT: RootParametersOffset: 24 +# CHECK-NEXT: NumStaticSamplers: 0 +# CHECK-NEXT: StaticSamplersOffset: 60 +# CHECK-NEXT: Parameters: +# CHECK-NEXT: - ParameterType: 1 +# CHECK-NEXT: ShaderVisibility: 2 +# CHECK-NEXT: Constants: +# CHECK-NEXT: Num32BitValues: 16 +# CHECK-NEXT: RegisterSpace: 14 +# CHECK-NEXT: ShaderRegister: 15 +# CHECK-NEXT: - ParameterType: 1 +# CHECK-NEXT: ShaderVisibility: 4 +# CHECK-NEXT: Constants: +# CHECK-NEXT: Num32BitValues: 21 +# CHECK-NEXT: RegisterSpace: 23 +# CHECK-NEXT: ShaderRegister: 22 +# CHECK-NEXT: AllowInputAssemblerInputLayout: true +# CHECK-NEXT: DenyGeometryShaderRootAccess: true diff --git 
a/llvm/tools/obj2yaml/dxcontainer2yaml.cpp b/llvm/tools/obj2yaml/dxcontainer2yaml.cpp index f3ef1b6a27bcf..c727595406767 100644 --- a/llvm/tools/obj2yaml/dxcontainer2yaml.cpp +++ b/llvm/tools/obj2yaml/dxcontainer2yaml.cpp @@ -155,8 +155,13 @@ dumpDXContainer(MemoryBufferRef Source) { break; case dxbc::PartType::RTS0: std::optional RS = Container.getRootSignature(); - if (RS.has_value()) - NewPart.RootSignature = DXContainerYAML::RootSignatureYamlDesc(*RS); + if (RS.has_value()) { + auto RootSigDescOrErr = + DXContainerYAML::RootSignatureYamlDesc::create(*RS); + if (Error E = RootSigDescOrErr.takeError()) + return std::move(E); + NewPart.RootSignature = RootSigDescOrErr.get(); + } break; } } diff --git a/llvm/unittests/Object/DXContainerTest.cpp b/llvm/unittests/Object/DXContainerTest.cpp index 5a73f32ab7c32..62ef8e385373f 100644 --- a/llvm/unittests/Object/DXContainerTest.cpp +++ b/llvm/unittests/Object/DXContainerTest.cpp @@ -11,6 +11,7 @@ #include "llvm/BinaryFormat/Magic.h" #include "llvm/ObjectYAML/DXContainerYAML.h" #include "llvm/ObjectYAML/yaml2obj.h" +#include "llvm/Support/Error.h" #include "llvm/Support/MemoryBufferRef.h" #include "llvm/Testing/Support/Error.h" #include "gtest/gtest.h" @@ -822,6 +823,57 @@ TEST(DXCFile, MalformedSignature) { } } +TEST(RootSignature, RootParameters) { + { + // Root Parameters offset has changed to 36, additional padding was added, + // as well as fixing the parameter offset. 
+ uint8_t Buffer[] = { + 0x44, 0x58, 0x42, 0x43, 0x32, 0x9a, 0x53, 0xd8, 0xec, 0xbe, 0x35, 0x6f, + 0x05, 0x39, 0xe1, 0xfe, 0x31, 0x20, 0xf0, 0xc1, 0x01, 0x00, 0x00, 0x00, + 0x85, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x52, 0x54, 0x53, 0x30, 0x59, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x2c, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x30, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00}; + + DXContainer C = + llvm::cantFail(DXContainer::create(getMemoryBuffer<144>(Buffer))); + + auto MaybeRS = C.getRootSignature(); + ASSERT_TRUE(MaybeRS.has_value()); + const auto &RS = MaybeRS.value(); + ASSERT_EQ(RS.getVersion(), 2u); + ASSERT_EQ(RS.getNumParameters(), 1u); + ASSERT_EQ(RS.getRootParametersOffset(), 36u); + ASSERT_EQ(RS.getNumStaticSamplers(), 0u); + ASSERT_EQ(RS.getStaticSamplersOffset(), 44u); + ASSERT_EQ(RS.getFlags(), 17u); + + auto RootParam = *RS.param_headers().begin(); + ASSERT_EQ((unsigned)RootParam.ParameterType, 1u); + ASSERT_EQ((unsigned)RootParam.ShaderVisibility, 2u); + auto ParamView = RS.getParameter(RootParam); + ASSERT_THAT_ERROR(ParamView.takeError(), Succeeded()); + + DirectX::RootConstantView *RootConstantsView = + dyn_cast(&*ParamView); + ASSERT_TRUE(RootConstantsView != nullptr); + auto Constants = RootConstantsView->read(); + + ASSERT_THAT_ERROR(Constants.takeError(), Succeeded()); + + ASSERT_EQ(Constants->ShaderRegister, 15u); + ASSERT_EQ(Constants->RegisterSpace, 14u); + ASSERT_EQ(Constants->Num32BitValues, 16u); + } +} 
+ TEST(RootSignature, ParseRootFlags) { { uint8_t Buffer[] = { @@ -829,7 +881,7 @@ TEST(RootSignature, ParseRootFlags) { 0x05, 0x39, 0xE1, 0xFE, 0x31, 0x20, 0xF0, 0xC1, 0x01, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x52, 0x54, 0x53, 0x30, 0x18, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, }; DXContainer C = @@ -839,12 +891,11 @@ TEST(RootSignature, ParseRootFlags) { ASSERT_TRUE(RS.has_value()); ASSERT_EQ(RS->getVersion(), 2u); ASSERT_EQ(RS->getNumParameters(), 0u); - ASSERT_EQ(RS->getRootParametersOffset(), 0u); + ASSERT_EQ(RS->getRootParametersOffset(), 24u); ASSERT_EQ(RS->getNumStaticSamplers(), 0u); ASSERT_EQ(RS->getStaticSamplersOffset(), 0u); ASSERT_EQ(RS->getFlags(), 0x01u); } - { // this parameter has the root signature definition missing some values. uint8_t Buffer[] = { @@ -860,33 +911,51 @@ TEST(RootSignature, ParseRootFlags) { FailedWithMessage( "Invalid root signature, insufficient space for header.")); } +} + +TEST(RootSignature, ParseRootConstant) { { - // Version has been changed to an invalid number. 
uint8_t Buffer[] = { - 0x44, 0x58, 0x42, 0x43, 0x32, 0x9A, 0x53, 0xD8, 0xEC, 0xBE, 0x35, 0x6F, - 0x05, 0x39, 0xE1, 0xFE, 0x31, 0x20, 0xF0, 0xC1, 0x01, 0x00, 0x00, 0x00, - 0x44, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, - 0x52, 0x54, 0x53, 0x30, 0x18, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x44, 0x58, 0x42, 0x43, 0x32, 0x9a, 0x53, 0xd8, 0xec, 0xbe, 0x35, 0x6f, + 0x05, 0x39, 0xe1, 0xfe, 0x31, 0x20, 0xf0, 0xc1, 0x01, 0x00, 0x00, 0x00, + 0x85, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x52, 0x54, 0x53, 0x30, 0x59, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x2c, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - }; - EXPECT_THAT_EXPECTED( - DXContainer::create(getMemoryBuffer<100>(Buffer)), - FailedWithMessage("unsupported root signature version read: 3")); - } - { - // Flag has been set to an invalid value - uint8_t Buffer[] = { - 0x44, 0x58, 0x42, 0x43, 0x32, 0x9A, 0x53, 0xD8, 0xEC, 0xBE, 0x35, 0x6F, - 0x05, 0x39, 0xE1, 0xFE, 0x31, 0x20, 0xF0, 0xC1, 0x01, 0x00, 0x00, 0x00, - 0x44, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, - 0x52, 0x54, 0x53, 0x30, 0x18, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0xFF, - }; - EXPECT_THAT_EXPECTED( - DXContainer::create(getMemoryBuffer<100>(Buffer)), - FailedWithMessage( - "unsupported root signature flag value read: 4278190081")); + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00}; + DXContainer C = + 
llvm::cantFail(DXContainer::create(getMemoryBuffer<133>(Buffer))); + + auto MaybeRS = C.getRootSignature(); + ASSERT_TRUE(MaybeRS.has_value()); + const auto &RS = MaybeRS.value(); + ASSERT_EQ(RS.getVersion(), 2u); + ASSERT_EQ(RS.getNumParameters(), 1u); + ASSERT_EQ(RS.getRootParametersOffset(), 24u); + ASSERT_EQ(RS.getNumStaticSamplers(), 0u); + ASSERT_EQ(RS.getStaticSamplersOffset(), 44u); + ASSERT_EQ(RS.getFlags(), 17u); + + auto RootParam = *RS.param_headers().begin(); + ASSERT_EQ((unsigned)RootParam.ParameterType, 1u); + ASSERT_EQ((unsigned)RootParam.ShaderVisibility, 2u); + auto ParamView = RS.getParameter(RootParam); + ASSERT_THAT_ERROR(ParamView.takeError(), Succeeded()); + + DirectX::RootConstantView *RootConstantsView = + dyn_cast(&*ParamView); + ASSERT_TRUE(RootConstantsView != nullptr); + auto Constants = RootConstantsView->read(); + + ASSERT_THAT_ERROR(Constants.takeError(), Succeeded()); + + ASSERT_EQ(Constants->ShaderRegister, 15u); + ASSERT_EQ(Constants->RegisterSpace, 14u); + ASSERT_EQ(Constants->Num32BitValues, 16u); } } diff --git a/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp b/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp index b48cd9ce53987..61390049bc0df 100644 --- a/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp +++ b/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp @@ -127,10 +127,11 @@ TEST(RootSignature, ParseRootFlags) { Size: 24 RootSignature: Version: 2 - NumParameters: 0 - RootParametersOffset: 0 + NumRootParameters: 0 + RootParametersOffset: 24 NumStaticSamplers: 0 StaticSamplersOffset: 0 + Parameters: [] AllowInputAssemblerInputLayout: true )")); @@ -139,10 +140,114 @@ TEST(RootSignature, ParseRootFlags) { 0x05, 0x39, 0xE1, 0xFE, 0x31, 0x20, 0xF0, 0xC1, 0x01, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x52, 0x54, 0x53, 0x30, 0x18, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 
0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, }; EXPECT_EQ(Storage.size(), 68u); EXPECT_TRUE(memcmp(Buffer, Storage.data(), 68u) == 0); } + +TEST(RootSignature, HeaderData) { + SmallString<128> Storage; + + // First read a fully explicit yaml with all sizes and offsets provided + ASSERT_TRUE(convert(Storage, R"(--- !dxcontainer + Header: + Hash: [ 0x32, 0x9A, 0x53, 0xD8, 0xEC, 0xBE, 0x35, 0x6F, 0x5, + 0x39, 0xE1, 0xFE, 0x31, 0x20, 0xF0, 0xC1 ] + Version: + Major: 1 + Minor: 0 + FileSize: 133 + PartCount: 1 + PartOffsets: [ 36 ] + Parts: + - Name: RTS0 + Size: 89 + RootSignature: + Version: 2 + NumRootParameters: 1 + RootParametersOffset: 255 + NumStaticSamplers: 0 + StaticSamplersOffset: 0 + Parameters: + - ParameterType: 1 + ShaderVisibility: 2 + Constants: + Num32BitValues: 16 + ShaderRegister: 15 + RegisterSpace: 14 + AllowInputAssemblerInputLayout: true + DenyGeometryShaderRootAccess: true + )")); + + uint8_t Buffer[] = { + 0x44, 0x58, 0x42, 0x43, 0x32, 0x9a, 0x53, 0xd8, 0xec, 0xbe, 0x35, 0x6f, + 0x05, 0x39, 0xe1, 0xfe, 0x31, 0x20, 0xf0, 0xc1, 0x01, 0x00, 0x00, 0x00, + 0x85, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x52, 0x54, 0x53, 0x30, 0x59, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00}; + + EXPECT_EQ(Storage.size(), 133u); + EXPECT_TRUE(memcmp(Buffer, Storage.data(), 133u) == 0); +} + +TEST(RootSignature, ParseRootConstants) { + SmallString<128> Storage; + + // First 
read a fully explicit yaml with all sizes and offsets provided + ASSERT_TRUE(convert(Storage, R"(--- !dxcontainer + Header: + Hash: [ 0x32, 0x9A, 0x53, 0xD8, 0xEC, 0xBE, 0x35, 0x6F, 0x5, + 0x39, 0xE1, 0xFE, 0x31, 0x20, 0xF0, 0xC1 ] + Version: + Major: 1 + Minor: 0 + FileSize: 133 + PartCount: 1 + PartOffsets: [ 36 ] + Parts: + - Name: RTS0 + Size: 89 + RootSignature: + Version: 2 + NumRootParameters: 1 + RootParametersOffset: 36 + NumStaticSamplers: 0 + StaticSamplersOffset: 0 + Parameters: + - ParameterType: 1 + ShaderVisibility: 2 + Constants: + Num32BitValues: 16 + ShaderRegister: 15 + RegisterSpace: 14 + AllowInputAssemblerInputLayout: true + DenyGeometryShaderRootAccess: true + )")); + + uint8_t Buffer[] = { + 0x44, 0x58, 0x42, 0x43, 0x32, 0x9a, 0x53, 0xd8, 0xec, 0xbe, 0x35, 0x6f, + 0x05, 0x39, 0xe1, 0xfe, 0x31, 0x20, 0xf0, 0xc1, 0x01, 0x00, 0x00, 0x00, + 0x85, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x52, 0x54, 0x53, 0x30, 0x59, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00}; + + EXPECT_EQ(Storage.size(), 133u); + EXPECT_TRUE(memcmp(Buffer, Storage.data(), 133u) == 0); +} From 9bd0c8726a5e3fd4f76e84692bd920dfca7a8d7f Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Wed, 16 Apr 2025 18:48:10 -0700 Subject: [PATCH 202/710] [clang-format] Fix a bug in BWACS_MultiLine (#135906) Fix #51940 --- clang/lib/Format/TokenAnnotator.cpp | 15 ++++++- clang/lib/Format/UnwrappedLineFormatter.cpp | 45 ++++----------------- 
clang/lib/Format/UnwrappedLineParser.cpp | 11 ++++- clang/unittests/Format/FormatTest.cpp | 11 +++++ 4 files changed, 41 insertions(+), 41 deletions(-) diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index ef5f07e2c62ee..144983f675828 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -4153,8 +4153,18 @@ void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) const { ChildSize + Current->SpacesRequiredBefore; } - if (Current->is(TT_CtorInitializerColon)) + if (Current->is(TT_ControlStatementLBrace)) { + if (Style.ColumnLimit > 0 && + Style.BraceWrapping.AfterControlStatement == + FormatStyle::BWACS_MultiLine && + Line.Level * Style.IndentWidth + Line.Last->TotalLength > + Style.ColumnLimit) { + Current->CanBreakBefore = true; + Current->MustBreakBefore = true; + } + } else if (Current->is(TT_CtorInitializerColon)) { InFunctionDecl = false; + } // FIXME: Only calculate this if CanBreakBefore is true once static // initializers etc. are sorted out. 
@@ -5586,12 +5596,13 @@ static bool isAllmanLambdaBrace(const FormatToken &Tok) { bool TokenAnnotator::mustBreakBefore(const AnnotatedLine &Line, const FormatToken &Right) const { - const FormatToken &Left = *Right.Previous; if (Right.NewlinesBefore > 1 && Style.MaxEmptyLinesToKeep > 0 && (!Style.RemoveEmptyLinesInUnwrappedLines || &Right == Line.First)) { return true; } + const FormatToken &Left = *Right.Previous; + if (Style.BreakFunctionDefinitionParameters && Line.MightBeFunctionDecl && Line.mightBeFunctionDefinition() && Left.MightBeFunctionDeclParen && Left.ParameterCount > 0) { diff --git a/clang/lib/Format/UnwrappedLineFormatter.cpp b/clang/lib/Format/UnwrappedLineFormatter.cpp index 617d46ad281d5..6806ab18312ea 100644 --- a/clang/lib/Format/UnwrappedLineFormatter.cpp +++ b/clang/lib/Format/UnwrappedLineFormatter.cpp @@ -424,43 +424,14 @@ class LineJoiner { : 0; } // Try to merge a control statement block with left brace wrapped. - if (NextLine.First->is(tok::l_brace)) { - if ((TheLine->First->isOneOf(tok::kw_if, tok::kw_else, tok::kw_while, - tok::kw_for, tok::kw_switch, tok::kw_try, - tok::kw_do, TT_ForEachMacro) || - (TheLine->First->is(tok::r_brace) && TheLine->First->Next && - TheLine->First->Next->isOneOf(tok::kw_else, tok::kw_catch))) && - Style.BraceWrapping.AfterControlStatement == - FormatStyle::BWACS_MultiLine) { - // If possible, merge the next line's wrapped left brace with the - // current line. Otherwise, leave it on the next line, as this is a - // multi-line control statement. - return (Style.ColumnLimit == 0 || TheLine->Level * Style.IndentWidth + - TheLine->Last->TotalLength <= - Style.ColumnLimit) - ? 1 - : 0; - } - if (TheLine->First->isOneOf(tok::kw_if, tok::kw_else, tok::kw_while, - tok::kw_for, TT_ForEachMacro)) { - return (Style.BraceWrapping.AfterControlStatement == - FormatStyle::BWACS_Always) - ? 
tryMergeSimpleBlock(I, E, Limit) - : 0; - } - if (TheLine->First->isOneOf(tok::kw_else, tok::kw_catch) && - Style.BraceWrapping.AfterControlStatement == - FormatStyle::BWACS_MultiLine) { - // This case if different from the upper BWACS_MultiLine processing - // in that a preceding r_brace is not on the same line as else/catch - // most likely because of BeforeElse/BeforeCatch set to true. - // If the line length doesn't fit ColumnLimit, leave l_brace on the - // next line to respect the BWACS_MultiLine. - return (Style.ColumnLimit == 0 || - TheLine->Last->TotalLength <= Style.ColumnLimit) - ? 1 - : 0; - } + if (NextLine.First->is(TT_ControlStatementLBrace)) { + // If possible, merge the next line's wrapped left brace with the + // current line. Otherwise, leave it on the next line, as this is a + // multi-line control statement. + return Style.BraceWrapping.AfterControlStatement == + FormatStyle::BWACS_Always + ? tryMergeSimpleBlock(I, E, Limit) + : 0; } if (PreviousLine && TheLine->First->is(tok::l_brace)) { switch (PreviousLine->First->Tok.getKind()) { diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index 5fe65cb9a47e7..b9430d4389feb 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -135,7 +135,8 @@ class CompoundStatementIndenter { CompoundStatementIndenter(UnwrappedLineParser *Parser, const FormatStyle &Style, unsigned &LineLevel) : CompoundStatementIndenter(Parser, LineLevel, - Style.BraceWrapping.AfterControlStatement, + Style.BraceWrapping.AfterControlStatement == + FormatStyle::BWACS_Always, Style.BraceWrapping.IndentBraces) {} CompoundStatementIndenter(UnwrappedLineParser *Parser, unsigned &LineLevel, bool WrapBrace, bool IndentBrace) @@ -3067,7 +3068,7 @@ void UnwrappedLineParser::parseTryCatch() { parseStructuralElement(); --Line->Level; } - while (true) { + for (bool SeenCatch = false;;) { if (FormatTok->is(tok::at)) nextToken(); if 
(!(FormatTok->isOneOf(tok::kw_catch, Keywords.kw___except, @@ -3077,6 +3078,8 @@ void UnwrappedLineParser::parseTryCatch() { FormatTok->is(Keywords.kw_finally)))) { break; } + if (FormatTok->is(tok::kw_catch)) + SeenCatch = true; nextToken(); while (FormatTok->isNot(tok::l_brace)) { if (FormatTok->is(tok::l_paren)) { @@ -3090,6 +3093,10 @@ void UnwrappedLineParser::parseTryCatch() { } nextToken(); } + if (SeenCatch) { + FormatTok->setFinalizedType(TT_ControlStatementLBrace); + SeenCatch = false; + } NeedsUnwrappedLine = false; Line->MustBeDeclaration = false; CompoundStatementIndenter Indenter(this, Style, Line->Level); diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index b62d49e17c83f..49284c7f51e27 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -3419,6 +3419,17 @@ TEST_F(FormatTest, MultiLineControlStatements) { "{\n" "};", Style); + + Style = getLLVMStyle(); + Style.AllowShortBlocksOnASingleLine = FormatStyle::SBS_Always; + Style.AllowShortIfStatementsOnASingleLine = FormatStyle::SIS_WithoutElse; + Style.AllowShortLoopsOnASingleLine = true; + Style.BreakBeforeBraces = FormatStyle::BS_Custom; + Style.BraceWrapping.AfterControlStatement = FormatStyle::BWACS_MultiLine; + verifyFormat("if (true) { return; }", Style); + verifyFormat("while (true) { return; }", Style); + // Failing test in https://reviews.llvm.org/D114521#3151727 + verifyFormat("for (;;) { bar(); }", Style); } TEST_F(FormatTest, BeforeWhile) { From de528d689f3a2305c051528993fe30295f2a0cd9 Mon Sep 17 00:00:00 2001 From: Younan Zhang Date: Thu, 17 Apr 2025 10:08:50 +0800 Subject: [PATCH 203/710] [Clang] Handle default template arguments for alias CTAD guides (#134807) It's possible that some deduced template arguments come from default arguments, not just from the return type. 
So we need to recursively visit the default arguments of the parameter if it's referenced, so that the template parameters referenced by the default arguments can come along to the synthesized deduction guide. Fixes https://github.com/llvm/llvm-project/issues/134471 --- clang/docs/ReleaseNotes.rst | 1 + clang/lib/Sema/SemaTemplateDeductionGuide.cpp | 20 +++ clang/test/SemaTemplate/deduction-guide.cpp | 117 +++++++++++++++++- 3 files changed, 137 insertions(+), 1 deletion(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 07ff1251fc1ad..0bf5e07ec5dfd 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -471,6 +471,7 @@ Bug Fixes to C++ Support - Clang no longer crashes when trying to unify the types of arrays with certain differences in qualifiers (this could happen during template argument deduction or when building a ternary operator). (#GH97005) +- Fixed type alias CTAD issues involving default template arguments. (#GH134471) - The initialization kind of elements of structured bindings direct-list-initialized from an array is corrected to direct-initialization. - Clang no longer crashes when a coroutine is declared ``[[noreturn]]``.
(#GH127327) diff --git a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp index b4863cefc3fb4..29c5736a9bf9e 100644 --- a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp +++ b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp @@ -690,6 +690,26 @@ SmallVector TemplateParamsReferencedInTemplateArgumentList( SemaRef.MarkUsedTemplateParameters( DeducedArgs, TemplateParamsList->getDepth(), ReferencedTemplateParams); + auto MarkDefaultArgs = [&](auto *Param) { + if (!Param->hasDefaultArgument()) + return; + SemaRef.MarkUsedTemplateParameters( + Param->getDefaultArgument().getArgument(), + TemplateParamsList->getDepth(), ReferencedTemplateParams); + }; + + for (unsigned Index = 0; Index < TemplateParamsList->size(); ++Index) { + if (!ReferencedTemplateParams[Index]) + continue; + auto *Param = TemplateParamsList->getParam(Index); + if (auto *TTPD = dyn_cast(Param)) + MarkDefaultArgs(TTPD); + else if (auto *NTTPD = dyn_cast(Param)) + MarkDefaultArgs(NTTPD); + else + MarkDefaultArgs(cast(Param)); + } + SmallVector Results; for (unsigned Index = 0; Index < TemplateParamsList->size(); ++Index) { if (ReferencedTemplateParams[Index]) diff --git a/clang/test/SemaTemplate/deduction-guide.cpp b/clang/test/SemaTemplate/deduction-guide.cpp index 2888453a12f5b..dabd0cf12f77e 100644 --- a/clang/test/SemaTemplate/deduction-guide.cpp +++ b/clang/test/SemaTemplate/deduction-guide.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -std=c++2a -verify -ast-dump -ast-dump-decl-types -ast-dump-filter "deduction guide" %s | FileCheck %s --strict-whitespace +// RUN: %clang_cc1 -std=c++2a -verify -ast-dump -ast-dump-decl-types -ast-dump-filter "deduction guide" %s | FileCheck %s --strict-whitespace -dump-input=always template struct X {}; template typename> struct Y {}; @@ -771,3 +771,118 @@ D d(24); // CHECK-NEXT: `-ParmVarDecl {{.+}} 'U' } // namespace GH132616_DeductionGuide + +namespace GH133132 { + +template +struct A {}; + +template +using AA = A; + +AA a{}; + 
+// CHECK-LABEL: Dumping GH133132::: +// CHECK-NEXT: FunctionTemplateDecl {{.+}} implicit +// CHECK-NEXT: |-TemplateTypeParmDecl {{.+}} class depth 0 index 0 T +// CHECK-NEXT: | `-TemplateArgument type 'int' +// CHECK-NEXT: | `-BuiltinType {{.+}} 'int' +// CHECK-NEXT: |-TemplateTypeParmDecl {{.+}} class depth 0 index 1 U +// CHECK-NEXT: | `-TemplateArgument type 'T':'type-parameter-0-0' +// CHECK-NEXT: | `-TemplateTypeParmType {{.+}} 'T' dependent depth 0 index 0 +// CHECK-NEXT: | `-TemplateTypeParm {{.+}} 'T' +// CHECK-NEXT: |-TypeTraitExpr {{.+}} 'bool' __is_deducible +// CHECK-NEXT: | |-DeducedTemplateSpecializationType {{.+}} 'GH133132::AA' dependent +// CHECK-NEXT: | | `-name: 'GH133132::AA' +// CHECK-NEXT: | | `-TypeAliasTemplateDecl {{.+}} AA +// CHECK-NEXT: | `-TemplateSpecializationType {{.+}} 'A' dependent +// CHECK-NEXT: | |-name: 'A':'GH133132::A' qualified +// CHECK-NEXT: | | `-ClassTemplateDecl {{.+}} A +// CHECK-NEXT: | `-TemplateArgument type 'U':'type-parameter-0-1' +// CHECK-NEXT: | `-SubstTemplateTypeParmType {{.+}} 'U' sugar dependent class depth 0 index 0 _Ty +// CHECK-NEXT: | |-FunctionTemplate {{.+}} '' +// CHECK-NEXT: | `-TemplateTypeParmType {{.+}} 'U' dependent depth 0 index 1 +// CHECK-NEXT: | `-TemplateTypeParm {{.+}} 'U' +// CHECK-NEXT: |-CXXDeductionGuideDecl {{.+}} implicit 'auto () -> A' +// CHECK-NEXT: `-CXXDeductionGuideDecl {{.+}} implicit used 'auto () -> A' implicit_instantiation +// CHECK-NEXT: |-TemplateArgument type 'int' +// CHECK-NEXT: | `-BuiltinType {{.+}} 'int' +// CHECK-NEXT: `-TemplateArgument type 'int' +// CHECK-NEXT: `-BuiltinType {{.+}} 'int' + +template