From 68c011aa085ab8ec198198e45c83de605a7dc31f Mon Sep 17 00:00:00 2001
From: Danila Kutenin
Date: Fri, 10 Jul 2020 09:46:57 +0200
Subject: [PATCH 01/10] [builtins] Optimize udivmodti4 for many platforms.

Summary:
While benchmarking uint128 division, we found that it has huge latency
for small divisors (https://reviews.llvm.org/D83027):

```
Benchmark                                          Time(ns)  CPU(ns)  Iterations
--------------------------------------------------------------------------------------------------
BM_DivideIntrinsic128UniformDivisor                    13.0     13.0    55000000
BM_DivideIntrinsic128UniformDivisor<__int128>          14.3     14.3    50000000
BM_RemainderIntrinsic128UniformDivisor                 13.5     13.5    52000000
BM_RemainderIntrinsic128UniformDivisor<__int128>       14.1     14.1    50000000
BM_DivideIntrinsic128SmallDivisor                       153      153     5000000
BM_DivideIntrinsic128SmallDivisor<__int128>             170      170     3000000
BM_RemainderIntrinsic128SmallDivisor                    153      153     5000000
BM_RemainderIntrinsic128SmallDivisor<__int128>          155      155     5000000
```

This patch suggests a more optimized version of the division: if the
divisor fits in 64 bits, we can proceed with the divq instruction on x86,
or with a 128-by-64-bit long-division fallback (Knuth's Algorithm D, as in
libdivide) on other platforms. Once both the divisor and the dividend are
at least 2**64, we use a branch-free shift-and-subtract algorithm, which
runs at most 64 iterations.

With this change, our benchmarks improved significantly:

```
Benchmark                                          Time(ns)  CPU(ns)  Iterations
--------------------------------------------------------------------------------------------------
BM_DivideIntrinsic128UniformDivisor                    11.0     11.0    64000000
BM_DivideIntrinsic128UniformDivisor<__int128>          13.8     13.8    51000000
BM_RemainderIntrinsic128UniformDivisor                 11.6     11.6    61000000
BM_RemainderIntrinsic128UniformDivisor<__int128>       13.7     13.7    52000000
BM_DivideIntrinsic128SmallDivisor                      27.1     27.1    26000000
BM_DivideIntrinsic128SmallDivisor<__int128>            29.4     29.4    24000000
BM_RemainderIntrinsic128SmallDivisor                   27.9     27.8    26000000
BM_RemainderIntrinsic128SmallDivisor<__int128>         29.1     29.1    25000000
```

Even without the divq intrinsic, it is still much better:

```
Benchmark                                          Time(ns)  CPU(ns)  Iterations
--------------------------------------------------------------------------------------------------
BM_DivideIntrinsic128UniformDivisor                    12.2     12.2    58000000
BM_DivideIntrinsic128UniformDivisor<__int128>          13.5     13.5    52000000
BM_RemainderIntrinsic128UniformDivisor                 12.7     12.7    56000000
BM_RemainderIntrinsic128UniformDivisor<__int128>       13.7     13.7    51000000
BM_DivideIntrinsic128SmallDivisor                      30.2     30.2    24000000
BM_DivideIntrinsic128SmallDivisor<__int128>            33.2     33.2    22000000
BM_RemainderIntrinsic128SmallDivisor                   31.4     31.4    23000000
BM_RemainderIntrinsic128SmallDivisor<__int128>         33.8     33.8    21000000
```

PowerPC benchmarks:

Was
```
BM_DivideIntrinsic128UniformDivisor                    22.3     22.3    32000000
BM_DivideIntrinsic128UniformDivisor<__int128>          23.8     23.8    30000000
BM_RemainderIntrinsic128UniformDivisor                 22.5     22.5    32000000
BM_RemainderIntrinsic128UniformDivisor<__int128>       24.9     24.9    29000000
BM_DivideIntrinsic128SmallDivisor                       394      394     2000000
BM_DivideIntrinsic128SmallDivisor<__int128>             397      397     2000000
BM_RemainderIntrinsic128SmallDivisor                    399      399     2000000
BM_RemainderIntrinsic128SmallDivisor<__int128>          397      397     2000000
```

With this patch
```
BM_DivideIntrinsic128UniformDivisor                    21.7     21.7    33000000
BM_DivideIntrinsic128UniformDivisor<__int128>          23.0     23.0    31000000
BM_RemainderIntrinsic128UniformDivisor                 21.9     21.9    33000000
BM_RemainderIntrinsic128UniformDivisor<__int128>       23.9     23.9    30000000
BM_DivideIntrinsic128SmallDivisor                      32.7     32.6    23000000
BM_DivideIntrinsic128SmallDivisor<__int128>            33.4     33.4    21000000
BM_RemainderIntrinsic128SmallDivisor 31.1 31.1 22000000 BM_RemainderIntrinsic128SmallDivisor<__int128> 33.2 33.2 22000000 ``` My email: danilak@google.com, I don't have commit rights Reviewers: howard.hinnant, courbet, MaskRay Reviewed By: courbet Subscribers: steven.zhang, #sanitizers Tags: #sanitizers Differential Revision: https://reviews.llvm.org/D81809 --- compiler-rt/lib/builtins/udivmodti4.c | 285 +++++++++++--------------- 1 file changed, 124 insertions(+), 161 deletions(-) diff --git a/compiler-rt/lib/builtins/udivmodti4.c b/compiler-rt/lib/builtins/udivmodti4.c index dd14a8b579ca50..55def37c9e1fee 100644 --- a/compiler-rt/lib/builtins/udivmodti4.c +++ b/compiler-rt/lib/builtins/udivmodti4.c @@ -14,182 +14,145 @@ #ifdef CRT_HAS_128BIT +// Returns the 128 bit division result by 64 bit. Result must fit in 64 bits. +// Remainder stored in r. +// Taken and adjusted from libdivide libdivide_128_div_64_to_64 division +// fallback. For a correctness proof see the reference for this algorithm +// in Knuth, Volume 2, section 4.3.1, Algorithm D. +UNUSED +static inline du_int udiv128by64to64default(du_int u1, du_int u0, du_int v, + du_int *r) { + const unsigned n_udword_bits = sizeof(du_int) * CHAR_BIT; + const du_int b = (1ULL << (n_udword_bits / 2)); // Number base (32 bits) + du_int un1, un0; // Norm. dividend LSD's + du_int vn1, vn0; // Norm. divisor digits + du_int q1, q0; // Quotient digits + du_int un64, un21, un10; // Dividend digit pairs + du_int rhat; // A remainder + si_int s; // Shift amount for normalization + + s = __builtin_clzll(v); + if (s > 0) { + // Normalize the divisor. + v = v << s; + un64 = (u1 << s) | (u0 >> (n_udword_bits - s)); + un10 = u0 << s; // Shift dividend left + } else { + // Avoid undefined behavior of (u0 >> 64). + un64 = u1; + un10 = u0; + } + + // Break divisor up into two 32-bit digits. + vn1 = v >> (n_udword_bits / 2); + vn0 = v & 0xFFFFFFFF; + + // Break right half of dividend into two digits. + un1 = un10 >> (n_udword_bits / 2); + un0 = un10 & 0xFFFFFFFF; + + // Compute the first quotient digit, q1. + q1 = un64 / vn1; + rhat = un64 - q1 * vn1; + + // q1 has at most error 2. No more than 2 iterations. + while (q1 >= b || q1 * vn0 > b * rhat + un1) { + q1 = q1 - 1; + rhat = rhat + vn1; + if (rhat >= b) + break; + } + + un21 = un64 * b + un1 - q1 * v; + + // Compute the second quotient digit. + q0 = un21 / vn1; + rhat = un21 - q0 * vn1; + + // q0 has at most error 2. No more than 2 iterations. 
+ while (q0 >= b || q0 * vn0 > b * rhat + un0) { + q0 = q0 - 1; + rhat = rhat + vn1; + if (rhat >= b) + break; + } + + *r = (un21 * b + un0 - q0 * v) >> s; + return q1 * b + q0; +} + +static inline du_int udiv128by64to64(du_int u1, du_int u0, du_int v, + du_int *r) { +#if defined(__x86_64__) + du_int result; + __asm__("divq %[v]" + : "=a"(result), "=d"(*r) + : [ v ] "r"(v), "a"(u0), "d"(u1)); + return result; +#else + return udiv128by64to64default(u1, u0, v, r); +#endif +} + // Effects: if rem != 0, *rem = a % b // Returns: a / b -// Translated from Figure 3-40 of The PowerPC Compiler Writer's Guide - COMPILER_RT_ABI tu_int __udivmodti4(tu_int a, tu_int b, tu_int *rem) { - const unsigned n_udword_bits = sizeof(du_int) * CHAR_BIT; const unsigned n_utword_bits = sizeof(tu_int) * CHAR_BIT; - utwords n; - n.all = a; - utwords d; - d.all = b; - utwords q; - utwords r; - unsigned sr; - // special cases, X is unknown, K != 0 - if (n.s.high == 0) { - if (d.s.high == 0) { - // 0 X - // --- - // 0 X - if (rem) - *rem = n.s.low % d.s.low; - return n.s.low / d.s.low; - } - // 0 X - // --- - // K X + utwords dividend; + dividend.all = a; + utwords divisor; + divisor.all = b; + utwords quotient; + utwords remainder; + if (divisor.all > dividend.all) { if (rem) - *rem = n.s.low; + *rem = dividend.all; return 0; } - // n.s.high != 0 - if (d.s.low == 0) { - if (d.s.high == 0) { - // K X - // --- - // 0 0 - if (rem) - *rem = n.s.high % d.s.low; - return n.s.high / d.s.low; - } - // d.s.high != 0 - if (n.s.low == 0) { - // K 0 - // --- - // K 0 - if (rem) { - r.s.high = n.s.high % d.s.high; - r.s.low = 0; - *rem = r.all; - } - return n.s.high / d.s.high; - } - // K K - // --- - // K 0 - if ((d.s.high & (d.s.high - 1)) == 0) /* if d is a power of 2 */ { - if (rem) { - r.s.low = n.s.low; - r.s.high = n.s.high & (d.s.high - 1); - *rem = r.all; - } - return n.s.high >> __builtin_ctzll(d.s.high); - } - // K K - // --- - // K 0 - sr = __builtin_clzll(d.s.high) - __builtin_clzll(n.s.high); - // 0 <= sr <= n_udword_bits - 2 or sr large - if (sr > n_udword_bits - 2) { - if (rem) - *rem = n.all; - return 0; - } - ++sr; - // 1 <= sr <= n_udword_bits - 1 - // q.all = n.all << (n_utword_bits - sr); - q.s.low = 0; - q.s.high = n.s.low << (n_udword_bits - sr); - // r.all = n.all >> sr; - r.s.high = n.s.high >> sr; - r.s.low = (n.s.high << (n_udword_bits - sr)) | (n.s.low >> sr); - } else /* d.s.low != 0 */ { - if (d.s.high == 0) { - // K X - // --- - // 0 K - if ((d.s.low & (d.s.low - 1)) == 0) /* if d is a power of 2 */ { - if (rem) - *rem = n.s.low & (d.s.low - 1); - if (d.s.low == 1) - return n.all; - sr = __builtin_ctzll(d.s.low); - q.s.high = n.s.high >> sr; - q.s.low = (n.s.high << (n_udword_bits - sr)) | (n.s.low >> sr); - return q.all; - } - // K X - // --- - // 0 K - sr = 1 + n_udword_bits + __builtin_clzll(d.s.low) - - __builtin_clzll(n.s.high); - // 2 <= sr <= n_utword_bits - 1 - // q.all = n.all << (n_utword_bits - sr); - // r.all = n.all >> sr; - if (sr == n_udword_bits) { - q.s.low = 0; - q.s.high = n.s.low; - r.s.high = 0; - r.s.low = n.s.high; - } else if (sr < n_udword_bits) /* 2 <= sr <= n_udword_bits - 1 */ { - q.s.low = 0; - q.s.high = n.s.low << (n_udword_bits - sr); - r.s.high = n.s.high >> sr; - r.s.low = (n.s.high << (n_udword_bits - sr)) | (n.s.low >> sr); - } else /* n_udword_bits + 1 <= sr <= n_utword_bits - 1 */ { - q.s.low = n.s.low << (n_utword_bits - sr); - q.s.high = (n.s.high << (n_utword_bits - sr)) | - (n.s.low >> (sr - n_udword_bits)); - r.s.high = 0; - r.s.low = n.s.high >> (sr - 
n_udword_bits); - } + // When the divisor fits in 64 bits, we can use an optimized path. + if (divisor.s.high == 0) { + remainder.s.high = 0; + if (dividend.s.high < divisor.s.low) { + // The result fits in 64 bits. + quotient.s.low = udiv128by64to64(dividend.s.high, dividend.s.low, + divisor.s.low, &remainder.s.low); + quotient.s.high = 0; } else { - // K X - // --- - // K K - sr = __builtin_clzll(d.s.high) - __builtin_clzll(n.s.high); - // 0 <= sr <= n_udword_bits - 1 or sr large - if (sr > n_udword_bits - 1) { - if (rem) - *rem = n.all; - return 0; - } - ++sr; - // 1 <= sr <= n_udword_bits - // q.all = n.all << (n_utword_bits - sr); - // r.all = n.all >> sr; - q.s.low = 0; - if (sr == n_udword_bits) { - q.s.high = n.s.low; - r.s.high = 0; - r.s.low = n.s.high; - } else { - r.s.high = n.s.high >> sr; - r.s.low = (n.s.high << (n_udword_bits - sr)) | (n.s.low >> sr); - q.s.high = n.s.low << (n_udword_bits - sr); - } + // First, divide with the high part to get the remainder in dividend.s.high. + // After that dividend.s.high < divisor.s.low. + quotient.s.high = dividend.s.high / divisor.s.low; + dividend.s.high = dividend.s.high % divisor.s.low; + quotient.s.low = udiv128by64to64(dividend.s.high, dividend.s.low, + divisor.s.low, &remainder.s.low); } + if (rem) + *rem = remainder.all; + return quotient.all; } - // Not a special case - // q and r are initialized with: - // q.all = n.all << (n_utword_bits - sr); - // r.all = n.all >> sr; - // 1 <= sr <= n_utword_bits - 1 - su_int carry = 0; - for (; sr > 0; --sr) { - // r:q = ((r:q) << 1) | carry - r.s.high = (r.s.high << 1) | (r.s.low >> (n_udword_bits - 1)); - r.s.low = (r.s.low << 1) | (q.s.high >> (n_udword_bits - 1)); - q.s.high = (q.s.high << 1) | (q.s.low >> (n_udword_bits - 1)); - q.s.low = (q.s.low << 1) | carry; - // carry = 0; - // if (r.all >= d.all) + // 0 <= shift <= 63. + si_int shift = + __builtin_clzll(divisor.s.high) - __builtin_clzll(dividend.s.high); + divisor.all <<= shift; + quotient.s.high = 0; + quotient.s.low = 0; + for (; shift >= 0; --shift) { + quotient.s.low <<= 1; + // Branch free version of. 
+ // if (dividend.all >= divisor.all) // { - // r.all -= d.all; - // carry = 1; + // dividend.all -= divisor.all; + // carry = 1; // } - const ti_int s = (ti_int)(d.all - r.all - 1) >> (n_utword_bits - 1); - carry = s & 1; - r.all -= d.all & s; + const ti_int s = + (ti_int)(divisor.all - dividend.all - 1) >> (n_utword_bits - 1); + quotient.s.low |= s & 1; + dividend.all -= divisor.all & s; + divisor.all >>= 1; } - q.all = (q.all << 1) | carry; if (rem) - *rem = r.all; - return q.all; + *rem = dividend.all; + return quotient.all; } #endif // CRT_HAS_128BIT From 21bacc215413d10df53a4690e9561e9b96698742 Mon Sep 17 00:00:00 2001 From: Valeriy Savchenko Date: Fri, 10 Jul 2020 10:54:18 +0300 Subject: [PATCH 02/10] [analyzer][tests] Measure peak memory consumption for every project Differential Revision: https://reviews.llvm.org/D82967 --- clang/utils/analyzer/Dockerfile | 5 +- clang/utils/analyzer/SATestBuild.py | 72 +++++++++++++++-------- clang/utils/analyzer/SATestUtils.py | 84 ++++++++++++++++++++++++++- clang/utils/analyzer/requirements.txt | 4 ++ 4 files changed, 136 insertions(+), 29 deletions(-) create mode 100644 clang/utils/analyzer/requirements.txt diff --git a/clang/utils/analyzer/Dockerfile b/clang/utils/analyzer/Dockerfile index 30fb67cf93c8c3..21906011c7dc2a 100644 --- a/clang/utils/analyzer/Dockerfile +++ b/clang/utils/analyzer/Dockerfile @@ -54,8 +54,7 @@ ENV PATH="/analyzer/bin:${PATH}" ADD entrypoint.py /entrypoint.py -# Uncomment in case of requirements -# ADD requirements.txt /requirements.txt -# RUN pip3 install -r /requirements.txt +ADD requirements.txt /requirements.txt +RUN pip3 install -r /requirements.txt ENTRYPOINT ["python", "/entrypoint.py"] diff --git a/clang/utils/analyzer/SATestBuild.py b/clang/utils/analyzer/SATestBuild.py index 7d337632744f09..ee510e03cc5a08 100644 --- a/clang/utils/analyzer/SATestBuild.py +++ b/clang/utils/analyzer/SATestBuild.py @@ -43,7 +43,7 @@ variable. It should contain a comma separated list. 
""" import CmpRuns -import SATestUtils +import SATestUtils as utils from ProjectMap import DownloadType, ProjectInfo import glob @@ -63,7 +63,7 @@ # and this is we can shush that false positive from plistlib import InvalidFileException # type:ignore from subprocess import CalledProcessError, check_call -from typing import Dict, IO, List, NamedTuple, Optional, TYPE_CHECKING +from typing import Dict, IO, List, NamedTuple, Optional, TYPE_CHECKING, Tuple ############################################################################### @@ -115,7 +115,7 @@ def stdout(message: str): if 'CC' in os.environ: cc_candidate: Optional[str] = os.environ['CC'] else: - cc_candidate = SATestUtils.which("clang", os.environ['PATH']) + cc_candidate = utils.which("clang", os.environ['PATH']) if not cc_candidate: stderr("Error: cannot find 'clang' in PATH") sys.exit(1) @@ -194,9 +194,9 @@ def run_cleanup_script(directory: str, build_log_file: IO): cwd = os.path.join(directory, PATCHED_SOURCE_DIR_NAME) script_path = os.path.join(directory, CLEANUP_SCRIPT) - SATestUtils.run_script(script_path, build_log_file, cwd, - out=LOCAL.stdout, err=LOCAL.stderr, - verbose=VERBOSE) + utils.run_script(script_path, build_log_file, cwd, + out=LOCAL.stdout, err=LOCAL.stderr, + verbose=VERBOSE) class TestInfo(NamedTuple): @@ -351,8 +351,6 @@ def get_output_dir(self) -> str: return OUTPUT_DIR_NAME def build(self, directory: str, output_dir: str): - time_start = time.time() - build_log_path = get_build_log_path(output_dir) stdout(f"Log file: {build_log_path}\n") @@ -375,19 +373,23 @@ def build(self, directory: str, output_dir: str): if self.project.mode == 1: self._download_and_patch(directory, build_log_file) run_cleanup_script(directory, build_log_file) - self.scan_build(directory, output_dir, build_log_file) + build_time, memory = self.scan_build(directory, output_dir, + build_log_file) else: - self.analyze_preprocessed(directory, output_dir) + build_time, memory = self.analyze_preprocessed(directory, + output_dir) if self.is_reference_build: run_cleanup_script(directory, build_log_file) normalize_reference_results(directory, output_dir, self.project.mode) - stdout(f"Build complete (time: {time.time() - time_start:.2f}). " + stdout(f"Build complete (time: {utils.time_to_str(build_time)}, " + f"peak memory: {utils.memory_to_str(memory)}). " f"See the log for more details: {build_log_path}\n") - def scan_build(self, directory: str, output_dir: str, build_log_file: IO): + def scan_build(self, directory: str, output_dir: str, + build_log_file: IO) -> Tuple[float, int]: """ Build the project with scan-build by reading in the commands and prefixing them with the scan-build options. 
@@ -416,6 +418,10 @@ def scan_build(self, directory: str, output_dir: str, build_log_file: IO): options += "--override-compiler " extra_env: Dict[str, str] = {} + + execution_time = 0.0 + peak_memory = 0 + try: command_file = open(build_script_path, "r") command_prefix = "scan-build " + options + " " @@ -451,11 +457,15 @@ def scan_build(self, directory: str, output_dir: str, build_log_file: IO): if VERBOSE >= 1: stdout(f" Executing: {command_to_run}\n") - check_call(command_to_run, cwd=cwd, - stderr=build_log_file, - stdout=build_log_file, - env=dict(os.environ, **extra_env), - shell=True) + time, mem = utils.check_and_measure_call( + command_to_run, cwd=cwd, + stderr=build_log_file, + stdout=build_log_file, + env=dict(os.environ, **extra_env), + shell=True) + + execution_time += time + peak_memory = max(peak_memory, mem) except CalledProcessError: stderr("Error: scan-build failed. Its output was: \n") @@ -463,7 +473,10 @@ def scan_build(self, directory: str, output_dir: str, build_log_file: IO): shutil.copyfileobj(build_log_file, LOCAL.stderr) sys.exit(1) - def analyze_preprocessed(self, directory: str, output_dir: str): + return execution_time, peak_memory + + def analyze_preprocessed(self, directory: str, + output_dir: str) -> Tuple[float, int]: """ Run analysis on a set of preprocessed files. """ @@ -487,14 +500,17 @@ def analyze_preprocessed(self, directory: str, output_dir: str): fail_path = os.path.join(plist_path, "failures") os.makedirs(fail_path) + execution_time = 0.0 + peak_memory = 0 + for full_file_name in glob.glob(directory + "/*"): file_name = os.path.basename(full_file_name) failed = False # Only run the analyzes on supported files. - if SATestUtils.has_no_extension(file_name): + if utils.has_no_extension(file_name): continue - if not SATestUtils.is_valid_single_input_file(file_name): + if not utils.is_valid_single_input_file(file_name): stderr(f"Error: Invalid single input file {full_file_name}.\n") raise Exception() @@ -509,8 +525,12 @@ def analyze_preprocessed(self, directory: str, output_dir: str): if VERBOSE >= 1: stdout(f" Executing: {command}\n") - check_call(command, cwd=directory, stderr=log_file, - stdout=log_file, shell=True) + time, mem = utils.check_and_measure_call( + command, cwd=directory, stderr=log_file, + stdout=log_file, shell=True) + + execution_time += time + peak_memory = max(peak_memory, mem) except CalledProcessError as e: stderr(f"Error: Analyzes of {full_file_name} failed. 
" @@ -522,6 +542,8 @@ def analyze_preprocessed(self, directory: str, output_dir: str): if not failed: os.remove(log_file.name) + return execution_time, peak_memory + def generate_config(self) -> str: out = "serialize-stats=true,stable-report-filename=true" @@ -598,9 +620,9 @@ def _unpack_zip(self, directory: str, build_log_file: IO): @staticmethod def _run_download_script(directory: str, build_log_file: IO): script_path = os.path.join(directory, DOWNLOAD_SCRIPT) - SATestUtils.run_script(script_path, build_log_file, directory, - out=LOCAL.stdout, err=LOCAL.stderr, - verbose=VERBOSE) + utils.run_script(script_path, build_log_file, directory, + out=LOCAL.stdout, err=LOCAL.stderr, + verbose=VERBOSE) @staticmethod def _apply_patch(directory: str, build_log_file: IO): diff --git a/clang/utils/analyzer/SATestUtils.py b/clang/utils/analyzer/SATestUtils.py index 4e126e66b869fe..3947e183d82f3d 100644 --- a/clang/utils/analyzer/SATestUtils.py +++ b/clang/utils/analyzer/SATestUtils.py @@ -1,8 +1,9 @@ import os import sys +import time from subprocess import CalledProcessError, check_call -from typing import List, IO, Optional +from typing import List, IO, Optional, Tuple def which(command: str, paths: Optional[str] = None) -> Optional[str]: @@ -47,6 +48,87 @@ def is_valid_single_input_file(file_name: str) -> bool: return ext in (".i", ".ii", ".c", ".cpp", ".m", "") +def time_to_str(time: float) -> str: + """ + Convert given time in seconds into a human-readable string. + """ + return f"{time:.2f}s" + + +def memory_to_str(memory: int) -> str: + """ + Convert given number of bytes into a human-readable string. + """ + if memory: + try: + import humanize + return humanize.naturalsize(memory, gnu=True) + except ImportError: + # no formatter installed, let's keep it in bytes + return f"{memory}B" + + # If memory is 0, we didn't succeed measuring it. + return "N/A" + + +def check_and_measure_call(*popenargs, **kwargs) -> Tuple[float, int]: + """ + Run command with arguments. Wait for command to complete and measure + execution time and peak memory consumption. + If the exit code was zero then return, otherwise raise + CalledProcessError. The CalledProcessError object will have the + return code in the returncode attribute. + + The arguments are the same as for the call and check_call functions. + + Return a tuple of execution time and peak memory. + """ + peak_mem = 0 + start_time = time.time() + + try: + import psutil as ps + + def get_memory(process: ps.Process) -> int: + mem = 0 + + # we want to gather memory usage from all of the child processes + descendants = list(process.children(recursive=True)) + descendants.append(process) + + for subprocess in descendants: + try: + mem += subprocess.memory_info().rss + except (ps.NoSuchProcess, ps.AccessDenied): + continue + + return mem + + with ps.Popen(*popenargs, **kwargs) as process: + # while the process is running calculate resource utilization. 
+ while (process.is_running() and + process.status() != ps.STATUS_ZOMBIE): + # track the peak utilization of the process + peak_mem = max(peak_mem, get_memory(process)) + time.sleep(.5) + + if process.is_running(): + process.kill() + + if process.returncode != 0: + cmd = kwargs.get("args") + if cmd is None: + cmd = popenargs[0] + raise CalledProcessError(process.returncode, cmd) + + except ImportError: + # back off to subprocess if we don't have psutil installed + peak_mem = 0 + check_call(*popenargs, **kwargs) + + return time.time() - start_time, peak_mem + + def run_script(script_path: str, build_log_file: IO, cwd: str, out=sys.stdout, err=sys.stderr, verbose: int = 0): """ diff --git a/clang/utils/analyzer/requirements.txt b/clang/utils/analyzer/requirements.txt new file mode 100644 index 00000000000000..ec4f669299523f --- /dev/null +++ b/clang/utils/analyzer/requirements.txt @@ -0,0 +1,4 @@ +graphviz +humanize +matplotlib +psutil From 9c7ff0a4aaef0a1efa33e9ff8a12a064087bded9 Mon Sep 17 00:00:00 2001 From: Valeriy Savchenko Date: Fri, 10 Jul 2020 11:20:20 +0300 Subject: [PATCH 03/10] [analyzer][tests] Make test interruption safe Differential Revision: https://reviews.llvm.org/D83373 --- clang/utils/analyzer/SATest.py | 44 ++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/clang/utils/analyzer/SATest.py b/clang/utils/analyzer/SATest.py index fb2e031c6ab600..16f1dce0c584e6 100755 --- a/clang/utils/analyzer/SATest.py +++ b/clang/utils/analyzer/SATest.py @@ -132,27 +132,35 @@ def docker_shell(args): pass finally: - print("Please wait for docker to clean up") - call("docker stop satest", shell=True) + docker_cleanup() def docker_run(args, command, docker_args=""): - return call("docker run --rm --name satest " - "-v {llvm}:/llvm-project " - "-v {build}:/build " - "-v {clang}:/analyzer " - "-v {scripts}:/scripts " - "-v {projects}:/projects " - "{docker_args} " - "satest-image:latest {command}" - .format(llvm=args.llvm_project_dir, - build=args.build_dir, - clang=args.clang_dir, - scripts=SCRIPTS_DIR, - projects=PROJECTS_DIR, - docker_args=docker_args, - command=command), - shell=True) + try: + return call("docker run --rm --name satest " + "-v {llvm}:/llvm-project " + "-v {build}:/build " + "-v {clang}:/analyzer " + "-v {scripts}:/scripts " + "-v {projects}:/projects " + "{docker_args} " + "satest-image:latest {command}" + .format(llvm=args.llvm_project_dir, + build=args.build_dir, + clang=args.clang_dir, + scripts=SCRIPTS_DIR, + projects=PROJECTS_DIR, + docker_args=docker_args, + command=command), + shell=True) + + except KeyboardInterrupt: + docker_cleanup() + + +def docker_cleanup(): + print("Please wait for docker to clean up") + call("docker stop satest", shell=True) def main(): From 00997d1cad9ecd40c92aeae40269f6bfb9e58d11 Mon Sep 17 00:00:00 2001 From: Valeriy Savchenko Date: Fri, 10 Jul 2020 11:20:36 +0300 Subject: [PATCH 04/10] [analyzer][tests] Fix zip unpacking Differential Revision: https://reviews.llvm.org/D83374 --- clang/utils/analyzer/SATestBuild.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/utils/analyzer/SATestBuild.py b/clang/utils/analyzer/SATestBuild.py index ee510e03cc5a08..eefab869f6ef92 100644 --- a/clang/utils/analyzer/SATestBuild.py +++ b/clang/utils/analyzer/SATestBuild.py @@ -601,7 +601,7 @@ def _download_from_git(self, directory: str, build_log_file: IO): stdout=build_log_file, shell=True) def _unpack_zip(self, directory: str, build_log_file: IO): - zip_files = 
list(glob.glob(os.path.join(directory, "/*.zip"))) + zip_files = list(glob.glob(directory + "/*.zip")) if len(zip_files) == 0: raise ValueError( From 77133cc1e2c91678082d2098b959757e72dfce60 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 10 Jul 2020 09:33:10 +0100 Subject: [PATCH 05/10] [X86][AVX] Attempt to fold PACK(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(PACK(X,Y)). Truncations lowered as shuffles of multiple (concatenated) vectors often leave us with lane-crossing shuffles that feed a PACKSS/PACKUS, if both shuffles are fed from the same 2 vector sources, then we can PACK the sources directly and shuffle the result instead. This is currently limited to whole i128 lanes in a 256-bit vector, but we can extend this if the need arises (but I'm not seeing many examples in real world code). --- llvm/lib/Target/X86/X86ISelLowering.cpp | 31 +++++++++++++++++++ llvm/test/CodeGen/X86/avg.ll | 20 +++++------- .../test/CodeGen/X86/bitcast-and-setcc-512.ll | 9 ++---- llvm/test/CodeGen/X86/packss.ll | 6 ++-- .../CodeGen/X86/vector-compare-results.ll | 10 +++--- llvm/test/CodeGen/X86/vector-pack-256.ll | 20 +++--------- 6 files changed, 52 insertions(+), 44 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index cdd7ba1ec4324f..4d3b0eda58f23d 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -41937,6 +41937,37 @@ static SDValue combineVectorPackWithShuffle(SDNode *N, SelectionDAG &DAG) { } } + // Attempt to fold PACK(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(PACK(X,Y)). + // TODO: Relax shuffle scaling to support sub-128-bit subvector shuffles. + if (VT.is256BitVector()) { + if (auto *SVN0 = dyn_cast(N0)) { + if (auto *SVN1 = dyn_cast(N1)) { + SmallVector ShuffleMask0, ShuffleMask1; + if (scaleShuffleElements(SVN0->getMask(), 2, ShuffleMask0) && + scaleShuffleElements(SVN1->getMask(), 2, ShuffleMask1)) { + SDValue Op00 = SVN0->getOperand(0); + SDValue Op01 = SVN0->getOperand(1); + SDValue Op10 = SVN1->getOperand(0); + SDValue Op11 = SVN1->getOperand(1); + if ((Op00 == Op11) && (Op01 == Op10)) { + std::swap(Op10, Op11); + ShuffleVectorSDNode::commuteMask(ShuffleMask1); + } + if ((Op00 == Op10) && (Op01 == Op11)) { + SmallVector ShuffleMask; + ShuffleMask.append(ShuffleMask0.begin(), ShuffleMask0.end()); + ShuffleMask.append(ShuffleMask1.begin(), ShuffleMask1.end()); + SDLoc DL(N); + SDValue Res = DAG.getNode(Opcode, DL, VT, Op00, Op01); + Res = DAG.getBitcast(MVT::v4i64, Res); + Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, ShuffleMask); + return DAG.getBitcast(VT, Res); + } + } + } + } + } + return SDValue(); } diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll index c6c2230a1d77dc..d2638a1681e85d 100644 --- a/llvm/test/CodeGen/X86/avg.ll +++ b/llvm/test/CodeGen/X86/avg.ll @@ -233,9 +233,8 @@ define void @avg_v24i8(<24 x i8>* %a, <24 x i8>* %b) nounwind { ; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2 ; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1 ; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm0[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 @@ -597,25 +596,22 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind { ; AVX2-NEXT: vpsubd %ymm6, %ymm5, 
%ymm5 ; AVX2-NEXT: vpsrld $1, %ymm5, %ymm5 ; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4 +; AVX2-NEXT: vpackusdw %ymm4, %ymm5, %ymm4 ; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3 ; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2 +; AVX2-NEXT: vpackusdw %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1 ; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm1[2,3],ymm0[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: vpackusdw %ymm6, %ymm0, %ymm0 +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm3[2,3],ymm2[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX2-NEXT: vpackusdw %ymm6, %ymm2, %ymm2 +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] ; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm2 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm2[2,3],ymm0[2,3] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 ; AVX2-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm5[2,3],ymm4[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm3 -; AVX2-NEXT: vpackusdw %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm4[0,2,1,3] ; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll b/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll index 2424e1c3a74d82..bf251635ffb0db 100644 --- a/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll +++ b/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll @@ -251,15 +251,12 @@ define i32 @v32i16(<32 x i16> %a, <32 x i16> %b, <32 x i16> %c, <32 x i16> %d) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpacksswb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpcmpgtw %ymm7, %ymm5, %ymm1 ; AVX2-NEXT: vpcmpgtw %ymm6, %ymm4, %ymm2 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm2[2,3],ymm1[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX2-NEXT: vpacksswb %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpacksswb %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vpmovmskb %ymm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/packss.ll b/llvm/test/CodeGen/X86/packss.ll index f4b601afb5696e..16349ae2c7f9b5 100644 --- a/llvm/test/CodeGen/X86/packss.ll +++ b/llvm/test/CodeGen/X86/packss.ll @@ -369,10 +369,8 @@ define <32 x i8> @packsswb_icmp_zero_trunc_256(<16 x i16> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1] -; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpacksswb %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,3] ; AVX2-NEXT: ret{{[l|q]}} %1 = icmp eq <16 x i16> %a0, zeroinitializer %2 = sext <16 x i1> %1 to <16 x i16> diff --git a/llvm/test/CodeGen/X86/vector-compare-results.ll b/llvm/test/CodeGen/X86/vector-compare-results.ll index 80cfa365e0f1a0..568dff8e13585e 100644 --- 
a/llvm/test/CodeGen/X86/vector-compare-results.ll +++ b/llvm/test/CodeGen/X86/vector-compare-results.ll @@ -1453,15 +1453,13 @@ define <64 x i1> @test_cmp_v64i16(<64 x i16> %a0, <64 x i16> %a1) nounwind { ; AVX2-NEXT: movq %rdi, %rax ; AVX2-NEXT: vpcmpgtw %ymm5, %ymm1, %ymm1 ; AVX2-NEXT: vpcmpgtw %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[2,3],ymm1[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpacksswb %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vpmovmskb %ymm0, %ecx ; AVX2-NEXT: vpcmpgtw %ymm7, %ymm3, %ymm0 ; AVX2-NEXT: vpcmpgtw %ymm6, %ymm2, %ymm1 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: vpacksswb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpacksswb %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vpmovmskb %ymm0, %edx ; AVX2-NEXT: shlq $32, %rdx ; AVX2-NEXT: orq %rcx, %rdx diff --git a/llvm/test/CodeGen/X86/vector-pack-256.ll b/llvm/test/CodeGen/X86/vector-pack-256.ll index fb0e5d063f6136..6d1016ba1d737a 100644 --- a/llvm/test/CodeGen/X86/vector-pack-256.ll +++ b/llvm/test/CodeGen/X86/vector-pack-256.ll @@ -24,10 +24,7 @@ define <16 x i16> @trunc_concat_packssdw_256(<8 x i32> %a0, <8 x i32> %a1) nounw ; AVX2: # %bb.0: ; AVX2-NEXT: vpsrad $17, %ymm0, %ymm0 ; AVX2-NEXT: vpsrad $23, %ymm1, %ymm1 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpackssdw %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_concat_packssdw_256: @@ -64,10 +61,7 @@ define <16 x i16> @trunc_concat_packusdw_256(<8 x i32> %a0, <8 x i32> %a1) nounw ; AVX2-NEXT: vpsrld $17, %ymm0, %ymm0 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_concat_packusdw_256: @@ -103,10 +97,7 @@ define <32 x i8> @trunc_concat_packsswb_256(<16 x i16> %a0, <16 x i16> %a1) noun ; AVX2: # %bb.0: ; AVX2-NEXT: vpsraw $15, %ymm0, %ymm0 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpacksswb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_concat_packsswb_256: @@ -155,10 +146,7 @@ define <32 x i8> @trunc_concat_packuswb_256(<16 x i16> %a0, <16 x i16> %a1) noun ; AVX2: # %bb.0: ; AVX2-NEXT: vpsrlw $15, %ymm0, %ymm0 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_concat_packuswb_256: From c06417b24dfbfd62650ab05aff78e188b5d00681 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Fri, 10 Jul 2020 01:24:58 -0700 Subject: [PATCH 06/10] Fix check-all with 
-DLLVM_USE_SANITIZER=Address --- clang/test/SemaTemplate/stack-exhaustion.cpp | 3 +++ compiler-rt/cmake/config-ix.cmake | 2 +- llvm/test/tools/gold/lit.local.cfg | 7 ++++++- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/clang/test/SemaTemplate/stack-exhaustion.cpp b/clang/test/SemaTemplate/stack-exhaustion.cpp index 1eb1b474ffa580..c7bfea4132d5ed 100644 --- a/clang/test/SemaTemplate/stack-exhaustion.cpp +++ b/clang/test/SemaTemplate/stack-exhaustion.cpp @@ -8,6 +8,9 @@ // implementation limits, just disable the test. // UNSUPPORTED: system-netbsd +// asan has own stack-overflow check. +// UNSUPPORTED: asan + // expected-warning@* 0-1{{stack nearly exhausted}} // expected-note@* 0+{{}} diff --git a/compiler-rt/cmake/config-ix.cmake b/compiler-rt/cmake/config-ix.cmake index 1f3697ff6f6595..2edc1dabd90d3d 100644 --- a/compiler-rt/cmake/config-ix.cmake +++ b/compiler-rt/cmake/config-ix.cmake @@ -650,7 +650,7 @@ endif() # TODO: Add builtins support. -if (CRT_SUPPORTED_ARCH AND OS_NAME MATCHES "Linux") +if (CRT_SUPPORTED_ARCH AND OS_NAME MATCHES "Linux" AND NOT LLVM_USE_SANITIZER) set(COMPILER_RT_HAS_CRT TRUE) else() set(COMPILER_RT_HAS_CRT FALSE) diff --git a/llvm/test/tools/gold/lit.local.cfg b/llvm/test/tools/gold/lit.local.cfg index a704bb548b9dc7..6d2835ac8be64b 100644 --- a/llvm/test/tools/gold/lit.local.cfg +++ b/llvm/test/tools/gold/lit.local.cfg @@ -1,2 +1,7 @@ -if (not 'ld_plugin' in config.available_features): +if (not 'ld_plugin' in config.available_features): config.unsupported = True + +# gold can't load instrumented plugin. +for san in ['asan', 'msan', 'ubsan']: + if (san in config.available_features): + config.unsupported = True From 9a3e8b11a8317b1a3d7440b0585b011cc9527494 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 9 Jul 2020 17:36:26 +0100 Subject: [PATCH 07/10] extractConstantWithoutWrapping - use const APInt& returned by SCEVConstant::getAPInt() Avoids unnecessary APInt copies and silences clang tidy warning. --- llvm/lib/Analysis/ScalarEvolution.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 75926aa3a96024..48c686b7326086 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -1353,7 +1353,7 @@ bool ScalarEvolution::proveNoWrapByVaryingStart(const SCEV *Start, static APInt extractConstantWithoutWrapping(ScalarEvolution &SE, const SCEVConstant *ConstantTerm, const SCEVAddExpr *WholeAddExpr) { - const APInt C = ConstantTerm->getAPInt(); + const APInt &C = ConstantTerm->getAPInt(); const unsigned BitWidth = C.getBitWidth(); // Find number of trailing zeros of (x + y + ...) w/o the C first: uint32_t TZ = BitWidth; From cf40db21af489e3e5bc1c39cea855cc068c4ce48 Mon Sep 17 00:00:00 2001 From: Mirko Brkusanin Date: Fri, 10 Jul 2020 11:32:32 +0200 Subject: [PATCH 08/10] [AMDGPU][GlobalISel] Fix G_AMDGPU_TBUFFER_STORE_FORMAT mapping Add missing mappings and tablegen definitions for TBUFFER_STORE_FORMAT. 
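For reference, a minimal IR-level sketch of the kind of call this mapping makes
selectable through GlobalISel; it mirrors the tests added below, and the function
name and immediate values are illustrative only:

```
; Sketch mirroring the added tests: store one half value through the raw
; tbuffer-store intrinsic. The last two immediates are the tbuffer format
; (78, as used in the tests) and the aux/cachepolicy bits.
declare void @llvm.amdgcn.raw.tbuffer.store.f16(half, <4 x i32>, i32, i32, i32 immarg, i32 immarg)

define amdgpu_ps void @store_half_example(half %val, <4 x i32> inreg %rsrc,
                                          i32 %voffset, i32 inreg %soffset) {
  call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc,
                                               i32 %voffset, i32 %soffset,
                                               i32 78, i32 0)
  ret void
}
```

Before this change, G_AMDGPU_TBUFFER_STORE_FORMAT and its D16 variant were
missing from the buffer-operation switch in getInstrMapping, so such stores
had no register bank mapping.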
Differential Revision: https://reviews.llvm.org/D83240 --- .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 2 + llvm/lib/Target/AMDGPU/BUFInstructions.td | 2 + .../llvm.amdgcn.raw.tbuffer.store.f16.ll | 492 +++++++++++++ .../llvm.amdgcn.raw.tbuffer.store.i8.ll | 284 ++++++++ .../llvm.amdgcn.raw.tbuffer.store.ll | 651 ++++++++++++++++++ 5 files changed, 1431 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 0d7819bc144dbd..56bc0c44779d80 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3839,6 +3839,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16: case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT: case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16: + case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT: + case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: case AMDGPU::G_AMDGPU_BUFFER_STORE: case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE: case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT: diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index 4bc9fd04b3de4f..fa42ddc54b5655 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -1799,12 +1799,14 @@ defm : MTBUF_StoreIntrinsicPat; + defm : MTBUF_StoreIntrinsicPat; defm : MTBUF_StoreIntrinsicPat; defm : MTBUF_StoreIntrinsicPat; } // End HasUnpackedD16VMem. let SubtargetPredicate = HasPackedD16VMem in { defm : MTBUF_StoreIntrinsicPat; + defm : MTBUF_StoreIntrinsicPat; defm : MTBUF_StoreIntrinsicPat; defm : MTBUF_StoreIntrinsicPat; } // End HasPackedD16VMem. 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll new file mode 100644 index 00000000000000..987c839bf91c4c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll @@ -0,0 +1,492 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=UNPACKED %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=PACKED %s + +define amdgpu_ps void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(half %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; UNPACKED-LABEL: name: raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; UNPACKED: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; UNPACKED: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; UNPACKED: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; UNPACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; UNPACKED: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; UNPACKED: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7", align 1, addrspace 4) + ; UNPACKED: S_ENDPGM 0 + ; PACKED-LABEL: name: raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; PACKED: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; PACKED: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; PACKED: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; PACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; PACKED: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; PACKED: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7", align 1, addrspace 4) + ; PACKED: S_ENDPGM 0 + call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) + ret void +} + +define amdgpu_ps void @raw_tbuffer_store_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; UNPACKED-LABEL: name: raw_tbuffer_store_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; UNPACKED: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; UNPACKED: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; UNPACKED: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; UNPACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; 
UNPACKED: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 + ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY]], implicit $exec + ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 + ; UNPACKED: TBUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; UNPACKED: S_ENDPGM 0 + ; PACKED-LABEL: name: raw_tbuffer_store_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; PACKED: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; PACKED: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; PACKED: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; PACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; PACKED: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; PACKED: TBUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; PACKED: S_ENDPGM 0 + call void @llvm.amdgcn.raw.tbuffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) + ret void +} + +; FIXME: Crashes +; define amdgpu_ps void @raw_tbuffer_store_v3f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<3 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +; call void @llvm.amdgcn.raw.tbuffer.store.v3f16(<3 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) +; ret void +; } + +define amdgpu_ps void @raw_tbuffer_store_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; UNPACKED-LABEL: name: raw_tbuffer_store_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 + ; UNPACKED: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; UNPACKED: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; UNPACKED: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; UNPACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; UNPACKED: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; UNPACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; UNPACKED: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 + ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 + ; UNPACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY]], implicit 
$exec + ; UNPACKED: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; UNPACKED: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY1]], implicit $exec + ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3 + ; UNPACKED: TBUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) + ; UNPACKED: S_ENDPGM 0 + ; PACKED-LABEL: name: raw_tbuffer_store_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 + ; PACKED: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; PACKED: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; PACKED: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; PACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; PACKED: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; PACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; PACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; PACKED: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 + ; PACKED: TBUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) + ; PACKED: S_ENDPGM 0 + call void @llvm.amdgcn.raw.tbuffer.store.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) + ret void +} + +; Waterfall for rsrc +define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffset(half %val, <4 x i32> %rsrc, i32 %voffset, i32 inreg %soffset) { + ; UNPACKED-LABEL: name: raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffset + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED: successors: %bb.2(0x80000000) + ; UNPACKED: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; UNPACKED: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; UNPACKED: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; UNPACKED: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; UNPACKED: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; UNPACKED: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; UNPACKED: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; UNPACKED: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; UNPACKED: bb.2: + ; UNPACKED: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; UNPACKED: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec + ; UNPACKED: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec + ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; 
UNPACKED: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY7]], implicit $exec + ; UNPACKED: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec + ; UNPACKED: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec + ; UNPACKED: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; UNPACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec + ; UNPACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; UNPACKED: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; UNPACKED: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7", align 1, addrspace 4) + ; UNPACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; UNPACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; UNPACKED: bb.3: + ; UNPACKED: successors: %bb.4(0x80000000) + ; UNPACKED: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; UNPACKED: bb.4: + ; UNPACKED: S_ENDPGM 0 + ; PACKED-LABEL: name: raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffset + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: successors: %bb.2(0x80000000) + ; PACKED: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; PACKED: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; PACKED: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; PACKED: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; PACKED: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; PACKED: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; PACKED: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; PACKED: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; PACKED: bb.2: + ; PACKED: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; PACKED: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec + ; PACKED: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec + ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; PACKED: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY7]], implicit $exec + ; PACKED: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec + ; PACKED: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec + ; PACKED: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, 
[[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; PACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec + ; PACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; PACKED: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; PACKED: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7", align 1, addrspace 4) + ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; PACKED: bb.3: + ; PACKED: successors: %bb.4(0x80000000) + ; PACKED: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; PACKED: bb.4: + ; PACKED: S_ENDPGM 0 + call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) + ret void +} + +; Waterfall for rsrc and soffset +define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soffset(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset) { + ; UNPACKED-LABEL: name: raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soffset + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED: successors: %bb.2(0x80000000) + ; UNPACKED: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 + ; UNPACKED: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; UNPACKED: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; UNPACKED: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; UNPACKED: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; UNPACKED: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; UNPACKED: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; UNPACKED: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; UNPACKED: bb.2: + ; UNPACKED: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; UNPACKED: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec + ; UNPACKED: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec + ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; UNPACKED: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY7]], implicit $exec + ; UNPACKED: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec + ; UNPACKED: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec + ; UNPACKED: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; UNPACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], 
[[COPY8]], implicit $exec + ; UNPACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; UNPACKED: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; UNPACKED: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; UNPACKED: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec + ; UNPACKED: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc + ; UNPACKED: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7", align 1, addrspace 4) + ; UNPACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; UNPACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; UNPACKED: bb.3: + ; UNPACKED: successors: %bb.4(0x80000000) + ; UNPACKED: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; UNPACKED: bb.4: + ; UNPACKED: S_ENDPGM 0 + ; PACKED-LABEL: name: raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soffset + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: successors: %bb.2(0x80000000) + ; PACKED: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 + ; PACKED: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; PACKED: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; PACKED: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; PACKED: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; PACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; PACKED: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; PACKED: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; PACKED: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; PACKED: bb.2: + ; PACKED: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; PACKED: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec + ; PACKED: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec + ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; PACKED: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY7]], implicit $exec + ; PACKED: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec + ; PACKED: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec + ; PACKED: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; PACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec + ; PACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 
[[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; PACKED: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; PACKED: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; PACKED: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec + ; PACKED: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc + ; PACKED: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7", align 1, addrspace 4) + ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; PACKED: bb.3: + ; PACKED: successors: %bb.4(0x80000000) + ; PACKED: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; PACKED: bb.4: + ; PACKED: S_ENDPGM 0 + call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) + ret void +} + +; Waterfall for rsrc and soffset, copy for voffset +define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffset(half %val, <4 x i32> %rsrc, i32 inreg %voffset, i32 %soffset) { + ; UNPACKED-LABEL: name: raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffset + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED: successors: %bb.2(0x80000000) + ; UNPACKED: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; UNPACKED: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; UNPACKED: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; UNPACKED: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; UNPACKED: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; UNPACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; UNPACKED: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; UNPACKED: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; UNPACKED: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; UNPACKED: bb.2: + ; UNPACKED: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; UNPACKED: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec + ; UNPACKED: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec + ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; UNPACKED: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY8]], implicit $exec + ; UNPACKED: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec + ; UNPACKED: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec + ; UNPACKED: 
[[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; UNPACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY9]], implicit $exec + ; UNPACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; UNPACKED: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; UNPACKED: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; UNPACKED: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec + ; UNPACKED: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc + ; UNPACKED: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7", align 1, addrspace 4) + ; UNPACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; UNPACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; UNPACKED: bb.3: + ; UNPACKED: successors: %bb.4(0x80000000) + ; UNPACKED: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; UNPACKED: bb.4: + ; UNPACKED: S_ENDPGM 0 + ; PACKED-LABEL: name: raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffset + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: successors: %bb.2(0x80000000) + ; PACKED: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; PACKED: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; PACKED: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; PACKED: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; PACKED: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; PACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; PACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; PACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; PACKED: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; PACKED: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; PACKED: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; PACKED: bb.2: + ; PACKED: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; PACKED: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec + ; PACKED: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec + ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; PACKED: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY8]], implicit $exec + ; PACKED: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec + ; PACKED: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec + ; PACKED: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = 
REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; PACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY9]], implicit $exec + ; PACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; PACKED: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; PACKED: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; PACKED: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec + ; PACKED: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc + ; PACKED: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7", align 1, addrspace 4) + ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; PACKED: bb.3: + ; PACKED: successors: %bb.4(0x80000000) + ; PACKED: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; PACKED: bb.4: + ; PACKED: S_ENDPGM 0 + call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) + ret void +} + +define amdgpu_ps void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(half %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; UNPACKED-LABEL: name: raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; UNPACKED: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; UNPACKED: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; UNPACKED: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; UNPACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; UNPACKED: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; UNPACKED: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7", align 1, addrspace 4) + ; UNPACKED: S_ENDPGM 0 + ; PACKED-LABEL: name: raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; PACKED: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; PACKED: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; PACKED: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; PACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; PACKED: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], 
%subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; PACKED: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7", align 1, addrspace 4) + ; PACKED: S_ENDPGM 0 + call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 1) + ret void +} + +define amdgpu_ps void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(half %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; UNPACKED-LABEL: name: raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; UNPACKED: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; UNPACKED: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; UNPACKED: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; UNPACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; UNPACKED: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; UNPACKED: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7", align 1, addrspace 4) + ; UNPACKED: S_ENDPGM 0 + ; PACKED-LABEL: name: raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; PACKED: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; PACKED: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; PACKED: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; PACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; PACKED: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; PACKED: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7", align 1, addrspace 4) + ; PACKED: S_ENDPGM 0 + call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 2) + ret void +} + +define amdgpu_ps void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_glc(half %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; UNPACKED-LABEL: name: raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_glc + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; UNPACKED: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; UNPACKED: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; UNPACKED: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; UNPACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; UNPACKED: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], 
%subreg.sub2, [[COPY4]], %subreg.sub3 + ; UNPACKED: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 1, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7", align 1, addrspace 4) + ; UNPACKED: S_ENDPGM 0 + ; PACKED-LABEL: name: raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_glc + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; PACKED: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; PACKED: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; PACKED: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; PACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; PACKED: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; PACKED: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 1, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7", align 1, addrspace 4) + ; PACKED: S_ENDPGM 0 + call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 3) + ret void +} + +define amdgpu_ps void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(half %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; UNPACKED-LABEL: name: raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; UNPACKED: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; UNPACKED: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; UNPACKED: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; UNPACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; UNPACKED: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; UNPACKED: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7", align 1, addrspace 4) + ; UNPACKED: S_ENDPGM 0 + ; PACKED-LABEL: name: raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; PACKED: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; PACKED: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; PACKED: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; PACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; PACKED: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; PACKED: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7", align 1, addrspace 4) + ; PACKED: S_ENDPGM 0 + call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, 
<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 4) + ret void +} + +declare void @llvm.amdgcn.raw.tbuffer.store.f16(half, <4 x i32>, i32, i32, i32 immarg, i32 immarg) +declare void @llvm.amdgcn.raw.tbuffer.store.v2f16(<2 x half>, <4 x i32>, i32, i32, i32 immarg, i32 immarg) +declare void @llvm.amdgcn.raw.tbuffer.store.v3f16(<3 x half>, <4 x i32>, i32, i32, i32 immarg, i32 immarg) +declare void @llvm.amdgcn.raw.tbuffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32 immarg, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll new file mode 100644 index 00000000000000..e3a09d5dd36793 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll @@ -0,0 +1,284 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=UNPACKED %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=PACKED %s + +define amdgpu_ps void @raw_tbuffer_store_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i8 %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; UNPACKED-LABEL: name: raw_tbuffer_store_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; UNPACKED: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; UNPACKED: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; UNPACKED: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; UNPACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; UNPACKED: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; UNPACKED: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "TargetCustom7", addrspace 4) + ; UNPACKED: S_ENDPGM 0 + ; PACKED-LABEL: name: raw_tbuffer_store_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; PACKED: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; PACKED: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; PACKED: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; PACKED: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; PACKED: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; PACKED: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "TargetCustom7", addrspace 4) + ; PACKED: S_ENDPGM 0 + call void @llvm.amdgcn.raw.tbuffer.store.i8(i8 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) + ret void +} + +; Waterfall for rsrc +define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffset(i8 %val, <4 x i32> 
%rsrc, i32 %voffset, i32 inreg %soffset) { + ; UNPACKED-LABEL: name: raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffset + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED: successors: %bb.2(0x80000000) + ; UNPACKED: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; UNPACKED: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; UNPACKED: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; UNPACKED: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; UNPACKED: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; UNPACKED: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; UNPACKED: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; UNPACKED: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; UNPACKED: bb.2: + ; UNPACKED: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; UNPACKED: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec + ; UNPACKED: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec + ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; UNPACKED: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY7]], implicit $exec + ; UNPACKED: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec + ; UNPACKED: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec + ; UNPACKED: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; UNPACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec + ; UNPACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; UNPACKED: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; UNPACKED: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "TargetCustom7", addrspace 4) + ; UNPACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; UNPACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; UNPACKED: bb.3: + ; UNPACKED: successors: %bb.4(0x80000000) + ; UNPACKED: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; UNPACKED: bb.4: + ; UNPACKED: S_ENDPGM 0 + ; PACKED-LABEL: name: raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffset + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: successors: %bb.2(0x80000000) + ; PACKED: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; PACKED: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; PACKED: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; PACKED: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; PACKED: 
[[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; PACKED: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; PACKED: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; PACKED: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; PACKED: bb.2: + ; PACKED: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; PACKED: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec + ; PACKED: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec + ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; PACKED: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY7]], implicit $exec + ; PACKED: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec + ; PACKED: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec + ; PACKED: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; PACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec + ; PACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; PACKED: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; PACKED: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "TargetCustom7", addrspace 4) + ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; PACKED: bb.3: + ; PACKED: successors: %bb.4(0x80000000) + ; PACKED: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; PACKED: bb.4: + ; PACKED: S_ENDPGM 0 + call void @llvm.amdgcn.raw.tbuffer.store.i8(i8 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) + ret void +} + +; Waterfall for rsrc and soffset +define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffset(i8 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset) { + ; UNPACKED-LABEL: name: raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffset + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED: successors: %bb.2(0x80000000) + ; UNPACKED: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 + ; UNPACKED: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; UNPACKED: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; UNPACKED: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; UNPACKED: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY 
$vgpr6 + ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; UNPACKED: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; UNPACKED: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; UNPACKED: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; UNPACKED: bb.2: + ; UNPACKED: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; UNPACKED: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec + ; UNPACKED: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec + ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; UNPACKED: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY7]], implicit $exec + ; UNPACKED: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec + ; UNPACKED: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec + ; UNPACKED: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; UNPACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec + ; UNPACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; UNPACKED: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; UNPACKED: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; UNPACKED: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec + ; UNPACKED: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc + ; UNPACKED: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "TargetCustom7", addrspace 4) + ; UNPACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; UNPACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; UNPACKED: bb.3: + ; UNPACKED: successors: %bb.4(0x80000000) + ; UNPACKED: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; UNPACKED: bb.4: + ; UNPACKED: S_ENDPGM 0 + ; PACKED-LABEL: name: raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffset + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: successors: %bb.2(0x80000000) + ; PACKED: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 + ; PACKED: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; PACKED: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; PACKED: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; PACKED: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; PACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], 
%subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; PACKED: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; PACKED: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; PACKED: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; PACKED: bb.2: + ; PACKED: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; PACKED: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec + ; PACKED: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec + ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; PACKED: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY7]], implicit $exec + ; PACKED: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec + ; PACKED: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec + ; PACKED: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; PACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec + ; PACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; PACKED: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; PACKED: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; PACKED: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec + ; PACKED: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc + ; PACKED: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "TargetCustom7", addrspace 4) + ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; PACKED: bb.3: + ; PACKED: successors: %bb.4(0x80000000) + ; PACKED: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; PACKED: bb.4: + ; PACKED: S_ENDPGM 0 + call void @llvm.amdgcn.raw.tbuffer.store.i8(i8 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) + ret void +} + +; Waterfall for rsrc and soffset, copy for voffset +define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffset(i8 %val, <4 x i32> %rsrc, i32 inreg %voffset, i32 %soffset) { + ; UNPACKED-LABEL: name: raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffset + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED: successors: %bb.2(0x80000000) + ; UNPACKED: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; UNPACKED: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; UNPACKED: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; UNPACKED: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; UNPACKED: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; UNPACKED: 
[[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; UNPACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; UNPACKED: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; UNPACKED: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; UNPACKED: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; UNPACKED: bb.2: + ; UNPACKED: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; UNPACKED: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec + ; UNPACKED: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec + ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; UNPACKED: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY8]], implicit $exec + ; UNPACKED: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec + ; UNPACKED: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec + ; UNPACKED: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; UNPACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY9]], implicit $exec + ; UNPACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; UNPACKED: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; UNPACKED: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; UNPACKED: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec + ; UNPACKED: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc + ; UNPACKED: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "TargetCustom7", addrspace 4) + ; UNPACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; UNPACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; UNPACKED: bb.3: + ; UNPACKED: successors: %bb.4(0x80000000) + ; UNPACKED: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; UNPACKED: bb.4: + ; UNPACKED: S_ENDPGM 0 + ; PACKED-LABEL: name: raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffset + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: successors: %bb.2(0x80000000) + ; PACKED: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; PACKED: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; PACKED: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; PACKED: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; PACKED: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = 
COPY $vgpr4 + ; PACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; PACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; PACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; PACKED: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; PACKED: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; PACKED: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; PACKED: bb.2: + ; PACKED: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; PACKED: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec + ; PACKED: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec + ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; PACKED: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY8]], implicit $exec + ; PACKED: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec + ; PACKED: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec + ; PACKED: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; PACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY9]], implicit $exec + ; PACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; PACKED: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; PACKED: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; PACKED: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec + ; PACKED: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc + ; PACKED: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "TargetCustom7", addrspace 4) + ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; PACKED: bb.3: + ; PACKED: successors: %bb.4(0x80000000) + ; PACKED: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; PACKED: bb.4: + ; PACKED: S_ENDPGM 0 + call void @llvm.amdgcn.raw.tbuffer.store.i8(i8 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.tbuffer.store.i8(i8, <4 x i32>, i32, i32, i32 immarg, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll new file mode 100644 index 00000000000000..24047146ffb796 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll @@ -0,0 +1,651 @@ +; NOTE: Assertions have 
been autogenerated by utils/update_mir_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s + +; Natural mapping +define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: $vcc_hi = IMPLICIT_DEF + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) + ret void +} + +; Natural mapping +define amdgpu_ps void @raw_tbuffer_store_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x float> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_tbuffer_store_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 + ; CHECK: $vcc_hi = IMPLICIT_DEF + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 + ; CHECK: TBUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.tbuffer.store.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) + ret void +} + +; Natural mapping +define amdgpu_ps void @raw_tbuffer_store_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<3 x float> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_tbuffer_store_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK: $vcc_hi = IMPLICIT_DEF + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK: 
[[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2 + ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; CHECK: TBUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[REG_SEQUENCE]], [[COPY7]], [[REG_SEQUENCE1]], [[COPY8]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.tbuffer.store.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) + ret void +} + +; Natural mapping +define amdgpu_ps void @raw_tbuffer_store_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x float> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_tbuffer_store_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; CHECK: $vcc_hi = IMPLICIT_DEF + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; CHECK: TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE]], [[COPY8]], [[REG_SEQUENCE1]], [[COPY9]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) + ret void +} + +; Copies for VGPR arguments +define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__sgpr_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 inreg %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_tbuffer_store_f32__sgpr_rsrc__sgpr_voffset__sgpr_soffset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0 + ; CHECK: $vcc_hi = IMPLICIT_DEF + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr7 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], 
%subreg.sub3 + ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE]], [[COPY6]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) + ret void +} + +; Waterfall for rsrc +define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, <4 x i32> %rsrc, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; CHECK: $vcc_hi = IMPLICIT_DEF + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; CHECK: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; CHECK: [[S_MOV_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32_term $exec_lo + ; CHECK: bb.2: + ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec + ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY7]], implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec + ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec + ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 94, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; CHECK: bb.3: + ; 
CHECK: successors: %bb.4(0x80000000) + ; CHECK: $exec_lo = S_MOV_B32_term [[S_MOV_B32_term]] + ; CHECK: bb.4: + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 1) + ret void +} + +; Waterfall for rsrc and soffset +define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffset(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset) { + ; CHECK-LABEL: name: raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 + ; CHECK: $vcc_hi = IMPLICIT_DEF + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; CHECK: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; CHECK: [[S_MOV_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32_term $exec_lo + ; CHECK: bb.2: + ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec + ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY7]], implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec + ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec + ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec + ; CHECK: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U32_e64_]], [[S_AND_B32_]], implicit-def $scc + ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], 
implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; CHECK: bb.3: + ; CHECK: successors: %bb.4(0x80000000) + ; CHECK: $exec_lo = S_MOV_B32_term [[S_MOV_B32_term]] + ; CHECK: bb.4: + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) + ret void +} + +; Waterfall for rsrc and soffset, copy for voffset +define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__sgpr_voffset__vgpr_soffset(float %val, <4 x i32> %rsrc, i32 inreg %voffset, i32 %soffset) { + ; CHECK-LABEL: name: raw_tbuffer_store_f32__vgpr_rsrc__sgpr_voffset__vgpr_soffset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; CHECK: $vcc_hi = IMPLICIT_DEF + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; CHECK: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; CHECK: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; CHECK: [[S_MOV_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32_term $exec_lo + ; CHECK: bb.2: + ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec + ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY8]], implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec + ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY9]], implicit $exec + ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec + ; CHECK: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U32_e64_]], [[S_AND_B32_]], implicit-def $scc + ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact 
[[COPY]], [[COPY7]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; CHECK: bb.3: + ; CHECK: successors: %bb.4(0x80000000) + ; CHECK: $exec_lo = S_MOV_B32_term [[S_MOV_B32_term]] + ; CHECK: bb.4: + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) + ret void +} + +; Natural mapping + glc +define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: $vcc_hi = IMPLICIT_DEF + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 1) + ret void +} + +; Natural mapping + slc +define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: $vcc_hi = IMPLICIT_DEF + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 2) + ret void +} + +; Natural mapping + glc + slc +define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_glc(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg 
%soffset) { + ; CHECK-LABEL: name: raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_glc + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: $vcc_hi = IMPLICIT_DEF + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 1, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 3) + ret void +} + +; Natural mapping + dlc +define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: $vcc_hi = IMPLICIT_DEF + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 4) + ret void +} + + + +define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vdpr_voffset__sgpr_soffset__voffset0(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_tbuffer_store_f32__sgpr_rsrc__vdpr_voffset__sgpr_soffset__voffset0 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 + ; CHECK: $vcc_hi = IMPLICIT_DEF + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, 
<4 x i32> %rsrc, i32 0, i32 %soffset, i32 94, i32 0) + ret void +} + +define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset4095(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset4095 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 + ; CHECK: $vcc_hi = IMPLICIT_DEF + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 4095, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4095, align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 94, i32 0) + ret void +} + +define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset4096(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset4096 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 + ; CHECK: $vcc_hi = IMPLICIT_DEF + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 + ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY6]], [[REG_SEQUENCE]], [[COPY5]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4096, align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 4096, i32 %soffset, i32 94, i32 0) + ret void +} + +define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add16(float %val, <4 x i32> inreg %rsrc, i32 %voffset.base, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add16 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: $vcc_hi = IMPLICIT_DEF + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; 
CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 16, align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + %voffset = add i32 %voffset.base, 16 + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) + ret void +} + +define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4095(float %val, <4 x i32> inreg %rsrc, i32 %voffset.base, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4095 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: $vcc_hi = IMPLICIT_DEF + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4095, align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + %voffset = add i32 %voffset.base, 4095 + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) + ret void +} + +define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4096(float %val, <4 x i32> inreg %rsrc, i32 %voffset.base, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4096 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: $vcc_hi = IMPLICIT_DEF + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 + ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; CHECK: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec + ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[V_ADD_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4096, align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + %voffset = add i32 %voffset.base, 4096 + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) + ret void +} + +define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset4095(float %val, <4 x i32> inreg %rsrc, i32 %voffset) { + ; CHECK-LABEL: name: 
raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset4095 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0, $vgpr1 + ; CHECK: $vcc_hi = IMPLICIT_DEF + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 + ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 94, i32 0) + ret void +} + +define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset4096(float %val, <4 x i32> inreg %rsrc, i32 %voffset) { + ; CHECK-LABEL: name: raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset4096 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0, $vgpr1 + ; CHECK: $vcc_hi = IMPLICIT_DEF + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 + ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 94, i32 0) + ret void +} + +define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add16(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset.base) { + ; CHECK-LABEL: name: raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add16 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: $vcc_hi = IMPLICIT_DEF + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 + ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY6]], [[S_MOV_B32_]], implicit-def $scc + ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 
94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + %soffset = add i32 %soffset.base, 16 + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) + ret void +} + +define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add4095(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset.base) { + ; CHECK-LABEL: name: raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add4095 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: $vcc_hi = IMPLICIT_DEF + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 + ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY6]], [[S_MOV_B32_]], implicit-def $scc + ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + %soffset = add i32 %soffset.base, 4095 + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) + ret void +} + +define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add4096(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset.base) { + ; CHECK-LABEL: name: raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add4096 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: $vcc_hi = IMPLICIT_DEF + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 + ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY6]], [[S_MOV_B32_]], implicit-def $scc + ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + %soffset = add i32 %soffset.base, 4096 + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) + ret void +} + +; An add of the offset is necessary, with a waterfall loop. Make sure the add is done outside of the waterfall loop. 
+define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add5000(float %val, <4 x i32> %rsrc, i32 %voffset, i32 inreg %soffset.base) { + ; CHECK-LABEL: name: raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add5000 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; CHECK: $vcc_hi = IMPLICIT_DEF + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000 + ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY6]], [[S_MOV_B32_]], implicit-def $scc + ; CHECK: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; CHECK: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; CHECK: [[S_MOV_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32_term $exec_lo + ; CHECK: bb.2: + ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec + ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY7]], implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec + ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec + ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[S_ADD_I32_]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; CHECK: bb.3: + ; CHECK: successors: %bb.4(0x80000000) + ; CHECK: $exec_lo = S_MOV_B32_term [[S_MOV_B32_term]] + ; CHECK: bb.4: + ; CHECK: S_ENDPGM 0 + %soffset = add i32 %soffset.base, 5000 + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 
%voffset, i32 %soffset, i32 94, i32 0) + ret void +} + +; An add of the offset is necessary, with a waterfall loop. Make sure the add is done outside of the waterfall loop. +define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add5000(float %val, <4 x i32> %rsrc, i32 %voffset.base, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add5000 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; CHECK: $vcc_hi = IMPLICIT_DEF + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 + ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; CHECK: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec + ; CHECK: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; CHECK: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; CHECK: [[S_MOV_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32_term $exec_lo + ; CHECK: bb.2: + ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec + ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY8]], implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec + ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY9]], implicit $exec + ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[V_ADD_U32_e64_]], [[REG_SEQUENCE3]], [[COPY6]], 904, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 5000, align 1, addrspace 4) + ; CHECK: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; CHECK: bb.3: + ; CHECK: 
successors: %bb.4(0x80000000) + ; CHECK: $exec_lo = S_MOV_B32_term [[S_MOV_B32_term]] + ; CHECK: bb.4: + ; CHECK: S_ENDPGM 0 + %voffset = add i32 %voffset.base, 5000 + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.tbuffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg, i32 immarg) +declare void @llvm.amdgcn.raw.tbuffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32 immarg, i32 immarg) +declare void @llvm.amdgcn.raw.tbuffer.store.v3f32(<3 x float>, <4 x i32>, i32, i32, i32 immarg, i32 immarg) +declare void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32 immarg, i32 immarg) From 264ab1e2c815728ede5d1fce257abbd04044cc27 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 10 Jul 2020 09:45:02 +0100 Subject: [PATCH 09/10] [LV] Pick vector loop body as insert point for SCEV expansion. Currently the DomTree is not kept up to date for additional blocks generated in the vector loop, for example when vectorizing with predication. SCEVExpander relies on dominance checks when looking for existing instructions to re-use and in some cases that can lead to the expander picking instructions that do not actually dominate their insert point (e.g. as in PR46525). Unfortunately keeping the DT up-to-date is a bit tricky, because the CFG is only patched up after generating code for a block. For now, we can just use the vector loop header, as this ensures the inserted instructions dominate all uses in the vector loop. There should be no noticeable impact on the generated code, as other passes should sink those instructions, if profitable. Fixes PR46525. Reviewers: Ayal, gilr, mkazantsev, dmgreen Reviewed By: dmgreen Differential Revision: https://reviews.llvm.org/D83288 --- .../Transforms/Vectorize/LoopVectorize.cpp | 18 ++- .../pr46525-expander-insertpoint.ll | 114 ++++++++++++++++++ 2 files changed, 129 insertions(+), 3 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 4998082f386890..10e690d56ffd16 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2877,6 +2877,18 @@ Value *InnerLoopVectorizer::emitTransformedIndex( return B.CreateMul(X, Y); }; + // Get a suitable insert point for SCEV expansion. For blocks in the vector + // loop, choose the end of the vector loop header (=LoopVectorBody), because + // the DomTree is not kept up-to-date for additional blocks generated in the + // vector loop. By using the header as insertion point, we guarantee that the + // expanded instructions dominate all their uses. 
+ auto GetInsertPoint = [this, &B]() { + BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); + if (InsertBB != LoopVectorBody && + LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) + return LoopVectorBody->getTerminator(); + return &*B.GetInsertPoint(); + }; switch (ID.getKind()) { case InductionDescriptor::IK_IntInduction: { assert(Index->getType() == StartValue->getType() && @@ -2884,7 +2896,7 @@ Value *InnerLoopVectorizer::emitTransformedIndex( if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) return B.CreateSub(StartValue, Index); auto *Offset = CreateMul( - Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint())); + Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); return CreateAdd(StartValue, Offset); } case InductionDescriptor::IK_PtrInduction: { @@ -2892,8 +2904,8 @@ Value *InnerLoopVectorizer::emitTransformedIndex( "Expected constant step for pointer induction"); return B.CreateGEP( StartValue->getType()->getPointerElementType(), StartValue, - CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(), - &*B.GetInsertPoint()))); + CreateMul(Index, + Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()))); } case InductionDescriptor::IK_FpInduction: { assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); diff --git a/llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll b/llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll new file mode 100644 index 00000000000000..98f394e2dc3d73 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll @@ -0,0 +1,114 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -loop-vectorize -force-vector-width=2 -S -prefer-predicate-over-epilog %s | FileCheck %s + + +; Test case for PR46525. There are two candidates to pick for +; `udiv i64 %y, %add` when expanding SCEV expressions. Make sure we pick %div, +; which dominates the vector loop. 
+ +define void @test(i16 %x, i64 %y, i32* %ptr) { +; CHECK-LABEL: @test( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CONV19:%.*]] = sext i16 [[X:%.*]] to i64 +; CHECK-NEXT: [[ADD:%.*]] = add i64 [[CONV19]], 492802768830814067 +; CHECK-NEXT: br label [[LOOP_PREHEADER:%.*]] +; CHECK: loop.preheader: +; CHECK-NEXT: [[DIV:%.*]] = udiv i64 [[Y:%.*]], [[ADD]] +; CHECK-NEXT: [[INC:%.*]] = add i64 [[DIV]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = add nuw nsw i64 [[DIV]], 4 +; CHECK-NEXT: [[TMP1:%.*]] = udiv i64 [[TMP0]], [[INC]] +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP2]], 1 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], [[INC]] +; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP2]], 1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> undef, i64 [[TRIP_COUNT_MINUS_1]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], [[INC]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> undef, i64 [[OFFSET_IDX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i64> undef, i64 [[INC]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[DOTSPLATINSERT]], <2 x i64> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = mul <2 x i64> , [[DOTSPLAT]] +; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i64> [[BROADCAST_SPLAT2]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = mul i64 0, [[INC]] +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], [[TMP4]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i64> undef, i64 [[INDEX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT3]], <2 x i64> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = add <2 x i64> [[BROADCAST_SPLAT4]], +; CHECK-NEXT: [[TMP6:%.*]] = icmp ule <2 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0 +; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK: pred.store.if: +; CHECK-NEXT: store i32 0, i32* [[PTR:%.*]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] +; CHECK: pred.store.continue: +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1 +; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]] +; CHECK: pred.store.if5: +; CHECK-NEXT: store i32 0, i32* [[PTR]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]] +; CHECK: pred.store.continue6: +; CHECK-NEXT: [[OFFSET_IDX7:%.*]] = mul i64 [[INDEX]], [[INC]] +; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[OFFSET_IDX7]] to i8 +; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[INC]] to i8 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <2 x i8> undef, i8 [[TMP9]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <2 x i8> 
[[BROADCAST_SPLATINSERT8]], <2 x i8> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[DOTSPLATINSERT10:%.*]] = insertelement <2 x i8> undef, i8 [[TMP10]], i32 0 +; CHECK-NEXT: [[DOTSPLAT11:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT10]], <2 x i8> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = mul <2 x i8> , [[DOTSPLAT11]] +; CHECK-NEXT: [[INDUCTION12:%.*]] = add <2 x i8> [[BROADCAST_SPLAT9]], [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = mul i8 0, [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = add i8 [[TMP9]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 +; +entry: + %conv19 = sext i16 %x to i64 + %add = add i64 %conv19, 492802768830814067 + br label %loop.preheader + +loop.preheader: + %div = udiv i64 %y, %add + %inc = add i64 %div, 1 + br label %loop + +loop: + %iv = phi i64 [ %iv.next, %loop ], [ 0, %loop.preheader ] + store i32 0, i32* %ptr, align 4 + %v2 = trunc i64 %iv to i8 + %v3 = add i8 %v2, 1 + %cmp15 = icmp slt i8 %v3, 5 + %iv.next = add i64 %iv, %inc + br i1 %cmp15, label %loop, label %loop.exit + +loop.exit: + %div.1 = udiv i64 %y, %add + %v1 = add i64 %div.1, 1 + br label %loop.2 + +loop.2: + %iv.1 = phi i64 [ %iv.next.1, %loop.2 ], [ 0, %loop.exit ] + %iv.next.1 = add i64 %iv.1, %v1 + call void @use(i64 %iv.next.1) + %ec = icmp ult i64 %iv.next.1, 200 + br i1 %ec, label %loop.2, label %loop.2.exit + +loop.2.exit: + %c = call i1 @cond() + br i1 %c, label %loop.preheader, label %exit + +exit: + ret void +} + +declare void @use(i64) +declare i1 @cond() From 5f41ca48d1c46fc78958d47c0edfb2dbcde47217 Mon Sep 17 00:00:00 2001 From: Haojian Wu Date: Fri, 10 Jul 2020 11:42:04 +0200 Subject: [PATCH 10/10] [clang-tidy] More strict on matching the standard memset function in memset-usage check. The check assumed the matched function call has 3 arguments, but the matcher didn't guaranteed that. Differential Revision: https://reviews.llvm.org/D83301 --- .../bugprone/SuspiciousMemsetUsageCheck.cpp | 16 ++++++++++++---- .../bugprone-suspicious-memset-usage.cpp | 5 +++++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemsetUsageCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemsetUsageCheck.cpp index 9f98316984ed8d..37748d9fa8ccfb 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemsetUsageCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemsetUsageCheck.cpp @@ -20,11 +20,19 @@ namespace tidy { namespace bugprone { void SuspiciousMemsetUsageCheck::registerMatchers(MatchFinder *Finder) { - // Note: void *memset(void *buffer, int fill_char, size_t byte_count); + // Match the standard memset: + // void *memset(void *buffer, int fill_char, size_t byte_count); + auto MemsetDecl = + functionDecl(hasName("::memset"), + parameterCountIs(3), + hasParameter(0, hasType(pointerType(pointee(voidType())))), + hasParameter(1, hasType(isInteger())), + hasParameter(2, hasType(isInteger()))); + // Look for memset(x, '0', z). Probably memset(x, 0, z) was intended. 
Finder->addMatcher( callExpr( - callee(functionDecl(hasName("::memset"))), + callee(MemsetDecl), hasArgument(1, characterLiteral(equals(static_cast<unsigned>('0'))) .bind("char-zero-fill")), unless( @@ -36,14 +44,14 @@ void SuspiciousMemsetUsageCheck::registerMatchers(MatchFinder *Finder) { // Look for memset with an integer literal in its fill_char argument. // Will check if it gets truncated. - Finder->addMatcher(callExpr(callee(functionDecl(hasName("::memset"))), + Finder->addMatcher(callExpr(callee(MemsetDecl), hasArgument(1, integerLiteral().bind("num-fill")), unless(isInTemplateInstantiation())), this); // Look for memset(x, y, 0) as that is most likely an argument swap. Finder->addMatcher( - callExpr(callee(functionDecl(hasName("::memset"))), + callExpr(callee(MemsetDecl), unless(hasArgument(1, anyOf(characterLiteral(equals( static_cast<unsigned>('0'))), integerLiteral()))), diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone-suspicious-memset-usage.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone-suspicious-memset-usage.cpp index f33ae5ae10a846..9a7e423f40122e 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone-suspicious-memset-usage.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone-suspicious-memset-usage.cpp @@ -75,3 +75,8 @@ void foo(int xsize, int ysize) { // despite v == 0. memset(p, -1, v); } + +void *memset(int); +void NoCrash() { + memset(1); +}
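
A note on the stricter matcher above, with an illustrative example (the size_t_ typedef, the example function and the comments below are ours, not part of the patch): binding the callee to a functionDecl with exactly three parameters of the standard void *memset(void *, int, size_t) shape means the diagnostic callbacks can safely read the fill-character and byte-count arguments, while a translation unit that declares an unrelated single-argument ::memset (as in the new NoCrash test) is no longer matched at all instead of crashing the check. A minimal sketch of both situations:
```
// Hypothetical translation unit, only to illustrate what the stricter
// matcher does and does not accept; not taken from the patch.
typedef unsigned long size_t_;

// Standard-shaped declaration: three parameters (void *buffer, int fill_char,
// integer byte_count). Calls through it are still analyzed by the check.
void *memset(void *buffer, int fill_char, size_t_ byte_count);

// Single-argument overload, mirroring the NoCrash test case. Before the
// patch it was matched purely by its name and the check then inspected
// arguments that do not exist; with parameterCountIs(3) it is skipped.
void *memset(int);

void example(char *buf, size_t_ n) {
  memset(buf, '0', n); // still a candidate for the '0'-as-fill warning
  memset(1);           // non-standard overload: now ignored by the check
}
```
Requiring the full standard signature up front keeps the diagnostic callbacks simple, since every matched call is guaranteed to carry all three arguments.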