From 77fccb35ac08f66d52bb152735e27572bf9f3f93 Mon Sep 17 00:00:00 2001
From: hanbeom
Date: Sun, 25 Aug 2024 04:30:40 +0900
Subject: [PATCH] [AArch64] Replace AND with LSL#2 for LDR target (#34101)
 (#89531)

Currently, the `DAGCombiner` folds a shift pair consisting of `LSR`/`LSL`
into a single shift plus an `AND` mask. However, in certain cases the `AND`
generated by this combine can be removed entirely. Consider the following
case:

```
lsr x8, x8, #56
and x8, x8, #0xfc
ldr w0, [x2, x8]
ret
```

Here the `AND` can be removed by switching the `LDR` to the scaled
addressing mode `[x2, x8, lsl #2]` and increasing the right-shift amount
from 56 to 58.

After the change:

```
lsr x8, x8, #58
ldr w0, [x2, x8, lsl #2]
ret
```

This patch checks whether such a shift-plus-`AND` sequence only feeds a
load address and, if so, keeps the shift pair so that the scaling can be
folded into the load's addressing mode.
---
 .../Target/AArch64/AArch64ISelLowering.cpp   |  17 +++
 llvm/test/CodeGen/AArch64/arm64-fold-lshr.ll | 138 ++++++++++++++++++
 2 files changed, 155 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/arm64-fold-lshr.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8c2f85657ff87e..5ac5b7f8a5ab18 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18023,6 +18023,23 @@ bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
     return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
   }
 
+  // We do not need to fold when this shift is used in the specific load case:
+  //   (ldr x, (add x, (shl (srl x, c1) 2)))
+  if (N->getOpcode() == ISD::SHL && N->hasOneUse()) {
+    if (auto C2 = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
+      unsigned ShlAmt = C2->getZExtValue();
+      if (auto ShouldADD = *N->use_begin();
+          ShouldADD->getOpcode() == ISD::ADD && ShouldADD->hasOneUse()) {
+        if (auto ShouldLOAD = dyn_cast<LoadSDNode>(*ShouldADD->use_begin())) {
+          unsigned ByteVT = ShouldLOAD->getMemoryVT().getSizeInBits() / 8;
+          if ((1ULL << ShlAmt) == ByteVT &&
+              isIndexedLoadLegal(ISD::PRE_INC, ShouldLOAD->getMemoryVT()))
+            return false;
+        }
+      }
+    }
+  }
+
   return true;
 }
 
diff --git a/llvm/test/CodeGen/AArch64/arm64-fold-lshr.ll b/llvm/test/CodeGen/AArch64/arm64-fold-lshr.ll
new file mode 100644
index 00000000000000..9dfc8df703ce64
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-fold-lshr.ll
@@ -0,0 +1,138 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s
+;
+
+define i16 @load16_shr63(i64 %a, i64 %b, ptr %table) {
+; CHECK-LABEL: load16_shr63:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul x8, x1, x0
+; CHECK-NEXT:    lsr x8, x8, #63
+; CHECK-NEXT:    ldrh w0, [x2, x8, lsl #1]
+; CHECK-NEXT:    ret
+entry:
+  %mul = mul i64 %b, %a
+  %shr = lshr i64 %mul, 63
+  %arrayidx = getelementptr inbounds i16, ptr %table, i64 %shr
+  %0 = load i16, ptr %arrayidx, align 2
+  ret i16 %0
+}
+
+define i16 @load16_shr2(i64 %a, i64 %b, ptr %table) {
+; CHECK-LABEL: load16_shr2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul x8, x1, x0
+; CHECK-NEXT:    lsr x8, x8, #2
+; CHECK-NEXT:    ldrh w0, [x2, x8, lsl #1]
+; CHECK-NEXT:    ret
+entry:
+  %mul = mul i64 %b, %a
+  %shr = lshr i64 %mul, 2
+  %arrayidx = getelementptr inbounds i16, ptr %table, i64 %shr
+  %0 = load i16, ptr %arrayidx, align 2
+  ret i16 %0
+}
+
+define i16 @load16_shr1(i64 %a, i64 %b, ptr %table) {
+; CHECK-LABEL: load16_shr1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul x8, x1, x0
+; CHECK-NEXT:    lsr x8, x8, #1
+; CHECK-NEXT:    ldrh w0, [x2, x8, lsl #1]
+; CHECK-NEXT:    ret
+entry:
+  %mul = mul i64 %b, %a
+  %shr = lshr i64 %mul, 1
+  %arrayidx = getelementptr inbounds i16, ptr %table, i64 %shr
+  %0 = load i16, ptr %arrayidx, align 2
+  ret i16 %0
+}
+
+define i32 @load32_shr63(i64 %a, i64 %b, ptr %table) {
+; CHECK-LABEL: load32_shr63:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul x8, x1, x0
+; CHECK-NEXT:    lsr x8, x8, #63
+; CHECK-NEXT:    ldr w0, [x2, x8, lsl #2]
+; CHECK-NEXT:    ret
+entry:
+  %mul = mul i64 %b, %a
+  %shr = lshr i64 %mul, 63
+  %arrayidx = getelementptr inbounds i32, ptr %table, i64 %shr
+  %0 = load i32, ptr %arrayidx, align 4
+  ret i32 %0
+}
+
+define i32 @load32_shr2(i64 %a, i64 %b, ptr %table) {
+; CHECK-LABEL: load32_shr2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul x8, x1, x0
+; CHECK-NEXT:    lsr x8, x8, #2
+; CHECK-NEXT:    ldr w0, [x2, x8, lsl #2]
+; CHECK-NEXT:    ret
+entry:
+  %mul = mul i64 %b, %a
+  %shr = lshr i64 %mul, 2
+  %arrayidx = getelementptr inbounds i32, ptr %table, i64 %shr
+  %0 = load i32, ptr %arrayidx, align 4
+  ret i32 %0
+}
+
+define i32 @load32_shr1(i64 %a, i64 %b, ptr %table) {
+; CHECK-LABEL: load32_shr1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul x8, x1, x0
+; CHECK-NEXT:    lsr x8, x8, #1
+; CHECK-NEXT:    ldr w0, [x2, x8, lsl #2]
+; CHECK-NEXT:    ret
+entry:
+  %mul = mul i64 %b, %a
+  %shr = lshr i64 %mul, 1
+  %arrayidx = getelementptr inbounds i32, ptr %table, i64 %shr
+  %0 = load i32, ptr %arrayidx, align 4
+  ret i32 %0
+}
+
+define i64 @load64_shr63(i64 %a, i64 %b, ptr %table) {
+; CHECK-LABEL: load64_shr63:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul x8, x1, x0
+; CHECK-NEXT:    lsr x8, x8, #63
+; CHECK-NEXT:    ldr x0, [x2, x8, lsl #3]
+; CHECK-NEXT:    ret
+entry:
+  %mul = mul i64 %b, %a
+  %shr = lshr i64 %mul, 63
+  %arrayidx = getelementptr inbounds i64, ptr %table, i64 %shr
+  %0 = load i64, ptr %arrayidx, align 8
+  ret i64 %0
+}
+
+define i64 @load64_shr2(i64 %a, i64 %b, ptr %table) {
+; CHECK-LABEL: load64_shr2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul x8, x1, x0
+; CHECK-NEXT:    lsr x8, x8, #2
+; CHECK-NEXT:    ldr x0, [x2, x8, lsl #3]
+; CHECK-NEXT:    ret
+entry:
+  %mul = mul i64 %b, %a
+  %shr = lshr i64 %mul, 2
+  %arrayidx = getelementptr inbounds i64, ptr %table, i64 %shr
+  %0 = load i64, ptr %arrayidx, align 8
+  ret i64 %0
+}
+
+define i64 @load64_shr1(i64 %a, i64 %b, ptr %table) {
+; CHECK-LABEL: load64_shr1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul x8, x1, x0
+; CHECK-NEXT:    lsr x8, x8, #1
+; CHECK-NEXT:    ldr x0, [x2, x8, lsl #3]
+; CHECK-NEXT:    ret
+entry:
+  %mul = mul i64 %b, %a
+  %shr = lshr i64 %mul, 1
+  %arrayidx = getelementptr inbounds i64, ptr %table, i64 %shr
+  %0 = load i64, ptr %arrayidx, align 8
+  ret i64 %0
+}
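
For context, a minimal C++ sketch (not part of the patch; the function name `lookup_u32` and its signature are illustrative assumptions) of the kind of source that produces the shift-plus-`AND` addressing pattern shown in the commit message, namely indexing a 32-bit table with the top bits of a 64-bit product:

```cpp
// Hypothetical illustration only -- not part of the patch.
#include <cstdint>

uint32_t lookup_u32(uint64_t a, uint64_t b, const uint32_t *table) {
  // The *4 scaling from indexing a 32-bit table becomes "shl #2" applied to
  // the "lshr #58" result. Without this patch the DAGCombiner folds that
  // shift pair into "lsr x8, x8, #56" + "and x8, x8, #0xfc"; with it, the
  // shift pair is kept and the scale folds into "ldr w0, [x2, x8, lsl #2]".
  return table[(a * b) >> 58];
}
```

The `load32_*` functions in the new test file exercise the same pattern at the LLVM IR level.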