
Commit 0fa7ee2

Colin McEwan authored and committed

Adapt memset heuristics and tests for targets with unaligned accesses

Targets with unaligned accesses can do a better job of inlining memsets without knowing the exact alignment.

1 parent 1d27eb6 commit 0fa7ee2
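The intuition behind the change, as a rough sketch of the arithmetic involved (`misalignedStoreEstimate` is a hypothetical helper mirroring the heuristic added below, not pass code): when misaligned accesses are legal, an inlined memset of N bytes costs roughly N / GPR-width full-width stores plus one store per leftover byte, independent of pointer alignment, so forming a memset wins whenever the original store count exceeds that.

```cpp
#include <cstdio>

// Hypothetical helper mirroring the commit's fast path: on a target that
// allows misaligned accesses, an inlined memset of `bytes` bytes is costed
// as bytes / gprBytes full-width stores plus one single-byte store per
// leftover byte, regardless of the destination pointer's alignment.
static unsigned misalignedStoreEstimate(unsigned bytes, unsigned gprBytes) {
  return bytes / gprBytes + bytes % gprBytes;
}

int main() {
  // An i16 + i32 + i16 pattern (as in the tests) covers 8 bytes in 3 stores.
  // With 8-byte GPRs the memset needs only 1 store: 3 > 1, profitable.
  std::printf("8-byte GPRs: %u store(s)\n", misalignedStoreEstimate(8, 8));
  // With 4-byte GPRs it needs 2 stores: 3 > 2, still profitable.
  std::printf("4-byte GPRs: %u store(s)\n", misalignedStoreEstimate(8, 4));
  return 0;
}
```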

File tree

4 files changed (+118, -46 lines)

llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h

Lines changed: 3 additions & 1 deletion

```diff
@@ -37,11 +37,13 @@ class MemorySSAUpdater;
 class MemSetInst;
 class StoreInst;
 class TargetLibraryInfo;
+class TargetTransformInfo;
 class Value;
 
 class MemCpyOptPass : public PassInfoMixin<MemCpyOptPass> {
   MemoryDependenceResults *MD = nullptr;
   TargetLibraryInfo *TLI = nullptr;
+  TargetTransformInfo *TTI = nullptr;
   AAResults *AA = nullptr;
   AssumptionCache *AC = nullptr;
   DominatorTree *DT = nullptr;
@@ -56,7 +58,7 @@ class MemCpyOptPass : public PassInfoMixin<MemCpyOptPass> {
   // Glue for the old PM.
   bool runImpl(Function &F, MemoryDependenceResults *MD, TargetLibraryInfo *TLI,
                AAResults *AA, AssumptionCache *AC, DominatorTree *DT,
-               MemorySSA *MSSA);
+               MemorySSA *MSSA, TargetTransformInfo *TTI);
 
 private:
   // Helper functions
```

llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp

Lines changed: 80 additions & 43 deletions

```diff
@@ -27,6 +27,7 @@
 #include "llvm/Analysis/MemorySSA.h"
 #include "llvm/Analysis/MemorySSAUpdater.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Argument.h"
 #include "llvm/IR/BasicBlock.h"
@@ -111,12 +112,15 @@ struct MemsetRange {
   /// TheStores - The actual stores that make up this range.
   SmallVector<Instruction*, 16> TheStores;
 
-  bool isProfitableToUseMemset(const DataLayout &DL) const;
+  bool isProfitableToUseMemset(const DataLayout &DL, TargetTransformInfo *TTI,
+                               LLVMContext *Context) const;
 };
 
 } // end anonymous namespace
 
-bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
+bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL,
+                                          TargetTransformInfo *TTI,
+                                          LLVMContext *Context) const {
   // If the merged range will take more than 16 bytes, use
   // memset. This avoids the more expensive calculation of merged
   // stores.
@@ -135,47 +139,75 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
   // together if it wants to.
   if (TheStores.size() == 2) return false;
 
-  // Estimate the number of stores that will be used to implement a
-  // memset range after the DAG Combiner has merged naturally-aligned
-  // stores.
-  //
-  // This takes account of partial alignment information, which would
-  // be discarded by converting to a memset. For example:
-  //   struct A {
-  //     char a, b, c, d, e, f, g, h;
-  //     int counter;
-  //   } *Ap;
-  //   Ap->b = Ap->c = Ap->d = Ap->e = Ap->f = Ap->g = Ap->h = 0;
-  //
-  // The overall structure alignment is 32-bits. Naively, we see 7
-  // single-byte stores, the first of which, b, is only known to be
-  // byte-aligned. However, since most architectures support 32-bit and
-  // 16-bit stores, these can be merged by DAGCombine into only 3
-  // naturally-aligned stores:
-  //   store<(store (s8) into %ir.b...)> t0, Constant:i8<0>...
-  //   store<(store (s16) into %ir.c), trunc to i16> t0, Constant:i32<0>...
-  //   store<(store (s32) into %ir.e)> t0, Constant:i32<0>...
-
-  int Offset = Start;
-  int OffsetFromMaxAlign = MaxAlignment - MaxAlignmentOffset;
-  int StoreCount = 0;
+  // Since we don't have perfect knowledge here, make some assumptions: assume
+  // the maximum GPR width is the same size as the largest legal integer
+  // size. If so, check to see whether we will end up actually reducing the
+  // number of stores used.
   unsigned MaxIntSize = DL.getLargestLegalIntTypeSizeInBits() / 8;
+  if (MaxIntSize == 0)
+    MaxIntSize = 1;
+
+  bool AllowMisaligned =
+      TTI->allowsMisalignedMemoryAccesses(*Context, MaxIntSize);
+
+  if (AllowMisaligned) {
+    // Misaligned accesses are permitted. We can assume that a memset() call
+    // can be inlined to MaxIntSize'd stores, plus single-byte stores,
+    // regardless of the alignment of the destination pointer.
 
-  while (Offset < End) {
-    unsigned StoreSize = 1;
-    for (unsigned NextStoreSize = 2;
-         NextStoreSize <= MaxIntSize && End - Offset >= NextStoreSize;
-         NextStoreSize *= 2) {
-      uint64_t StoreAlign = (DL.getABIIntegerTypeAlignment(8 * NextStoreSize)
-                                 .value());
-      if (OffsetFromMaxAlign % StoreAlign == 0)
-        StoreSize = NextStoreSize;
+    unsigned Bytes = unsigned(End - Start);
+
+    unsigned NumPointerStores = Bytes / MaxIntSize;
+
+    // Assume the remaining bytes, if any, are done a byte at a time.
+    unsigned NumByteStores = Bytes % MaxIntSize;
+
+    // If we will reduce the # stores (according to this heuristic), do the
+    // transformation. This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32
+    // etc.
+    return TheStores.size() > NumPointerStores + NumByteStores;
+  } else {
+    // Estimate the number of stores that would be used to implement the
+    // stores in the range after the DAG Combiner has merged any
+    // naturally-aligned stores.
+    //
+    // This takes account of partial alignment information, which would
+    // be discarded by converting to a memset. For example:
+    //   struct A {
+    //     char a, b, c, d, e, f, g, h;
+    //     int counter;
+    //   } *Ap;
+    //   Ap->b = Ap->c = Ap->d = Ap->e = Ap->f = Ap->g = Ap->h = 0;
+    //
+    // The overall structure alignment is 32-bits. Naively, we see 7
+    // single-byte stores, the first of which, b, is only known to be
+    // byte-aligned. However, since most architectures support 32-bit and
+    // 16-bit stores, these can be merged by DAGCombine into only 3
+    // naturally-aligned stores:
+    //   store<(store (s8) into %ir.b...)> t0, Constant:i8<0>...
+    //   store<(store (s16) into %ir.c), trunc to i16> t0, Constant:i32<0>...
+    //   store<(store (s32) into %ir.e)> t0, Constant:i32<0>...
+
+    int Offset = Start;
+    int OffsetFromMaxAlign = MaxAlignment - MaxAlignmentOffset;
+    int StoreCount = 0;
+
+    while (Offset < End) {
+      unsigned StoreSize = 1;
+      for (unsigned NextStoreSize = 2;
+           NextStoreSize <= MaxIntSize && End - Offset >= NextStoreSize;
+           NextStoreSize *= 2) {
+        uint64_t StoreAlign =
+            DL.getABIIntegerTypeAlignment(8 * NextStoreSize).value();
+        if (OffsetFromMaxAlign % StoreAlign == 0)
+          StoreSize = NextStoreSize;
+      }
+      OffsetFromMaxAlign += StoreSize;
+      Offset += StoreSize;
+      StoreCount++;
     }
-    OffsetFromMaxAlign += StoreSize;
-    Offset += StoreSize;
-    StoreCount++;
+    return StoreCount > 4;
   }
-  return StoreCount > 4;
 }
 
 namespace {
```
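For contrast with the new fast path, the alignment-aware fallback can be distilled into a standalone sketch (hedged: `estimateStores` is a simplified stand-in for the loop above, and it assumes an iN integer's ABI alignment equals its size, which holds on common targets):

```cpp
#include <cstdio>

// Simplified stand-in for the alignment-aware loop above: estimate how many
// naturally-aligned stores are needed to cover [start, end) when the first
// byte of the range lies `offsetFromAlign` bytes past the best known
// alignment boundary (the pass computes this as
// MaxAlignment - MaxAlignmentOffset).
static int estimateStores(int start, int end, int offsetFromAlign,
                          unsigned maxIntBytes) {
  int offset = start, count = 0;
  while (offset < end) {
    unsigned storeSize = 1;
    for (unsigned next = 2; next <= maxIntBytes && end - offset >= (int)next;
         next *= 2)
      if (offsetFromAlign % (int)next == 0) // naturally aligned at this width
        storeSize = next;
    offsetFromAlign += storeSize;
    offset += storeSize;
    ++count;
  }
  return count;
}

int main() {
  // The struct A example from the comment: 7 one-byte stores to b..h begin
  // one byte past a 4-byte boundary, with 4-byte GPRs. They merge into
  // s8 + s16 + s32 = 3 naturally-aligned stores; 3 <= 4, so no memset.
  std::printf("struct A estimate: %d store(s)\n", estimateStores(1, 8, 1, 4));
  return 0;
}
```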
```diff
@@ -324,6 +356,7 @@ class MemCpyOptLegacyPass : public FunctionPass {
     AU.addRequired<TargetLibraryInfoWrapperPass>();
     if (!EnableMemorySSA)
       AU.addRequired<MemoryDependenceWrapperPass>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
     AU.addPreserved<MemoryDependenceWrapperPass>();
     AU.addRequired<AAResultsWrapperPass>();
     AU.addPreserved<AAResultsWrapperPass>();
@@ -349,6 +382,7 @@ INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
 INITIALIZE_PASS_END(MemCpyOptLegacyPass, "memcpyopt", "MemCpy Optimization",
                     false, false)
 
@@ -527,7 +561,7 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
       if (Range.TheStores.size() == 1) continue;
 
       // If it is profitable to lower this range to memset, do so now.
-      if (!Range.isProfitableToUseMemset(DL))
+      if (!Range.isProfitableToUseMemset(DL, TTI, &StartInst->getContext()))
         continue;
 
       // Otherwise, we do want to transform this! Create a new memset.
@@ -1780,11 +1814,12 @@ PreservedAnalyses MemCpyOptPass::run(Function &F, FunctionAnalysisManager &AM) {
   auto *AA = &AM.getResult<AAManager>(F);
   auto *AC = &AM.getResult<AssumptionAnalysis>(F);
   auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+  auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
   auto *MSSA = EnableMemorySSA ? &AM.getResult<MemorySSAAnalysis>(F)
                                : AM.getCachedResult<MemorySSAAnalysis>(F);
 
   bool MadeChange =
-      runImpl(F, MD, &TLI, AA, AC, DT, MSSA ? &MSSA->getMSSA() : nullptr);
+      runImpl(F, MD, &TLI, AA, AC, DT, MSSA ? &MSSA->getMSSA() : nullptr, TTI);
   if (!MadeChange)
     return PreservedAnalyses::all();
 
@@ -1800,14 +1835,15 @@ PreservedAnalyses MemCpyOptPass::run(Function &F, FunctionAnalysisManager &AM) {
 bool MemCpyOptPass::runImpl(Function &F, MemoryDependenceResults *MD_,
                             TargetLibraryInfo *TLI_, AliasAnalysis *AA_,
                             AssumptionCache *AC_, DominatorTree *DT_,
-                            MemorySSA *MSSA_) {
+                            MemorySSA *MSSA_, TargetTransformInfo *TTI_) {
   bool MadeChange = false;
   MD = MD_;
   TLI = TLI_;
   AA = AA_;
   AC = AC_;
   DT = DT_;
   MSSA = MSSA_;
+  TTI = TTI_;
   MemorySSAUpdater MSSAU_(MSSA_);
   MSSAU = MSSA_ ? &MSSAU_ : nullptr;
   // If we don't have at least memset and memcpy, there is little point of doing
@@ -1841,10 +1877,11 @@ bool MemCpyOptLegacyPass::runOnFunction(Function &F) {
   auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
   auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
   auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
   auto *MSSAWP = EnableMemorySSA
                      ? &getAnalysis<MemorySSAWrapperPass>()
                      : getAnalysisIfAvailable<MemorySSAWrapperPass>();
 
   return Impl.runImpl(F, MDWP ? &MDWP->getMemDep() : nullptr, TLI, AA, AC, DT,
-                      MSSAWP ? &MSSAWP->getMSSA() : nullptr);
+                      MSSAWP ? &MSSAWP->getMSSA() : nullptr, TTI);
 }
```
Lines changed: 31 additions & 0 deletions

```diff
@@ -0,0 +1,31 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; Supported only on targets that don't have native misaligned access
+; REQUIRES: mips-registered-target
+; RUN: opt -mtriple mips < %s -memcpyopt -S -enable-memcpyopt-memoryssa=0 | FileCheck %s
+; RUN: opt -mtriple mips < %s -memcpyopt -S -enable-memcpyopt-memoryssa=1 -verify-memoryssa | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+define void @foo(i64* nocapture %P) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[P:%.*]] to i16*
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[ARRAYIDX]] to i32*
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 3
+; CHECK-NEXT:    store i16 0, i16* [[TMP0]], align 2
+; CHECK-NEXT:    store i32 0, i32* [[TMP1]], align 4
+; CHECK-NEXT:    store i16 0, i16* [[ARRAYIDX1]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = bitcast i64* %P to i16*
+  %arrayidx = getelementptr inbounds i16, i16* %0, i64 1
+  %1 = bitcast i16* %arrayidx to i32*
+  %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 3
+  store i16 0, i16* %0, align 2
+  store i32 0, i32* %1, align 4
+  store i16 0, i16* %arrayidx1, align 2
+  ret void
+}
```

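Why the CHECK lines above expect the stores to survive (a hedged reading, reusing the `estimateStores` sketch from earlier): MIPS does not report misaligned-access support, so the alignment-aware fallback runs. The best alignment known for the range is 4 bytes, from the align-4 i32 store at byte offset 2, and the estimate reproduces the original three stores exactly:

```cpp
// Continuing the estimateStores() sketch above (not pass code): the range is
// [0, 8) and its first byte sits 2 bytes off the 4-byte boundary implied by
// the i32 store (OffsetFromMaxAlign = 4 - 2 = 2), with i64 legal. The
// estimate comes out as s16 + s32 + s16 = 3 stores, and 3 <= 4, so the
// memset conversion is rejected and the stores are left intact.
int mipsEstimate = estimateStores(0, 8, 2, 8); // == 3
```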
llvm/test/Transforms/MemCpyOpt/profitable-memset.ll

Lines changed: 4 additions & 2 deletions

```diff
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=0 | FileCheck %s
-; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=1 -verify-memoryssa | FileCheck %s
+; Unsupported on targets that don't have misaligned access support
+; REQUIRES: aarch64-registered-target
+; RUN: opt -mtriple=aarch64 < %s -memcpyopt -S -enable-memcpyopt-memoryssa=0 | FileCheck %s
+; RUN: opt -mtriple=aarch64 < %s -memcpyopt -S -enable-memcpyopt-memoryssa=1 -verify-memoryssa | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 
```
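Conversely, AArch64 reports misaligned-access support, so the new fast path now decides this file's cases; the test therefore pins -mtriple=aarch64 and requires that backend to be built. A hedged check with the `misalignedStoreEstimate` sketch from the commit header, for an 8-byte range like the one in the new test:

```cpp
// 8 bytes with 8-byte GPRs (i64 is legal per the datalayout) lower to one
// wide store; 3 original stores > 1, so forming the memset stays profitable.
bool stillProfitable = 3 > misalignedStoreEstimate(8, 8); // true
```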