
Commit 0fa7ee2

Colin McEwan authored and committed

Adapt memset heuristics and tests for targets with unaligned accesses

Targets with unaligned accesses can do a better job of inlining memsets without knowing the exact alignment.

1 parent 1d27eb6 commit 0fa7ee2
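The intuition behind the change, as a rough sketch of the arithmetic involved (`misalignedStoreEstimate` is a hypothetical helper mirroring the heuristic added below, not pass code): when misaligned accesses are legal, an inlined memset of N bytes costs roughly N / GPR-width full-width stores plus one store per leftover byte, independent of pointer alignment, so forming a memset wins whenever the original store count exceeds that.

```cpp
#include <cstdio>

// Hypothetical helper mirroring the commit's fast path: on a target that
// allows misaligned accesses, an inlined memset of `bytes` bytes is costed
// as bytes / gprBytes full-width stores plus one single-byte store per
// leftover byte, regardless of the destination pointer's alignment.
static unsigned misalignedStoreEstimate(unsigned bytes, unsigned gprBytes) {
  return bytes / gprBytes + bytes % gprBytes;
}

int main() {
  // An i16 + i32 + i16 pattern (as in the tests) covers 8 bytes in 3 stores.
  // With 8-byte GPRs the memset needs only 1 store: 3 > 1, profitable.
  std::printf("8-byte GPRs: %u store(s)\n", misalignedStoreEstimate(8, 8));
  // With 4-byte GPRs it needs 2 stores: 3 > 2, still profitable.
  std::printf("4-byte GPRs: %u store(s)\n", misalignedStoreEstimate(8, 4));
  return 0;
}
```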

File tree

4 files changed (+118, -46 lines)

llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h

Lines changed: 3 additions & 1 deletion

```diff
@@ -37,11 +37,13 @@ class MemorySSAUpdater;
 class MemSetInst;
 class StoreInst;
 class TargetLibraryInfo;
+class TargetTransformInfo;
 class Value;
 
 class MemCpyOptPass : public PassInfoMixin<MemCpyOptPass> {
   MemoryDependenceResults *MD = nullptr;
   TargetLibraryInfo *TLI = nullptr;
+  TargetTransformInfo *TTI = nullptr;
   AAResults *AA = nullptr;
   AssumptionCache *AC = nullptr;
   DominatorTree *DT = nullptr;
@@ -56,7 +58,7 @@ class MemCpyOptPass : public PassInfoMixin<MemCpyOptPass> {
   // Glue for the old PM.
   bool runImpl(Function &F, MemoryDependenceResults *MD, TargetLibraryInfo *TLI,
                AAResults *AA, AssumptionCache *AC, DominatorTree *DT,
-               MemorySSA *MSSA);
+               MemorySSA *MSSA, TargetTransformInfo *TTI);
 
 private:
   // Helper functions
```

llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp

Lines changed: 80 additions & 43 deletions

```diff
@@ -27,6 +27,7 @@
 #include "llvm/Analysis/MemorySSA.h"
 #include "llvm/Analysis/MemorySSAUpdater.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Argument.h"
 #include "llvm/IR/BasicBlock.h"
@@ -111,12 +112,15 @@ struct MemsetRange {
   /// TheStores - The actual stores that make up this range.
   SmallVector<Instruction*, 16> TheStores;
 
-  bool isProfitableToUseMemset(const DataLayout &DL) const;
+  bool isProfitableToUseMemset(const DataLayout &DL, TargetTransformInfo *TTI,
+                               LLVMContext *Context) const;
 };
 
 } // end anonymous namespace
 
-bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
+bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL,
+                                          TargetTransformInfo *TTI,
+                                          LLVMContext *Context) const {
   // If the merged range will take more than 16 bytes, use
   // memset. This avoids the more expensive calculation of merged
   // stores.
@@ -135,47 +139,75 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
   // together if it wants to.
   if (TheStores.size() == 2) return false;
 
-  // Estimate the number of stores that will be used to implement a
-  // memset range after the DAG Combiner has merged naturally-aligned
-  // stores.
-  //
-  // This takes account of partial alignment information, which would
-  // be discarded by converting to a memset. For example:
-  //   struct A {
-  //     char a, b, c, d, e, f, g, h;
-  //     int counter;
-  //   } *Ap;
-  //   Ap->b = Ap->c = Ap->d = Ap->e = Ap->f = Ap->g = Ap->h = 0;
-  //
-  // The overall structure alignment is 32-bits. Naively, we see 7
-  // single-byte stores, the first of which, b, is only known to be
-  // byte-aligned. However, since most architectures support 32-bit and
-  // 16-bit stores, these can be merged by DAGCombine into only 3
-  // naturally-aligned stores:
-  //   store<(store (s8) into %ir.b...)> t0, Constant:i8<0>...
-  //   store<(store (s16) into %ir.c), trunc to i16> t0, Constant:i32<0>...
-  //   store<(store (s32) into %ir.e)> t0, Constant:i32<0>...
-
-  int Offset = Start;
-  int OffsetFromMaxAlign = MaxAlignment - MaxAlignmentOffset;
-  int StoreCount = 0;
+  // Since we don't have perfect knowledge here, make some assumptions: assume
+  // the maximum GPR width is the same size as the largest legal integer
+  // size. If so, check to see whether we will end up actually reducing the
+  // number of stores used.
   unsigned MaxIntSize = DL.getLargestLegalIntTypeSizeInBits() / 8;
+  if (MaxIntSize == 0)
+    MaxIntSize = 1;
+
+  bool AllowMisaligned =
+      TTI->allowsMisalignedMemoryAccesses(*Context, MaxIntSize);
+
+  if (AllowMisaligned) {
+    // Misaligned accesses are permitted. We can assume that a memset() call
+    // can be inlined to MaxIntSize'd stores, plus single-byte stores,
+    // regardless of the alignment of the destination pointer.
 
-  while (Offset < End) {
-    unsigned StoreSize = 1;
-    for (unsigned NextStoreSize = 2;
-         NextStoreSize <= MaxIntSize && End - Offset >= NextStoreSize;
-         NextStoreSize *= 2) {
-      uint64_t StoreAlign = (DL.getABIIntegerTypeAlignment(8 * NextStoreSize)
-                                 .value());
-      if (OffsetFromMaxAlign % StoreAlign == 0)
-        StoreSize = NextStoreSize;
+    unsigned Bytes = unsigned(End - Start);
+
+    unsigned NumPointerStores = Bytes / MaxIntSize;
+
+    // Assume the remaining bytes, if any, are done a byte at a time.
+    unsigned NumByteStores = Bytes % MaxIntSize;
+
+    // If we will reduce the # stores (according to this heuristic), do the
+    // transformation. This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32
+    // etc.
+    return TheStores.size() > NumPointerStores + NumByteStores;
+  } else {
+    // Estimate the number of stores that would be used to implement the
+    // stores in the range after the DAG Combiner has merged any
+    // naturally-aligned stores.
+    //
+    // This takes account of partial alignment information, which would
+    // be discarded by converting to a memset. For example:
+    //   struct A {
+    //     char a, b, c, d, e, f, g, h;
+    //     int counter;
+    //   } *Ap;
+    //   Ap->b = Ap->c = Ap->d = Ap->e = Ap->f = Ap->g = Ap->h = 0;
+    //
+    // The overall structure alignment is 32-bits. Naively, we see 7
+    // single-byte stores, the first of which, b, is only known to be
+    // byte-aligned. However, since most architectures support 32-bit and
+    // 16-bit stores, these can be merged by DAGCombine into only 3
+    // naturally-aligned stores:
+    //   store<(store (s8) into %ir.b...)> t0, Constant:i8<0>...
+    //   store<(store (s16) into %ir.c), trunc to i16> t0, Constant:i32<0>...
+    //   store<(store (s32) into %ir.e)> t0, Constant:i32<0>...
+
+    int Offset = Start;
+    int OffsetFromMaxAlign = MaxAlignment - MaxAlignmentOffset;
+    int StoreCount = 0;
+
+    while (Offset < End) {
+      unsigned StoreSize = 1;
+      for (unsigned NextStoreSize = 2;
+           NextStoreSize <= MaxIntSize && End - Offset >= NextStoreSize;
+           NextStoreSize *= 2) {
+        uint64_t StoreAlign =
+            DL.getABIIntegerTypeAlignment(8 * NextStoreSize).value();
+        if (OffsetFromMaxAlign % StoreAlign == 0)
+          StoreSize = NextStoreSize;
+      }
+      OffsetFromMaxAlign += StoreSize;
+      Offset += StoreSize;
+      StoreCount++;
     }
-    OffsetFromMaxAlign += StoreSize;
-    Offset += StoreSize;
-    StoreCount++;
+    return StoreCount > 4;
   }
-  return StoreCount > 4;
 }
 
 namespace {
```
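For contrast with the new fast path, the alignment-aware fallback can be distilled into a standalone sketch (hedged: `estimateStores` is a simplified stand-in for the loop above, and it assumes an iN integer's ABI alignment equals its size, which holds on common targets):

```cpp
#include <cstdio>

// Simplified stand-in for the alignment-aware loop above: estimate how many
// naturally-aligned stores are needed to cover [start, end) when the first
// byte of the range lies `offsetFromAlign` bytes past the best known
// alignment boundary (the pass computes this as
// MaxAlignment - MaxAlignmentOffset).
static int estimateStores(int start, int end, int offsetFromAlign,
                          unsigned maxIntBytes) {
  int offset = start, count = 0;
  while (offset < end) {
    unsigned storeSize = 1;
    for (unsigned next = 2; next <= maxIntBytes && end - offset >= (int)next;
         next *= 2)
      if (offsetFromAlign % (int)next == 0) // naturally aligned at this width
        storeSize = next;
    offsetFromAlign += storeSize;
    offset += storeSize;
    ++count;
  }
  return count;
}

int main() {
  // The struct A example from the comment: 7 one-byte stores to b..h begin
  // one byte past a 4-byte boundary, with 4-byte GPRs. They merge into
  // s8 + s16 + s32 = 3 naturally-aligned stores; 3 <= 4, so no memset.
  std::printf("struct A estimate: %d store(s)\n", estimateStores(1, 8, 1, 4));
  return 0;
}
```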
```diff
@@ -324,6 +356,7 @@ class MemCpyOptLegacyPass : public FunctionPass {
     AU.addRequired<TargetLibraryInfoWrapperPass>();
     if (!EnableMemorySSA)
       AU.addRequired<MemoryDependenceWrapperPass>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
     AU.addPreserved<MemoryDependenceWrapperPass>();
     AU.addRequired<AAResultsWrapperPass>();
     AU.addPreserved<AAResultsWrapperPass>();
@@ -349,6 +382,7 @@ INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
 INITIALIZE_PASS_END(MemCpyOptLegacyPass, "memcpyopt", "MemCpy Optimization",
                     false, false)
 
@@ -527,7 +561,7 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
       if (Range.TheStores.size() == 1) continue;
 
       // If it is profitable to lower this range to memset, do so now.
-      if (!Range.isProfitableToUseMemset(DL))
+      if (!Range.isProfitableToUseMemset(DL, TTI, &StartInst->getContext()))
         continue;
 
       // Otherwise, we do want to transform this! Create a new memset.
@@ -1780,11 +1814,12 @@ PreservedAnalyses MemCpyOptPass::run(Function &F, FunctionAnalysisManager &AM) {
   auto *AA = &AM.getResult<AAManager>(F);
   auto *AC = &AM.getResult<AssumptionAnalysis>(F);
   auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+  auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
   auto *MSSA = EnableMemorySSA ? &AM.getResult<MemorySSAAnalysis>(F)
                                : AM.getCachedResult<MemorySSAAnalysis>(F);
 
   bool MadeChange =
-      runImpl(F, MD, &TLI, AA, AC, DT, MSSA ? &MSSA->getMSSA() : nullptr);
+      runImpl(F, MD, &TLI, AA, AC, DT, MSSA ? &MSSA->getMSSA() : nullptr, TTI);
   if (!MadeChange)
     return PreservedAnalyses::all();
 
@@ -1800,14 +1835,15 @@ PreservedAnalyses MemCpyOptPass::run(Function &F, FunctionAnalysisManager &AM) {
 bool MemCpyOptPass::runImpl(Function &F, MemoryDependenceResults *MD_,
                             TargetLibraryInfo *TLI_, AliasAnalysis *AA_,
                             AssumptionCache *AC_, DominatorTree *DT_,
-                            MemorySSA *MSSA_) {
+                            MemorySSA *MSSA_, TargetTransformInfo *TTI_) {
   bool MadeChange = false;
   MD = MD_;
   TLI = TLI_;
   AA = AA_;
   AC = AC_;
   DT = DT_;
   MSSA = MSSA_;
+  TTI = TTI_;
   MemorySSAUpdater MSSAU_(MSSA_);
   MSSAU = MSSA_ ? &MSSAU_ : nullptr;
   // If we don't have at least memset and memcpy, there is little point of doing
@@ -1841,10 +1877,11 @@ bool MemCpyOptLegacyPass::runOnFunction(Function &F) {
   auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
   auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
   auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
   auto *MSSAWP = EnableMemorySSA
                      ? &getAnalysis<MemorySSAWrapperPass>()
                      : getAnalysisIfAvailable<MemorySSAWrapperPass>();
 
   return Impl.runImpl(F, MDWP ? &MDWP->getMemDep() : nullptr, TLI, AA, AC, DT,
-                      MSSAWP ? &MSSAWP->getMSSA() : nullptr);
+                      MSSAWP ? &MSSAWP->getMSSA() : nullptr, TTI);
 }
```
Lines changed: 31 additions & 0 deletions

```diff
@@ -0,0 +1,31 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; Supported only on targets that don't have native misaligned access
+; REQUIRES: mips-registered-target
+; RUN: opt -mtriple mips < %s -memcpyopt -S -enable-memcpyopt-memoryssa=0 | FileCheck %s
+; RUN: opt -mtriple mips < %s -memcpyopt -S -enable-memcpyopt-memoryssa=1 -verify-memoryssa | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+define void @foo(i64* nocapture %P) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[P:%.*]] to i16*
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[ARRAYIDX]] to i32*
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 3
+; CHECK-NEXT:    store i16 0, i16* [[TMP0]], align 2
+; CHECK-NEXT:    store i32 0, i32* [[TMP1]], align 4
+; CHECK-NEXT:    store i16 0, i16* [[ARRAYIDX1]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = bitcast i64* %P to i16*
+  %arrayidx = getelementptr inbounds i16, i16* %0, i64 1
+  %1 = bitcast i16* %arrayidx to i32*
+  %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 3
+  store i16 0, i16* %0, align 2
+  store i32 0, i32* %1, align 4
+  store i16 0, i16* %arrayidx1, align 2
+  ret void
+}
```

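Why the CHECK lines above expect the stores to survive (a hedged reading, reusing the `estimateStores` sketch from earlier): MIPS does not report misaligned-access support, so the alignment-aware fallback runs. The best alignment known for the range is 4 bytes, from the align-4 i32 store at byte offset 2, and the estimate reproduces the original three stores exactly:

```cpp
// Continuing the estimateStores() sketch above (not pass code): the range is
// [0, 8) and its first byte sits 2 bytes off the 4-byte boundary implied by
// the i32 store (OffsetFromMaxAlign = 4 - 2 = 2), with i64 legal. The
// estimate comes out as s16 + s32 + s16 = 3 stores, and 3 <= 4, so the
// memset conversion is rejected and the stores are left intact.
int mipsEstimate = estimateStores(0, 8, 2, 8); // == 3
```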
llvm/test/Transforms/MemCpyOpt/profitable-memset.ll

Lines changed: 4 additions & 2 deletions

```diff
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=0 | FileCheck %s
-; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=1 -verify-memoryssa | FileCheck %s
+; Unsupported on targets that don't have misaligned access support
+; REQUIRES: aarch64-registered-target
+; RUN: opt -mtriple=aarch64 < %s -memcpyopt -S -enable-memcpyopt-memoryssa=0 | FileCheck %s
+; RUN: opt -mtriple=aarch64 < %s -memcpyopt -S -enable-memcpyopt-memoryssa=1 -verify-memoryssa | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 
```
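Conversely, AArch64 reports misaligned-access support, so the new fast path now decides this file's cases; the test therefore pins -mtriple=aarch64 and requires that backend to be built. A hedged check with the `misalignedStoreEstimate` sketch from the commit header, for an 8-byte range like the one in the new test:

```cpp
// 8 bytes with 8-byte GPRs (i64 is legal per the datalayout) lower to one
// wide store; 3 original stores > 1, so forming the memset stays profitable.
bool stillProfitable = 3 > misalignedStoreEstimate(8, 8); // true
```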