2727#include " llvm/Analysis/MemorySSA.h"
2828#include " llvm/Analysis/MemorySSAUpdater.h"
2929#include " llvm/Analysis/TargetLibraryInfo.h"
30+ #include " llvm/Analysis/TargetTransformInfo.h"
3031#include " llvm/Analysis/ValueTracking.h"
3132#include " llvm/IR/Argument.h"
3233#include " llvm/IR/BasicBlock.h"
@@ -111,12 +112,15 @@ struct MemsetRange {
111112 // / TheStores - The actual stores that make up this range.
112113 SmallVector<Instruction*, 16 > TheStores;
113114
114- bool isProfitableToUseMemset (const DataLayout &DL) const ;
115+ bool isProfitableToUseMemset (const DataLayout &DL, TargetTransformInfo *TTI,
116+ LLVMContext *Context) const ;
115117};
116118
117119} // end anonymous namespace
118120
119- bool MemsetRange::isProfitableToUseMemset (const DataLayout &DL) const {
121+ bool MemsetRange::isProfitableToUseMemset (const DataLayout &DL,
122+ TargetTransformInfo *TTI,
123+ LLVMContext *Context) const {
120124 // If the merged range will take more than 16 bytes, use
121125 // memset. This avoids the more expensive calculation of merged
122126 // stores.
@@ -135,47 +139,75 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
135139 // together if it wants to.
136140 if (TheStores.size () == 2 ) return false ;
137141
138- // Estimate the number of stores that will be used to implement a
139- // memset range after the DAG Combiner has merged naturally-aligned
140- // stores.
141- //
142- // This takes account of partial alignment information, which would
143- // be discarded by converting to a memset. For example:
144- // struct A {
145- // char a, b, c, d, e, f, g, h;
146- // int counter;
147- // } *Ap;
148- // Ap->b = Ap->c = Ap->d = Ap->e = Ap->f = Ap->g = Ap->h = 0;
149- //
150- // The overall structure alignment is 32-bits. Naively, we see 7
151- // single-byte stores, the first of which, b, is only known to be
152- // byte-aligned. However, since most architectures support 32-bit and
153- // 16-bit stores, these can be merged by DAGCombine into only 3
154- // naturally-aligned stores:
155- // store<(store (s8) into %ir.b...)> t0, Constant:i8<0>...
156- // store<(store (s16) into %ir.c), trunc to i16> t0, Constant:i32<0>...
157- // store<(store (s32) into %ir.e)> t0, Constant:i32<0>...
158-
159- int Offset = Start;
160- int OffsetFromMaxAlign = MaxAlignment - MaxAlignmentOffset;
161- int StoreCount = 0 ;
142+ // Since we don't have perfect knowledge here, make some assumptions: assume
143+ // the maximum GPR width is the same size as the largest legal integer
144+ // size. If so, check to see whether we will end up actually reducing the
145+ // number of stores used.
162146 unsigned MaxIntSize = DL.getLargestLegalIntTypeSizeInBits () / 8 ;
147+ if (MaxIntSize == 0 )
148+ MaxIntSize = 1 ;
149+
150+ bool AllowMisaligned = TTI->allowsMisalignedMemoryAccesses (
151+ *Context, MaxIntSize);
152+
153+ if (AllowMisaligned) {
154+ // Misaligned accesses are permitted. We can assume that inlining a
155+ // memset() call can be inlined to MaxIntSize'd stores, plus single-byte
156+ // stores, regardless of the alignment of the destination pointer.
163157
164- while (Offset < End) {
165- unsigned StoreSize = 1 ;
166- for (unsigned NextStoreSize = 2 ;
167- NextStoreSize <= MaxIntSize && End - Offset >= NextStoreSize;
168- NextStoreSize *= 2 ) {
169- uint64_t StoreAlign = (DL.getABIIntegerTypeAlignment (8 * NextStoreSize)
170- .value ());
171- if (OffsetFromMaxAlign % StoreAlign == 0 )
172- StoreSize = NextStoreSize;
158+ unsigned Bytes = unsigned (End-Start);
159+
160+ unsigned NumPointerStores = Bytes / MaxIntSize;
161+
162+ // Assume the remaining bytes if any are done a byte at a time.
163+ unsigned NumByteStores = Bytes % MaxIntSize;
164+
165+ // If we will reduce the # stores (according to this heuristic), do the
166+ // transformation. This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32
167+ // etc.
168+ return TheStores.size () > NumPointerStores+NumByteStores;
169+ } else {
170+ // Estimate the number of stores that would be used to implement the
171+ // stores in the range after the DAG Combiner has merged any
172+ // naturally-aligned stores.
173+ //
174+ // This takes account of partial alignment information, which would
175+ // be discarded by converting to a memset. For example:
176+ // struct A {
177+ // char a, b, c, d, e, f, g, h;
178+ // int counter;
179+ // } *Ap;
180+ // Ap->b = Ap->c = Ap->d = Ap->e = Ap->f = Ap->g = Ap->h = 0;
181+ //
182+ // The overall structure alignment is 32-bits. Naively, we see 7
183+ // single-byte stores, the first of which, b, is only known to be
184+ // byte-aligned. However, since most architectures support 32-bit and
185+ // 16-bit stores, these can be merged by DAGCombine into only 3
186+ // naturally-aligned stores:
187+ // store<(store (s8) into %ir.b...)> t0, Constant:i8<0>...
188+ // store<(store (s16) into %ir.c), trunc to i16> t0, Constant:i32<0>...
189+ // store<(store (s32) into %ir.e)> t0, Constant:i32<0>...
190+
191+ int Offset = Start;
192+ int OffsetFromMaxAlign = MaxAlignment - MaxAlignmentOffset;
193+ int StoreCount = 0 ;
194+
195+ while (Offset < End) {
196+ unsigned StoreSize = 1 ;
197+ for (unsigned NextStoreSize = 2 ;
198+ NextStoreSize <= MaxIntSize && End - Offset >= NextStoreSize;
199+ NextStoreSize *= 2 ) {
200+ uint64_t StoreAlign = (DL.getABIIntegerTypeAlignment (8 * NextStoreSize)
201+ .value ());
202+ if (OffsetFromMaxAlign % StoreAlign == 0 )
203+ StoreSize = NextStoreSize;
204+ }
205+ OffsetFromMaxAlign += StoreSize;
206+ Offset += StoreSize;
207+ StoreCount++;
173208 }
174- OffsetFromMaxAlign += StoreSize;
175- Offset += StoreSize;
176- StoreCount++;
209+ return StoreCount > 4 ;
177210 }
178- return StoreCount > 4 ;
179211}
180212
181213namespace {
@@ -324,6 +356,7 @@ class MemCpyOptLegacyPass : public FunctionPass {
324356 AU.addRequired <TargetLibraryInfoWrapperPass>();
325357 if (!EnableMemorySSA)
326358 AU.addRequired <MemoryDependenceWrapperPass>();
359+ AU.addRequired <TargetTransformInfoWrapperPass>();
327360 AU.addPreserved <MemoryDependenceWrapperPass>();
328361 AU.addRequired <AAResultsWrapperPass>();
329362 AU.addPreserved <AAResultsWrapperPass>();
@@ -349,6 +382,7 @@ INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
349382INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
350383INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
351384INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
385+ INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
352386INITIALIZE_PASS_END(MemCpyOptLegacyPass, " memcpyopt" , " MemCpy Optimization" ,
353387 false , false )
354388
@@ -527,7 +561,7 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
527561 if (Range.TheStores .size () == 1 ) continue ;
528562
529563 // If it is profitable to lower this range to memset, do so now.
530- if (!Range.isProfitableToUseMemset (DL))
564+ if (!Range.isProfitableToUseMemset (DL, TTI, &StartInst-> getContext () ))
531565 continue ;
532566
533567 // Otherwise, we do want to transform this! Create a new memset.
@@ -1780,11 +1814,12 @@ PreservedAnalyses MemCpyOptPass::run(Function &F, FunctionAnalysisManager &AM) {
17801814 auto *AA = &AM.getResult <AAManager>(F);
17811815 auto *AC = &AM.getResult <AssumptionAnalysis>(F);
17821816 auto *DT = &AM.getResult <DominatorTreeAnalysis>(F);
1817+ auto *TTI = &AM.getResult <TargetIRAnalysis>(F);
17831818 auto *MSSA = EnableMemorySSA ? &AM.getResult <MemorySSAAnalysis>(F)
17841819 : AM.getCachedResult <MemorySSAAnalysis>(F);
17851820
17861821 bool MadeChange =
1787- runImpl (F, MD, &TLI, AA, AC, DT, MSSA ? &MSSA->getMSSA () : nullptr );
1822+ runImpl (F, MD, &TLI, AA, AC, DT, MSSA ? &MSSA->getMSSA () : nullptr , TTI );
17881823 if (!MadeChange)
17891824 return PreservedAnalyses::all ();
17901825
@@ -1800,14 +1835,15 @@ PreservedAnalyses MemCpyOptPass::run(Function &F, FunctionAnalysisManager &AM) {
18001835bool MemCpyOptPass::runImpl (Function &F, MemoryDependenceResults *MD_,
18011836 TargetLibraryInfo *TLI_, AliasAnalysis *AA_,
18021837 AssumptionCache *AC_, DominatorTree *DT_,
1803- MemorySSA *MSSA_) {
1838+ MemorySSA *MSSA_, TargetTransformInfo *TTI_ ) {
18041839 bool MadeChange = false ;
18051840 MD = MD_;
18061841 TLI = TLI_;
18071842 AA = AA_;
18081843 AC = AC_;
18091844 DT = DT_;
18101845 MSSA = MSSA_;
1846+ TTI = TTI_;
18111847 MemorySSAUpdater MSSAU_ (MSSA_);
18121848 MSSAU = MSSA_ ? &MSSAU_ : nullptr ;
18131849 // If we don't have at least memset and memcpy, there is little point of doing
@@ -1841,10 +1877,11 @@ bool MemCpyOptLegacyPass::runOnFunction(Function &F) {
18411877 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults ();
18421878 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache (F);
18431879 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree ();
1880+ auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI (F);
18441881 auto *MSSAWP = EnableMemorySSA
18451882 ? &getAnalysis<MemorySSAWrapperPass>()
18461883 : getAnalysisIfAvailable<MemorySSAWrapperPass>();
18471884
18481885 return Impl.runImpl (F, MDWP ? & MDWP->getMemDep () : nullptr , TLI, AA, AC, DT,
1849- MSSAWP ? &MSSAWP->getMSSA () : nullptr );
1886+ MSSAWP ? &MSSAWP->getMSSA () : nullptr , TTI );
18501887}