Skip to content

Commit 24fb970

Browse files
Milica Lazarevic
authored and committed
Merge pull request #5 from MediaTek-Labs/nanomips-llvm13-memcopyopt-heuristics
Improve MemCopyOpt heuristics to account for combinable stores
1 parent 5e58157 commit 24fb970

File tree

1 file changed

+59
-22
lines changed

1 file changed

+59
-22
lines changed

llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp

Lines changed: 59 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,13 @@ struct MemsetRange {
9797
/// Alignment - The known alignment of the first store.
9898
MaybeAlign Alignment;
9999

100+
/// MaxAlignment - The maximum known alignment of any store in the range
101+
unsigned MaxAlignment;
102+
103+
/// MaxAlignmentOffset - The offset of the maximally-aligned store
104+
/// from the first
105+
unsigned MaxAlignmentOffset;
106+
100107
/// TheStores - The actual stores that make up this range.
101108
SmallVector<Instruction*, 16> TheStores;
102109

@@ -106,8 +113,10 @@ struct MemsetRange {
106113
} // end anonymous namespace
107114

108115
bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
109-
// If we found more than 4 stores to merge or 16 bytes, use memset.
110-
if (TheStores.size() >= 4 || End-Start >= 16) return true;
116+
// If the merged range will take more than 16 bytes, use
117+
// memset. This avoids the more expensive calculation of merged
118+
// stores.
119+
if (End-Start >= 16) return true;
111120

112121
// If there is nothing to merge, don't do anything.
113122
if (TheStores.size() < 2) return false;
@@ -122,29 +131,47 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
122131
// together if it wants to.
123132
if (TheStores.size() == 2) return false;
124133

125-
// If we have fewer than 8 stores, it can still be worthwhile to do this.
126-
// For example, merging 4 i8 stores into an i32 store is useful almost always.
127-
// However, merging 2 32-bit stores isn't useful on a 32-bit architecture (the
128-
// memset will be split into 2 32-bit stores anyway) and doing so can
129-
// pessimize the llvm optimizer.
134+
// Estimate the number of stores that will be used to implement a
135+
// memset range after the DAG Combiner has merged naturally-aligned
136+
// stores.
130137
//
131-
// Since we don't have perfect knowledge here, make some assumptions: assume
132-
// the maximum GPR width is the same size as the largest legal integer
133-
// size. If so, check to see whether we will end up actually reducing the
134-
// number of stores used.
135-
unsigned Bytes = unsigned(End-Start);
138+
// This takes account of partial alignment information, which would
139+
// be discarded by converting to a memset. For example:
140+
// struct A {
141+
// char a, b, c, d, e, f, g, h;
142+
// int counter;
143+
// } *Ap;
144+
// Ap->b = Ap->c = Ap->d = Ap->e = Ap->f = Ap->g = Ap->h = 0;
145+
//
146+
// The overall structure alignment is 32-bits. Naively, we see 7
147+
// single-byte stores, the first of which, b, is only known to be
148+
// byte-aligned. However, since most architectures support 32-bit and
149+
// 16-bit stores, these can be merged by DAGCombine into only 3
150+
// naturally-aligned stores:
151+
// store<(store (s8) into %ir.b...)> t0, Constant:i8<0>...
152+
// store<(store (s16) into %ir.c), trunc to i16> t0, Constant:i32<0>...
153+
// store<(store (s32) into %ir.e)> t0, Constant:i32<0>...
154+
155+
int Offset = Start;
156+
int OffsetFromMaxAlign = MaxAlignment - MaxAlignmentOffset;
157+
int StoreCount = 0;
136158
unsigned MaxIntSize = DL.getLargestLegalIntTypeSizeInBits() / 8;
137-
if (MaxIntSize == 0)
138-
MaxIntSize = 1;
139-
unsigned NumPointerStores = Bytes / MaxIntSize;
140-
141-
// Assume the remaining bytes if any are done a byte at a time.
142-
unsigned NumByteStores = Bytes % MaxIntSize;
143159

144-
// If we will reduce the # stores (according to this heuristic), do the
145-
// transformation. This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32
146-
// etc.
147-
return TheStores.size() > NumPointerStores+NumByteStores;
160+
while (Offset < End) {
161+
unsigned StoreSize = 1;
162+
for (unsigned NextStoreSize = 2;
163+
NextStoreSize <= MaxIntSize && End - Offset >= NextStoreSize;
164+
NextStoreSize *= 2) {
165+
uint64_t StoreAlign = (DL.getABIIntegerTypeAlignment(8 * NextStoreSize)
166+
.value());
167+
if (OffsetFromMaxAlign % StoreAlign == 0)
168+
StoreSize = NextStoreSize;
169+
}
170+
OffsetFromMaxAlign += StoreSize;
171+
Offset += StoreSize;
172+
StoreCount++;
173+
}
174+
return StoreCount > 4;
148175
}
149176

150177
namespace {
@@ -210,6 +237,8 @@ void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
210237
R.End = End;
211238
R.StartPtr = Ptr;
212239
R.Alignment = Alignment;
240+
R.MaxAlignment = Alignment.valueOrOne().value();
241+
R.MaxAlignmentOffset = 0;
213242
R.TheStores.push_back(Inst);
214243
return;
215244
}
@@ -232,6 +261,14 @@ void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
232261
I->Start = Start;
233262
I->StartPtr = Ptr;
234263
I->Alignment = Alignment;
264+
I->MaxAlignmentOffset = (I->MaxAlignmentOffset + Size) % I->MaxAlignment;
265+
}
266+
267+
// Does this store provide a better alignment than we have
268+
// previously seen for this range?
269+
if (Alignment > I->MaxAlignment) {
270+
I->MaxAlignment = Alignment.valueOrOne().value();
271+
I->MaxAlignmentOffset = Start - I->Start;
235272
}
236273

237274
// Now we know that Start <= I->End and Start >= I->Start (so the startpoint

0 commit comments

Comments
 (0)