@@ -97,6 +97,13 @@ struct MemsetRange {
9797 // / Alignment - The known alignment of the first store.
9898 MaybeAlign Alignment;
9999
100+ /// MaxAlignment - The maximum known alignment, in bytes, of any store
100+ /// in the range.
101+ unsigned MaxAlignment;
102+
103+ /// MaxAlignmentOffset - The byte offset of the maximally-aligned store
104+ /// from the first store in the range. May be >= MaxAlignment, so reduce
104+ /// it modulo MaxAlignment before doing alignment arithmetic with it.
105+ unsigned MaxAlignmentOffset;
106+
100107 // / TheStores - The actual stores that make up this range.
101108 SmallVector<Instruction*, 16 > TheStores;
102109
@@ -106,8 +113,10 @@ struct MemsetRange {
106113} // end anonymous namespace
107114
108115bool MemsetRange::isProfitableToUseMemset (const DataLayout &DL) const {
109- // If we found more than 4 stores to merge or 16 bytes, use memset.
110- if (TheStores.size () >= 4 || End-Start >= 16 ) return true ;
116+ // If the merged range will take more than 16 bytes, use
117+ // memset. This avoids the more expensive calculation of merged
118+ // stores.
119+ if (End-Start >= 16 ) return true ;
111120
112121 // If there is nothing to merge, don't do anything.
113122 if (TheStores.size () < 2 ) return false ;
@@ -122,29 +131,47 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
122131 // together if it wants to.
123132 if (TheStores.size () == 2 ) return false ;
124133
125- // If we have fewer than 8 stores, it can still be worthwhile to do this.
126- // For example, merging 4 i8 stores into an i32 store is useful almost always.
127- // However, merging 2 32-bit stores isn't useful on a 32-bit architecture (the
128- // memset will be split into 2 32-bit stores anyway) and doing so can
129- // pessimize the llvm optimizer.
134+ // Estimate the number of stores that will be used to implement a
135+ // memset range after the DAG Combiner has merged naturally-aligned
136+ // stores.
130137 //
131- // Since we don't have perfect knowledge here, make some assumptions: assume
132- // the maximum GPR width is the same size as the largest legal integer
133- // size. If so, check to see whether we will end up actually reducing the
134- // number of stores used.
135- unsigned Bytes = unsigned (End-Start);
138+ // This takes account of partial alignment information, which would
139+ // be discarded by converting to a memset. For example:
140+ // struct A {
141+ // char a, b, c, d, e, f, g, h;
142+ // int counter;
143+ // } *Ap;
144+ // Ap->b = Ap->c = Ap->d = Ap->e = Ap->f = Ap->g = Ap->h = 0;
145+ //
146+ // The overall structure alignment is 32-bits. Naively, we see 7
147+ // single-byte stores, the first of which, b, is only known to be
148+ // byte-aligned. However, since most architectures support 32-bit and
149+ // 16-bit stores, these can be merged by DAGCombine into only 3
150+ // naturally-aligned stores:
151+ // store<(store (s8) into %ir.b...)> t0, Constant:i8<0>...
152+ // store<(store (s16) into %ir.c), trunc to i16> t0, Constant:i32<0>...
153+ // store<(store (s32) into %ir.e)> t0, Constant:i32<0>...
154+
155+ int Offset = Start;
156+ int OffsetFromMaxAlign = MaxAlignment - MaxAlignmentOffset;
157+ int StoreCount = 0 ;
136158 unsigned MaxIntSize = DL.getLargestLegalIntTypeSizeInBits () / 8 ;
137- if (MaxIntSize == 0 )
138- MaxIntSize = 1 ;
139- unsigned NumPointerStores = Bytes / MaxIntSize;
140-
141- // Assume the remaining bytes if any are done a byte at a time.
142- unsigned NumByteStores = Bytes % MaxIntSize;
143159
144- // If we will reduce the # stores (according to this heuristic), do the
145- // transformation. This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32
146- // etc.
147- return TheStores.size () > NumPointerStores+NumByteStores;
160+ while (Offset < End) {
161+ unsigned StoreSize = 1 ;
162+ for (unsigned NextStoreSize = 2 ;
163+ NextStoreSize <= MaxIntSize && End - Offset >= NextStoreSize;
164+ NextStoreSize *= 2 ) {
165+ uint64_t StoreAlign = (DL.getABIIntegerTypeAlignment (8 * NextStoreSize)
166+ .value ());
167+ if (OffsetFromMaxAlign % StoreAlign == 0 )
168+ StoreSize = NextStoreSize;
169+ }
170+ OffsetFromMaxAlign += StoreSize;
171+ Offset += StoreSize;
172+ StoreCount++;
173+ }
174+ return StoreCount > 4 ;
148175}
149176
150177namespace {
@@ -210,6 +237,8 @@ void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
210237 R.End = End;
211238 R.StartPtr = Ptr;
212239 R.Alignment = Alignment;
240+ R.MaxAlignment = Alignment.valueOrOne ().value ();
241+ R.MaxAlignmentOffset = 0 ;
213242 R.TheStores .push_back (Inst);
214243 return ;
215244 }
@@ -232,6 +261,14 @@ void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
232261 I->Start = Start;
233262 I->StartPtr = Ptr;
234263 I->Alignment = Alignment;
264+ // NOTE(review): shifting by Size is only correct when the new store
264+ // ends exactly at the old range start (End == old I->Start); if an
264+ // overlapping store can reach this path, the shift must instead be
264+ // (OldStart - Start), captured before I->Start is overwritten above.
264+ I->MaxAlignmentOffset = (I->MaxAlignmentOffset + Size) % I->MaxAlignment ;
265+ }
266+
267+ // Does this store provide a better alignment than we have
268+ // previously seen for this range?
269+ // Compare raw byte values: MaybeAlign does not provide ordering
269+ // comparisons against integers.
269+ if (Alignment.valueOrOne ().value () > I->MaxAlignment ) {
270+ I->MaxAlignment = Alignment.valueOrOne ().value ();
271+ I->MaxAlignmentOffset = Start - I->Start ;
235272 }
236273
237274 // Now we know that Start <= I->End and Start >= I->Start (so the startpoint
0 commit comments