
Commit 00d3b39

[AggressiveInstCombine] Implement store merge optimization (#147540)
Merge multiple small stores that were originally extracted from one value into a single store. This is the store counterpart of the load-merge optimization that AggressiveInstCombine already performs. The implementation is intentionally minimal (an MVP), with various generalizations possible. Fixes #147456.
1 parent 7355ea3 commit 00d3b39
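
For illustration, this is the kind of rewrite the fold enables on a little-endian target (a hand-written sketch, not taken from this page's tests; it assumes the target reports an i32 store as legal and fast even when misaligned):

```llvm
; Before: four i8 parts of %x stored to consecutive bytes of %p.
define void @store_parts(i32 %x, ptr %p) {
  %x.0 = trunc i32 %x to i8
  store i8 %x.0, ptr %p
  %shr.1 = lshr i32 %x, 8
  %x.1 = trunc i32 %shr.1 to i8
  %gep.1 = getelementptr i8, ptr %p, i64 1
  store i8 %x.1, ptr %gep.1
  %shr.2 = lshr i32 %x, 16
  %x.2 = trunc i32 %shr.2 to i8
  %gep.2 = getelementptr i8, ptr %p, i64 2
  store i8 %x.2, ptr %gep.2
  %shr.3 = lshr i32 %x, 24
  %x.3 = trunc i32 %shr.3 to i8
  %gep.3 = getelementptr i8, ptr %p, i64 3
  store i8 %x.3, ptr %gep.3
  ret void
}

; After the merge, the body should reduce to a single store carrying the
; alignment of the first part:
;   store i32 %x, ptr %p, align 1
;   ret void
```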

File tree

3 files changed: +1053 −0 lines changed

llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp

Lines changed: 135 additions & 0 deletions
@@ -842,6 +842,138 @@ static bool foldConsecutiveLoads(Instruction &I, const DataLayout &DL,
```cpp
  return true;
}

/// ValWidth bits starting at ValOffset of Val stored at PtrBase+PtrOffset.
struct PartStore {
  Value *PtrBase;
  APInt PtrOffset;
  Value *Val;
  uint64_t ValOffset;
  uint64_t ValWidth;
  StoreInst *Store;

  bool isCompatibleWith(const PartStore &Other) const {
    return PtrBase == Other.PtrBase && Val == Other.Val;
  }

  bool operator<(const PartStore &Other) const {
    return PtrOffset.slt(Other.PtrOffset);
  }
};

static std::optional<PartStore> matchPartStore(Instruction &I,
                                               const DataLayout &DL) {
  auto *Store = dyn_cast<StoreInst>(&I);
  if (!Store || !Store->isSimple())
    return std::nullopt;

  Value *StoredVal = Store->getValueOperand();
  Type *StoredTy = StoredVal->getType();
  if (!StoredTy->isIntegerTy() || !DL.typeSizeEqualsStoreSize(StoredTy))
    return std::nullopt;

  uint64_t ValWidth = StoredTy->getPrimitiveSizeInBits();
  uint64_t ValOffset = 0;
  Value *Val;
  if (!match(StoredVal, m_CombineOr(m_Trunc(m_LShr(m_Value(Val),
                                                   m_ConstantInt(ValOffset))),
                                    m_Trunc(m_Value(Val)))))
    return std::nullopt;

  Value *Ptr = Store->getPointerOperand();
  APInt PtrOffset(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
  Value *PtrBase = Ptr->stripAndAccumulateConstantOffsets(
      DL, PtrOffset, /*AllowNonInbounds=*/true);
  return {{PtrBase, PtrOffset, Val, ValOffset, ValWidth, Store}};
}
```
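
The matcher accepts exactly two shapes of stored value, `trunc(lshr(%val, C))` and a plain `trunc(%val)`, and folds constant GEP offsets into PtrOffset. An illustrative decomposition (the function name and value names are made up; the field values follow directly from the code above):

```llvm
define void @two_parts(i32 %x, ptr %p) {
  ; PartStore { Val = %x, ValOffset = 0, ValWidth = 8, PtrBase = %p, PtrOffset = 0 }
  %b0 = trunc i32 %x to i8
  store i8 %b0, ptr %p
  ; PartStore { Val = %x, ValOffset = 8, ValWidth = 8, PtrBase = %p, PtrOffset = 1 }
  %shr = lshr i32 %x, 8
  %b1 = trunc i32 %shr to i8
  %gep = getelementptr i8, ptr %p, i64 1
  store i8 %b1, ptr %gep
  ret void
}
```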
```cpp
static bool mergePartStores(SmallVectorImpl<PartStore> &Parts,
                            const DataLayout &DL, TargetTransformInfo &TTI) {
  if (Parts.size() < 2)
    return false;

  // We now have multiple parts of the same value stored to the same pointer.
  // Sort the parts by pointer offset, and make sure they are consistent with
  // the value offsets. Also check that the value is fully covered without
  // overlaps.
  // FIXME: We could support merging stores for only part of the value here.
  llvm::sort(Parts);
  int64_t LastEndOffsetFromFirst = 0;
  const PartStore &First = Parts[0];
  for (const PartStore &Part : Parts) {
    APInt PtrOffsetFromFirst = Part.PtrOffset - First.PtrOffset;
    int64_t ValOffsetFromFirst = Part.ValOffset - First.ValOffset;
    if (PtrOffsetFromFirst * 8 != ValOffsetFromFirst ||
        LastEndOffsetFromFirst != ValOffsetFromFirst)
      return false;
    LastEndOffsetFromFirst = ValOffsetFromFirst + Part.ValWidth;
  }

  // Check whether combining the stores is profitable.
  // FIXME: We could generate smaller stores if we can't produce a large one.
  LLVMContext &Ctx = First.Store->getContext();
  Type *NewTy = Type::getIntNTy(Ctx, LastEndOffsetFromFirst);
  unsigned Fast = 0;
  if (!TTI.isTypeLegal(NewTy) ||
      !TTI.allowsMisalignedMemoryAccesses(Ctx, LastEndOffsetFromFirst,
                                          First.Store->getPointerAddressSpace(),
                                          First.Store->getAlign(), &Fast) ||
      !Fast)
    return false;

  // Generate the combined store.
  IRBuilder<> Builder(First.Store);
  Value *Val = First.Val;
  if (First.ValOffset != 0)
    Val = Builder.CreateLShr(Val, First.ValOffset);
  Val = Builder.CreateTrunc(Val, NewTy);
  StoreInst *Store = Builder.CreateAlignedStore(
      Val, First.Store->getPointerOperand(), First.Store->getAlign());

  AAMDNodes AATags = First.Store->getAAMetadata();
  for (const PartStore &Part : drop_begin(Parts))
    AATags = AATags.concat(Part.Store->getAAMetadata());
  Store->setAAMetadata(AATags);

  // Remove the old stores.
  for (const PartStore &Part : Parts)
    Part.Store->eraseFromParent();

  return true;
}
```
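
Note that the consistency loop only requires the parts to be contiguous relative to the first one, and the generated-code path re-emits a shift when `First.ValOffset` is non-zero. So, as far as this code suggests, two parts covering bits 16..31 of an i32 could merge on their own. A sketch of the expected output for that case (assuming i16 is legal and misaligned i16 stores are reported fast; names are illustrative):

```llvm
; Parts at %p+2 (bits 16..23) and %p+3 (bits 24..31) of %x would become:
define void @merged_high_half(i32 %x, ptr %p) {
  %shr = lshr i32 %x, 16
  %t = trunc i32 %shr to i16
  %gep = getelementptr i8, ptr %p, i64 2
  store i16 %t, ptr %gep, align 1
  ret void
}
```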
```cpp
static bool foldConsecutiveStores(BasicBlock &BB, const DataLayout &DL,
                                  TargetTransformInfo &TTI, AliasAnalysis &AA) {
  // FIXME: Add big endian support.
  if (DL.isBigEndian())
    return false;

  SmallVector<PartStore, 8> Parts;
  bool MadeChange = false;
  for (Instruction &I : make_early_inc_range(BB)) {
    if (std::optional<PartStore> Part = matchPartStore(I, DL)) {
      if (Parts.empty() || Part->isCompatibleWith(Parts[0])) {
        Parts.push_back(std::move(*Part));
        continue;
      }

      MadeChange |= mergePartStores(Parts, DL, TTI);
      Parts.clear();
      Parts.push_back(std::move(*Part));
      continue;
    }

    // FIXME: Use AA to make this more precise.
    if (I.mayReadOrWriteMemory() || I.mayThrow()) {
      MadeChange |= mergePartStores(Parts, DL, TTI);
      Parts.clear();
      continue;
    }
  }

  MadeChange |= mergePartStores(Parts, DL, TTI);
  return MadeChange;
}

/// Combine away instructions providing they are still equivalent when compared
/// against 0. i.e do they have any bits set.
static Value *optimizeShiftInOrChain(Value *V, IRBuilder<> &Builder) {
```
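
The driver keeps one running group of compatible parts per basic block and conservatively flushes it at any instruction that may read or write memory or may throw (the FIXME notes AA could make this more precise). A hypothetical example of an intervening load splitting a group:

```llvm
define i8 @split_by_load(i32 %x, ptr %p, ptr %q) {
  ; First group: bytes 0 and 1 of %x, eligible to merge with each other.
  %x.0 = trunc i32 %x to i8
  store i8 %x.0, ptr %p
  %shr.1 = lshr i32 %x, 8
  %x.1 = trunc i32 %shr.1 to i8
  %gep.1 = getelementptr i8, ptr %p, i64 1
  store i8 %x.1, ptr %gep.1
  ; This load may access memory, so the pending group is merged (if
  ; profitable) and cleared before scanning continues.
  %v = load i8, ptr %q
  ; The remaining part starts a fresh group and stays a separate store.
  %shr.2 = lshr i32 %x, 16
  %x.2 = trunc i32 %shr.2 to i8
  %gep.2 = getelementptr i8, ptr %p, i64 2
  store i8 %x.2, ptr %gep.2
  ret i8 %v
}
```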
@@ -1330,6 +1462,9 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT,
```cpp
      // bugs.
      MadeChange |= foldLibCalls(I, TTI, TLI, AC, DT, DL, MadeCFGChange);
    }

    // Do this separately to avoid redundantly scanning stores multiple times.
    MadeChange |= foldConsecutiveStores(BB, DL, TTI, AA);
  }

  // We're done with transforms, so remove dead instructions.
```
Lines changed: 106 additions & 0 deletions
@@ -0,0 +1,106 @@
```llvm
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -S -passes=aggressive-instcombine -mtriple=x86_64-unknown-linux-gnu -data-layout="E-n64" < %s | FileCheck %s

; Pretend X86 is big endian.

; FIXME: Big endian not supported yet.

define void @test_i32_be(i32 %x, ptr %p) {
; CHECK-LABEL: define void @test_i32_be(
; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) {
; CHECK-NEXT:    [[X_0:%.*]] = trunc i32 [[X]] to i8
; CHECK-NEXT:    [[GEP_0:%.*]] = getelementptr i8, ptr [[P]], i64 3
; CHECK-NEXT:    store i8 [[X_0]], ptr [[GEP_0]], align 1
; CHECK-NEXT:    [[SHR_1:%.*]] = lshr i32 [[X]], 8
; CHECK-NEXT:    [[X_1:%.*]] = trunc i32 [[SHR_1]] to i8
; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 2
; CHECK-NEXT:    store i8 [[X_1]], ptr [[GEP_1]], align 1
; CHECK-NEXT:    [[SHR_2:%.*]] = lshr i32 [[X]], 16
; CHECK-NEXT:    [[X_2:%.*]] = trunc i32 [[SHR_2]] to i8
; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr i8, ptr [[P]], i64 1
; CHECK-NEXT:    store i8 [[X_2]], ptr [[GEP_2]], align 1
; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 [[X]], 24
; CHECK-NEXT:    [[X_3:%.*]] = trunc i32 [[TMP1]] to i8
; CHECK-NEXT:    store i8 [[X_3]], ptr [[P]], align 1
; CHECK-NEXT:    ret void
;
  %x.0 = trunc i32 %x to i8
  %gep.0 = getelementptr i8, ptr %p, i64 3
  store i8 %x.0, ptr %gep.0
  %shr.1 = lshr i32 %x, 8
  %x.1 = trunc i32 %shr.1 to i8
  %gep.1 = getelementptr i8, ptr %p, i64 2
  store i8 %x.1, ptr %gep.1
  %shr.2 = lshr i32 %x, 16
  %x.2 = trunc i32 %shr.2 to i8
  %gep.2 = getelementptr i8, ptr %p, i64 1
  store i8 %x.2, ptr %gep.2
  %shr.3 = lshr i32 %x, 24
  %x.3 = trunc i32 %shr.3 to i8
  store i8 %x.3, ptr %p
  ret void
}

define void @test_i32_le(i32 %x, ptr %p) {
; CHECK-LABEL: define void @test_i32_le(
; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) {
; CHECK-NEXT:    [[X_0:%.*]] = trunc i32 [[X]] to i8
; CHECK-NEXT:    store i8 [[X_0]], ptr [[P]], align 1
; CHECK-NEXT:    [[SHR_1:%.*]] = lshr i32 [[X]], 8
; CHECK-NEXT:    [[X_1:%.*]] = trunc i32 [[SHR_1]] to i8
; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 1
; CHECK-NEXT:    store i8 [[X_1]], ptr [[GEP_1]], align 1
; CHECK-NEXT:    [[SHR_2:%.*]] = lshr i32 [[X]], 16
; CHECK-NEXT:    [[X_2:%.*]] = trunc i32 [[SHR_2]] to i8
; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr i8, ptr [[P]], i64 2
; CHECK-NEXT:    store i8 [[X_2]], ptr [[GEP_2]], align 1
; CHECK-NEXT:    [[SHR_3:%.*]] = lshr i32 [[X]], 24
; CHECK-NEXT:    [[X_3:%.*]] = trunc i32 [[SHR_3]] to i8
; CHECK-NEXT:    [[GEP_3:%.*]] = getelementptr i8, ptr [[P]], i64 3
; CHECK-NEXT:    store i8 [[X_3]], ptr [[GEP_3]], align 1
; CHECK-NEXT:    ret void
;
  %x.0 = trunc i32 %x to i8
  store i8 %x.0, ptr %p
  %shr.1 = lshr i32 %x, 8
  %x.1 = trunc i32 %shr.1 to i8
  %gep.1 = getelementptr i8, ptr %p, i64 1
  store i8 %x.1, ptr %gep.1
  %shr.2 = lshr i32 %x, 16
  %x.2 = trunc i32 %shr.2 to i8
  %gep.2 = getelementptr i8, ptr %p, i64 2
  store i8 %x.2, ptr %gep.2
  %shr.3 = lshr i32 %x, 24
  %x.3 = trunc i32 %shr.3 to i8
  %gep.3 = getelementptr i8, ptr %p, i64 3
  store i8 %x.3, ptr %gep.3
  ret void
}

define void @test_i32_mixed_parts(i32 %x, ptr %p) {
; CHECK-LABEL: define void @test_i32_mixed_parts(
; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) {
; CHECK-NEXT:    [[X_0:%.*]] = trunc i32 [[X]] to i8
; CHECK-NEXT:    [[GEP_0:%.*]] = getelementptr i8, ptr [[P]], i64 3
; CHECK-NEXT:    store i8 [[X_0]], ptr [[GEP_0]], align 1
; CHECK-NEXT:    [[SHR_1:%.*]] = lshr i32 [[X]], 8
; CHECK-NEXT:    [[X_1:%.*]] = trunc i32 [[SHR_1]] to i16
; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 1
; CHECK-NEXT:    store i16 [[X_1]], ptr [[GEP_1]], align 2
; CHECK-NEXT:    [[SHR_3:%.*]] = lshr i32 [[X]], 24
; CHECK-NEXT:    [[X_3:%.*]] = trunc i32 [[SHR_3]] to i8
; CHECK-NEXT:    store i8 [[X_3]], ptr [[P]], align 1
; CHECK-NEXT:    ret void
;
  %x.0 = trunc i32 %x to i8
  %gep.0 = getelementptr i8, ptr %p, i64 3
  store i8 %x.0, ptr %gep.0
  %shr.1 = lshr i32 %x, 8
  %x.1 = trunc i32 %shr.1 to i16
  %gep.1 = getelementptr i8, ptr %p, i64 1
  store i16 %x.1, ptr %gep.1
  %shr.3 = lshr i32 %x, 24
  %x.3 = trunc i32 %shr.3 to i8
  store i8 %x.3, ptr %p
  ret void
}
```
