[InstCombine] Forward memcpy source to load instruction#140249
[InstCombine] Forward memcpy source to load instruction#140249
Conversation
Reducing the overhead caused by `make_range(++Load->getReverseIterator(), ScanBB->rend())`.
|
@llvm/pr-subscribers-llvm-transforms Author: dianqk (dianqk) ChangesFixes #137810. Compile-time impact: https://llvm-compile-time-tracker.com/compare.php?from=059b0c2efbf30d986d812c4d2cf6d6c7876569fe&to=bb5edb394bb8983c5d3eacbaa3c3491504dd549b&stat=instructions%3Au. Full diff: https://github.com/llvm/llvm-project/pull/140249.diff 6 Files Affected:
diff --git a/llvm/include/llvm/Analysis/Loads.h b/llvm/include/llvm/Analysis/Loads.h
index 639070c07897b..94d761379a9c5 100644
--- a/llvm/include/llvm/Analysis/Loads.h
+++ b/llvm/include/llvm/Analysis/Loads.h
@@ -153,9 +153,10 @@ Value *FindAvailableLoadedValue(LoadInst *Load, BasicBlock *ScanBB,
/// This overload provides a more efficient implementation of
/// FindAvailableLoadedValue() for the case where we are not interested in
/// finding the closest clobbering instruction if no available load is found.
-/// This overload cannot be used to scan across multiple blocks.
+/// This overload cannot be used to scan across multiple blocks. If a memcpy is
+/// returned, it indicates that we can load from its source.
Value *FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
- bool *IsLoadCSE,
+ bool *IsLoadCSE, int64_t &Offset,
unsigned MaxInstsToScan = DefMaxInstsToScan);
/// Scan backwards to see if we have the value of the given pointer available
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index 425f3682122cd..f766331dab2f1 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -713,8 +713,31 @@ Value *llvm::findAvailablePtrLoadStore(
return nullptr;
}
+static Value *availableMemCpySrc(LoadInst *LI, MemCpyInst *MemCpy,
+ int64_t &Offset) {
+ if (!LI->isSimple() || MemCpy->isVolatile())
+ return nullptr;
+ const DataLayout &DL = LI->getDataLayout();
+  uint64_t Size = DL.getTypeStoreSize(LI->getType()).getKnownMinValue();
+ if (Size == 0)
+ return nullptr;
+ Value *OldSrc = LI->getPointerOperand();
+
+ if (OldSrc != MemCpy->getDest()) {
+ std::optional<int64_t> PointerOffset =
+ OldSrc->getPointerOffsetFrom(MemCpy->getDest(), DL);
+ if (!PointerOffset || *PointerOffset < 0)
+ return nullptr;
+ Offset = *PointerOffset;
+ }
+ auto *CopyLen = dyn_cast<ConstantInt>(MemCpy->getLength());
+ if (!CopyLen || CopyLen->getZExtValue() < Size + Offset)
+ return nullptr;
+ return MemCpy;
+}
+
Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
- bool *IsLoadCSE,
+ bool *IsLoadCSE, int64_t &Offset,
unsigned MaxInstsToScan) {
const DataLayout &DL = Load->getDataLayout();
Value *StrippedPtr = Load->getPointerOperand()->stripPointerCasts();
@@ -739,6 +762,9 @@ Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
Available = getAvailableLoadStore(&Inst, StrippedPtr, AccessTy,
AtLeastAtomic, DL, IsLoadCSE);
+ if (auto *MemCpy = dyn_cast<MemCpyInst>(&Inst))
+ Available = availableMemCpySrc(Load, MemCpy, Offset);
+
if (Available)
break;
@@ -753,6 +779,12 @@ Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
for (Instruction *Inst : MustNotAliasInsts)
if (isModSet(AA.getModRefInfo(Inst, Loc)))
return nullptr;
+ if (auto *MemCpy = dyn_cast<MemCpyInst>(Available)) {
+ MemoryLocation Loc = MemoryLocation::getForSource(MemCpy);
+ for (Instruction *Inst : MustNotAliasInsts)
+ if (isModSet(AA.getModRefInfo(Inst, Loc)))
+ return nullptr;
+ }
}
return Available;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index c29cba6f675c5..cf0ebc9fd043f 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -1053,13 +1053,45 @@ Instruction *InstCombinerImpl::visitLoadInst(LoadInst &LI) {
// separated by a few arithmetic operations.
bool IsLoadCSE = false;
BatchAAResults BatchAA(*AA);
- if (Value *AvailableVal = FindAvailableLoadedValue(&LI, BatchAA, &IsLoadCSE)) {
+ int64_t Offset = 0;
+ if (Value *AvailableVal =
+ FindAvailableLoadedValue(&LI, BatchAA, &IsLoadCSE, Offset)) {
if (IsLoadCSE)
combineMetadataForCSE(cast<LoadInst>(AvailableVal), &LI, false);
- return replaceInstUsesWith(
- LI, Builder.CreateBitOrPointerCast(AvailableVal, LI.getType(),
- LI.getName() + ".cast"));
+    // Perform simplification of loads. If we have a memcpy A which copies X to
+    // Y, and a load instruction B which loads from Y, then we can rewrite B to
+    // be a load instruction that loads from X. This allows later passes to
+    // remove the memcpy A or identify the source of the load instruction.
+ if (auto *MemCpy = dyn_cast<MemCpyInst>(AvailableVal)) {
+ Value *NewSrc = MemCpy->getSource();
+ Value *OldSrc = LI.getPointerOperand();
+ MaybeAlign NewAlign = MemCpy->getSourceAlign();
+ if (Offset != 0) {
+ if (NewAlign.has_value())
+ NewAlign = commonAlignment(*NewAlign, Offset);
+ // Avoid increasing instructions
+ if (isa<Instruction>(OldSrc) && OldSrc->hasOneUse())
+ NewSrc =
+ Builder.CreateInBoundsPtrAdd(NewSrc, Builder.getInt64(Offset));
+ else
+ NewSrc = nullptr;
+ }
+ // Avoid infinite loops
+ if (NewSrc && !BatchAA.isMustAlias(OldSrc, NewSrc))
+ AvailableVal = Builder.CreateAlignedLoad(LI.getType(), NewSrc, NewAlign,
+ LI.getName());
+ else {
+ AvailableVal = nullptr;
+ if (NewSrc && NewSrc->use_empty())
+ cast<Instruction>(NewSrc)->eraseFromParent();
+ }
+ } else
+ AvailableVal = Builder.CreateBitOrPointerCast(AvailableVal, LI.getType(),
+ LI.getName() + ".cast");
+
+ if (AvailableVal)
+ return replaceInstUsesWith(LI, AvailableVal);
}
// None of the following transforms are legal for volatile/ordered atomic
diff --git a/llvm/test/Transforms/InstCombine/memcpy-forward-load.ll b/llvm/test/Transforms/InstCombine/memcpy-forward-load.ll
new file mode 100644
index 0000000000000..7a56bb50b0903
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/memcpy-forward-load.ll
@@ -0,0 +1,169 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+define i24 @forward_load(ptr align 4 %src) {
+; CHECK-LABEL: define i24 @forward_load(
+; CHECK-SAME: ptr align 4 [[SRC:%.*]]) {
+; CHECK-NEXT: [[VAL1:%.*]] = load i24, ptr [[SRC]], align 4
+; CHECK-NEXT: ret i24 [[VAL1]]
+;
+ %dest = alloca [3 x i8]
+ call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false)
+ %val = load i24, ptr %dest
+ ret i24 %val
+}
+
+define i8 @forward_load_gep(ptr %src) {
+; CHECK-LABEL: define i8 @forward_load_gep(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC]], i64 2
+; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr [[GEP]], align 1
+; CHECK-NEXT: ret i8 [[VAL]]
+;
+ %dest = alloca [3 x i8]
+ call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false)
+ %gep = getelementptr inbounds i8, ptr %dest, i64 2
+ %val = load i8, ptr %gep
+ ret i8 %val
+}
+
+define i17 @forward_load_padding(ptr %src) {
+; CHECK-LABEL: define i17 @forward_load_padding(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[VAL:%.*]] = load i17, ptr [[SRC]], align 1
+; CHECK-NEXT: ret i17 [[VAL]]
+;
+ %dest = alloca [5 x i8]
+ call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false)
+ %val = load i17, ptr %dest
+ ret i17 %val
+}
+
+define <2 x i8> @forward_load_vector(ptr %src) {
+; CHECK-LABEL: define <2 x i8> @forward_load_vector(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i8>, ptr [[SRC]], align 1
+; CHECK-NEXT: ret <2 x i8> [[TMP1]]
+;
+ %dest = alloca <2 x i8>
+ call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 2, i1 false)
+ %val = load <2 x i8>, ptr %dest
+ ret <2 x i8> %val
+}
+
+; Negative tests
+
+define i24 @forward_load_volatile(ptr %src) {
+; CHECK-LABEL: define i24 @forward_load_volatile(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[DEST:%.*]] = alloca [3 x i8], align 1
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(3) [[DEST]], ptr noundef nonnull align 1 dereferenceable(3) [[SRC]], i64 3, i1 false)
+; CHECK-NEXT: [[VAL:%.*]] = load volatile i24, ptr [[DEST]], align 4
+; CHECK-NEXT: ret i24 [[VAL]]
+;
+ %dest = alloca [3 x i8]
+ call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false)
+ %val = load volatile i24, ptr %dest
+ ret i24 %val
+}
+
+define i24 @failed_forward_load_write_src(ptr %src) {
+; CHECK-LABEL: define i24 @failed_forward_load_write_src(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[DEST:%.*]] = alloca [3 x i8], align 1
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(3) [[DEST]], ptr noundef nonnull align 1 dereferenceable(3) [[SRC]], i64 3, i1 false)
+; CHECK-NEXT: store i1 true, ptr [[SRC]], align 1
+; CHECK-NEXT: [[VAL:%.*]] = load i24, ptr [[DEST]], align 4
+; CHECK-NEXT: ret i24 [[VAL]]
+;
+ %dest = alloca [3 x i8]
+ call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false)
+ store i1 true, ptr %src
+ %val = load i24, ptr %dest
+ ret i24 %val
+}
+
+define i24 @failed_forward_load_write_dest(ptr %src) {
+; CHECK-LABEL: define i24 @failed_forward_load_write_dest(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[DEST:%.*]] = alloca [3 x i8], align 1
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(3) [[DEST]], ptr noundef nonnull align 1 dereferenceable(3) [[SRC]], i64 3, i1 false)
+; CHECK-NEXT: store i1 true, ptr [[DEST]], align 1
+; CHECK-NEXT: [[VAL:%.*]] = load i24, ptr [[DEST]], align 4
+; CHECK-NEXT: ret i24 [[VAL]]
+;
+ %dest = alloca [3 x i8]
+ call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false)
+ store i1 true, ptr %dest
+ %val = load i24, ptr %dest
+ ret i24 %val
+}
+
+define i16 @failed_forward_load_size(ptr %src) {
+; CHECK-LABEL: define i16 @failed_forward_load_size(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[DEST:%.*]] = alloca [3 x i8], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[SRC]], align 1
+; CHECK-NEXT: store i8 [[TMP1]], ptr [[DEST]], align 1
+; CHECK-NEXT: [[VAL:%.*]] = load i16, ptr [[DEST]], align 2
+; CHECK-NEXT: ret i16 [[VAL]]
+;
+ %dest = alloca [3 x i8]
+ call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 1, i1 false)
+ %val = load i16, ptr %dest
+ ret i16 %val
+}
+
+define i8 @failed_forward_load_gep(ptr %src) {
+; CHECK-LABEL: define i8 @failed_forward_load_gep(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[DEST:%.*]] = alloca [3 x i8], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[SRC]], align 1
+; CHECK-NEXT: store i16 [[TMP1]], ptr [[DEST]], align 1
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[DEST]], i64 2
+; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr [[GEP]], align 1
+; CHECK-NEXT: ret i8 [[VAL]]
+;
+ %dest = alloca [3 x i8]
+ call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 2, i1 false)
+ %gep = getelementptr inbounds i8, ptr %dest, i64 2
+ %val = load i8, ptr %gep
+ ret i8 %val
+}
+
+define i8 @failed_forward_load_gep_multi_use(ptr %src) {
+; CHECK-LABEL: define i8 @failed_forward_load_gep_multi_use(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[DEST:%.*]] = alloca [3 x i8], align 1
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(3) [[DEST]], ptr noundef nonnull align 1 dereferenceable(3) [[SRC]], i64 3, i1 false)
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[DEST]], i64 2
+; CHECK-NEXT: [[VAL1:%.*]] = load i8, ptr [[GEP]], align 1
+; CHECK-NEXT: call void @use_ptr(ptr nonnull [[GEP]])
+; CHECK-NEXT: ret i8 [[VAL1]]
+;
+ %dest = alloca [3 x i8]
+ call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false)
+ %gep = getelementptr inbounds i8, ptr %dest, i64 2
+ %val = load i8, ptr %gep
+ call void @use_ptr(ptr %gep)
+ ret i8 %val
+}
+
+define i24 @failed_forward_load_must_alias(ptr %src) {
+; CHECK-LABEL: define i24 @failed_forward_load_must_alias(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC]], i64 2
+; CHECK-NEXT: [[DEST_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC]], i64 2
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(3) [[DEST_GEP]], ptr noundef nonnull align 1 dereferenceable(3) [[SRC_GEP]], i64 3, i1 false)
+; CHECK-NEXT: [[VAL:%.*]] = load i24, ptr [[DEST_GEP]], align 4
+; CHECK-NEXT: ret i24 [[VAL]]
+;
+ %src_gep = getelementptr inbounds i8, ptr %src, i64 2
+ %dest_gep = getelementptr inbounds i8, ptr %src, i64 2
+ call void @llvm.memcpy.p0.p0.i64(ptr %dest_gep, ptr %src_gep, i64 3, i1 false)
+ %val = load i24, ptr %dest_gep
+ ret i24 %val
+}
+
+declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1)
+declare void @use_ptr(ptr)
diff --git a/llvm/test/Transforms/InstCombine/ptr-replace-alloca.ll b/llvm/test/Transforms/InstCombine/ptr-replace-alloca.ll
index f084fe38bb226..431870155ae83 100644
--- a/llvm/test/Transforms/InstCombine/ptr-replace-alloca.ll
+++ b/llvm/test/Transforms/InstCombine/ptr-replace-alloca.ll
@@ -208,8 +208,7 @@ define i32 @test_memcpy_after_phi(i1 %cond, ptr %ptr) {
; CHECK: join:
; CHECK-NEXT: [[PHI:%.*]] = phi ptr [ [[A]], [[IF]] ], [ [[PTR:%.*]], [[ENTRY:%.*]] ]
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(32) [[PHI]], ptr noundef nonnull align 16 dereferenceable(32) @g1, i64 32, i1 false)
-; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[PHI]], align 4
-; CHECK-NEXT: ret i32 [[V]]
+; CHECK-NEXT: ret i32 0
;
entry:
%a = alloca [32 x i8]
@@ -384,8 +383,7 @@ define i8 @select_after_memcpy_keep_alloca(i1 %cond, ptr %p) {
; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [32 x i8], align 1
; CHECK-NEXT: [[PTR:%.*]] = select i1 [[COND:%.*]], ptr [[ALLOCA]], ptr [[P:%.*]]
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(32) [[PTR]], ptr noundef nonnull align 16 dereferenceable(32) @g1, i64 32, i1 false)
-; CHECK-NEXT: [[LOAD:%.*]] = load i8, ptr [[PTR]], align 1
-; CHECK-NEXT: ret i8 [[LOAD]]
+; CHECK-NEXT: ret i8 0
;
entry:
%alloca = alloca [32 x i8]
diff --git a/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll b/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll
new file mode 100644
index 0000000000000..d5dc213e6d6b6
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -O2 -S < %s | FileCheck %s
+
+define i1 @main(ptr %i2) {
+; CHECK-LABEL: define noundef i1 @main(
+; CHECK-SAME: ptr writeonly captures(none) initializes((0, 3)) [[I2:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[COMMON_RET:.*:]]
+; CHECK-NEXT: store i8 0, ptr [[I2]], align 1
+; CHECK-NEXT: [[I3:%.*]] = getelementptr inbounds nuw i8, ptr [[I2]], i64 1
+; CHECK-NEXT: store i8 1, ptr [[I3]], align 1
+; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds nuw i8, ptr [[I2]], i64 2
+; CHECK-NEXT: store i8 2, ptr [[I4]], align 1
+; CHECK-NEXT: ret i1 true
+;
+ %i1 = alloca [3 x i8], align 1
+ store i8 0, ptr %i2, align 1
+ %i3 = getelementptr inbounds nuw i8, ptr %i2, i64 1
+ store i8 1, ptr %i3, align 1
+ %i4 = getelementptr inbounds nuw i8, ptr %i2, i64 2
+ store i8 2, ptr %i4, align 1
+ call void @llvm.lifetime.start.p0(i64 3, ptr nonnull %i1)
+ call void @llvm.memcpy.p0.p0.i64(ptr %i1, ptr %i2, i64 3, i1 false)
+ %i5 = load i8, ptr %i1, align 1
+ %i6 = icmp eq i8 %i5, 0
+ %i7 = getelementptr inbounds nuw i8, ptr %i1, i64 1
+ %i8 = load i8, ptr %i7, align 1
+ %i9 = icmp eq i8 %i8, 1
+ %i10 = select i1 %i6, i1 %i9, i1 false
+ %i11 = getelementptr inbounds nuw i8, ptr %i1, i64 2
+ %i12 = load i8, ptr %i11, align 1
+ %i13 = icmp eq i8 %i12, 2
+ %i14 = select i1 %i10, i1 %i13, i1 false
+ br i1 %i14, label %true, label %false
+
+true:
+ call void @llvm.lifetime.end.p0(i64 3, ptr nonnull %i1)
+ ret i1 true
+
+false:
+ call void @assert_failed(ptr %i1)
+ ret i1 false
+}
+
+declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1)
+declare void @llvm.lifetime.start.p0(i64, ptr)
+declare void @llvm.lifetime.end.p0(i64, ptr)
+declare void @assert_failed(ptr)
|
|
@llvm/pr-subscribers-llvm-analysis Author: dianqk (dianqk) ChangesFixes #137810. Compile-time impact: https://llvm-compile-time-tracker.com/compare.php?from=059b0c2efbf30d986d812c4d2cf6d6c7876569fe&to=bb5edb394bb8983c5d3eacbaa3c3491504dd549b&stat=instructions%3Au. Full diff: https://github.com/llvm/llvm-project/pull/140249.diff 6 Files Affected:
diff --git a/llvm/include/llvm/Analysis/Loads.h b/llvm/include/llvm/Analysis/Loads.h
index 639070c07897b..94d761379a9c5 100644
--- a/llvm/include/llvm/Analysis/Loads.h
+++ b/llvm/include/llvm/Analysis/Loads.h
@@ -153,9 +153,10 @@ Value *FindAvailableLoadedValue(LoadInst *Load, BasicBlock *ScanBB,
/// This overload provides a more efficient implementation of
/// FindAvailableLoadedValue() for the case where we are not interested in
/// finding the closest clobbering instruction if no available load is found.
-/// This overload cannot be used to scan across multiple blocks.
+/// This overload cannot be used to scan across multiple blocks. If a memcpy is
+/// returned, it indicates that we can load from its source.
Value *FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
- bool *IsLoadCSE,
+ bool *IsLoadCSE, int64_t &Offset,
unsigned MaxInstsToScan = DefMaxInstsToScan);
/// Scan backwards to see if we have the value of the given pointer available
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index 425f3682122cd..f766331dab2f1 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -713,8 +713,31 @@ Value *llvm::findAvailablePtrLoadStore(
return nullptr;
}
+static Value *availableMemCpySrc(LoadInst *LI, MemCpyInst *MemCpy,
+ int64_t &Offset) {
+ if (!LI->isSimple() || MemCpy->isVolatile())
+ return nullptr;
+ const DataLayout &DL = LI->getDataLayout();
+  uint64_t Size = DL.getTypeStoreSize(LI->getType()).getKnownMinValue();
+ if (Size == 0)
+ return nullptr;
+ Value *OldSrc = LI->getPointerOperand();
+
+ if (OldSrc != MemCpy->getDest()) {
+ std::optional<int64_t> PointerOffset =
+ OldSrc->getPointerOffsetFrom(MemCpy->getDest(), DL);
+ if (!PointerOffset || *PointerOffset < 0)
+ return nullptr;
+ Offset = *PointerOffset;
+ }
+ auto *CopyLen = dyn_cast<ConstantInt>(MemCpy->getLength());
+ if (!CopyLen || CopyLen->getZExtValue() < Size + Offset)
+ return nullptr;
+ return MemCpy;
+}
+
Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
- bool *IsLoadCSE,
+ bool *IsLoadCSE, int64_t &Offset,
unsigned MaxInstsToScan) {
const DataLayout &DL = Load->getDataLayout();
Value *StrippedPtr = Load->getPointerOperand()->stripPointerCasts();
@@ -739,6 +762,9 @@ Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
Available = getAvailableLoadStore(&Inst, StrippedPtr, AccessTy,
AtLeastAtomic, DL, IsLoadCSE);
+ if (auto *MemCpy = dyn_cast<MemCpyInst>(&Inst))
+ Available = availableMemCpySrc(Load, MemCpy, Offset);
+
if (Available)
break;
@@ -753,6 +779,12 @@ Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
for (Instruction *Inst : MustNotAliasInsts)
if (isModSet(AA.getModRefInfo(Inst, Loc)))
return nullptr;
+ if (auto *MemCpy = dyn_cast<MemCpyInst>(Available)) {
+ MemoryLocation Loc = MemoryLocation::getForSource(MemCpy);
+ for (Instruction *Inst : MustNotAliasInsts)
+ if (isModSet(AA.getModRefInfo(Inst, Loc)))
+ return nullptr;
+ }
}
return Available;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index c29cba6f675c5..cf0ebc9fd043f 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -1053,13 +1053,45 @@ Instruction *InstCombinerImpl::visitLoadInst(LoadInst &LI) {
// separated by a few arithmetic operations.
bool IsLoadCSE = false;
BatchAAResults BatchAA(*AA);
- if (Value *AvailableVal = FindAvailableLoadedValue(&LI, BatchAA, &IsLoadCSE)) {
+ int64_t Offset = 0;
+ if (Value *AvailableVal =
+ FindAvailableLoadedValue(&LI, BatchAA, &IsLoadCSE, Offset)) {
if (IsLoadCSE)
combineMetadataForCSE(cast<LoadInst>(AvailableVal), &LI, false);
- return replaceInstUsesWith(
- LI, Builder.CreateBitOrPointerCast(AvailableVal, LI.getType(),
- LI.getName() + ".cast"));
+    // Perform simplification of loads. If we have a memcpy A which copies X to
+    // Y, and a load instruction B which loads from Y, then we can rewrite B to
+    // be a load instruction that loads from X. This allows later passes to
+    // remove the memcpy A or identify the source of the load instruction.
+ if (auto *MemCpy = dyn_cast<MemCpyInst>(AvailableVal)) {
+ Value *NewSrc = MemCpy->getSource();
+ Value *OldSrc = LI.getPointerOperand();
+ MaybeAlign NewAlign = MemCpy->getSourceAlign();
+ if (Offset != 0) {
+ if (NewAlign.has_value())
+ NewAlign = commonAlignment(*NewAlign, Offset);
+ // Avoid increasing instructions
+ if (isa<Instruction>(OldSrc) && OldSrc->hasOneUse())
+ NewSrc =
+ Builder.CreateInBoundsPtrAdd(NewSrc, Builder.getInt64(Offset));
+ else
+ NewSrc = nullptr;
+ }
+ // Avoid infinite loops
+ if (NewSrc && !BatchAA.isMustAlias(OldSrc, NewSrc))
+ AvailableVal = Builder.CreateAlignedLoad(LI.getType(), NewSrc, NewAlign,
+ LI.getName());
+ else {
+ AvailableVal = nullptr;
+ if (NewSrc && NewSrc->use_empty())
+ cast<Instruction>(NewSrc)->eraseFromParent();
+ }
+ } else
+ AvailableVal = Builder.CreateBitOrPointerCast(AvailableVal, LI.getType(),
+ LI.getName() + ".cast");
+
+ if (AvailableVal)
+ return replaceInstUsesWith(LI, AvailableVal);
}
// None of the following transforms are legal for volatile/ordered atomic
diff --git a/llvm/test/Transforms/InstCombine/memcpy-forward-load.ll b/llvm/test/Transforms/InstCombine/memcpy-forward-load.ll
new file mode 100644
index 0000000000000..7a56bb50b0903
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/memcpy-forward-load.ll
@@ -0,0 +1,169 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+define i24 @forward_load(ptr align 4 %src) {
+; CHECK-LABEL: define i24 @forward_load(
+; CHECK-SAME: ptr align 4 [[SRC:%.*]]) {
+; CHECK-NEXT: [[VAL1:%.*]] = load i24, ptr [[SRC]], align 4
+; CHECK-NEXT: ret i24 [[VAL1]]
+;
+ %dest = alloca [3 x i8]
+ call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false)
+ %val = load i24, ptr %dest
+ ret i24 %val
+}
+
+define i8 @forward_load_gep(ptr %src) {
+; CHECK-LABEL: define i8 @forward_load_gep(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC]], i64 2
+; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr [[GEP]], align 1
+; CHECK-NEXT: ret i8 [[VAL]]
+;
+ %dest = alloca [3 x i8]
+ call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false)
+ %gep = getelementptr inbounds i8, ptr %dest, i64 2
+ %val = load i8, ptr %gep
+ ret i8 %val
+}
+
+define i17 @forward_load_padding(ptr %src) {
+; CHECK-LABEL: define i17 @forward_load_padding(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[VAL:%.*]] = load i17, ptr [[SRC]], align 1
+; CHECK-NEXT: ret i17 [[VAL]]
+;
+ %dest = alloca [5 x i8]
+ call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false)
+ %val = load i17, ptr %dest
+ ret i17 %val
+}
+
+define <2 x i8> @forward_load_vector(ptr %src) {
+; CHECK-LABEL: define <2 x i8> @forward_load_vector(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i8>, ptr [[SRC]], align 1
+; CHECK-NEXT: ret <2 x i8> [[TMP1]]
+;
+ %dest = alloca <2 x i8>
+ call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 2, i1 false)
+ %val = load <2 x i8>, ptr %dest
+ ret <2 x i8> %val
+}
+
+; Negative tests
+
+define i24 @forward_load_volatile(ptr %src) {
+; CHECK-LABEL: define i24 @forward_load_volatile(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[DEST:%.*]] = alloca [3 x i8], align 1
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(3) [[DEST]], ptr noundef nonnull align 1 dereferenceable(3) [[SRC]], i64 3, i1 false)
+; CHECK-NEXT: [[VAL:%.*]] = load volatile i24, ptr [[DEST]], align 4
+; CHECK-NEXT: ret i24 [[VAL]]
+;
+ %dest = alloca [3 x i8]
+ call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false)
+ %val = load volatile i24, ptr %dest
+ ret i24 %val
+}
+
+define i24 @failed_forward_load_write_src(ptr %src) {
+; CHECK-LABEL: define i24 @failed_forward_load_write_src(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[DEST:%.*]] = alloca [3 x i8], align 1
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(3) [[DEST]], ptr noundef nonnull align 1 dereferenceable(3) [[SRC]], i64 3, i1 false)
+; CHECK-NEXT: store i1 true, ptr [[SRC]], align 1
+; CHECK-NEXT: [[VAL:%.*]] = load i24, ptr [[DEST]], align 4
+; CHECK-NEXT: ret i24 [[VAL]]
+;
+ %dest = alloca [3 x i8]
+ call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false)
+ store i1 true, ptr %src
+ %val = load i24, ptr %dest
+ ret i24 %val
+}
+
+define i24 @failed_forward_load_write_dest(ptr %src) {
+; CHECK-LABEL: define i24 @failed_forward_load_write_dest(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[DEST:%.*]] = alloca [3 x i8], align 1
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(3) [[DEST]], ptr noundef nonnull align 1 dereferenceable(3) [[SRC]], i64 3, i1 false)
+; CHECK-NEXT: store i1 true, ptr [[DEST]], align 1
+; CHECK-NEXT: [[VAL:%.*]] = load i24, ptr [[DEST]], align 4
+; CHECK-NEXT: ret i24 [[VAL]]
+;
+ %dest = alloca [3 x i8]
+ call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false)
+ store i1 true, ptr %dest
+ %val = load i24, ptr %dest
+ ret i24 %val
+}
+
+define i16 @failed_forward_load_size(ptr %src) {
+; CHECK-LABEL: define i16 @failed_forward_load_size(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[DEST:%.*]] = alloca [3 x i8], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[SRC]], align 1
+; CHECK-NEXT: store i8 [[TMP1]], ptr [[DEST]], align 1
+; CHECK-NEXT: [[VAL:%.*]] = load i16, ptr [[DEST]], align 2
+; CHECK-NEXT: ret i16 [[VAL]]
+;
+ %dest = alloca [3 x i8]
+ call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 1, i1 false)
+ %val = load i16, ptr %dest
+ ret i16 %val
+}
+
+define i8 @failed_forward_load_gep(ptr %src) {
+; CHECK-LABEL: define i8 @failed_forward_load_gep(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[DEST:%.*]] = alloca [3 x i8], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[SRC]], align 1
+; CHECK-NEXT: store i16 [[TMP1]], ptr [[DEST]], align 1
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[DEST]], i64 2
+; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr [[GEP]], align 1
+; CHECK-NEXT: ret i8 [[VAL]]
+;
+ %dest = alloca [3 x i8]
+ call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 2, i1 false)
+ %gep = getelementptr inbounds i8, ptr %dest, i64 2
+ %val = load i8, ptr %gep
+ ret i8 %val
+}
+
+define i8 @failed_forward_load_gep_multi_use(ptr %src) {
+; CHECK-LABEL: define i8 @failed_forward_load_gep_multi_use(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[DEST:%.*]] = alloca [3 x i8], align 1
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(3) [[DEST]], ptr noundef nonnull align 1 dereferenceable(3) [[SRC]], i64 3, i1 false)
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[DEST]], i64 2
+; CHECK-NEXT: [[VAL1:%.*]] = load i8, ptr [[GEP]], align 1
+; CHECK-NEXT: call void @use_ptr(ptr nonnull [[GEP]])
+; CHECK-NEXT: ret i8 [[VAL1]]
+;
+ %dest = alloca [3 x i8]
+ call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false)
+ %gep = getelementptr inbounds i8, ptr %dest, i64 2
+ %val = load i8, ptr %gep
+ call void @use_ptr(ptr %gep)
+ ret i8 %val
+}
+
+define i24 @failed_forward_load_must_alias(ptr %src) {
+; CHECK-LABEL: define i24 @failed_forward_load_must_alias(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC]], i64 2
+; CHECK-NEXT: [[DEST_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC]], i64 2
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(3) [[DEST_GEP]], ptr noundef nonnull align 1 dereferenceable(3) [[SRC_GEP]], i64 3, i1 false)
+; CHECK-NEXT: [[VAL:%.*]] = load i24, ptr [[DEST_GEP]], align 4
+; CHECK-NEXT: ret i24 [[VAL]]
+;
+ %src_gep = getelementptr inbounds i8, ptr %src, i64 2
+ %dest_gep = getelementptr inbounds i8, ptr %src, i64 2
+ call void @llvm.memcpy.p0.p0.i64(ptr %dest_gep, ptr %src_gep, i64 3, i1 false)
+ %val = load i24, ptr %dest_gep
+ ret i24 %val
+}
+
+declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1)
+declare void @use_ptr(ptr)
diff --git a/llvm/test/Transforms/InstCombine/ptr-replace-alloca.ll b/llvm/test/Transforms/InstCombine/ptr-replace-alloca.ll
index f084fe38bb226..431870155ae83 100644
--- a/llvm/test/Transforms/InstCombine/ptr-replace-alloca.ll
+++ b/llvm/test/Transforms/InstCombine/ptr-replace-alloca.ll
@@ -208,8 +208,7 @@ define i32 @test_memcpy_after_phi(i1 %cond, ptr %ptr) {
; CHECK: join:
; CHECK-NEXT: [[PHI:%.*]] = phi ptr [ [[A]], [[IF]] ], [ [[PTR:%.*]], [[ENTRY:%.*]] ]
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(32) [[PHI]], ptr noundef nonnull align 16 dereferenceable(32) @g1, i64 32, i1 false)
-; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[PHI]], align 4
-; CHECK-NEXT: ret i32 [[V]]
+; CHECK-NEXT: ret i32 0
;
entry:
%a = alloca [32 x i8]
@@ -384,8 +383,7 @@ define i8 @select_after_memcpy_keep_alloca(i1 %cond, ptr %p) {
; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [32 x i8], align 1
; CHECK-NEXT: [[PTR:%.*]] = select i1 [[COND:%.*]], ptr [[ALLOCA]], ptr [[P:%.*]]
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(32) [[PTR]], ptr noundef nonnull align 16 dereferenceable(32) @g1, i64 32, i1 false)
-; CHECK-NEXT: [[LOAD:%.*]] = load i8, ptr [[PTR]], align 1
-; CHECK-NEXT: ret i8 [[LOAD]]
+; CHECK-NEXT: ret i8 0
;
entry:
%alloca = alloca [32 x i8]
diff --git a/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll b/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll
new file mode 100644
index 0000000000000..d5dc213e6d6b6
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -O2 -S < %s | FileCheck %s
+
+define i1 @main(ptr %i2) {
+; CHECK-LABEL: define noundef i1 @main(
+; CHECK-SAME: ptr writeonly captures(none) initializes((0, 3)) [[I2:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[COMMON_RET:.*:]]
+; CHECK-NEXT: store i8 0, ptr [[I2]], align 1
+; CHECK-NEXT: [[I3:%.*]] = getelementptr inbounds nuw i8, ptr [[I2]], i64 1
+; CHECK-NEXT: store i8 1, ptr [[I3]], align 1
+; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds nuw i8, ptr [[I2]], i64 2
+; CHECK-NEXT: store i8 2, ptr [[I4]], align 1
+; CHECK-NEXT: ret i1 true
+;
+ %i1 = alloca [3 x i8], align 1
+ store i8 0, ptr %i2, align 1
+ %i3 = getelementptr inbounds nuw i8, ptr %i2, i64 1
+ store i8 1, ptr %i3, align 1
+ %i4 = getelementptr inbounds nuw i8, ptr %i2, i64 2
+ store i8 2, ptr %i4, align 1
+ call void @llvm.lifetime.start.p0(i64 3, ptr nonnull %i1)
+ call void @llvm.memcpy.p0.p0.i64(ptr %i1, ptr %i2, i64 3, i1 false)
+ %i5 = load i8, ptr %i1, align 1
+ %i6 = icmp eq i8 %i5, 0
+ %i7 = getelementptr inbounds nuw i8, ptr %i1, i64 1
+ %i8 = load i8, ptr %i7, align 1
+ %i9 = icmp eq i8 %i8, 1
+ %i10 = select i1 %i6, i1 %i9, i1 false
+ %i11 = getelementptr inbounds nuw i8, ptr %i1, i64 2
+ %i12 = load i8, ptr %i11, align 1
+ %i13 = icmp eq i8 %i12, 2
+ %i14 = select i1 %i10, i1 %i13, i1 false
+ br i1 %i14, label %true, label %false
+
+true:
+ call void @llvm.lifetime.end.p0(i64 3, ptr nonnull %i1)
+ ret i1 true
+
+false:
+ call void @assert_failed(ptr %i1)
+ ret i1 false
+}
+
+declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1)
+declare void @llvm.lifetime.start.p0(i64, ptr)
+declare void @llvm.lifetime.end.p0(i64, ptr)
+declare void @assert_failed(ptr)
|
| } | ||
| // Avoid infinite loops | ||
| if (NewSrc && !BatchAA.isMustAlias(OldSrc, NewSrc)) | ||
| AvailableVal = Builder.CreateAlignedLoad(LI.getType(), NewSrc, NewAlign, |
There was a problem hiding this comment.
Some metadata may be safe to propagate.
dtcxzyw
left a comment
There was a problem hiding this comment.
Consider the following case:
memcpy(dst, src, len);
%x = load dst
br %cond, %then, %else
then:
%y = load dst
This patch forwards the memcpy source pointer to %x, but not to %y. If %x cannot be simplified eventually, the resulting IR is:
memcpy(dst, src, len);
%x = load src
br %cond, %then, %else
then:
%y = load dst
This regression is common in real-world programs: dtcxzyw/llvm-opt-benchmark#2354 (comment)
Can we start in a more conservative way, by ensuring either that the load can be CSE'd (by querying FindAvailableLoadedValue recursively with the forwarded memcpy source), or that the memcpy has only one user in MemorySSA?
dtcxzyw
left a comment
There was a problem hiding this comment.
Crash reproducer:
; bin/opt -passes=instcombine reduced.ll -S
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
%struct.S0 = type { i32, i32, i16 }
@g_1092 = external global [1 x [4 x { i32, i32, i16, [2 x i8] }]]
define i8 @func_124(i32 %storemerge740) {
entry:
br label %for.cond554
for.cond554: ; preds = %for.cond554, %entry
%idxprom578 = zext i32 %storemerge740 to i64
%arrayidx584 = getelementptr [1 x [4 x %struct.S0]], ptr @g_1092, i64 0, i64 %idxprom578, i64 1
%arrayidx589 = getelementptr [1 x [4 x %struct.S0]], ptr @g_1092, i64 0, i64 %idxprom578, i64 1
call void @llvm.memcpy.p0.p0.i64(ptr %arrayidx584, ptr %arrayidx589, i64 12, i1 false)
%f1597 = getelementptr [1 x [4 x %struct.S0]], ptr @g_1092, i64 0, i64 %idxprom578, i64 1, i32 1
%0 = load i32, ptr %f1597, align 4
%tobool598.not = icmp eq i32 %0, 0
br i1 %tobool598.not, label %for.cond601, label %for.cond554
for.cond601: ; preds = %for.cond554
ret i8 0
}
PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace.
Stack dump:
0. Program arguments: bin/opt -passes=instcombine reduced.ll -S
1. Running pass "function(instcombine<max-iterations=1;verify-fixpoint>)" on module "reduced.ll"
2. Running pass "instcombine<max-iterations=1;verify-fixpoint>" on function "func_124"
#0 0x0000780f1ca26032 llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/libLLVMSupport.so.21.0git+0x226032)
#1 0x0000780f1ca22f0f llvm::sys::RunSignalHandlers() (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/libLLVMSupport.so.21.0git+0x222f0f)
#2 0x0000780f1ca23054 SignalHandler(int, siginfo_t*, void*) Signals.cpp:0:0
#3 0x0000780f1c445330 (/lib/x86_64-linux-gnu/libc.so.6+0x45330)
#4 0x0000780f13685421 llvm::Instruction::eraseFromParent() (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/../lib/libLLVMCore.so.21.0git+0x285421)
#5 0x0000780f14e65666 llvm::InstCombinerImpl::eraseInstFromFunction(llvm::Instruction&) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/../lib/libLLVMInstCombine.so.21.0git+0x5c666)
#6 0x0000780f14e6fc31 llvm::InstCombinerImpl::run() (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/../lib/libLLVMInstCombine.so.21.0git+0x66c31)
#7 0x0000780f14e70f23 combineInstructionsOverFunction(llvm::Function&, llvm::InstructionWorklist&, llvm::AAResults*, llvm::AssumptionCache&, llvm::TargetLibraryInfo&, llvm::TargetTransformInfo&, llvm::DominatorTree&, llvm::OptimizationRemarkEmitter&, llvm::BlockFrequencyInfo*, llvm::BranchProbabilityInfo*, llvm::ProfileSummaryInfo*, llvm::InstCombineOptions const&) InstructionCombining.cpp:0:0
#8 0x0000780f14e71f32 llvm::InstCombinePass::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/../lib/libLLVMInstCombine.so.21.0git+0x68f32)
#9 0x0000780f16dac615 llvm::detail::PassModel<llvm::Function, llvm::InstCombinePass, llvm::AnalysisManager<llvm::Function>>::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/../lib/libPolly.so.21.0git+0x1ac615)
#10 0x0000780f1372a624 llvm::PassManager<llvm::Function, llvm::AnalysisManager<llvm::Function>>::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/../lib/libLLVMCore.so.21.0git+0x32a624)
#11 0x0000780f1b6db3c5 llvm::detail::PassModel<llvm::Function, llvm::PassManager<llvm::Function, llvm::AnalysisManager<llvm::Function>>, llvm::AnalysisManager<llvm::Function>>::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/../lib/libLLVMX86CodeGen.so.21.0git+0xdb3c5)
#12 0x0000780f13728f00 llvm::ModuleToFunctionPassAdaptor::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/../lib/libLLVMCore.so.21.0git+0x328f00)
#13 0x0000780f1b6dbd85 llvm::detail::PassModel<llvm::Module, llvm::ModuleToFunctionPassAdaptor, llvm::AnalysisManager<llvm::Module>>::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/../lib/libLLVMX86CodeGen.so.21.0git+0xdbd85)
#14 0x0000780f137294f5 llvm::PassManager<llvm::Module, llvm::AnalysisManager<llvm::Module>>::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/../lib/libLLVMCore.so.21.0git+0x3294f5)
#15 0x0000780f1cc652e9 llvm::runPassPipeline(llvm::StringRef, llvm::Module&, llvm::TargetMachine*, llvm::TargetLibraryInfoImpl*, llvm::ToolOutputFile*, llvm::ToolOutputFile*, llvm::ToolOutputFile*, llvm::StringRef, llvm::ArrayRef<llvm::PassPlugin>, llvm::ArrayRef<std::function<void (llvm::PassBuilder&)>>, llvm::opt_tool::OutputKind, llvm::opt_tool::VerifierKind, bool, bool, bool, bool, bool, bool, bool) (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/libLLVMOptDriver.so.21.0git+0x2c2e9)
#16 0x0000780f1cc70306 optMain (/home/dtcxzyw/WorkSpace/Projects/compilers/LLVM/llvm-build/bin/../lib/libLLVMOptDriver.so.21.0git+0x37306)
#17 0x0000780f1c42a1ca __libc_start_call_main ./csu/../sysdeps/nptl/libc_start_call_main.h:74:3
#18 0x0000780f1c42a28b call_init ./csu/../csu/libc-start.c:128:20
#19 0x0000780f1c42a28b __libc_start_main ./csu/../csu/libc-start.c:347:5
#20 0x0000567ad14d1095 _start (bin/opt+0x1095)
Segmentation fault (core dumped)
|
Oops, I should use |
IIUC, neither approach can resolve llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll. (Please correct me if I've misunderstood your point.) The original IR only has this one |
|
The general guideline is that InstCombine load forwarding should be a subset of GVN load forwarding. New capabilities should generally be added to GVN/MDA first to handle the most generic case (including the cross-block one above). Support in InstCombine exists to avoid some phase ordering issues. (What I have in mind here is not to replace the load from dst with a load from src, but with the available value of the load from src if one exists. But this is probably not easy, because we don't really have a good way to handle pointer offsets that are not materialized in IR.) |
|
That makes sense, thanks. |
Fixes #137810.
I have already tried implementing this with MemCpyOpt in #138490, which resulted in an unacceptable compile-time regression. Therefore, I implemented it in InstCombine instead.
Compile-time impact: https://llvm-compile-time-tracker.com/compare.php?from=059b0c2efbf30d986d812c4d2cf6d6c7876569fe&to=bb5edb394bb8983c5d3eacbaa3c3491504dd549b&stat=instructions%3Au.