-
Notifications
You must be signed in to change notification settings - Fork 14k
[InstCombine] remove dead loads, such as memcpy from undef #143958
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-clang Author: Jameson Nash (vtjnash) Changes@nikic This implements our discussion in #143782 (comment), extending Patch is 25.25 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/143958.diff 14 Files Affected:
diff --git a/clang/test/Misc/loop-opt-setup.c b/clang/test/Misc/loop-opt-setup.c
index 01643e6073b56..c1c620e52200d 100644
--- a/clang/test/Misc/loop-opt-setup.c
+++ b/clang/test/Misc/loop-opt-setup.c
@@ -15,7 +15,7 @@ int foo(void) {
// CHECK-NOT: br i1
void Helper(void) {
- const int *nodes[5];
+ const int *nodes[5] = {0};
int num_active = 5;
while (num_active)
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index dc2a8cb0115e7..83ebf9e402c06 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -3217,10 +3217,12 @@ static bool isRemovableWrite(CallBase &CB, Value *UsedV,
static bool isAllocSiteRemovable(Instruction *AI,
SmallVectorImpl<WeakTrackingVH> &Users,
- const TargetLibraryInfo &TLI) {
+ const TargetLibraryInfo &TLI,
+ bool KnowInit) {
SmallVector<Instruction*, 4> Worklist;
const std::optional<StringRef> Family = getAllocationFamily(AI, &TLI);
Worklist.push_back(AI);
+ ModRefInfo Access = KnowInit ? ModRefInfo::NoModRef : ModRefInfo::Mod;
do {
Instruction *PI = Worklist.pop_back_val();
@@ -3283,10 +3285,15 @@ static bool isAllocSiteRemovable(Instruction *AI,
case Intrinsic::memcpy:
case Intrinsic::memset: {
MemIntrinsic *MI = cast<MemIntrinsic>(II);
- if (MI->isVolatile() || MI->getRawDest() != PI)
+ if (MI->isVolatile())
return false;
+ // Note: this could also be ModRef, but we can still interpret that as just Mod in that case.
+ ModRefInfo NewAccess = MI->getRawDest() == PI ? ModRefInfo::Mod : ModRefInfo::Ref;
+ if ((Access & ~NewAccess) != ModRefInfo::NoModRef)
+ return false;
+ Access |= NewAccess;
+ }
[[fallthrough]];
- }
case Intrinsic::assume:
case Intrinsic::invariant_start:
case Intrinsic::invariant_end:
@@ -3303,11 +3310,6 @@ static bool isAllocSiteRemovable(Instruction *AI,
}
}
- if (isRemovableWrite(*cast<CallBase>(I), PI, TLI)) {
- Users.emplace_back(I);
- continue;
- }
-
if (Family && getFreedOperand(cast<CallBase>(I), &TLI) == PI &&
getAllocationFamily(I, &TLI) == Family) {
Users.emplace_back(I);
@@ -3321,12 +3323,32 @@ static bool isAllocSiteRemovable(Instruction *AI,
continue;
}
+ if (!isRefSet(Access) && isRemovableWrite(*cast<CallBase>(I), PI, TLI)) {
+ Access |= ModRefInfo::Mod;
+ Users.emplace_back(I);
+ continue;
+ }
+
return false;
case Instruction::Store: {
StoreInst *SI = cast<StoreInst>(I);
if (SI->isVolatile() || SI->getPointerOperand() != PI)
return false;
+ if (isRefSet(Access))
+ return false;
+ Access |= ModRefInfo::Mod;
+ Users.emplace_back(I);
+ continue;
+ }
+
+ case Instruction::Load: {
+ LoadInst *LI = cast<LoadInst>(I);
+ if (LI->isVolatile() || LI->getPointerOperand() != PI)
+ return false;
+ if (isModSet(Access))
+ return false;
+ Access |= ModRefInfo::Ref;
Users.emplace_back(I);
continue;
}
@@ -3334,6 +3356,7 @@ static bool isAllocSiteRemovable(Instruction *AI,
llvm_unreachable("missing a return?");
}
} while (!Worklist.empty());
+
return true;
}
@@ -3362,10 +3385,23 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
DIB.reset(new DIBuilder(*MI.getModule(), /*AllowUnresolved=*/false));
}
- if (isAllocSiteRemovable(&MI, Users, TLI)) {
+ // Determine what getInitialValueOfAllocation would return without actually allocating the result.
+ bool KnowInitUndef = isa<AllocaInst>(MI);
+ bool KnowInitZero = false;
+ if (!KnowInitUndef) {
+ Constant *Init = getInitialValueOfAllocation(&MI, &TLI, Type::getInt8Ty(MI.getContext()));
+ if (Init) {
+ if (isa<UndefValue>(Init))
+ KnowInitUndef = true;
+ else if (Init->isNullValue())
+ KnowInitZero = true;
+ }
+ }
+
+ if (isAllocSiteRemovable(&MI, Users, TLI, KnowInitZero | KnowInitUndef)) {
for (unsigned i = 0, e = Users.size(); i != e; ++i) {
- // Lowering all @llvm.objectsize calls first because they may
- // use a bitcast/GEP of the alloca we are removing.
+ // Lowering all @llvm.objectsize and MTI calls first because they may use
+ // a bitcast/GEP of the alloca we are removing.
if (!Users[i])
continue;
@@ -3382,6 +3418,17 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
eraseInstFromFunction(*I);
Users[i] = nullptr; // Skip examining in the next loop.
}
+ if (auto *MTI = dyn_cast<MemTransferInst>(I)) {
+ if (KnowInitZero && getUnderlyingObject(MTI->getRawDest()) != &MI) {
+ IRBuilderBase::InsertPointGuard Guard(Builder);
+ Builder.SetInsertPoint(MTI);
+ auto *M = Builder.CreateMemSet(MTI->getRawDest(),
+ ConstantInt::get(Type::getInt8Ty(MI.getContext()), 0),
+ MTI->getLength(),
+ MTI->getDestAlign());
+ M->copyMetadata(*MTI, LLVMContext::MD_DIAssignID);
+ }
+ }
}
}
for (unsigned i = 0, e = Users.size(); i != e; ++i) {
@@ -3404,7 +3451,13 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
} else {
// Casts, GEP, or anything else: we're about to delete this instruction,
// so it can not have any valid uses.
- replaceInstUsesWith(*I, PoisonValue::get(I->getType()));
+ Constant *Replace;
+ if (isa<LoadInst>(I)) {
+ assert(KnowInitZero || KnowInitUndef);
+ Replace = KnowInitUndef ? UndefValue::get(I->getType()) : Constant::getNullValue(I->getType());
+ } else
+ Replace = PoisonValue::get(I->getType());
+ replaceInstUsesWith(*I, Replace);
}
eraseInstFromFunction(*I);
}
diff --git a/llvm/test/Transforms/InstCombine/and-or-icmps.ll b/llvm/test/Transforms/InstCombine/and-or-icmps.ll
index 8824ae48417b0..26e0552886814 100644
--- a/llvm/test/Transforms/InstCombine/and-or-icmps.ll
+++ b/llvm/test/Transforms/InstCombine/and-or-icmps.ll
@@ -364,20 +364,8 @@ define <2 x i1> @and_ne_with_diff_one_splatvec(<2 x i32> %x) {
define void @simplify_before_foldAndOfICmps(ptr %p) {
; CHECK-LABEL: @simplify_before_foldAndOfICmps(
-; CHECK-NEXT: [[A8:%.*]] = alloca i16, align 2
-; CHECK-NEXT: [[L7:%.*]] = load i16, ptr [[A8]], align 2
-; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i16 [[L7]], -1
-; CHECK-NEXT: [[B11:%.*]] = zext i1 [[TMP1]] to i16
-; CHECK-NEXT: [[C10:%.*]] = icmp ugt i16 [[L7]], [[B11]]
-; CHECK-NEXT: [[C7:%.*]] = icmp slt i16 [[L7]], 0
-; CHECK-NEXT: [[C3:%.*]] = and i1 [[C7]], [[C10]]
-; CHECK-NEXT: [[TMP2:%.*]] = xor i1 [[C10]], true
-; CHECK-NEXT: [[C18:%.*]] = or i1 [[C7]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[C3]] to i64
-; CHECK-NEXT: [[G26:%.*]] = getelementptr i1, ptr null, i64 [[TMP3]]
-; CHECK-NEXT: store i16 [[L7]], ptr [[P:%.*]], align 2
-; CHECK-NEXT: store i1 [[C18]], ptr [[P]], align 1
-; CHECK-NEXT: store ptr [[G26]], ptr [[P]], align 8
+; CHECK-NEXT: store i1 true, ptr [[P:%.*]], align 1
+; CHECK-NEXT: store ptr null, ptr [[P]], align 8
; CHECK-NEXT: ret void
;
%A8 = alloca i16
diff --git a/llvm/test/Transforms/InstCombine/apint-shift.ll b/llvm/test/Transforms/InstCombine/apint-shift.ll
index 3cc530bdbd021..d81f14a30cd99 100644
--- a/llvm/test/Transforms/InstCombine/apint-shift.ll
+++ b/llvm/test/Transforms/InstCombine/apint-shift.ll
@@ -564,7 +564,7 @@ define i40 @test26(i40 %A) {
; https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=9880
define i177 @ossfuzz_9880(i177 %X) {
; CHECK-LABEL: @ossfuzz_9880(
-; CHECK-NEXT: ret i177 0
+; CHECK-NEXT: ret i177 poison
;
%A = alloca i177
%L1 = load i177, ptr %A
diff --git a/llvm/test/Transforms/InstCombine/call-cast-target.ll b/llvm/test/Transforms/InstCombine/call-cast-target.ll
index 2cedc6c81d735..d18b70ff5ce9b 100644
--- a/llvm/test/Transforms/InstCombine/call-cast-target.ll
+++ b/llvm/test/Transforms/InstCombine/call-cast-target.ll
@@ -112,11 +112,7 @@ declare i1 @fn5(ptr byval({ i32, i32 }) align 4 %r)
define i1 @test5() {
; CHECK-LABEL: define i1 @test5() {
-; CHECK-NEXT: [[TMP1:%.*]] = alloca { i32, i32 }, align 4
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 4
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = call i1 @fn5(i32 [[TMP2]], i32 [[TMP4]])
+; CHECK-NEXT: [[TMP5:%.*]] = call i1 @fn5(i32 undef, i32 undef)
; CHECK-NEXT: ret i1 [[TMP5]]
;
%1 = alloca { i32, i32 }, align 4
diff --git a/llvm/test/Transforms/InstCombine/dead-alloc-elim.ll b/llvm/test/Transforms/InstCombine/dead-alloc-elim.ll
new file mode 100644
index 0000000000000..b135f76f709f1
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/dead-alloc-elim.ll
@@ -0,0 +1,140 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+declare noalias ptr @calloc(i32, i32) nounwind allockind("alloc,zeroed") allocsize(0,1) "alloc-family"="malloc"
+declare void @free(ptr) allockind("free") "alloc-family"="malloc"
+
+; Test load from uninitialized alloca - should be removed and replaced with undef
+define i32 @test_load_uninitialized_alloca() {
+; CHECK-LABEL: @test_load_uninitialized_alloca(
+; CHECK-NEXT: ret i32 undef
+;
+ %a = alloca i32
+ %v = load i32, ptr %a
+ ret i32 %v
+}
+
+; Test load from zero-initialized malloc - should be removed and replaced with zero
+define i32 @test_load_zero_initialized_malloc() {
+; CHECK-LABEL: @test_load_zero_initialized_malloc(
+; CHECK-NEXT: ret i32 0
+;
+ %a = call ptr @calloc(i32 1, i32 4)
+ %v = load i32, ptr %a
+ call void @free(ptr %a)
+ ret i32 %v
+}
+
+; Test memcpy from uninitialized source - should be removed
+define void @test_memcpy_from_uninitialized_alloca(ptr %dest) {
+; CHECK-LABEL: @test_memcpy_from_uninitialized_alloca(
+; CHECK-NEXT: ret void
+;
+ %src = alloca i32, align 1
+ call void @llvm.memcpy.p0.p0.i32(ptr %src, ptr %src, i32 4, i1 false)
+ call void @llvm.memcpy.p0.p0.i32(ptr %dest, ptr %src, i32 4, i1 false)
+ ret void
+}
+
+; Test memcpy from zeroed source - should transform to memset with zero
+define void @test_memcpy_from_uninitialized_calloc(ptr %dest) {
+; CHECK-LABEL: @test_memcpy_from_uninitialized_calloc(
+; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr noundef nonnull align 1 dereferenceable(16) [[DEST:%.*]], i8 0, i32 16, i1 false)
+; CHECK-NEXT: ret void
+;
+ %src = call ptr @calloc(i32 1, i32 16)
+ call void @llvm.memcpy.p0.p0.i32(ptr %dest, ptr %src, i32 16, i1 false)
+ call void @free(ptr %src)
+ ret void
+}
+
+; Test mixed read/write pattern - should not be removable due to write before read
+define i32 @test_write_then_read_alloca() {
+; CHECK-LABEL: @test_write_then_read_alloca(
+; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT: store i8 42, ptr [[A]], align 1
+; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-NEXT: ret i32 [[V]]
+;
+ %a = alloca i32
+ store i8 42, ptr %a
+ %v = load i32, ptr %a
+ ret i32 %v
+}
+
+; Test read then write pattern - should not be removable due to conflicting access
+define void @test_read_then_write_alloca() {
+; CHECK-LABEL: @test_read_then_write_alloca(
+; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-NEXT: [[V8:%.*]] = trunc i32 [[V]] to i8
+; CHECK-NEXT: store i8 [[V8]], ptr [[A]], align 1
+; CHECK-NEXT: ret void
+;
+ %a = alloca i32
+ %v = load i32, ptr %a
+ %v8 = trunc i32 %v to i8
+ store i8 %v8, ptr %a
+ ret void
+}
+
+; Test load through GEP from uninitialized alloca
+define i8 @test_load_gep_uninitialized_alloca() {
+; CHECK-LABEL: @test_load_gep_uninitialized_alloca(
+; CHECK-NEXT: ret i8 undef
+;
+ %a = alloca [4 x i8]
+ %gep = getelementptr [4 x i8], ptr %a, i32 0, i32 2
+ %v = load i8, ptr %gep
+ ret i8 %v
+}
+
+; Test load through bitcast from uninitialized alloca
+define i16 @test_load_bitcast_uninitialized_alloca() {
+; CHECK-LABEL: @test_load_bitcast_uninitialized_alloca(
+; CHECK-NEXT: ret i16 undef
+;
+ %a = alloca i32
+ %bc = bitcast ptr %a to ptr
+ %v = load i16, ptr %bc
+ ret i16 %v
+}
+
+; Test memmove from zero-initialized malloc
+define void @test_memmove_from_zero_initialized_malloc(ptr %dest) {
+; CHECK-LABEL: @test_memmove_from_zero_initialized_malloc(
+; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr noundef nonnull align 1 dereferenceable(32) [[DEST:%.*]], i8 0, i32 32, i1 false)
+; CHECK-NEXT: ret void
+;
+ %src = call ptr @calloc(i32 8, i32 4)
+ call void @llvm.memmove.p0.p0.i32(ptr %dest, ptr %src, i32 32, i1 false)
+ call void @free(ptr %src)
+ ret void
+}
+
+; Test multiple loads from same uninitialized alloca
+define { i32, i32 } @test_multiple_loads_uninitialized_alloca() {
+; CHECK-LABEL: @test_multiple_loads_uninitialized_alloca(
+; CHECK-NEXT: ret { i32, i32 } undef
+;
+ %a = alloca [2 x i32]
+ %gep1 = getelementptr [2 x i32], ptr %a, i32 0, i32 0
+ %gep2 = getelementptr [2 x i32], ptr %a, i32 0, i32 1
+ %v1 = load i32, ptr %gep1
+ %v2 = load i32, ptr %gep2
+ %ret = insertvalue { i32, i32 } { i32 undef, i32 poison }, i32 %v1, 0
+ %ret2 = insertvalue { i32, i32 } %ret, i32 %v2, 1
+ ret { i32, i32 } %ret2
+}
+
+; Test that volatile operations prevent removal
+define i32 @test_volatile_load_prevents_removal() {
+; CHECK-LABEL: @test_volatile_load_prevents_removal(
+; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[V:%.*]] = load volatile i32, ptr [[A]], align 4
+; CHECK-NEXT: ret i32 [[V]]
+;
+ %a = alloca i32
+ %v = load volatile i32, ptr %a
+ ret i32 %v
+}
diff --git a/llvm/test/Transforms/InstCombine/fp-ret-bitcast.ll b/llvm/test/Transforms/InstCombine/fp-ret-bitcast.ll
index 15eb3e15ea44a..12a554081ca79 100644
--- a/llvm/test/Transforms/InstCombine/fp-ret-bitcast.ll
+++ b/llvm/test/Transforms/InstCombine/fp-ret-bitcast.ll
@@ -15,10 +15,8 @@ target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:1
define void @bork() nounwind {
; CHECK-LABEL: @bork(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[COLOR:%.*]] = alloca ptr, align 8
-; CHECK-NEXT: [[TMP103:%.*]] = load ptr, ptr [[COLOR]], align 4
; CHECK-NEXT: [[TMP105:%.*]] = load ptr, ptr @"\01L_OBJC_SELECTOR_REFERENCES_81", align 4
-; CHECK-NEXT: [[TMP107:%.*]] = call float @objc_msgSend_fpret(ptr [[TMP103]], ptr [[TMP105]]) #[[ATTR0:[0-9]+]]
+; CHECK-NEXT: [[TMP107:%.*]] = call float @objc_msgSend_fpret(ptr undef, ptr [[TMP105]]) #[[ATTR0:[0-9]+]]
; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: exit:
; CHECK-NEXT: ret void
diff --git a/llvm/test/Transforms/InstCombine/getelementptr.ll b/llvm/test/Transforms/InstCombine/getelementptr.ll
index 61236df80bfa6..19466e48fb73c 100644
--- a/llvm/test/Transforms/InstCombine/getelementptr.ll
+++ b/llvm/test/Transforms/InstCombine/getelementptr.ll
@@ -582,9 +582,7 @@ define i32 @test20_as1(ptr addrspace(1) %P, i32 %A, i32 %B) {
define i32 @test21() {
; CHECK-LABEL: @test21(
-; CHECK-NEXT: [[PBOB1:%.*]] = alloca [[INTSTRUCT:%.*]], align 8
-; CHECK-NEXT: [[RVAL:%.*]] = load i32, ptr [[PBOB1]], align 4
-; CHECK-NEXT: ret i32 [[RVAL]]
+; CHECK-NEXT: ret i32 undef
;
%pbob1 = alloca %intstruct
%pbob2 = getelementptr %intstruct, ptr %pbob1
@@ -657,11 +655,6 @@ define i1 @test26(ptr %arr) {
define i32 @test27(ptr %to, ptr %from) {
; CHECK-LABEL: @test27(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[FROM_ADDR:%.*]] = alloca ptr, align 8
-; CHECK-NEXT: [[T344:%.*]] = load ptr, ptr [[FROM_ADDR]], align 8
-; CHECK-NEXT: [[T348:%.*]] = getelementptr i8, ptr [[T344]], i64 24
-; CHECK-NEXT: [[T351:%.*]] = load i32, ptr [[T348]], align 8
-; CHECK-NEXT: [[T360:%.*]] = call i32 asm sideeffect "...", "=r,ir,*m,i,0,~{dirflag},~{fpsr},~{flags}"(i32 [[T351]], ptr elementtype([[STRUCT___LARGE_STRUCT:%.*]]) null, i32 -14, i32 0) #[[ATTR0:[0-9]+]]
; CHECK-NEXT: unreachable
;
entry:
@@ -685,7 +678,7 @@ define i32 @test28() nounwind {
; CHECK-LABEL: @test28(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ORIENTATIONS:%.*]] = alloca [1 x [1 x %struct.x]], align 8
-; CHECK-NEXT: [[T3:%.*]] = call i32 @puts(ptr noundef nonnull dereferenceable(1) @.str) #[[ATTR0]]
+; CHECK-NEXT: [[T3:%.*]] = call i32 @puts(ptr noundef nonnull dereferenceable(1) @.str) #[[ATTR0:[0-9]+]]
; CHECK-NEXT: [[T45:%.*]] = getelementptr inbounds nuw i8, ptr [[ORIENTATIONS]], i64 1
; CHECK-NEXT: br label [[BB10:%.*]]
; CHECK: bb10:
@@ -1345,10 +1338,7 @@ declare noalias ptr @malloc(i64) nounwind allockind("alloc,uninitialized") alloc
define i32 @test_gep_bitcast_malloc(ptr %a) {
; CHECK-LABEL: @test_gep_bitcast_malloc(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CALL:%.*]] = call noalias dereferenceable_or_null(16) ptr @malloc(i64 16)
-; CHECK-NEXT: [[G3:%.*]] = getelementptr i8, ptr [[CALL]], i64 12
-; CHECK-NEXT: [[A_C:%.*]] = load i32, ptr [[G3]], align 4
-; CHECK-NEXT: ret i32 [[A_C]]
+; CHECK-NEXT: ret i32 undef
;
entry:
%call = call noalias ptr @malloc(i64 16) #2
diff --git a/llvm/test/Transforms/InstCombine/malloc-free.ll b/llvm/test/Transforms/InstCombine/malloc-free.ll
index 1f556821270a2..989074f97aaf6 100644
--- a/llvm/test/Transforms/InstCombine/malloc-free.ll
+++ b/llvm/test/Transforms/InstCombine/malloc-free.ll
@@ -133,17 +133,13 @@ define void @test4() {
define void @test5(ptr %ptr, ptr %esc) {
; CHECK-LABEL: @test5(
-; CHECK-NEXT: [[A:%.*]] = call dereferenceable_or_null(700) ptr @malloc(i32 700)
-; CHECK-NEXT: [[B:%.*]] = call dereferenceable_or_null(700) ptr @malloc(i32 700)
; CHECK-NEXT: [[C:%.*]] = call dereferenceable_or_null(700) ptr @malloc(i32 700)
; CHECK-NEXT: [[D:%.*]] = call dereferenceable_or_null(700) ptr @malloc(i32 700)
; CHECK-NEXT: [[E:%.*]] = call dereferenceable_or_null(700) ptr @malloc(i32 700)
; CHECK-NEXT: [[F:%.*]] = call dereferenceable_or_null(700) ptr @malloc(i32 700)
; CHECK-NEXT: [[G:%.*]] = call dereferenceable_or_null(700) ptr @malloc(i32 700)
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(32) [[PTR:%.*]], ptr noundef nonnull align 1 dereferenceable(32) [[A]], i32 32, i1 false)
-; CHECK-NEXT: call void @llvm.memmove.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(32) [[PTR]], ptr noundef nonnull align 1 dereferenceable(32) [[B]], i32 32, i1 false)
; CHECK-NEXT: store ptr [[C]], ptr [[ESC:%.*]], align 4
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr [[D]], ptr [[PTR]], i32 32, i1 true)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr [[D]], ptr [[PTR:%.*]], i32 32, i1 true)
; CHECK-NEXT: call void @llvm.memmove.p0.p0.i32(ptr [[E]], ptr [[PTR]], i32 32, i1 true)
; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr [[F]], i8 5, i32 32, i1 true)
; CHECK-NEXT: store volatile i8 4, ptr [[G]], align 1
diff --git a/llvm/test/Transforms/InstCombine/multiple-uses-load-bitcast-select.ll b/llvm/test/Transforms/InstCombine/multiple-uses-load-bitcast-select.ll
index 38fca0314ae11..a0f4f571a4b93 100644
--- a/llvm/test/Transforms/InstCombine/multiple-uses-load-bitcast-select.ll
+++ b/llvm/test/Transforms/InstCombine/multiple-uses-load-bitcast-select.ll
@@ -3,14 +3,6 @@
define void @PR35618(ptr %st1, ptr %st2) {
; CHECK-LABEL: @PR35618(
-; CHECK-NEXT: [[Y1:%.*]] = alloca double, align 8
-; CHECK-NEXT: [[Z1:%.*]] = alloca double, align 8
-; CHECK-NEXT: [[LD1:%.*]] = load double, ptr [[Y1]], align 8
-; CHECK-NEXT: [[LD2:%.*]] = load double, ptr [[Z1]], align 8
-; CHECK-NEXT: [[TMP:%.*]] = fcmp olt double [[LD1]], [[LD2]]
-; CHECK-NEXT: [[TMP12_V:%.*]] = select i1 [[TMP]], double [[LD1]], double [[LD2]]
-; CHECK-NEXT: ...
[truncated]
|
You can test this locally with the following command:git diff -U0 --pickaxe-regex -S '([^a-zA-Z0-9#_-]undef[^a-zA-Z0-9_-]|UndefValue::get)' 'HEAD~1' HEAD llvm/test/Transforms/InstCombine/dead-alloc-elim.ll clang/test/Misc/loop-opt-setup.c llvm/lib/Transforms/InstCombine/InstructionCombining.cpp llvm/test/Transforms/InstCombine/and-or-icmps.ll llvm/test/Transforms/InstCombine/apint-shift.ll llvm/test/Transforms/InstCombine/call-cast-target.ll llvm/test/Transforms/InstCombine/fp-ret-bitcast.ll llvm/test/Transforms/InstCombine/getelementptr.ll llvm/test/Transforms/InstCombine/malloc-free.ll llvm/test/Transforms/InstCombine/multiple-uses-load-bitcast-select.ll llvm/test/Transforms/InstCombine/objsize.ll llvm/test/Transforms/InstCombine/select-load.ll llvm/test/Transforms/InstCombine/shift-amount-reassociation.ll llvm/test/Transforms/InstCombine/vscale_gep.ll The following files introduce new uses of undef:
Undef is now deprecated and should only be used in the rare cases where no replacement is possible. For example, a load of uninitialized memory yields `undef`. In tests, avoid using `undef`. For example, this is considered a bad practice: define void @fn() {
...
br i1 undef, ...
} Please use the following instead: define void @fn(i1 %cond) {
...
br i1 %cond, ...
} Please refer to the Undefined Behavior Manual for more information. |
You can test this locally with the following command:git-clang-format --diff HEAD~1 HEAD --extensions cpp,c -- clang/test/Misc/loop-opt-setup.c llvm/lib/Transforms/InstCombine/InstructionCombining.cpp View the diff from clang-format here.diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 93a8e8364..73444e226 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -3244,9 +3244,9 @@ static bool isRemovableWrite(CallBase &CB, Value *UsedV,
return Dest && Dest->Ptr == UsedV;
}
-static std::optional<ModRefInfo> isAllocSiteRemovable(Instruction *AI,
- SmallVectorImpl<WeakTrackingVH> &Users,
- const TargetLibraryInfo &TLI, bool KnowInit) {
+static std::optional<ModRefInfo>
+isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<WeakTrackingVH> &Users,
+ const TargetLibraryInfo &TLI, bool KnowInit) {
SmallVector<Instruction*, 4> Worklist;
const std::optional<StringRef> Family = getAllocationFamily(AI, &TLI);
Worklist.push_back(AI);
@@ -3421,8 +3421,8 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
// allocating the result.
bool KnowInitUndef = false;
bool KnowInitZero = false;
- Constant *Init = getInitialValueOfAllocation(
- &MI, &TLI, Type::getInt8Ty(MI.getContext()));
+ Constant *Init =
+ getInitialValueOfAllocation(&MI, &TLI, Type::getInt8Ty(MI.getContext()));
if (Init) {
if (isa<UndefValue>(Init))
KnowInitUndef = true;
@@ -3436,7 +3436,8 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
F.hasFnAttribute(Attribute::SanitizeAddress))
KnowInitUndef = false;
- auto Removable = isAllocSiteRemovable(&MI, Users, TLI, KnowInitZero | KnowInitUndef);
+ auto Removable =
+ isAllocSiteRemovable(&MI, Users, TLI, KnowInitZero | KnowInitUndef);
if (Removable) {
for (unsigned i = 0, e = Users.size(); i != e; ++i) {
// Lowering all @llvm.objectsize and MTI calls first because they may use
|
// Note: this could also be ModRef, but we can still interpret that as just Mod in that case. | ||
ModRefInfo NewAccess = MI->getRawDest() == PI ? ModRefInfo::Mod : ModRefInfo::Ref; | ||
if ((Access & ~NewAccess) != ModRefInfo::NoModRef) | ||
return false; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Rather than doing these individual checks, can we have a centralized check in the loop that does the early bailout for Access == ModRef
?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I originally was going to have it use AA to compute ModRef at the top, but that seemed potentially slower (since we already know PI aliases). I wasn't sure how else to centralize the test and also preserve exit early unless I just duplicate it at the top and after the loop (if (Access == ModRefInfo::ModRef) return false
)?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I originally was going to have it use AA to compute ModRef at the top, but that seemed potentially slower (since we already know PI aliases).
Yeah, we shouldn't use AA here.
I wasn't sure how else to centralize the test and also preserve exit early unless I just duplicate it at the top and after the loop (
if (Access == ModRefInfo::ModRef) return false
)?
Yeah, that's what I had in mind, but no strong opinion. We should probably at least assert that it's not ModRef after the loop though, to make sure no case was missed.
@@ -3362,10 +3385,23 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) { | |||
DIB.reset(new DIBuilder(*MI.getModule(), /*AllowUnresolved=*/false)); | |||
} | |||
|
|||
if (isAllocSiteRemovable(&MI, Users, TLI)) { | |||
// Determine what getInitialValueOfAllocation would return without actually allocating the result. | |||
bool KnowInitUndef = isa<AllocaInst>(MI); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Don't need the special case for AllocaInst, getInitialValueOfAllocation will handle it?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Seems like a good idea, but it doesn't look like it: https://llvm.org/doxygen/namespacellvm.html#a108b6e33f153eda5019d322f0ac909b0
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should work after 3824a2d.
ConstantInt::get(Type::getInt8Ty(MI.getContext()), 0), | ||
MTI->getLength(), | ||
MTI->getDestAlign()); | ||
M->copyMetadata(*MTI, LLVMContext::MD_DIAssignID); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should probably copy all the metadata? E.g. to preserve AA metadata.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I was wondering about that too. The current MemCpyOptimizer pass usually drops all metadata information (which is where I copied this from to be consistent) but I couldn't really justify that to myself either.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This depends a lot on the specific transform. Dropping or adjusting metadata is often required when we're merging operations in some way (e.g. memcpy to memcpy forwarding).
However, I believe that for this specific case where we're converting memcpy to memset, preserving metadata should be fine. (We're only removing a read from a location.)
@@ -364,20 +364,8 @@ define <2 x i1> @and_ne_with_diff_one_splatvec(<2 x i32> %x) { | |||
|
|||
define void @simplify_before_foldAndOfICmps(ptr %p) { | |||
; CHECK-LABEL: @simplify_before_foldAndOfICmps( | |||
; CHECK-NEXT: [[A8:%.*]] = alloca i16, align 2 | |||
; CHECK-NEXT: [[L7:%.*]] = load i16, ptr [[A8]], align 2 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should probably replace this load with an argument to retain test behavior?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Need to modify this test to preserve behavior. Also the next one, etc.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've tried to adjust most of those to avoid alloca, though it is triggering some sort of bug with simplify_before_foldAndOfICmps
which I'll have to investigate later
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks like the simplify_before_foldAndOfICmps
crash was a pre-existing bug (I've added an xfail test to represent it so that it can be addressed separately). I think this PR is ready to go, with 2 followup items for future PRs:
- preserve metadata when creating MemSet, here and in MemCpyOpt, unless that is shown to be invalid
- fix the unrelated simplify_before_foldAndOfICmps test failure preserved as xfail here
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry for the continued stream of comments, but it turns out this test is supposed to crash, per 4189584, so I've removed the XFAIL and fixed the test invocation to avoid the crash
@@ -3362,10 +3385,23 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) { | |||
DIB.reset(new DIBuilder(*MI.getModule(), /*AllowUnresolved=*/false)); | |||
} | |||
|
|||
if (isAllocSiteRemovable(&MI, Users, TLI)) { | |||
// Determine what getInitialValueOfAllocation would return without actually allocating the result. | |||
bool KnowInitUndef = isa<AllocaInst>(MI); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should work after 3824a2d.
ConstantInt::get(Type::getInt8Ty(MI.getContext()), 0), | ||
MTI->getLength(), | ||
MTI->getDestAlign()); | ||
M->copyMetadata(*MTI, LLVMContext::MD_DIAssignID); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This depends a lot on the specific transform. Dropping or adjusting metadata is often required when we're merging operations in some way (e.g. memcpy to memcpy forwarding).
However, I believe that for this specific case where we're converting memcpy to memset, preserving metadata should be fine. (We're only removing a read from a location.)
@@ -3382,6 +3422,17 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) { | |||
eraseInstFromFunction(*I); | |||
Users[i] = nullptr; // Skip examining in the next loop. | |||
} | |||
if (auto *MTI = dyn_cast<MemTransferInst>(I)) { | |||
if (KnowInitZero && getUnderlyingObject(MTI->getRawDest()) != &MI) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this check isn't entirely right, in case we can't see through to the underlying object (e.g. a very long gep chain). In that case even though we're writing to the alloca, we'd leave a memset here.
I think it would be more robust if isAllocSiteRemovable returned the ModRefInfo and then you made the decision here on whether this is a Mod or Ref alloca.
; on an 'and' that should have been killed. It's not obvious | ||
; why, but removing anything hides the bug, hence the long test. | ||
|
||
define void @simplify_before_foldAndOfICmps(ptr %p, ptr %A8) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
define void @simplify_before_foldAndOfICmps(ptr %p, ptr %A8) { | |
define void @simplify_before_foldAndOfICmps(ptr %p, ptr %A8) "instcombine-no-verify-fixpoint" { |
Use the attribute instead. Then you can also leave the test where it was.
// Note: this could also be ModRef, but we can still interpret that as just Mod in that case. | ||
ModRefInfo NewAccess = MI->getRawDest() == PI ? ModRefInfo::Mod : ModRefInfo::Ref; | ||
if ((Access & ~NewAccess) != ModRefInfo::NoModRef) | ||
return false; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I originally was going to have it use AA to compute ModRef at the top, but that seemed potentially slower (since we already know PI aliases).
Yeah, we shouldn't use AA here.
I wasn't sure how else to centralize the test and also preserve exit early unless I just duplicate it at the top and after the loop (
if (Access == ModRefInfo::ModRef) return false
)?
Yeah, that's what I had in mind, but no strong opinion. We should probably at least assert that it's not ModRef after the loop though, to make sure no case was missed.
dtcxzyw/llvm-opt-benchmark#2433 has a few cases that regress for a reason that is not immediately obvious to me, if you want to take a look. (But this is overwhelmingly an improvement, so I don't particularly care.) There are a number of test failures for asan and msan. I think we will want to disable this variant of the transform at least for sanitize_memory, because this will obviously hide uninitialized memory reads. Less sure about asan, but might be simplest to just disable it for it as well. |
…when run with an explicit passes pipeline Per 4189584
d158fbb
to
5d5cd6b
Compare
@nikic This implements our discussion in #143782 (comment), extending
isAllocSiteRemovable
to be able to check if the ModRef info indicates the alloca is only Ref or only Mod, and be able to remove it accordingly. It seemed that there were a surprising number of benchmarks with this pattern which weren't getting optimized previously (due to MemorySSA walk limits). There were somewhat more existing tests than I'd like to have modified which were simply doing exactly this pattern. This PR was going to be an experiment with using Claude to write code, but I didn't like its implementation (I asked it to do the wrong approach), so the code is mine. Though Claude did contribute the tests and found an important typo that I'd made.