Skip to content

[InstCombine] Forward memcpy source to load instruction #140249

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions llvm/include/llvm/Analysis/Loads.h
Original file line number Diff line number Diff line change
Expand Up @@ -153,9 +153,10 @@ Value *FindAvailableLoadedValue(LoadInst *Load, BasicBlock *ScanBB,
/// This overload provides a more efficient implementation of
/// FindAvailableLoadedValue() for the case where we are not interested in
/// finding the closest clobbering instruction if no available load is found.
/// This overload cannot be used to scan across multiple blocks.
/// This overload cannot be used to scan across multiple blocks. If a memcpy
/// that fully covers the load is returned, the load may instead be satisfied
/// from the memcpy's source at byte offset \p Offset.
Value *FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
bool *IsLoadCSE,
bool *IsLoadCSE, int64_t &Offset,
unsigned MaxInstsToScan = DefMaxInstsToScan);

/// Scan backwards to see if we have the value of the given pointer available
Expand Down
34 changes: 33 additions & 1 deletion llvm/lib/Analysis/Loads.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -713,8 +713,31 @@ Value *llvm::findAvailablePtrLoadStore(
return nullptr;
}

/// Check whether the load \p LI can be satisfied from the source of
/// \p MemCpy instead of its destination.
///
/// \p Offset must be initialized to 0 by the caller. On success it holds the
/// byte offset of the load's address from the memcpy destination, and the
/// memcpy itself is returned to signal that \p LI may instead load from
/// MemCpy->getSource() + \p Offset. Returns nullptr if forwarding cannot be
/// proven safe.
static Value *availableMemCpySrc(LoadInst *LI, MemCpyInst *MemCpy,
                                 int64_t &Offset) {
  // Only simple (non-atomic, non-volatile) loads and non-volatile memcpys
  // may be freely rewritten.
  if (!LI->isSimple() || MemCpy->isVolatile())
    return nullptr;
  const DataLayout &DL = LI->getDataLayout();
  TypeSize TS = DL.getTypeStoreSize(LI->getType());
  // Bail out on scalable types: the runtime store size may exceed the known
  // minimum, so a constant copy length cannot be proven to cover the load.
  if (TS.isScalable())
    return nullptr;
  uint64_t Size = TS.getFixedValue();
  if (Size == 0)
    return nullptr;
  Value *OldSrc = LI->getPointerOperand();

  // If the load address is not exactly the memcpy destination, it must lie at
  // a known, non-negative constant byte offset from it.
  if (OldSrc != MemCpy->getDest()) {
    std::optional<int64_t> PointerOffset =
        OldSrc->getPointerOffsetFrom(MemCpy->getDest(), DL);
    if (!PointerOffset || *PointerOffset < 0)
      return nullptr;
    Offset = *PointerOffset;
  }
  // The copy must have a constant length that covers every loaded byte,
  // i.e. [Offset, Offset + Size) is within [0, CopyLen).
  auto *CopyLen = dyn_cast<ConstantInt>(MemCpy->getLength());
  if (!CopyLen ||
      CopyLen->getZExtValue() < Size + static_cast<uint64_t>(Offset))
    return nullptr;
  return MemCpy;
}

Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
bool *IsLoadCSE,
bool *IsLoadCSE, int64_t &Offset,
unsigned MaxInstsToScan) {
const DataLayout &DL = Load->getDataLayout();
Value *StrippedPtr = Load->getPointerOperand()->stripPointerCasts();
Expand All @@ -739,6 +762,9 @@ Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,

Available = getAvailableLoadStore(&Inst, StrippedPtr, AccessTy,
AtLeastAtomic, DL, IsLoadCSE);
if (auto *MemCpy = dyn_cast<MemCpyInst>(&Inst))
Available = availableMemCpySrc(Load, MemCpy, Offset);

if (Available)
break;

Expand All @@ -753,6 +779,12 @@ Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
for (Instruction *Inst : MustNotAliasInsts)
if (isModSet(AA.getModRefInfo(Inst, Loc)))
return nullptr;
if (auto *MemCpy = dyn_cast<MemCpyInst>(Available)) {
MemoryLocation Loc = MemoryLocation::getForSource(MemCpy);
for (Instruction *Inst : MustNotAliasInsts)
if (isModSet(AA.getModRefInfo(Inst, Loc)))
return nullptr;
}
}

return Available;
Expand Down
40 changes: 36 additions & 4 deletions llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1053,13 +1053,45 @@ Instruction *InstCombinerImpl::visitLoadInst(LoadInst &LI) {
// separated by a few arithmetic operations.
bool IsLoadCSE = false;
BatchAAResults BatchAA(*AA);
if (Value *AvailableVal = FindAvailableLoadedValue(&LI, BatchAA, &IsLoadCSE)) {
int64_t Offset = 0;
if (Value *AvailableVal =
FindAvailableLoadedValue(&LI, BatchAA, &IsLoadCSE, Offset)) {
if (IsLoadCSE)
combineMetadataForCSE(cast<LoadInst>(AvailableVal), &LI, false);

return replaceInstUsesWith(
LI, Builder.CreateBitOrPointerCast(AvailableVal, LI.getType(),
LI.getName() + ".cast"));
// Forward loads from memcpy sources: if memcpy A copies X to Y and load B
// reads from Y, rewrite B to load directly from X. This lets later passes
// remove memcpy A or identify the true source of the loaded value.
if (auto *MemCpy = dyn_cast<MemCpyInst>(AvailableVal)) {
Value *NewSrc = MemCpy->getSource();
Value *OldSrc = LI.getPointerOperand();
MaybeAlign NewAlign = MemCpy->getSourceAlign();
if (Offset != 0) {
if (NewAlign.has_value())
NewAlign = commonAlignment(*NewAlign, Offset);
// Avoid increasing instructions
if (isa<Instruction>(OldSrc) && OldSrc->hasOneUse())
NewSrc =
Builder.CreateInBoundsPtrAdd(NewSrc, Builder.getInt64(Offset));
else
NewSrc = nullptr;
}
// Avoid infinite loops
if (NewSrc && !BatchAA.isMustAlias(OldSrc, NewSrc))
AvailableVal = Builder.CreateAlignedLoad(LI.getType(), NewSrc, NewAlign,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some metadata may be safe to propagate.

LI.getName());
else {
AvailableVal = nullptr;
if (NewSrc && NewSrc->use_empty())
cast<Instruction>(NewSrc)->eraseFromParent();
}
} else
AvailableVal = Builder.CreateBitOrPointerCast(AvailableVal, LI.getType(),
LI.getName() + ".cast");

if (AvailableVal)
return replaceInstUsesWith(LI, AvailableVal);
}

// None of the following transforms are legal for volatile/ordered atomic
Expand Down
169 changes: 169 additions & 0 deletions llvm/test/Transforms/InstCombine/memcpy-forward-load.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -passes=instcombine -S | FileCheck %s

; The 3-byte memcpy fully covers the 3-byte i24 load, so the load is
; forwarded to read directly from %src and the alloca becomes dead.
define i24 @forward_load(ptr align 4 %src) {
; CHECK-LABEL: define i24 @forward_load(
; CHECK-SAME: ptr align 4 [[SRC:%.*]]) {
; CHECK-NEXT: [[VAL1:%.*]] = load i24, ptr [[SRC]], align 4
; CHECK-NEXT: ret i24 [[VAL1]]
;
%dest = alloca [3 x i8]
call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false)
%val = load i24, ptr %dest
ret i24 %val
}

; A load at dest+2 lies within the 3-byte copy, so it is forwarded to a new
; gep src+2 (the old gep had a single use, so no instruction count increase).
define i8 @forward_load_gep(ptr %src) {
; CHECK-LABEL: define i8 @forward_load_gep(
; CHECK-SAME: ptr [[SRC:%.*]]) {
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC]], i64 2
; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr [[GEP]], align 1
; CHECK-NEXT: ret i8 [[VAL]]
;
%dest = alloca [3 x i8]
call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false)
%gep = getelementptr inbounds i8, ptr %dest, i64 2
%val = load i8, ptr %gep
ret i8 %val
}

; i17 has a 3-byte store size, so the 3-byte copy covers the load even though
; the type carries padding bits.
define i17 @forward_load_padding(ptr %src) {
; CHECK-LABEL: define i17 @forward_load_padding(
; CHECK-SAME: ptr [[SRC:%.*]]) {
; CHECK-NEXT: [[VAL:%.*]] = load i17, ptr [[SRC]], align 1
; CHECK-NEXT: ret i17 [[VAL]]
;
%dest = alloca [5 x i8]
call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false)
%val = load i17, ptr %dest
ret i17 %val
}

; A fixed-width vector load is forwarded like any other type when the copy
; length covers its 2-byte store size.
define <2 x i8> @forward_load_vector(ptr %src) {
; CHECK-LABEL: define <2 x i8> @forward_load_vector(
; CHECK-SAME: ptr [[SRC:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i8>, ptr [[SRC]], align 1
; CHECK-NEXT: ret <2 x i8> [[TMP1]]
;
%dest = alloca <2 x i8>
call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 2, i1 false)
%val = load <2 x i8>, ptr %dest
ret <2 x i8> %val
}

; Negative tests

; Negative: a volatile load must not be rewritten, so the memcpy and load are
; both kept.
define i24 @forward_load_volatile(ptr %src) {
; CHECK-LABEL: define i24 @forward_load_volatile(
; CHECK-SAME: ptr [[SRC:%.*]]) {
; CHECK-NEXT: [[DEST:%.*]] = alloca [3 x i8], align 1
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(3) [[DEST]], ptr noundef nonnull align 1 dereferenceable(3) [[SRC]], i64 3, i1 false)
; CHECK-NEXT: [[VAL:%.*]] = load volatile i24, ptr [[DEST]], align 4
; CHECK-NEXT: ret i24 [[VAL]]
;
%dest = alloca [3 x i8]
call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false)
%val = load volatile i24, ptr %dest
ret i24 %val
}

; Negative: the store to %src between the memcpy and the load may change the
; copied-from memory, so the load cannot be forwarded to the source.
define i24 @failed_forward_load_write_src(ptr %src) {
; CHECK-LABEL: define i24 @failed_forward_load_write_src(
; CHECK-SAME: ptr [[SRC:%.*]]) {
; CHECK-NEXT: [[DEST:%.*]] = alloca [3 x i8], align 1
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(3) [[DEST]], ptr noundef nonnull align 1 dereferenceable(3) [[SRC]], i64 3, i1 false)
; CHECK-NEXT: store i1 true, ptr [[SRC]], align 1
; CHECK-NEXT: [[VAL:%.*]] = load i24, ptr [[DEST]], align 4
; CHECK-NEXT: ret i24 [[VAL]]
;
%dest = alloca [3 x i8]
call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false)
store i1 true, ptr %src
%val = load i24, ptr %dest
ret i24 %val
}

; Negative: the store to %dest clobbers the copied bytes, so the memcpy no
; longer provides the loaded value.
define i24 @failed_forward_load_write_dest(ptr %src) {
; CHECK-LABEL: define i24 @failed_forward_load_write_dest(
; CHECK-SAME: ptr [[SRC:%.*]]) {
; CHECK-NEXT: [[DEST:%.*]] = alloca [3 x i8], align 1
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(3) [[DEST]], ptr noundef nonnull align 1 dereferenceable(3) [[SRC]], i64 3, i1 false)
; CHECK-NEXT: store i1 true, ptr [[DEST]], align 1
; CHECK-NEXT: [[VAL:%.*]] = load i24, ptr [[DEST]], align 4
; CHECK-NEXT: ret i24 [[VAL]]
;
%dest = alloca [3 x i8]
call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false)
store i1 true, ptr %dest
%val = load i24, ptr %dest
ret i24 %val
}

; Negative: only 1 byte is copied but the i16 load reads 2 bytes, so the copy
; does not cover the load (the memcpy itself is still shrunk to a load/store).
define i16 @failed_forward_load_size(ptr %src) {
; CHECK-LABEL: define i16 @failed_forward_load_size(
; CHECK-SAME: ptr [[SRC:%.*]]) {
; CHECK-NEXT: [[DEST:%.*]] = alloca [3 x i8], align 1
; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[SRC]], align 1
; CHECK-NEXT: store i8 [[TMP1]], ptr [[DEST]], align 1
; CHECK-NEXT: [[VAL:%.*]] = load i16, ptr [[DEST]], align 2
; CHECK-NEXT: ret i16 [[VAL]]
;
%dest = alloca [3 x i8]
call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 1, i1 false)
%val = load i16, ptr %dest
ret i16 %val
}

; Negative: the load reads byte 2 but the copy only writes bytes 0..1, so the
; loaded byte was never produced by the memcpy.
define i8 @failed_forward_load_gep(ptr %src) {
; CHECK-LABEL: define i8 @failed_forward_load_gep(
; CHECK-SAME: ptr [[SRC:%.*]]) {
; CHECK-NEXT: [[DEST:%.*]] = alloca [3 x i8], align 1
; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[SRC]], align 1
; CHECK-NEXT: store i16 [[TMP1]], ptr [[DEST]], align 1
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[DEST]], i64 2
; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr [[GEP]], align 1
; CHECK-NEXT: ret i8 [[VAL]]
;
%dest = alloca [3 x i8]
call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 2, i1 false)
%gep = getelementptr inbounds i8, ptr %dest, i64 2
%val = load i8, ptr %gep
ret i8 %val
}

; Negative: the gep has another use (@use_ptr), so rewriting the load would
; require materializing an extra gep — the transform avoids growing the IR.
define i8 @failed_forward_load_gep_multi_use(ptr %src) {
; CHECK-LABEL: define i8 @failed_forward_load_gep_multi_use(
; CHECK-SAME: ptr [[SRC:%.*]]) {
; CHECK-NEXT: [[DEST:%.*]] = alloca [3 x i8], align 1
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(3) [[DEST]], ptr noundef nonnull align 1 dereferenceable(3) [[SRC]], i64 3, i1 false)
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[DEST]], i64 2
; CHECK-NEXT: [[VAL1:%.*]] = load i8, ptr [[GEP]], align 1
; CHECK-NEXT: call void @use_ptr(ptr nonnull [[GEP]])
; CHECK-NEXT: ret i8 [[VAL1]]
;
%dest = alloca [3 x i8]
call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false)
%gep = getelementptr inbounds i8, ptr %dest, i64 2
%val = load i8, ptr %gep
call void @use_ptr(ptr %gep)
ret i8 %val
}

; Negative: source and destination are must-alias (same pointer), so
; "forwarding" would rewrite the load to the identical address and make
; InstCombine loop forever — the transform must bail out.
define i24 @failed_forward_load_must_alias(ptr %src) {
; CHECK-LABEL: define i24 @failed_forward_load_must_alias(
; CHECK-SAME: ptr [[SRC:%.*]]) {
; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC]], i64 2
; CHECK-NEXT: [[DEST_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC]], i64 2
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(3) [[DEST_GEP]], ptr noundef nonnull align 1 dereferenceable(3) [[SRC_GEP]], i64 3, i1 false)
; CHECK-NEXT: [[VAL:%.*]] = load i24, ptr [[DEST_GEP]], align 4
; CHECK-NEXT: ret i24 [[VAL]]
;
%src_gep = getelementptr inbounds i8, ptr %src, i64 2
%dest_gep = getelementptr inbounds i8, ptr %src, i64 2
call void @llvm.memcpy.p0.p0.i64(ptr %dest_gep, ptr %src_gep, i64 3, i1 false)
%val = load i24, ptr %dest_gep
ret i24 %val
}

declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1)
declare void @use_ptr(ptr)
6 changes: 2 additions & 4 deletions llvm/test/Transforms/InstCombine/ptr-replace-alloca.ll
Original file line number Diff line number Diff line change
Expand Up @@ -208,8 +208,7 @@ define i32 @test_memcpy_after_phi(i1 %cond, ptr %ptr) {
; CHECK: join:
; CHECK-NEXT: [[PHI:%.*]] = phi ptr [ [[A]], [[IF]] ], [ [[PTR:%.*]], [[ENTRY:%.*]] ]
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(32) [[PHI]], ptr noundef nonnull align 16 dereferenceable(32) @g1, i64 32, i1 false)
; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[PHI]], align 4
; CHECK-NEXT: ret i32 [[V]]
; CHECK-NEXT: ret i32 0
;
entry:
%a = alloca [32 x i8]
Expand Down Expand Up @@ -384,8 +383,7 @@ define i8 @select_after_memcpy_keep_alloca(i1 %cond, ptr %p) {
; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [32 x i8], align 1
; CHECK-NEXT: [[PTR:%.*]] = select i1 [[COND:%.*]], ptr [[ALLOCA]], ptr [[P:%.*]]
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(32) [[PTR]], ptr noundef nonnull align 16 dereferenceable(32) @g1, i64 32, i1 false)
; CHECK-NEXT: [[LOAD:%.*]] = load i8, ptr [[PTR]], align 1
; CHECK-NEXT: ret i8 [[LOAD]]
; CHECK-NEXT: ret i8 0
;
entry:
%alloca = alloca [32 x i8]
Expand Down
47 changes: 47 additions & 0 deletions llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -O2 -S < %s | FileCheck %s

; End-to-end check (pr137810): with memcpy forwarding the loads from the
; alloca %i1 read directly from %i2's stored bytes, every icmp folds to true,
; the branch and the alloca disappear, and @main returns a constant true.
define i1 @main(ptr %i2) {
; CHECK-LABEL: define noundef i1 @main(
; CHECK-SAME: ptr writeonly captures(none) initializes((0, 3)) [[I2:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[COMMON_RET:.*:]]
; CHECK-NEXT: store i8 0, ptr [[I2]], align 1
; CHECK-NEXT: [[I3:%.*]] = getelementptr inbounds nuw i8, ptr [[I2]], i64 1
; CHECK-NEXT: store i8 1, ptr [[I3]], align 1
; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds nuw i8, ptr [[I2]], i64 2
; CHECK-NEXT: store i8 2, ptr [[I4]], align 1
; CHECK-NEXT: ret i1 true
;
%i1 = alloca [3 x i8], align 1
store i8 0, ptr %i2, align 1
%i3 = getelementptr inbounds nuw i8, ptr %i2, i64 1
store i8 1, ptr %i3, align 1
%i4 = getelementptr inbounds nuw i8, ptr %i2, i64 2
store i8 2, ptr %i4, align 1
call void @llvm.lifetime.start.p0(i64 3, ptr nonnull %i1)
call void @llvm.memcpy.p0.p0.i64(ptr %i1, ptr %i2, i64 3, i1 false)
%i5 = load i8, ptr %i1, align 1
%i6 = icmp eq i8 %i5, 0
%i7 = getelementptr inbounds nuw i8, ptr %i1, i64 1
%i8 = load i8, ptr %i7, align 1
%i9 = icmp eq i8 %i8, 1
%i10 = select i1 %i6, i1 %i9, i1 false
%i11 = getelementptr inbounds nuw i8, ptr %i1, i64 2
%i12 = load i8, ptr %i11, align 1
%i13 = icmp eq i8 %i12, 2
%i14 = select i1 %i10, i1 %i13, i1 false
br i1 %i14, label %true, label %false

true:
call void @llvm.lifetime.end.p0(i64 3, ptr nonnull %i1)
ret i1 true

false:
call void @assert_failed(ptr %i1)
ret i1 false
}

declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1)
declare void @llvm.lifetime.start.p0(i64, ptr)
declare void @llvm.lifetime.end.p0(i64, ptr)
declare void @assert_failed(ptr)
Loading