[LowerMemIntrinsics] Respect the volatile argument of llvm.memmove #97545
Conversation
So far, we ignored whether a memmove intrinsic is volatile when lowering it to loops in the IR. This change generates volatile loads and stores in that case (similar to how memcpy is handled) and adds tests for volatile memmove and memcpy.
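For reference, the volatile flag is the final i1 argument of the intrinsic call. A minimal sketch of emitting such a call through IRBuilder (illustrative only, not part of this patch; the helper name emitVolatileMemMove is made up for this example):

#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Emits a 64-byte volatile memmove; the trailing 'true' becomes the i1
// volatile argument that the lowering changed by this patch now honors.
static void emitVolatileMemMove(IRBuilder<> &B, Value *Dst, Value *Src) {
  B.CreateMemMove(Dst, /*DstAlign=*/Align(1), Src, /*SrcAlign=*/Align(1),
                  /*Size=*/B.getInt64(64), /*isVolatile=*/true);
}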
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-transforms
Author: Fabian Ritter (ritter-x2a)
Full diff: https://github.com/llvm/llvm-project/pull/97545.diff
2 Files Affected:
diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index d84e9f094e03a..d2814f07530d8 100644
--- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -422,10 +422,10 @@ static void createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
LoopPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_ptr");
Value *Element = LoopBuilder.CreateAlignedLoad(
EltTy, LoopBuilder.CreateInBoundsGEP(EltTy, SrcAddr, IndexPtr),
- PartSrcAlign, "element");
+ PartSrcAlign, SrcIsVolatile, "element");
LoopBuilder.CreateAlignedStore(
Element, LoopBuilder.CreateInBoundsGEP(EltTy, DstAddr, IndexPtr),
- PartDstAlign);
+ PartDstAlign, DstIsVolatile);
LoopBuilder.CreateCondBr(
LoopBuilder.CreateICmpEQ(IndexPtr, ConstantInt::get(TypeOfCopyLen, 0)),
ExitBB, LoopBB);
@@ -440,10 +440,11 @@ static void createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
IRBuilder<> FwdLoopBuilder(FwdLoopBB);
PHINode *FwdCopyPhi = FwdLoopBuilder.CreatePHI(TypeOfCopyLen, 0, "index_ptr");
Value *SrcGEP = FwdLoopBuilder.CreateInBoundsGEP(EltTy, SrcAddr, FwdCopyPhi);
- Value *FwdElement =
- FwdLoopBuilder.CreateAlignedLoad(EltTy, SrcGEP, PartSrcAlign, "element");
+ Value *FwdElement = FwdLoopBuilder.CreateAlignedLoad(
+ EltTy, SrcGEP, PartSrcAlign, SrcIsVolatile, "element");
Value *DstGEP = FwdLoopBuilder.CreateInBoundsGEP(EltTy, DstAddr, FwdCopyPhi);
- FwdLoopBuilder.CreateAlignedStore(FwdElement, DstGEP, PartDstAlign);
+ FwdLoopBuilder.CreateAlignedStore(FwdElement, DstGEP, PartDstAlign,
+ DstIsVolatile);
Value *FwdIndexPtr = FwdLoopBuilder.CreateAdd(
FwdCopyPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_increment");
FwdLoopBuilder.CreateCondBr(FwdLoopBuilder.CreateICmpEQ(FwdIndexPtr, CopyLen),
diff --git a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
index 1c4e4b8602ff8..d53db69f9f2e0 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
@@ -1776,6 +1776,67 @@ entry:
ret void
}
+define amdgpu_kernel void @memmove_volatile(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 {
+; MAX1024-LABEL: @memmove_volatile(
+; MAX1024-NEXT: call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]], i64 64, i1 true)
+; MAX1024-NEXT: ret void
+;
+; ALL-LABEL: @memmove_volatile(
+; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[DST:%.*]]
+; ALL-NEXT: [[COMPARE_N_TO_0:%.*]] = icmp eq i64 64, 0
+; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[COPY_BACKWARDS:%.*]], label [[COPY_FORWARD:%.*]]
+; ALL: copy_backwards:
+; ALL-NEXT: br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE:%.*]], label [[COPY_BACKWARDS_LOOP:%.*]]
+; ALL: copy_backwards_loop:
+; ALL-NEXT: [[TMP1:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 64, [[COPY_BACKWARDS]] ]
+; ALL-NEXT: [[INDEX_PTR]] = sub i64 [[TMP1]], 1
+; ALL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[INDEX_PTR]]
+; ALL-NEXT: [[ELEMENT:%.*]] = load volatile i8, ptr addrspace(1) [[TMP2]], align 1
+; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[INDEX_PTR]]
+; ALL-NEXT: store volatile i8 [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1
+; ALL-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_PTR]], 0
+; ALL-NEXT: br i1 [[TMP4]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]]
+; ALL: copy_forward:
+; ALL-NEXT: br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP:%.*]]
+; ALL: copy_forward_loop:
+; ALL-NEXT: [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ]
+; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[INDEX_PTR1]]
+; ALL-NEXT: [[ELEMENT2:%.*]] = load volatile i8, ptr addrspace(1) [[TMP5]], align 1
+; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[INDEX_PTR1]]
+; ALL-NEXT: store volatile i8 [[ELEMENT2]], ptr addrspace(1) [[TMP6]], align 1
+; ALL-NEXT: [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1
+; ALL-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 64
+; ALL-NEXT: br i1 [[TMP7]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]]
+; ALL: memmove_done:
+; ALL-NEXT: ret void
+;
+ call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 64, i1 true)
+ ret void
+}
+
+define amdgpu_kernel void @memcpy_volatile(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 {
+; MAX1024-LABEL: @memcpy_volatile(
+; MAX1024-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]], i64 64, i1 true)
+; MAX1024-NEXT: ret void
+;
+; ALL-LABEL: @memcpy_volatile(
+; ALL-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
+; ALL: load-store-loop:
+; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
+; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
+; ALL-NEXT: [[TMP2:%.*]] = load volatile <4 x i32>, ptr addrspace(1) [[TMP1]], align 1
+; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
+; ALL-NEXT: store volatile <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 1
+; ALL-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 1
+; ALL-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 4
+; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
+; ALL: memcpy-split:
+; ALL-NEXT: ret void
+;
+ call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 64, i1 true)
+ ret void
+}
+
declare i64 @llvm.umin.i64(i64, i64)
attributes #0 = { nounwind }
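On the implementation side, the change simply forwards the intrinsic's volatile flags into the isVolatile parameters that IRBuilder::CreateAlignedLoad and IRBuilder::CreateAlignedStore already accept. A minimal sketch of that pattern (illustrative only; the helper name copyElementMaybeVolatile and its parameters are made up for this example):

#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Copies one element from Src to Dst, marking both accesses volatile when
// IsVolatile is set; this is the same pattern the patch applies inside the
// backward and forward memmove copy loops.
static void copyElementMaybeVolatile(IRBuilder<> &B, Type *EltTy, Value *Src,
                                     Value *Dst, Align A, bool IsVolatile) {
  Value *Elt = B.CreateAlignedLoad(EltTy, Src, A, IsVolatile, "element");
  B.CreateAlignedStore(Elt, Dst, A, IsVolatile);
}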