-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[SROA] Vector promote some memsets #133301
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-amdgpu Author: None (macurtis-amd) Changes — Patch is 32.26 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/133301.diff 7 Files Affected:
diff --git a/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl b/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl
index a0c106bca83c9..927cb3f38fa9c 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl
@@ -503,21 +503,19 @@ void cast_bool_generic(generic char* p) {
*p = 0;
}
-// Test initialize a struct using memset.
-// For large structures which is mostly zero, clang generats llvm.memset for
-// the zero part and store for non-zero members.
+// Test initialization of a struct with a private member.
typedef struct {
long a, b, c, d;
private char *p;
} StructTy3;
-// CHECK-LABEL: test_memset_private
-// CHECK: call void @llvm.memset.p5.i64(ptr addrspace(5) noundef align 8 {{.*}}, i8 0, i64 32, i1 false)
+// CHECK-LABEL: test_struct_private_member
+// CHECK: store <32 x i8> zeroinitializer, ptr addrspace(5) {{.*}}, align 8
// CHECK: [[GEP:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) %ptr, i32 32
// CHECK: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(5) [[GEP]]
// CHECK: [[GEP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) {{.*}}, i32 36
// CHECK: store i32 0, ptr addrspace(5) [[GEP1]], align 4
-void test_memset_private(private StructTy3 *ptr) {
+void test_struct_private_member(private StructTy3 *ptr) {
StructTy3 S3 = {0, 0, 0, 0, 0};
*ptr = S3;
}
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 86be20c799a68..3ded637a5c63b 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -1011,6 +1011,26 @@ static Value *foldPHINodeOrSelectInst(Instruction &I) {
return foldSelectInst(cast<SelectInst>(I));
}
+/// Returns a fixed vector type equivalent to the memory set by II or nullptr if
+/// unable to do so.
+static FixedVectorType *getVectorTypeFor(const MemSetInst &II,
+ const DataLayout &DL) {
+ const ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
+ if (!Length)
+ return nullptr;
+
+ APInt Val = Length->getValue();
+ if (Val.ugt(std::numeric_limits<unsigned>::max()))
+ return nullptr;
+
+ auto *VTy =
+ FixedVectorType::get(II.getValue()->getType(), Val.getZExtValue());
+ if (DL.getTypeStoreSizeInBits(VTy) != DL.getTypeAllocSizeInBits(VTy))
+ return nullptr;
+
+ return VTy;
+}
+
/// Builder for the alloca slices.
///
/// This class builds a set of alloca slices by recursively visiting the uses
@@ -1099,15 +1119,16 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
return Base::visitGetElementPtrInst(GEPI);
}
+ bool isSplittableMemOp(Type *Ty, bool IsVolatile) {
+ return Ty->isIntegerTy() && !IsVolatile && DL.typeSizeEqualsStoreSize(Ty);
+ }
+
void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
uint64_t Size, bool IsVolatile) {
// We allow splitting of non-volatile loads and stores where the type is an
// integer type. These may be used to implement 'memcpy' or other "transfer
// of bits" patterns.
- bool IsSplittable =
- Ty->isIntegerTy() && !IsVolatile && DL.typeSizeEqualsStoreSize(Ty);
-
- insertUse(I, Offset, Size, IsSplittable);
+ insertUse(I, Offset, Size, isSplittableMemOp(Ty, IsVolatile));
}
void visitLoadInst(LoadInst &LI) {
@@ -1170,10 +1191,23 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
if (!IsOffsetKnown)
return PI.setAborted(&II);
+ auto IsSplittable = [&]() {
+ FixedVectorType *VTy = getVectorTypeFor(II, DL);
+ Type *ATy = AS.AI.getAllocatedType();
+
+ if (!Length)
+ return false;
+ if (!VTy)
+ return true;
+ if (DL.getTypeAllocSize(VTy) != DL.getTypeAllocSize(ATy))
+ return true;
+ return isSplittableMemOp(ATy, II.isVolatile());
+ };
+
insertUse(II, Offset,
Length ? Length->getLimitedValue()
: AllocSize - Offset.getLimitedValue(),
- (bool)Length);
+ IsSplittable());
}
void visitMemTransferInst(MemTransferInst &II) {
@@ -2072,8 +2106,20 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
if (MI->isVolatile())
return false;
- if (!S.isSplittable())
- return false; // Skip any unsplittable intrinsics.
+
+ auto *II = dyn_cast<MemSetInst>(U->getUser());
+ if (!II && !S.isSplittable()) {
+ // Skip any non-memset unsplittable intrinsics.
+ return false;
+ }
+ if (II) {
+ // For memset, allow if we have a suitable vector type
+ Type *VTy = getVectorTypeFor(*II, DL);
+ if (!VTy)
+ return false;
+ if (!canConvertValue(DL, SliceTy, VTy))
+ return false;
+ }
} else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
return false;
@@ -2316,12 +2362,15 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
// Put load and store types into a set for de-duplication.
for (const Slice &S : P) {
- Type *Ty;
+ Type *Ty = nullptr;
if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
Ty = LI->getType();
else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
Ty = SI->getValueOperand()->getType();
- else
+ else if (auto *II = dyn_cast<MemSetInst>(S.getUse()->getUser()))
+ Ty = getVectorTypeFor(*II, DL);
+
+ if (!Ty)
continue;
auto CandTy = Ty->getScalarType();
diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/user-memcpy.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/user-memcpy.ll
index af3070511e345..a9d0b10586583 100644
--- a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/user-memcpy.ll
+++ b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/user-memcpy.ll
@@ -21,8 +21,7 @@
;; Allocas have been promoted - the linked dbg.assigns have been removed.
;; | V3i point = {0, 0, 0};
-; CHECK-NEXT: #dbg_value(i64 0, ![[point:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 64),
-; CHECK-NEXT: #dbg_value(i64 0, ![[point]], !DIExpression(DW_OP_LLVM_fragment, 64, 64),
+; CHECK-NEXT: #dbg_value(<16 x i8> zeroinitializer, ![[point:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 128),
;; point.z = 5000;
; CHECK-NEXT: #dbg_value(i64 5000, ![[point]], !DIExpression(DW_OP_LLVM_fragment, 128, 64),
@@ -32,17 +31,20 @@
;; local.other.x = global.other.x
;; local.other.y = global.other.y
;; local.other.z = global.other.z
-; CHECK-NEXT: %other.sroa.0.0.copyload = load i64, ptr @__const._Z3funv.other
+; CHECK-NEXT: %other.sroa.0.0.copyload = load <8 x i8>, ptr @__const._Z3funv.other
; CHECK-NEXT: %other.sroa.2.0.copyload = load i64, ptr getelementptr inbounds (i8, ptr @__const._Z3funv.other, i64 8)
; CHECK-NEXT: %other.sroa.3.0.copyload = load i64, ptr getelementptr inbounds (i8, ptr @__const._Z3funv.other, i64 16)
-; CHECK-NEXT: #dbg_value(i64 %other.sroa.0.0.copyload, ![[other:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 64),
+; CHECK-NEXT: #dbg_value(<8 x i8> %other.sroa.0.0.copyload, ![[other:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 64),
; CHECK-NEXT: #dbg_value(i64 %other.sroa.2.0.copyload, ![[other]], !DIExpression(DW_OP_LLVM_fragment, 64, 64),
; CHECK-NEXT: #dbg_value(i64 %other.sroa.3.0.copyload, ![[other]], !DIExpression(DW_OP_LLVM_fragment, 128, 64),
;; | std::memcpy(&point.y, &other.x, sizeof(long) * 2);
;; other is now 3 scalars:
;; point.y = other.x
-; CHECK-NEXT: #dbg_value(i64 %other.sroa.0.0.copyload, ![[point]], !DIExpression(DW_OP_LLVM_fragment, 64, 64),
+; CHECK-NEXT: %point.sroa.0.sroa.0.8.vec.expand = shufflevector <8 x i8> %other.sroa.0.0.copyload, <8 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>,
+; CHECK-NEXT: %point.sroa.0.sroa.0.8.vecblend = select <16 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> %point.sroa.0.sroa.0.8.vec.expand, <16 x i8> zeroinitializer,
+; CHECK-NEXT: #dbg_value(<16 x i8> %point.sroa.0.sroa.0.8.vecblend, ![[point]], !DIExpression(DW_OP_LLVM_fragment, 64, 64),
+
;;
;; point.z = other.y
; CHECK-NEXT: #dbg_value(i64 %other.sroa.2.0.copyload, ![[point]], !DIExpression(DW_OP_LLVM_fragment, 128, 64),
diff --git a/llvm/test/DebugInfo/X86/sroasplit-5.ll b/llvm/test/DebugInfo/X86/sroasplit-5.ll
index 34aa30f55728e..d2ecc9598e3c4 100644
--- a/llvm/test/DebugInfo/X86/sroasplit-5.ll
+++ b/llvm/test/DebugInfo/X86/sroasplit-5.ll
@@ -21,10 +21,9 @@ target triple = "x86_64-unknown-linux-gnu"
;
; There should be no debug info for the padding.
; CHECK-NOT: DW_OP_LLVM_fragment, 56
-; CHECK: DIExpression(DW_OP_LLVM_fragment, 0, 32)
-; CHECK-NOT: DW_OP_LLVM_fragment, 56
-; CHECK: DIExpression(DW_OP_LLVM_fragment, 32, 24)
+; CHECK: ![[a:[0-9]+]], !DIExpression(),
; CHECK-NOT: DW_OP_LLVM_fragment, 56
+; CHECK: ![[a]] = !DILocalVariable(name: "a",
%struct.prog_src_register = type { i32, i24 }
; Function Attrs: nounwind
diff --git a/llvm/test/Transforms/SROA/basictest.ll b/llvm/test/Transforms/SROA/basictest.ll
index 145da5259fab3..03590bbce146a 100644
--- a/llvm/test/Transforms/SROA/basictest.ll
+++ b/llvm/test/Transforms/SROA/basictest.ll
@@ -529,8 +529,9 @@ entry:
define ptr @test10() {
; CHECK-LABEL: @test10(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr null to i64
-; CHECK-NEXT: ret ptr null
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> zeroinitializer to i64
+; CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr
+; CHECK-NEXT: ret ptr [[TMP1]]
;
entry:
%a = alloca [8 x i8]
@@ -1075,26 +1076,13 @@ define void @PR14059.1(ptr %d) {
;
; CHECK-LABEL: @PR14059.1(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast double undef to i64
-; CHECK-NEXT: [[X_SROA_0_I_0_INSERT_MASK:%.*]] = and i64 [[TMP0]], -4294967296
-; CHECK-NEXT: [[X_SROA_0_I_0_INSERT_INSERT:%.*]] = or i64 [[X_SROA_0_I_0_INSERT_MASK]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[X_SROA_0_I_0_INSERT_INSERT]] to double
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast double [[TMP1]] to i64
-; CHECK-NEXT: [[X_SROA_0_I_2_INSERT_MASK:%.*]] = and i64 [[TMP2]], -281474976645121
-; CHECK-NEXT: [[X_SROA_0_I_2_INSERT_INSERT:%.*]] = or i64 [[X_SROA_0_I_2_INSERT_MASK]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[X_SROA_0_I_2_INSERT_INSERT]] to double
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast double [[TMP3]] to i64
-; CHECK-NEXT: [[X_SROA_0_I_4_COPYLOAD:%.*]] = load i32, ptr [[D:%.*]], align 1
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double 0.000000e+00 to i64
-; CHECK-NEXT: [[X_SROA_0_I_4_INSERT_EXT:%.*]] = zext i32 [[X_SROA_0_I_4_COPYLOAD]] to i64
-; CHECK-NEXT: [[X_SROA_0_I_4_INSERT_SHIFT:%.*]] = shl i64 [[X_SROA_0_I_4_INSERT_EXT]], 32
-; CHECK-NEXT: [[X_SROA_0_I_4_INSERT_MASK3:%.*]] = and i64 [[TMP5]], 4294967295
-; CHECK-NEXT: [[X_SROA_0_I_4_INSERT_INSERT4:%.*]] = or i64 [[X_SROA_0_I_4_INSERT_MASK3]], [[X_SROA_0_I_4_INSERT_SHIFT]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[X_SROA_0_I_4_INSERT_INSERT4]] to double
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast double [[TMP6]] to i64
-; CHECK-NEXT: [[X_SROA_0_I_4_INSERT_MASK:%.*]] = and i64 [[TMP7]], 4294967295
-; CHECK-NEXT: [[X_SROA_0_I_4_INSERT_INSERT:%.*]] = or i64 [[X_SROA_0_I_4_INSERT_MASK]], 4607182418800017408
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[X_SROA_0_I_4_INSERT_INSERT]] to double
+; CHECK-NEXT: [[X_SROA_0_I_SROA_0_0_VECBLEND:%.*]] = select <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x i8> <i8 0, i8 0, i8 0, i8 0, i8 undef, i8 undef, i8 undef, i8 undef>, <8 x i8> undef
+; CHECK-NEXT: [[X_SROA_0_I_SROA_0_2_VECBLEND:%.*]] = select <8 x i1> <i1 false, i1 false, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false>, <8 x i8> <i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0, i8 undef, i8 undef>, <8 x i8> [[X_SROA_0_I_SROA_0_0_VECBLEND]]
+; CHECK-NEXT: [[X_SROA_0_I_SROA_0_4_COPYLOAD:%.*]] = load <4 x i8>, ptr [[D:%.*]], align 1
+; CHECK-NEXT: [[X_SROA_0_I_SROA_0_4_VEC_EXPAND:%.*]] = shufflevector <4 x i8> [[X_SROA_0_I_SROA_0_4_COPYLOAD]], <4 x i8> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[X_SROA_0_I_SROA_0_4_VECBLEND2:%.*]] = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <8 x i8> [[X_SROA_0_I_SROA_0_4_VEC_EXPAND]], <8 x i8> zeroinitializer
+; CHECK-NEXT: [[X_SROA_0_I_SROA_0_4_VECBLEND:%.*]] = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <8 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 extractelement (<4 x i8> bitcast (<1 x i32> splat (i32 1072693248) to <4 x i8>), i32 0), i8 extractelement (<4 x i8> bitcast (<1 x i32> splat (i32 1072693248) to <4 x i8>), i32 1), i8 extractelement (<4 x i8> bitcast (<1 x i32> splat (i32 1072693248) to <4 x i8>), i32 2), i8 extractelement (<4 x i8> bitcast (<1 x i32> splat (i32 1072693248) to <4 x i8>), i32 3)>, <8 x i8> [[X_SROA_0_I_SROA_0_4_VECBLEND2]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[X_SROA_0_I_SROA_0_4_VECBLEND]] to double
; CHECK-NEXT: [[ACCUM_REAL_I:%.*]] = load double, ptr [[D]], align 8
; CHECK-NEXT: [[ADD_R_I:%.*]] = fadd double [[ACCUM_REAL_I]], [[TMP8]]
; CHECK-NEXT: store double [[ADD_R_I]], ptr [[D]], align 8
@@ -1332,10 +1320,10 @@ define void @PR15674(ptr %data, ptr %src, i32 %size) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP_SROA_0:%.*]] = alloca i32, align 4
; CHECK-NEXT: switch i32 [[SIZE:%.*]], label [[END:%.*]] [
-; CHECK-NEXT: i32 4, label [[BB4:%.*]]
-; CHECK-NEXT: i32 3, label [[BB3:%.*]]
-; CHECK-NEXT: i32 2, label [[BB2:%.*]]
-; CHECK-NEXT: i32 1, label [[BB1:%.*]]
+; CHECK-NEXT: i32 4, label [[BB4:%.*]]
+; CHECK-NEXT: i32 3, label [[BB3:%.*]]
+; CHECK-NEXT: i32 2, label [[BB2:%.*]]
+; CHECK-NEXT: i32 1, label [[BB1:%.*]]
; CHECK-NEXT: ]
; CHECK: bb4:
; CHECK-NEXT: [[SRC_GEP3:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i32 3
diff --git a/llvm/test/Transforms/SROA/slice-width.ll b/llvm/test/Transforms/SROA/slice-width.ll
index eabb6978c9125..63362534ff812 100644
--- a/llvm/test/Transforms/SROA/slice-width.ll
+++ b/llvm/test/Transforms/SROA/slice-width.ll
@@ -68,9 +68,8 @@ define void @memcpy_fp80_padding() {
define void @memset_fp80_padding() {
; CHECK-LABEL: @memset_fp80_padding(
-; CHECK-NEXT: [[X_SROA_0:%.*]] = alloca x86_fp80, align 16
-; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 16 [[X_SROA_0]], i8 -1, i32 16, i1 false)
-; CHECK-NEXT: store i64 -1, ptr @i64_sink, align 4
+; CHECK-NEXT: [[X_SROA_0_16_VEC_EXTRACT:%.*]] = extractelement <4 x i64> splat (i64 -1), i32 2
+; CHECK-NEXT: store i64 [[X_SROA_0_16_VEC_EXTRACT]], ptr @i64_sink, align 4
; CHECK-NEXT: ret void
;
%x = alloca %union.Foo
diff --git a/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll b/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll
index 72014912edd20..62df5121215bf 100644
--- a/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll
+++ b/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll
@@ -245,25 +245,31 @@ bb:
define amdgpu_kernel void @test_half_array() #0 {
; CHECK-LABEL: @test_half_array(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0:%.*]] = alloca float, align 16
-; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_4:%.*]] = alloca float, align 4
-; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 16 [[B_BLOCKWISE_COPY_SROA_0]], i8 0, i32 4, i1 false)
-; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 4 [[B_BLOCKWISE_COPY_SROA_4]], i8 0, i32 4, i1 false)
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast float undef to i32
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast float undef to i32
; CHECK-NEXT: [[DATA:%.*]] = load [4 x float], ptr undef, align 4
; CHECK-NEXT: [[DATA_FCA_0_EXTRACT:%.*]] = extractvalue [4 x float] [[DATA]], 0
-; CHECK-NEXT: store float [[DATA_FCA_0_EXTRACT]], ptr [[B_BLOCKWISE_COPY_SROA_0]], align 16
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[DATA_FCA_0_EXTRACT]] to <2 x i16>
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXPAND:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_0_VECBLEND:%.*]] = select <8 x i1> <i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXPAND]], <8 x i16> zeroinitializer
; CHECK-NEXT: [[DATA_FCA_1_EXTRACT:%.*]] = extractvalue [4 x float] [[DATA]], 1
-; CHECK-NEXT: store float [[DATA_FCA_1_EXTRACT]], ptr [[B_BLOCKWISE_COPY_SROA_4]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[DATA_FCA_1_EXTRACT]] to <2 x i16>
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXPAND:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_4_VECBLEND:%.*]] = select <8 x i1> <i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXPAND]], <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_0_VECBLEND]]
; CHECK-NEXT: [[DATA_FCA_2_EXTRACT:%.*]] = extractvalue [4 x float] [[DATA]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float [[DATA_FCA_2_EXTRACT]] to <2 x i16>
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_8_VEC_EXPAND:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_8_VECBLEND:%.*]] = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false>, <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_8_VEC_EXPAND]], <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_4_VECBLEND]]
; CHECK-NEXT: [[DATA_FCA_3_EXTRACT:%.*]] = extractvalue [4 x float] [[DATA]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float [[DATA_FCA_3_EXTRACT]] to <2 x i16>
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_12_VEC_EXPAND:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1>
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_12_VECBLEND:%.*]] = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true>, <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_12_VEC_EXPAND]], <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_8_VECBLEND]]
; CHECK-NEXT: br label [[BB:%.*]]
; CHECK: bb:
-; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_0_B_BLOCKWISE_COPY_SROA_0_0_LOAD1:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_0]], align 16
-; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_2_PTR2_SROA_IDX1:%.*]] = getelementptr inbounds i8, ptr [[B_BLOCKWISE_COPY_SROA_0]], i64 2
-; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_2_B_BLOCKWISE_COPY_SROA_0_2_LOAD2:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_0_2_PTR2_SROA_IDX1]], align 2
-; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_4_0_B_BLOCKWISE_COPY_SROA_4_4_LOAD3:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_4]], align 4
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_12_VECBLEND]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT]] to half
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_12_VECBLEND]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT]] to half
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_12_VECBLEND]], i32 2
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT]] to half
; CHECK-NEXT: ret void
;
entry:
@@ -285,17 +291,17 @@ bb:
define amdgpu_kernel void @test_array_vector() #0 {
; CHECK-LAB...
[truncated]
|
@llvm/pr-subscribers-clang Author: None (macurtis-amd) Changes — Patch is 32.26 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/133301.diff 7 Files Affected:
diff --git a/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl b/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl
index a0c106bca83c9..927cb3f38fa9c 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl
@@ -503,21 +503,19 @@ void cast_bool_generic(generic char* p) {
*p = 0;
}
-// Test initialize a struct using memset.
-// For large structures which is mostly zero, clang generats llvm.memset for
-// the zero part and store for non-zero members.
+// Test initialization of a struct with a private member.
typedef struct {
long a, b, c, d;
private char *p;
} StructTy3;
-// CHECK-LABEL: test_memset_private
-// CHECK: call void @llvm.memset.p5.i64(ptr addrspace(5) noundef align 8 {{.*}}, i8 0, i64 32, i1 false)
+// CHECK-LABEL: test_struct_private_member
+// CHECK: store <32 x i8> zeroinitializer, ptr addrspace(5) {{.*}}, align 8
// CHECK: [[GEP:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) %ptr, i32 32
// CHECK: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(5) [[GEP]]
// CHECK: [[GEP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) {{.*}}, i32 36
// CHECK: store i32 0, ptr addrspace(5) [[GEP1]], align 4
-void test_memset_private(private StructTy3 *ptr) {
+void test_struct_private_member(private StructTy3 *ptr) {
StructTy3 S3 = {0, 0, 0, 0, 0};
*ptr = S3;
}
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 86be20c799a68..3ded637a5c63b 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -1011,6 +1011,26 @@ static Value *foldPHINodeOrSelectInst(Instruction &I) {
return foldSelectInst(cast<SelectInst>(I));
}
+/// Returns a fixed vector type equivalent to the memory set by II or nullptr if
+/// unable to do so.
+static FixedVectorType *getVectorTypeFor(const MemSetInst &II,
+ const DataLayout &DL) {
+ const ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
+ if (!Length)
+ return nullptr;
+
+ APInt Val = Length->getValue();
+ if (Val.ugt(std::numeric_limits<unsigned>::max()))
+ return nullptr;
+
+ auto *VTy =
+ FixedVectorType::get(II.getValue()->getType(), Val.getZExtValue());
+ if (DL.getTypeStoreSizeInBits(VTy) != DL.getTypeAllocSizeInBits(VTy))
+ return nullptr;
+
+ return VTy;
+}
+
/// Builder for the alloca slices.
///
/// This class builds a set of alloca slices by recursively visiting the uses
@@ -1099,15 +1119,16 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
return Base::visitGetElementPtrInst(GEPI);
}
+ bool isSplittableMemOp(Type *Ty, bool IsVolatile) {
+ return Ty->isIntegerTy() && !IsVolatile && DL.typeSizeEqualsStoreSize(Ty);
+ }
+
void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
uint64_t Size, bool IsVolatile) {
// We allow splitting of non-volatile loads and stores where the type is an
// integer type. These may be used to implement 'memcpy' or other "transfer
// of bits" patterns.
- bool IsSplittable =
- Ty->isIntegerTy() && !IsVolatile && DL.typeSizeEqualsStoreSize(Ty);
-
- insertUse(I, Offset, Size, IsSplittable);
+ insertUse(I, Offset, Size, isSplittableMemOp(Ty, IsVolatile));
}
void visitLoadInst(LoadInst &LI) {
@@ -1170,10 +1191,23 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
if (!IsOffsetKnown)
return PI.setAborted(&II);
+ auto IsSplittable = [&]() {
+ FixedVectorType *VTy = getVectorTypeFor(II, DL);
+ Type *ATy = AS.AI.getAllocatedType();
+
+ if (!Length)
+ return false;
+ if (!VTy)
+ return true;
+ if (DL.getTypeAllocSize(VTy) != DL.getTypeAllocSize(ATy))
+ return true;
+ return isSplittableMemOp(ATy, II.isVolatile());
+ };
+
insertUse(II, Offset,
Length ? Length->getLimitedValue()
: AllocSize - Offset.getLimitedValue(),
- (bool)Length);
+ IsSplittable());
}
void visitMemTransferInst(MemTransferInst &II) {
@@ -2072,8 +2106,20 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
if (MI->isVolatile())
return false;
- if (!S.isSplittable())
- return false; // Skip any unsplittable intrinsics.
+
+ auto *II = dyn_cast<MemSetInst>(U->getUser());
+ if (!II && !S.isSplittable()) {
+ // Skip any non-memset unsplittable intrinsics.
+ return false;
+ }
+ if (II) {
+ // For memset, allow if we have a suitable vector type
+ Type *VTy = getVectorTypeFor(*II, DL);
+ if (!VTy)
+ return false;
+ if (!canConvertValue(DL, SliceTy, VTy))
+ return false;
+ }
} else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
return false;
@@ -2316,12 +2362,15 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
// Put load and store types into a set for de-duplication.
for (const Slice &S : P) {
- Type *Ty;
+ Type *Ty = nullptr;
if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
Ty = LI->getType();
else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
Ty = SI->getValueOperand()->getType();
- else
+ else if (auto *II = dyn_cast<MemSetInst>(S.getUse()->getUser()))
+ Ty = getVectorTypeFor(*II, DL);
+
+ if (!Ty)
continue;
auto CandTy = Ty->getScalarType();
diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/user-memcpy.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/user-memcpy.ll
index af3070511e345..a9d0b10586583 100644
--- a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/user-memcpy.ll
+++ b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/user-memcpy.ll
@@ -21,8 +21,7 @@
;; Allocas have been promoted - the linked dbg.assigns have been removed.
;; | V3i point = {0, 0, 0};
-; CHECK-NEXT: #dbg_value(i64 0, ![[point:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 64),
-; CHECK-NEXT: #dbg_value(i64 0, ![[point]], !DIExpression(DW_OP_LLVM_fragment, 64, 64),
+; CHECK-NEXT: #dbg_value(<16 x i8> zeroinitializer, ![[point:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 128),
;; point.z = 5000;
; CHECK-NEXT: #dbg_value(i64 5000, ![[point]], !DIExpression(DW_OP_LLVM_fragment, 128, 64),
@@ -32,17 +31,20 @@
;; local.other.x = global.other.x
;; local.other.y = global.other.y
;; local.other.z = global.other.z
-; CHECK-NEXT: %other.sroa.0.0.copyload = load i64, ptr @__const._Z3funv.other
+; CHECK-NEXT: %other.sroa.0.0.copyload = load <8 x i8>, ptr @__const._Z3funv.other
; CHECK-NEXT: %other.sroa.2.0.copyload = load i64, ptr getelementptr inbounds (i8, ptr @__const._Z3funv.other, i64 8)
; CHECK-NEXT: %other.sroa.3.0.copyload = load i64, ptr getelementptr inbounds (i8, ptr @__const._Z3funv.other, i64 16)
-; CHECK-NEXT: #dbg_value(i64 %other.sroa.0.0.copyload, ![[other:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 64),
+; CHECK-NEXT: #dbg_value(<8 x i8> %other.sroa.0.0.copyload, ![[other:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 64),
; CHECK-NEXT: #dbg_value(i64 %other.sroa.2.0.copyload, ![[other]], !DIExpression(DW_OP_LLVM_fragment, 64, 64),
; CHECK-NEXT: #dbg_value(i64 %other.sroa.3.0.copyload, ![[other]], !DIExpression(DW_OP_LLVM_fragment, 128, 64),
;; | std::memcpy(&point.y, &other.x, sizeof(long) * 2);
;; other is now 3 scalars:
;; point.y = other.x
-; CHECK-NEXT: #dbg_value(i64 %other.sroa.0.0.copyload, ![[point]], !DIExpression(DW_OP_LLVM_fragment, 64, 64),
+; CHECK-NEXT: %point.sroa.0.sroa.0.8.vec.expand = shufflevector <8 x i8> %other.sroa.0.0.copyload, <8 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>,
+; CHECK-NEXT: %point.sroa.0.sroa.0.8.vecblend = select <16 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> %point.sroa.0.sroa.0.8.vec.expand, <16 x i8> zeroinitializer,
+; CHECK-NEXT: #dbg_value(<16 x i8> %point.sroa.0.sroa.0.8.vecblend, ![[point]], !DIExpression(DW_OP_LLVM_fragment, 64, 64),
+
;;
;; point.z = other.y
; CHECK-NEXT: #dbg_value(i64 %other.sroa.2.0.copyload, ![[point]], !DIExpression(DW_OP_LLVM_fragment, 128, 64),
diff --git a/llvm/test/DebugInfo/X86/sroasplit-5.ll b/llvm/test/DebugInfo/X86/sroasplit-5.ll
index 34aa30f55728e..d2ecc9598e3c4 100644
--- a/llvm/test/DebugInfo/X86/sroasplit-5.ll
+++ b/llvm/test/DebugInfo/X86/sroasplit-5.ll
@@ -21,10 +21,9 @@ target triple = "x86_64-unknown-linux-gnu"
;
; There should be no debug info for the padding.
; CHECK-NOT: DW_OP_LLVM_fragment, 56
-; CHECK: DIExpression(DW_OP_LLVM_fragment, 0, 32)
-; CHECK-NOT: DW_OP_LLVM_fragment, 56
-; CHECK: DIExpression(DW_OP_LLVM_fragment, 32, 24)
+; CHECK: ![[a:[0-9]+]], !DIExpression(),
; CHECK-NOT: DW_OP_LLVM_fragment, 56
+; CHECK: ![[a]] = !DILocalVariable(name: "a",
%struct.prog_src_register = type { i32, i24 }
; Function Attrs: nounwind
diff --git a/llvm/test/Transforms/SROA/basictest.ll b/llvm/test/Transforms/SROA/basictest.ll
index 145da5259fab3..03590bbce146a 100644
--- a/llvm/test/Transforms/SROA/basictest.ll
+++ b/llvm/test/Transforms/SROA/basictest.ll
@@ -529,8 +529,9 @@ entry:
define ptr @test10() {
; CHECK-LABEL: @test10(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr null to i64
-; CHECK-NEXT: ret ptr null
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> zeroinitializer to i64
+; CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr
+; CHECK-NEXT: ret ptr [[TMP1]]
;
entry:
%a = alloca [8 x i8]
@@ -1075,26 +1076,13 @@ define void @PR14059.1(ptr %d) {
;
; CHECK-LABEL: @PR14059.1(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast double undef to i64
-; CHECK-NEXT: [[X_SROA_0_I_0_INSERT_MASK:%.*]] = and i64 [[TMP0]], -4294967296
-; CHECK-NEXT: [[X_SROA_0_I_0_INSERT_INSERT:%.*]] = or i64 [[X_SROA_0_I_0_INSERT_MASK]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[X_SROA_0_I_0_INSERT_INSERT]] to double
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast double [[TMP1]] to i64
-; CHECK-NEXT: [[X_SROA_0_I_2_INSERT_MASK:%.*]] = and i64 [[TMP2]], -281474976645121
-; CHECK-NEXT: [[X_SROA_0_I_2_INSERT_INSERT:%.*]] = or i64 [[X_SROA_0_I_2_INSERT_MASK]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[X_SROA_0_I_2_INSERT_INSERT]] to double
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast double [[TMP3]] to i64
-; CHECK-NEXT: [[X_SROA_0_I_4_COPYLOAD:%.*]] = load i32, ptr [[D:%.*]], align 1
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double 0.000000e+00 to i64
-; CHECK-NEXT: [[X_SROA_0_I_4_INSERT_EXT:%.*]] = zext i32 [[X_SROA_0_I_4_COPYLOAD]] to i64
-; CHECK-NEXT: [[X_SROA_0_I_4_INSERT_SHIFT:%.*]] = shl i64 [[X_SROA_0_I_4_INSERT_EXT]], 32
-; CHECK-NEXT: [[X_SROA_0_I_4_INSERT_MASK3:%.*]] = and i64 [[TMP5]], 4294967295
-; CHECK-NEXT: [[X_SROA_0_I_4_INSERT_INSERT4:%.*]] = or i64 [[X_SROA_0_I_4_INSERT_MASK3]], [[X_SROA_0_I_4_INSERT_SHIFT]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[X_SROA_0_I_4_INSERT_INSERT4]] to double
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast double [[TMP6]] to i64
-; CHECK-NEXT: [[X_SROA_0_I_4_INSERT_MASK:%.*]] = and i64 [[TMP7]], 4294967295
-; CHECK-NEXT: [[X_SROA_0_I_4_INSERT_INSERT:%.*]] = or i64 [[X_SROA_0_I_4_INSERT_MASK]], 4607182418800017408
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[X_SROA_0_I_4_INSERT_INSERT]] to double
+; CHECK-NEXT: [[X_SROA_0_I_SROA_0_0_VECBLEND:%.*]] = select <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x i8> <i8 0, i8 0, i8 0, i8 0, i8 undef, i8 undef, i8 undef, i8 undef>, <8 x i8> undef
+; CHECK-NEXT: [[X_SROA_0_I_SROA_0_2_VECBLEND:%.*]] = select <8 x i1> <i1 false, i1 false, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false>, <8 x i8> <i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0, i8 undef, i8 undef>, <8 x i8> [[X_SROA_0_I_SROA_0_0_VECBLEND]]
+; CHECK-NEXT: [[X_SROA_0_I_SROA_0_4_COPYLOAD:%.*]] = load <4 x i8>, ptr [[D:%.*]], align 1
+; CHECK-NEXT: [[X_SROA_0_I_SROA_0_4_VEC_EXPAND:%.*]] = shufflevector <4 x i8> [[X_SROA_0_I_SROA_0_4_COPYLOAD]], <4 x i8> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[X_SROA_0_I_SROA_0_4_VECBLEND2:%.*]] = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <8 x i8> [[X_SROA_0_I_SROA_0_4_VEC_EXPAND]], <8 x i8> zeroinitializer
+; CHECK-NEXT: [[X_SROA_0_I_SROA_0_4_VECBLEND:%.*]] = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <8 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 extractelement (<4 x i8> bitcast (<1 x i32> splat (i32 1072693248) to <4 x i8>), i32 0), i8 extractelement (<4 x i8> bitcast (<1 x i32> splat (i32 1072693248) to <4 x i8>), i32 1), i8 extractelement (<4 x i8> bitcast (<1 x i32> splat (i32 1072693248) to <4 x i8>), i32 2), i8 extractelement (<4 x i8> bitcast (<1 x i32> splat (i32 1072693248) to <4 x i8>), i32 3)>, <8 x i8> [[X_SROA_0_I_SROA_0_4_VECBLEND2]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[X_SROA_0_I_SROA_0_4_VECBLEND]] to double
; CHECK-NEXT: [[ACCUM_REAL_I:%.*]] = load double, ptr [[D]], align 8
; CHECK-NEXT: [[ADD_R_I:%.*]] = fadd double [[ACCUM_REAL_I]], [[TMP8]]
; CHECK-NEXT: store double [[ADD_R_I]], ptr [[D]], align 8
@@ -1332,10 +1320,10 @@ define void @PR15674(ptr %data, ptr %src, i32 %size) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP_SROA_0:%.*]] = alloca i32, align 4
; CHECK-NEXT: switch i32 [[SIZE:%.*]], label [[END:%.*]] [
-; CHECK-NEXT: i32 4, label [[BB4:%.*]]
-; CHECK-NEXT: i32 3, label [[BB3:%.*]]
-; CHECK-NEXT: i32 2, label [[BB2:%.*]]
-; CHECK-NEXT: i32 1, label [[BB1:%.*]]
+; CHECK-NEXT: i32 4, label [[BB4:%.*]]
+; CHECK-NEXT: i32 3, label [[BB3:%.*]]
+; CHECK-NEXT: i32 2, label [[BB2:%.*]]
+; CHECK-NEXT: i32 1, label [[BB1:%.*]]
; CHECK-NEXT: ]
; CHECK: bb4:
; CHECK-NEXT: [[SRC_GEP3:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i32 3
diff --git a/llvm/test/Transforms/SROA/slice-width.ll b/llvm/test/Transforms/SROA/slice-width.ll
index eabb6978c9125..63362534ff812 100644
--- a/llvm/test/Transforms/SROA/slice-width.ll
+++ b/llvm/test/Transforms/SROA/slice-width.ll
@@ -68,9 +68,8 @@ define void @memcpy_fp80_padding() {
define void @memset_fp80_padding() {
; CHECK-LABEL: @memset_fp80_padding(
-; CHECK-NEXT: [[X_SROA_0:%.*]] = alloca x86_fp80, align 16
-; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 16 [[X_SROA_0]], i8 -1, i32 16, i1 false)
-; CHECK-NEXT: store i64 -1, ptr @i64_sink, align 4
+; CHECK-NEXT: [[X_SROA_0_16_VEC_EXTRACT:%.*]] = extractelement <4 x i64> splat (i64 -1), i32 2
+; CHECK-NEXT: store i64 [[X_SROA_0_16_VEC_EXTRACT]], ptr @i64_sink, align 4
; CHECK-NEXT: ret void
;
%x = alloca %union.Foo
diff --git a/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll b/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll
index 72014912edd20..62df5121215bf 100644
--- a/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll
+++ b/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll
@@ -245,25 +245,31 @@ bb:
define amdgpu_kernel void @test_half_array() #0 {
; CHECK-LABEL: @test_half_array(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0:%.*]] = alloca float, align 16
-; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_4:%.*]] = alloca float, align 4
-; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 16 [[B_BLOCKWISE_COPY_SROA_0]], i8 0, i32 4, i1 false)
-; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 4 [[B_BLOCKWISE_COPY_SROA_4]], i8 0, i32 4, i1 false)
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast float undef to i32
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast float undef to i32
; CHECK-NEXT: [[DATA:%.*]] = load [4 x float], ptr undef, align 4
; CHECK-NEXT: [[DATA_FCA_0_EXTRACT:%.*]] = extractvalue [4 x float] [[DATA]], 0
-; CHECK-NEXT: store float [[DATA_FCA_0_EXTRACT]], ptr [[B_BLOCKWISE_COPY_SROA_0]], align 16
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[DATA_FCA_0_EXTRACT]] to <2 x i16>
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXPAND:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_0_VECBLEND:%.*]] = select <8 x i1> <i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXPAND]], <8 x i16> zeroinitializer
; CHECK-NEXT: [[DATA_FCA_1_EXTRACT:%.*]] = extractvalue [4 x float] [[DATA]], 1
-; CHECK-NEXT: store float [[DATA_FCA_1_EXTRACT]], ptr [[B_BLOCKWISE_COPY_SROA_4]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[DATA_FCA_1_EXTRACT]] to <2 x i16>
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXPAND:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_4_VECBLEND:%.*]] = select <8 x i1> <i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXPAND]], <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_0_VECBLEND]]
; CHECK-NEXT: [[DATA_FCA_2_EXTRACT:%.*]] = extractvalue [4 x float] [[DATA]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float [[DATA_FCA_2_EXTRACT]] to <2 x i16>
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_8_VEC_EXPAND:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_8_VECBLEND:%.*]] = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false>, <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_8_VEC_EXPAND]], <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_4_VECBLEND]]
; CHECK-NEXT: [[DATA_FCA_3_EXTRACT:%.*]] = extractvalue [4 x float] [[DATA]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float [[DATA_FCA_3_EXTRACT]] to <2 x i16>
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_12_VEC_EXPAND:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1>
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_12_VECBLEND:%.*]] = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true>, <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_12_VEC_EXPAND]], <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_8_VECBLEND]]
; CHECK-NEXT: br label [[BB:%.*]]
; CHECK: bb:
-; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_0_B_BLOCKWISE_COPY_SROA_0_0_LOAD1:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_0]], align 16
-; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_2_PTR2_SROA_IDX1:%.*]] = getelementptr inbounds i8, ptr [[B_BLOCKWISE_COPY_SROA_0]], i64 2
-; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_2_B_BLOCKWISE_COPY_SROA_0_2_LOAD2:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_0_2_PTR2_SROA_IDX1]], align 2
-; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_4_0_B_BLOCKWISE_COPY_SROA_4_4_LOAD3:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_4]], align 4
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_12_VECBLEND]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT]] to half
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_12_VECBLEND]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT]] to half
+; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[B_BLOCKWISE_COPY_SROA_0_12_VECBLEND]], i32 2
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT]] to half
; CHECK-NEXT: ret void
;
entry:
@@ -285,17 +291,17 @@ bb:
define amdgpu_kernel void @test_array_vector() #0 {
; CHECK-LAB...
[truncated]
|
✅ With the latest revision this PR passed the undef deprecator. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Missing new tests? I'd expect to see a few new targeted tests stressing different vector sizes and alignments, and not just updates of existing tests
llvm/lib/Transforms/Scalar/SROA.cpp
Outdated
|
||
auto *VTy = | ||
FixedVectorType::get(II.getValue()->getType(), Val.getZExtValue()); | ||
if (DL.getTypeStoreSizeInBits(VTy) != DL.getTypeAllocSizeInBits(VTy)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
DL.getTypeAllocSizeInBits is implemented with getTypeStoreSizeInBits and I find it confusing to combine the two, use explicit alignment checks if you need both
I also do not think you should need to consider the type alignment. The resulting store should have an explicit alignment, which does not need to match the type's natural alignment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is a workaround.
Latest revision adds a comment to make this explicit and also cleans up the usage of getType*.
llvm/lib/Transforms/Scalar/SROA.cpp
Outdated
if (DL.getTypeAllocSize(VTy) != DL.getTypeAllocSize(ATy)) | ||
return true; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Further duplicated size checks, it's hard to follow the flow
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Cleaned up the code. Hopefully better now.
llvm/lib/Transforms/Scalar/SROA.cpp
Outdated
@@ -1170,10 +1191,23 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> { | |||
if (!IsOffsetKnown) | |||
return PI.setAborted(&II); | |||
|
|||
auto IsSplittable = [&]() { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you move this to a separate utility function? The Length capture is slightly confusing
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Cleaned up code removes the helper altogether. Hopefully better now.
llvm/lib/Transforms/Scalar/SROA.cpp
Outdated
|
||
if (!Ty) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Leave as else continue
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated
096fde6
to
a9b215b
Compare
Added a new test. Thanks for the review! |
; CHECK-NEXT: [[DOTSROA_0_0_VEC_INSERT:%.*]] = insertelement <32 x i8> zeroinitializer, i8 [[TMP3]], i32 0 | ||
; CHECK-NEXT: ret void | ||
; | ||
%2 = alloca %struct.a, align 32 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Use named values in tests
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated to named values.
llvm/lib/Transforms/Scalar/SROA.cpp
Outdated
if (!Length) | ||
return nullptr; | ||
|
||
APInt Val = Length->getValue(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
APInt Val = Length->getValue(); | |
const APInt &Val = Length->getValue(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Changed to suggestion
llvm/lib/Transforms/Scalar/SROA.cpp
Outdated
if (Val.ugt(std::numeric_limits<unsigned>::max())) | ||
return nullptr; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Don't understand this limit. Is this the maximum number of vector elements? Should avoid hardcoding that
llvm/lib/Transforms/Scalar/SROA.cpp
Outdated
return nullptr; | ||
|
||
uint64_t MemSetLen = Val.getZExtValue(); | ||
auto *VTy = FixedVectorType::get(II.getValue()->getType(), MemSetLen); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The element will always be i8. TODO to support llvm.experimental.memset.pattern?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added comment
llvm/lib/Transforms/Scalar/SROA.cpp
Outdated
if (Val.ugt(std::numeric_limits<unsigned>::max())) | ||
return nullptr; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this just because the maximum number of vector elts? Can you put this value into a helper on FixedVectorType instead of hardcoding unsigned here?
Although we probably shouldn't be trying to promote anything that's anything close to that big.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I decided to move the limit from checkVectorTypeForPromotion into a helper function and use it here.
Let me know if there is a more sensible limit to use instead.
|
||
ret void | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Test with the number of elements equalling and exceeding 32-bit limit case?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added test for upper limit
7075b30
to
f1d777e
Compare
@arsenm Any recommendations for appeasing the undef deprecator? |
I don't think you did anything other than update existing tests, I would ignore it for the purposes of this change |
@arsenm Are you okay with the latest revision? |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Missing PR description. The tests also don't make any sense to me, seeing as they compile down to a constant, unused insertelement.
Thanks Nikita for taking a look at this.
I've updated the PR description.
Yes. Tests are very synthetic. Reduced from the already reduced example given in the PR description. I'll see if I can produce something of about the same size but more practical. Do you think it would make sense to add that example as a test case? |
f1d777e
to
fe79e66
Compare
@nikic I cleaned up the test function a bit, again starting from the original example. They exercise the new code paths, though they still result in an unused insertelement. Is that okay? |
@nikic ping |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So if I understand correctly, you are marking memsets as unsplittable, lowering them to vector zero and smaller accesses to inserts/extracts.
So if I understand correctly, you are marking memsets as unsplittable, lowering them to vector zero and smaller accesses to inserts/extracts.
I don't think your general approach here is going to work. We need to be careful about introducing vector operations out of thin air, because LLVM is not going to second-guess them. If you convert a memset to <32768 x i8> ops here, LLVM is going to carry those all the way down, even though this is almost certainly not performant. Additionally, you are breaking the ability for SROA to split the alloca and fully promote parts of it.
llvm/lib/Transforms/Scalar/SROA.cpp
Outdated
bool Splittable; | ||
|
||
if (getVectorTypeFor(II, DL)) | ||
Splittable = isSplittableMemOp(AS.AI.getAllocatedType(), II.isVolatile()); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think checking AI.getAllocatedType() here makes a lot of sense, seeing as how the memset may only be operating on part of it.
I should preface this by mentioning that I'm not all that familiar with SROA, so thank you for your patience.
Yes. As a naive attempt to produce the desired result, I was merely trying to mimic the behavior of
Is there an appropriate place in the code to decide if a splittable memset should be promoted to a vector operation? |
Basically: We should only do it if there already is an existing vector operation. If we have a memset and then a later load of e.g. |
fe79e66
to
6248b40
Compare
@nikic Latest revision only promotes memsets where the region being set is within an existing vector in the underlying alloca (as determined by getTypePartition). It is much more constrained as you can see by the much smaller number of tests that are now affected, while still improving the original gemm kernel. Is this okay? |
@nikic ping |
@nikic ping ping |
@nikic ping ping ping |
@nikic ping |
Teach SROA to vector-promote, rather than integer-promote, additional memsets.
Reduced function derived from user gemm kernel can be seen here: https://godbolt.org/z/8ebcTEjTs.
Replacing the memsets with stores results in vector promotion: https://godbolt.org/z/G9f4vPhs6.
The point of divergence between these two examples is whether the Use is considered splittable. This change makes certain memsets non-splittable, resulting in vector promotion.