-
Notifications
You must be signed in to change notification settings - Fork 14.7k
[AArch64][SVE] Fix hang in VECTOR_HISTOGRAM DAG combine #152539
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
The histogram DAG combine went into an infinite loop of creating the same histogram node due to an incorrect use of the `refineUniformBase` and `refineIndexType` APIs. These APIs take SDValues by reference (SDValue&) and return `true` if they were "refined" (i.e., set them to new values). Previously, this DAG combine would create the `Ops` array (used to create the new histogram node) before calling the `refine*` APIs, which copies the SDValues into the array, meaning the updated values were not used to create the new histogram node. Reproducer: https://godbolt.org/z/hsGWhTaqY (it will timeout)
@llvm/pr-subscribers-backend-aarch64 Author: Benjamin Maxwell (MacDue) ChangesThe histogram DAG combine went into an infinite loop of creating the same histogram node due to an incorrect use of the These APIs take SDValues by reference (SDValue&) and return Previously, this DAG combine would create the Reproducer: https://godbolt.org/z/hsGWhTaqY (it will timeout) Full diff: https://github.com/llvm/llvm-project/pull/152539.diff 2 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 734191447d67f..5f1e38a33c891 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -12843,22 +12843,21 @@ SDValue DAGCombiner::visitMHISTOGRAM(SDNode *N) {
SDLoc DL(HG);
EVT MemVT = HG->getMemoryVT();
+ EVT DataVT = Index.getValueType();
MachineMemOperand *MMO = HG->getMemOperand();
ISD::MemIndexType IndexType = HG->getIndexType();
if (ISD::isConstantSplatVectorAllZeros(Mask.getNode()))
return Chain;
- SDValue Ops[] = {Chain, Inc, Mask, BasePtr, Index,
- HG->getScale(), HG->getIntID()};
- if (refineUniformBase(BasePtr, Index, HG->isIndexScaled(), DAG, DL))
+ if (refineUniformBase(BasePtr, Index, HG->isIndexScaled(), DAG, DL) ||
+ refineIndexType(Index, IndexType, DataVT, DAG)) {
+ SDValue Ops[] = {Chain, Inc, Mask, BasePtr, Index,
+ HG->getScale(), HG->getIntID()};
return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), MemVT, DL, Ops,
MMO, IndexType);
+ }
- EVT DataVT = Index.getValueType();
- if (refineIndexType(Index, IndexType, DataVT, DAG))
- return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), MemVT, DL, Ops,
- MMO, IndexType);
return SDValue();
}
diff --git a/llvm/test/CodeGen/AArch64/aarch64-histcnt-dag-combine-hang.ll b/llvm/test/CodeGen/AArch64/aarch64-histcnt-dag-combine-hang.ll
new file mode 100644
index 0000000000000..da04c67aa6c5c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-histcnt-dag-combine-hang.ll
@@ -0,0 +1,70 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mattr=+sve2 -verify-machineinstrs < %s -o - | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; This test is reduced from a real world example that would cause the DAGCombiner to hang.
+
+define void @histcnt_loop(ptr %0, i64 %1, ptr %2, i64 %3, i64 %4) {
+; CHECK-LABEL: histcnt_loop:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z0.d, #1 // =0x1
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add x9, x0, x1
+; CHECK-NEXT: .LBB0_1: // %loop
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, x8, lsl #1]
+; CHECK-NEXT: lsl x10, x8, #1
+; CHECK-NEXT: add x11, x0, x10
+; CHECK-NEXT: add x10, x9, x10
+; CHECK-NEXT: lsl z1.d, z1.d, #1
+; CHECK-NEXT: ld1h { z4.d }, p0/z, [x11, #1, mul vl]
+; CHECK-NEXT: ld1h { z5.d }, p0/z, [x10, #1, mul vl]
+; CHECK-NEXT: histcnt z2.d, p0/z, z1.d, z1.d
+; CHECK-NEXT: ld1h { z3.d }, p0/z, [x2, z1.d]
+; CHECK-NEXT: mad z2.d, p0/m, z0.d, z3.d
+; CHECK-NEXT: ld1h { z3.d }, p0/z, [x9, x8, lsl #1]
+; CHECK-NEXT: add x8, x8, x3
+; CHECK-NEXT: cmp x4, x8
+; CHECK-NEXT: st1h { z2.d }, p0, [x2, z1.d]
+; CHECK-NEXT: lsl z1.d, z4.d, #1
+; CHECK-NEXT: histcnt z2.d, p0/z, z1.d, z1.d
+; CHECK-NEXT: ld1h { z4.d }, p0/z, [x2, z1.d]
+; CHECK-NEXT: mad z2.d, p0/m, z0.d, z4.d
+; CHECK-NEXT: st1h { z2.d }, p0, [x2, z1.d]
+; CHECK-NEXT: lsl z1.d, z3.d, #1
+; CHECK-NEXT: histcnt z2.d, p0/z, z1.d, z1.d
+; CHECK-NEXT: ld1h { z3.d }, p0/z, [x2, z1.d]
+; CHECK-NEXT: mad z2.d, p0/m, z0.d, z3.d
+; CHECK-NEXT: st1h { z2.d }, p0, [x2, z1.d]
+; CHECK-NEXT: lsl z1.d, z5.d, #1
+; CHECK-NEXT: histcnt z2.d, p0/z, z1.d, z1.d
+; CHECK-NEXT: ld1h { z3.d }, p0/z, [x2, z1.d]
+; CHECK-NEXT: mad z2.d, p0/m, z0.d, z3.d
+; CHECK-NEXT: st1h { z2.d }, p0, [x2, z1.d]
+; CHECK-NEXT: b.ne .LBB0_1
+; CHECK-NEXT: // %bb.2: // %exit
+; CHECK-NEXT: ret
+entry:
+ br label %loop
+
+loop:
+ %6 = phi i64 [ 0, %entry ], [ %15, %loop ]
+ %7 = getelementptr inbounds nuw i16, ptr %0, i64 %6
+ %8 = getelementptr inbounds nuw i8, ptr %7, i64 %1
+ %9 = load <vscale x 4 x i16>, ptr %7, align 2
+ %10 = load <vscale x 4 x i16>, ptr %8, align 2
+ %11 = zext <vscale x 4 x i16> %9 to <vscale x 4 x i64>
+ %12 = zext <vscale x 4 x i16> %10 to <vscale x 4 x i64>
+ %13 = getelementptr inbounds nuw [16 x i16], ptr %2, i64 0, <vscale x 4 x i64> %11
+ %14 = getelementptr inbounds nuw [16 x i16], ptr %2, i64 0, <vscale x 4 x i64> %12
+ call void @llvm.experimental.vector.histogram.add.nxv4p0.i16(<vscale x 4 x ptr> %13, i16 1, <vscale x 4 x i1> splat (i1 true))
+ call void @llvm.experimental.vector.histogram.add.nxv4p0.i16(<vscale x 4 x ptr> %14, i16 1, <vscale x 4 x i1> splat (i1 true))
+ %15 = add nuw i64 %6, %3
+ %16 = icmp eq i64 %15, %4
+ br i1 %16, label %exit, label %loop
+
+exit:
+ ret void
+}
|
@llvm/pr-subscribers-llvm-selectiondag Author: Benjamin Maxwell (MacDue) ChangesThe histogram DAG combine went into an infinite loop of creating the same histogram node due to an incorrect use of the These APIs take SDValues by reference (SDValue&) and return Previously, this DAG combine would create the Reproducer: https://godbolt.org/z/hsGWhTaqY (it will timeout) Full diff: https://github.com/llvm/llvm-project/pull/152539.diff 2 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 734191447d67f..5f1e38a33c891 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -12843,22 +12843,21 @@ SDValue DAGCombiner::visitMHISTOGRAM(SDNode *N) {
SDLoc DL(HG);
EVT MemVT = HG->getMemoryVT();
+ EVT DataVT = Index.getValueType();
MachineMemOperand *MMO = HG->getMemOperand();
ISD::MemIndexType IndexType = HG->getIndexType();
if (ISD::isConstantSplatVectorAllZeros(Mask.getNode()))
return Chain;
- SDValue Ops[] = {Chain, Inc, Mask, BasePtr, Index,
- HG->getScale(), HG->getIntID()};
- if (refineUniformBase(BasePtr, Index, HG->isIndexScaled(), DAG, DL))
+ if (refineUniformBase(BasePtr, Index, HG->isIndexScaled(), DAG, DL) ||
+ refineIndexType(Index, IndexType, DataVT, DAG)) {
+ SDValue Ops[] = {Chain, Inc, Mask, BasePtr, Index,
+ HG->getScale(), HG->getIntID()};
return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), MemVT, DL, Ops,
MMO, IndexType);
+ }
- EVT DataVT = Index.getValueType();
- if (refineIndexType(Index, IndexType, DataVT, DAG))
- return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), MemVT, DL, Ops,
- MMO, IndexType);
return SDValue();
}
diff --git a/llvm/test/CodeGen/AArch64/aarch64-histcnt-dag-combine-hang.ll b/llvm/test/CodeGen/AArch64/aarch64-histcnt-dag-combine-hang.ll
new file mode 100644
index 0000000000000..da04c67aa6c5c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-histcnt-dag-combine-hang.ll
@@ -0,0 +1,70 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mattr=+sve2 -verify-machineinstrs < %s -o - | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; This test is reduced from a real world example that would cause the DAGCombiner to hang.
+
+define void @histcnt_loop(ptr %0, i64 %1, ptr %2, i64 %3, i64 %4) {
+; CHECK-LABEL: histcnt_loop:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z0.d, #1 // =0x1
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add x9, x0, x1
+; CHECK-NEXT: .LBB0_1: // %loop
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, x8, lsl #1]
+; CHECK-NEXT: lsl x10, x8, #1
+; CHECK-NEXT: add x11, x0, x10
+; CHECK-NEXT: add x10, x9, x10
+; CHECK-NEXT: lsl z1.d, z1.d, #1
+; CHECK-NEXT: ld1h { z4.d }, p0/z, [x11, #1, mul vl]
+; CHECK-NEXT: ld1h { z5.d }, p0/z, [x10, #1, mul vl]
+; CHECK-NEXT: histcnt z2.d, p0/z, z1.d, z1.d
+; CHECK-NEXT: ld1h { z3.d }, p0/z, [x2, z1.d]
+; CHECK-NEXT: mad z2.d, p0/m, z0.d, z3.d
+; CHECK-NEXT: ld1h { z3.d }, p0/z, [x9, x8, lsl #1]
+; CHECK-NEXT: add x8, x8, x3
+; CHECK-NEXT: cmp x4, x8
+; CHECK-NEXT: st1h { z2.d }, p0, [x2, z1.d]
+; CHECK-NEXT: lsl z1.d, z4.d, #1
+; CHECK-NEXT: histcnt z2.d, p0/z, z1.d, z1.d
+; CHECK-NEXT: ld1h { z4.d }, p0/z, [x2, z1.d]
+; CHECK-NEXT: mad z2.d, p0/m, z0.d, z4.d
+; CHECK-NEXT: st1h { z2.d }, p0, [x2, z1.d]
+; CHECK-NEXT: lsl z1.d, z3.d, #1
+; CHECK-NEXT: histcnt z2.d, p0/z, z1.d, z1.d
+; CHECK-NEXT: ld1h { z3.d }, p0/z, [x2, z1.d]
+; CHECK-NEXT: mad z2.d, p0/m, z0.d, z3.d
+; CHECK-NEXT: st1h { z2.d }, p0, [x2, z1.d]
+; CHECK-NEXT: lsl z1.d, z5.d, #1
+; CHECK-NEXT: histcnt z2.d, p0/z, z1.d, z1.d
+; CHECK-NEXT: ld1h { z3.d }, p0/z, [x2, z1.d]
+; CHECK-NEXT: mad z2.d, p0/m, z0.d, z3.d
+; CHECK-NEXT: st1h { z2.d }, p0, [x2, z1.d]
+; CHECK-NEXT: b.ne .LBB0_1
+; CHECK-NEXT: // %bb.2: // %exit
+; CHECK-NEXT: ret
+entry:
+ br label %loop
+
+loop:
+ %6 = phi i64 [ 0, %entry ], [ %15, %loop ]
+ %7 = getelementptr inbounds nuw i16, ptr %0, i64 %6
+ %8 = getelementptr inbounds nuw i8, ptr %7, i64 %1
+ %9 = load <vscale x 4 x i16>, ptr %7, align 2
+ %10 = load <vscale x 4 x i16>, ptr %8, align 2
+ %11 = zext <vscale x 4 x i16> %9 to <vscale x 4 x i64>
+ %12 = zext <vscale x 4 x i16> %10 to <vscale x 4 x i64>
+ %13 = getelementptr inbounds nuw [16 x i16], ptr %2, i64 0, <vscale x 4 x i64> %11
+ %14 = getelementptr inbounds nuw [16 x i16], ptr %2, i64 0, <vscale x 4 x i64> %12
+ call void @llvm.experimental.vector.histogram.add.nxv4p0.i16(<vscale x 4 x ptr> %13, i16 1, <vscale x 4 x i1> splat (i1 true))
+ call void @llvm.experimental.vector.histogram.add.nxv4p0.i16(<vscale x 4 x ptr> %14, i16 1, <vscale x 4 x i1> splat (i1 true))
+ %15 = add nuw i64 %6, %3
+ %16 = icmp eq i64 %15, %4
+ br i1 %16, label %exit, label %loop
+
+exit:
+ ret void
+}
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nice find, LGTM
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/59/builds/22304 Here is the relevant piece of the build log for the reference
|
The histogram DAG combine went into an infinite loop of creating the same histogram node due to an incorrect use of the
refineUniformBase
andrefineIndexType
APIs.These APIs take SDValues by reference (SDValue&) and return
true
if they were "refined" (i.e., set to new values).Previously, this DAG combine would create the
Ops
array (used to create the new histogram node) before calling therefine*
APIs, which copies the SDValues into the array, meaning the updated values were not used to create the new histogram node.Reproducer: https://godbolt.org/z/hsGWhTaqY (it will timeout)