Skip to content

Commit b8cdc26

Browse files
authored
[DAG] visitCTPOP - if only the upper half of the ctpop operand is zero then see if it's profitable to only count the lower half. (#80473)
1 parent cf94e00 commit b8cdc26

File tree

3 files changed

+31
-15
lines changed

3 files changed

+31
-15
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11142,11 +11142,29 @@ SDValue DAGCombiner::visitCTTZ_ZERO_UNDEF(SDNode *N) {
1114211142
// Combine a CTPOP node. Two folds are attempted, in order:
//   1. Constant-fold the operand via DAG.FoldConstantArithmetic.
//   2. If the upper half of the operand is known to be zero, re-express the
//      node as a CTPOP on the half-width integer type, extended back to VT.
// Returns an empty SDValue when neither fold applies.
SDValue DAGCombiner::visitCTPOP(SDNode *N) {
1114311143
SDValue N0 = N->getOperand(0);
1114411144
EVT VT = N->getValueType(0);
11145+
unsigned NumBits = VT.getScalarSizeInBits();
1114511146
SDLoc DL(N);
1114611147

1114711148
// fold (ctpop c1) -> c2
1114811149
if (SDValue C = DAG.FoldConstantArithmetic(ISD::CTPOP, DL, VT, {N0}))
1114911150
return C;
11151+
11152+
// If the upper bits are known to be zero, then see if it's profitable to
11153+
// only count the lower bits: (ctpop x) -> (zext (ctpop (trunc x))).
// Only scalar integer types wider than 8 bits with an even bit width are
// considered, so that NumBits / 2 is an exact half.
11154+
if (VT.isScalarInteger() && NumBits > 8 && (NumBits & 1) == 0) {
11155+
EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), NumBits / 2);
// NOTE(review): the four checks below presumably gate the fold on the target
// having a usable, desirable half-width CTPOP and on the truncate / zero-extend
// being free -- confirm against the TargetLowering hook documentation.
11156+
if (hasOperation(ISD::CTPOP, HalfVT) &&
11157+
TLI.isTypeDesirableForOp(ISD::CTPOP, HalfVT) &&
11158+
TLI.isTruncateFree(N0, HalfVT) && TLI.isZExtFree(HalfVT, VT)) {
11159+
// Mask selecting the top NumBits / 2 bits of the operand.
APInt UpperBits = APInt::getHighBitsSet(NumBits, NumBits / 2);
11160+
if (DAG.MaskedValueIsZero(N0, UpperBits)) {
11161+
// HalfVT is narrower than VT, so the inner getZExtOrTrunc narrows N0 and
// the outer one widens the half-width count back to VT.
SDValue PopCnt = DAG.getNode(ISD::CTPOP, DL, HalfVT,
11162+
DAG.getZExtOrTrunc(N0, DL, HalfVT));
11163+
return DAG.getZExtOrTrunc(PopCnt, DL, VT);
11164+
}
11165+
}
11166+
}
11167+
1115011168
return SDValue(); // No fold matched.
1115111169
}
1115211170

llvm/test/CodeGen/AMDGPU/ctpop64.ll

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -452,12 +452,11 @@ define amdgpu_kernel void @s_ctpop_i65(ptr addrspace(1) noalias %out, i65 %val)
452452
; SI-NEXT: s_mov_b32 s2, -1
453453
; SI-NEXT: s_waitcnt lgkmcnt(0)
454454
; SI-NEXT: s_mov_b32 s0, s4
455+
; SI-NEXT: s_and_b32 s4, s8, 0xff
455456
; SI-NEXT: s_mov_b32 s1, s5
456-
; SI-NEXT: s_and_b32 s4, s8, 1
457-
; SI-NEXT: s_mov_b32 s5, 0
458-
; SI-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
459-
; SI-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
460-
; SI-NEXT: s_add_i32 s4, s6, s4
457+
; SI-NEXT: s_bcnt1_i32_b32 s4, s4
458+
; SI-NEXT: s_bcnt1_i32_b64 s5, s[6:7]
459+
; SI-NEXT: s_add_i32 s4, s5, s4
461460
; SI-NEXT: v_mov_b32_e32 v0, s4
462461
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
463462
; SI-NEXT: s_endpgm
@@ -470,12 +469,11 @@ define amdgpu_kernel void @s_ctpop_i65(ptr addrspace(1) noalias %out, i65 %val)
470469
; VI-NEXT: s_mov_b32 s2, -1
471470
; VI-NEXT: s_waitcnt lgkmcnt(0)
472471
; VI-NEXT: s_mov_b32 s0, s4
472+
; VI-NEXT: s_and_b32 s4, s8, 0xff
473473
; VI-NEXT: s_mov_b32 s1, s5
474-
; VI-NEXT: s_and_b32 s4, s8, 1
475-
; VI-NEXT: s_mov_b32 s5, 0
476-
; VI-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
477-
; VI-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
478-
; VI-NEXT: s_add_i32 s4, s6, s4
474+
; VI-NEXT: s_bcnt1_i32_b32 s4, s4
475+
; VI-NEXT: s_bcnt1_i32_b64 s5, s[6:7]
476+
; VI-NEXT: s_add_i32 s4, s5, s4
479477
; VI-NEXT: v_mov_b32_e32 v0, s4
480478
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
481479
; VI-NEXT: s_endpgm

llvm/test/CodeGen/X86/ctpop-mask.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ define i64 @ctpop_mask2(i64 %x) nounwind readnone {
2525
; X64-POPCOUNT-LABEL: ctpop_mask2:
2626
; X64-POPCOUNT: # %bb.0:
2727
; X64-POPCOUNT-NEXT: andl $3, %edi
28-
; X64-POPCOUNT-NEXT: popcntq %rdi, %rax
28+
; X64-POPCOUNT-NEXT: popcntl %edi, %eax
2929
; X64-POPCOUNT-NEXT: retq
3030
;
3131
; X86-NO-POPCOUNT-LABEL: ctpop_mask2:
@@ -189,7 +189,7 @@ define i64 @ctpop_mask4(i64 %x) nounwind readnone {
189189
; X64-POPCOUNT-LABEL: ctpop_mask4:
190190
; X64-POPCOUNT: # %bb.0:
191191
; X64-POPCOUNT-NEXT: andl $15, %edi
192-
; X64-POPCOUNT-NEXT: popcntq %rdi, %rax
192+
; X64-POPCOUNT-NEXT: popcntl %edi, %eax
193193
; X64-POPCOUNT-NEXT: retq
194194
;
195195
; X86-NO-POPCOUNT-LABEL: ctpop_mask4:
@@ -271,7 +271,7 @@ define i64 @ctpop_mask5(i64 %x) nounwind readnone {
271271
; X64-POPCOUNT-LABEL: ctpop_mask5:
272272
; X64-POPCOUNT: # %bb.0:
273273
; X64-POPCOUNT-NEXT: andl $31, %edi
274-
; X64-POPCOUNT-NEXT: popcntq %rdi, %rax
274+
; X64-POPCOUNT-NEXT: popcntl %edi, %eax
275275
; X64-POPCOUNT-NEXT: retq
276276
;
277277
; X86-NO-POPCOUNT-LABEL: ctpop_mask5:
@@ -392,7 +392,7 @@ define i64 @ctpop_shifted_mask6(i64 %x) nounwind readnone {
392392
; X64-POPCOUNT-LABEL: ctpop_shifted_mask6:
393393
; X64-POPCOUNT: # %bb.0:
394394
; X64-POPCOUNT-NEXT: andl $26112, %edi # imm = 0x6600
395-
; X64-POPCOUNT-NEXT: popcntq %rdi, %rax
395+
; X64-POPCOUNT-NEXT: popcntl %edi, %eax
396396
; X64-POPCOUNT-NEXT: retq
397397
;
398398
; X86-NO-POPCOUNT-LABEL: ctpop_shifted_mask6:
@@ -556,7 +556,7 @@ define i64 @ctpop_shifted_mask8(i64 %x) nounwind readnone {
556556
; X64-POPCOUNT-LABEL: ctpop_shifted_mask8:
557557
; X64-POPCOUNT: # %bb.0:
558558
; X64-POPCOUNT-NEXT: andl $65280, %edi # imm = 0xFF00
559-
; X64-POPCOUNT-NEXT: popcntq %rdi, %rax
559+
; X64-POPCOUNT-NEXT: popcntl %edi, %eax
560560
; X64-POPCOUNT-NEXT: retq
561561
;
562562
; X86-NO-POPCOUNT-LABEL: ctpop_shifted_mask8:

0 commit comments

Comments
 (0)