From 5fd945166862377be390948c9ab7dab2f47df217 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 21 Mar 2022 17:38:06 +0000 Subject: [PATCH] [X86][AVX512] lower1BitShuffle - fold broadcast(setcc(x,y)) -> setcc(broadcast(x),broadcast(y)) (PR52500) AVX512 has excellent broadcast ops for everything but vXi1 bool vectors - so if we're broadcasting a comparison result, see if we can broadcast the comparison operands instead. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 13 +++++++++++- llvm/test/CodeGen/X86/vector-shuffle-v1.ll | 23 ++++++++-------------- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index cbb718b0f0d75..991a70a499a76 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -18942,7 +18942,18 @@ static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef Mask, Offset += NumElts; // Increment for next iteration. } - + // If we're broadcasting a SETCC result, try to broadcast the ops instead. + // TODO: What other unary shuffles would benefit from this? 
+ if (isBroadcastShuffleMask(Mask) && V1.getOpcode() == ISD::SETCC && + V1->hasOneUse()) { + SDValue Op0 = V1.getOperand(0); + SDValue Op1 = V1.getOperand(1); + ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get(); + EVT OpVT = Op0.getValueType(); + return DAG.getSetCC( + DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask), + DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC); + } MVT ExtVT; switch (VT.SimpleTy) { diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll index 9af454b3cc984..cb36c8a6c2277 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll @@ -919,10 +919,8 @@ define <16 x i1> @PR52500(<16 x i1> %msk, i32 %in) { ; AVX512F-NEXT: movl $789, %eax # imm = 0x315 ; AVX512F-NEXT: vmovd %eax, %xmm1 ; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k2 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} ; AVX512F-NEXT: vpbroadcastd %xmm0, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} +; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1 {%k1} ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vzeroupper @@ -937,10 +935,8 @@ define <16 x i1> @PR52500(<16 x i1> %msk, i32 %in) { ; AVX512VL-NEXT: movl $789, %eax # imm = 0x315 ; AVX512VL-NEXT: vmovd %eax, %xmm1 ; AVX512VL-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vptestnmd %zmm0, %zmm0, %k2 -; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} ; AVX512VL-NEXT: vpbroadcastd %xmm0, %zmm0 -; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} +; AVX512VL-NEXT: vptestnmd %zmm0, %zmm0, %k1 {%k1} ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512VL-NEXT: vzeroupper @@ -949,16 +945,13 @@ define <16 x i1> @PR52500(<16 x i1> %msk, i32 %in) { ; VL_BW_DQ-LABEL: PR52500: ; VL_BW_DQ: # %bb.0: ; VL_BW_DQ-NEXT: 
vpsllw $7, %xmm0, %xmm0 -; VL_BW_DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VL_BW_DQ-NEXT: vmovd %edi, %xmm2 +; VL_BW_DQ-NEXT: vpmovb2m %xmm0, %k1 +; VL_BW_DQ-NEXT: vmovd %edi, %xmm0 ; VL_BW_DQ-NEXT: movl $789, %eax # imm = 0x315 -; VL_BW_DQ-NEXT: vmovd %eax, %xmm3 -; VL_BW_DQ-NEXT: vpmulld %xmm3, %xmm2, %xmm2 -; VL_BW_DQ-NEXT: vptestnmd %zmm2, %zmm2, %k0 -; VL_BW_DQ-NEXT: vpmovm2d %k0, %zmm2 -; VL_BW_DQ-NEXT: vpbroadcastd %xmm2, %zmm2 -; VL_BW_DQ-NEXT: vpmovd2m %zmm2, %k1 -; VL_BW_DQ-NEXT: vpcmpgtb %xmm0, %xmm1, %k0 {%k1} +; VL_BW_DQ-NEXT: vmovd %eax, %xmm1 +; VL_BW_DQ-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; VL_BW_DQ-NEXT: vpbroadcastd %xmm0, %zmm0 +; VL_BW_DQ-NEXT: vptestnmd %zmm0, %zmm0, %k0 {%k1} ; VL_BW_DQ-NEXT: vpmovm2b %k0, %xmm0 ; VL_BW_DQ-NEXT: vzeroupper ; VL_BW_DQ-NEXT: retq