Skip to content

Commit abd3149

Browse files
authored
[X86] Use GFNI for vXi8 shifts/rotates (#89115)
As detailed here: https://github.com/InstLatx64/InstLatX64_Demo/blob/master/GFNI_Demo.h We can use the gf2p8affine instruction to lower byte shifts/rotates as well as the existing bitreverse case. Based off the original patch here: https://reviews.llvm.org/D137026
1 parent 6ce0474 commit abd3149

23 files changed

+1474
-2123
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 49 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29022,6 +29022,29 @@ SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
2902229022
return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
2902329023
}
2902429024

29025+
// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
29026+
uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt = 0) {
29027+
assert((Amt < 8) && "Shift/Rotation amount out of range");
29028+
switch (Opcode) {
29029+
case ISD::BITREVERSE:
29030+
return 0x8040201008040201ULL;
29031+
case ISD::SHL:
29032+
return ((0x0102040810204080ULL >> (Amt)) &
29033+
(0x0101010101010101ULL * (0xFF >> (Amt))));
29034+
case ISD::SRL:
29035+
return ((0x0102040810204080ULL << (Amt)) &
29036+
(0x0101010101010101ULL * ((0xFF << (Amt)) & 0xFF)));
29037+
case ISD::SRA:
29038+
return (getGFNICtrlImm(ISD::SRL, Amt) |
29039+
(0x8080808080808080ULL >> (64 - (8 * Amt))));
29040+
case ISD::ROTL:
29041+
return getGFNICtrlImm(ISD::SRL, 8 - Amt) | getGFNICtrlImm(ISD::SHL, Amt);
29042+
case ISD::ROTR:
29043+
return getGFNICtrlImm(ISD::SHL, 8 - Amt) | getGFNICtrlImm(ISD::SRL, Amt);
29044+
}
29045+
llvm_unreachable("Unsupported GFNI opcode");
29046+
}
29047+
2902529048
// Return true if the required (according to Opcode) shift-imm form is natively
2902629049
// supported by the Subtarget
2902729050
static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget,
@@ -29209,6 +29232,14 @@ static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
2920929232
if (VT == MVT::v16i8 && Subtarget.hasXOP())
2921029233
return SDValue();
2921129234

29235+
if (Subtarget.hasGFNI()) {
29236+
uint64_t ShiftMask = getGFNICtrlImm(Op.getOpcode(), ShiftAmt);
29237+
MVT MaskVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
29238+
SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(ShiftMask, dl, MaskVT));
29239+
return DAG.getNode(X86ISD::GF2P8AFFINEQB, dl, VT, R, Mask,
29240+
DAG.getTargetConstant(0, dl, MVT::i8));
29241+
}
29242+
2921229243
if (Op.getOpcode() == ISD::SHL) {
2921329244
// Make a large shift.
2921429245
SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
@@ -29892,13 +29923,15 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
2989229923
uint64_t ShXAmt = IsFSHR ? (EltSizeInBits - ShiftAmt) : ShiftAmt;
2989329924
uint64_t ShYAmt = IsFSHR ? ShiftAmt : (EltSizeInBits - ShiftAmt);
2989429925
assert((ShXAmt + ShYAmt) == EltSizeInBits && "Illegal funnel shift");
29926+
MVT WideVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
2989529927

29896-
if (EltSizeInBits == 8 && ShXAmt > 1 &&
29897-
(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT))) {
29928+
if (EltSizeInBits == 8 &&
29929+
(Subtarget.hasXOP() ||
29930+
(useVPTERNLOG(Subtarget, VT) &&
29931+
supportedVectorShiftWithImm(WideVT, Subtarget, ISD::SHL)))) {
2989829932
// For vXi8 cases on Subtargets that can perform VPCMOV/VPTERNLOG
2989929933
// bit-select - lower using vXi16 shifts and then perform the bitmask at
2990029934
// the original vector width to handle cases where we split.
29901-
MVT WideVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
2990229935
APInt MaskX = APInt::getHighBitsSet(8, 8 - ShXAmt);
2990329936
APInt MaskY = APInt::getLowBitsSet(8, 8 - ShYAmt);
2990429937
SDValue ShX =
@@ -30103,6 +30136,17 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
3010330136
DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
3010430137
}
3010530138

30139+
// Attempt to use GFNI gf2p8affine to rotate vXi8 by an uniform constant.
30140+
if (IsCstSplat && Subtarget.hasGFNI() && VT.getScalarType() == MVT::i8 &&
30141+
DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
30142+
uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
30143+
uint64_t RotMask = getGFNICtrlImm(Opcode, RotAmt);
30144+
MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
30145+
SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(RotMask, DL, MaskVT));
30146+
return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, R, Mask,
30147+
DAG.getTargetConstant(0, DL, MVT::i8));
30148+
}
30149+
3010630150
// Split 256-bit integers on XOP/pre-AVX2 targets.
3010730151
if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
3010830152
return splitVectorIntBinary(Op, DAG, DL);
@@ -31426,7 +31470,8 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
3142631470
// If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
3142731471
if (Subtarget.hasGFNI()) {
3142831472
MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
31429-
SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
31473+
SDValue Matrix =
31474+
DAG.getConstant(getGFNICtrlImm(ISD::BITREVERSE), DL, MatrixVT);
3143031475
Matrix = DAG.getBitcast(VT, Matrix);
3143131476
return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
3143231477
DAG.getTargetConstant(0, DL, MVT::i8));

llvm/lib/Target/X86/X86TargetTransformInfo.cpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,24 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
345345
Op1Info.getNoProps(), Op2Info.getNoProps());
346346
}
347347

348+
static const CostKindTblEntry GFNIUniformConstCostTable[] = {
349+
{ ISD::SHL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
350+
{ ISD::SRL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
351+
{ ISD::SRA, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
352+
{ ISD::SHL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
353+
{ ISD::SRL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
354+
{ ISD::SRA, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
355+
{ ISD::SHL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
356+
{ ISD::SRL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
357+
{ ISD::SRA, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
358+
};
359+
360+
if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasGFNI())
361+
if (const auto *Entry =
362+
CostTableLookup(GFNIUniformConstCostTable, ISD, LT.second))
363+
if (auto KindCost = Entry->Cost[CostKind])
364+
return LT.first * *KindCost;
365+
348366
static const CostKindTblEntry AVX512BWUniformConstCostTable[] = {
349367
{ ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
350368
{ ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
@@ -3869,6 +3887,9 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
38693887
{ ISD::BITREVERSE, MVT::v2i64, { 1, 8, 2, 4 } }, // gf2p8affineqb
38703888
{ ISD::BITREVERSE, MVT::v4i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
38713889
{ ISD::BITREVERSE, MVT::v8i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
3890+
{ X86ISD::VROTLI, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3891+
{ X86ISD::VROTLI, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3892+
{ X86ISD::VROTLI, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
38723893
};
38733894
static const CostKindTblEntry GLMCostTbl[] = {
38743895
{ ISD::FSQRT, MVT::f32, { 19, 20, 1, 1 } }, // sqrtss

llvm/test/Analysis/CostModel/X86/fshl-codesize.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1597,9 +1597,9 @@ define void @splatconstant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
15971597
;
15981598
; AVX512GFNI-LABEL: 'splatconstant_funnel_i8'
15991599
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
1600-
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
1601-
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
1602-
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
1600+
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
1601+
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
1602+
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
16031603
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
16041604
;
16051605
%I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
@@ -2871,9 +2871,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
28712871
;
28722872
; AVX512GFNI-LABEL: 'splatconstant_rotate_i8'
28732873
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)
2874-
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
2875-
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
2876-
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
2874+
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
2875+
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
2876+
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
28772877
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
28782878
;
28792879
%I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)

llvm/test/Analysis/CostModel/X86/fshl-latency.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1549,9 +1549,9 @@ define void @splatconstant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
15491549
;
15501550
; AVX512GFNI-LABEL: 'splatconstant_funnel_i8'
15511551
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
1552-
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
1553-
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
1554-
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
1552+
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
1553+
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
1554+
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
15551555
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
15561556
;
15571557
%I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
@@ -2823,9 +2823,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
28232823
;
28242824
; AVX512GFNI-LABEL: 'splatconstant_rotate_i8'
28252825
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)
2826-
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
2827-
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
2828-
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
2826+
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
2827+
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
2828+
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
28292829
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
28302830
;
28312831
%I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)

0 commit comments

Comments
 (0)