
Commit e3f71b1

Allow morph to recognize more places where it's beneficial to produce masks
1 parent 6a9a0b0 commit e3f71b1

File tree

3 files changed: +274 additions, -0 deletions


src/coreclr/jit/hwintrinsic.h

Lines changed: 1 addition & 0 deletions
@@ -543,6 +543,7 @@ struct HWIntrinsicInfo
                                          FloatComparisonMode comparison,
                                          var_types           simdBaseType,
                                          unsigned            simdSize);
+    static NamedIntrinsic lookupEvexMaskId(NamedIntrinsic intrinsic);
 #endif
 
     // Member lookup

src/coreclr/jit/hwintrinsicxarch.cpp

Lines changed: 166 additions & 0 deletions
@@ -806,6 +806,172 @@ NamedIntrinsic HWIntrinsicInfo::lookupIdForFloatComparisonMode(NamedIntrinsic
     }
 }
 
+//------------------------------------------------------------------------
+// lookupEvexMaskId: Get the EVEX mask intrinsic ID to use for a given intrinsic
+//
+// Arguments:
+//    intrinsic -- The base intrinsic that is being converted to its EVEX form
+//
+// Return Value:
+//    The EVEX intrinsic ID to use instead of intrinsic
+//
+NamedIntrinsic HWIntrinsicInfo::lookupEvexMaskId(NamedIntrinsic intrinsic)
+{
+    switch (intrinsic)
+    {
+        case NI_SSE_And:
+        case NI_SSE2_And:
+        case NI_AVX_And:
+        case NI_AVX2_And:
+        case NI_AVX512F_And:
+        case NI_AVX512DQ_And:
+        case NI_AVX10v1_V512_And:
+        {
+            return NI_EVEX_AndMask;
+        }
+
+        case NI_SSE_AndNot:
+        case NI_SSE2_AndNot:
+        case NI_AVX_AndNot:
+        case NI_AVX2_AndNot:
+        case NI_AVX512F_AndNot:
+        case NI_AVX512DQ_AndNot:
+        case NI_AVX10v1_V512_AndNot:
+        {
+            return NI_EVEX_AndNotMask;
+        }
+
+        case NI_SSE41_BlendVariable:
+        case NI_AVX_BlendVariable:
+        case NI_AVX2_BlendVariable:
+        {
+            return NI_EVEX_BlendVariableMask;
+        }
+
+        case NI_AVX_Compare:
+        {
+            return NI_EVEX_CompareMask;
+        }
+
+        case NI_SSE_CompareEqual:
+        case NI_SSE2_CompareEqual:
+        case NI_SSE41_CompareEqual:
+        case NI_AVX_CompareEqual:
+        case NI_AVX2_CompareEqual:
+        {
+            return NI_EVEX_CompareEqualMask;
+        }
+
+        case NI_SSE_CompareGreaterThan:
+        case NI_SSE2_CompareGreaterThan:
+        case NI_SSE42_CompareGreaterThan:
+        case NI_AVX_CompareGreaterThan:
+        case NI_AVX2_CompareGreaterThan:
+        {
+            return NI_EVEX_CompareGreaterThanMask;
+        }
+
+        case NI_SSE_CompareGreaterThanOrEqual:
+        case NI_SSE2_CompareGreaterThanOrEqual:
+        case NI_AVX_CompareGreaterThanOrEqual:
+        {
+            return NI_EVEX_CompareGreaterThanOrEqualMask;
+        }
+
+        case NI_SSE_CompareLessThan:
+        case NI_SSE2_CompareLessThan:
+        case NI_SSE42_CompareLessThan:
+        case NI_AVX_CompareLessThan:
+        {
+            return NI_EVEX_CompareLessThanMask;
+        }
+
+        case NI_SSE_CompareLessThanOrEqual:
+        case NI_SSE2_CompareLessThanOrEqual:
+        case NI_AVX_CompareLessThanOrEqual:
+        {
+            return NI_EVEX_CompareLessThanOrEqualMask;
+        }
+
+        case NI_SSE_CompareNotEqual:
+        case NI_SSE2_CompareNotEqual:
+        case NI_AVX_CompareNotEqual:
+        {
+            return NI_EVEX_CompareNotEqualMask;
+        }
+
+        case NI_SSE_CompareNotGreaterThan:
+        case NI_SSE2_CompareNotGreaterThan:
+        case NI_AVX_CompareNotGreaterThan:
+        {
+            return NI_EVEX_CompareNotGreaterThanMask;
+        }
+
+        case NI_SSE_CompareNotGreaterThanOrEqual:
+        case NI_SSE2_CompareNotGreaterThanOrEqual:
+        case NI_AVX_CompareNotGreaterThanOrEqual:
+        {
+            return NI_EVEX_CompareNotGreaterThanOrEqualMask;
+        }
+
+        case NI_SSE_CompareNotLessThan:
+        case NI_SSE2_CompareNotLessThan:
+        case NI_AVX_CompareNotLessThan:
+        {
+            return NI_EVEX_CompareNotLessThanMask;
+        }
+
+        case NI_SSE_CompareNotLessThanOrEqual:
+        case NI_SSE2_CompareNotLessThanOrEqual:
+        case NI_AVX_CompareNotLessThanOrEqual:
+        {
+            return NI_EVEX_CompareNotLessThanOrEqualMask;
+        }
+
+        case NI_SSE_CompareOrdered:
+        case NI_SSE2_CompareOrdered:
+        case NI_AVX_CompareOrdered:
+        {
+            return NI_EVEX_CompareOrderedMask;
+        }
+
+        case NI_SSE_CompareUnordered:
+        case NI_SSE2_CompareUnordered:
+        case NI_AVX_CompareUnordered:
+        {
+            return NI_EVEX_CompareUnorderedMask;
+        }
+
+        case NI_SSE_Or:
+        case NI_SSE2_Or:
+        case NI_AVX_Or:
+        case NI_AVX2_Or:
+        case NI_AVX512F_Or:
+        case NI_AVX512DQ_Or:
+        case NI_AVX10v1_V512_Or:
+        {
+            return NI_EVEX_OrMask;
+        }
+
+        case NI_SSE_Xor:
+        case NI_SSE2_Xor:
+        case NI_AVX_Xor:
+        case NI_AVX2_Xor:
+        case NI_AVX512F_Xor:
+        case NI_AVX512DQ_Xor:
+        case NI_AVX10v1_V512_Xor:
+        {
+            return NI_EVEX_XorMask;
+        }
+
+        default:
+        {
+            assert(!"Unexpected intrinsic when resolving EVEX alternative");
+            return NI_Illegal;
+        }
+    }
+}
+
 //------------------------------------------------------------------------
 // isFullyImplementedIsa: Gets a value that indicates whether the InstructionSet is fully implemented
 //
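Taken together, the table above is a pure many-to-one lookup: every legacy (SSE/AVX) encoding of an operation resolves to the single EVEX form that writes a kmask register instead of a vector. A minimal, self-contained C++ sketch of the same lookup pattern follows; the Intrinsic enum and its members are hypothetical stand-ins for the JIT's NamedIntrinsic values, not the real identifiers.

#include <cassert>

// Hypothetical stand-ins for a handful of NamedIntrinsic values.
enum class Intrinsic
{
    SSE_CompareEqual,
    SSE2_CompareEqual,
    AVX_CompareEqual,
    EVEX_CompareEqualMask,
    Illegal,
};

// Same shape as lookupEvexMaskId: several base intrinsics fold onto the
// one EVEX intrinsic that produces a mask directly.
Intrinsic lookupEvexMaskId(Intrinsic intrinsic)
{
    switch (intrinsic)
    {
        case Intrinsic::SSE_CompareEqual:
        case Intrinsic::SSE2_CompareEqual:
        case Intrinsic::AVX_CompareEqual:
            return Intrinsic::EVEX_CompareEqualMask;

        default:
            assert(!"Unexpected intrinsic when resolving EVEX alternative");
            return Intrinsic::Illegal;
    }
}

int main()
{
    // All legacy element-wise equality compares resolve to the one mask form.
    assert(lookupEvexMaskId(Intrinsic::SSE2_CompareEqual) == Intrinsic::EVEX_CompareEqualMask);
    assert(lookupEvexMaskId(Intrinsic::AVX_CompareEqual) == Intrinsic::EVEX_CompareEqualMask);
    return 0;
}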

src/coreclr/jit/morph.cpp

Lines changed: 107 additions & 0 deletions
@@ -9918,6 +9918,107 @@ GenTree* Compiler::fgOptimizeHWIntrinsic(GenTreeHWIntrinsic* node)
         default:
         {
 #if defined(FEATURE_MASKED_HW_INTRINSICS)
+#if defined(TARGET_XARCH)
+            bool isCndSel = (intrinsicId == NI_Vector128_ConditionalSelect) ||
+                            (intrinsicId == NI_Vector256_ConditionalSelect) ||
+                            (intrinsicId == NI_Vector512_ConditionalSelect);
+
+            if (isCndSel || node->OperIsConvertVectorToMask())
+            {
+                GenTree* op1 = node->Op(1);
+
+                if (!op1->IsVectorPerElementMask(simdBaseType, simdSize))
+                {
+                    break;
+                }
+
+                if (!op1->OperIsHWIntrinsic())
+                {
+                    break;
+                }
+
+                GenTreeHWIntrinsic* op1Intrin = op1->AsHWIntrinsic();
+
+                if (!isCndSel)
+                {
+                    // CndSel knows how to handle mismatched mask sizes, but not all consumers can
+
+                    if (genTypeSize(op1Intrin->GetSimdBaseType()) != genTypeSize(simdBaseType))
+                    {
+                        break;
+                    }
+                }
+
+                if (!canUseEvexEncoding())
+                {
+                    break;
+                }
+
+                // We have something expecting a mask and a case where we could produce a mask directly
+
+                NamedIntrinsic op1IntrinId = op1Intrin->GetHWIntrinsicId();
+
+                NamedIntrinsic evexIntrinId = HWIntrinsicInfo::lookupEvexMaskId(op1IntrinId);
+
+                if (evexIntrinId != NI_Illegal)
+                {
+                    GenTree* cvtNode;
+
+                    op1Intrin->ChangeHWIntrinsicId(evexIntrinId);
+                    op1Intrin->gtType = TYP_MASK;
+
+                    switch (evexIntrinId)
+                    {
+                        case NI_EVEX_AndMask:
+                        case NI_EVEX_AndNotMask:
+                        case NI_EVEX_OrMask:
+                        case NI_EVEX_XorMask:
+                        {
+                            // There are a few special nodes which are allowed to combine masks
+                            // and so we handle these by inserting a CvtVectorToMask over each
+                            // operand and remorphing, which will get us the optimized sequence
+
+                            cvtNode = op1Intrin->Op(1);
+                            cvtNode = gtNewSimdCvtVectorToMaskNode(TYP_MASK, cvtNode, simdBaseJitType, simdSize);
+                            cvtNode = fgOptimizeHWIntrinsic(cvtNode->AsHWIntrinsic());
+
+                            op1Intrin->Op(1) = cvtNode;
+
+                            cvtNode = op1Intrin->Op(2);
+                            cvtNode = gtNewSimdCvtVectorToMaskNode(TYP_MASK, cvtNode, simdBaseJitType, simdSize);
+                            cvtNode = fgOptimizeHWIntrinsic(cvtNode->AsHWIntrinsic());
+
+                            op1Intrin->Op(2) = cvtNode;
+
+                            op1 = fgOptimizeHWIntrinsic(op1Intrin);
+                            break;
+                        }
+
+                        default:
+                        {
+                            break;
+                        }
+                    }
+
+                    if (isCndSel)
+                    {
+                        // This will allow lowering to emit a vblendm and potentially do embedded masking
+
+                        cvtNode = gtNewSimdCvtMaskToVectorNode(retType, op1, simdBaseJitType, simdSize);
+                        cvtNode = fgOptimizeHWIntrinsic(cvtNode->AsHWIntrinsic());
+
+                        node->Op(1) = cvtNode;
+                        return node;
+                    }
+                    else
+                    {
+                        DEBUG_DESTROY_NODE(node);
+                        return op1;
+                    }
+                }
+            }
+#endif // TARGET_XARCH
+
             bool isScalar = false;
             genTreeOps actualOper = node->GetOperForHWIntrinsicId(&isScalar);
             genTreeOps oper = actualOper;
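The effect of this hunk on the IR can be modeled in miniature: a compare feeding a mask consumer is retyped in place to its mask-producing form, and the And/AndNot/Or/Xor combiners rewrite their operands first. The sketch below is a toy model under made-up names (Op, Node, produceMask), not the JIT's GenTree machinery; the real morph code additionally gates every rewrite through lookupEvexMaskId and IsVectorPerElementMask before mutating anything.

#include <cassert>

// Toy node kinds: a vector compare, its mask-producing form, and the
// bitwise combiner in both vector and mask flavors.
enum class Op { Leaf, CmpEq, CmpEqMask, And, AndMask };

struct Node
{
    Op    op;
    Node* a = nullptr;
    Node* b = nullptr;
};

// Rewrites a subtree so it produces a mask directly, mirroring the morph
// logic above: a compare is retyped (ChangeHWIntrinsicId + TYP_MASK), and
// an And whose operands both become masks is itself retyped to AndMask
// (the analogue of NI_EVEX_AndMask).
bool produceMask(Node* n)
{
    switch (n->op)
    {
        case Op::CmpEq:
            n->op = Op::CmpEqMask;
            return true;

        case Op::And:
            if (produceMask(n->a) && produceMask(n->b))
            {
                n->op = Op::AndMask;
                return true;
            }
            return false;

        default:
            return false;
    }
}

int main()
{
    Node x{Op::Leaf}, y{Op::Leaf}, p{Op::Leaf}, q{Op::Leaf};
    Node c1{Op::CmpEq, &x, &y};
    Node c2{Op::CmpEq, &p, &q};
    Node both{Op::And, &c1, &c2};

    // And(CmpEq, CmpEq) becomes AndMask(CmpEqMask, CmpEqMask), which is the
    // shape that lets ConditionalSelect consume a mask and lowering emit vblendm.
    assert(produceMask(&both));
    assert(both.op == Op::AndMask);
    assert(c1.op == Op::CmpEqMask && c2.op == Op::CmpEqMask);
    return 0;
}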
@@ -9945,6 +10046,12 @@ GenTree* Compiler::fgOptimizeHWIntrinsic(GenTreeHWIntrinsic* node)
 
                 // We need both operands to be ConvertMaskToVector in
                 // order to optimize this to a direct mask operation
+                //
+                // Noting that we could handle broader scenarios by
+                // checking IsVectorPerElementMask instead, but that
+                // could regress code size unnecessarily if we aren't
+                // consumed as a mask as well. We handle the case where
+                // we're consumed as a mask elsewhere in morph instead.
 
                 if (!op1->OperIsConvertMaskToVector())
                 {
