Skip to content

Commit 3aa1ec5

Browse files
Improve codegen for Vector512.ExtractMostSignificantBits (#110662)
1 parent d6c034d commit 3aa1ec5

File tree

3 files changed

+111
-12
lines changed

3 files changed

+111
-12
lines changed

src/coreclr/jit/decomposelongs.cpp

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1707,6 +1707,11 @@ GenTree* DecomposeLongs::DecomposeHWIntrinsic(LIR::Use& use)
17071707
return DecomposeHWIntrinsicGetElement(use, hwintrinsicTree);
17081708
}
17091709

1710+
case NI_EVEX_MoveMask:
1711+
{
1712+
return DecomposeHWIntrinsicMoveMask(use, hwintrinsicTree);
1713+
}
1714+
17101715
default:
17111716
{
17121717
noway_assert(!"unexpected GT_HWINTRINSIC node in long decomposition");
@@ -1830,6 +1835,106 @@ GenTree* DecomposeLongs::DecomposeHWIntrinsicGetElement(LIR::Use& use, GenTreeHW
18301835
return FinalizeDecomposition(use, loResult, hiResult, hiResult);
18311836
}
18321837

1838+
//------------------------------------------------------------------------
1839+
// DecomposeHWIntrinsicMoveMask: Decompose GT_HWINTRINSIC -- NI_EVEX_MoveMask
1840+
//
1841+
// Decompose a MoveMask(x) node on Vector512<*>. For:
1842+
//
1843+
// GT_HWINTRINSIC{MoveMask}[*](simd_var)
1844+
//
1845+
// create:
1846+
//
1847+
// tmp_simd_var = simd_var
1848+
// lo_result = GT_HWINTRINSIC{MoveMask}(tmp_simd_var)
1849+
// shift_icon = GT_ICON(32)
1850+
// tmp_simd_hi = GT_HWINTRINSIC{ShiftRightMask}(tmp_simd_var, shift_icon)
1851+
// hi_result = GT_HWINTRINSIC{MoveMask}(tmp_simd_hi)
1852+
// return: GT_LONG(lo_result, hi_result)
1853+
//
1854+
// Noting that for all types except byte/sbyte, hi_result will be exclusively
1855+
// zero and so we can actually optimize this a bit more directly
1856+
//
1857+
// Arguments:
1858+
// use - the LIR::Use object for the def that needs to be decomposed.
1859+
// node - the hwintrinsic node to decompose
1860+
//
1861+
// Return Value:
1862+
// The next node to process.
1863+
//
1864+
GenTree* DecomposeLongs::DecomposeHWIntrinsicMoveMask(LIR::Use& use, GenTreeHWIntrinsic* node)
1865+
{
1866+
// Preconditions: 'node' is the def of 'use', produces a long-typed value,
// and is specifically an NI_EVEX_MoveMask intrinsic.
assert(node == use.Def());
1867+
assert(varTypeIsLong(node));
1868+
assert(node->GetHWIntrinsicId() == NI_EVEX_MoveMask);
1869+
1870+
GenTree* op1 = node->Op(1);
1871+
CorInfoType simdBaseJitType = node->GetSimdBaseJitType();
1872+
var_types simdBaseType = node->GetSimdBaseType();
1873+
unsigned simdSize = node->GetSimdSize();
1874+
1875+
// The operand is expected to already be a mask (TYP_MASK) and the node is
// expected to carry a simdSize of 64 (presumably bytes, i.e. Vector512 -- confirm).
assert(varTypeIsArithmetic(simdBaseType));
1876+
assert(op1->TypeGet() == TYP_MASK);
1877+
assert(simdSize == 64);
1878+
1879+
// The two TYP_INT halves that are handed to FinalizeDecomposition below.
GenTree* loResult = nullptr;
1880+
GenTree* hiResult = nullptr;
1881+
1882+
// byte/sbyte is the only case in which the high half of the result is not
// known to be zero, so both halves have to be computed from the mask.
if (varTypeIsByte(simdBaseType))
1883+
{
1884+
// Create:
1885+
// simdTmpVar = op1
1886+
1887+
// op1 is consumed twice below (once per half), so it is first spilled to a local.
GenTree* simdTmpVar = RepresentOpAsLocalVar(op1, node, &node->Op(1));
1888+
unsigned simdTmpVarNum = simdTmpVar->AsLclVarCommon()->GetLclNum();
1889+
JITDUMP("[DecomposeHWIntrinsicMoveMask]: Saving op1 tree to a temp var:\n");
1890+
DISPTREERANGE(Range(), simdTmpVar);
1891+
// Unlink the local's use from the range; it is re-inserted directly before 'node' below.
Range().Remove(simdTmpVar);
1892+
1893+
Range().InsertBefore(node, simdTmpVar);
1894+
1895+
// Create:
1896+
// loResult = GT_HWINTRINSIC{MoveMask}(simdTmpVar)
1897+
1898+
// Note: the new MoveMask is typed TYP_INT and is given a simd size of 32 rather than 'simdSize'.
loResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_INT, simdTmpVar, NI_EVEX_MoveMask, simdBaseJitType, 32);
1899+
Range().InsertBefore(node, loResult);
1900+
1901+
// Create a second node for the same local number; it feeds the high-half computation.
simdTmpVar = m_compiler->gtNewLclLNode(simdTmpVarNum, simdTmpVar->TypeGet());
1902+
Range().InsertBefore(node, simdTmpVar);
1903+
1904+
// Create:
1905+
// simdTmpVar = GT_HWINTRINSIC{ShiftRightMask}(simdTmpVar, 32)
1906+
// hiResult = GT_HWINTRINSIC{MoveMask}(simdTmpVar)
1907+
1908+
// Constant shift count of 32 used as the second operand of ShiftRightMask.
GenTree* shiftIcon = m_compiler->gtNewIconNode(32, TYP_INT);
1909+
Range().InsertBefore(node, shiftIcon);
1910+
1911+
simdTmpVar = m_compiler->gtNewSimdHWIntrinsicNode(TYP_MASK, simdTmpVar, shiftIcon, NI_EVEX_ShiftRightMask,
1912+
simdBaseJitType, 64);
1913+
Range().InsertBefore(node, simdTmpVar);
1914+
1915+
hiResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_INT, simdTmpVar, NI_EVEX_MoveMask, simdBaseJitType, 32);
1916+
Range().InsertBefore(node, hiResult);
1917+
}
1918+
else
1919+
{
1920+
// Create:
1921+
// loResult = GT_HWINTRINSIC{MoveMask}(op1)
1922+
1923+
// Non-byte base types: op1 is used exactly once, so no temp is needed; the
// replacement MoveMask is typed TYP_INT but keeps the original 'simdSize'.
loResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_INT, op1, NI_EVEX_MoveMask, simdBaseJitType, simdSize);
1924+
Range().InsertBefore(node, loResult);
1925+
1926+
// Create:
1927+
// hiResult = GT_ICON(0)
1928+
1929+
hiResult = m_compiler->gtNewZeroConNode(TYP_INT);
1930+
Range().InsertBefore(node, hiResult);
1931+
}
1932+
1933+
// Done with the original tree; remove it.
1934+
Range().Remove(node);
1935+
1936+
// Both halves were inserted before 'node'; hiResult is also passed as the third argument.
return FinalizeDecomposition(use, loResult, hiResult, hiResult);
1937+
}
18331938
#endif // FEATURE_HW_INTRINSICS
18341939

18351940
//------------------------------------------------------------------------

src/coreclr/jit/decomposelongs.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ class DecomposeLongs
6464
#ifdef FEATURE_HW_INTRINSICS
6565
GenTree* DecomposeHWIntrinsic(LIR::Use& use);
6666
GenTree* DecomposeHWIntrinsicGetElement(LIR::Use& use, GenTreeHWIntrinsic* node);
67+
GenTree* DecomposeHWIntrinsicMoveMask(LIR::Use& use, GenTreeHWIntrinsic* node);
6768
#endif // FEATURE_HW_INTRINSICS
6869

6970
GenTree* OptimizeCastFromDecomposedLong(GenTreeCast* cast, GenTree* nextNode);

src/coreclr/jit/hwintrinsicxarch.cpp

Lines changed: 5 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2511,14 +2511,13 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
25112511
break;
25122512
}
25132513

2514+
case NI_Vector128_ExtractMostSignificantBits:
2515+
case NI_Vector256_ExtractMostSignificantBits:
25142516
case NI_Vector512_ExtractMostSignificantBits:
25152517
{
2516-
#if defined(TARGET_X86)
2517-
// TODO-XARCH-CQ: It may be beneficial to decompose this operation
2518-
break;
2519-
#endif // TARGET_X86
2518+
assert(sig->numArgs == 1);
25202519

2521-
if (IsBaselineVector512IsaSupportedOpportunistically())
2520+
if ((simdSize == 64) || (varTypeIsShort(simdBaseType) && canUseEvexEncoding()))
25222521
{
25232522
op1 = impSIMDPopStack();
25242523

@@ -2527,14 +2526,8 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
25272526
op1 = gtNewSimdCvtVectorToMaskNode(TYP_MASK, op1, simdBaseJitType, simdSize);
25282527
}
25292528
retNode = gtNewSimdHWIntrinsicNode(retType, op1, NI_EVEX_MoveMask, simdBaseJitType, simdSize);
2529+
break;
25302530
}
2531-
break;
2532-
}
2533-
2534-
case NI_Vector128_ExtractMostSignificantBits:
2535-
case NI_Vector256_ExtractMostSignificantBits:
2536-
{
2537-
assert(sig->numArgs == 1);
25382531

25392532
if ((simdSize != 32) || varTypeIsFloating(simdBaseType) ||
25402533
compOpportunisticallyDependsOn(InstructionSet_AVX2))

0 commit comments

Comments
 (0)