Skip to content

Commit 3a130c8

Browse files
committed
unblock long xplat intrinsics on x86
1 parent 2cb402c commit 3a130c8

13 files changed

+605
-699
lines changed

src/coreclr/jit/codegenxarch.cpp

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5674,6 +5674,13 @@ void CodeGen::genCodeForStoreInd(GenTreeStoreInd* tree)
56745674
// These intrinsics are "ins reg/mem, xmm"
56755675
ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
56765676
attr = emitActualTypeSize(baseType);
5677+
#if defined(TARGET_X86)
5678+
if (varTypeIsLong(baseType))
5679+
{
5680+
ins = INS_movq;
5681+
attr = EA_8BYTE;
5682+
}
5683+
#endif // TARGET_X86
56775684
break;
56785685
}
56795686

src/coreclr/jit/decomposelongs.cpp

Lines changed: 173 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -90,7 +90,7 @@ void DecomposeLongs::DecomposeRange(Compiler* compiler, LIR::Range& range)
9090

9191
//------------------------------------------------------------------------
9292
// DecomposeLongs::DecomposeRangeHelper:
93-
// Decompiose each node in the current range.
93+
// Decompose each node in the current range.
9494
//
9595
// Decomposition is done as an execution-order walk. Decomposition of
9696
// a particular node can create new nodes that need to be further
@@ -122,44 +122,92 @@ void DecomposeLongs::DecomposeRangeHelper()
122122
GenTree* DecomposeLongs::DecomposeNode(GenTree* tree)
123123
{
124124
// Handle the case where we are implicitly using the lower half of a long lclVar.
125-
if ((tree->TypeGet() == TYP_INT) && tree->OperIsLocal())
125+
if (tree->TypeIs(TYP_INT) && tree->OperIsLocal())
126126
{
127127
LclVarDsc* varDsc = m_compiler->lvaGetDesc(tree->AsLclVarCommon());
128128
if (varTypeIsLong(varDsc) && varDsc->lvPromoted)
129129
{
130-
#ifdef DEBUG
131-
if (m_compiler->verbose)
132-
{
133-
printf("Changing implicit reference to lo half of long lclVar to an explicit reference of its promoted "
134-
"half:\n");
135-
m_compiler->gtDispTreeRange(Range(), tree);
136-
}
137-
#endif // DEBUG
130+
JITDUMP("Changing implicit reference to lo half of long lclVar to an explicit reference of its promoted "
131+
"half:\n");
132+
DISPTREERANGE(Range(), tree);
133+
138134
unsigned loVarNum = varDsc->lvFieldLclStart;
139135
tree->AsLclVarCommon()->SetLclNum(loVarNum);
140136
return tree->gtNext;
141137
}
142138
}
143139

144-
if (tree->TypeGet() != TYP_LONG)
140+
if (!tree->TypeIs(TYP_LONG))
145141
{
146142
return tree->gtNext;
147143
}
148144

149-
#ifdef DEBUG
150-
if (m_compiler->verbose)
151-
{
152-
printf("Decomposing TYP_LONG tree. BEFORE:\n");
153-
m_compiler->gtDispTreeRange(Range(), tree);
154-
}
155-
#endif // DEBUG
156-
157145
LIR::Use use;
158146
if (!Range().TryGetUse(tree, &use))
159147
{
160148
LIR::Use::MakeDummyUse(Range(), tree, &use);
161149
}
162150

151+
#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86)
152+
if (!use.IsDummyUse())
153+
{
154+
// HWIntrinsics can consume/produce a long directly, provided its source/target is memory.
155+
// Here we do a conservative check for specific cases where it is certain the load/store
156+
// can be contained. In those cases, we can skip decomposition.
157+
158+
GenTree* user = use.User();
159+
160+
if (user->OperIsHWIntrinsic())
161+
{
162+
if (tree->OperIs(GT_CNS_LNG) || (tree->OperIs(GT_IND, GT_LCL_FLD) && (user == tree->gtNext)))
163+
{
164+
return tree->gtNext;
165+
}
166+
}
167+
else if (user->OperIs(GT_STOREIND) && tree->OperIsHWIntrinsic() && m_compiler->opts.OptimizationEnabled())
168+
{
169+
// We're looking for this common pattern, with operands in either order in the LIR sequence:
170+
// t1 = * HWINTRINSIC long ToScalar
171+
// t0 = LCL_VAR byref
172+
// /--* t0 byref
173+
// +--* t1 long
174+
// * STOREIND long
175+
176+
GenTree* next = tree->gtNext;
177+
if ((user != next) && !m_compiler->gtTreeHasSideEffects(next, GTF_SIDE_EFFECT))
178+
{
179+
next = next->gtNext;
180+
}
181+
182+
if (user == next)
183+
{
184+
NamedIntrinsic intrinsic = tree->AsHWIntrinsic()->GetHWIntrinsicId();
185+
186+
if ((intrinsic == NI_Vector128_ToScalar) || (intrinsic == NI_Vector256_ToScalar) ||
187+
(intrinsic == NI_Vector512_ToScalar))
188+
{
189+
return tree->gtNext;
190+
}
191+
}
192+
}
193+
}
194+
195+
if (tree->OperIs(GT_STOREIND) && tree->AsStoreInd()->Data()->OperIsHWIntrinsic())
196+
{
197+
#if DEBUG
198+
// We should only get here if we matched the second pattern above.
199+
NamedIntrinsic intrinsic = tree->AsStoreInd()->Data()->AsHWIntrinsic()->GetHWIntrinsicId();
200+
assert((intrinsic == NI_Vector128_ToScalar) || (intrinsic == NI_Vector256_ToScalar) ||
201+
(intrinsic == NI_Vector512_ToScalar));
202+
#endif // DEBUG
203+
204+
return tree->gtNext;
205+
}
206+
#endif // FEATURE_HW_INTRINSICS && TARGET_X86
207+
208+
JITDUMP("Decomposing TYP_LONG tree. BEFORE:\n");
209+
DISPTREERANGE(Range(), tree);
210+
163211
GenTree* nextNode = nullptr;
164212
switch (tree->OperGet())
165213
{
@@ -270,19 +318,14 @@ GenTree* DecomposeLongs::DecomposeNode(GenTree* tree)
270318

271319
// If we replaced the argument to a GT_FIELD_LIST element with a GT_LONG node, split that field list
272320
// element into two elements: one for each half of the GT_LONG.
273-
if ((use.Def()->OperGet() == GT_LONG) && !use.IsDummyUse() && (use.User()->OperGet() == GT_FIELD_LIST))
321+
if (use.Def()->OperIs(GT_LONG) && !use.IsDummyUse() && use.User()->OperIs(GT_FIELD_LIST))
274322
{
275323
DecomposeFieldList(use.User()->AsFieldList(), use.Def()->AsOp());
276324
}
277325

278-
#ifdef DEBUG
279-
if (m_compiler->verbose)
280-
{
281-
// NOTE: st_lcl_var doesn't dump properly afterwards.
282-
printf("Decomposing TYP_LONG tree. AFTER:\n");
283-
m_compiler->gtDispTreeRange(Range(), use.Def());
284-
}
285-
#endif
326+
// NOTE: st_lcl_var doesn't dump properly afterwards.
327+
JITDUMP("Decomposing TYP_LONG tree. AFTER:\n");
328+
DISPTREERANGE(Range(), use.Def());
286329

287330
// When casting from a decomposed long to a smaller integer we can discard the high part.
288331
if (m_compiler->opts.OptimizationEnabled() && !use.IsDummyUse() && use.User()->OperIs(GT_CAST) &&
@@ -1707,6 +1750,13 @@ GenTree* DecomposeLongs::DecomposeHWIntrinsic(LIR::Use& use)
17071750
return DecomposeHWIntrinsicGetElement(use, hwintrinsicTree);
17081751
}
17091752

1753+
case NI_Vector128_ToScalar:
1754+
case NI_Vector256_ToScalar:
1755+
case NI_Vector512_ToScalar:
1756+
{
1757+
return DecomposeHWIntrinsicToScalar(use, hwintrinsicTree);
1758+
}
1759+
17101760
case NI_EVEX_MoveMask:
17111761
{
17121762
return DecomposeHWIntrinsicMoveMask(use, hwintrinsicTree);
@@ -1835,6 +1885,94 @@ GenTree* DecomposeLongs::DecomposeHWIntrinsicGetElement(LIR::Use& use, GenTreeHW
18351885
return FinalizeDecomposition(use, loResult, hiResult, hiResult);
18361886
}
18371887

1888+
//------------------------------------------------------------------------
1889+
// DecomposeHWIntrinsicToScalar: Decompose GT_HWINTRINSIC -- NI_Vector*_ToScalar
// with a long base type into two int-typed halves.
1890+
//
1891+
// create:
1892+
//
1893+
// tmp_simd_var = simd_var
1894+
// lo_result = GT_HWINTRINSIC{ToScalar}[int](tmp_simd_var)
1895+
// hi_result = GT_HWINTRINSIC{GetElement}[int](tmp_simd_var, 1)   (when SSE4.1 can be used)
1896+
// - or, on the SSE2-only fallback path -
1897+
// GT_HWINTRINSIC{ToScalar}[int](GT_RSZ(tmp_simd_var, 32))
1898+
// return: GT_LONG(lo_result, hi_result)
1899+
//
1900+
// Arguments:
1901+
// use - the LIR::Use object for the def that needs to be decomposed.
1902+
// node - the hwintrinsic node to decompose
1903+
//
1904+
// Return Value:
1905+
// The GT_LONG node wrapping the upper and lower halves.
1906+
//
1907+
GenTree* DecomposeLongs::DecomposeHWIntrinsicToScalar(LIR::Use& use, GenTreeHWIntrinsic* node)
1908+
{
1909+
assert(node == use.Def());
1910+
assert(varTypeIsLong(node));
1911+
1912+
GenTree* op1 = node->Op(1);
1913+
NamedIntrinsic intrinsicId = node->GetHWIntrinsicId();
1914+
var_types simdBaseType = node->GetSimdBaseType();
1915+
unsigned simdSize = node->GetSimdSize();
1916+
1917+
assert(varTypeIsLong(simdBaseType));
1918+
assert(varTypeIsSIMD(op1));
1919+
assert((intrinsicId == NI_Vector128_ToScalar) || (intrinsicId == NI_Vector256_ToScalar) ||
1920+
(intrinsicId == NI_Vector512_ToScalar));
1921+
// Put the SIMD operand in a temp local: it is read twice below, once per 32-bit half.
1922+
GenTree* simdTmpVar = RepresentOpAsLocalVar(op1, node, &node->Op(1));
1923+
unsigned simdTmpVarNum = simdTmpVar->AsLclVarCommon()->GetLclNum();
1924+
JITDUMP("[DecomposeHWIntrinsicToScalar]: Saving op1 tree to a temp var:\n");
1925+
DISPTREERANGE(Range(), simdTmpVar);
1926+
// Low half: the same ToScalar intrinsic, but typed TYP_INT with an int SIMD base type.
1927+
GenTreeHWIntrinsic* loResult =
1928+
m_compiler->gtNewSimdHWIntrinsicNode(TYP_INT, simdTmpVar, intrinsicId, CORINFO_TYPE_INT, simdSize);
1929+
Range().InsertAfter(simdTmpVar, loResult);
1930+
// Create a second use of the temp local to feed the high-half computation.
1931+
simdTmpVar = m_compiler->gtNewLclLNode(simdTmpVarNum, simdTmpVar->TypeGet());
1932+
Range().InsertAfter(loResult, simdTmpVar);
1933+
// High half: GetElement(1) when SSE4.1 can be used, otherwise shift right by 32 and ToScalar.
1934+
GenTreeHWIntrinsic* hiResult;
1935+
if (m_compiler->compOpportunisticallyDependsOn(InstructionSet_SSE41))
1936+
{
1937+
NamedIntrinsic getElement = NI_Illegal;
1938+
switch (simdSize)
1939+
{
1940+
case 16:
1941+
getElement = NI_Vector128_GetElement;
1942+
break;
1943+
case 32:
1944+
getElement = NI_Vector256_GetElement;
1945+
break;
1946+
case 64:
1947+
getElement = NI_Vector512_GetElement;
1948+
break;
1949+
default:
1950+
unreached();
1951+
}
1952+
// Element 1 of the vector, viewed with an int base type, provides hi_result.
1953+
GenTree* one = m_compiler->gtNewIconNode(1);
1954+
hiResult =
1955+
m_compiler->gtNewSimdHWIntrinsicNode(TYP_INT, simdTmpVar, one, getElement, CORINFO_TYPE_INT, simdSize);
1956+
1957+
Range().InsertAfter(simdTmpVar, one, hiResult);
1958+
}
1959+
else
1960+
{
1961+
assert(m_compiler->compIsaSupportedDebugOnly(InstructionSet_SSE2));
1962+
// Shift using the node's original (long) base type, then read the result with an int-typed ToScalar.
1963+
GenTree* thirtyTwo = m_compiler->gtNewIconNode(32);
1964+
GenTree* shift = m_compiler->gtNewSimdBinOpNode(GT_RSZ, op1->TypeGet(), simdTmpVar, thirtyTwo,
1965+
node->GetSimdBaseJitType(), simdSize);
1966+
hiResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_INT, shift, intrinsicId, CORINFO_TYPE_INT, simdSize);
1967+
1968+
Range().InsertAfter(simdTmpVar, thirtyTwo, shift, hiResult);
1969+
}
1970+
// The original long-typed node is no longer needed; loResult/hiResult take its place.
1971+
Range().Remove(node);
1972+
1973+
return FinalizeDecomposition(use, loResult, hiResult, hiResult);
1974+
}
1975+
18381976
//------------------------------------------------------------------------
18391977
// DecomposeHWIntrinsicMoveMask: Decompose GT_HWINTRINSIC -- NI_EVEX_MoveMask
18401978
//
@@ -2262,6 +2400,13 @@ void DecomposeLongs::TryPromoteLongVar(unsigned lclNum)
22622400
{
22632401
return;
22642402
}
2403+
#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86)
2404+
if (varDsc->lvIsParam)
2405+
{
2406+
// Promotion blocks combined read optimizations for SIMD loads of long params
2407+
return;
2408+
}
2409+
#endif // FEATURE_HW_INTRINSICS && TARGET_X86
22652410

22662411
varDsc->lvFieldCnt = 2;
22672412
varDsc->lvFieldLclStart = m_compiler->lvaCount;

src/coreclr/jit/decomposelongs.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -64,6 +64,7 @@ class DecomposeLongs
6464
#ifdef FEATURE_HW_INTRINSICS
6565
GenTree* DecomposeHWIntrinsic(LIR::Use& use);
6666
GenTree* DecomposeHWIntrinsicGetElement(LIR::Use& use, GenTreeHWIntrinsic* node);
67+
GenTree* DecomposeHWIntrinsicToScalar(LIR::Use& use, GenTreeHWIntrinsic* node);
6768
GenTree* DecomposeHWIntrinsicMoveMask(LIR::Use& use, GenTreeHWIntrinsic* node);
6869
#endif // FEATURE_HW_INTRINSICS
6970

src/coreclr/jit/emitxarch.cpp

Lines changed: 15 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -20611,22 +20611,31 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
2061120611
break;
2061220612

2061320613
case INS_movd:
20614-
case INS_movq: // only MOVQ xmm, xmm is different (emitted by Sse2.MoveScalar, should use MOVDQU instead)
20614+
case INS_movq:
2061520615
if (memAccessKind == PERFSCORE_MEMORY_NONE)
2061620616
{
20617-
// movd r32, xmm or xmm, r32
20618-
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
20619-
result.insLatency = PERFSCORE_LATENCY_3C;
20617+
if (isFloatReg(id->idReg1()) && isFloatReg(id->idReg2()))
20618+
{
20619+
// movq xmm, xmm
20620+
result.insThroughput = PERFSCORE_THROUGHPUT_3X;
20621+
result.insLatency = PERFSCORE_LATENCY_1C;
20622+
}
20623+
else
20624+
{
20625+
// movd r32/64, xmm or xmm, r32/64
20626+
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
20627+
result.insLatency = PERFSCORE_LATENCY_3C;
20628+
}
2062020629
}
2062120630
else if (memAccessKind == PERFSCORE_MEMORY_READ)
2062220631
{
20623-
// movd xmm, m32
20632+
// ins xmm, m32/64
2062420633
result.insThroughput = PERFSCORE_THROUGHPUT_2X;
2062520634
result.insLatency += PERFSCORE_LATENCY_2C;
2062620635
}
2062720636
else
2062820637
{
20629-
// movd m32, xmm
20638+
// ins m32/64, xmm
2063020639
assert(memAccessKind == PERFSCORE_MEMORY_WRITE);
2063120640
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
2063220641
result.insLatency += PERFSCORE_LATENCY_2C;

0 commit comments

Comments
 (0)