Skip to content

Commit 16236fd

Browse files
authored
JIT: Unblock Vector###<long> intrinsics on x86 (#112728)
* unblock long xplat intrinsics on x86
* tidying
* tidying2
* remove CreateScalarUnsafe opt for small loads
* skip more redundant casts for CreateScalar of small types
* use temp reg for CreateScalar float SSE fallback
* formatting patch
* simplify storeind containment of ToScalar
* don't use temp reg for CreateScalar float SSE fallback
* skip cast on other memory loads
* use proper containment check
* add more validation, remove CreateSequence restriction
* use appropriate helpers for decomposing ToScalar
1 parent 67c10c8 commit 16236fd

16 files changed

+667
-807
lines changed

src/coreclr/jit/codegenxarch.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5685,6 +5685,13 @@ void CodeGen::genCodeForStoreInd(GenTreeStoreInd* tree)
56855685
// These intrinsics are "ins reg/mem, xmm"
56865686
ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
56875687
attr = emitActualTypeSize(baseType);
5688+
#if defined(TARGET_X86)
5689+
if (varTypeIsLong(baseType))
5690+
{
5691+
ins = INS_movq;
5692+
attr = EA_8BYTE;
5693+
}
5694+
#endif // TARGET_X86
56885695
break;
56895696
}
56905697

src/coreclr/jit/decomposelongs.cpp

Lines changed: 141 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -78,19 +78,19 @@ void DecomposeLongs::DecomposeBlock(BasicBlock* block)
7878
// Return Value:
7979
// None.
8080
//
81-
void DecomposeLongs::DecomposeRange(Compiler* compiler, LIR::Range& range)
81+
void DecomposeLongs::DecomposeRange(Compiler* compiler, Lowering* lowering, LIR::Range& range)
8282
{
8383
assert(compiler != nullptr);
8484

85-
DecomposeLongs decomposer(compiler);
85+
DecomposeLongs decomposer(compiler, lowering);
8686
decomposer.m_range = &range;
8787

8888
decomposer.DecomposeRangeHelper();
8989
}
9090

9191
//------------------------------------------------------------------------
9292
// DecomposeLongs::DecomposeRangeHelper:
93-
// Decompiose each node in the current range.
93+
// Decompose each node in the current range.
9494
//
9595
// Decomposition is done as an execution-order walk. Decomposition of
9696
// a particular node can create new nodes that need to be further
@@ -122,44 +122,76 @@ void DecomposeLongs::DecomposeRangeHelper()
122122
GenTree* DecomposeLongs::DecomposeNode(GenTree* tree)
123123
{
124124
// Handle the case where we are implicitly using the lower half of a long lclVar.
125-
if ((tree->TypeGet() == TYP_INT) && tree->OperIsLocal())
125+
if (tree->TypeIs(TYP_INT) && tree->OperIsLocal())
126126
{
127127
LclVarDsc* varDsc = m_compiler->lvaGetDesc(tree->AsLclVarCommon());
128128
if (varTypeIsLong(varDsc) && varDsc->lvPromoted)
129129
{
130-
#ifdef DEBUG
131-
if (m_compiler->verbose)
132-
{
133-
printf("Changing implicit reference to lo half of long lclVar to an explicit reference of its promoted "
134-
"half:\n");
135-
m_compiler->gtDispTreeRange(Range(), tree);
136-
}
137-
#endif // DEBUG
130+
JITDUMP("Changing implicit reference to lo half of long lclVar to an explicit reference of its promoted "
131+
"half:\n");
132+
DISPTREERANGE(Range(), tree);
133+
138134
unsigned loVarNum = varDsc->lvFieldLclStart;
139135
tree->AsLclVarCommon()->SetLclNum(loVarNum);
140136
return tree->gtNext;
141137
}
142138
}
143139

144-
if (tree->TypeGet() != TYP_LONG)
140+
if (!tree->TypeIs(TYP_LONG))
145141
{
146142
return tree->gtNext;
147143
}
148144

149-
#ifdef DEBUG
150-
if (m_compiler->verbose)
151-
{
152-
printf("Decomposing TYP_LONG tree. BEFORE:\n");
153-
m_compiler->gtDispTreeRange(Range(), tree);
154-
}
155-
#endif // DEBUG
156-
157145
LIR::Use use;
158146
if (!Range().TryGetUse(tree, &use))
159147
{
160148
LIR::Use::MakeDummyUse(Range(), tree, &use);
161149
}
162150

151+
#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86)
152+
if (!use.IsDummyUse())
153+
{
154+
// HWIntrinsics can consume/produce a long directly, provided its source/target is memory.
155+
// Here we do a conservative check for specific cases where it is certain the load/store
156+
// can be contained. In those cases, we can skip decomposition.
157+
158+
GenTree* user = use.User();
159+
160+
if (user->OperIsHWIntrinsic())
161+
{
162+
if (tree->OperIs(GT_CNS_LNG) ||
163+
(tree->OperIs(GT_IND, GT_LCL_FLD) && m_lowering->IsSafeToContainMem(user, tree)))
164+
{
165+
NamedIntrinsic intrinsicId = user->AsHWIntrinsic()->GetHWIntrinsicId();
166+
assert(HWIntrinsicInfo::IsVectorCreate(intrinsicId) ||
167+
HWIntrinsicInfo::IsVectorCreateScalar(intrinsicId) ||
168+
HWIntrinsicInfo::IsVectorCreateScalarUnsafe(intrinsicId));
169+
170+
return tree->gtNext;
171+
}
172+
}
173+
else if (user->OperIs(GT_STOREIND) && tree->OperIsHWIntrinsic() && m_compiler->opts.OptimizationEnabled())
174+
{
175+
NamedIntrinsic intrinsicId = tree->AsHWIntrinsic()->GetHWIntrinsicId();
176+
if (HWIntrinsicInfo::IsVectorToScalar(intrinsicId) && m_lowering->IsSafeToContainMem(user, tree))
177+
{
178+
return tree->gtNext;
179+
}
180+
}
181+
}
182+
183+
if (tree->OperIs(GT_STOREIND) && tree->AsStoreInd()->Data()->OperIsHWIntrinsic())
184+
{
185+
// We should only get here if we matched the second pattern above.
186+
assert(HWIntrinsicInfo::IsVectorToScalar(tree->AsStoreInd()->Data()->AsHWIntrinsic()->GetHWIntrinsicId()));
187+
188+
return tree->gtNext;
189+
}
190+
#endif // FEATURE_HW_INTRINSICS && TARGET_X86
191+
192+
JITDUMP("Decomposing TYP_LONG tree. BEFORE:\n");
193+
DISPTREERANGE(Range(), tree);
194+
163195
GenTree* nextNode = nullptr;
164196
switch (tree->OperGet())
165197
{
@@ -270,19 +302,14 @@ GenTree* DecomposeLongs::DecomposeNode(GenTree* tree)
270302

271303
// If we replaced the argument to a GT_FIELD_LIST element with a GT_LONG node, split that field list
272304
// element into two elements: one for each half of the GT_LONG.
273-
if ((use.Def()->OperGet() == GT_LONG) && !use.IsDummyUse() && (use.User()->OperGet() == GT_FIELD_LIST))
305+
if (use.Def()->OperIs(GT_LONG) && !use.IsDummyUse() && use.User()->OperIs(GT_FIELD_LIST))
274306
{
275307
DecomposeFieldList(use.User()->AsFieldList(), use.Def()->AsOp());
276308
}
277309

278-
#ifdef DEBUG
279-
if (m_compiler->verbose)
280-
{
281-
// NOTE: st_lcl_var doesn't dump properly afterwards.
282-
printf("Decomposing TYP_LONG tree. AFTER:\n");
283-
m_compiler->gtDispTreeRange(Range(), use.Def());
284-
}
285-
#endif
310+
// NOTE: st_lcl_var doesn't dump properly afterwards.
311+
JITDUMP("Decomposing TYP_LONG tree. AFTER:\n");
312+
DISPTREERANGE(Range(), use.Def());
286313

287314
// When casting from a decomposed long to a smaller integer we can discard the high part.
288315
if (m_compiler->opts.OptimizationEnabled() && !use.IsDummyUse() && use.User()->OperIs(GT_CAST) &&
@@ -1707,6 +1734,13 @@ GenTree* DecomposeLongs::DecomposeHWIntrinsic(LIR::Use& use)
17071734
return DecomposeHWIntrinsicGetElement(use, hwintrinsicTree);
17081735
}
17091736

1737+
case NI_Vector128_ToScalar:
1738+
case NI_Vector256_ToScalar:
1739+
case NI_Vector512_ToScalar:
1740+
{
1741+
return DecomposeHWIntrinsicToScalar(use, hwintrinsicTree);
1742+
}
1743+
17101744
case NI_EVEX_MoveMask:
17111745
{
17121746
return DecomposeHWIntrinsicMoveMask(use, hwintrinsicTree);
@@ -1751,9 +1785,7 @@ GenTree* DecomposeLongs::DecomposeHWIntrinsicGetElement(LIR::Use& use, GenTreeHW
17511785
{
17521786
assert(node == use.Def());
17531787
assert(varTypeIsLong(node));
1754-
assert((node->GetHWIntrinsicId() == NI_Vector128_GetElement) ||
1755-
(node->GetHWIntrinsicId() == NI_Vector256_GetElement) ||
1756-
(node->GetHWIntrinsicId() == NI_Vector512_GetElement));
1788+
assert(HWIntrinsicInfo::IsVectorGetElement(node->GetHWIntrinsicId()));
17571789

17581790
GenTree* op1 = node->Op(1);
17591791
GenTree* op2 = node->Op(2);
@@ -1835,6 +1867,75 @@ GenTree* DecomposeLongs::DecomposeHWIntrinsicGetElement(LIR::Use& use, GenTreeHW
18351867
return FinalizeDecomposition(use, loResult, hiResult, hiResult);
18361868
}
18371869

1870+
//------------------------------------------------------------------------
1871+
// DecomposeHWIntrinsicToScalar: Decompose a long-typed GT_HWINTRINSIC -- NI_Vector*_ToScalar.
1872+
//
1873+
// create (the SIMD operand is first saved to a temp local so it can be read twice):
1874+
//
1875+
// tmp_simd_var = simd_var
1876+
// lo_result = GT_HWINTRINSIC{ToScalar}[int](tmp_simd_var)
1877+
// hi_result = GT_HWINTRINSIC{GetElement}[int](tmp_simd_var, 1) (when SSE4.1 can be used)
1878+
// - or, as the fallback when SSE4.1 is not available (SSE2 is asserted) -
1879+
// GT_HWINTRINSIC{ToScalar}[int](GT_RSZ(tmp_simd_var, 32))
1880+
// return: GT_LONG(lo_result, hi_result)
1881+
//
1882+
// Arguments:
1883+
// use - the LIR::Use object for the def that needs to be decomposed; its def must be 'node'.
1884+
// node - the hwintrinsic node to decompose; asserted to be a long-typed Vector*_ToScalar with a SIMD operand
1885+
//
1886+
// Return Value:
1887+
// The GT_LONG node wrapping the upper and lower halves.
1888+
//
1889+
GenTree* DecomposeLongs::DecomposeHWIntrinsicToScalar(LIR::Use& use, GenTreeHWIntrinsic* node)
1890+
{
1891+
assert(node == use.Def());
1892+
assert(varTypeIsLong(node));
1893+
assert(HWIntrinsicInfo::IsVectorToScalar(node->GetHWIntrinsicId()));
1894+
1895+
GenTree* op1 = node->Op(1);
1896+
NamedIntrinsic intrinsicId = node->GetHWIntrinsicId();
1897+
var_types simdBaseType = node->GetSimdBaseType();
1898+
unsigned simdSize = node->GetSimdSize();
1899+
1900+
assert(varTypeIsLong(simdBaseType));
1901+
assert(varTypeIsSIMD(op1));
1902+
1903+
GenTree* simdTmpVar = RepresentOpAsLocalVar(op1, node, &node->Op(1));
1904+
unsigned simdTmpVarNum = simdTmpVar->AsLclVarCommon()->GetLclNum();
1905+
JITDUMP("[DecomposeHWIntrinsicToScalar]: Saving op1 tree to a temp var:\n");
1906+
DISPTREERANGE(Range(), simdTmpVar);
1907+
1908+
GenTree* loResult = m_compiler->gtNewSimdToScalarNode(TYP_INT, simdTmpVar, CORINFO_TYPE_INT, simdSize);
1909+
Range().InsertAfter(simdTmpVar, loResult);
1910+
1911+
simdTmpVar = m_compiler->gtNewLclLNode(simdTmpVarNum, simdTmpVar->TypeGet());
1912+
Range().InsertAfter(loResult, simdTmpVar);
1913+
1914+
GenTree* hiResult;
1915+
if (m_compiler->compOpportunisticallyDependsOn(InstructionSet_SSE41))
1916+
{
1917+
GenTree* one = m_compiler->gtNewIconNode(1);
1918+
hiResult = m_compiler->gtNewSimdGetElementNode(TYP_INT, simdTmpVar, one, CORINFO_TYPE_INT, simdSize);
1919+
1920+
Range().InsertAfter(simdTmpVar, one, hiResult);
1921+
}
1922+
else
1923+
{
1924+
assert(m_compiler->compIsaSupportedDebugOnly(InstructionSet_SSE2));
1925+
1926+
GenTree* thirtyTwo = m_compiler->gtNewIconNode(32);
1927+
GenTree* shift = m_compiler->gtNewSimdBinOpNode(GT_RSZ, op1->TypeGet(), simdTmpVar, thirtyTwo,
1928+
node->GetSimdBaseJitType(), simdSize);
1929+
hiResult = m_compiler->gtNewSimdToScalarNode(TYP_INT, shift, CORINFO_TYPE_INT, simdSize);
1930+
1931+
Range().InsertAfter(simdTmpVar, thirtyTwo, shift, hiResult);
1932+
}
1933+
1934+
Range().Remove(node);
1935+
1936+
return FinalizeDecomposition(use, loResult, hiResult, hiResult);
1937+
}
1938+
18381939
//------------------------------------------------------------------------
18391940
// DecomposeHWIntrinsicMoveMask: Decompose GT_HWINTRINSIC -- NI_EVEX_MoveMask
18401941
//
@@ -2262,6 +2363,13 @@ void DecomposeLongs::TryPromoteLongVar(unsigned lclNum)
22622363
{
22632364
return;
22642365
}
2366+
#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86)
2367+
if (varDsc->lvIsParam)
2368+
{
2369+
// Promotion blocks combined read optimizations for SIMD loads of long params
2370+
return;
2371+
}
2372+
#endif // FEATURE_HW_INTRINSICS && TARGET_X86
22652373

22662374
varDsc->lvFieldCnt = 2;
22672375
varDsc->lvFieldLclStart = m_compiler->lvaCount;

src/coreclr/jit/decomposelongs.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,19 +14,21 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
1414
#define _DECOMPOSELONGS_H_
1515

1616
#include "compiler.h"
17+
#include "lower.h"
1718

1819
class DecomposeLongs
1920
{
2021
public:
21-
DecomposeLongs(Compiler* compiler)
22+
DecomposeLongs(Compiler* compiler, Lowering* lowering)
2223
: m_compiler(compiler)
24+
, m_lowering(lowering)
2325
{
2426
}
2527

2628
void PrepareForDecomposition();
2729
void DecomposeBlock(BasicBlock* block);
2830

29-
static void DecomposeRange(Compiler* compiler, LIR::Range& range);
31+
static void DecomposeRange(Compiler* compiler, Lowering* lowering, LIR::Range& range);
3032

3133
private:
3234
inline LIR::Range& Range() const
@@ -64,6 +66,7 @@ class DecomposeLongs
6466
#ifdef FEATURE_HW_INTRINSICS
6567
GenTree* DecomposeHWIntrinsic(LIR::Use& use);
6668
GenTree* DecomposeHWIntrinsicGetElement(LIR::Use& use, GenTreeHWIntrinsic* node);
69+
GenTree* DecomposeHWIntrinsicToScalar(LIR::Use& use, GenTreeHWIntrinsic* node);
6770
GenTree* DecomposeHWIntrinsicMoveMask(LIR::Use& use, GenTreeHWIntrinsic* node);
6871
#endif // FEATURE_HW_INTRINSICS
6972

@@ -80,6 +83,7 @@ class DecomposeLongs
8083

8184
// Data
8285
Compiler* m_compiler;
86+
Lowering* m_lowering;
8387
LIR::Range* m_range;
8488
};
8589

src/coreclr/jit/emitxarch.cpp

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20772,22 +20772,31 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
2077220772
break;
2077320773

2077420774
case INS_movd:
20775-
case INS_movq: // only MOVQ xmm, xmm is different (emitted by Sse2.MoveScalar, should use MOVDQU instead)
20775+
case INS_movq:
2077620776
if (memAccessKind == PERFSCORE_MEMORY_NONE)
2077720777
{
20778-
// movd r32, xmm or xmm, r32
20779-
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
20780-
result.insLatency = PERFSCORE_LATENCY_3C;
20778+
if (isFloatReg(id->idReg1()) && isFloatReg(id->idReg2()))
20779+
{
20780+
// movq xmm, xmm
20781+
result.insThroughput = PERFSCORE_THROUGHPUT_3X;
20782+
result.insLatency = PERFSCORE_LATENCY_1C;
20783+
}
20784+
else
20785+
{
20786+
// movd r32/64, xmm or xmm, r32/64
20787+
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
20788+
result.insLatency = PERFSCORE_LATENCY_3C;
20789+
}
2078120790
}
2078220791
else if (memAccessKind == PERFSCORE_MEMORY_READ)
2078320792
{
20784-
// movd xmm, m32
20793+
// ins xmm, m32/64
2078520794
result.insThroughput = PERFSCORE_THROUGHPUT_2X;
2078620795
result.insLatency += PERFSCORE_LATENCY_2C;
2078720796
}
2078820797
else
2078920798
{
20790-
// movd m32, xmm
20799+
// ins m32/64, xmm
2079120800
assert(memAccessKind == PERFSCORE_MEMORY_WRITE);
2079220801
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
2079320802
result.insLatency += PERFSCORE_LATENCY_2C;

0 commit comments

Comments
 (0)