@@ -78,19 +78,19 @@ void DecomposeLongs::DecomposeBlock(BasicBlock* block)
78
78
// Return Value:
79
79
// None.
80
80
//
81
- void DecomposeLongs::DecomposeRange (Compiler* compiler, LIR::Range& range)
81
+ void DecomposeLongs::DecomposeRange (Compiler* compiler, Lowering* lowering, LIR::Range& range)
82
82
{
83
83
assert (compiler != nullptr );
84
84
85
- DecomposeLongs decomposer (compiler);
85
+ DecomposeLongs decomposer (compiler, lowering );
86
86
decomposer.m_range = &range;
87
87
88
88
decomposer.DecomposeRangeHelper ();
89
89
}
90
90
91
91
// ------------------------------------------------------------------------
92
92
// DecomposeLongs::DecomposeRangeHelper:
93
- // Decompiose each node in the current range.
93
+ // Decompose each node in the current range.
94
94
//
95
95
// Decomposition is done as an execution-order walk. Decomposition of
96
96
// a particular node can create new nodes that need to be further
@@ -122,44 +122,76 @@ void DecomposeLongs::DecomposeRangeHelper()
122
122
GenTree* DecomposeLongs::DecomposeNode (GenTree* tree)
123
123
{
124
124
// Handle the case where we are implicitly using the lower half of a long lclVar.
125
- if (( tree->TypeGet () == TYP_INT) && tree->OperIsLocal ())
125
+ if (tree->TypeIs ( TYP_INT) && tree->OperIsLocal ())
126
126
{
127
127
LclVarDsc* varDsc = m_compiler->lvaGetDesc (tree->AsLclVarCommon ());
128
128
if (varTypeIsLong (varDsc) && varDsc->lvPromoted )
129
129
{
130
- #ifdef DEBUG
131
- if (m_compiler->verbose )
132
- {
133
- printf (" Changing implicit reference to lo half of long lclVar to an explicit reference of its promoted "
134
- " half:\n " );
135
- m_compiler->gtDispTreeRange (Range (), tree);
136
- }
137
- #endif // DEBUG
130
+ JITDUMP (" Changing implicit reference to lo half of long lclVar to an explicit reference of its promoted "
131
+ " half:\n " );
132
+ DISPTREERANGE (Range (), tree);
133
+
138
134
unsigned loVarNum = varDsc->lvFieldLclStart ;
139
135
tree->AsLclVarCommon ()->SetLclNum (loVarNum);
140
136
return tree->gtNext ;
141
137
}
142
138
}
143
139
144
- if (tree->TypeGet () != TYP_LONG)
140
+ if (! tree->TypeIs ( TYP_LONG) )
145
141
{
146
142
return tree->gtNext ;
147
143
}
148
144
149
- #ifdef DEBUG
150
- if (m_compiler->verbose )
151
- {
152
- printf (" Decomposing TYP_LONG tree. BEFORE:\n " );
153
- m_compiler->gtDispTreeRange (Range (), tree);
154
- }
155
- #endif // DEBUG
156
-
157
145
LIR::Use use;
158
146
if (!Range ().TryGetUse (tree, &use))
159
147
{
160
148
LIR::Use::MakeDummyUse (Range (), tree, &use);
161
149
}
162
150
151
+ #if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86)
152
+ if (!use.IsDummyUse ())
153
+ {
154
+ // HWIntrinsics can consume/produce a long directly, provided its source/target is memory.
155
+ // Here we do a conservative check for specific cases where it is certain the load/store
156
+ // can be contained. In those cases, we can skip decomposition.
157
+
158
+ GenTree* user = use.User ();
159
+
160
+ if (user->OperIsHWIntrinsic ())
161
+ {
162
+ if (tree->OperIs (GT_CNS_LNG) ||
163
+ (tree->OperIs (GT_IND, GT_LCL_FLD) && m_lowering->IsSafeToContainMem (user, tree)))
164
+ {
165
+ NamedIntrinsic intrinsicId = user->AsHWIntrinsic ()->GetHWIntrinsicId ();
166
+ assert (HWIntrinsicInfo::IsVectorCreate (intrinsicId) ||
167
+ HWIntrinsicInfo::IsVectorCreateScalar (intrinsicId) ||
168
+ HWIntrinsicInfo::IsVectorCreateScalarUnsafe (intrinsicId));
169
+
170
+ return tree->gtNext ;
171
+ }
172
+ }
173
+ else if (user->OperIs (GT_STOREIND) && tree->OperIsHWIntrinsic () && m_compiler->opts .OptimizationEnabled ())
174
+ {
175
+ NamedIntrinsic intrinsicId = tree->AsHWIntrinsic ()->GetHWIntrinsicId ();
176
+ if (HWIntrinsicInfo::IsVectorToScalar (intrinsicId) && m_lowering->IsSafeToContainMem (user, tree))
177
+ {
178
+ return tree->gtNext ;
179
+ }
180
+ }
181
+ }
182
+
183
+ if (tree->OperIs (GT_STOREIND) && tree->AsStoreInd ()->Data ()->OperIsHWIntrinsic ())
184
+ {
185
+ // We should only get here if we matched the second pattern above.
186
+ assert (HWIntrinsicInfo::IsVectorToScalar (tree->AsStoreInd ()->Data ()->AsHWIntrinsic ()->GetHWIntrinsicId ()));
187
+
188
+ return tree->gtNext ;
189
+ }
190
+ #endif // FEATURE_HW_INTRINSICS && TARGET_X86
191
+
192
+ JITDUMP (" Decomposing TYP_LONG tree. BEFORE:\n " );
193
+ DISPTREERANGE (Range (), tree);
194
+
163
195
GenTree* nextNode = nullptr ;
164
196
switch (tree->OperGet ())
165
197
{
@@ -270,19 +302,14 @@ GenTree* DecomposeLongs::DecomposeNode(GenTree* tree)
270
302
271
303
// If we replaced the argument to a GT_FIELD_LIST element with a GT_LONG node, split that field list
272
304
// element into two elements: one for each half of the GT_LONG.
273
- if (( use.Def ()->OperGet () == GT_LONG) && !use.IsDummyUse () && ( use.User ()->OperGet () == GT_FIELD_LIST))
305
+ if (use.Def ()->OperIs ( GT_LONG) && !use.IsDummyUse () && use.User ()->OperIs ( GT_FIELD_LIST))
274
306
{
275
307
DecomposeFieldList (use.User ()->AsFieldList (), use.Def ()->AsOp ());
276
308
}
277
309
278
- #ifdef DEBUG
279
- if (m_compiler->verbose )
280
- {
281
- // NOTE: st_lcl_var doesn't dump properly afterwards.
282
- printf (" Decomposing TYP_LONG tree. AFTER:\n " );
283
- m_compiler->gtDispTreeRange (Range (), use.Def ());
284
- }
285
- #endif
310
+ // NOTE: st_lcl_var doesn't dump properly afterwards.
311
+ JITDUMP (" Decomposing TYP_LONG tree. AFTER:\n " );
312
+ DISPTREERANGE (Range (), use.Def ());
286
313
287
314
// When casting from a decomposed long to a smaller integer we can discard the high part.
288
315
if (m_compiler->opts .OptimizationEnabled () && !use.IsDummyUse () && use.User ()->OperIs (GT_CAST) &&
@@ -1707,6 +1734,13 @@ GenTree* DecomposeLongs::DecomposeHWIntrinsic(LIR::Use& use)
1707
1734
return DecomposeHWIntrinsicGetElement (use, hwintrinsicTree);
1708
1735
}
1709
1736
1737
+ case NI_Vector128_ToScalar:
1738
+ case NI_Vector256_ToScalar:
1739
+ case NI_Vector512_ToScalar:
1740
+ {
1741
+ return DecomposeHWIntrinsicToScalar (use, hwintrinsicTree);
1742
+ }
1743
+
1710
1744
case NI_EVEX_MoveMask:
1711
1745
{
1712
1746
return DecomposeHWIntrinsicMoveMask (use, hwintrinsicTree);
@@ -1751,9 +1785,7 @@ GenTree* DecomposeLongs::DecomposeHWIntrinsicGetElement(LIR::Use& use, GenTreeHW
1751
1785
{
1752
1786
assert (node == use.Def ());
1753
1787
assert (varTypeIsLong (node));
1754
- assert ((node->GetHWIntrinsicId () == NI_Vector128_GetElement) ||
1755
- (node->GetHWIntrinsicId () == NI_Vector256_GetElement) ||
1756
- (node->GetHWIntrinsicId () == NI_Vector512_GetElement));
1788
+ assert (HWIntrinsicInfo::IsVectorGetElement (node->GetHWIntrinsicId ()));
1757
1789
1758
1790
GenTree* op1 = node->Op (1 );
1759
1791
GenTree* op2 = node->Op (2 );
@@ -1835,6 +1867,75 @@ GenTree* DecomposeLongs::DecomposeHWIntrinsicGetElement(LIR::Use& use, GenTreeHW
1835
1867
return FinalizeDecomposition (use, loResult, hiResult, hiResult);
1836
1868
}
1837
1869
1870
+ // ------------------------------------------------------------------------
1871
+ // DecomposeHWIntrinsicToScalar: Decompose GT_HWINTRINSIC -- NI_Vector*_ToScalar.
1872
+ //
1873
+ // create:
1874
+ //
1875
+ // tmp_simd_var = simd_var
1876
+ // lo_result = GT_HWINTRINSIC{ToScalar}[int](tmp_simd_var)
1877
+ // hi_result = GT_HWINTRINSIC{GetElement}[int](tmp_simd_var, 1)
1878
+ // - or -
1879
+ // GT_HWINTRINSIC{ToScalar}[int](GT_RSZ(tmp_simd_var, 32))
1880
+ // return: GT_LONG(lo_result, hi_result)
1881
+ //
1882
+ // Arguments:
1883
+ // use - the LIR::Use object for the def that needs to be decomposed.
1884
+ // node - the hwintrinsic node to decompose
1885
+ //
1886
+ // Return Value:
1887
+ // The GT_LONG node wrapping the upper and lower halves.
1888
+ //
1889
+ GenTree* DecomposeLongs::DecomposeHWIntrinsicToScalar (LIR::Use& use, GenTreeHWIntrinsic* node)
1890
+ {
1891
+ assert (node == use.Def ());
1892
+ assert (varTypeIsLong (node));
1893
+ assert (HWIntrinsicInfo::IsVectorToScalar (node->GetHWIntrinsicId ()));
1894
+
1895
+ GenTree* op1 = node->Op (1 );
1896
+ NamedIntrinsic intrinsicId = node->GetHWIntrinsicId ();
1897
+ var_types simdBaseType = node->GetSimdBaseType ();
1898
+ unsigned simdSize = node->GetSimdSize ();
1899
+
1900
+ assert (varTypeIsLong (simdBaseType));
1901
+ assert (varTypeIsSIMD (op1));
1902
+
1903
+ GenTree* simdTmpVar = RepresentOpAsLocalVar (op1, node, &node->Op (1 ));
1904
+ unsigned simdTmpVarNum = simdTmpVar->AsLclVarCommon ()->GetLclNum ();
1905
+ JITDUMP (" [DecomposeHWIntrinsicToScalar]: Saving op1 tree to a temp var:\n " );
1906
+ DISPTREERANGE (Range (), simdTmpVar);
1907
+
1908
+ GenTree* loResult = m_compiler->gtNewSimdToScalarNode (TYP_INT, simdTmpVar, CORINFO_TYPE_INT, simdSize);
1909
+ Range ().InsertAfter (simdTmpVar, loResult);
1910
+
1911
+ simdTmpVar = m_compiler->gtNewLclLNode (simdTmpVarNum, simdTmpVar->TypeGet ());
1912
+ Range ().InsertAfter (loResult, simdTmpVar);
1913
+
1914
+ GenTree* hiResult;
1915
+ if (m_compiler->compOpportunisticallyDependsOn (InstructionSet_SSE41))
1916
+ {
1917
+ GenTree* one = m_compiler->gtNewIconNode (1 );
1918
+ hiResult = m_compiler->gtNewSimdGetElementNode (TYP_INT, simdTmpVar, one, CORINFO_TYPE_INT, simdSize);
1919
+
1920
+ Range ().InsertAfter (simdTmpVar, one, hiResult);
1921
+ }
1922
+ else
1923
+ {
1924
+ assert (m_compiler->compIsaSupportedDebugOnly (InstructionSet_SSE2));
1925
+
1926
+ GenTree* thirtyTwo = m_compiler->gtNewIconNode (32 );
1927
+ GenTree* shift = m_compiler->gtNewSimdBinOpNode (GT_RSZ, op1->TypeGet (), simdTmpVar, thirtyTwo,
1928
+ node->GetSimdBaseJitType (), simdSize);
1929
+ hiResult = m_compiler->gtNewSimdToScalarNode (TYP_INT, shift, CORINFO_TYPE_INT, simdSize);
1930
+
1931
+ Range ().InsertAfter (simdTmpVar, thirtyTwo, shift, hiResult);
1932
+ }
1933
+
1934
+ Range ().Remove (node);
1935
+
1936
+ return FinalizeDecomposition (use, loResult, hiResult, hiResult);
1937
+ }
1938
+
1838
1939
// ------------------------------------------------------------------------
1839
1940
// DecomposeHWIntrinsicMoveMask: Decompose GT_HWINTRINSIC -- NI_EVEX_MoveMask
1840
1941
//
@@ -2262,6 +2363,13 @@ void DecomposeLongs::TryPromoteLongVar(unsigned lclNum)
2262
2363
{
2263
2364
return ;
2264
2365
}
2366
+ #if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86)
2367
+ if (varDsc->lvIsParam )
2368
+ {
2369
+ // Promotion blocks combined read optimizations for SIMD loads of long params
2370
+ return ;
2371
+ }
2372
+ #endif // FEATURE_HW_INTRINSICS && TARGET_X86
2265
2373
2266
2374
varDsc->lvFieldCnt = 2 ;
2267
2375
varDsc->lvFieldLclStart = m_compiler->lvaCount ;
0 commit comments