@@ -90,7 +90,7 @@ void DecomposeLongs::DecomposeRange(Compiler* compiler, LIR::Range& range)
90
90
91
91
// ------------------------------------------------------------------------
92
92
// DecomposeLongs::DecomposeRangeHelper:
93
- // Decompiose each node in the current range.
93
+ // Decompose each node in the current range.
94
94
//
95
95
// Decomposition is done as an execution-order walk. Decomposition of
96
96
// a particular node can create new nodes that need to be further
@@ -122,44 +122,92 @@ void DecomposeLongs::DecomposeRangeHelper()
122
122
GenTree* DecomposeLongs::DecomposeNode (GenTree* tree)
123
123
{
124
124
// Handle the case where we are implicitly using the lower half of a long lclVar.
125
- if (( tree->TypeGet () == TYP_INT) && tree->OperIsLocal ())
125
+ if (tree->TypeIs ( TYP_INT) && tree->OperIsLocal ())
126
126
{
127
127
LclVarDsc* varDsc = m_compiler->lvaGetDesc (tree->AsLclVarCommon ());
128
128
if (varTypeIsLong (varDsc) && varDsc->lvPromoted )
129
129
{
130
- #ifdef DEBUG
131
- if (m_compiler->verbose )
132
- {
133
- printf (" Changing implicit reference to lo half of long lclVar to an explicit reference of its promoted "
134
- " half:\n " );
135
- m_compiler->gtDispTreeRange (Range (), tree);
136
- }
137
- #endif // DEBUG
130
+ JITDUMP (" Changing implicit reference to lo half of long lclVar to an explicit reference of its promoted "
131
+ " half:\n " );
132
+ DISPTREERANGE (Range (), tree);
133
+
138
134
unsigned loVarNum = varDsc->lvFieldLclStart ;
139
135
tree->AsLclVarCommon ()->SetLclNum (loVarNum);
140
136
return tree->gtNext ;
141
137
}
142
138
}
143
139
144
- if (tree->TypeGet () != TYP_LONG)
140
+ if (! tree->TypeIs ( TYP_LONG) )
145
141
{
146
142
return tree->gtNext ;
147
143
}
148
144
149
- #ifdef DEBUG
150
- if (m_compiler->verbose )
151
- {
152
- printf (" Decomposing TYP_LONG tree. BEFORE:\n " );
153
- m_compiler->gtDispTreeRange (Range (), tree);
154
- }
155
- #endif // DEBUG
156
-
157
145
LIR::Use use;
158
146
if (!Range ().TryGetUse (tree, &use))
159
147
{
160
148
LIR::Use::MakeDummyUse (Range (), tree, &use);
161
149
}
162
150
151
+ #if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86)
152
+ if (!use.IsDummyUse ())
153
+ {
154
+ // HWIntrinsics can consume/produce a long directly, provided its source/target is memory.
155
+ // Here we do a conservative check for specific cases where it is certain the load/store
156
+ // can be contained. In those cases, we can skip decomposition.
157
+
158
+ GenTree* user = use.User ();
159
+
160
+ if (user->OperIsHWIntrinsic ())
161
+ {
162
+ if (tree->OperIs (GT_CNS_LNG) || (tree->OperIs (GT_IND, GT_LCL_FLD) && (user == tree->gtNext )))
163
+ {
164
+ return tree->gtNext ;
165
+ }
166
+ }
167
+ else if (user->OperIs (GT_STOREIND) && tree->OperIsHWIntrinsic () && m_compiler->opts .OptimizationEnabled ())
168
+ {
169
+ // We're looking for this common pattern, with operands in either order in the LIR sequence:
170
+ // t1 = * HWINTRINSIC long ToScalar
171
+ // t0 = LCL_VAR byref
172
+ // /--* t0 byref
173
+ // +--* t1 long
174
+ // * STOREIND long
175
+
176
+ GenTree* next = tree->gtNext ;
177
+ if ((user != next) && !m_compiler->gtTreeHasSideEffects (next, GTF_SIDE_EFFECT))
178
+ {
179
+ next = next->gtNext ;
180
+ }
181
+
182
+ if (user == next)
183
+ {
184
+ NamedIntrinsic intrinsic = tree->AsHWIntrinsic ()->GetHWIntrinsicId ();
185
+
186
+ if ((intrinsic == NI_Vector128_ToScalar) || (intrinsic == NI_Vector256_ToScalar) ||
187
+ (intrinsic == NI_Vector512_ToScalar))
188
+ {
189
+ return tree->gtNext ;
190
+ }
191
+ }
192
+ }
193
+ }
194
+
195
+ if (tree->OperIs (GT_STOREIND) && tree->AsStoreInd ()->Data ()->OperIsHWIntrinsic ())
196
+ {
197
+ #if DEBUG
198
+ // We should only get here if we matched the second pattern above.
199
+ NamedIntrinsic intrinsic = tree->AsStoreInd ()->Data ()->AsHWIntrinsic ()->GetHWIntrinsicId ();
200
+ assert ((intrinsic == NI_Vector128_ToScalar) || (intrinsic == NI_Vector256_ToScalar) ||
201
+ (intrinsic == NI_Vector512_ToScalar));
202
+ #endif // DEBUG
203
+
204
+ return tree->gtNext ;
205
+ }
206
+ #endif // FEATURE_HW_INTRINSICS && TARGET_X86
207
+
208
+ JITDUMP (" Decomposing TYP_LONG tree. BEFORE:\n " );
209
+ DISPTREERANGE (Range (), tree);
210
+
163
211
GenTree* nextNode = nullptr ;
164
212
switch (tree->OperGet ())
165
213
{
@@ -270,19 +318,14 @@ GenTree* DecomposeLongs::DecomposeNode(GenTree* tree)
270
318
271
319
// If we replaced the argument to a GT_FIELD_LIST element with a GT_LONG node, split that field list
272
320
// element into two elements: one for each half of the GT_LONG.
273
- if (( use.Def ()->OperGet () == GT_LONG) && !use.IsDummyUse () && ( use.User ()->OperGet () == GT_FIELD_LIST))
321
+ if (use.Def ()->OperIs ( GT_LONG) && !use.IsDummyUse () && use.User ()->OperIs ( GT_FIELD_LIST))
274
322
{
275
323
DecomposeFieldList (use.User ()->AsFieldList (), use.Def ()->AsOp ());
276
324
}
277
325
278
- #ifdef DEBUG
279
- if (m_compiler->verbose )
280
- {
281
- // NOTE: st_lcl_var doesn't dump properly afterwards.
282
- printf (" Decomposing TYP_LONG tree. AFTER:\n " );
283
- m_compiler->gtDispTreeRange (Range (), use.Def ());
284
- }
285
- #endif
326
+ // NOTE: st_lcl_var doesn't dump properly afterwards.
327
+ JITDUMP (" Decomposing TYP_LONG tree. AFTER:\n " );
328
+ DISPTREERANGE (Range (), use.Def ());
286
329
287
330
// When casting from a decomposed long to a smaller integer we can discard the high part.
288
331
if (m_compiler->opts .OptimizationEnabled () && !use.IsDummyUse () && use.User ()->OperIs (GT_CAST) &&
@@ -1707,6 +1750,13 @@ GenTree* DecomposeLongs::DecomposeHWIntrinsic(LIR::Use& use)
1707
1750
return DecomposeHWIntrinsicGetElement (use, hwintrinsicTree);
1708
1751
}
1709
1752
1753
+ case NI_Vector128_ToScalar:
1754
+ case NI_Vector256_ToScalar:
1755
+ case NI_Vector512_ToScalar:
1756
+ {
1757
+ return DecomposeHWIntrinsicToScalar (use, hwintrinsicTree);
1758
+ }
1759
+
1710
1760
case NI_EVEX_MoveMask:
1711
1761
{
1712
1762
return DecomposeHWIntrinsicMoveMask (use, hwintrinsicTree);
@@ -1835,6 +1885,94 @@ GenTree* DecomposeLongs::DecomposeHWIntrinsicGetElement(LIR::Use& use, GenTreeHW
1835
1885
return FinalizeDecomposition (use, loResult, hiResult, hiResult);
1836
1886
}
1837
1887
1888
+ // ------------------------------------------------------------------------
1889
+ // DecomposeHWIntrinsicToScalar: Decompose GT_HWINTRINSIC -- NI_Vector*_ToScalar.
1890
+ //
1891
+ // create:
1892
+ //
1893
+ // tmp_simd_var = simd_var
1894
+ // lo_result = GT_HWINTRINSIC{ToScalar}[int](tmp_simd_var)
1895
+ // hi_result = GT_HWINTRINSIC{GetElement}[int](tmp_simd_var, 1)
1896
+ // - or -
1897
+ // GT_HWINTRINSIC{ToScalar}[int](GT_RSZ(tmp_simd_var, 32))
1898
+ // return: GT_LONG(lo_result, hi_result)
1899
+ //
1900
+ // Arguments:
1901
+ // use - the LIR::Use object for the def that needs to be decomposed.
1902
+ // node - the hwintrinsic node to decompose
1903
+ //
1904
+ // Return Value:
1905
+ // The GT_LONG node wrapping the upper and lower halves.
1906
+ //
1907
+ GenTree* DecomposeLongs::DecomposeHWIntrinsicToScalar (LIR::Use& use, GenTreeHWIntrinsic* node)
1908
+ {
1909
+ assert (node == use.Def ());
1910
+ assert (varTypeIsLong (node));
1911
+
1912
+ GenTree* op1 = node->Op (1 );
1913
+ NamedIntrinsic intrinsicId = node->GetHWIntrinsicId ();
1914
+ var_types simdBaseType = node->GetSimdBaseType ();
1915
+ unsigned simdSize = node->GetSimdSize ();
1916
+
1917
+ assert (varTypeIsLong (simdBaseType));
1918
+ assert (varTypeIsSIMD (op1));
1919
+ assert ((intrinsicId == NI_Vector128_ToScalar) || (intrinsicId == NI_Vector256_ToScalar) ||
1920
+ (intrinsicId == NI_Vector512_ToScalar));
1921
+
1922
+ GenTree* simdTmpVar = RepresentOpAsLocalVar (op1, node, &node->Op (1 ));
1923
+ unsigned simdTmpVarNum = simdTmpVar->AsLclVarCommon ()->GetLclNum ();
1924
+ JITDUMP (" [DecomposeHWIntrinsicToScalar]: Saving op1 tree to a temp var:\n " );
1925
+ DISPTREERANGE (Range (), simdTmpVar);
1926
+
1927
+ GenTreeHWIntrinsic* loResult =
1928
+ m_compiler->gtNewSimdHWIntrinsicNode (TYP_INT, simdTmpVar, intrinsicId, CORINFO_TYPE_INT, simdSize);
1929
+ Range ().InsertAfter (simdTmpVar, loResult);
1930
+
1931
+ simdTmpVar = m_compiler->gtNewLclLNode (simdTmpVarNum, simdTmpVar->TypeGet ());
1932
+ Range ().InsertAfter (loResult, simdTmpVar);
1933
+
1934
+ GenTreeHWIntrinsic* hiResult;
1935
+ if (m_compiler->compOpportunisticallyDependsOn (InstructionSet_SSE41))
1936
+ {
1937
+ NamedIntrinsic getElement = NI_Illegal;
1938
+ switch (simdSize)
1939
+ {
1940
+ case 16 :
1941
+ getElement = NI_Vector128_GetElement;
1942
+ break ;
1943
+ case 32 :
1944
+ getElement = NI_Vector256_GetElement;
1945
+ break ;
1946
+ case 64 :
1947
+ getElement = NI_Vector512_GetElement;
1948
+ break ;
1949
+ default :
1950
+ unreached ();
1951
+ }
1952
+
1953
+ GenTree* one = m_compiler->gtNewIconNode (1 );
1954
+ hiResult =
1955
+ m_compiler->gtNewSimdHWIntrinsicNode (TYP_INT, simdTmpVar, one, getElement, CORINFO_TYPE_INT, simdSize);
1956
+
1957
+ Range ().InsertAfter (simdTmpVar, one, hiResult);
1958
+ }
1959
+ else
1960
+ {
1961
+ assert (m_compiler->compIsaSupportedDebugOnly (InstructionSet_SSE2));
1962
+
1963
+ GenTree* thirtyTwo = m_compiler->gtNewIconNode (32 );
1964
+ GenTree* shift = m_compiler->gtNewSimdBinOpNode (GT_RSZ, op1->TypeGet (), simdTmpVar, thirtyTwo,
1965
+ node->GetSimdBaseJitType (), simdSize);
1966
+ hiResult = m_compiler->gtNewSimdHWIntrinsicNode (TYP_INT, shift, intrinsicId, CORINFO_TYPE_INT, simdSize);
1967
+
1968
+ Range ().InsertAfter (simdTmpVar, thirtyTwo, shift, hiResult);
1969
+ }
1970
+
1971
+ Range ().Remove (node);
1972
+
1973
+ return FinalizeDecomposition (use, loResult, hiResult, hiResult);
1974
+ }
1975
+
1838
1976
// ------------------------------------------------------------------------
1839
1977
// DecomposeHWIntrinsicMoveMask: Decompose GT_HWINTRINSIC -- NI_EVEX_MoveMask
1840
1978
//
@@ -2262,6 +2400,13 @@ void DecomposeLongs::TryPromoteLongVar(unsigned lclNum)
2262
2400
{
2263
2401
return ;
2264
2402
}
2403
+ #if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86)
2404
+ if (varDsc->lvIsParam )
2405
+ {
2406
+ // Promotion blocks combined read optimizations for SIMD loads of long params
2407
+ return ;
2408
+ }
2409
+ #endif // FEATURE_HW_INTRINSICS && TARGET_X86
2265
2410
2266
2411
varDsc->lvFieldCnt = 2 ;
2267
2412
varDsc->lvFieldLclStart = m_compiler->lvaCount ;
0 commit comments