Skip to content

Commit 620bd3e

Browse files
Ensure Vector256.Dot produces a V256 result (#88712)
* Ensure Vector256.Dot produces a V256 result
* Apply formatting patch
1 parent e0acb9d commit 620bd3e

File tree

1 file changed

+41
-38
lines changed

1 file changed

+41
-38
lines changed

src/coreclr/jit/lowerxarch.cpp

Lines changed: 41 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -4462,29 +4462,30 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node)
44624462
assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX));
44634463

44644464
// We will be constructing the following parts:
4465-
// idx = CNS_INT int 0xF1
4465+
// idx = CNS_INT int 0xFF
44664466
// /--* op1 simd16
44674467
// +--* op2 simd16
44684468
// +--* idx int
44694469
// tmp1 = * HWINTRINSIC simd32 T DotProduct
44704470
// /--* tmp1 simd32
44714471
// * STORE_LCL_VAR simd32
44724472
// tmp1 = LCL_VAR simd32
4473-
// /--* tmp1 simd32
4474-
// tmp1 = * HWINTRINSIC simd16 T GetLower
44754473
// tmp2 = LCL_VAR simd32
4476-
// /--* tmp2 simd16
4477-
// tmp2 = * HWINTRINSIC simd16 T GetUpper
4478-
// /--* tmp1 simd16
4479-
// +--* tmp2 simd16
4480-
// node = * HWINTRINSIC simd16 T Add
4474+
// tmp3 = LCL_VAR simd32
4475+
// /--* tmp2 simd32
4476+
// +--* tmp3 simd32
4477+
// +--* CNS_INT int 0x01
4478+
// tmp2 = * HWINTRINSIC simd32 T Permute
4479+
// /--* tmp1 simd32
4480+
// +--* tmp2 simd32
4481+
// node = * HWINTRINSIC simd32 T Add
44814482

44824483
// This is roughly the following managed code:
44834484
// var tmp1 = Avx.DotProduct(op1, op2, 0xFF);
4484-
// var tmp2 = tmp1.GetUpper();
4485-
// return Sse.Add(tmp1, tmp2);
4485+
// var tmp2 = Avx.Permute2x128(tmp1, tmp1, 0x01);
4486+
// return Avx.Add(tmp1, tmp2);
44864487

4487-
idx = comp->gtNewIconNode(0xF1, TYP_INT);
4488+
idx = comp->gtNewIconNode(0xFF, TYP_INT);
44884489
BlockRange().InsertBefore(node, idx);
44894490

44904491
tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, op1, op2, idx, NI_AVX_DotProduct, simdBaseJitType,
@@ -4500,27 +4501,30 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node)
45004501
tmp2 = comp->gtClone(tmp1);
45014502
BlockRange().InsertAfter(tmp1, tmp2);
45024503

4503-
tmp3 = comp->gtNewSimdGetUpperNode(TYP_SIMD16, tmp2, simdBaseJitType, simdSize);
4504+
tmp3 = comp->gtClone(tmp2);
45044505
BlockRange().InsertAfter(tmp2, tmp3);
4505-
LowerNode(tmp3);
45064506

4507-
tmp1 = comp->gtNewSimdGetLowerNode(TYP_SIMD16, tmp1, simdBaseJitType, simdSize);
4508-
BlockRange().InsertAfter(tmp3, tmp1);
4509-
LowerNode(tmp1);
4507+
idx = comp->gtNewIconNode(0x01, TYP_INT);
4508+
BlockRange().InsertAfter(tmp3, idx);
45104509

4511-
tmp2 = comp->gtNewSimdBinOpNode(GT_ADD, TYP_SIMD16, tmp3, tmp1, simdBaseJitType, 16);
4512-
BlockRange().InsertAfter(tmp1, tmp2);
4510+
tmp2 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp2, tmp3, idx, NI_AVX_Permute2x128, simdBaseJitType,
4511+
simdSize);
4512+
BlockRange().InsertAfter(idx, tmp2);
4513+
LowerNode(tmp2);
4514+
4515+
tmp1 = comp->gtNewSimdBinOpNode(GT_ADD, simdType, tmp1, tmp2, simdBaseJitType, simdSize);
4516+
BlockRange().InsertAfter(tmp2, tmp1);
45134517

45144518
// We're producing a vector result, so just return the result directly
45154519
LIR::Use use;
45164520

45174521
if (BlockRange().TryGetUse(node, &use))
45184522
{
4519-
use.ReplaceWith(tmp2);
4523+
use.ReplaceWith(tmp1);
45204524
}
45214525

45224526
BlockRange().Remove(node);
4523-
return LowerNode(tmp2);
4527+
return LowerNode(tmp1);
45244528
}
45254529

45264530
case TYP_DOUBLE:
@@ -4999,21 +5003,19 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node)
49995003
// /--* tmp1 simd32
50005004
// * STORE_LCL_VAR simd32
50015005
// tmp1 = LCL_VAR simd32
5002-
// /--* tmp1 simd32
5003-
// tmp1 = * HWINTRINSIC simd16 T GetLower
50045006
// tmp2 = LCL_VAR simd32
50055007
// /--* tmp2 simd32
5006-
// tmp3 = * HWINTRINSIC simd16 T GetUpper
5007-
// /--* tmp1 simd16
5008-
// +--* tmp3 simd16
5009-
// tmp1 = * HWINTRINSIC simd16 T Add
5008+
// +--* CNS_INT int 0x01
5009+
// tmp2 = * HWINTRINSIC simd32 T Permute
5010+
// /--* tmp1 simd32
5011+
// +--* tmp2 simd32
5012+
// tmp1 = * HWINTRINSIC simd32 T Add
50105013
// ...
50115014

50125015
// This is roughly the following managed code:
50135016
// ...
5014-
// var tmp2 = tmp1;
5015-
// tmp3 = tmp2.GetUpper();
5016-
// var tmp1 = Isa.Add(tmp1.GetLower(), tmp2);
5017+
// var tmp2 = Isa.Permute2x128(tmp1, tmp1, 0x01);
5018+
// tmp1 = Isa.Add(tmp1, tmp2);
50175019
// ...
50185020

50195021
assert(simdBaseType != TYP_FLOAT);
@@ -5026,20 +5028,21 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node)
50265028
tmp2 = comp->gtClone(tmp1);
50275029
BlockRange().InsertAfter(tmp1, tmp2);
50285030

5029-
tmp3 = comp->gtNewSimdGetUpperNode(TYP_SIMD16, tmp2, simdBaseJitType, simdSize);
5031+
tmp3 = comp->gtClone(tmp2);
50305032
BlockRange().InsertAfter(tmp2, tmp3);
5031-
LowerNode(tmp3);
50325033

5033-
tmp1 = comp->gtNewSimdGetLowerNode(TYP_SIMD16, tmp1, simdBaseJitType, simdSize);
5034-
BlockRange().InsertAfter(tmp3, tmp1);
5035-
LowerNode(tmp1);
5034+
idx = comp->gtNewIconNode(0x01, TYP_INT);
5035+
BlockRange().InsertAfter(tmp3, idx);
50365036

5037-
tmp2 = comp->gtNewSimdBinOpNode(GT_ADD, TYP_SIMD16, tmp3, tmp1, simdBaseJitType, 16);
5038-
BlockRange().InsertAfter(tmp1, tmp2);
5037+
NamedIntrinsic permute2x128 = (simdBaseType == TYP_DOUBLE) ? NI_AVX_Permute2x128 : NI_AVX2_Permute2x128;
5038+
5039+
tmp2 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp2, tmp3, idx, permute2x128, simdBaseJitType, simdSize);
5040+
BlockRange().InsertAfter(idx, tmp2);
50395041
LowerNode(tmp2);
50405042

5041-
node->SetSimdSize(16);
5042-
tmp1 = tmp2;
5043+
tmp1 = comp->gtNewSimdBinOpNode(GT_ADD, simdType, tmp1, tmp2, simdBaseJitType, simdSize);
5044+
BlockRange().InsertAfter(tmp2, tmp1);
5045+
LowerNode(tmp1);
50435046
}
50445047

50455048
// We're producing a vector result, so just return the result directly

0 commit comments

Comments
 (0)