@@ -4462,29 +4462,30 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node)
44624462 assert (comp->compIsaSupportedDebugOnly (InstructionSet_AVX));
44634463
44644464 // We will be constructing the following parts:
4465- // idx = CNS_INT int 0xF1
4465+ // idx = CNS_INT int 0xFF
44664466 // /--* op1 simd16
44674467 // +--* op2 simd16
44684468 // +--* idx int
44694469 // tmp1 = * HWINTRINSIC simd32 T DotProduct
44704470 // /--* tmp1 simd32
44714471 // * STORE_LCL_VAR simd32
44724472 // tmp1 = LCL_VAR simd32
4473- // /--* tmp1 simd32
4474- // tmp1 = * HWINTRINSIC simd16 T GetLower
44754473 // tmp2 = LCL_VAR simd32
4476- // /--* tmp2 simd16
4477- // tmp2 = * HWINTRINSIC simd16 T GetUpper
4478- // /--* tmp1 simd16
4479- // +--* tmp2 simd16
4480- // node = * HWINTRINSIC simd16 T Add
4474+ // tmp3 = LCL_VAR simd32
4475+ // /--* tmp2 simd32
4476+ // +--* tmp3 simd32
4477+ // +--* CNS_INT int 0x01
4478+ // tmp2 = * HWINTRINSIC simd32 T Permute
4479+ // /--* tmp1 simd32
4480+ // +--* tmp2 simd32
4481+ // node = * HWINTRINSIC simd32 T Add
44814482
44824483 // This is roughly the following managed code:
44834484 // var tmp1 = Avx.DotProduct(op1, op2, 0xFF);
4484- // var tmp2 = tmp1.GetUpper( );
4485- // return Sse .Add(tmp1, tmp2);
4485+ // var tmp2 = Avx.Permute2x128(tmp1, tmp1, 0x01 );
4486+ // return Avx .Add(tmp1, tmp2);
44864487
4487- idx = comp->gtNewIconNode (0xF1 , TYP_INT);
4488+ idx = comp->gtNewIconNode (0xFF , TYP_INT);
44884489 BlockRange ().InsertBefore (node, idx);
44894490
44904491 tmp1 = comp->gtNewSimdHWIntrinsicNode (simdType, op1, op2, idx, NI_AVX_DotProduct, simdBaseJitType,
@@ -4500,27 +4501,30 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node)
45004501 tmp2 = comp->gtClone (tmp1);
45014502 BlockRange ().InsertAfter (tmp1, tmp2);
45024503
4503- tmp3 = comp->gtNewSimdGetUpperNode (TYP_SIMD16, tmp2, simdBaseJitType, simdSize );
4504+ tmp3 = comp->gtClone ( tmp2);
45044505 BlockRange ().InsertAfter (tmp2, tmp3);
4505- LowerNode (tmp3);
45064506
4507- tmp1 = comp->gtNewSimdGetLowerNode (TYP_SIMD16, tmp1, simdBaseJitType, simdSize);
4508- BlockRange ().InsertAfter (tmp3, tmp1);
4509- LowerNode (tmp1);
4507+ idx = comp->gtNewIconNode (0x01 , TYP_INT);
4508+ BlockRange ().InsertAfter (tmp3, idx);
45104509
4511- tmp2 = comp->gtNewSimdBinOpNode (GT_ADD, TYP_SIMD16, tmp3, tmp1, simdBaseJitType, 16 );
4512- BlockRange ().InsertAfter (tmp1, tmp2);
4510+ tmp2 = comp->gtNewSimdHWIntrinsicNode (simdType, tmp2, tmp3, idx, NI_AVX_Permute2x128, simdBaseJitType,
4511+ simdSize);
4512+ BlockRange ().InsertAfter (idx, tmp2);
4513+ LowerNode (tmp2);
4514+
4515+ tmp1 = comp->gtNewSimdBinOpNode (GT_ADD, simdType, tmp1, tmp2, simdBaseJitType, simdSize);
4516+ BlockRange ().InsertAfter (tmp2, tmp1);
45134517
45144518 // We're producing a vector result, so just return the result directly
45154519 LIR::Use use;
45164520
45174521 if (BlockRange ().TryGetUse (node, &use))
45184522 {
4519- use.ReplaceWith (tmp2 );
4523+ use.ReplaceWith (tmp1 );
45204524 }
45214525
45224526 BlockRange ().Remove (node);
4523- return LowerNode (tmp2 );
4527+ return LowerNode (tmp1 );
45244528 }
45254529
45264530 case TYP_DOUBLE:
@@ -4999,21 +5003,19 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node)
49995003 // /--* tmp1 simd32
50005004 // * STORE_LCL_VAR simd32
50015005 // tmp1 = LCL_VAR simd32
5002- // /--* tmp1 simd32
5003- // tmp1 = * HWINTRINSIC simd16 T GetLower
50045006 // tmp2 = LCL_VAR simd32
50055007 // /--* tmp2 simd32
5006- // tmp3 = * HWINTRINSIC simd16 T GetUpper
5007- // /--* tmp1 simd16
5008- // +--* tmp3 simd16
5009- // tmp1 = * HWINTRINSIC simd16 T Add
5008+ // +--* CNS_INT int 0x01
5009+ // tmp2 = * HWINTRINSIC simd32 float Permute
5010+ // /--* tmp1 simd32
5011+ // +--* tmp2 simd32
5012+ // tmp1 = * HWINTRINSIC simd32 T Add
50105013 // ...
50115014
50125015 // This is roughly the following managed code:
50135016 // ...
5014- // var tmp2 = tmp1;
5015- // tmp3 = tmp2.GetUpper();
5016- // var tmp1 = Isa.Add(tmp1.GetLower(), tmp2);
5017+ // var tmp2 = Isa.Permute2x128(tmp1, tmp1, 0x01);
5018+ // tmp1 = Isa.Add(tmp1, tmp2);
50175019 // ...
50185020
50195021 assert (simdBaseType != TYP_FLOAT);
@@ -5026,20 +5028,21 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node)
50265028 tmp2 = comp->gtClone (tmp1);
50275029 BlockRange ().InsertAfter (tmp1, tmp2);
50285030
5029- tmp3 = comp->gtNewSimdGetUpperNode (TYP_SIMD16, tmp2, simdBaseJitType, simdSize );
5031+ tmp3 = comp->gtClone ( tmp2);
50305032 BlockRange ().InsertAfter (tmp2, tmp3);
5031- LowerNode (tmp3);
50325033
5033- tmp1 = comp->gtNewSimdGetLowerNode (TYP_SIMD16, tmp1, simdBaseJitType, simdSize);
5034- BlockRange ().InsertAfter (tmp3, tmp1);
5035- LowerNode (tmp1);
5034+ idx = comp->gtNewIconNode (0x01 , TYP_INT);
5035+ BlockRange ().InsertAfter (tmp3, idx);
50365036
5037- tmp2 = comp->gtNewSimdBinOpNode (GT_ADD, TYP_SIMD16, tmp3, tmp1, simdBaseJitType, 16 );
5038- BlockRange ().InsertAfter (tmp1, tmp2);
5037+ NamedIntrinsic permute2x128 = (simdBaseType == TYP_DOUBLE) ? NI_AVX_Permute2x128 : NI_AVX2_Permute2x128;
5038+
5039+ tmp2 = comp->gtNewSimdHWIntrinsicNode (simdType, tmp2, tmp3, idx, permute2x128, simdBaseJitType, simdSize);
5040+ BlockRange ().InsertAfter (idx, tmp2);
50395041 LowerNode (tmp2);
50405042
5041- node->SetSimdSize (16 );
5042- tmp1 = tmp2;
5043+ tmp1 = comp->gtNewSimdBinOpNode (GT_ADD, simdType, tmp1, tmp2, simdBaseJitType, simdSize);
5044+ BlockRange ().InsertAfter (tmp2, tmp1);
5045+ LowerNode (tmp1);
50435046 }
50445047
50455048 // We're producing a vector result, so just return the result directly
0 commit comments