@@ -3419,45 +3419,77 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node)
34193419 {
34203420 assert (simdBaseType == TYP_FLOAT);
34213421
3422+ // We need to mask off the most significant element to avoid the shuffle + add
3423+ // from including it in the computed result. We need to do this for both op1 and
3424+ // op2 in case one of them is `NaN` (because Zero * NaN == NaN)
3425+
3426+ simd16_t simd16Val = {};
3427+
3428+ simd16Val.i32 [0 ] = -1 ;
3429+ simd16Val.i32 [1 ] = -1 ;
3430+ simd16Val.i32 [2 ] = -1 ;
3431+ simd16Val.i32 [3 ] = +0 ;
3432+
3433+ simdType = TYP_SIMD16;
3434+ simdSize = 16 ;
3435+
34223436 // We will be constructing the following parts:
34233437 // ...
34243438 // +--* CNS_INT int -1
34253439 // +--* CNS_INT int -1
34263440 // +--* CNS_INT int -1
34273441 // +--* CNS_INT int 0
34283442 // tmp1 = * HWINTRINSIC simd16 T Create
3429- // /--* op2 simd16
3443+ // /--* op1 simd16
34303444 // +--* tmp1 simd16
34313445 // op1 = * HWINTRINSIC simd16 T And
34323446 // ...
34333447
34343448 // This is roughly the following managed code:
34353449 // ...
34363450 // tmp1 = Vector128.Create(-1, -1, -1, 0);
3437- // op1 = Sse.And(op1, tmp2 );
3451+ // op1 = Sse.And(op1, tmp1 );
34383452 // ...
34393453
3440- GenTree* cns0 = comp->gtNewIconNode (- 1 , TYP_INT );
3441- BlockRange (). InsertAfter (op1, cns0) ;
3454+ GenTreeVecCon* vecCon1 = comp->gtNewVconNode (simdType, simdBaseJitType );
3455+ vecCon1-> gtSimd16Val = simd16Val ;
34423456
3443- GenTree* cns1 = comp->gtNewIconNode (-1 , TYP_INT);
3444- BlockRange ().InsertAfter (cns0, cns1);
3457+ BlockRange ().InsertAfter (op1, vecCon1);
34453458
3446- GenTree* cns2 = comp->gtNewIconNode (- 1 , TYP_INT );
3447- BlockRange ().InsertAfter (cns1, cns2 );
3459+ op1 = comp->gtNewSimdHWIntrinsicNode (simdType, op1, vecCon1, NI_SSE_And, simdBaseJitType, simdSize );
3460+ BlockRange ().InsertAfter (vecCon1, op1 );
34483461
3449- GenTree* cns3 = comp-> gtNewIconNode ( 0 , TYP_INT );
3450- BlockRange (). InsertAfter (cns2, cns3 );
3462+ LowerNode (vecCon1 );
3463+ LowerNode (op1 );
34513464
3452- tmp1 = comp->gtNewSimdHWIntrinsicNode (simdType, cns0, cns1, cns2, cns3, NI_Vector128_Create,
3453- CORINFO_TYPE_INT, 16 );
3454- BlockRange ().InsertAfter (cns3, tmp1);
3465+ // We will be constructing the following parts:
3466+ // ...
3467+ // +--* CNS_INT int -1
3468+ // +--* CNS_INT int -1
3469+ // +--* CNS_INT int -1
3470+ // +--* CNS_INT int 0
3471+ // tmp2 = * HWINTRINSIC simd16 T Create
3472+ // /--* op2 simd16
3473+ // +--* tmp2 simd16
3474+ // op2 = * HWINTRINSIC simd16 T And
3475+ // ...
34553476
3456- op1 = comp->gtNewSimdHWIntrinsicNode (simdType, op1, tmp1, NI_SSE_And, simdBaseJitType, simdSize);
3457- BlockRange ().InsertAfter (tmp1, op1);
3477+ // This is roughly the following managed code:
3478+ // ...
3479+ // tmp2 = Vector128.Create(-1, -1, -1, 0);
3480+ // op2 = Sse.And(op2, tmp2);
3481+ // ...
34583482
3459- LowerNode (tmp1);
3460- LowerNode (op1);
3483+ GenTreeVecCon* vecCon2 = comp->gtNewVconNode (simdType, simdBaseJitType);
3484+ vecCon2->gtSimd16Val = simd16Val;
3485+
3486+ BlockRange ().InsertAfter (op2, vecCon2);
3487+
3488+ op2 = comp->gtNewSimdHWIntrinsicNode (simdType, op2, vecCon2, NI_SSE_And, simdBaseJitType, simdSize);
3489+ BlockRange ().InsertAfter (vecCon2, op2);
3490+
3491+ LowerNode (vecCon2);
3492+ LowerNode (op2);
34613493 }
34623494 }
34633495
0 commit comments