@@ -3419,45 +3419,77 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node)
3419
3419
{
3420
3420
assert (simdBaseType == TYP_FLOAT);
3421
3421
3422
+ // We need to mask off the most significant element to avoid the shuffle + add
3423
+ // from including it in the computed result. We need to do this for both op1 and
3424
+ // op2 in case one of them is `NaN` (because Zero * NaN == NaN)
3425
+
3426
+ simd16_t simd16Val = {};
3427
+
3428
+ simd16Val.i32 [0 ] = -1 ;
3429
+ simd16Val.i32 [1 ] = -1 ;
3430
+ simd16Val.i32 [2 ] = -1 ;
3431
+ simd16Val.i32 [3 ] = +0 ;
3432
+
3433
+ simdType = TYP_SIMD16;
3434
+ simdSize = 16 ;
3435
+
3422
3436
// We will be constructing the following parts:
3423
3437
// ...
3424
3438
// +--* CNS_INT int -1
3425
3439
// +--* CNS_INT int -1
3426
3440
// +--* CNS_INT int -1
3427
3441
// +--* CNS_INT int 0
3428
3442
// tmp1 = * HWINTRINSIC simd16 T Create
3429
- // /--* op2 simd16
3443
+ // /--* op1 simd16
3430
3444
// +--* tmp1 simd16
3431
3445
// op1 = * HWINTRINSIC simd16 T And
3432
3446
// ...
3433
3447
3434
3448
// This is roughly the following managed code:
3435
3449
// ...
3436
3450
// tmp1 = Vector128.Create(-1, -1, -1, 0);
3437
- // op1 = Sse.And(op1, tmp2 );
3451
+ // op1 = Sse.And(op1, tmp1 );
3438
3452
// ...
3439
3453
3440
- GenTree* cns0 = comp->gtNewIconNode (- 1 , TYP_INT );
3441
- BlockRange (). InsertAfter (op1, cns0) ;
3454
+ GenTreeVecCon* vecCon1 = comp->gtNewVconNode (simdType, simdBaseJitType );
3455
+ vecCon1-> gtSimd16Val = simd16Val ;
3442
3456
3443
- GenTree* cns1 = comp->gtNewIconNode (-1 , TYP_INT);
3444
- BlockRange ().InsertAfter (cns0, cns1);
3457
+ BlockRange ().InsertAfter (op1, vecCon1);
3445
3458
3446
- GenTree* cns2 = comp->gtNewIconNode (- 1 , TYP_INT );
3447
- BlockRange ().InsertAfter (cns1, cns2 );
3459
+ op1 = comp->gtNewSimdHWIntrinsicNode (simdType, op1, vecCon1, NI_SSE_And, simdBaseJitType, simdSize );
3460
+ BlockRange ().InsertAfter (vecCon1, op1 );
3448
3461
3449
- GenTree* cns3 = comp-> gtNewIconNode ( 0 , TYP_INT );
3450
- BlockRange (). InsertAfter (cns2, cns3 );
3462
+ LowerNode (vecCon1 );
3463
+ LowerNode (op1 );
3451
3464
3452
- tmp1 = comp->gtNewSimdHWIntrinsicNode (simdType, cns0, cns1, cns2, cns3, NI_Vector128_Create,
3453
- CORINFO_TYPE_INT, 16 );
3454
- BlockRange ().InsertAfter (cns3, tmp1);
3465
+ // We will be constructing the following parts:
3466
+ // ...
3467
+ // +--* CNS_INT int -1
3468
+ // +--* CNS_INT int -1
3469
+ // +--* CNS_INT int -1
3470
+ // +--* CNS_INT int 0
3471
+ // tmp2 = * HWINTRINSIC simd16 T Create
3472
+ // /--* op2 simd16
3473
+ // +--* tmp2 simd16
3474
+ // op2 = * HWINTRINSIC simd16 T And
3475
+ // ...
3455
3476
3456
- op1 = comp->gtNewSimdHWIntrinsicNode (simdType, op1, tmp1, NI_SSE_And, simdBaseJitType, simdSize);
3457
- BlockRange ().InsertAfter (tmp1, op1);
3477
+ // This is roughly the following managed code:
3478
+ // ...
3479
+ // tmp2 = Vector128.Create(-1, -1, -1, 0);
3480
+ // op2 = Sse.And(op2, tmp2);
3481
+ // ...
3458
3482
3459
- LowerNode (tmp1);
3460
- LowerNode (op1);
3483
+ GenTreeVecCon* vecCon2 = comp->gtNewVconNode (simdType, simdBaseJitType);
3484
+ vecCon2->gtSimd16Val = simd16Val;
3485
+
3486
+ BlockRange ().InsertAfter (op2, vecCon2);
3487
+
3488
+ op2 = comp->gtNewSimdHWIntrinsicNode (simdType, op2, vecCon2, NI_SSE_And, simdBaseJitType, simdSize);
3489
+ BlockRange ().InsertAfter (vecCon2, op2);
3490
+
3491
+ LowerNode (vecCon2);
3492
+ LowerNode (op2);
3461
3493
}
3462
3494
}
3463
3495
0 commit comments