@@ -2488,7 +2488,7 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm
2488
2488
CorInfoType maskBaseJitType = simdBaseJitType;
2489
2489
var_types maskBaseType = simdBaseType;
2490
2490
2491
- if (op1Msk->OperIsHWIntrinsic (NI_EVEX_ConvertMaskToVector ))
2491
+ if (op1Msk->OperIsConvertMaskToVector ( ))
2492
2492
{
2493
2493
GenTreeHWIntrinsic* cvtMaskToVector = op1Msk->AsHWIntrinsic ();
2494
2494
@@ -2499,122 +2499,131 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm
2499
2499
maskBaseType = cvtMaskToVector->GetSimdBaseType ();
2500
2500
}
2501
2501
2502
- if (!varTypeIsFloating (simdBaseType) && (simdSize != 64 ) && op2->IsVectorZero () &&
2503
- comp->compOpportunisticallyDependsOn (InstructionSet_SSE41) && !varTypeIsMask (op1Msk))
2502
+ if (!varTypeIsFloating (simdBaseType) && (simdSize != 64 ) && !varTypeIsMask (op1Msk))
2504
2503
{
2505
- // On SSE4.1 or higher we can optimize comparisons against zero to
2506
- // just use PTEST. We can't support it for floating-point, however,
2507
- // as it has both +0.0 and -0.0 where +0.0 == -0.0
2504
+ bool isOp2VectorZero = op2->IsVectorZero ();
2508
2505
2509
- bool skipReplaceOperands = false ;
2510
-
2511
- if (op1->OperIsHWIntrinsic ())
2506
+ if ((isOp2VectorZero || op2->IsVectorAllBitsSet ()) &&
2507
+ comp->compOpportunisticallyDependsOn (InstructionSet_SSE41))
2512
2508
{
2513
- GenTreeHWIntrinsic* op1Intrinsic = op1->AsHWIntrinsic ();
2514
- NamedIntrinsic op1IntrinsicId = op1Intrinsic->GetHWIntrinsicId ();
2509
+ // On SSE4.1 or higher we can optimize comparisons against Zero or AllBitsSet to
2510
+ // just use PTEST. We can't support it for floating-point, however, as it has
2511
+ // both +0.0 and -0.0 where +0.0 == -0.0
2515
2512
2516
- GenTree* nestedOp1 = nullptr ;
2517
- GenTree* nestedOp2 = nullptr ;
2518
- bool isEmbeddedBroadcast = false ;
2513
+ bool skipReplaceOperands = false ;
2519
2514
2520
- if (op1Intrinsic-> GetOperandCount () == 2 )
2515
+ if (!isOp2VectorZero )
2521
2516
{
2522
- nestedOp1 = op1Intrinsic->Op (1 );
2523
- nestedOp2 = op1Intrinsic->Op (2 );
2517
+ // We can optimize to TestC(op1, allbitsset)
2518
+ //
2519
+ // This works out because TestC sets CF if (~x & y) == 0, so:
2520
+ // ~00 = 11; 11 & 11 = 11; NC
2521
+ // ~01 = 10; 10 & 11 = 10; NC
2522
+ // ~10 = 01; 01 & 11 = 01; NC
2523
+ // ~11 = 00; 00 & 11 = 00; C
2524
2524
2525
- assert (!nestedOp1->isContained ());
2526
- isEmbeddedBroadcast = nestedOp2->isContained () && nestedOp2->OperIsHWIntrinsic ();
2527
- }
2525
+ assert (op2->IsVectorAllBitsSet ());
2526
+ cmpCnd = (cmpOp == GT_EQ) ? GenCondition::C : GenCondition::NC;
2528
2527
2529
- switch (op1IntrinsicId)
2528
+ skipReplaceOperands = true ;
2529
+ }
2530
+ else if (op1->OperIsHWIntrinsic ())
2530
2531
{
2531
- case NI_SSE_And:
2532
- case NI_SSE2_And:
2533
- case NI_AVX_And:
2534
- case NI_AVX2_And:
2532
+ assert (op2->IsVectorZero ());
2533
+
2534
+ GenTreeHWIntrinsic* op1Intrinsic = op1->AsHWIntrinsic ();
2535
+
2536
+ if (op1Intrinsic->GetOperandCount () == 2 )
2535
2537
{
2536
- // We can optimize to TestZ(op1.op1, op1.op2)
2538
+ GenTree* nestedOp1 = op1Intrinsic->Op (1 );
2539
+ GenTree* nestedOp2 = op1Intrinsic->Op (2 );
2540
+
2541
+ assert (!nestedOp1->isContained ());
2542
+ bool isEmbeddedBroadcast = nestedOp2->isContained () && nestedOp2->OperIsHWIntrinsic ();
2537
2543
2538
- if (isEmbeddedBroadcast)
2544
+ bool isScalar = false ;
2545
+ genTreeOps oper = op1Intrinsic->GetOperForHWIntrinsicId (&isScalar);
2546
+
2547
+ switch (oper)
2539
2548
{
2540
- // PTEST doesn't support embedded broadcast
2541
- break ;
2542
- }
2549
+ case GT_AND:
2550
+ {
2551
+ // We can optimize to TestZ(op1.op1, op1.op2)
2543
2552
2544
- node->Op (1 ) = nestedOp1;
2545
- node->Op (2 ) = nestedOp2;
2553
+ if (isEmbeddedBroadcast)
2554
+ {
2555
+ // PTEST doesn't support embedded broadcast
2556
+ break ;
2557
+ }
2546
2558
2547
- BlockRange (). Remove (op1) ;
2548
- BlockRange (). Remove (op2) ;
2559
+ node-> Op ( 1 ) = nestedOp1 ;
2560
+ node-> Op ( 2 ) = nestedOp2 ;
2549
2561
2550
- skipReplaceOperands = true ;
2551
- break ;
2552
- }
2562
+ BlockRange ().Remove (op1);
2563
+ BlockRange ().Remove (op2);
2553
2564
2554
- case NI_SSE_AndNot:
2555
- case NI_SSE2_AndNot:
2556
- case NI_AVX_AndNot:
2557
- case NI_AVX2_AndNot:
2558
- {
2559
- // We can optimize to TestC(op1.op1, op1.op2)
2565
+ skipReplaceOperands = true ;
2566
+ break ;
2567
+ }
2560
2568
2561
- if (isEmbeddedBroadcast)
2562
- {
2563
- // PTEST doesn't support embedded broadcast
2564
- break ;
2565
- }
2569
+ case GT_AND_NOT:
2570
+ {
2571
+ // We can optimize to TestC(op1.op1, op1.op2)
2572
+
2573
+ if (isEmbeddedBroadcast)
2574
+ {
2575
+ // PTEST doesn't support embedded broadcast
2576
+ break ;
2577
+ }
2566
2578
2567
- cmpCnd = (cmpOp == GT_EQ) ? GenCondition::C : GenCondition::NC;
2579
+ cmpCnd = (cmpOp == GT_EQ) ? GenCondition::C : GenCondition::NC;
2568
2580
2569
- node->Op (1 ) = nestedOp1;
2570
- node->Op (2 ) = nestedOp2;
2581
+ node->Op (1 ) = nestedOp1;
2582
+ node->Op (2 ) = nestedOp2;
2571
2583
2572
- BlockRange ().Remove (op1);
2573
- BlockRange ().Remove (op2);
2584
+ BlockRange ().Remove (op1);
2585
+ BlockRange ().Remove (op2);
2574
2586
2575
- skipReplaceOperands = true ;
2576
- break ;
2577
- }
2587
+ skipReplaceOperands = true ;
2588
+ break ;
2589
+ }
2578
2590
2579
- default :
2580
- {
2581
- break ;
2591
+ default :
2592
+ {
2593
+ break ;
2594
+ }
2595
+ }
2582
2596
}
2583
2597
}
2584
- }
2585
-
2586
- if (!skipReplaceOperands)
2587
- {
2588
- // Default handler, emit a TestZ(op1, op1)
2589
2598
2590
- node->Op (1 ) = op1;
2591
- BlockRange ().Remove (op2);
2599
+ if (!skipReplaceOperands)
2600
+ {
2601
+ // Default handler, emit a TestZ(op1, op1)
2602
+ assert (op2->IsVectorZero ());
2592
2603
2593
- LIR::Use op1Use (BlockRange (), &node->Op (1 ), node);
2594
- ReplaceWithLclVar (op1Use);
2595
- op1 = node->Op (1 );
2604
+ node->Op (1 ) = op1;
2605
+ BlockRange ().Remove (op2);
2596
2606
2597
- op2 = comp->gtClone (op1);
2598
- BlockRange ().InsertAfter (op1, op2);
2599
- node->Op (2 ) = op2;
2600
- }
2607
+ LIR::Use op1Use (BlockRange (), &node->Op (1 ), node);
2608
+ ReplaceWithLclVar (op1Use);
2609
+ op1 = node->Op (1 );
2601
2610
2602
- if (simdSize == 32 )
2603
- {
2604
- // TODO-Review: LowerHWIntrinsicCC resets the id again, so why is this needed?
2605
- node->ChangeHWIntrinsicId (NI_AVX_TestZ);
2606
- LowerHWIntrinsicCC (node, NI_AVX_PTEST, cmpCnd);
2607
- }
2608
- else
2609
- {
2610
- assert (simdSize == 16 );
2611
+ op2 = comp->gtClone (op1);
2612
+ BlockRange ().InsertAfter (op1, op2);
2613
+ node->Op (2 ) = op2;
2614
+ }
2611
2615
2612
- // TODO-Review: LowerHWIntrinsicCC resets the id again, so why is this needed?
2613
- node->ChangeHWIntrinsicId (NI_SSE41_TestZ);
2614
- LowerHWIntrinsicCC (node, NI_SSE41_PTEST, cmpCnd);
2616
+ if (simdSize == 32 )
2617
+ {
2618
+ LowerHWIntrinsicCC (node, NI_AVX_PTEST, cmpCnd);
2619
+ }
2620
+ else
2621
+ {
2622
+ assert (simdSize == 16 );
2623
+ LowerHWIntrinsicCC (node, NI_SSE41_PTEST, cmpCnd);
2624
+ }
2625
+ return LowerNode (node);
2615
2626
}
2616
-
2617
- return LowerNode (node);
2618
2627
}
2619
2628
2620
2629
// TODO-XARCH-AVX512: We should handle TYP_SIMD12 here under the EVEX path, but doing
@@ -3490,7 +3499,7 @@ GenTree* Lowering::LowerHWIntrinsicTernaryLogic(GenTreeHWIntrinsic* node)
3490
3499
}
3491
3500
}
3492
3501
3493
- if (condition->OperIsHWIntrinsic (NI_EVEX_ConvertMaskToVector ))
3502
+ if (condition->OperIsConvertMaskToVector ( ))
3494
3503
{
3495
3504
GenTree* tmp = condition->AsHWIntrinsic ()->Op (1 );
3496
3505
BlockRange ().Remove (condition);
0 commit comments