@@ -21423,45 +21423,84 @@ GenTree* Compiler::gtNewSimdBinOpNode(
             }
             else if (varTypeIsLong(simdBaseType))
             {
-                assert((simdSize == 16) || (simdSize == 32) || (simdSize == 64));
+                if ((simdSize == 32) || compOpportunisticallyDependsOn(InstructionSet_SSE41))
+                {
+                    assert((simdSize == 16) || compIsaSupportedDebugOnly(InstructionSet_AVX2));
 
-                assert(((simdSize == 16) && compOpportunisticallyDependsOn(InstructionSet_SSE41)) ||
-                       ((simdSize == 32) && compOpportunisticallyDependsOn(InstructionSet_AVX2)));
+                    // Make op1 and op2 multi-use:
+                    GenTree* op1Dup = fgMakeMultiUse(&op1);
+                    GenTree* op2Dup = fgMakeMultiUse(&op2);
 
-                // Make op1 and op2 multi-use:
-                GenTree* op1Dup = fgMakeMultiUse(&op1);
-                GenTree* op2Dup = fgMakeMultiUse(&op2);
+                    const bool is256 = simdSize == 32;
+
+                    // Vector256<ulong> tmp0 = Avx2.Multiply(left, right);
+                    GenTreeHWIntrinsic* tmp0 =
+                        gtNewSimdHWIntrinsicNode(type, op1, op2, is256 ? NI_AVX2_Multiply : NI_SSE2_Multiply,
+                                                 CORINFO_TYPE_ULONG, simdSize);
+
+                    // Vector256<uint> tmp1 = Avx2.Shuffle(right.AsUInt32(), ZWXY);
+                    GenTree* shuffleMask = gtNewIconNode(SHUFFLE_ZWXY, TYP_INT);
+                    GenTreeHWIntrinsic* tmp1 =
+                        gtNewSimdHWIntrinsicNode(type, op2Dup, shuffleMask, is256 ? NI_AVX2_Shuffle : NI_SSE2_Shuffle,
+                                                 CORINFO_TYPE_UINT, simdSize);
+
+                    // Vector256<uint> tmp2 = Avx2.MultiplyLow(left.AsUInt32(), tmp1);
+                    GenTree* tmp2 = gtNewSimdBinOpNode(GT_MUL, type, op1Dup, tmp1, CORINFO_TYPE_UINT, simdSize);
 
-                const bool is256 = simdSize == 32;
+                    // Vector256<int> tmp3 = Avx2.HorizontalAdd(tmp2.AsInt32(), Vector256<int>.Zero);
+                    GenTreeHWIntrinsic* tmp3 =
+                        gtNewSimdHWIntrinsicNode(type, tmp2, gtNewZeroConNode(type),
+                                                 is256 ? NI_AVX2_HorizontalAdd : NI_SSSE3_HorizontalAdd,
+                                                 CORINFO_TYPE_UINT, simdSize);
 
-                // Vector256<ulong> tmp0 = Avx2.Multiply(left, right);
-                GenTreeHWIntrinsic* tmp0 =
-                    gtNewSimdHWIntrinsicNode(type, op1, op2, is256 ? NI_AVX2_Multiply : NI_SSE2_Multiply,
-                                             CORINFO_TYPE_ULONG, simdSize);
+                    // Vector256<int> tmp4 = Avx2.Shuffle(tmp3, YWXW);
+                    shuffleMask = gtNewIconNode(SHUFFLE_YWXW, TYP_INT);
+                    GenTreeHWIntrinsic* tmp4 =
+                        gtNewSimdHWIntrinsicNode(type, tmp3, shuffleMask, is256 ? NI_AVX2_Shuffle : NI_SSE2_Shuffle,
+                                                 CORINFO_TYPE_UINT, simdSize);
 
-                // Vector256<uint> tmp1 = Avx2.Shuffle(right.AsUInt32(), ZWXY);
-                GenTree* shuffleMask = gtNewIconNode(SHUFFLE_ZWXY, TYP_INT);
-                GenTreeHWIntrinsic* tmp1 =
-                    gtNewSimdHWIntrinsicNode(type, op2Dup, shuffleMask, is256 ? NI_AVX2_Shuffle : NI_SSE2_Shuffle,
-                                             CORINFO_TYPE_UINT, simdSize);
+                    // result = tmp0 + tmp4;
+                    return gtNewSimdBinOpNode(GT_ADD, type, tmp0, tmp4, simdBaseJitType, simdSize);
+                }
+                else
+                {
+                    // SSE2 implementation is simple decomposition using pmuludq,
+                    // which multiplies two uint32s and returns a uint64 result.
+                    // aLo * bLo + ((aLo * bHi + aHi * bLo) << 32)
+                    GenTree* op1Dup1 = fgMakeMultiUse(&op1);
+                    GenTree* op1Dup2 = gtCloneExpr(op1Dup1);
+                    GenTree* op2Dup1 = fgMakeMultiUse(&op2);
+                    GenTree* op2Dup2 = gtCloneExpr(op2Dup1);
+
+                    // Vector128<ulong> low = Sse2.Multiply(left.AsUInt32(), right.AsUInt32());
+                    GenTreeHWIntrinsic* low =
+                        gtNewSimdHWIntrinsicNode(type, op1, op2, NI_SSE2_Multiply, CORINFO_TYPE_ULONG, simdSize);
 
-                // Vector256<uint> tmp2 = Avx2.MultiplyLow(left.AsUInt32(), tmp1);
-                GenTree* tmp2 = gtNewSimdBinOpNode(GT_MUL, type, op1Dup, tmp1, CORINFO_TYPE_UINT, simdSize);
+                    // Vector128<uint> rightHi = (right >>> 32).AsUInt32();
+                    GenTree* rightHi =
+                        gtNewSimdBinOpNode(GT_RSZ, type, op2Dup1, gtNewIconNode(32), simdBaseJitType, simdSize);
 
-                // Vector256<int> tmp3 = Avx2.HorizontalAdd(tmp2.AsInt32(), Vector256<int>.Zero);
-                GenTreeHWIntrinsic* tmp3 =
-                    gtNewSimdHWIntrinsicNode(type, tmp2, gtNewZeroConNode(type),
-                                             is256 ? NI_AVX2_HorizontalAdd : NI_SSSE3_HorizontalAdd, CORINFO_TYPE_UINT,
-                                             simdSize);
+                    // Vector128<ulong> tmp0 = Sse2.Multiply(rightHi, left.AsUInt32());
+                    GenTreeHWIntrinsic* tmp0 = gtNewSimdHWIntrinsicNode(type, rightHi, op1Dup1, NI_SSE2_Multiply,
+                                                                        CORINFO_TYPE_ULONG, simdSize);
 
-                // Vector256<int> tmp4 = Avx2.Shuffle(tmp3, YWXW);
-                shuffleMask = gtNewIconNode(SHUFFLE_YWXW, TYP_INT);
-                GenTreeHWIntrinsic* tmp4 =
-                    gtNewSimdHWIntrinsicNode(type, tmp3, shuffleMask, is256 ? NI_AVX2_Shuffle : NI_SSE2_Shuffle,
-                                             CORINFO_TYPE_UINT, simdSize);
+                    // Vector128<uint> leftHi = (left >>> 32).AsUInt32();
+                    GenTree* leftHi =
+                        gtNewSimdBinOpNode(GT_RSZ, type, op1Dup2, gtNewIconNode(32), simdBaseJitType, simdSize);
 
-                // result = tmp0 + tmp4;
-                return gtNewSimdBinOpNode(GT_ADD, type, tmp0, tmp4, simdBaseJitType, simdSize);
+                    // Vector128<ulong> tmp1 = Sse2.Multiply(leftHi, right.AsUInt32());
+                    GenTreeHWIntrinsic* tmp1 =
+                        gtNewSimdHWIntrinsicNode(type, leftHi, op2Dup2, NI_SSE2_Multiply, CORINFO_TYPE_ULONG, simdSize);
+
+                    // Vector128<ulong> tmp2 = tmp0 + tmp1;
+                    GenTree* tmp2 = gtNewSimdBinOpNode(GT_ADD, type, tmp0, tmp1, simdBaseJitType, simdSize);
+
+                    // Vector128<ulong> mid = tmp2 << 32;
+                    GenTree* mid = gtNewSimdBinOpNode(GT_LSH, type, tmp2, gtNewIconNode(32), simdBaseJitType, simdSize);
+
+                    // return low + mid;
+                    return gtNewSimdBinOpNode(GT_ADD, type, low, mid, simdBaseJitType, simdSize);
+                }
             }
 #elif defined(TARGET_ARM64)
             if (varTypeIsLong(simdBaseType))
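Both lowerings rely on the same identity: the `aHi * bHi` term of a 64x64 multiply contributes only above bit 64, so the truncating product reduces to `aLo * bLo + ((aLo * bHi + aHi * bLo) << 32)`. The SSE4.1/AVX2 path may compute the cross terms with `pmulld`, which keeps only the low 32 bits of each product, because only those bits survive the `<< 32`. Below is a minimal scalar model of one 64-bit lane of each path as a sanity check (my own sketch for illustration; the helper names are invented, and the JIT of course builds the SIMD node trees above instead):

```cpp
#include <cassert>
#include <cstdint>

// Hypothetical helpers, not JIT code: each models one 64-bit lane
// of the corresponding vector lowering.

// Scalar model of the SSE2 path: three pmuludq-style 32x32->64 products,
// 64-bit adds, then a 64-bit left shift.
static uint64_t MulLongSse2Model(uint64_t a, uint64_t b)
{
    uint64_t low  = (uint64_t)(uint32_t)a * (uint32_t)b;         // Sse2.Multiply(left, right)
    uint64_t hiLo = (uint64_t)(uint32_t)(b >> 32) * (uint32_t)a; // Sse2.Multiply(rightHi, left)
    uint64_t loHi = (uint64_t)(uint32_t)(a >> 32) * (uint32_t)b; // Sse2.Multiply(leftHi, right)
    return low + ((hiLo + loHi) << 32);                          // low + mid, wrapping mod 2^64
}

// Scalar model of the SSE4.1/AVX2 path: pmulld keeps only the low 32 bits of
// each cross product and phaddd sums them mod 2^32 -- exactly the bits that
// survive the << 32, so the truncation is harmless.
static uint64_t MulLongPmulldModel(uint64_t a, uint64_t b)
{
    uint64_t low   = (uint64_t)(uint32_t)a * (uint32_t)b;            // pmuludq
    uint32_t cross = (uint32_t)a * (uint32_t)(b >> 32)               // pmulld after the ZWXY shuffle,
                   + (uint32_t)(a >> 32) * (uint32_t)b;              // pairs summed by phaddd
    return low + ((uint64_t)cross << 32);                            // YWXW shuffle into the hi dword, paddq
}

int main()
{
    const uint64_t a = 0xDEADBEEFCAFEBABEull;
    const uint64_t b = 0x0123456789ABCDEFull;
    assert(MulLongSse2Model(a, b) == a * b);
    assert(MulLongPmulldModel(a, b) == a * b);
    return 0;
}
```

The model also shows why the SSE2 fallback needs two dups per operand: each of `left` and `right` is consumed three times (once for `low`, once as a shifted half, once as the other factor), matching the `fgMakeMultiUse` + `gtCloneExpr` pairs in the diff.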