Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/coreclr/jit/gentree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28840,8 +28840,10 @@ ClassLayout* GenTreeHWIntrinsic::GetLayout(Compiler* compiler) const
switch (GetHWIntrinsicId())
{
#ifdef TARGET_XARCH
case NI_X86Base_BigMul:
case NI_X86Base_DivRem:
return compiler->typGetBlkLayout(genTypeSize(GetSimdBaseType()) * 2);
case NI_X86Base_X64_BigMul:
case NI_X86Base_X64_DivRem:
return compiler->typGetBlkLayout(16);
#endif // TARGET_XARCH
Expand Down
2 changes: 2 additions & 0 deletions src/coreclr/jit/hwintrinsic.h
Original file line number Diff line number Diff line change
Expand Up @@ -884,7 +884,9 @@ struct HWIntrinsicInfo
#endif

#ifdef TARGET_XARCH
case NI_X86Base_BigMul:
case NI_X86Base_DivRem:
case NI_X86Base_X64_BigMul:
case NI_X86Base_X64_DivRem:
return 2;
#endif // TARGET_XARCH
Expand Down
35 changes: 35 additions & 0 deletions src/coreclr/jit/hwintrinsiccodegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2487,6 +2487,41 @@ void CodeGen::genX86BaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions)

switch (intrinsicId)
{
case NI_X86Base_BigMul:
case NI_X86Base_X64_BigMul:
{
assert(node->GetOperandCount() == 2);
assert(instOptions == INS_OPTS_NONE);
assert(!node->Op(1)->isContained());

// SIMD base type is from signature and can distinguish signed and unsigned
var_types targetType = node->GetSimdBaseType();
GenTree* regOp = node->Op(1);
GenTree* rmOp = node->Op(2);
instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, targetType, compiler);

emitAttr attr = emitTypeSize(targetType);
emitter* emit = GetEmitter();

// If rmOp is already in EAX, use that as implicit operand
if (rmOp->isUsedFromReg() && rmOp->GetRegNum() == REG_EAX)
{
std::swap(rmOp, regOp);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you please explain this swap to me?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In case op2 (rmop) is already present in RAX we use op2 as implicit operand.
Otherwise we would overwrite that value.

I should probably add a comment similar to the one I added for codeGenMulHi

// If op2 is already present in RAX use that as implicit operand
if (rmOp->isUsedFromReg() && (rmOp->GetRegNum() == REG_RAX))
{
std::swap(regOp, rmOp);

Do you want me to add the comment to this PR or #115966 ?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm fine with either option, I think the PR is not too big to split it?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the PR is not too big to split it?

I've updated both PRs.
The 2 reasons to open this as a separate PR was to

  1. Show that it works for the non-AVX2 path and passes all tests
  2. For the mulx path I am a bit unsure about the best way to handle the allocation of the RDX register the MULX code produced worse code than MUL for at least one method (you can expand comment to se asm code) . Either approach is better than what's in main but I wanted to avoid adding a "mulx" optimization that gives worse code on average

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So this PR is the same as #115966 but without the BMI2 path? Does it have any potential performance impact on BMI2 CPUs or diffs? could you please resolve the conflicts so we can run the diffs (or close one of the PRs)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So this PR is the same as #115966 but without the BMI2 path?

Yes that is correct

Does it have any potential performance impact on BMI2 CPUs or diffs?

It might be expand "generated code" for edx ret comment for example diff and some thoughts

}

// op1: EAX, op2: reg/mem
emit->emitIns_Mov(INS_mov, attr, REG_EAX, regOp->GetRegNum(), /* canSkip */ true);

// emit the MUL/IMUL instruction
emit->emitInsBinary(ins, attr, node, rmOp);

// verify target registers are as expected
assert(node->GetRegByIndex(0) == REG_EAX);
assert(node->GetRegByIndex(1) == REG_EDX);

break;
}

case NI_X86Base_BitScanForward:
case NI_X86Base_BitScanReverse:
case NI_X86Base_X64_BitScanForward:
Expand Down
4 changes: 3 additions & 1 deletion src/coreclr/jit/hwintrinsiclistxarch.h
Original file line number Diff line number Diff line change
Expand Up @@ -457,6 +457,7 @@ HARDWARE_INTRINSIC(X86Base, AlignRight,
HARDWARE_INTRINSIC(X86Base, And, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pandd, INS_pandd, INS_pandd, INS_pandd, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_NormalizeSmallTypeToInt)
HARDWARE_INTRINSIC(X86Base, AndNot, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pandnd, INS_pandnd, INS_pandnd, INS_pandnd, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_NormalizeSmallTypeToInt)
HARDWARE_INTRINSIC(X86Base, Average, 16, 2, {INS_invalid, INS_pavgb, INS_invalid, INS_pavgw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(X86Base, BigMul, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_imulEAX, INS_mulEAX, INS_imulEAX, INS_mulEAX, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_BaseTypeFromSecondArg|HW_Flag_MultiReg|HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_RmwIntrinsic|HW_Flag_Commutative)
HARDWARE_INTRINSIC(X86Base, BitScanForward, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bsf, INS_bsf, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base, BitScanReverse, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bsr, INS_bsr, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base, Blend, 16, 3, {INS_invalid, INS_invalid, INS_pblendw, INS_pblendw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blendps, INS_blendpd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics)
Expand Down Expand Up @@ -614,7 +615,8 @@ HARDWARE_INTRINSIC(X86Base, Xor,
// {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE}
// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
// 64-bit only Intrinsics for X86Base, SSE, SSE2, SSE3, SSSE3, SSE41, SSE42, POPCNT
#define FIRST_NI_X86Base_X64 NI_X86Base_X64_BitScanForward
#define FIRST_NI_X86Base_X64 NI_X86Base_X64_BigMul
HARDWARE_INTRINSIC(X86Base_X64, BigMul, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_imulEAX, INS_mulEAX, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_BaseTypeFromSecondArg|HW_Flag_MultiReg|HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_RmwIntrinsic|HW_Flag_Commutative)
HARDWARE_INTRINSIC(X86Base_X64, BitScanForward, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bsf, INS_bsf, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base_X64, BitScanReverse, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bsr, INS_bsr, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base_X64, ConvertScalarToVector128Double, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsi2sd64, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromSecondArg)
Expand Down
21 changes: 21 additions & 0 deletions src/coreclr/jit/hwintrinsicxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4216,6 +4216,27 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
break;
}

case NI_X86Base_BigMul:
case NI_X86Base_X64_BigMul:
{
assert(sig->numArgs == 2);
assert(HWIntrinsicInfo::IsMultiReg(intrinsic));
assert(retType == TYP_STRUCT);
assert(simdBaseJitType != CORINFO_TYPE_UNDEF);

op2 = impPopStack().val;
op1 = impPopStack().val;

GenTreeHWIntrinsic* multiplyIntrinsic = gtNewScalarHWIntrinsicNode(retType, op1, op2, intrinsic);

// Store the type from signature into SIMD base type for convenience
multiplyIntrinsic->SetSimdBaseJitType(simdBaseJitType);

retNode = impStoreMultiRegValueToVar(multiplyIntrinsic,
sig->retTypeSigClass DEBUGARG(CorInfoCallConvExtension::Managed));
break;
}

case NI_X86Base_CompareScalarGreaterThan:
case NI_X86Base_CompareScalarGreaterThanOrEqual:
case NI_X86Base_CompareScalarNotGreaterThan:
Expand Down
2 changes: 2 additions & 0 deletions src/coreclr/jit/lowerxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10215,6 +10215,8 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
break;
}

case NI_X86Base_BigMul:
case NI_X86Base_X64_BigMul:
case NI_AVX2_MultiplyNoFlags:
case NI_AVX2_X64_MultiplyNoFlags:
{
Expand Down
30 changes: 28 additions & 2 deletions src/coreclr/jit/lsraxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2475,6 +2475,30 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
break;
}

case NI_X86Base_BigMul:
case NI_X86Base_X64_BigMul:
{
assert(numArgs == 2);
assert(dstCount == 2);
assert(isRMW);
assert(!op1->isContained());

SingleTypeRegSet apxAwareRegCandidates =
ForceLowGprForApxIfNeeded(op1, RBM_NONE, canHWIntrinsicUseApxRegs);

// mulEAX always use EAX, if one operand is contained, specify other op with fixed EAX register
// otherwise dont force any register, we might get the second parameter in EAX
srcCount = BuildOperandUses(op1, op2->isContained() ? SRBM_EAX : apxAwareRegCandidates);
srcCount += BuildOperandUses(op2, apxAwareRegCandidates);

// result put in EAX and EDX
BuildDef(intrinsicTree, SRBM_EAX, 0);
BuildDef(intrinsicTree, SRBM_EDX, 1);

buildUses = false;
break;
}

case NI_AVX2_MultiplyNoFlags:
case NI_AVX2_X64_MultiplyNoFlags:
{
Expand Down Expand Up @@ -3007,9 +3031,11 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
}
else
{
// Currently dstCount = 2 is only used for DivRem, which has special constraints and is handled above
// Currently dstCount = 2 is only used for DivRem and BigMul, which has special constraints and is handled
// above
assert((dstCount == 0) ||
((dstCount == 2) && ((intrinsicId == NI_X86Base_DivRem) || (intrinsicId == NI_X86Base_X64_DivRem))));
((dstCount == 2) && ((intrinsicId == NI_X86Base_DivRem) || (intrinsicId == NI_X86Base_X64_DivRem) ||
(intrinsicId == NI_X86Base_BigMul) || (intrinsicId == NI_X86Base_X64_BigMul))));
}

*pDstCount = dstCount;
Expand Down
24 changes: 19 additions & 5 deletions src/libraries/System.Private.CoreLib/src/System/Math.cs
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ public static long BigMul(int a, int b)
return ((long)a) * b;
}


#if !(TARGET_ARM64 || (TARGET_AMD64 && !MONO)) // BigMul 64*64 has high performance intrinsics on ARM64 and AMD64 (but not yet on MONO)
/// <summary>
/// Perform multiplication between 64 and 32 bit numbers, returning lower 64 bits in <paramref name="low"/>
/// </summary>
Expand All @@ -180,21 +180,18 @@ public static long BigMul(int a, int b)
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static ulong BigMul(ulong a, uint b, out ulong low)
{
#if TARGET_64BIT
return Math.BigMul((ulong)a, (ulong)b, out low);
#else
ulong prodL = ((ulong)(uint)a) * b;
ulong prodH = (prodL >> 32) + (((ulong)(uint)(a >> 32)) * b);

low = ((prodH << 32) | (uint)prodL);
return (prodH >> 32);
#endif
}

/// <inheritdoc cref="BigMul(ulong, uint, out ulong)"/>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static ulong BigMul(uint a, ulong b, out ulong low)
=> BigMul(b, a, out low);
#endif

/// <summary>Produces the full product of two unsigned 64-bit numbers.</summary>
/// <param name="a">The first number to multiply.</param>
Expand All @@ -205,13 +202,23 @@ internal static ulong BigMul(uint a, ulong b, out ulong low)
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static unsafe ulong BigMul(ulong a, ulong b, out ulong low)
{
#if !MONO // X64.BigMul is not yet implemented in MONO
// X86Base.X64.BigMul is more performant than X86Base.X64.MultiplyNoFlags that has performance issues (#11782)
// so we don't need a separate BMI2 path
if (X86Base.X64.IsSupported)
{
(low, ulong hi) = X86Base.X64.BigMul(a, b);
return hi;
}
#else
if (Bmi2.X64.IsSupported)
{
ulong tmp;
ulong high = Bmi2.X64.MultiplyNoFlags(a, b, &tmp);
low = tmp;
return high;
}
#endif
else if (ArmBase.Arm64.IsSupported)
{
low = a * b;
Expand Down Expand Up @@ -251,6 +258,13 @@ static ulong SoftwareFallback(ulong a, ulong b, out ulong low)
/// <returns>The high 64-bit of the product of the specified numbers.</returns>
public static long BigMul(long a, long b, out long low)
{
#if !MONO // X86Base.BigMul is not yet implemented in MONO
if (X86Base.X64.IsSupported)
{
(low, long hi) = X86Base.X64.BigMul(a, b);
return hi;
}
#endif
if (ArmBase.Arm64.IsSupported)
{
low = a * b;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,23 @@ internal X64() { }
/// </summary>
[Experimental(Experimentals.X86BaseDivRemDiagId, UrlFormat = Experimentals.SharedUrlFormat)]
public static (long Quotient, long Remainder) DivRem(ulong lower, long upper, long divisor) { throw new PlatformNotSupportedException(); }

/// <summary>
/// <para>unsigned _umul128(unsigned __int64 Multiplier, unsigned __int64 Multiplicand, unsigned __int64 * HighProduct)</para>
/// <para> MUL reg/m64</para>
/// </summary>
/// <remarks>
/// <para>Its functionality is exposed in the public <see cref="Math" /> class.</para>
/// </remarks>
internal static (ulong Lower, ulong Upper) BigMul(ulong left, ulong right) { throw new PlatformNotSupportedException(); }

/// <summary>
/// <para> IMUL reg/m64</para>
/// </summary>
/// <remarks>
/// <para>Its functionality is exposed in the public <see cref="Math" /> class.</para>
/// </remarks>
internal static (long Lower, long Upper) BigMul(long left, long right) { throw new PlatformNotSupportedException(); }
}

/// <summary>
Expand Down Expand Up @@ -109,6 +126,28 @@ internal X64() { }
[Experimental(Experimentals.X86BaseDivRemDiagId, UrlFormat = Experimentals.SharedUrlFormat)]
public static (nint Quotient, nint Remainder) DivRem(nuint lower, nint upper, nint divisor) { throw new PlatformNotSupportedException(); }

/// <summary>
/// <para> MUL reg/m32</para>
/// </summary>
/// <remarks>
/// <para>Its functionality is exposed in the public <see cref="Math" /> class.</para>
/// </remarks>
internal static (uint Lower, uint Upper) BigMul(uint left, uint right) { throw new PlatformNotSupportedException(); }

/// <summary>
/// <para> IMUL reg/m32</para>
/// </summary>
/// <remarks>
/// <para>Its functionality is exposed in the public <see cref="Math" /> class.</para>
/// </remarks>
internal static (int Lower, int Upper) BigMul(int left, int right) { throw new PlatformNotSupportedException(); }

/// <summary> MUL reg/m</summary>
internal static (nuint Lower, nuint Upper) BigMul(nuint left, nuint right) { throw new PlatformNotSupportedException(); }

/// <summary> IMUL reg/m</summary>
internal static (nint Lower, nint Upper) BigMul(nint left, nint right) { throw new PlatformNotSupportedException(); }

/// <summary>
/// <para>void _mm_pause (void);</para>
/// <para> PAUSE</para>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,26 @@ internal X64() { }
/// </summary>
[Experimental(Experimentals.X86BaseDivRemDiagId, UrlFormat = Experimentals.SharedUrlFormat)]
public static (long Quotient, long Remainder) DivRem(ulong lower, long upper, long divisor) => DivRem(lower, upper, divisor);

#if !MONO
/// <summary>
/// <para>unsigned _umul128(unsigned __int64 Multiplier, unsigned __int64 Multiplicand, unsigned __int64 * HighProduct)</para>
/// <para> MUL reg/m64</para>
/// </summary>
/// <remarks>
/// <para>Its functionality is exposed by the public <see cref="Math.BigMul(ulong, ulong, out ulong)" />.</para>
/// <para>In the future it might emit mulx on compatible hardware</para>
/// </remarks>
internal static (ulong Lower, ulong Upper) BigMul(ulong left, ulong right) => BigMul(left, right);

/// <summary>
/// <para> IMUL reg/m64</para>
/// </summary>
/// <remarks>
/// <para>Its functionality is exposed by the public <see cref="Math.BigMul(long, long, out long)" />.</para>
/// </remarks>
internal static (long Lower, long Upper) BigMul(long left, long right) => BigMul(left, right);
#endif
}

/// <summary>
Expand Down Expand Up @@ -123,6 +143,26 @@ public static unsafe (int Eax, int Ebx, int Ecx, int Edx) CpuId(int functionId,
[Experimental(Experimentals.X86BaseDivRemDiagId, UrlFormat = Experimentals.SharedUrlFormat)]
public static (nint Quotient, nint Remainder) DivRem(nuint lower, nint upper, nint divisor) => DivRem(lower, upper, divisor);

#if !MONO
/// <summary>
/// <para> MUL reg/m32</para>
/// </summary>
internal static (uint Lower, uint Upper) BigMul(uint left, uint right) => BigMul(left, right);

/// <summary>
/// <para> IMUL reg/m32</para>
/// </summary>
internal static (int Lower, int Upper) BigMul(int left, int right) => BigMul(left, right);

/// <summary> MUL reg/m</summary>
/// <remarks>Intented for UIntPtr.Bigmul https://github.com/dotnet/runtime/issues/114731 </remarks>
internal static (nuint Lower, nuint Upper) BigMul(nuint left, nuint right) => BigMul(left, right);

/// <summary> IMUL reg/m</summary>
/// <remarks>Intented for IntPtr.Bigmul https://github.com/dotnet/runtime/issues/114731 </remarks>
internal static (nint Lower, nint Upper) BigMul(nint left, nint right) => BigMul(left, right);
#endif

/// <summary>
/// <para>void _mm_pause (void);</para>
/// <para> PAUSE</para>
Expand Down
Loading