Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/coreclr/jit/gentree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28840,8 +28840,10 @@ ClassLayout* GenTreeHWIntrinsic::GetLayout(Compiler* compiler) const
switch (GetHWIntrinsicId())
{
#ifdef TARGET_XARCH
case NI_X86Base_BigMul:
case NI_X86Base_DivRem:
return compiler->typGetBlkLayout(genTypeSize(GetSimdBaseType()) * 2);
case NI_X86Base_X64_BigMul:
case NI_X86Base_X64_DivRem:
return compiler->typGetBlkLayout(16);
#endif // TARGET_XARCH
Expand Down
2 changes: 2 additions & 0 deletions src/coreclr/jit/hwintrinsic.h
Original file line number Diff line number Diff line change
Expand Up @@ -884,7 +884,9 @@ struct HWIntrinsicInfo
#endif

#ifdef TARGET_XARCH
case NI_X86Base_BigMul:
case NI_X86Base_DivRem:
case NI_X86Base_X64_BigMul:
case NI_X86Base_X64_DivRem:
return 2;
#endif // TARGET_XARCH
Expand Down
52 changes: 52 additions & 0 deletions src/coreclr/jit/hwintrinsiccodegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2487,6 +2487,58 @@ void CodeGen::genX86BaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions)

switch (intrinsicId)
{
case NI_X86Base_BigMul:
case NI_X86Base_X64_BigMul:
{
assert(node->GetOperandCount() == 2);
assert(instOptions == INS_OPTS_NONE);
assert(!node->Op(1)->isContained());

// SIMD base type is from signature and can distinguish signed and unsigned
var_types targetType = node->GetSimdBaseType();
GenTree* regOp = node->Op(1);
GenTree* rmOp = node->Op(2);
instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, targetType, compiler);

emitAttr attr = emitTypeSize(targetType);
emitter* emit = GetEmitter();

// Unsigned multiplication can use mulx on BMI2-capable CPUs
if (ins == INS_mulEAX && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX2))
{
// op1: EDX, op2: reg/mem (operand 3) => hiRes: (operand 1), lowReg: (operand 2)
assert((regOp->GetRegNum() == REG_EDX) || (rmOp->GetRegNum() != REG_EDX));

// mov the first operand into implicit source operand EDX/RDX
emit->emitIns_Mov(INS_mov, attr, REG_EDX, regOp->GetRegNum(), /* canSkip */ true);

// emit MULX instruction
regNumber lowReg = node->GetRegByIndex(0);
regNumber hiReg = node->GetRegByIndex(1);
inst_RV_RV_TT(INS_mulx, attr, hiReg, lowReg, rmOp, /* isRMW */ false, INS_OPTS_NONE);
}
else
{
// If rmOp is already in EAX, use that as implicit operand
if (rmOp->isUsedFromReg() && rmOp->GetRegNum() == REG_EAX)
{
std::swap(rmOp, regOp);
}

// op1: EAX, op2: reg/mem
emit->emitIns_Mov(INS_mov, attr, REG_EAX, regOp->GetRegNum(), /* canSkip */ true);

// emit the MUL/IMUL instruction
emit->emitInsBinary(ins, attr, node, rmOp);

// verify target registers are as expected
assert(node->GetRegByIndex(0) == REG_EAX);
assert(node->GetRegByIndex(1) == REG_EDX);
}

break;
}

case NI_X86Base_BitScanForward:
case NI_X86Base_BitScanReverse:
case NI_X86Base_X64_BitScanForward:
Expand Down
4 changes: 3 additions & 1 deletion src/coreclr/jit/hwintrinsiclistxarch.h
Original file line number Diff line number Diff line change
Expand Up @@ -457,6 +457,7 @@ HARDWARE_INTRINSIC(X86Base, AlignRight,
HARDWARE_INTRINSIC(X86Base, And, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pandd, INS_pandd, INS_pandd, INS_pandd, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_NormalizeSmallTypeToInt)
HARDWARE_INTRINSIC(X86Base, AndNot, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pandnd, INS_pandnd, INS_pandnd, INS_pandnd, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_NormalizeSmallTypeToInt)
HARDWARE_INTRINSIC(X86Base, Average, 16, 2, {INS_invalid, INS_pavgb, INS_invalid, INS_pavgw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(X86Base, BigMul, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_imulEAX, INS_mulEAX, INS_imulEAX, INS_mulEAX, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_BaseTypeFromSecondArg|HW_Flag_MultiReg|HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_RmwIntrinsic|HW_Flag_Commutative)
HARDWARE_INTRINSIC(X86Base, BitScanForward, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bsf, INS_bsf, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base, BitScanReverse, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bsr, INS_bsr, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base, Blend, 16, 3, {INS_invalid, INS_invalid, INS_pblendw, INS_pblendw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blendps, INS_blendpd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics)
Expand Down Expand Up @@ -614,7 +615,8 @@ HARDWARE_INTRINSIC(X86Base, Xor,
// {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE}
// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
// 64-bit only Intrinsics for X86Base, SSE, SSE2, SSE3, SSSE3, SSE41, SSE42, POPCNT
#define FIRST_NI_X86Base_X64 NI_X86Base_X64_BitScanForward
#define FIRST_NI_X86Base_X64 NI_X86Base_X64_BigMul
HARDWARE_INTRINSIC(X86Base_X64, BigMul, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_imulEAX, INS_mulEAX, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_BaseTypeFromSecondArg|HW_Flag_MultiReg|HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_RmwIntrinsic|HW_Flag_Commutative)
HARDWARE_INTRINSIC(X86Base_X64, BitScanForward, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bsf, INS_bsf, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base_X64, BitScanReverse, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bsr, INS_bsr, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base_X64, ConvertScalarToVector128Double, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsi2sd64, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromSecondArg)
Expand Down
21 changes: 21 additions & 0 deletions src/coreclr/jit/hwintrinsicxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4216,6 +4216,27 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
break;
}

case NI_X86Base_BigMul:
case NI_X86Base_X64_BigMul:
{
assert(sig->numArgs == 2);
assert(HWIntrinsicInfo::IsMultiReg(intrinsic));
assert(retType == TYP_STRUCT);
assert(simdBaseJitType != CORINFO_TYPE_UNDEF);

op2 = impPopStack().val;
op1 = impPopStack().val;

GenTreeHWIntrinsic* multiplyIntrinsic = gtNewScalarHWIntrinsicNode(retType, op1, op2, intrinsic);

// Store the type from signature into SIMD base type for convenience
multiplyIntrinsic->SetSimdBaseJitType(simdBaseJitType);

retNode = impStoreMultiRegValueToVar(multiplyIntrinsic,
sig->retTypeSigClass DEBUGARG(CorInfoCallConvExtension::Managed));
break;
}

case NI_X86Base_CompareScalarGreaterThan:
case NI_X86Base_CompareScalarGreaterThanOrEqual:
case NI_X86Base_CompareScalarNotGreaterThan:
Expand Down
2 changes: 2 additions & 0 deletions src/coreclr/jit/lowerxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10215,6 +10215,8 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
break;
}

case NI_X86Base_BigMul:
case NI_X86Base_X64_BigMul:
case NI_AVX2_MultiplyNoFlags:
case NI_AVX2_X64_MultiplyNoFlags:
{
Expand Down
50 changes: 48 additions & 2 deletions src/coreclr/jit/lsraxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2475,6 +2475,50 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
break;
}

case NI_X86Base_BigMul:
case NI_X86Base_X64_BigMul:
{
assert(numArgs == 2);
assert(dstCount == 2);
assert(isRMW);
assert(!op1->isContained());

if ((baseType == TYP_ULONG || baseType == TYP_UINT) &&
compiler->compOpportunisticallyDependsOn(InstructionSet_AVX2))
{
isRMW = false;

SingleTypeRegSet apxAwareRegCandidates =
Copy link
Contributor Author

@Daniel-Svensson Daniel-Svensson Jun 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FYI: I have a separate commit update at that adds EDX as fixed register for return value instead of operand.

It seems to give slightly better PerfScore for XxHashShared:MergeAccumulators, but seems a bit backwards to specify target register instead of source. Also the code difference might be fixed by future improvements to register allocator instead.

How do you feel about it? i am not sure if it is better or not

generated assembly

With commit that fix return value in rdx:

; Method BigMul.XxHashShared:MergeAccumulators(ptr,ptr,ulong):ulong (FullOpts)
G_M1045_IG01:  ;; offset=0x0000
						;; size=0 bbWeight=1 PerfScore 0.00

G_M1045_IG02:  ;; offset=0x0000
       mov      rax, qword ptr [rcx]
       xor      rax, qword ptr [rdx]
       mov      r10, qword ptr [rcx+0x08]
       mov      qword ptr [rsp+0x10], rdx
       xor      r10, qword ptr [rdx+0x08]
       mov      rdx, r10
       mulx     rax, rdx, rax
       xor      rax, rdx
       add      rax, r8
       mov      rdx, qword ptr [rcx+0x10]
       mov      r8, qword ptr [rsp+0x10]
       xor      rdx, qword ptr [r8+0x10]
       mov      r10, qword ptr [rcx+0x18]
       xor      r10, qword ptr [r8+0x18]
       mulx     r10, rdx, r10
       xor      r10, rdx
       add      rax, r10
       mov      rdx, qword ptr [rcx+0x20]
       xor      rdx, qword ptr [r8+0x20]
       mov      r10, qword ptr [rcx+0x28]
       xor      r10, qword ptr [r8+0x28]
       mulx     r10, rdx, r10
       xor      r10, rdx
       add      rax, r10
       mov      rdx, qword ptr [rcx+0x30]
       xor      rdx, qword ptr [r8+0x30]
       mov      rcx, qword ptr [rcx+0x38]
       xor      rcx, qword ptr [r8+0x38]
       mulx     rcx, rdx, rcx
       xor      rcx, rdx
       add      rax, rcx
       mov      rcx, rax
       shr      rcx, 37
       xor      rcx, rax
       mov      rax, 0x165667919E3779F9
       imul     rax, rcx
       mov      rcx, rax
       shr      rcx, 32
       xor      rax, rcx
						;; size=153 bbWeight=1 PerfScore 60.50

G_M1045_IG03:  ;; offset=0x0099
       ret      
						;; size=1 bbWeight=1 PerfScore 1.00
; Total bytes of code: 154

Without commit

; Method BigMul.XxHashShared:MergeAccumulators(ptr,ptr,ulong):ulong (FullOpts)
G_M1045_IG01:  ;; offset=0x0000
						;; size=0 bbWeight=1 PerfScore 0.00

G_M1045_IG02:  ;; offset=0x0000
       mov      rax, qword ptr [rcx]
       xor      rax, qword ptr [rdx]
       mov      r10, qword ptr [rcx+0x08]
       mov      qword ptr [rsp+0x10], rdx
       xor      r10, qword ptr [rdx+0x08]
       mov      rdx, r10
       mulx     r10, rax, rax
       xor      rax, r10
       add      rax, r8
       mov      r8, qword ptr [rcx+0x10]
       mov      rdx, qword ptr [rsp+0x10]
       xor      r8, qword ptr [rdx+0x10]
       mov      r10, qword ptr [rcx+0x18]
       mov      qword ptr [rsp+0x10], rdx
       xor      r10, qword ptr [rdx+0x18]
       mov      rdx, r10
       mulx     r10, r8, r8
       xor      r8, r10
       add      rax, r8
       mov      r8, qword ptr [rcx+0x20]
       mov      rdx, qword ptr [rsp+0x10]
       xor      r8, qword ptr [rdx+0x20]
       mov      r10, qword ptr [rcx+0x28]
       mov      qword ptr [rsp+0x10], rdx
       xor      r10, qword ptr [rdx+0x28]
       mov      rdx, r10
       mulx     r10, r8, r8
       xor      r8, r10
       add      rax, r8
       mov      r8, qword ptr [rcx+0x30]
       mov      rdx, qword ptr [rsp+0x10]
       xor      r8, qword ptr [rdx+0x30]
       mov      rcx, qword ptr [rcx+0x38]
       xor      rcx, qword ptr [rdx+0x38]
       mov      rdx, rcx
       mulx     rdx, rcx, r8
       xor      rcx, rdx
       add      rax, rcx
       mov      rcx, rax
       shr      rcx, 37
       xor      rcx, rax
       mov      rax, 0x165667919E3779F9
       imul     rax, rcx
       mov      rcx, rax
       shr      rcx, 32
       xor      rax, rcx
						;; size=182 bbWeight=1 PerfScore 65.25

G_M1045_IG03:  ;; offset=0x00B6
       ret      
						;; size=1 bbWeight=1 PerfScore 1.00
; Total bytes of code: 183

Without AVX2 (mul only)

; Method BigMul.XxHashShared:MergeAccumulators(ptr,ptr,ulong):ulong (FullOpts)
G_M1045_IG01:  ;; offset=0x0000
						;; size=0 bbWeight=1 PerfScore 0.00

G_M1045_IG02:  ;; offset=0x0000
       mov      rax, qword ptr [rcx]
       xor      rax, qword ptr [rdx]
       mov      r10, qword ptr [rcx+0x08]
       mov      qword ptr [rsp+0x10], rdx
       xor      r10, qword ptr [rdx+0x08]
       mul      rdx:rax, r10
       xor      rax, rdx
       add      r8, rax
       mov      rax, qword ptr [rcx+0x10]
       mov      rdx, qword ptr [rsp+0x10]
       xor      rax, qword ptr [rdx+0x10]
       mov      r10, qword ptr [rcx+0x18]
       mov      qword ptr [rsp+0x10], rdx
       xor      r10, qword ptr [rdx+0x18]
       mul      rdx:rax, r10
       xor      rax, rdx
       add      r8, rax
       mov      rax, qword ptr [rcx+0x20]
       mov      rdx, qword ptr [rsp+0x10]
       xor      rax, qword ptr [rdx+0x20]
       mov      r10, qword ptr [rcx+0x28]
       mov      qword ptr [rsp+0x10], rdx
       xor      r10, qword ptr [rdx+0x28]
       mul      rdx:rax, r10
       xor      rax, rdx
       add      r8, rax
       mov      rax, qword ptr [rcx+0x30]
       mov      rdx, qword ptr [rsp+0x10]
       xor      rax, qword ptr [rdx+0x30]
       mov      rcx, qword ptr [rcx+0x38]
       xor      rcx, qword ptr [rdx+0x38]
       mul      rdx:rax, rcx
       xor      rax, rdx
       add      rax, r8
       mov      rcx, rax
       shr      rcx, 37
       xor      rcx, rax
       mov      rax, 0x165667919E3779F9
       imul     rax, rcx
       mov      rcx, rax
       shr      rcx, 32
       xor      rax, rcx
						;; size=162 bbWeight=1 PerfScore 64.25

G_M1045_IG03:  ;; offset=0x00A2
       ret      
						;; size=1 bbWeight=1 PerfScore 1.00
; Total bytes of code: 163

Copy link
Contributor Author

@Daniel-Svensson Daniel-Svensson Aug 4, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@EgorBo do you have any preference on which approach to take here? (Hardcode rdx as return register or not)

The main advantage with the other approach as i see it is that the code, and generated code for mul and mulx becomes more similar.

I will try and clarify the commen for the swap and fix the conflict in a day or two and would like to know if there is anything else to change.

ForceLowGprForApxIfNeeded(op2, RBM_NONE, canHWIntrinsicUseApxRegs);
// mulx, place op1 in implicit EDX register since op2 might be contained
srcCount = BuildOperandUses(op1, SRBM_EDX);
srcCount += BuildOperandUses(op2, apxAwareRegCandidates);

// result in any register
SingleTypeRegSet apxAwareDestCandidates =
ForceLowGprForApxIfNeeded(intrinsicTree, RBM_NONE, canHWIntrinsicUseApxRegs);
BuildDef(intrinsicTree, apxAwareDestCandidates, 0);
BuildDef(intrinsicTree, apxAwareDestCandidates, 1);
}
else // Signed multiply or normal unsigned multiply in one operand form
{
SingleTypeRegSet apxAwareRegCandidates =
ForceLowGprForApxIfNeeded(op1, RBM_NONE, canHWIntrinsicUseApxRegs);

// mulEAX always use EAX, if one operand is contained, specify other op with fixed EAX register
// otherwise dont force any register, we might get the second parameter in EAX
srcCount = BuildOperandUses(op1, op2->isContained() ? SRBM_EAX : apxAwareRegCandidates);
srcCount += BuildOperandUses(op2, apxAwareRegCandidates);

// result put in EAX and EDX
BuildDef(intrinsicTree, SRBM_EAX, 0);
BuildDef(intrinsicTree, SRBM_EDX, 1);
}

buildUses = false;
break;
}

case NI_AVX2_MultiplyNoFlags:
case NI_AVX2_X64_MultiplyNoFlags:
{
Expand Down Expand Up @@ -3007,9 +3051,11 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
}
else
{
// Currently dstCount = 2 is only used for DivRem, which has special constraints and is handled above
// Currently dstCount = 2 is only used for DivRem and BigMul, which has special constraints and is handled
// above
assert((dstCount == 0) ||
((dstCount == 2) && ((intrinsicId == NI_X86Base_DivRem) || (intrinsicId == NI_X86Base_X64_DivRem))));
((dstCount == 2) && ((intrinsicId == NI_X86Base_DivRem) || (intrinsicId == NI_X86Base_X64_DivRem) ||
(intrinsicId == NI_X86Base_BigMul) || (intrinsicId == NI_X86Base_X64_BigMul))));
}

*pDstCount = dstCount;
Expand Down
24 changes: 19 additions & 5 deletions src/libraries/System.Private.CoreLib/src/System/Math.cs
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ public static long BigMul(int a, int b)
return ((long)a) * b;
}


#if !(TARGET_ARM64 || (TARGET_AMD64 && !MONO)) // BigMul 64*64 has high performance intrinsics on ARM64 and AMD64 (but not yet on MONO)
/// <summary>
/// Perform multiplication between 64 and 32 bit numbers, returning lower 64 bits in <paramref name="low"/>
/// </summary>
Expand All @@ -180,21 +180,18 @@ public static long BigMul(int a, int b)
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static ulong BigMul(ulong a, uint b, out ulong low)
{
#if TARGET_64BIT
return Math.BigMul((ulong)a, (ulong)b, out low);
#else
ulong prodL = ((ulong)(uint)a) * b;
ulong prodH = (prodL >> 32) + (((ulong)(uint)(a >> 32)) * b);

low = ((prodH << 32) | (uint)prodL);
return (prodH >> 32);
#endif
}

/// <inheritdoc cref="BigMul(ulong, uint, out ulong)"/>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static ulong BigMul(uint a, ulong b, out ulong low)
=> BigMul(b, a, out low);
#endif

/// <summary>Produces the full product of two unsigned 64-bit numbers.</summary>
/// <param name="a">The first number to multiply.</param>
Expand All @@ -205,13 +202,23 @@ internal static ulong BigMul(uint a, ulong b, out ulong low)
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static unsafe ulong BigMul(ulong a, ulong b, out ulong low)
{
#if !MONO // X64.BigMul is not yet implemented in MONO
// X86Base.X64.BigMul is more performant than X86Base.X64.MultiplyNoFlags that has performance issues (#11782)
// so we don't need a separate BMI2 path
if (X86Base.X64.IsSupported)
{
(low, ulong hi) = X86Base.X64.BigMul(a, b);
return hi;
}
#else
if (Bmi2.X64.IsSupported)
{
ulong tmp;
ulong high = Bmi2.X64.MultiplyNoFlags(a, b, &tmp);
low = tmp;
return high;
}
#endif
else if (ArmBase.Arm64.IsSupported)
{
low = a * b;
Expand Down Expand Up @@ -251,6 +258,13 @@ static ulong SoftwareFallback(ulong a, ulong b, out ulong low)
/// <returns>The high 64-bit of the product of the specified numbers.</returns>
public static long BigMul(long a, long b, out long low)
{
#if !MONO // X86Base.BigMul is not yet implemented in MONO
if (X86Base.X64.IsSupported)
{
(low, long hi) = X86Base.X64.BigMul(a, b);
return hi;
}
#endif
if (ArmBase.Arm64.IsSupported)
{
low = a * b;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,25 @@ internal X64() { }
/// </summary>
[Experimental(Experimentals.X86BaseDivRemDiagId, UrlFormat = Experimentals.SharedUrlFormat)]
public static (long Quotient, long Remainder) DivRem(ulong lower, long upper, long divisor) { throw new PlatformNotSupportedException(); }

/// <summary>
/// <para>unsigned _umul128(unsigned __int64 Multiplier, unsigned __int64 Multiplicand, unsigned __int64 * HighProduct)</para>
/// <para> MUL reg/m64</para>
/// <para> MULX reg reg reg/m64 (if BMI2 is supported)</para>
/// </summary>
/// <remarks>
/// <para>Its functionality is exposed by the public <see cref="Math.BigMul(ulong, ulong, out ulong)" />.</para>
/// <para>Can emit either mul or mulx depending on hardware</para>
/// </remarks>
internal static (ulong Lower, ulong Upper) BigMul(ulong left, ulong right) { throw new PlatformNotSupportedException(); }

/// <summary>
/// <para> IMUL reg/m64</para>
/// </summary>
/// <remarks>
/// <para>Its functionality is exposed in the public <see cref="Math" /> class.</para>
/// </remarks>
internal static (long Lower, long Upper) BigMul(long left, long right) { throw new PlatformNotSupportedException(); }
}

/// <summary>
Expand Down Expand Up @@ -109,6 +128,26 @@ internal X64() { }
[Experimental(Experimentals.X86BaseDivRemDiagId, UrlFormat = Experimentals.SharedUrlFormat)]
public static (nint Quotient, nint Remainder) DivRem(nuint lower, nint upper, nint divisor) { throw new PlatformNotSupportedException(); }

/// <summary>
/// <para> MUL reg/m32</para>
/// <para> MULX reg reg reg/m32 (if BMI2 is supported)</para>
/// </summary>
internal static (uint Lower, uint Upper) BigMul(uint left, uint right) { throw new PlatformNotSupportedException(); }

/// <summary>
/// <para> IMUL reg/m32</para>
/// </summary>
internal static (int Lower, int Upper) BigMul(int left, int right) { throw new PlatformNotSupportedException(); }

/// <summary>
/// <para> MUL reg/m</para>
/// <para> MULX reg reg reg/m (if BMI2 is supported)</para>
/// </summary>
internal static (nuint Lower, nuint Upper) BigMul(nuint left, nuint right) { throw new PlatformNotSupportedException(); }

/// <summary> IMUL reg/m</summary>
internal static (nint Lower, nint Upper) BigMul(nint left, nint right) { throw new PlatformNotSupportedException(); }
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The nint and 32 bit version never gets called, should I remove them or keep them for the symmetry ?

I do no longer think it makes much sense to make this methods public (#58263) since you can just call existing BigMul methods, and the nint version was solved by a simple *if * in #114731


/// <summary>
/// <para>void _mm_pause (void);</para>
/// <para> PAUSE</para>
Expand Down
Loading
Loading