-
Notifications
You must be signed in to change notification settings - Fork 5.3k
Improve Math.BigMul on x64 by adding new internal Multiply hardware intrinsic to X86Base
#115966
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
d781b3a
7db8372
331cc98
614edf6
c91abc6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2475,6 +2475,50 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou | |
| break; | ||
| } | ||
|
|
||
| case NI_X86Base_BigMul: | ||
| case NI_X86Base_X64_BigMul: | ||
| { | ||
| assert(numArgs == 2); | ||
| assert(dstCount == 2); | ||
| assert(isRMW); | ||
| assert(!op1->isContained()); | ||
|
|
||
| if ((baseType == TYP_ULONG || baseType == TYP_UINT) && | ||
| compiler->compOpportunisticallyDependsOn(InstructionSet_AVX2)) | ||
| { | ||
| isRMW = false; | ||
|
|
||
| SingleTypeRegSet apxAwareRegCandidates = | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. FYI: I have a separate commit update at that adds EDX as fixed register for return value instead of operand. It seems to give slightly better PerfScore for XxHashShared:MergeAccumulators, but seems a bit backwards to specify target register instead of source. Also the code difference might be fixed by future improvements to register allocator instead. How do you feel about it? i am not sure if it is better or not generated assemblyWith commit that fix return value in rdx:; Method BigMul.XxHashShared:MergeAccumulators(ptr,ptr,ulong):ulong (FullOpts)
G_M1045_IG01: ;; offset=0x0000
;; size=0 bbWeight=1 PerfScore 0.00
G_M1045_IG02: ;; offset=0x0000
mov rax, qword ptr [rcx]
xor rax, qword ptr [rdx]
mov r10, qword ptr [rcx+0x08]
mov qword ptr [rsp+0x10], rdx
xor r10, qword ptr [rdx+0x08]
mov rdx, r10
mulx rax, rdx, rax
xor rax, rdx
add rax, r8
mov rdx, qword ptr [rcx+0x10]
mov r8, qword ptr [rsp+0x10]
xor rdx, qword ptr [r8+0x10]
mov r10, qword ptr [rcx+0x18]
xor r10, qword ptr [r8+0x18]
mulx r10, rdx, r10
xor r10, rdx
add rax, r10
mov rdx, qword ptr [rcx+0x20]
xor rdx, qword ptr [r8+0x20]
mov r10, qword ptr [rcx+0x28]
xor r10, qword ptr [r8+0x28]
mulx r10, rdx, r10
xor r10, rdx
add rax, r10
mov rdx, qword ptr [rcx+0x30]
xor rdx, qword ptr [r8+0x30]
mov rcx, qword ptr [rcx+0x38]
xor rcx, qword ptr [r8+0x38]
mulx rcx, rdx, rcx
xor rcx, rdx
add rax, rcx
mov rcx, rax
shr rcx, 37
xor rcx, rax
mov rax, 0x165667919E3779F9
imul rax, rcx
mov rcx, rax
shr rcx, 32
xor rax, rcx
;; size=153 bbWeight=1 PerfScore 60.50
G_M1045_IG03: ;; offset=0x0099
ret
;; size=1 bbWeight=1 PerfScore 1.00
; Total bytes of code: 154
Without commit; Method BigMul.XxHashShared:MergeAccumulators(ptr,ptr,ulong):ulong (FullOpts)
G_M1045_IG01: ;; offset=0x0000
;; size=0 bbWeight=1 PerfScore 0.00
G_M1045_IG02: ;; offset=0x0000
mov rax, qword ptr [rcx]
xor rax, qword ptr [rdx]
mov r10, qword ptr [rcx+0x08]
mov qword ptr [rsp+0x10], rdx
xor r10, qword ptr [rdx+0x08]
mov rdx, r10
mulx r10, rax, rax
xor rax, r10
add rax, r8
mov r8, qword ptr [rcx+0x10]
mov rdx, qword ptr [rsp+0x10]
xor r8, qword ptr [rdx+0x10]
mov r10, qword ptr [rcx+0x18]
mov qword ptr [rsp+0x10], rdx
xor r10, qword ptr [rdx+0x18]
mov rdx, r10
mulx r10, r8, r8
xor r8, r10
add rax, r8
mov r8, qword ptr [rcx+0x20]
mov rdx, qword ptr [rsp+0x10]
xor r8, qword ptr [rdx+0x20]
mov r10, qword ptr [rcx+0x28]
mov qword ptr [rsp+0x10], rdx
xor r10, qword ptr [rdx+0x28]
mov rdx, r10
mulx r10, r8, r8
xor r8, r10
add rax, r8
mov r8, qword ptr [rcx+0x30]
mov rdx, qword ptr [rsp+0x10]
xor r8, qword ptr [rdx+0x30]
mov rcx, qword ptr [rcx+0x38]
xor rcx, qword ptr [rdx+0x38]
mov rdx, rcx
mulx rdx, rcx, r8
xor rcx, rdx
add rax, rcx
mov rcx, rax
shr rcx, 37
xor rcx, rax
mov rax, 0x165667919E3779F9
imul rax, rcx
mov rcx, rax
shr rcx, 32
xor rax, rcx
;; size=182 bbWeight=1 PerfScore 65.25
G_M1045_IG03: ;; offset=0x00B6
ret
;; size=1 bbWeight=1 PerfScore 1.00
; Total bytes of code: 183Without AVX2 (mul only); Method BigMul.XxHashShared:MergeAccumulators(ptr,ptr,ulong):ulong (FullOpts)
G_M1045_IG01: ;; offset=0x0000
;; size=0 bbWeight=1 PerfScore 0.00
G_M1045_IG02: ;; offset=0x0000
mov rax, qword ptr [rcx]
xor rax, qword ptr [rdx]
mov r10, qword ptr [rcx+0x08]
mov qword ptr [rsp+0x10], rdx
xor r10, qword ptr [rdx+0x08]
mul rdx:rax, r10
xor rax, rdx
add r8, rax
mov rax, qword ptr [rcx+0x10]
mov rdx, qword ptr [rsp+0x10]
xor rax, qword ptr [rdx+0x10]
mov r10, qword ptr [rcx+0x18]
mov qword ptr [rsp+0x10], rdx
xor r10, qword ptr [rdx+0x18]
mul rdx:rax, r10
xor rax, rdx
add r8, rax
mov rax, qword ptr [rcx+0x20]
mov rdx, qword ptr [rsp+0x10]
xor rax, qword ptr [rdx+0x20]
mov r10, qword ptr [rcx+0x28]
mov qword ptr [rsp+0x10], rdx
xor r10, qword ptr [rdx+0x28]
mul rdx:rax, r10
xor rax, rdx
add r8, rax
mov rax, qword ptr [rcx+0x30]
mov rdx, qword ptr [rsp+0x10]
xor rax, qword ptr [rdx+0x30]
mov rcx, qword ptr [rcx+0x38]
xor rcx, qword ptr [rdx+0x38]
mul rdx:rax, rcx
xor rax, rdx
add rax, r8
mov rcx, rax
shr rcx, 37
xor rcx, rax
mov rax, 0x165667919E3779F9
imul rax, rcx
mov rcx, rax
shr rcx, 32
xor rax, rcx
;; size=162 bbWeight=1 PerfScore 64.25
G_M1045_IG03: ;; offset=0x00A2
ret
;; size=1 bbWeight=1 PerfScore 1.00
; Total bytes of code: 163
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @EgorBo do you have any preference on which approach to take here? (Hardcode rdx as return register or not) The main advantage with the other approach as i see it is that the code, and generated code for mul and mulx becomes more similar. I will try and clarify the commen for the swap and fix the conflict in a day or two and would like to know if there is anything else to change. |
||
| ForceLowGprForApxIfNeeded(op2, RBM_NONE, canHWIntrinsicUseApxRegs); | ||
| // mulx, place op1 in implicit EDX register since op2 might be contained | ||
| srcCount = BuildOperandUses(op1, SRBM_EDX); | ||
| srcCount += BuildOperandUses(op2, apxAwareRegCandidates); | ||
|
|
||
| // result in any register | ||
| SingleTypeRegSet apxAwareDestCandidates = | ||
| ForceLowGprForApxIfNeeded(intrinsicTree, RBM_NONE, canHWIntrinsicUseApxRegs); | ||
| BuildDef(intrinsicTree, apxAwareDestCandidates, 0); | ||
| BuildDef(intrinsicTree, apxAwareDestCandidates, 1); | ||
| } | ||
| else // Signed multiply or normal unsigned multiply in one operand form | ||
| { | ||
| SingleTypeRegSet apxAwareRegCandidates = | ||
| ForceLowGprForApxIfNeeded(op1, RBM_NONE, canHWIntrinsicUseApxRegs); | ||
|
|
||
| // mulEAX always use EAX, if one operand is contained, specify other op with fixed EAX register | ||
| // otherwise dont force any register, we might get the second parameter in EAX | ||
| srcCount = BuildOperandUses(op1, op2->isContained() ? SRBM_EAX : apxAwareRegCandidates); | ||
| srcCount += BuildOperandUses(op2, apxAwareRegCandidates); | ||
|
|
||
| // result put in EAX and EDX | ||
| BuildDef(intrinsicTree, SRBM_EAX, 0); | ||
| BuildDef(intrinsicTree, SRBM_EDX, 1); | ||
| } | ||
|
|
||
| buildUses = false; | ||
| break; | ||
| } | ||
|
|
||
| case NI_AVX2_MultiplyNoFlags: | ||
| case NI_AVX2_X64_MultiplyNoFlags: | ||
| { | ||
|
|
@@ -3007,9 +3051,11 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou | |
| } | ||
| else | ||
| { | ||
| // Currently dstCount = 2 is only used for DivRem, which has special constraints and is handled above | ||
| // Currently dstCount = 2 is only used for DivRem and BigMul, which has special constraints and is handled | ||
| // above | ||
| assert((dstCount == 0) || | ||
| ((dstCount == 2) && ((intrinsicId == NI_X86Base_DivRem) || (intrinsicId == NI_X86Base_X64_DivRem)))); | ||
| ((dstCount == 2) && ((intrinsicId == NI_X86Base_DivRem) || (intrinsicId == NI_X86Base_X64_DivRem) || | ||
| (intrinsicId == NI_X86Base_BigMul) || (intrinsicId == NI_X86Base_X64_BigMul)))); | ||
| } | ||
|
|
||
| *pDstCount = dstCount; | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -63,6 +63,25 @@ internal X64() { } | |
| /// </summary> | ||
| [Experimental(Experimentals.X86BaseDivRemDiagId, UrlFormat = Experimentals.SharedUrlFormat)] | ||
| public static (long Quotient, long Remainder) DivRem(ulong lower, long upper, long divisor) { throw new PlatformNotSupportedException(); } | ||
|
|
||
| /// <summary> | ||
| /// <para>unsigned _umul128(unsigned __int64 Multiplier, unsigned __int64 Multiplicand, unsigned __int64 * HighProduct)</para> | ||
| /// <para> MUL reg/m64</para> | ||
| /// <para> MULX reg reg reg/m64 (if BMI2 is supported)</para> | ||
| /// </summary> | ||
| /// <remarks> | ||
| /// <para>Its functionality is exposed by the public <see cref="Math.BigMul(ulong, ulong, out ulong)" />.</para> | ||
| /// <para>Can emit either mul or mulx depending on hardware</para> | ||
| /// </remarks> | ||
| internal static (ulong Lower, ulong Upper) BigMul(ulong left, ulong right) { throw new PlatformNotSupportedException(); } | ||
|
|
||
| /// <summary> | ||
| /// <para> IMUL reg/m64</para> | ||
| /// </summary> | ||
| /// <remarks> | ||
| /// <para>Its functionality is exposed in the public <see cref="Math" /> class.</para> | ||
| /// </remarks> | ||
| internal static (long Lower, long Upper) BigMul(long left, long right) { throw new PlatformNotSupportedException(); } | ||
| } | ||
|
|
||
| /// <summary> | ||
|
|
@@ -109,6 +128,26 @@ internal X64() { } | |
| [Experimental(Experimentals.X86BaseDivRemDiagId, UrlFormat = Experimentals.SharedUrlFormat)] | ||
| public static (nint Quotient, nint Remainder) DivRem(nuint lower, nint upper, nint divisor) { throw new PlatformNotSupportedException(); } | ||
|
|
||
| /// <summary> | ||
| /// <para> MUL reg/m32</para> | ||
| /// <para> MULX reg reg reg/m32 (if BMI2 is supported)</para> | ||
| /// </summary> | ||
| internal static (uint Lower, uint Upper) BigMul(uint left, uint right) { throw new PlatformNotSupportedException(); } | ||
|
|
||
| /// <summary> | ||
| /// <para> IMUL reg/m32</para> | ||
| /// </summary> | ||
| internal static (int Lower, int Upper) BigMul(int left, int right) { throw new PlatformNotSupportedException(); } | ||
|
|
||
| /// <summary> | ||
| /// <para> MUL reg/m</para> | ||
| /// <para> MULX reg reg reg/m (if BMI2 is supported)</para> | ||
| /// </summary> | ||
| internal static (nuint Lower, nuint Upper) BigMul(nuint left, nuint right) { throw new PlatformNotSupportedException(); } | ||
|
|
||
| /// <summary> IMUL reg/m</summary> | ||
| internal static (nint Lower, nint Upper) BigMul(nint left, nint right) { throw new PlatformNotSupportedException(); } | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
|
|
||
| /// <summary> | ||
| /// <para>void _mm_pause (void);</para> | ||
| /// <para> PAUSE</para> | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.