-
Notifications
You must be signed in to change notification settings - Fork 4.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Optimizes vector conversions with AVX512 #87878
Optimizes vector conversions with AVX512 #87878
Conversation
…vtsd2usi uses ulong.max_value to show FPE for negative, NaN and ulong_max + 1 values.
…architecture. This is because we have changed the JITDbl2Ulng helper function to mimic the new IEEE-compliant AVX512 instruction vcvtsd2usi. In the process, we needed to update the library test case because the default Floating Point Error (FPE) value for the new instruction differs from the default MSVC FPE value, which is 0.
…not changing the library test case but the API to make sure NaN cases are handled.
…id handling edge cases (-1,0) separately inside the helper.
trying to return EA_4BYTE for INS_vcvttss2usi to make sure that we read dword and not qword for float to ulong
… a special handling for vcvttss2usi64 to make sure we read only dword instead of qword for float to ulong conversion
…n for vcvttsd2usi, vcvttusi2sd32/64
…th other similar instructions
…r nowayasserts and also checking for both float and double in lowercast for overflow and conversion to ulong
…y are not available in release mode
…) into a single node i.e. GT_CAST(TYP_ULONG, TYP_FLOAT)
…ideration the 32-bit and 64-bit versions of vcvttss2usi.
…endsOn only runs in debug mode.
…ertToUInt64 for double, ConvertToInt64 for double, ConvertToDouble for ulong/long for vector
…xarch for instructions used for truncation
…ort those conversions due to issues related to the mismatch between non-AVX512 and AVX512 machines
…X512 and also extending NI_VectorT512_ConvertToInt64 and NI_VectorT512_ConvertToUInt64 support for Float.
Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch Issue Details
The following new APIs have been added as part of this PR:
1. Vector.ConvertToUInt64
public static Vector256<ulong> ConvDoubleToUlong256(Vector256<double> val)
{
return Vector256.ConvertToUInt64(val);
}
Assembly Before Optimization
G_M9746_IG01: ;; offset=0000H
push rbp
sub rsp, 144
vzeroupper
lea rbp, [rsp+90H]
vxorps xmm4, xmm4, xmm4
vmovdqa32 xmmword ptr [rbp-30H], xmm4
vmovdqa32 xmmword ptr [rbp-20H], xmm4
mov bword ptr [rbp+10H], rcx
mov bword ptr [rbp+18H], rdx
;; size=47 bbWeight=1 PerfScore 9.08
G_M9746_IG02: ;; offset=002FH
cmp dword ptr [(reloc 0x7ffb63022898)], 0
je SHORT G_M9746_IG04
;; size=9 bbWeight=1 PerfScore 4.00
G_M9746_IG03: ;; offset=0038H
call CORINFO_HELP_DBG_IS_JUST_MY_CODE
;; size=5 bbWeight=0.50 PerfScore 0.50
G_M9746_IG04: ;; offset=003DH
nop
mov rdx, bword ptr [rbp+18H]
vmovups ymm0, ymmword ptr [rdx]
vmovups ymmword ptr [rbp-70H], ymm0
lea rdx, [rbp-70H]
lea rcx, [rbp-50H]
call [System.Runtime.Intrinsics.Vector256:ConvertToUInt64(System.Runtime.Intrinsics.Vector256`1[double]):System.Runtime.Intrinsics.Vector256`1[ulong]]
vmovups ymm0, ymmword ptr [rbp-50H]
vmovups ymmword ptr [rbp-30H], ymm0
nop
;; size=56 bbWeight=1 PerfScore 16.50
G_M9746_IG05: ;; offset=0075H
mov rax, bword ptr [rbp+10H]
vmovups ymm0, ymmword ptr [rbp-30H]
vmovups ymmword ptr [rax], ymm0
mov rax, bword ptr [rbp+10H]
;; size=24 bbWeight=1 PerfScore 8.00
G_M9746_IG06: ;; offset=008DH
vzeroupper
add rsp, 144
pop rbp
ret
;; size=12 bbWeight=1 PerfScore 2.75
Assembly After Optimization
G_M9746_IG01: ;; offset=0000H
push rbp
sub rsp, 112
vzeroupper
lea rbp, [rsp+70H]
vxorps xmm4, xmm4, xmm4
vmovdqa32 xmmword ptr [rbp-30H], xmm4
vmovdqa32 xmmword ptr [rbp-20H], xmm4
mov bword ptr [rbp+10H], rcx
mov bword ptr [rbp+18H], rdx
;; size=41 bbWeight=1 PerfScore 9.08
G_M9746_IG02: ;; offset=0029H
cmp dword ptr [(reloc 0x7ffb61d22898)], 0
je SHORT G_M9746_IG04
;; size=9 bbWeight=1 PerfScore 4.00
G_M9746_IG03: ;; offset=0032H
call CORINFO_HELP_DBG_IS_JUST_MY_CODE
;; size=5 bbWeight=0.50 PerfScore 0.50
G_M9746_IG04: ;; offset=0037H
nop
mov rax, bword ptr [rbp+18H]
vcvttpd2uqq ymm0, ymmword ptr [rax]
vmovups ymmword ptr [rbp-50H], ymm0
vmovups ymm0, ymmword ptr [rbp-50H]
vmovups ymmword ptr [rbp-30H], ymm0
nop
;; size=42 bbWeight=1 PerfScore 13.50
G_M9746_IG05: ;; offset=0061H
mov rax, bword ptr [rbp+10H]
vmovups ymm0, ymmword ptr [rbp-30H]
vmovups ymmword ptr [rax], ymm0
mov rax, bword ptr [rbp+10H]
;; size=24 bbWeight=1 PerfScore 8.00
G_M9746_IG06: ;; offset=0079H
vzeroupper
add rsp, 112
pop rbp
ret
;; size=9 bbWeight=1 PerfScore 2.75
2. Vector.ConvertToDouble
public static Vector256<double> ConvUlongToDouble256(Vector256<ulong> val)
{
return Vector256.ConvertToDouble(val);
}
Assembly Before Optimization
G_M14098_IG01: ;; offset=0000H
push rbp
sub rsp, 144
vzeroupper
lea rbp, [rsp+90H]
vxorps xmm4, xmm4, xmm4
vmovdqa32 xmmword ptr [rbp-30H], xmm4
vmovdqa32 xmmword ptr [rbp-20H], xmm4
mov bword ptr [rbp+10H], rcx
mov bword ptr [rbp+18H], rdx
;; size=47 bbWeight=1 PerfScore 9.08
G_M14098_IG02: ;; offset=002FH
cmp dword ptr [(reloc 0x7ffb61ac2898)], 0
je SHORT G_M14098_IG04
;; size=9 bbWeight=1 PerfScore 4.00
G_M14098_IG03: ;; offset=0038H
call CORINFO_HELP_DBG_IS_JUST_MY_CODE
;; size=5 bbWeight=0.50 PerfScore 0.50
G_M14098_IG04: ;; offset=003DH
nop
mov rdx, bword ptr [rbp+18H]
vmovups ymm0, ymmword ptr [rdx]
vmovups ymmword ptr [rbp-70H], ymm0
lea rdx, [rbp-70H]
lea rcx, [rbp-50H]
call [System.Runtime.Intrinsics.Vector256:ConvertToDouble(System.Runtime.Intrinsics.Vector256`1[ulong]):System.Runtime.Intrinsics.Vector256`1[double]]
vmovups ymm0, ymmword ptr [rbp-50H]
vmovups ymmword ptr [rbp-30H], ymm0
nop
;; size=56 bbWeight=1 PerfScore 16.50
G_M14098_IG05: ;; offset=0075H
mov rax, bword ptr [rbp+10H]
vmovups ymm0, ymmword ptr [rbp-30H]
vmovups ymmword ptr [rax], ymm0
mov rax, bword ptr [rbp+10H]
;; size=24 bbWeight=1 PerfScore 8.00
G_M14098_IG06: ;; offset=008DH
vzeroupper
add rsp, 144
pop rbp
ret
;; size=12 bbWeight=1 PerfScore 2.75
Assembly After Optimization
G_M14098_IG01: ;; offset=0000H
push rbp
sub rsp, 112
vzeroupper
lea rbp, [rsp+70H]
vxorps xmm4, xmm4, xmm4
vmovdqa32 xmmword ptr [rbp-30H], xmm4
vmovdqa32 xmmword ptr [rbp-20H], xmm4
mov bword ptr [rbp+10H], rcx
mov bword ptr [rbp+18H], rdx
;; size=41 bbWeight=1 PerfScore 9.08
G_M14098_IG02: ;; offset=0029H
cmp dword ptr [(reloc 0x7ffb61ae2898)], 0
je SHORT G_M14098_IG04
;; size=9 bbWeight=1 PerfScore 4.00
G_M14098_IG03: ;; offset=0032H
call CORINFO_HELP_DBG_IS_JUST_MY_CODE
;; size=5 bbWeight=0.50 PerfScore 0.50
G_M14098_IG04: ;; offset=0037H
nop
mov rax, bword ptr [rbp+18H]
vcvtuqq2pd ymm0, ymmword ptr [rax]
vmovups ymmword ptr [rbp-50H], ymm0
vmovups ymm0, ymmword ptr [rbp-50H]
vmovups ymmword ptr [rbp-30H], ymm0
nop
;; size=42 bbWeight=1 PerfScore 13.50
G_M14098_IG05: ;; offset=0061H
mov rax, bword ptr [rbp+10H]
vmovups ymm0, ymmword ptr [rbp-30H]
vmovups ymmword ptr [rax], ymm0
mov rax, bword ptr [rbp+10H]
;; size=24 bbWeight=1 PerfScore 8.00
G_M14098_IG06: ;; offset=0079H
vzeroupper
add rsp, 112
pop rbp
ret
;; size=9 bbWeight=1 PerfScore 2.75
3. Vector.ConvertToInt64
public static Vector256<long> ConvDoubleToLong256(Vector256<double> val)
{
return Vector256.ConvertToInt64(val);
}
Assembly Before Optimization
G_M10770_IG01: ;; offset=0000H
push rbp
sub rsp, 144
vzeroupper
lea rbp, [rsp+90H]
vxorps xmm4, xmm4, xmm4
vmovdqa32 xmmword ptr [rbp-30H], xmm4
vmovdqa32 xmmword ptr [rbp-20H], xmm4
mov bword ptr [rbp+10H], rcx
mov bword ptr [rbp+18H], rdx
;; size=47 bbWeight=1 PerfScore 9.08
G_M10770_IG02: ;; offset=002FH
cmp dword ptr [(reloc 0x7ff8bed1c5f8)], 0
je SHORT G_M10770_IG04
;; size=9 bbWeight=1 PerfScore 4.00
G_M10770_IG03: ;; offset=0038H
call CORINFO_HELP_DBG_IS_JUST_MY_CODE
;; size=5 bbWeight=0.50 PerfScore 0.50
G_M10770_IG04: ;; offset=003DH
nop
mov rdx, bword ptr [rbp+18H]
vmovups ymm0, ymmword ptr [rdx]
vmovups ymmword ptr [rbp-70H], ymm0
lea rdx, [rbp-70H]
lea rcx, [rbp-50H]
call [System.Runtime.Intrinsics.Vector256:ConvertToInt64(System.Runtime.Intrinsics.Vector256`1[double]):System.Runtime.Intrinsics.Vector256`1[long]]
vmovups ymm0, ymmword ptr [rbp-50H]
vmovups ymmword ptr [rbp-30H], ymm0
nop
;; size=56 bbWeight=1 PerfScore 16.50
G_M10770_IG05: ;; offset=0075H
mov rax, bword ptr [rbp+10H]
vmovups ymm0, ymmword ptr [rbp-30H]
vmovups ymmword ptr [rax], ymm0
mov rax, bword ptr [rbp+10H]
;; size=24 bbWeight=1 PerfScore 8.00
G_M10770_IG06: ;; offset=008DH
vzeroupper
add rsp, 144
pop rbp
ret
;; size=12 bbWeight=1 PerfScore 2.75
Assembly After Optimization
G_M10770_IG01: ;; offset=0000H
push rbp
sub rsp, 112
vzeroupper
lea rbp, [rsp+70H]
vxorps xmm4, xmm4, xmm4
vmovdqa32 xmmword ptr [rbp-30H], xmm4
vmovdqa32 xmmword ptr [rbp-20H], xmm4
mov bword ptr [rbp+10H], rcx
mov bword ptr [rbp+18H], rdx
;; size=41 bbWeight=1 PerfScore 9.08
G_M10770_IG02: ;; offset=0029H
cmp dword ptr [(reloc 0x7ff8babbc5f8)], 0
je SHORT G_M10770_IG04
;; size=9 bbWeight=1 PerfScore 4.00
G_M10770_IG03: ;; offset=0032H
call CORINFO_HELP_DBG_IS_JUST_MY_CODE
;; size=5 bbWeight=0.50 PerfScore 0.50
G_M10770_IG04: ;; offset=0037H
nop
mov rax, bword ptr [rbp+18H]
vcvttpd2qq ymm0, ymmword ptr [rax]
vmovups ymmword ptr [rbp-50H], ymm0
vmovups ymm0, ymmword ptr [rbp-50H]
vmovups ymmword ptr [rbp-30H], ymm0
nop
;; size=42 bbWeight=1 PerfScore 13.50
G_M10770_IG05: ;; offset=0061H
mov rax, bword ptr [rbp+10H]
vmovups ymm0, ymmword ptr [rbp-30H]
vmovups ymmword ptr [rax], ymm0
mov rax, bword ptr [rbp+10H]
;; size=24 bbWeight=1 PerfScore 8.00
G_M10770_IG06: ;; offset=0079H
vzeroupper
add rsp, 112
pop rbp
ret
;; size=9 bbWeight=1 PerfScore 2.75
|
…in asserts aka code review changes
…endsOn checks to make sure they are run only if we need AVX512. Because these checks are costly, they are moved to the innermost level of the nested if checks.
Draft Pull Request was automatically closed for 30 days of inactivity. Please let us know if you'd like to reopen it. |
NO NEED FOR REVIEW AT THIS TIME.
The following new APIs have been added as part of this PR:
1. Vector.ConvertToUInt64
Assembly Before Optimization
Assembly After Optimization
2. Vector.ConvertToDouble
Assembly Before Optimization
Assembly After Optimization
3. Vector.ConvertToInt64
Assembly Before Optimization
Assembly After Optimization