Optimizes vector conversions with AVX512 #87878

khushal1996 · 2023-06-21T19:37:46Z

NO NEED FOR REVIEW AT THIS TIME.

Dependent on PR Optimize scalar conversions with AVX512 #84384

The following new APIs have been added as part of this PR:

1. Vector.ConvertToUInt64

public static Vector256<ulong> ConvDoubleToUlong256(Vector256<double> val)
{
    return Vector256.ConvertToUInt64(val);
}

Assembly Before Optimization

G_M9746_IG01:  ;; offset=0000H
       push     rbp
       sub      rsp, 144
       vzeroupper 
       lea      rbp, [rsp+90H]
       vxorps   xmm4, xmm4, xmm4
       vmovdqa32 xmmword ptr [rbp-30H], xmm4
       vmovdqa32 xmmword ptr [rbp-20H], xmm4
       mov      bword ptr [rbp+10H], rcx
       mov      bword ptr [rbp+18H], rdx
						;; size=47 bbWeight=1 PerfScore 9.08
G_M9746_IG02:  ;; offset=002FH
       cmp      dword ptr [(reloc 0x7ffb63022898)], 0
       je       SHORT G_M9746_IG04
						;; size=9 bbWeight=1 PerfScore 4.00
G_M9746_IG03:  ;; offset=0038H
       call     CORINFO_HELP_DBG_IS_JUST_MY_CODE
						;; size=5 bbWeight=0.50 PerfScore 0.50
G_M9746_IG04:  ;; offset=003DH
       nop      
       mov      rdx, bword ptr [rbp+18H]
       vmovups  ymm0, ymmword ptr [rdx]
       vmovups  ymmword ptr [rbp-70H], ymm0
       lea      rdx, [rbp-70H]
       lea      rcx, [rbp-50H]
       call     [System.Runtime.Intrinsics.Vector256:ConvertToUInt64(System.Runtime.Intrinsics.Vector256`1[double]):System.Runtime.Intrinsics.Vector256`1[ulong]]
       vmovups  ymm0, ymmword ptr [rbp-50H]
       vmovups  ymmword ptr [rbp-30H], ymm0
       nop      
						;; size=56 bbWeight=1 PerfScore 16.50
G_M9746_IG05:  ;; offset=0075H
       mov      rax, bword ptr [rbp+10H]
       vmovups  ymm0, ymmword ptr [rbp-30H]
       vmovups  ymmword ptr [rax], ymm0
       mov      rax, bword ptr [rbp+10H]
						;; size=24 bbWeight=1 PerfScore 8.00
G_M9746_IG06:  ;; offset=008DH
       vzeroupper 
       add      rsp, 144
       pop      rbp
       ret      
						;; size=12 bbWeight=1 PerfScore 2.75

Assembly After Optimization

G_M9746_IG01:  ;; offset=0000H
       push     rbp
       sub      rsp, 112
       vzeroupper 
       lea      rbp, [rsp+70H]
       vxorps   xmm4, xmm4, xmm4
       vmovdqa32 xmmword ptr [rbp-30H], xmm4
       vmovdqa32 xmmword ptr [rbp-20H], xmm4
       mov      bword ptr [rbp+10H], rcx
       mov      bword ptr [rbp+18H], rdx
						;; size=41 bbWeight=1 PerfScore 9.08
G_M9746_IG02:  ;; offset=0029H
       cmp      dword ptr [(reloc 0x7ffb61d22898)], 0
       je       SHORT G_M9746_IG04
						;; size=9 bbWeight=1 PerfScore 4.00
G_M9746_IG03:  ;; offset=0032H
       call     CORINFO_HELP_DBG_IS_JUST_MY_CODE
						;; size=5 bbWeight=0.50 PerfScore 0.50
G_M9746_IG04:  ;; offset=0037H
       nop      
       mov      rax, bword ptr [rbp+18H]
       vcvttpd2uqq ymm0, ymmword ptr [rax]
       vmovups  ymmword ptr [rbp-50H], ymm0
       vmovups  ymm0, ymmword ptr [rbp-50H]
       vmovups  ymmword ptr [rbp-30H], ymm0
       nop      
						;; size=42 bbWeight=1 PerfScore 13.50
G_M9746_IG05:  ;; offset=0061H
       mov      rax, bword ptr [rbp+10H]
       vmovups  ymm0, ymmword ptr [rbp-30H]
       vmovups  ymmword ptr [rax], ymm0
       mov      rax, bword ptr [rbp+10H]
						;; size=24 bbWeight=1 PerfScore 8.00
G_M9746_IG06:  ;; offset=0079H
       vzeroupper 
       add      rsp, 112
       pop      rbp
       ret      
						;; size=9 bbWeight=1 PerfScore 2.75

2. Vector.ConvertToDouble

public static Vector256<double> ConvUlongToDouble256(Vector256<ulong> val)
{
   return Vector256.ConvertToDouble(val);
}

Assembly Before Optimization

G_M14098_IG01:  ;; offset=0000H
       push     rbp
       sub      rsp, 144
       vzeroupper 
       lea      rbp, [rsp+90H]
       vxorps   xmm4, xmm4, xmm4
       vmovdqa32 xmmword ptr [rbp-30H], xmm4
       vmovdqa32 xmmword ptr [rbp-20H], xmm4
       mov      bword ptr [rbp+10H], rcx
       mov      bword ptr [rbp+18H], rdx
						;; size=47 bbWeight=1 PerfScore 9.08
G_M14098_IG02:  ;; offset=002FH
       cmp      dword ptr [(reloc 0x7ffb61ac2898)], 0
       je       SHORT G_M14098_IG04
						;; size=9 bbWeight=1 PerfScore 4.00
G_M14098_IG03:  ;; offset=0038H
       call     CORINFO_HELP_DBG_IS_JUST_MY_CODE
						;; size=5 bbWeight=0.50 PerfScore 0.50
G_M14098_IG04:  ;; offset=003DH
       nop      
       mov      rdx, bword ptr [rbp+18H]
       vmovups  ymm0, ymmword ptr [rdx]
       vmovups  ymmword ptr [rbp-70H], ymm0
       lea      rdx, [rbp-70H]
       lea      rcx, [rbp-50H]
       call     [System.Runtime.Intrinsics.Vector256:ConvertToDouble(System.Runtime.Intrinsics.Vector256`1[ulong]):System.Runtime.Intrinsics.Vector256`1[double]]
       vmovups  ymm0, ymmword ptr [rbp-50H]
       vmovups  ymmword ptr [rbp-30H], ymm0
       nop      
						;; size=56 bbWeight=1 PerfScore 16.50
G_M14098_IG05:  ;; offset=0075H
       mov      rax, bword ptr [rbp+10H]
       vmovups  ymm0, ymmword ptr [rbp-30H]
       vmovups  ymmword ptr [rax], ymm0
       mov      rax, bword ptr [rbp+10H]
						;; size=24 bbWeight=1 PerfScore 8.00
G_M14098_IG06:  ;; offset=008DH
       vzeroupper 
       add      rsp, 144
       pop      rbp
       ret      
						;; size=12 bbWeight=1 PerfScore 2.75

Assembly After Optimization

G_M14098_IG01:  ;; offset=0000H
       push     rbp
       sub      rsp, 112
       vzeroupper 
       lea      rbp, [rsp+70H]
       vxorps   xmm4, xmm4, xmm4
       vmovdqa32 xmmword ptr [rbp-30H], xmm4
       vmovdqa32 xmmword ptr [rbp-20H], xmm4
       mov      bword ptr [rbp+10H], rcx
       mov      bword ptr [rbp+18H], rdx
						;; size=41 bbWeight=1 PerfScore 9.08
G_M14098_IG02:  ;; offset=0029H
       cmp      dword ptr [(reloc 0x7ffb61ae2898)], 0
       je       SHORT G_M14098_IG04
						;; size=9 bbWeight=1 PerfScore 4.00
G_M14098_IG03:  ;; offset=0032H
       call     CORINFO_HELP_DBG_IS_JUST_MY_CODE
						;; size=5 bbWeight=0.50 PerfScore 0.50
G_M14098_IG04:  ;; offset=0037H
       nop      
       mov      rax, bword ptr [rbp+18H]
       vcvtuqq2pd ymm0, ymmword ptr [rax]
       vmovups  ymmword ptr [rbp-50H], ymm0
       vmovups  ymm0, ymmword ptr [rbp-50H]
       vmovups  ymmword ptr [rbp-30H], ymm0
       nop      
						;; size=42 bbWeight=1 PerfScore 13.50
G_M14098_IG05:  ;; offset=0061H
       mov      rax, bword ptr [rbp+10H]
       vmovups  ymm0, ymmword ptr [rbp-30H]
       vmovups  ymmword ptr [rax], ymm0
       mov      rax, bword ptr [rbp+10H]
						;; size=24 bbWeight=1 PerfScore 8.00
G_M14098_IG06:  ;; offset=0079H
       vzeroupper 
       add      rsp, 112
       pop      rbp
       ret      
						;; size=9 bbWeight=1 PerfScore 2.75

3. Vector.ConvertToInt64

public static Vector256<long> ConvDoubleToLong256(Vector256<double> val)
{
    return Vector256.ConvertToInt64(val);
}

Assembly Before Optimization

G_M10770_IG01:  ;; offset=0000H
       push     rbp
       sub      rsp, 144
       vzeroupper 
       lea      rbp, [rsp+90H]
       vxorps   xmm4, xmm4, xmm4
       vmovdqa32 xmmword ptr [rbp-30H], xmm4
       vmovdqa32 xmmword ptr [rbp-20H], xmm4
       mov      bword ptr [rbp+10H], rcx
       mov      bword ptr [rbp+18H], rdx
						;; size=47 bbWeight=1 PerfScore 9.08
G_M10770_IG02:  ;; offset=002FH
       cmp      dword ptr [(reloc 0x7ff8bed1c5f8)], 0
       je       SHORT G_M10770_IG04
						;; size=9 bbWeight=1 PerfScore 4.00
G_M10770_IG03:  ;; offset=0038H
       call     CORINFO_HELP_DBG_IS_JUST_MY_CODE
						;; size=5 bbWeight=0.50 PerfScore 0.50
G_M10770_IG04:  ;; offset=003DH
       nop      
       mov      rdx, bword ptr [rbp+18H]
       vmovups  ymm0, ymmword ptr [rdx]
       vmovups  ymmword ptr [rbp-70H], ymm0
       lea      rdx, [rbp-70H]
       lea      rcx, [rbp-50H]
       call     [System.Runtime.Intrinsics.Vector256:ConvertToInt64(System.Runtime.Intrinsics.Vector256`1[double]):System.Runtime.Intrinsics.Vector256`1[long]]
       vmovups  ymm0, ymmword ptr [rbp-50H]
       vmovups  ymmword ptr [rbp-30H], ymm0
       nop      
						;; size=56 bbWeight=1 PerfScore 16.50
G_M10770_IG05:  ;; offset=0075H
       mov      rax, bword ptr [rbp+10H]
       vmovups  ymm0, ymmword ptr [rbp-30H]
       vmovups  ymmword ptr [rax], ymm0
       mov      rax, bword ptr [rbp+10H]
						;; size=24 bbWeight=1 PerfScore 8.00
G_M10770_IG06:  ;; offset=008DH
       vzeroupper 
       add      rsp, 144
       pop      rbp
       ret      
						;; size=12 bbWeight=1 PerfScore 2.75

Assembly After Optimization

G_M10770_IG01:  ;; offset=0000H
       push     rbp
       sub      rsp, 112
       vzeroupper 
       lea      rbp, [rsp+70H]
       vxorps   xmm4, xmm4, xmm4
       vmovdqa32 xmmword ptr [rbp-30H], xmm4
       vmovdqa32 xmmword ptr [rbp-20H], xmm4
       mov      bword ptr [rbp+10H], rcx
       mov      bword ptr [rbp+18H], rdx
						;; size=41 bbWeight=1 PerfScore 9.08
G_M10770_IG02:  ;; offset=0029H
       cmp      dword ptr [(reloc 0x7ff8babbc5f8)], 0
       je       SHORT G_M10770_IG04
						;; size=9 bbWeight=1 PerfScore 4.00
G_M10770_IG03:  ;; offset=0032H
       call     CORINFO_HELP_DBG_IS_JUST_MY_CODE
						;; size=5 bbWeight=0.50 PerfScore 0.50
G_M10770_IG04:  ;; offset=0037H
       nop      
       mov      rax, bword ptr [rbp+18H]
       vcvttpd2qq ymm0, ymmword ptr [rax]
       vmovups  ymmword ptr [rbp-50H], ymm0
       vmovups  ymm0, ymmword ptr [rbp-50H]
       vmovups  ymmword ptr [rbp-30H], ymm0
       nop      
						;; size=42 bbWeight=1 PerfScore 13.50
G_M10770_IG05:  ;; offset=0061H
       mov      rax, bword ptr [rbp+10H]
       vmovups  ymm0, ymmword ptr [rbp-30H]
       vmovups  ymmword ptr [rax], ymm0
       mov      rax, bword ptr [rbp+10H]
						;; size=24 bbWeight=1 PerfScore 8.00
G_M10770_IG06:  ;; offset=0079H
       vzeroupper 
       add      rsp, 112
       pop      rbp
       ret      
						;; size=9 bbWeight=1 PerfScore 2.75

…vtsd2usi uses ulong.max_value to show FPE for negative, NAN and ulong_max + 1 values.

…architecture. This is because we have changed the JITDbl2Ulng helper function to mimic the new IEEE-compliant AVX512 instruction vcvtsd2usi. In the process, we needed to update the library test case because the default Floating Point Error (FPE) value for the new instruction is different from the default MSVC FPE value, i.e. 0.

…not changing the library test case but the API to make sure NaN cases are handled.

…id handling edge cases (-1,0) separately inside the helper.

…ulong/uint

trying to return EA_4BYTE for INS_vcvttss2usi to make sure that we read dword and not qword for float to ulong

… a special handling for vcvttss2usi64 to make sure we read only dword instead of qword for float to ulong conversion

…n for vcvttsd2usi, vcvttusi2sd32/64

…th other similar instructions

…r nowayasserts and also checking for float and double both in lowercast for overflow and conversion to ulong

…y are not available in release mode

…) into a single node i.e. GT_CAST(TYP_ULONG, TYP_FLOAT)

…ideration 32bit and 64 bit version of vcvttss2usi.

…endsOn only runs in debug mode.

…ertToUInt64 for double, ConvertToInt64 for double, ConvertToDouble for ulong/long for vector

…xarch for instructions used for truncation

…ort those conversions due to issues related to mismatch between non AVX512 and AVX512 machine

…X512 and also extending NI_VectorT512_ConvertToInt64 and NI_VectorT512_ConvertToUInt64 support for Float.

ghost · 2023-06-21T19:37:57Z

Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch
See info in area-owners.md if you want to be subscribed.

Issue Details

Dependent on PR Optimize scalar conversions with AVX512 #84384

The following new APIs have been added as part of this PR:

1. Vector.ConvertToUInt64

public static Vector256<ulong> ConvDoubleToUlong256(Vector256<double> val)
{
    return Vector256.ConvertToUInt64(val);
}

Assembly Before Optimization

G_M9746_IG01:  ;; offset=0000H
       push     rbp
       sub      rsp, 144
       vzeroupper 
       lea      rbp, [rsp+90H]
       vxorps   xmm4, xmm4, xmm4
       vmovdqa32 xmmword ptr [rbp-30H], xmm4
       vmovdqa32 xmmword ptr [rbp-20H], xmm4
       mov      bword ptr [rbp+10H], rcx
       mov      bword ptr [rbp+18H], rdx
						;; size=47 bbWeight=1 PerfScore 9.08
G_M9746_IG02:  ;; offset=002FH
       cmp      dword ptr [(reloc 0x7ffb63022898)], 0
       je       SHORT G_M9746_IG04
						;; size=9 bbWeight=1 PerfScore 4.00
G_M9746_IG03:  ;; offset=0038H
       call     CORINFO_HELP_DBG_IS_JUST_MY_CODE
						;; size=5 bbWeight=0.50 PerfScore 0.50
G_M9746_IG04:  ;; offset=003DH
       nop      
       mov      rdx, bword ptr [rbp+18H]
       vmovups  ymm0, ymmword ptr [rdx]
       vmovups  ymmword ptr [rbp-70H], ymm0
       lea      rdx, [rbp-70H]
       lea      rcx, [rbp-50H]
       call     [System.Runtime.Intrinsics.Vector256:ConvertToUInt64(System.Runtime.Intrinsics.Vector256`1[double]):System.Runtime.Intrinsics.Vector256`1[ulong]]
       vmovups  ymm0, ymmword ptr [rbp-50H]
       vmovups  ymmword ptr [rbp-30H], ymm0
       nop      
						;; size=56 bbWeight=1 PerfScore 16.50
G_M9746_IG05:  ;; offset=0075H
       mov      rax, bword ptr [rbp+10H]
       vmovups  ymm0, ymmword ptr [rbp-30H]
       vmovups  ymmword ptr [rax], ymm0
       mov      rax, bword ptr [rbp+10H]
						;; size=24 bbWeight=1 PerfScore 8.00
G_M9746_IG06:  ;; offset=008DH
       vzeroupper 
       add      rsp, 144
       pop      rbp
       ret      
						;; size=12 bbWeight=1 PerfScore 2.75

Assembly After Optimization

G_M9746_IG01:  ;; offset=0000H
       push     rbp
       sub      rsp, 112
       vzeroupper 
       lea      rbp, [rsp+70H]
       vxorps   xmm4, xmm4, xmm4
       vmovdqa32 xmmword ptr [rbp-30H], xmm4
       vmovdqa32 xmmword ptr [rbp-20H], xmm4
       mov      bword ptr [rbp+10H], rcx
       mov      bword ptr [rbp+18H], rdx
						;; size=41 bbWeight=1 PerfScore 9.08
G_M9746_IG02:  ;; offset=0029H
       cmp      dword ptr [(reloc 0x7ffb61d22898)], 0
       je       SHORT G_M9746_IG04
						;; size=9 bbWeight=1 PerfScore 4.00
G_M9746_IG03:  ;; offset=0032H
       call     CORINFO_HELP_DBG_IS_JUST_MY_CODE
						;; size=5 bbWeight=0.50 PerfScore 0.50
G_M9746_IG04:  ;; offset=0037H
       nop      
       mov      rax, bword ptr [rbp+18H]
       vcvttpd2uqq ymm0, ymmword ptr [rax]
       vmovups  ymmword ptr [rbp-50H], ymm0
       vmovups  ymm0, ymmword ptr [rbp-50H]
       vmovups  ymmword ptr [rbp-30H], ymm0
       nop      
						;; size=42 bbWeight=1 PerfScore 13.50
G_M9746_IG05:  ;; offset=0061H
       mov      rax, bword ptr [rbp+10H]
       vmovups  ymm0, ymmword ptr [rbp-30H]
       vmovups  ymmword ptr [rax], ymm0
       mov      rax, bword ptr [rbp+10H]
						;; size=24 bbWeight=1 PerfScore 8.00
G_M9746_IG06:  ;; offset=0079H
       vzeroupper 
       add      rsp, 112
       pop      rbp
       ret      
						;; size=9 bbWeight=1 PerfScore 2.75

2. Vector.ConvertToDouble

public static Vector256<double> ConvUlongToDouble256(Vector256<ulong> val)
{
   return Vector256.ConvertToDouble(val);
}

Assembly Before Optimization

G_M14098_IG01:  ;; offset=0000H
       push     rbp
       sub      rsp, 144
       vzeroupper 
       lea      rbp, [rsp+90H]
       vxorps   xmm4, xmm4, xmm4
       vmovdqa32 xmmword ptr [rbp-30H], xmm4
       vmovdqa32 xmmword ptr [rbp-20H], xmm4
       mov      bword ptr [rbp+10H], rcx
       mov      bword ptr [rbp+18H], rdx
						;; size=47 bbWeight=1 PerfScore 9.08
G_M14098_IG02:  ;; offset=002FH
       cmp      dword ptr [(reloc 0x7ffb61ac2898)], 0
       je       SHORT G_M14098_IG04
						;; size=9 bbWeight=1 PerfScore 4.00
G_M14098_IG03:  ;; offset=0038H
       call     CORINFO_HELP_DBG_IS_JUST_MY_CODE
						;; size=5 bbWeight=0.50 PerfScore 0.50
G_M14098_IG04:  ;; offset=003DH
       nop      
       mov      rdx, bword ptr [rbp+18H]
       vmovups  ymm0, ymmword ptr [rdx]
       vmovups  ymmword ptr [rbp-70H], ymm0
       lea      rdx, [rbp-70H]
       lea      rcx, [rbp-50H]
       call     [System.Runtime.Intrinsics.Vector256:ConvertToDouble(System.Runtime.Intrinsics.Vector256`1[ulong]):System.Runtime.Intrinsics.Vector256`1[double]]
       vmovups  ymm0, ymmword ptr [rbp-50H]
       vmovups  ymmword ptr [rbp-30H], ymm0
       nop      
						;; size=56 bbWeight=1 PerfScore 16.50
G_M14098_IG05:  ;; offset=0075H
       mov      rax, bword ptr [rbp+10H]
       vmovups  ymm0, ymmword ptr [rbp-30H]
       vmovups  ymmword ptr [rax], ymm0
       mov      rax, bword ptr [rbp+10H]
						;; size=24 bbWeight=1 PerfScore 8.00
G_M14098_IG06:  ;; offset=008DH
       vzeroupper 
       add      rsp, 144
       pop      rbp
       ret      
						;; size=12 bbWeight=1 PerfScore 2.75

Assembly After Optimization

G_M14098_IG01:  ;; offset=0000H
       push     rbp
       sub      rsp, 112
       vzeroupper 
       lea      rbp, [rsp+70H]
       vxorps   xmm4, xmm4, xmm4
       vmovdqa32 xmmword ptr [rbp-30H], xmm4
       vmovdqa32 xmmword ptr [rbp-20H], xmm4
       mov      bword ptr [rbp+10H], rcx
       mov      bword ptr [rbp+18H], rdx
						;; size=41 bbWeight=1 PerfScore 9.08
G_M14098_IG02:  ;; offset=0029H
       cmp      dword ptr [(reloc 0x7ffb61ae2898)], 0
       je       SHORT G_M14098_IG04
						;; size=9 bbWeight=1 PerfScore 4.00
G_M14098_IG03:  ;; offset=0032H
       call     CORINFO_HELP_DBG_IS_JUST_MY_CODE
						;; size=5 bbWeight=0.50 PerfScore 0.50
G_M14098_IG04:  ;; offset=0037H
       nop      
       mov      rax, bword ptr [rbp+18H]
       vcvtuqq2pd ymm0, ymmword ptr [rax]
       vmovups  ymmword ptr [rbp-50H], ymm0
       vmovups  ymm0, ymmword ptr [rbp-50H]
       vmovups  ymmword ptr [rbp-30H], ymm0
       nop      
						;; size=42 bbWeight=1 PerfScore 13.50
G_M14098_IG05:  ;; offset=0061H
       mov      rax, bword ptr [rbp+10H]
       vmovups  ymm0, ymmword ptr [rbp-30H]
       vmovups  ymmword ptr [rax], ymm0
       mov      rax, bword ptr [rbp+10H]
						;; size=24 bbWeight=1 PerfScore 8.00
G_M14098_IG06:  ;; offset=0079H
       vzeroupper 
       add      rsp, 112
       pop      rbp
       ret      
						;; size=9 bbWeight=1 PerfScore 2.75

3. Vector.ConvertToInt64

public static Vector256<long> ConvDoubleToLong256(Vector256<double> val)
{
    return Vector256.ConvertToInt64(val);
}

Assembly Before Optimization

G_M10770_IG01:  ;; offset=0000H
       push     rbp
       sub      rsp, 144
       vzeroupper 
       lea      rbp, [rsp+90H]
       vxorps   xmm4, xmm4, xmm4
       vmovdqa32 xmmword ptr [rbp-30H], xmm4
       vmovdqa32 xmmword ptr [rbp-20H], xmm4
       mov      bword ptr [rbp+10H], rcx
       mov      bword ptr [rbp+18H], rdx
						;; size=47 bbWeight=1 PerfScore 9.08
G_M10770_IG02:  ;; offset=002FH
       cmp      dword ptr [(reloc 0x7ff8bed1c5f8)], 0
       je       SHORT G_M10770_IG04
						;; size=9 bbWeight=1 PerfScore 4.00
G_M10770_IG03:  ;; offset=0038H
       call     CORINFO_HELP_DBG_IS_JUST_MY_CODE
						;; size=5 bbWeight=0.50 PerfScore 0.50
G_M10770_IG04:  ;; offset=003DH
       nop      
       mov      rdx, bword ptr [rbp+18H]
       vmovups  ymm0, ymmword ptr [rdx]
       vmovups  ymmword ptr [rbp-70H], ymm0
       lea      rdx, [rbp-70H]
       lea      rcx, [rbp-50H]
       call     [System.Runtime.Intrinsics.Vector256:ConvertToInt64(System.Runtime.Intrinsics.Vector256`1[double]):System.Runtime.Intrinsics.Vector256`1[long]]
       vmovups  ymm0, ymmword ptr [rbp-50H]
       vmovups  ymmword ptr [rbp-30H], ymm0
       nop      
						;; size=56 bbWeight=1 PerfScore 16.50
G_M10770_IG05:  ;; offset=0075H
       mov      rax, bword ptr [rbp+10H]
       vmovups  ymm0, ymmword ptr [rbp-30H]
       vmovups  ymmword ptr [rax], ymm0
       mov      rax, bword ptr [rbp+10H]
						;; size=24 bbWeight=1 PerfScore 8.00
G_M10770_IG06:  ;; offset=008DH
       vzeroupper 
       add      rsp, 144
       pop      rbp
       ret      
						;; size=12 bbWeight=1 PerfScore 2.75

Assembly After Optimization

G_M10770_IG01:  ;; offset=0000H
       push     rbp
       sub      rsp, 112
       vzeroupper 
       lea      rbp, [rsp+70H]
       vxorps   xmm4, xmm4, xmm4
       vmovdqa32 xmmword ptr [rbp-30H], xmm4
       vmovdqa32 xmmword ptr [rbp-20H], xmm4
       mov      bword ptr [rbp+10H], rcx
       mov      bword ptr [rbp+18H], rdx
						;; size=41 bbWeight=1 PerfScore 9.08
G_M10770_IG02:  ;; offset=0029H
       cmp      dword ptr [(reloc 0x7ff8babbc5f8)], 0
       je       SHORT G_M10770_IG04
						;; size=9 bbWeight=1 PerfScore 4.00
G_M10770_IG03:  ;; offset=0032H
       call     CORINFO_HELP_DBG_IS_JUST_MY_CODE
						;; size=5 bbWeight=0.50 PerfScore 0.50
G_M10770_IG04:  ;; offset=0037H
       nop      
       mov      rax, bword ptr [rbp+18H]
       vcvttpd2qq ymm0, ymmword ptr [rax]
       vmovups  ymmword ptr [rbp-50H], ymm0
       vmovups  ymm0, ymmword ptr [rbp-50H]
       vmovups  ymmword ptr [rbp-30H], ymm0
       nop      
						;; size=42 bbWeight=1 PerfScore 13.50
G_M10770_IG05:  ;; offset=0061H
       mov      rax, bword ptr [rbp+10H]
       vmovups  ymm0, ymmword ptr [rbp-30H]
       vmovups  ymmword ptr [rax], ymm0
       mov      rax, bword ptr [rbp+10H]
						;; size=24 bbWeight=1 PerfScore 8.00
G_M10770_IG06:  ;; offset=0079H
       vzeroupper 
       add      rsp, 112
       pop      rbp
       ret      
						;; size=9 bbWeight=1 PerfScore 2.75

Author:	khushal1996
Assignees:	-
Labels:	`area-CodeGen-coreclr`
Milestone:	-

…in asserts aka code review changes

…endsOn checks to make sure they are run only if we need AVX512. Because these checks are costly, they are moved to the innermost level of the nested if checks.

ghost · 2023-08-05T11:01:43Z

Draft Pull Request was automatically closed for 30 days of inactivity. Please let us know if you'd like to reopen it.

khushal1996 and others added 30 commits June 18, 2023 13:57

fixing the JITDbl2Ulng helper function. The new AVX512 instruction vc…

7d764be

…vtsd2usi uses ulong.max_value to show FPE for negative, NAN and ulong_max + 1 values.

Fixing the JITDbl2Ulng helper function. Also making sure that we are …

f018095

…not changing the library test case but the API to make sure NaN cases are handled.

reverting jitformat

ffe97cd

Adding a truncate function to the Dbl2Ulng helper to make sure we avo…

a8ee861

…id handling edge cases (-1,0) separately inside the helper.

Adding code to handle vectorized conversion for float/double to/from …

bbd8a8b

…ulong/uint

reverting changes for float to ulong

a21a077

enabling float to ulong conversion

1e3415a

Making change to set w1 bit for evex

c788c67

merging with main. Picking up hwintrinsiclistxarh from main

fbb2a90

trying to return EA_4BYTE for INS_vcvttss2usi to make sure that we read dword and not qword for float to ulong

jit format

9fece01

Splitting vcvttss2usi to vcvttss2usi32 and vcvttss2usi64. Also adding…

b40cd8e

… a special handling for vcvttss2usi64 to make sure we read only dword instead of qword for float to ulong conversion

undoing jitformat changes due to merge error

710026e

removing unused code and correcting throughput and latency informatio…

75e6acf

…n for vcvttsd2usi, vcvttusi2sd32/64

correcting throughput and latency for vcvttss2usi32 and placing it wi…

e15be4b

…th other similar instructions

formatting

10e2876

formatting

9463173

updating comments

4f7bb67

updating code for github comments. Using compIsaSupportedDebugOnly fo…

a99725c

…r nowayasserts and also checking for float and double both in lowercast for overflow and conversion to ulong

reverting to original checks for ISA supported Debug only because the…

44390b2

…y are not available in release mode

running jitformat

2f20ef3

running jitformat

b7dff8a

combine the 2 nodes GT_CAST(GT_CAST(TYP_ULONG, TYP_DOUBLE), TYP_FLOAT…

9622f78

…) into a single node i.e. GT_CAST(TYP_ULONG, TYP_FLOAT)

merging with main and updating hwintrinsiclistxarch to take into cons…

d3b542f

…ideration 32bit and 64 bit version of vcvttss2usi.

Changing noway_assert to assert to make sure compOpportunisticallyDep…

8343e18

…endsOn only runs in debug mode.

running jitformat

e456763

accelerates ConvertToSingle for uint, ConvertToUInt32 for float, Conv…

0e88650

…ertToUInt64 for double, ConvertToInt64 for double, ConvertToDouble for ulong/long for vector

Reverting changes for convertToUint32 and also reverting hwintrinlist…

d97a169

…xarch for instructions used for truncation

reverting changes for float/double to uint for scalar values

4e0b663

Removing unused code for UINT<->float/double conversions. Cannot supp…

cffa6ea

…ort those conversions due to issues related to mismatch between non AVX512 and AVX512 machine

Adding IsBaselineVector512IsaSupportedOpportunistically checks for AV…

2697d26

…X512 and also extending NI_VectorT512_ConvertToInt64 and NI_VectorT512_ConvertToUInt64 support for Float.

ghost added area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI community-contribution Indicates that the PR has been added by a community member labels Jun 21, 2023

khushal1996 added 3 commits June 21, 2023 12:48

Inserting proper break or return in switch-case for intrinsics

f43f99c

Inserting proper break or return in switch-case for intrinsics

d51b862

Running jitformat

3f99873

runfoapp bot mentioned this pull request Jun 21, 2023

Long Running Test: Interop/MonoAPI/MonoMono/PInvokeDetach/PInvokeDetach.sh #73040

Closed

This was referenced Jun 22, 2023

Could not load file or assembly 'Microsoft.CodeAnalysis.NetAnalyzers #84995

Closed

Unable to load Analyzer assembly .../Microsoft.CodeAnalysis.Analyzers.dll : Not a valid assembly #85082

Closed

khushal1996 added 4 commits June 22, 2023 14:05

moving asserts and taking them out of if checks

12ff62e

jitformat

a119b35

Changing compOpportunisticallyDependsOn to compIsaSupportedDebugOnly …

47afca2

…in asserts aka code review changes

Making code review changes. Moving around the compOpportunisticallyDep…

c5c2a44

…endsOn checks to make sure they are run only if we need AVX512. Because these checks are costly, they are moved to the innermost level of the nested if checks.

build-analysis bot mentioned this pull request Jun 23, 2023

LibraryImportGenerator.Unit.Tests crashed in CI #87951

Closed

tannergooding added the avx512 Related to the AVX-512 architecture label Jun 27, 2023

removing float to ulong conversion

42001ac

ghost closed this Aug 5, 2023

ghost locked as resolved and limited conversation to collaborators Sep 4, 2023

This pull request was closed.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Optimizes vector conversions with AVX512 #87878

Optimizes vector conversions with AVX512 #87878

Uh oh!

khushal1996 commented Jun 21, 2023 •

edited

Loading

Uh oh!

ghost commented Jun 21, 2023

1. Vector.ConvertToUInt64

2. Vector.ConvertToDouble

3. Vector.ConvertToInt64

Uh oh!

ghost commented Aug 5, 2023

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

2 participants

Optimizes vector conversions with AVX512 #87878

Optimizes vector conversions with AVX512 #87878

Uh oh!

Conversation

khushal1996 commented Jun 21, 2023 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

NO NEED FOR REVIEW AT THIS TIME.

1. Vector.ConvertToUInt64

2. Vector.ConvertToDouble

3. Vector.ConvertToInt64

Uh oh!

ghost commented Jun 21, 2023

1. Vector.ConvertToUInt64

2. Vector.ConvertToDouble

3. Vector.ConvertToInt64

Uh oh!

ghost commented Aug 5, 2023

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

2 participants

khushal1996 commented Jun 21, 2023 •

edited

Loading