Skip to content

Irrelevant type checks in inlined generics for value types influencing optimizations #37904

@stephentoub

Description

@stephentoub

Consider this sharplab.io repro:
https://sharplab.io/#v2:EYLgxg9gTgpgtADwGwBYA0AXEBDAzgWwB8ABAJgEYBYAKGIAYACY8gOgCUBXAOwwEt8YLAMIR8AB14AbGFADKMgG68wMXAG4a9BrIAW2KGIAy2YO258BG6jQDaAKV4YA4jC4zlACgwBPMTAgAZh68PACUoQC6mgDMDLgw2NIAJkykDEIAPAAqAHwMAO46MjAMWQwgDACSIuL6JtLZOTQA3jQM7UyxzEgMIRjpomL6MOQeskNcjQwA1jDeuGilDBihDAC8eYaquFl6XKOz8zZ0EYsrDAD8DOTlDHRWbR3EXeQ9fQO1sKRjE1OHC0tzhsGFtcDs9t9/sdTstVlcbhV7jRHu0bABZGAYHQQJKVcSSDwYrE4vFiSQAeTEfAgXFwLAAggBzRmwMG8BQwSpcSQhEKMyIozpMV4MYAQCCSEHbXbYfYeMrSAIYRZlKC8Rk6FaC1rUDp63oBBheXz+IJZVZrNbLE2BDzMOjhQX6p4AdgYAUS8QeuudTDdiowwkGwyyEA8ao15wydyseoAvsifaiidjcfjCZjU6SKVTeDS6UyWdt2ZzubyuPyokmhd1ReLJaDwbLvgqYEqVQwI5rQtqna6GAGg58YKHw+ruwxo0jqwnqHGgA==

using System;
using System.Runtime.CompilerServices;
using SharpLab.Runtime;

[JitGeneric(typeof(int))]
public sealed class C<T> where T : IComparable<T>
{
    public static int Compare1(Span<T> keys, T t) => LessThan1(keys[0], t) ? 1 : 0;

    public static int Compare2(Span<T> keys, T t) => LessThan2(keys[0], t) ? 1 : 0;

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static bool LessThan1(T left, T right)
    {
        if (typeof(T) == typeof(string))
            return false;

        return left.CompareTo(right) < 0;
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static bool LessThan2(T left, T right)
    {
        return left.CompareTo(right) < 0;
    }
}

The codegen for Compare1 and Compare2 are identical, as is expected, since T is a value type (Int32) and we should be specializing such that the whole string type check is eliminated:

C`1[[System.Int32, System.Private.CoreLib]].LessThan1(Int32, Int32)
    L0000: cmp ecx, edx
    L0002: jge short L000b
    L0004: mov eax, 0xffffffff
    L0009: jmp short L0018
    L000b: cmp ecx, edx
    L000d: jle short L0016
    L000f: mov eax, 1
    L0014: jmp short L0018
    L0016: xor eax, eax
    L0018: test eax, eax
    L001a: setl al
    L001d: movzx eax, al
    L0020: ret

C`1[[System.Int32, System.Private.CoreLib]].LessThan2(Int32, Int32)
    L0000: cmp ecx, edx
    L0002: jge short L000b
    L0004: mov eax, 0xffffffff
    L0009: jmp short L0018
    L000b: cmp ecx, edx
    L000d: jle short L0016
    L000f: mov eax, 1
    L0014: jmp short L0018
    L0016: xor eax, eax
    L0018: test eax, eax
    L001a: setl al
    L001d: movzx eax, al
    L0020: ret

However, when these methods are inlined, the callers exhibit non-trivial differences:

C`1[[System.Int32, System.Private.CoreLib]].Compare1(System.Span`1<Int32>, Int32)
    L0000: sub rsp, 0x28
    L0004: cmp dword ptr [rcx+8], 0
    L0008: jbe short L0044
    L000a: mov rax, [rcx]
    L000d: mov eax, [rax]
    L000f: cmp eax, edx
    L0011: jge short L001a
    L0013: mov ecx, 0xffffffff
    L0018: jmp short L0027
    L001a: cmp eax, edx
    L001c: jle short L0025
    L001e: mov ecx, 1
    L0023: jmp short L0027
    L0025: xor ecx, ecx
    L0027: test ecx, ecx
    L0029: setl al
    L002c: movzx eax, al
    L002f: test eax, eax
    L0031: jne short L003a
    L0033: xor eax, eax
    L0035: add rsp, 0x28
    L0039: ret
    L003a: mov eax, 1
    L003f: add rsp, 0x28
    L0043: ret
    L0044: call 0x00007ff8aa8af9f0
    L0049: int3

C`1[[System.Int32, System.Private.CoreLib]].Compare2(System.Span`1<Int32>, Int32)
    L0000: sub rsp, 0x28
    L0004: cmp dword ptr [rcx+8], 0
    L0008: jbe short L0024
    L000a: mov rax, [rcx]
    L000d: mov eax, [rax]
    L000f: cmp eax, edx
    L0011: jl short L001a
    L0013: xor eax, eax
    L0015: add rsp, 0x28
    L0019: ret
    L001a: mov eax, 1
    L001f: add rsp, 0x28
    L0023: ret
    L0024: call 0x00007ff8aa8af9f0
    L0029: int3

It looks like when the type check is absent, the JIT figures out that the whole int.CompareTo(int) < 0 can be simplified into just:

    L000f: cmp eax, edx
    L0011: jl short L001a

but when the irrelevant type check is present, it fails to optimize this, and we end up with the complete implementation from Int32.CompareTo:

    L000f: cmp eax, edx
    L0011: jge short L001a
    L0013: mov ecx, 0xffffffff
    L0018: jmp short L0027
    L001a: cmp eax, edx
    L001c: jle short L0025
    L001e: mov ecx, 1
    L0023: jmp short L0027
    L0025: xor ecx, ecx
    L0027: test ecx, ecx
    L0029: setl al
    L002c: movzx eax, al
    L002f: test eax, eax
    L0031: jne short L003a

I see similar behavior from the master branch. Repro:

using System;
using System.Runtime.CompilerServices;

public static class Program
{
    public static void Main()
    {
        Span<double> keys = new double[] { 1, 2, 3, 4, 5 };
        C<double>.Compare1(keys, 6);
        C<double>.Compare2(keys, 6);
    }
}

public sealed class C<T> where T : IComparable<T>
{
    [MethodImpl(MethodImplOptions.NoInlining)]
    public static int Compare1(Span<T> keys, T t) => LessThan1(keys[0], t) ? 1 : 0;

    [MethodImpl(MethodImplOptions.NoInlining)]
    public static int Compare2(Span<T> keys, T t) => LessThan2(keys[0], t) ? 1 : 0;

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static bool LessThan1(T left, T right)
    {
        if (typeof(T) == typeof(string))
            return false;

        return left.CompareTo(right) < 0;
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static bool LessThan2(T left, T right)
    {
        return left.CompareTo(right) < 0;
    }
}

and output from JitDisasm:

; Assembly listing for method C`1[Int32][System.Int32]:Compare1(System.Span`1[Int32],int):int
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; optimized code
; rsp based frame
; partially interruptible
; Final local variable assignments
;
;  V00 arg0         [V00,T00] (  4,  8   )   byref  ->  rcx         ld-addr-op
;  V01 arg1         [V01,T01] (  4,  3.25)     int  ->  rdx
;  V02 OutArgs      [V02    ] (  1,  1   )  lclBlk (32) [rsp+0x00]   "OutgoingArgSpace"
;  V03 tmp1         [V03,T04] (  2,  2   )    bool  ->  rax         "Inline return value spill temp"
;  V04 tmp2         [V04,T02] (  3,  4.50)     int  ->  rax         ld-addr-op "Inlining Arg"
;  V05 tmp3         [V05,T03] (  4,  2.50)     int  ->  rcx         "Inline return value spill temp"
;* V06 tmp4         [V06    ] (  0,  0   )   byref  ->  zero-ref    V08._pointer(offs=0x00) P-INDEP "field V00._pointer (fldOffset=0x0)"
;* V07 tmp5         [V07    ] (  0,  0   )     int  ->  zero-ref    V08._length(offs=0x08) P-INDEP "field V00._length (fldOffset=0x8)"
;* V08 tmp6         [V08    ] (  0,  0   )  struct (16) zero-ref    "Promoted implicit byref"
;
; Lcl frame size = 40

G_M10938_IG01:
       4883EC28             sub      rsp, 40
                                                ;; bbWeight=1    PerfScore 0.25
G_M10938_IG02:
       83790800             cmp      dword ptr [rcx+8], 0
       763A                 jbe      SHORT G_M10938_IG12
       488B01               mov      rax, bword ptr [rcx]
       8B00                 mov      eax, dword ptr [rax]
       3BC2                 cmp      eax, edx
       7D07                 jge      SHORT G_M10938_IG04
                                                ;; bbWeight=1    PerfScore 8.25
G_M10938_IG03:
       B9FFFFFFFF           mov      ecx, -1
       EB0D                 jmp      SHORT G_M10938_IG07
                                                ;; bbWeight=0.50 PerfScore 1.13
G_M10938_IG04:
       3BC2                 cmp      eax, edx
       7E07                 jle      SHORT G_M10938_IG06
                                                ;; bbWeight=0.25 PerfScore 0.31
G_M10938_IG05:
       B901000000           mov      ecx, 1
       EB02                 jmp      SHORT G_M10938_IG07
                                                ;; bbWeight=0.50 PerfScore 1.13
G_M10938_IG06:
       33C9                 xor      ecx, ecx
                                                ;; bbWeight=0.50 PerfScore 0.13
G_M10938_IG07:
       85C9                 test     ecx, ecx
       0F9CC0               setl     al
       0FB6C0               movzx    rax, al
       85C0                 test     eax, eax
       7507                 jne      SHORT G_M10938_IG10
                                                ;; bbWeight=1    PerfScore 2.75
G_M10938_IG08:
       33C0                 xor      eax, eax
                                                ;; bbWeight=0.50 PerfScore 0.13
G_M10938_IG09:
       4883C428             add      rsp, 40
       C3                   ret
                                                ;; bbWeight=0.50 PerfScore 0.63
G_M10938_IG10:
       B801000000           mov      eax, 1
                                                ;; bbWeight=0.50 PerfScore 0.13
G_M10938_IG11:
       4883C428             add      rsp, 40
       C3                   ret
                                                ;; bbWeight=0.50 PerfScore 0.63
G_M10938_IG12:
       E8773D545F           call     CORINFO_HELP_RNGCHKFAIL
       CC                   int3
                                                ;; bbWeight=0    PerfScore 0.00

; Total bytes of code 74, prolog size 4, PerfScore 22.84, (MethodHash=255cd545) for method C`1[Int32][System.Int32]:Compare1(System.Span`1[Int32],int):int
; ============================================================

; Assembly listing for method C`1[Int32][System.Int32]:Compare2(System.Span`1[Int32],int):int
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; optimized code
; rsp based frame
; partially interruptible
; Final local variable assignments
;
;  V00 arg0         [V00,T00] (  4,  8   )   byref  ->  rcx         ld-addr-op
;  V01 arg1         [V01,T01] (  3,  3   )     int  ->  rdx
;  V02 OutArgs      [V02    ] (  1,  1   )  lclBlk (32) [rsp+0x00]   "OutgoingArgSpace"
;  V03 tmp1         [V03,T02] (  2,  4   )     int  ->  rax         ld-addr-op "Inlining Arg"
;* V04 tmp2         [V04,T03] (  0,  0   )     int  ->  zero-ref    "Inline return value spill temp"
;* V05 tmp3         [V05    ] (  0,  0   )   byref  ->  zero-ref    V07._pointer(offs=0x00) P-INDEP "field V00._pointer (fldOffset=0x0)"
;* V06 tmp4         [V06    ] (  0,  0   )     int  ->  zero-ref    V07._length(offs=0x08) P-INDEP "field V00._length (fldOffset=0x8)"
;* V07 tmp5         [V07    ] (  0,  0   )  struct (16) zero-ref    "Promoted implicit byref"
;
; Lcl frame size = 40

G_M28153_IG01:
       4883EC28             sub      rsp, 40
                                                ;; bbWeight=1    PerfScore 0.25
G_M28153_IG02:
       83790800             cmp      dword ptr [rcx+8], 0
       761A                 jbe      SHORT G_M28153_IG07
       488B01               mov      rax, bword ptr [rcx]
       8B00                 mov      eax, dword ptr [rax]
       3BC2                 cmp      eax, edx
       7C07                 jl       SHORT G_M28153_IG05
                                                ;; bbWeight=1    PerfScore 8.25
G_M28153_IG03:
       33C0                 xor      eax, eax
                                                ;; bbWeight=0.50 PerfScore 0.13
G_M28153_IG04:
       4883C428             add      rsp, 40
       C3                   ret
                                                ;; bbWeight=0.50 PerfScore 0.63
G_M28153_IG05:
       B801000000           mov      eax, 1
                                                ;; bbWeight=0.50 PerfScore 0.13
G_M28153_IG06:
       4883C428             add      rsp, 40
       C3                   ret
                                                ;; bbWeight=0.50 PerfScore 0.63
G_M28153_IG07:
       E8373D545F           call     CORINFO_HELP_RNGCHKFAIL
       CC                   int3
                                                ;; bbWeight=0    PerfScore 0.00

; Total bytes of code 42, prolog size 4, PerfScore 14.20, (MethodHash=ea4a9206) for method C`1[Int32][System.Int32]:Compare2(System.Span`1[Int32],int):int

cc: @AndyAyersMS

category:cq
theme:inlining
skill-level:expert
cost:large

Metadata

Metadata

Assignees

Labels

area-CodeGen-coreclrCLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMItenet-performancePerformance related issue

Type

No type

Projects

No projects

Relationships

None yet

Development

No branches or pull requests

Issue actions