-
Notifications
You must be signed in to change notification settings - Fork 5.2k
Open
Labels
area-CodeGen-coreclr (CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI), tenet-performance (Performance related issue)
Milestone
Description
using System;
using System.Runtime.CompilerServices;
using SharpLab.Runtime;
// Repro type for a JIT inlining difference. T is constrained to IComparable<T>
// so that CompareTo can be called on it directly.
// NOTE(review): [JitGeneric(typeof(int))] presumably tells SharpLab to JIT the
// C<int> instantiation (the listings in this issue are for Int32) — confirm.
[JitGeneric(typeof(int))]
public sealed class C<T> where T : IComparable<T>
{
// Returns 1 when LessThan1(keys[0], t) is true, otherwise 0.
public static int Compare1(Span<T> keys, T t) => LessThan1(keys[0], t) ? 1 : 0;
// Returns 1 when LessThan2(keys[0], t) is true, otherwise 0.
public static int Compare2(Span<T> keys, T t) => LessThan2(keys[0], t) ? 1 : 0;
// Returns false when T is string; otherwise returns whether
// left.CompareTo(right) is negative. The typeof check is the only
// difference from LessThan2.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool LessThan1(T left, T right)
{
// Type check that can never be true for a value-type T such as int.
if (typeof(T) == typeof(string))
return false;
return left.CompareTo(right) < 0;
}
// Returns whether left.CompareTo(right) is negative, with no type check.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool LessThan2(T left, T right)
{
return left.CompareTo(right) < 0;
}
}
The codegen for LessThan1 and LessThan2 is identical, as expected: T is a value type (Int32), so the JIT should specialize the code such that the whole string type check is eliminated:
C`1[[System.Int32, System.Private.CoreLib]].LessThan1(Int32, Int32)
L0000: cmp ecx, edx
L0002: jge short L000b
L0004: mov eax, 0xffffffff
L0009: jmp short L0018
L000b: cmp ecx, edx
L000d: jle short L0016
L000f: mov eax, 1
L0014: jmp short L0018
L0016: xor eax, eax
L0018: test eax, eax
L001a: setl al
L001d: movzx eax, al
L0020: ret
C`1[[System.Int32, System.Private.CoreLib]].LessThan2(Int32, Int32)
L0000: cmp ecx, edx
L0002: jge short L000b
L0004: mov eax, 0xffffffff
L0009: jmp short L0018
L000b: cmp ecx, edx
L000d: jle short L0016
L000f: mov eax, 1
L0014: jmp short L0018
L0016: xor eax, eax
L0018: test eax, eax
L001a: setl al
L001d: movzx eax, al
L0020: ret
However, when these methods are inlined, the callers exhibit non-trivial differences:
C`1[[System.Int32, System.Private.CoreLib]].Compare1(System.Span`1<Int32>, Int32)
L0000: sub rsp, 0x28
L0004: cmp dword ptr [rcx+8], 0
L0008: jbe short L0044
L000a: mov rax, [rcx]
L000d: mov eax, [rax]
L000f: cmp eax, edx
L0011: jge short L001a
L0013: mov ecx, 0xffffffff
L0018: jmp short L0027
L001a: cmp eax, edx
L001c: jle short L0025
L001e: mov ecx, 1
L0023: jmp short L0027
L0025: xor ecx, ecx
L0027: test ecx, ecx
L0029: setl al
L002c: movzx eax, al
L002f: test eax, eax
L0031: jne short L003a
L0033: xor eax, eax
L0035: add rsp, 0x28
L0039: ret
L003a: mov eax, 1
L003f: add rsp, 0x28
L0043: ret
L0044: call 0x00007ff8aa8af9f0
L0049: int3
C`1[[System.Int32, System.Private.CoreLib]].Compare2(System.Span`1<Int32>, Int32)
L0000: sub rsp, 0x28
L0004: cmp dword ptr [rcx+8], 0
L0008: jbe short L0024
L000a: mov rax, [rcx]
L000d: mov eax, [rax]
L000f: cmp eax, edx
L0011: jl short L001a
L0013: xor eax, eax
L0015: add rsp, 0x28
L0019: ret
L001a: mov eax, 1
L001f: add rsp, 0x28
L0023: ret
L0024: call 0x00007ff8aa8af9f0
L0029: int3
It looks like when the type check is absent, the JIT figures out that the whole int.CompareTo(int) < 0 can be simplified into just:
L000f: cmp eax, edx
L0011: jl short L001a
but when the irrelevant type check is present, it fails to optimize this, and we end up with the complete implementation from Int32.CompareTo:
L000f: cmp eax, edx
L0011: jge short L001a
L0013: mov ecx, 0xffffffff
L0018: jmp short L0027
L001a: cmp eax, edx
L001c: jle short L0025
L001e: mov ecx, 1
L0023: jmp short L0027
L0025: xor ecx, ecx
L0027: test ecx, ecx
L0029: setl al
L002c: movzx eax, al
L002f: test eax, eax
L0031: jne short L003a
I see similar behavior from the master branch. Repro:
using System;
using System.Runtime.CompilerServices;
// Entry point for the repro: calls Compare1 and Compare2 once each.
public static class Program
{
// Builds a five-element span and passes it, with the probe value 6, to both
// Compare1 and Compare2; the return values are discarded.
// NOTE(review): this instantiates C<double>, but the JitDisasm listing that
// follows is labelled C`1[Int32] — confirm which instantiation was intended.
public static void Main()
{
Span<double> keys = new double[] { 1, 2, 3, 4, 5 };
C<double>.Compare1(keys, 6);
C<double>.Compare2(keys, 6);
}
}
// Repro type for a JIT inlining difference. T is constrained to IComparable<T>
// so that CompareTo can be called on it directly.
public sealed class C<T> where T : IComparable<T>
{
// Returns 1 when LessThan1(keys[0], t) is true, otherwise 0.
// Marked NoInlining — presumably so this method is compiled as its own body
// and shows up separately in the JIT output; confirm.
[MethodImpl(MethodImplOptions.NoInlining)]
public static int Compare1(Span<T> keys, T t) => LessThan1(keys[0], t) ? 1 : 0;
// Returns 1 when LessThan2(keys[0], t) is true, otherwise 0.
// Marked NoInlining, like Compare1.
[MethodImpl(MethodImplOptions.NoInlining)]
public static int Compare2(Span<T> keys, T t) => LessThan2(keys[0], t) ? 1 : 0;
// Returns false when T is string; otherwise returns whether
// left.CompareTo(right) is negative. The typeof check is the only
// difference from LessThan2.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool LessThan1(T left, T right)
{
// Type check that can never be true for a value-type T.
if (typeof(T) == typeof(string))
return false;
return left.CompareTo(right) < 0;
}
// Returns whether left.CompareTo(right) is negative, with no type check.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool LessThan2(T left, T right)
{
return left.CompareTo(right) < 0;
}
}
and output from JitDisasm:
; Assembly listing for method C`1[Int32][System.Int32]:Compare1(System.Span`1[Int32],int):int
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; optimized code
; rsp based frame
; partially interruptible
; Final local variable assignments
;
; V00 arg0 [V00,T00] ( 4, 8 ) byref -> rcx ld-addr-op
; V01 arg1 [V01,T01] ( 4, 3.25) int -> rdx
; V02 OutArgs [V02 ] ( 1, 1 ) lclBlk (32) [rsp+0x00] "OutgoingArgSpace"
; V03 tmp1 [V03,T04] ( 2, 2 ) bool -> rax "Inline return value spill temp"
; V04 tmp2 [V04,T02] ( 3, 4.50) int -> rax ld-addr-op "Inlining Arg"
; V05 tmp3 [V05,T03] ( 4, 2.50) int -> rcx "Inline return value spill temp"
;* V06 tmp4 [V06 ] ( 0, 0 ) byref -> zero-ref V08._pointer(offs=0x00) P-INDEP "field V00._pointer (fldOffset=0x0)"
;* V07 tmp5 [V07 ] ( 0, 0 ) int -> zero-ref V08._length(offs=0x08) P-INDEP "field V00._length (fldOffset=0x8)"
;* V08 tmp6 [V08 ] ( 0, 0 ) struct (16) zero-ref "Promoted implicit byref"
;
; Lcl frame size = 40
G_M10938_IG01:
4883EC28 sub rsp, 40
;; bbWeight=1 PerfScore 0.25
G_M10938_IG02:
83790800 cmp dword ptr [rcx+8], 0
763A jbe SHORT G_M10938_IG12
488B01 mov rax, bword ptr [rcx]
8B00 mov eax, dword ptr [rax]
3BC2 cmp eax, edx
7D07 jge SHORT G_M10938_IG04
;; bbWeight=1 PerfScore 8.25
G_M10938_IG03:
B9FFFFFFFF mov ecx, -1
EB0D jmp SHORT G_M10938_IG07
;; bbWeight=0.50 PerfScore 1.13
G_M10938_IG04:
3BC2 cmp eax, edx
7E07 jle SHORT G_M10938_IG06
;; bbWeight=0.25 PerfScore 0.31
G_M10938_IG05:
B901000000 mov ecx, 1
EB02 jmp SHORT G_M10938_IG07
;; bbWeight=0.50 PerfScore 1.13
G_M10938_IG06:
33C9 xor ecx, ecx
;; bbWeight=0.50 PerfScore 0.13
G_M10938_IG07:
85C9 test ecx, ecx
0F9CC0 setl al
0FB6C0 movzx rax, al
85C0 test eax, eax
7507 jne SHORT G_M10938_IG10
;; bbWeight=1 PerfScore 2.75
G_M10938_IG08:
33C0 xor eax, eax
;; bbWeight=0.50 PerfScore 0.13
G_M10938_IG09:
4883C428 add rsp, 40
C3 ret
;; bbWeight=0.50 PerfScore 0.63
G_M10938_IG10:
B801000000 mov eax, 1
;; bbWeight=0.50 PerfScore 0.13
G_M10938_IG11:
4883C428 add rsp, 40
C3 ret
;; bbWeight=0.50 PerfScore 0.63
G_M10938_IG12:
E8773D545F call CORINFO_HELP_RNGCHKFAIL
CC int3
;; bbWeight=0 PerfScore 0.00
; Total bytes of code 74, prolog size 4, PerfScore 22.84, (MethodHash=255cd545) for method C`1[Int32][System.Int32]:Compare1(System.Span`1[Int32],int):int
; ============================================================
; Assembly listing for method C`1[Int32][System.Int32]:Compare2(System.Span`1[Int32],int):int
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; optimized code
; rsp based frame
; partially interruptible
; Final local variable assignments
;
; V00 arg0 [V00,T00] ( 4, 8 ) byref -> rcx ld-addr-op
; V01 arg1 [V01,T01] ( 3, 3 ) int -> rdx
; V02 OutArgs [V02 ] ( 1, 1 ) lclBlk (32) [rsp+0x00] "OutgoingArgSpace"
; V03 tmp1 [V03,T02] ( 2, 4 ) int -> rax ld-addr-op "Inlining Arg"
;* V04 tmp2 [V04,T03] ( 0, 0 ) int -> zero-ref "Inline return value spill temp"
;* V05 tmp3 [V05 ] ( 0, 0 ) byref -> zero-ref V07._pointer(offs=0x00) P-INDEP "field V00._pointer (fldOffset=0x0)"
;* V06 tmp4 [V06 ] ( 0, 0 ) int -> zero-ref V07._length(offs=0x08) P-INDEP "field V00._length (fldOffset=0x8)"
;* V07 tmp5 [V07 ] ( 0, 0 ) struct (16) zero-ref "Promoted implicit byref"
;
; Lcl frame size = 40
G_M28153_IG01:
4883EC28 sub rsp, 40
;; bbWeight=1 PerfScore 0.25
G_M28153_IG02:
83790800 cmp dword ptr [rcx+8], 0
761A jbe SHORT G_M28153_IG07
488B01 mov rax, bword ptr [rcx]
8B00 mov eax, dword ptr [rax]
3BC2 cmp eax, edx
7C07 jl SHORT G_M28153_IG05
;; bbWeight=1 PerfScore 8.25
G_M28153_IG03:
33C0 xor eax, eax
;; bbWeight=0.50 PerfScore 0.13
G_M28153_IG04:
4883C428 add rsp, 40
C3 ret
;; bbWeight=0.50 PerfScore 0.63
G_M28153_IG05:
B801000000 mov eax, 1
;; bbWeight=0.50 PerfScore 0.13
G_M28153_IG06:
4883C428 add rsp, 40
C3 ret
;; bbWeight=0.50 PerfScore 0.63
G_M28153_IG07:
E8373D545F call CORINFO_HELP_RNGCHKFAIL
CC int3
;; bbWeight=0 PerfScore 0.00
; Total bytes of code 42, prolog size 4, PerfScore 14.20, (MethodHash=ea4a9206) for method C`1[Int32][System.Int32]:Compare2(System.Span`1[Int32],int):int
cc: @AndyAyersMS
category:cq
theme:inlining
skill-level:expert
cost:large
Metadata
Metadata
Assignees
Labels
area-CodeGen-coreclr (CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI), tenet-performance (Performance related issue)