Skip to content

JIT: Suboptimal block layout with loops exiting to returns #105083

Closed
@jakobbotsch

Description

@jakobbotsch

I noticed this codegen from the aspnet collection:

; Assembly listing for method Perfolizer.Horology.TimeUnit:GetBestTimeUnit(double[]):Perfolizer.Horology.TimeUnit (Tier1)
; Emitting BLENDED_CODE for X64 with AVX512 - Windows
; Tier1 code
; optimized code
; optimized using Dynamic PGO
; rsp based frame
; fully interruptible
; with Dynamic PGO: fgCalledCount is 80
; 0 inlinees with PGO data; 2 single block inlinees; 0 inlinees without PGO data
; Final local variable assignments
;
;  V00 arg0         [V00,T00] (  4,  4   )     ref  ->  rcx         class-hnd single-def <double[]>
;  V01 loc0         [V01,T06] (  2,  2.60)  double  ->  mm0         single-def
;  V02 loc1         [V02,T04] (  2,  2.60)     ref  ->  rdx         class-hnd exact single-def <Perfolizer.Horology.TimeUnit[]>
;* V03 loc2         [V03,T05] (  0,  0   )     int  ->  zero-ref   
;  V04 loc3         [V04,T03] (  3,  4.20)     ref  ->  rax         class-hnd <Perfolizer.Horology.TimeUnit>
;  V05 OutArgs      [V05    ] (  1,  1   )  struct (32) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;  V06 rat0         [V06,T01] (  4,  5.80)    long  ->  rcx         "Strength reduced derived IV"
;  V07 rat1         [V07,T02] (  4,  5.80)     int  ->   r8         "Trip count IV"
;
; Lcl frame size = 40

G_M16141_IG01:        ; bbWeight=1, gcVars=0000000000000000 {}, gcrefRegs=0000 {}, byrefRegs=0000 {}, gcvars, byref, nogc <-- Prolog IG
       sub      rsp, 40
						;; size=4 bbWeight=1 PerfScore 0.25
G_M16141_IG02:        ; bbWeight=1, gcrefRegs=0002 {rcx}, byrefRegs=0000 {}, byref, isz
       ; gcrRegs +[rcx]
       cmp      dword ptr [rcx+0x08], 0
       je       SHORT G_M16141_IG08
       call     [<unknown method>]
       ; gcrRegs -[rcx]
       ; gcr arg pop 0
       mov      rax, 0xD1FFAB1E      ; const ptr
       mov      rdx, gword ptr [rax]
       ; gcrRegs +[rdx]
       mov      ecx, 16
       mov      r8d, 7
						;; size=36 bbWeight=1 PerfScore 9.75
G_M16141_IG03:        ; bbWeight=1.60, gcrefRegs=0004 {rdx}, byrefRegs=0000 {}, byref, isz
       mov      rax, gword ptr [rdx+rcx]
       ; gcrRegs +[rax]
       imul     r10, qword ptr [rax+0x18], 0x3E8
       vxorps   xmm1, xmm1, xmm1
       vcvtsi2sd xmm1, xmm1, r10
       vucomisd xmm1, xmm0
       jbe      SHORT G_M16141_IG05
						;; size=27 bbWeight=1.60 PerfScore 26.13
G_M16141_IG04:        ; bbWeight=1, gcrefRegs=0001 {rax}, byrefRegs=0000 {}, byref, epilog, nogc
       ; gcrRegs -[rdx]
       add      rsp, 40
       ret      
						;; size=5 bbWeight=1 PerfScore 1.25
G_M16141_IG05:        ; bbWeight=1.60, gcVars=0000000000000000 {}, gcrefRegs=0004 {rdx}, byrefRegs=0000 {}, gcvars, byref, isz
       ; gcrRegs -[rax] +[rdx]
       add      rcx, 8
       dec      r8d
       jne      SHORT G_M16141_IG03
						;; size=9 bbWeight=1.60 PerfScore 2.40
G_M16141_IG06:        ; bbWeight=0, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
       ; gcrRegs -[rdx]
       mov      rdx, 0xD1FFAB1E      ; const ptr
       mov      rdx, gword ptr [rdx]
       ; gcrRegs +[rdx]
       mov      rcx, 0xD1FFAB1E      ; <unknown method>
						;; size=23 bbWeight=0 PerfScore 0.00
G_M16141_IG07:        ; bbWeight=0, epilog, nogc, extend
       add      rsp, 40
       tail.jmp [System.Linq.Enumerable:Last[System.__Canon](System.Collections.Generic.IEnumerable`1[System.__Canon]):System.__Canon]
						;; size=10 bbWeight=0 PerfScore 0.00
G_M16141_IG08:        ; bbWeight=0, gcVars=0000000000000000 {}, gcrefRegs=0000 {}, byrefRegs=0000 {}, gcvars, byref
       ; gcrRegs -[rdx]
       mov      rax, 0xD1FFAB1E      ; const ptr
       mov      rax, gword ptr [rax]
       ; gcrRegs +[rax]
						;; size=13 bbWeight=0 PerfScore 0.00
G_M16141_IG09:        ; bbWeight=0, epilog, nogc, extend
       add      rsp, 40
       ret      
						;; size=5 bbWeight=0 PerfScore 0.00

; Total bytes of code 132, prolog size 4, PerfScore 39.78, instruction count 28, allocated bytes for code 132 (MethodHash=973dc0f2) for method Perfolizer.Horology.TimeUnit:GetBestTimeUnit(double[]):Perfolizer.Horology.TimeUnit (Tier1)
; ============================================================

Note the G_M16141_IG04 epilog in the middle of the loop. I would expect that epilog to be placed after the loop instead, so that G_M16141_IG05 is packed together with G_M16141_IG03. In this case G_M16141_IG05 even has a higher block weight (1.60) than the G_M16141_IG04 epilog (1), so I am surprised to see the epilog placed between the two loop blocks.

FWIW, this problem seems quite common without PGO; e.g.

// Minimal repro: a counted loop whose body can exit straight to a return.
// Linearly scans `s` and reports whether any element equals `c`.
// NoInlining is requested so the method is compiled (and its listing shown) on its own.
[MethodImpl(MethodImplOptions.NoInlining)]
public bool Contains(char c, char[] s)
{
    for (int i = 0; i < s.Length; i++)
    {
        // Early exit from inside the loop: this "return true" is the
        // loop-exit-to-return block whose placement the issue is about.
        if (s[i] == c)
            return true;
    }

    // Reached only when the loop ran to completion without a match.
    return false;
}

results in

G_M15173_IG04:  ;; offset=0x0010
       movzx    rax, word  ptr [r8]
       cmp      eax, edx
       jne      SHORT G_M15173_IG07
						;; size=8 bbWeight=4 PerfScore 13.00
G_M15173_IG05:  ;; offset=0x0018
       mov      eax, 1
						;; size=5 bbWeight=0.50 PerfScore 0.12
G_M15173_IG06:  ;; offset=0x001D
       ret      
						;; size=1 bbWeight=0.50 PerfScore 0.50
G_M15173_IG07:  ;; offset=0x001E
       add      r8, 2
       dec      ecx
       jne      SHORT G_M15173_IG04
						;; size=8 bbWeight=4 PerfScore 6.00

without PGO (e.g. when looking in disasmo, or with DOTNET_TieredCompilation=0).

Not sure if this is already on the radar, cc @amanasifkhalid @dotnet/jit-contrib

Metadata

Metadata

Labels

Priority:2 — Work that is important, but not critical for the release
area-CodeGen-coreclr — CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI

Type

No type

Projects

No projects

Milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions