Closed
Description
I noticed this codegen from the aspnet collection:
; Assembly listing for method Perfolizer.Horology.TimeUnit:GetBestTimeUnit(double[]):Perfolizer.Horology.TimeUnit (Tier1)
; Emitting BLENDED_CODE for X64 with AVX512 - Windows
; Tier1 code
; optimized code
; optimized using Dynamic PGO
; rsp based frame
; fully interruptible
; with Dynamic PGO: fgCalledCount is 80
; 0 inlinees with PGO data; 2 single block inlinees; 0 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T00] ( 4, 4 ) ref -> rcx class-hnd single-def <double[]>
; V01 loc0 [V01,T06] ( 2, 2.60) double -> mm0 single-def
; V02 loc1 [V02,T04] ( 2, 2.60) ref -> rdx class-hnd exact single-def <Perfolizer.Horology.TimeUnit[]>
;* V03 loc2 [V03,T05] ( 0, 0 ) int -> zero-ref
; V04 loc3 [V04,T03] ( 3, 4.20) ref -> rax class-hnd <Perfolizer.Horology.TimeUnit>
; V05 OutArgs [V05 ] ( 1, 1 ) struct (32) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
; V06 rat0 [V06,T01] ( 4, 5.80) long -> rcx "Strength reduced derived IV"
; V07 rat1 [V07,T02] ( 4, 5.80) int -> r8 "Trip count IV"
;
; Lcl frame size = 40
G_M16141_IG01: ; bbWeight=1, gcVars=0000000000000000 {}, gcrefRegs=0000 {}, byrefRegs=0000 {}, gcvars, byref, nogc <-- Prolog IG
sub rsp, 40
;; size=4 bbWeight=1 PerfScore 0.25
G_M16141_IG02: ; bbWeight=1, gcrefRegs=0002 {rcx}, byrefRegs=0000 {}, byref, isz
; gcrRegs +[rcx]
cmp dword ptr [rcx+0x08], 0
je SHORT G_M16141_IG08
call [<unknown method>]
; gcrRegs -[rcx]
; gcr arg pop 0
mov rax, 0xD1FFAB1E ; const ptr
mov rdx, gword ptr [rax]
; gcrRegs +[rdx]
mov ecx, 16
mov r8d, 7
;; size=36 bbWeight=1 PerfScore 9.75
G_M16141_IG03: ; bbWeight=1.60, gcrefRegs=0004 {rdx}, byrefRegs=0000 {}, byref, isz
mov rax, gword ptr [rdx+rcx]
; gcrRegs +[rax]
imul r10, qword ptr [rax+0x18], 0x3E8
vxorps xmm1, xmm1, xmm1
vcvtsi2sd xmm1, xmm1, r10
vucomisd xmm1, xmm0
jbe SHORT G_M16141_IG05
;; size=27 bbWeight=1.60 PerfScore 26.13
G_M16141_IG04: ; bbWeight=1, gcrefRegs=0001 {rax}, byrefRegs=0000 {}, byref, epilog, nogc
; gcrRegs -[rdx]
add rsp, 40
ret
;; size=5 bbWeight=1 PerfScore 1.25
G_M16141_IG05: ; bbWeight=1.60, gcVars=0000000000000000 {}, gcrefRegs=0004 {rdx}, byrefRegs=0000 {}, gcvars, byref, isz
; gcrRegs -[rax] +[rdx]
add rcx, 8
dec r8d
jne SHORT G_M16141_IG03
;; size=9 bbWeight=1.60 PerfScore 2.40
G_M16141_IG06: ; bbWeight=0, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
; gcrRegs -[rdx]
mov rdx, 0xD1FFAB1E ; const ptr
mov rdx, gword ptr [rdx]
; gcrRegs +[rdx]
mov rcx, 0xD1FFAB1E ; <unknown method>
;; size=23 bbWeight=0 PerfScore 0.00
G_M16141_IG07: ; bbWeight=0, epilog, nogc, extend
add rsp, 40
tail.jmp [System.Linq.Enumerable:Last[System.__Canon](System.Collections.Generic.IEnumerable`1[System.__Canon]):System.__Canon]
;; size=10 bbWeight=0 PerfScore 0.00
G_M16141_IG08: ; bbWeight=0, gcVars=0000000000000000 {}, gcrefRegs=0000 {}, byrefRegs=0000 {}, gcvars, byref
; gcrRegs -[rdx]
mov rax, 0xD1FFAB1E ; const ptr
mov rax, gword ptr [rax]
; gcrRegs +[rax]
;; size=13 bbWeight=0 PerfScore 0.00
G_M16141_IG09: ; bbWeight=0, epilog, nogc, extend
add rsp, 40
ret
;; size=5 bbWeight=0 PerfScore 0.00
; Total bytes of code 132, prolog size 4, PerfScore 39.78, instruction count 28, allocated bytes for code 132 (MethodHash=973dc0f2) for method Perfolizer.Horology.TimeUnit:GetBestTimeUnit(double[]):Perfolizer.Horology.TimeUnit (Tier1)
; ============================================================
Note the G_M16141_IG04 epilog in the middle of the loop. I would expect that epilog to be placed after the loop instead, so that G_M16141_IG05 is packed together with G_M16141_IG03. In this case the loop block G_M16141_IG05 even has a higher block weight (1.60) than the epilog block G_M16141_IG04 (1), so I am surprised to see the epilog placed there.
FWIW, this problem seems quite common without PGO; e.g.
/// <summary>
/// Linear search: reports whether <paramref name="c"/> occurs anywhere in <paramref name="s"/>.
/// </summary>
/// <param name="c">Character to look for.</param>
/// <param name="s">Array to scan. It is dereferenced via <c>s.Length</c> with no null check.</param>
/// <returns>
/// <c>true</c> as soon as an element equal to <paramref name="c"/> is found;
/// <c>false</c> if the loop runs to completion without a match (which includes an empty array).
/// </returns>
/// <remarks>
/// Marked <c>NoInlining</c>, presumably so the method is compiled on its own and its
/// codegen can be inspected in isolation.
/// </remarks>
[MethodImpl(MethodImplOptions.NoInlining)]
public bool Contains(char c, char[] s)
{
// Index-based scan over every element, front to back.
for (int i = 0; i < s.Length; i++)
{
// Early return from inside the loop body: this is the loop's only non-fallthrough exit.
if (s[i] == c)
return true;
}
// Reached only when no element matched.
return false;
}
results in
G_M15173_IG04: ;; offset=0x0010
movzx rax, word ptr [r8]
cmp eax, edx
jne SHORT G_M15173_IG07
;; size=8 bbWeight=4 PerfScore 13.00
G_M15173_IG05: ;; offset=0x0018
mov eax, 1
;; size=5 bbWeight=0.50 PerfScore 0.12
G_M15173_IG06: ;; offset=0x001D
ret
;; size=1 bbWeight=0.50 PerfScore 0.50
G_M15173_IG07: ;; offset=0x001E
add r8, 2
dec ecx
jne SHORT G_M15173_IG04
;; size=8 bbWeight=4 PerfScore 6.00
when compiled without PGO (e.g. when looking in Disasmo, or with DOTNET_TieredCompilation=0).
Not sure if this is already on the radar, cc @amanasifkhalid @dotnet/jit-contrib