Skip to content

Jit: Sub-optimal code when function is inlined #32415

Closed
@Thealexbarney

Description

@Thealexbarney

While creating this example code I realized that it occurs under similar circumstances to #32414, but I don't know whether the two issues share the same cause.

Given the following code:
SharpLab link

/// <summary>
/// Read-only ref struct whose only field is a <see cref="ReadOnlySpan{T}"/> of bytes.
/// Used as the repro type: every member simply forwards to the wrapped span.
/// </summary>
public readonly ref struct Wrapper
{
    // The wrapped span; the single field of this struct.
    private readonly ReadOnlySpan<byte> _buffer;

    /// <summary>Gets the length of the wrapped span.</summary>
    public int Length => _buffer.Length;
    /// <summary>Gets the byte at index <paramref name="i"/> through the span's own indexer.</summary>
    public byte this[int i] => _buffer[i];
    /// <summary>
    /// Reads the byte located <paramref name="i"/> elements past the span's first element,
    /// using <c>MemoryMarshal.GetReference</c> and <c>Unsafe.Add</c> instead of the span indexer.
    /// </summary>
    /// <param name="i">Element offset from the start of the span.</param>
    /// <returns>The byte at that offset.</returns>
    public byte GetUnsafe(int i) => Unsafe.Add(ref MemoryMarshal.GetReference(_buffer), i);
}

/// <summary>
/// Sums the bytes at offsets 0 through 3 of <paramref name="p"/>, each read via
/// <c>GetUnsafe</c>. Marked <c>AggressiveInlining</c> so the JIT is asked to inline it into callers.
/// </summary>
/// <param name="p">Wrapper to read from, passed by value.</param>
/// <returns>The sum of the four bytes as an <see cref="int"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int SumFirst4(Wrapper p)
{
    // Four separate GetUnsafe calls on the same by-value struct parameter.
    return p.GetUnsafe(0) + p.GetUnsafe(1) + p.GetUnsafe(2) + p.GetUnsafe(3);
}

/// <summary>
/// Forwards <paramref name="p"/> to <c>SumFirst4</c> and returns its result unchanged.
/// </summary>
public static int SumFirst4Caller(Wrapper p) => SumFirst4(p);

SumFirst4 produces this. The register allocation includes some redundant register-to-register copies (e.g. `mov rdx, rax` immediately followed by a load through `rdx`), but nothing too bad.

G_M29037_IG01:
						;; bbWeight=1    PerfScore 0.00
G_M29037_IG02:
       mov      rax, bword ptr [rcx]
       mov      rdx, rax
       movzx    rdx, byte  ptr [rdx]
       mov      rcx, rax
       movzx    rcx, byte  ptr [rcx+1]
       add      edx, ecx
       mov      rcx, rax
       movzx    rcx, byte  ptr [rcx+2]
       add      edx, ecx
       movzx    rax, byte  ptr [rax+3]
       add      eax, edx
						;; bbWeight=1    PerfScore 11.50
G_M29037_IG03:
       ret      
						;; bbWeight=1    PerfScore 1.00

; Total bytes of code 34

SumFirst4Caller produces this, copying the span to the stack and re-dereferencing the pointer for every element accessed.

G_M45217_IG01:
       sub      rsp, 24
       vzeroupper 
       xor      rax, rax
       mov      qword ptr [rsp+08H], rax
						;; bbWeight=1    PerfScore 2.50
G_M45217_IG02:
       vmovdqu  xmm0, xmmword ptr [rcx]
       vmovdqu  xmmword ptr [rsp+08H], xmm0
						;; bbWeight=1    PerfScore 3.00
G_M45217_IG03:
       lea      rax, bword ptr [rsp+08H]
       mov      rax, bword ptr [rax]
       movzx    rax, byte  ptr [rax]
       lea      rdx, bword ptr [rsp+08H]
       mov      rdx, bword ptr [rdx]
       movzx    rdx, byte  ptr [rdx+1]
       add      eax, edx
       lea      rdx, bword ptr [rsp+08H]
       mov      rdx, bword ptr [rdx]
       movzx    rdx, byte  ptr [rdx+2]
       add      eax, edx
       lea      rdx, bword ptr [rsp+08H]
       mov      rdx, bword ptr [rdx]
       movzx    rdx, byte  ptr [rdx+3]
       add      eax, edx
						;; bbWeight=1    PerfScore 18.75
G_M45217_IG04:
       add      rsp, 24
       ret      
						;; bbWeight=1    PerfScore 1.25

; Total bytes of code 82

For reference, adding this extension and swapping Wrapper for ReadOnlySpan<byte> gives the following code for both SumFirst4 and SumFirst4Caller.

/// <summary>
/// Extension method on <see cref="ReadOnlySpan{T}"/> of bytes that reads the byte located
/// <paramref name="i"/> elements past the span's first element, using
/// <c>MemoryMarshal.GetReference</c> and <c>Unsafe.Add</c> instead of the span indexer.
/// </summary>
/// <param name="span">Span to read from.</param>
/// <param name="i">Element offset from the start of the span.</param>
/// <returns>The byte at that offset.</returns>
public static byte GetUnsafe(this ReadOnlySpan<byte> span, int i)
{
    return Unsafe.Add(ref MemoryMarshal.GetReference(span), i);
}

G_M57413_IG01:
						;; bbWeight=1    PerfScore 0.00
G_M57413_IG02:
       mov      rax, bword ptr [rcx]
       movzx    rdx, byte  ptr [rax]
       movzx    rcx, byte  ptr [rax+1]
       add      edx, ecx
       movzx    rcx, byte  ptr [rax+2]
       add      edx, ecx
       movzx    rax, byte  ptr [rax+3]
       add      eax, edx
						;; bbWeight=1    PerfScore 10.75
G_M57413_IG03:
       ret      
						;; bbWeight=1    PerfScore 1.00

; Total bytes of code 25

category:cq
theme:structs
skill-level:expert
cost:large
impact:large

Metadata

Metadata

Assignees

Labels

Priority:2 (Work that is important, but not critical for the release) · area-CodeGen-coreclr (CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI)

Type

No type

Projects

No projects

Milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions