Skip to content

Computed goto codegen is inconsistent on aarch64. #14444

Closed
@fubark

Description

@fubark

Zig Version

0.11.0-dev.1436+59d9afcb5

Steps to Reproduce and Observed Behavior

On Compiler Explorer, compiling with the args -O ReleaseFast -target aarch64-macos.12-none, this code:

// External functions whose bodies are not part of this snippet; each
// non-returning arm of the switch in `eval` calls exactly one of them.
extern fn foo() void;
extern fn foo2() void;
extern fn foo3() void;

/// Opcodes dispatched on by `eval`. No explicit tag values are given, so the
/// four tags take the values 0 through 3 in declaration order.
const Op = enum {
    add,
    sub,
    mul,
    ret,
};

/// Interprets the byte stream `insts`: repeatedly decodes the byte at `pc`
/// as an `Op`, runs the matching switch arm, and stops when `.ret` is decoded.
///
/// `insts` is a many-item pointer with no length, and nothing here bounds
/// `pc`; the loop only terminates when a `.ret` byte is reached.
export fn eval(insts: [*]const u8) void {
    // Index of the byte currently being decoded.
    var pc: usize = 0;
    while (true) {
        // NOTE(review): a byte outside 0..3 has no matching `Op` tag; that is
        // illegal behavior for @intToEnum and is presumably unchecked under
        // -O ReleaseFast — confirm against the language reference.
        switch (@intToEnum(Op, insts[pc])) {
            // Each of the three arms below calls one external function,
            // advances `pc`, and uses `continue` to go back to the dispatch.
            // NOTE(review): the differing increments (1, 2, 3) presumably keep
            // the arms distinct so they are not merged — confirm with author.
            .add => {
                foo();
                pc += 1;
                continue;
            },
            .sub => {
                foo2();
                pc += 2;
                continue;
            },
            .mul => {
                foo3();
                pc += 3;
                continue;
            },
            // The only exit from the loop.
            .ret => {
                return;
            },
        }
    }
}

Generates the following:

_eval:
        // Prologue: reserve 48 bytes of stack and save x19-x22, x29 and x30.
        stp     x22, x21, [sp, #-48]!
        stp     x20, x19, [sp, #16]
        stp     x29, x30, [sp, #32]
        // x19 = insts (incoming x0), x20 = pc = 0.
        mov     x19, x0
        mov     x20, xzr
        // x21 = address of the jump table LJTI0_0.
        adrp    x21, LJTI0_0@PAGE
        add     x21, x21, LJTI0_0@PAGEOFF
        // Enter the loop at the shared dispatch block.
        b       LBB0_2
LBB0_1:
        // .mul arm: call foo3, pc += 3, then fall through into the dispatch block.
        bl      _foo3
        mov     w8, #3
        add     x20, x20, x8
LBB0_2:
        // Shared dispatch block; this is the only indirect branch in the function.
        // Load insts[pc], keep the low 2 bits, read that index's byte from the
        // table, scale it by 4, add it to the address of LBB0_1, and branch there.
        ldrb    w8, [x19, x20]
        and     x8, x8, #0x3
        adr     x9, LBB0_1
        ldrb    w10, [x21, x8]
        add     x9, x9, x10, lsl #2
        br      x9
LBB0_3:
        // .add arm: call foo, pc += 1, then a direct branch back to the dispatch block.
        bl      _foo
        mov     w8, #1
        add     x20, x20, x8
        b       LBB0_2
LBB0_4:
        // .sub arm: call foo2, pc += 2, then a direct branch back to the dispatch block.
        bl      _foo2
        mov     w8, #2
        add     x20, x20, x8
        b       LBB0_2
LBB0_5:
        // .ret arm: restore the saved registers, release the stack, and return.
        ldp     x29, x30, [sp, #32]
        ldp     x20, x19, [sp, #16]
        ldp     x22, x21, [sp], #48
        ret
LJTI0_0:
        // Jump table indexed by opcode value 0..3. Each entry is one byte holding
        // the target's distance from LBB0_1 divided by 4.
        .byte   (LBB0_3-LBB0_1)>>2
        .byte   (LBB0_4-LBB0_1)>>2
        .byte   (LBB0_1-LBB0_1)>>2
        .byte   (LBB0_5-LBB0_1)>>2

Expected Behavior

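I expected the codegen to be similar to x86_64, which repeats the table lookup and indirect jump at the end of each switch case, so every case jumps straight to the next one. Instead, on aarch64 each case first branches back to a single shared dispatch block (LBB0_2), and only that block does the table lookup and the indirect jump. Here is the x86_64 output for comparison: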

eval:
        # Prologue: save r14 and rbx. NOTE(review): the push of rax presumably only
        # adjusts the stack by 8 bytes (it is undone by `add rsp, 8` below) — confirm.
        push    r14
        push    rbx
        push    rax
        # r14 = insts (incoming rdi), rbx = pc = 0.
        mov     r14, rdi
        xor     ebx, ebx
        # Initial dispatch: load insts[pc], keep the low 2 bits, and jump through
        # the table entry for that index.
        movzx   eax, byte ptr [rdi + rbx]
        and     eax, 3
        jmp     qword ptr [8*rax + .LJTI0_0]
.LBB0_1:
        # .add arm: call foo, pc += 1, then its own copy of the dispatch sequence.
        call    foo@PLT
        mov     eax, 1
        add     rbx, rax
        movzx   eax, byte ptr [r14 + rbx]
        and     eax, 3
        jmp     qword ptr [8*rax + .LJTI0_0]
.LBB0_2:
        # .sub arm: call foo2, pc += 2, then its own copy of the dispatch sequence.
        call    foo2@PLT
        mov     eax, 2
        add     rbx, rax
        movzx   eax, byte ptr [r14 + rbx]
        and     eax, 3
        jmp     qword ptr [8*rax + .LJTI0_0]
.LBB0_3:
        # .mul arm: call foo3, pc += 3, then its own copy of the dispatch sequence.
        call    foo3@PLT
        mov     eax, 3
        add     rbx, rax
        movzx   eax, byte ptr [r14 + rbx]
        and     eax, 3
        jmp     qword ptr [8*rax + .LJTI0_0]
.LBB0_4:
        # .ret arm: undo the prologue and return.
        add     rsp, 8
        pop     rbx
        pop     r14
        ret
.LJTI0_0:
        # Jump table indexed by opcode value 0..3; each entry is the 8-byte
        # address of the corresponding arm.
        .quad   .LBB0_1
        .quad   .LBB0_2
        .quad   .LBB0_3
        .quad   .LBB0_4

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions