Skip to content

__multi3 optimization #4077

@shamatar

Description

@shamatar

Feature

Implement an optimization pass that would eliminate the __multi3 function from WASM binary during JIT by replacing it with ISA specific (mainly for x86_64 and arm64) sequences, and then inline such sequences into callsites that would allow further optimizations

Benefit

A lot of code dealing with cryptography would benefit form faster full width u64 multiplications where such __multi3 arises

Implementation

If someone would give a few hints about where to start I'd try to implement it by myself

Alternatives

Not that I'm aware of. Patching into calling come native library function is a huge overhead for modern CPUs (4 cycles for x86_64 for e.g. mulx or mul), and while it would be faster most likely, it's still far from optimal case on a hot path

As an example a simple multiply-add-carry function like a*b + c + carry -> (high, low) that accumulates into u128 without overflows compiles down to the listing below, and it can be a good test subject (transformed from wasm into wat, may be not the best readable)

(module
  (type (;0;) (func (param i32 i64 i64 i64 i64)))
  (func $mac (type 0) (param i32 i64 i64 i64 i64)
    (local i32)
    global.get $__stack_pointer
    i32.const 16
    i32.sub
    local.tee 5
    global.set $__stack_pointer
    local.get 5
    local.get 2
    i64.const 0
    local.get 1
    i64.const 0
    call $__multi3
    local.get 0
    local.get 5
    i64.load
    local.tee 2
    local.get 3
    i64.add
    local.tee 3
    local.get 4
    i64.add
    local.tee 4
    i64.store
    local.get 0
    local.get 5
    i32.const 8
    i32.add
    i64.load
    local.get 3
    local.get 2
    i64.lt_u
    i64.extend_i32_u
    i64.add
    local.get 4
    local.get 3
    i64.lt_u
    i64.extend_i32_u
    i64.add
    i64.store offset=8
    local.get 5
    i32.const 16
    i32.add
    global.set $__stack_pointer
  )
  (func $__multi3 (type 0) (param i32 i64 i64 i64 i64)
    (local i64 i64 i64 i64 i64 i64)
    local.get 0
    local.get 3
    i64.const 4294967295
    i64.and
    local.tee 5
    local.get 1
    i64.const 4294967295
    i64.and
    local.tee 6
    i64.mul
    local.tee 7
    local.get 5
    local.get 1
    i64.const 32
    i64.shr_u
    local.tee 8
    i64.mul
    local.tee 9
    local.get 3
    i64.const 32
    i64.shr_u
    local.tee 10
    local.get 6
    i64.mul
    i64.add
    local.tee 5
    i64.const 32
    i64.shl
    i64.add
    local.tee 6
    i64.store
    local.get 0
    local.get 10
    local.get 8
    i64.mul
    local.get 5
    local.get 9
    i64.lt_u
    i64.extend_i32_u
    i64.const 32
    i64.shl
    local.get 5
    i64.const 32
    i64.shr_u
    i64.or
    i64.add
    local.get 6
    local.get 7
    i64.lt_u
    i64.extend_i32_u
    i64.add
    local.get 4
    local.get 1
    i64.mul
    local.get 3
    local.get 2
    i64.mul
    i64.add
    i64.add
    i64.store offset=8
  )
  (table (;0;) 1 1 funcref)
  (memory (;0;) 16)
  (global $__stack_pointer (mut i32) i32.const 1048576)
  (global (;1;) i32 i32.const 1048576)
  (global (;2;) i32 i32.const 1048576)
  (export "memory" (memory 0))
  (export "mac" (func $mac))
  (export "__data_end" (global 1))
  (export "__heap_base" (global 2))
)

Metadata

Metadata

Assignees

No one assigned

    Labels

    craneliftIssues related to the Cranelift code generatorenhancement

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions