-
Notifications
You must be signed in to change notification settings - Fork 1.5k
Description
Feature
Implement an optimization pass that eliminates the `__multi3` function from the WASM binary during JIT compilation by replacing it with ISA-specific instruction sequences (mainly for `x86_64` and `arm64`), and then inlines those sequences into the call sites, which would allow further optimizations.
Benefit
A lot of code dealing with cryptography would benefit from faster full-width `u64` multiplications, which is where such `__multi3` calls arise.
Implementation
If someone could give me a few hints about where to start, I'd try to implement it myself.
Alternatives
Not that I'm aware of. Patching in a call to some native library function is a huge overhead for modern CPUs (4 cycles on `x86_64` for e.g. `mulx` or `mul`), and while it would most likely be faster, it is still far from the optimal case on a hot path.
As an example, a simple multiply-add-carry function like `a*b + c + carry -> (high, low)` that accumulates into a `u128` without overflow compiles down to the listing below, and it can be a good test subject (transformed from `wasm` into `wat`, so it may not be very readable).
(module
;; Type 0: function signature with one i32 and four i64 parameters and no
;; results. Both $mac and $__multi3 below are declared with this type.
(type (;0;) (func (param i32 i64 i64 i64 i64)))
;; $mac: multiply-add-carry. Computes p1 * p2 + p3 + p4 (p1..p4 = the four i64
;; params, treated as unsigned) as a 128-bit value and writes it to linear
;; memory at the address in param 0: low 64 bits at offset 0, high 64 bits at
;; offset 8. Nothing is returned on the wasm value stack.
;; Local 5 holds the address of a 16-byte scratch slot carved out below the
;; $__stack_pointer global, used to receive the __multi3 result.
(func $mac (type 0) (param i32 i64 i64 i64 i64)
(local i32)
;; Reserve 16 bytes: local 5 = $__stack_pointer - 16, and publish the new
;; stack pointer value.
global.get $__stack_pointer
i32.const 16
i32.sub
local.tee 5
global.set $__stack_pointer
;; call $__multi3(scratch, p2, 0, p1, 0): each 64-bit operand is passed as a
;; (low, high) pair with a zero high half, so the 128-bit product is exact.
local.get 5
local.get 2
i64.const 0
local.get 1
i64.const 0
call $__multi3
;; Low word. local 2 = product low word (loaded from scratch+0),
;; local 3 = local 2 + p3, local 4 = local 3 + p4; local 4 is stored at
;; param0+0. Params 2..4 are reused as temporaries from here on.
local.get 0
local.get 5
i64.load
local.tee 2
local.get 3
i64.add
local.tee 3
local.get 4
i64.add
local.tee 4
i64.store
;; High word. Start from the product high word (scratch+8) and add one for
;; each of the two low-word additions that wrapped: a wrapped unsigned sum is
;; smaller than the value it was added to (local 3 < local 2, local 4 < local 3).
local.get 0
local.get 5
i32.const 8
i32.add
i64.load
local.get 3
local.get 2
i64.lt_u
i64.extend_i32_u
i64.add
local.get 4
local.get 3
i64.lt_u
i64.extend_i32_u
i64.add
i64.store offset=8
;; Release the scratch slot: restore $__stack_pointer to its value on entry.
local.get 5
i32.const 16
i32.add
global.set $__stack_pointer
)
;; $__multi3: 128-bit by 128-bit multiplication, keeping the low 128 bits of
;; the product. Operand x is (param 1 = low 64 bits, param 2 = high 64 bits),
;; operand y is (param 3 = low 64 bits, param 4 = high 64 bits). The result is
;; written to linear memory at the address in param 0: low word at offset 0,
;; high word at offset 8.
;; The low words are split into 32-bit halves so that every partial product
;; fits in an i64. Notation used below: xl/xh = low/high 32 bits of param 1,
;; yl/yh = low/high 32 bits of param 3.
(func $__multi3 (type 0) (param i32 i64 i64 i64 i64)
(local i64 i64 i64 i64 i64 i64)
;; Address operand for the low-word store further down.
local.get 0
;; local 5 = yl
local.get 3
i64.const 4294967295
i64.and
local.tee 5
;; local 6 = xl
local.get 1
i64.const 4294967295
i64.and
local.tee 6
;; local 7 = yl * xl
i64.mul
local.tee 7
;; local 8 = xh, local 9 = yl * xh
local.get 5
local.get 1
i64.const 32
i64.shr_u
local.tee 8
i64.mul
local.tee 9
;; local 10 = yh; local 5 = yl*xh + yh*xl (the middle sum, which may wrap
;; modulo 2^64; the wrap is detected below).
local.get 3
i64.const 32
i64.shr_u
local.tee 10
local.get 6
i64.mul
i64.add
local.tee 5
;; local 6 = local 7 + (middle sum << 32): the low 64 bits of the product.
i64.const 32
i64.shl
i64.add
local.tee 6
i64.store
;; High word. Address operand, then yh * xh as the starting value.
local.get 0
local.get 10
local.get 8
i64.mul
;; If the middle sum wrapped (local 5 < local 9), the lost 2^64 carries a
;; weight of 2^32 in the high word; OR it with the upper 32 bits of the
;; middle sum (the two occupy disjoint bit ranges) and add.
local.get 5
local.get 9
i64.lt_u
i64.extend_i32_u
i64.const 32
i64.shl
local.get 5
i64.const 32
i64.shr_u
i64.or
i64.add
;; Add the carry out of the low-word addition (local 6 < local 7 means it
;; wrapped).
local.get 6
local.get 7
i64.lt_u
i64.extend_i32_u
i64.add
;; Add the cross terms y_hi * x_lo + y_lo * x_hi (param4*param1 +
;; param3*param2); only their low 64 bits contribute to a 128-bit result.
local.get 4
local.get 1
i64.mul
local.get 3
local.get 2
i64.mul
i64.add
i64.add
i64.store offset=8
)
;; Function table with exactly one funcref slot (min 1, max 1).
(table (;0;) 1 1 funcref)
;; Linear memory with an initial size of 16 pages (16 * 64 KiB = 1048576 bytes).
(memory (;0;) 16)
;; Mutable stack pointer, initialised to 1048576; $mac moves it downward to
;; reserve scratch space and restores it before returning.
(global $__stack_pointer (mut i32) i32.const 1048576)
;; Two immutable i32 globals, both 1048576; exported below as "__data_end"
;; and "__heap_base" respectively.
(global (;1;) i32 i32.const 1048576)
(global (;2;) i32 i32.const 1048576)
;; Exports: the linear memory, the $mac function, and the two layout globals.
;; $__multi3 is not exported.
(export "memory" (memory 0))
(export "mac" (func $mac))
(export "__data_end" (global 1))
(export "__heap_base" (global 2))
)