Description
Related to #422 and #725: I'm wondering what I should do to get fast reads of big-endian numbers from a buffer.
My use case: I have a large data structure that uses network byte order (aka big-endian), while wasm is always little-endian.
I'm trying to write C code that clang can compile to efficient wasm, but the output gets really nasty whenever I need to load a
big-endian uint64_t from memory into a native integer.
Consider the following test program:
#include <stdint.h>
#define memcpy(dst, src, len) __builtin_memcpy(dst, src, len)
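/* Portable popcount loop; LLVM's loop-idiom recognition reduces it to a single popcnt (see the outputs below). */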
int popcnt_u32(uint32_t val) {
  int count = 0;
  while (val) {
    count++;
    val &= (val - (uint32_t)1);
  }
  return count;
}
int popcnt_u64(uint64_t val) {
  int count = 0;
  while (val) {
    count++;
    val &= (val - (uint64_t)1);
  }
  return count;
}
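/* Endianness probe; it constant-folds, since the target's byte order is known at compile time. */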
int is_big_endian(void) {
  union {
    uint32_t i;
    char c[4];
  } bint = {0x01020304};
  return bint.c[0] == 1;
}
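/* Shift-and-mask byte swaps; clang pattern-matches these to a single bswap on native targets. */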
uint32_t swap_u32(uint32_t val) {
  if (is_big_endian())
    return val;
  return ((val & 0xFF000000u) >> 24u) | ((val & 0x00FF0000u) >> 8u) |
         ((val & 0x0000FF00u) << 8u) | ((val & 0x000000FFu) << 24u);
}
uint64_t swap_u64(uint64_t val) {
  if (is_big_endian())
    return val;
  return ((val & 0xFF00000000000000u) >> 56u) | ((val & 0x00FF000000000000u) >> 40u) |
         ((val & 0x0000FF0000000000u) >> 24u) | ((val & 0x000000FF00000000u) >> 8u) |
         ((val & 0x00000000FF000000u) << 8u) | ((val & 0x0000000000FF0000u) << 24u) |
         ((val & 0x000000000000FF00u) << 40u) | ((val & 0x00000000000000FFu) << 56u);
}
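/* Read strategy 1: memcpy the raw bytes, then byte-swap. */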
uint32_t read_u32_swap(uint8_t* storage) {
  uint32_t val;
  memcpy(&val, storage, sizeof(uint32_t));
  return swap_u32(val);
}
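/* Read strategy 2: assemble the value from individual bytes with shifts (endian-independent). */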
uint32_t read_u32(uint8_t* storage) {
  uint8_t* data = storage;
  return (uint32_t)data[0] << 24U | (uint32_t)data[1] << 16U |
         (uint32_t)data[2] << 8U | (uint32_t)data[3] << 0U;
}
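/* 64-bit versions of the two read strategies. */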
uint64_t read_u64_swap(uint8_t* storage) {
  uint64_t val;
  memcpy(&val, storage, sizeof(uint64_t));
  return swap_u64(val);
}
uint64_t read_u64(uint8_t* storage) {
  uint8_t* data = storage;
  return (uint64_t)data[0] << 56U | (uint64_t)data[1] << 48U |
         (uint64_t)data[2] << 40U | (uint64_t)data[3] << 32U |
         (uint64_t)data[4] << 24U | (uint64_t)data[5] << 16U |
         (uint64_t)data[6] << 8U | (uint64_t)data[7] << 0U;
}
If I compile this to native assembly, clang/llvm detects the byte-swapping patterns and generates optimal instructions:
popcnt_u32: # @popcnt_u32
        popcnt eax, edi
        cmove eax, edi
        ret
popcnt_u64: # @popcnt_u64
        popcnt rax, rdi
        ret
is_big_endian: # @is_big_endian
        xor eax, eax
        ret
swap_u32: # @swap_u32
        mov eax, edi
        bswap eax
        ret
swap_u64: # @swap_u64
        mov rax, rdi
        bswap rax
        ret
read_u32_swap: # @read_u32_swap
        movbe eax, dword ptr [rdi]
        ret
read_u32: # @read_u32
        movbe eax, dword ptr [rdi]
        ret
read_u64_swap: # @read_u64_swap
        movbe rax, qword ptr [rdi]
        ret
read_u64: # @read_u64
        movbe rax, qword ptr [rdi]
        ret
Notice how the functions reduce down to single instructions like movbe, bswap, and popcnt!
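For what it's worth, the native output doesn't depend on writing the swap out by hand: clang also exposes __builtin_bswap32/__builtin_bswap64, which produce the same movbe/bswap code. A minimal equivalent sketch (read_u64_builtin is my own name, not part of the test program above):

#include <stdint.h>

uint64_t read_u64_builtin(uint8_t* storage) {
  uint64_t val;
  __builtin_memcpy(&val, storage, sizeof(uint64_t));
  return __builtin_bswap64(val); /* single bswap/movbe on native targets */
}

On wasm it doesn't help, though: with no byte-swap instruction in the ISA, LLVM legalizes the builtin into essentially the same shift-and-mask sequence shown below.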
But if I compile the same code to WebAssembly, the swaps aren't optimized at all:
popcnt_u32: # @popcnt_u32
        local.get 0
        i32.popcnt
        i32.const 0
        local.get 0
        i32.select
        end_function
popcnt_u64: # @popcnt_u64
        local.get 0
        i64.popcnt
        i32.wrap_i64
        end_function
is_big_endian: # @is_big_endian
        i32.const 0
        end_function
swap_u32: # @swap_u32
        local.get 0
        i32.const 24
        i32.shl
        local.get 0
        i32.const 8
        i32.shl
        i32.const 16711680
        i32.and
        i32.or
        local.get 0
        i32.const 8
        i32.shr_u
        i32.const 65280
        i32.and
        local.get 0
        i32.const 24
        i32.shr_u
        i32.or
        i32.or
        end_function
swap_u64: # @swap_u64
        local.get 0
        i64.const 56
        i64.shl
        local.get 0
        i64.const 40
        i64.shl
        i64.const 71776119061217280
        i64.and
        i64.or
        local.get 0
        i64.const 24
        i64.shl
        i64.const 280375465082880
        i64.and
        local.get 0
        i64.const 8
        i64.shl
        i64.const 1095216660480
        i64.and
        i64.or
        i64.or
        local.get 0
        i64.const 8
        i64.shr_u
        i64.const 4278190080
        i64.and
        local.get 0
        i64.const 24
        i64.shr_u
        i64.const 16711680
        i64.and
        i64.or
        local.get 0
        i64.const 40
        i64.shr_u
        i64.const 65280
        i64.and
        local.get 0
        i64.const 56
        i64.shr_u
        i64.or
        i64.or
        i64.or
        end_function
read_u32_swap: # @read_u32_swap
        local.get 0
        i32.load 0:p2align=0
        local.tee 0
        i32.const 24
        i32.shl
        local.get 0
        i32.const 8
        i32.shl
        i32.const 16711680
        i32.and
        i32.or
        local.get 0
        i32.const 8
        i32.shr_u
        i32.const 65280
        i32.and
        local.get 0
        i32.const 24
        i32.shr_u
        i32.or
        i32.or
        end_function
read_u32: # @read_u32
        local.get 0
        i32.load 0:p2align=0
        local.tee 0
        i32.const 24
        i32.shl
        local.get 0
        i32.const 8
        i32.shl
        i32.const 16711680
        i32.and
        i32.or
        local.get 0
        i32.const 8
        i32.shr_u
        i32.const 65280
        i32.and
        local.get 0
        i32.const 24
        i32.shr_u
        i32.or
        i32.or
        end_function
read_u64_swap: # @read_u64_swap
        local.get 0
        i64.load 0:p2align=0
        local.tee 1
        i64.const 56
        i64.shl
        local.get 1
        i64.const 40
        i64.shl
        i64.const 71776119061217280
        i64.and
        i64.or
        local.get 1
        i64.const 24
        i64.shl
        i64.const 280375465082880
        i64.and
        local.get 1
        i64.const 8
        i64.shl
        i64.const 1095216660480
        i64.and
        i64.or
        i64.or
        local.get 1
        i64.const 8
        i64.shr_u
        i64.const 4278190080
        i64.and
        local.get 1
        i64.const 24
        i64.shr_u
        i64.const 16711680
        i64.and
        i64.or
        local.get 1
        i64.const 40
        i64.shr_u
        i64.const 65280
        i64.and
        local.get 1
        i64.const 56
        i64.shr_u
        i64.or
        i64.or
        i64.or
        end_function
read_u64: # @read_u64
        local.get 0
        i64.load 0:p2align=0
        local.tee 1
        i64.const 56
        i64.shl
        local.get 1
        i64.const 40
        i64.shl
        i64.const 71776119061217280
        i64.and
        i64.or
        local.get 1
        i64.const 24
        i64.shl
        i64.const 280375465082880
        i64.and
        local.get 1
        i64.const 8
        i64.shl
        i64.const 1095216660480
        i64.and
        i64.or
        i64.or
        local.get 1
        i64.const 8
        i64.shr_u
        i64.const 4278190080
        i64.and
        local.get 1
        i64.const 24
        i64.shr_u
        i64.const 16711680
        i64.and
        i64.or
        local.get 1
        i64.const 40
        i64.shr_u
        i64.const 65280
        i64.and
        local.get 1
        i64.const 56
        i64.shr_u
        i64.or
        i64.or
        i64.or
        end_function
Is there anything I can do to keep this from bloating so badly? Are there proposals for adding bswap and/or loadbe to the instruction set? The linked issues above seem to have been closed with no progress for years.
I understand that not all real machines have these instructions, but many do. I'd say the vast majority of machines that run wasm support these fast instructions natively. And even when the underlying machine can't do it in a single native instruction, the engine compiling the wasm can emit whatever sequence is optimal for that platform. The guaranteed win, either way, is much smaller wasm code downloaded over the network!
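To make the size argument concrete: if there were, hypothetically, an i64.bswap instruction (invented mnemonic, not taken from any actual proposal), read_u64 could come out as something like:

read_u64: # @read_u64
        local.get 0
        i64.load 0:p2align=0
        i64.bswap # hypothetical
        end_function

That's a 4-line body instead of the ~45 instructions above.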
Please reconsider adding these to the spec, or if there is an existing primitive that gets me close, please point me in the right direction.
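On the "existing primitive" front, the closest I can find is the SIMD byte shuffle: with -msimd128, the wasm_simd128.h intrinsics can express the swap as an i8x16.shuffle. A sketch (read_u64_shuffle is my own name, and I haven't measured whether engines turn this into a native movbe/bswap):

#include <stdint.h>
#include <wasm_simd128.h>

uint64_t read_u64_shuffle(const uint8_t* storage) {
  v128_t v = wasm_v128_load64_zero(storage);            /* v128.load64_zero: 8 bytes into lane 0 */
  v = wasm_i8x16_shuffle(v, v, 7, 6, 5, 4, 3, 2, 1, 0,  /* reverse the low 8 bytes */
                         8, 9, 10, 11, 12, 13, 14, 15);
  return wasm_u64x2_extract_lane(v, 0);
}

It works, but it drags in v128 for a scalar operation and requires SIMD support, so a dedicated bswap or big-endian load would still be both smaller and more obviously optimizable.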