Description
Related to #422 and #725: I'm wondering what I should do to get fast reads of big-endian numbers from a buffer.
My use case: I have a large data structure that uses network byte order (aka big-endian), while wasm is always little-endian.
I'm trying to write C code that clang can compile to efficient wasm, but the output gets really nasty whenever I need to load a
big-endian uint64_t from memory into a native integer.
Consider the following test program:
#include <stdint.h>
#define memcpy(dst, src, len) __builtin_memcpy(dst, src, len)
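/* Portable popcount loop; LLVM's loop-idiom recognition reduces it to a single popcnt (see the outputs below). */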
int popcnt_u32(uint32_t val) {
  int count = 0;
  while (val) {
    count++;
    val &= (val - (uint32_t)1);
  }
  return count;
}
int popcnt_u64(uint64_t val) {
  int count = 0;
  while (val) {
    count++;
    val &= (val - (uint64_t)1);
  }
  return count;
}
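/* Endianness probe; it constant-folds, since the target's byte order is known at compile time. */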
int is_big_endian(void) {
  union {
    uint32_t i;
    char c[4];
  } bint = {0x01020304};
  return bint.c[0] == 1;
}
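/* Shift-and-mask byte swaps; clang pattern-matches these to a single bswap on native targets. */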
uint32_t swap_u32(uint32_t val) {
  if (is_big_endian())
    return val;
  return ((val & 0xFF000000u) >> 24u) | ((val & 0x00FF0000u) >> 8u) |
         ((val & 0x0000FF00u) << 8u) | ((val & 0x000000FFu) << 24u);
}
uint64_t swap_u64(uint64_t val) {
  if (is_big_endian())
    return val;
  return ((val & 0xFF00000000000000u) >> 56u) | ((val & 0x00FF000000000000u) >> 40u) |
         ((val & 0x0000FF0000000000u) >> 24u) | ((val & 0x000000FF00000000u) >> 8u) |
         ((val & 0x00000000FF000000u) << 8u) | ((val & 0x0000000000FF0000u) << 24u) |
         ((val & 0x000000000000FF00u) << 40u) | ((val & 0x00000000000000FFu) << 56u);
}
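/* Read strategy 1: memcpy the raw bytes, then byte-swap. */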
uint32_t read_u32_swap(uint8_t* storage) {
  uint32_t val;
  memcpy(&val, storage, sizeof(uint32_t));
  return swap_u32(val);
}
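/* Read strategy 2: assemble the value from individual bytes with shifts (endian-independent). */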
uint32_t read_u32(uint8_t* storage) {
  uint8_t* data = storage;
  return (uint32_t)data[0] << 24U | (uint32_t)data[1] << 16U |
         (uint32_t)data[2] << 8U | (uint32_t)data[3] << 0U;
}
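/* 64-bit versions of the two read strategies. */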
uint64_t read_u64_swap(uint8_t* storage) {
  uint64_t val;
  memcpy(&val, storage, sizeof(uint64_t));
  return swap_u64(val);
}
uint64_t read_u64(uint8_t* storage) {
  uint8_t* data = storage;
  return (uint64_t)data[0] << 56U | (uint64_t)data[1] << 48U |
         (uint64_t)data[2] << 40U | (uint64_t)data[3] << 32U |
         (uint64_t)data[4] << 24U | (uint64_t)data[5] << 16U |
         (uint64_t)data[6] << 8U | (uint64_t)data[7] << 0U;
}
If I compile this to native assembly, clang/llvm detects the byte-swapping patterns and generates optimal instructions:
popcnt_u32: # @popcnt_u32
        popcnt eax, edi
        cmove eax, edi
        ret
popcnt_u64: # @popcnt_u64
        popcnt rax, rdi
        ret
is_big_endian: # @is_big_endian
        xor eax, eax
        ret
swap_u32: # @swap_u32
        mov eax, edi
        bswap eax
        ret
swap_u64: # @swap_u64
        mov rax, rdi
        bswap rax
        ret
read_u32_swap: # @read_u32_swap
        movbe eax, dword ptr [rdi]
        ret
read_u32: # @read_u32
        movbe eax, dword ptr [rdi]
        ret
read_u64_swap: # @read_u64_swap
        movbe rax, qword ptr [rdi]
        ret
read_u64: # @read_u64
        movbe rax, qword ptr [rdi]
        ret
Notice how the functions reduce down to single instructions like movbe, bswap, and popcnt!
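For what it's worth, the native output doesn't depend on writing the swap out by hand: clang also exposes __builtin_bswap32/__builtin_bswap64, which produce the same movbe/bswap code. A minimal equivalent sketch (read_u64_builtin is my own name, not part of the test program above):

#include <stdint.h>

uint64_t read_u64_builtin(uint8_t* storage) {
  uint64_t val;
  __builtin_memcpy(&val, storage, sizeof(uint64_t));
  return __builtin_bswap64(val); /* single bswap/movbe on native targets */
}

On wasm it doesn't help, though: with no byte-swap instruction in the ISA, LLVM legalizes the builtin into essentially the same shift-and-mask sequence shown below.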
But if I compile the same code to WebAssembly, the swaps aren't optimized at all:
popcnt_u32: # @popcnt_u32
        local.get 0
        i32.popcnt
        i32.const 0
        local.get 0
        i32.select
        end_function
popcnt_u64: # @popcnt_u64
        local.get 0
        i64.popcnt
        i32.wrap_i64
        end_function
is_big_endian: # @is_big_endian
        i32.const 0
        end_function
swap_u32: # @swap_u32
        local.get 0
        i32.const 24
        i32.shl
        local.get 0
        i32.const 8
        i32.shl
        i32.const 16711680
        i32.and
        i32.or
        local.get 0
        i32.const 8
        i32.shr_u
        i32.const 65280
        i32.and
        local.get 0
        i32.const 24
        i32.shr_u
        i32.or
        i32.or
        end_function
swap_u64: # @swap_u64
        local.get 0
        i64.const 56
        i64.shl
        local.get 0
        i64.const 40
        i64.shl
        i64.const 71776119061217280
        i64.and
        i64.or
        local.get 0
        i64.const 24
        i64.shl
        i64.const 280375465082880
        i64.and
        local.get 0
        i64.const 8
        i64.shl
        i64.const 1095216660480
        i64.and
        i64.or
        i64.or
        local.get 0
        i64.const 8
        i64.shr_u
        i64.const 4278190080
        i64.and
        local.get 0
        i64.const 24
        i64.shr_u
        i64.const 16711680
        i64.and
        i64.or
        local.get 0
        i64.const 40
        i64.shr_u
        i64.const 65280
        i64.and
        local.get 0
        i64.const 56
        i64.shr_u
        i64.or
        i64.or
        i64.or
        end_function
read_u32_swap: # @read_u32_swap
        local.get 0
        i32.load 0:p2align=0
        local.tee 0
        i32.const 24
        i32.shl
        local.get 0
        i32.const 8
        i32.shl
        i32.const 16711680
        i32.and
        i32.or
        local.get 0
        i32.const 8
        i32.shr_u
        i32.const 65280
        i32.and
        local.get 0
        i32.const 24
        i32.shr_u
        i32.or
        i32.or
        end_function
read_u32: # @read_u32
        local.get 0
        i32.load 0:p2align=0
        local.tee 0
        i32.const 24
        i32.shl
        local.get 0
        i32.const 8
        i32.shl
        i32.const 16711680
        i32.and
        i32.or
        local.get 0
        i32.const 8
        i32.shr_u
        i32.const 65280
        i32.and
        local.get 0
        i32.const 24
        i32.shr_u
        i32.or
        i32.or
        end_function
read_u64_swap: # @read_u64_swap
        local.get 0
        i64.load 0:p2align=0
        local.tee 1
        i64.const 56
        i64.shl
        local.get 1
        i64.const 40
        i64.shl
        i64.const 71776119061217280
        i64.and
        i64.or
        local.get 1
        i64.const 24
        i64.shl
        i64.const 280375465082880
        i64.and
        local.get 1
        i64.const 8
        i64.shl
        i64.const 1095216660480
        i64.and
        i64.or
        i64.or
        local.get 1
        i64.const 8
        i64.shr_u
        i64.const 4278190080
        i64.and
        local.get 1
        i64.const 24
        i64.shr_u
        i64.const 16711680
        i64.and
        i64.or
        local.get 1
        i64.const 40
        i64.shr_u
        i64.const 65280
        i64.and
        local.get 1
        i64.const 56
        i64.shr_u
        i64.or
        i64.or
        i64.or
        end_function
read_u64: # @read_u64
        local.get 0
        i64.load 0:p2align=0
        local.tee 1
        i64.const 56
        i64.shl
        local.get 1
        i64.const 40
        i64.shl
        i64.const 71776119061217280
        i64.and
        i64.or
        local.get 1
        i64.const 24
        i64.shl
        i64.const 280375465082880
        i64.and
        local.get 1
        i64.const 8
        i64.shl
        i64.const 1095216660480
        i64.and
        i64.or
        i64.or
        local.get 1
        i64.const 8
        i64.shr_u
        i64.const 4278190080
        i64.and
        local.get 1
        i64.const 24
        i64.shr_u
        i64.const 16711680
        i64.and
        i64.or
        local.get 1
        i64.const 40
        i64.shr_u
        i64.const 65280
        i64.and
        local.get 1
        i64.const 56
        i64.shr_u
        i64.or
        i64.or
        i64.or
        end_function
Is there anything I can do to keep this from bloating so badly? Are there proposals for adding bswap and/or loadbe to the instruction set? The linked issues above seem to have been closed with no progress for years.
I understand that not all real machines have these instructions, but many do. I'd say the vast majority of machines that run wasm support these fast instructions natively. And even when the underlying machine can't do it in a single native instruction, the engine compiling the wasm can emit whatever sequence is optimal for that platform. The guaranteed win, either way, is much smaller wasm code downloaded over the network!
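To make the size argument concrete: if there were, hypothetically, an i64.bswap instruction (invented mnemonic, not taken from any actual proposal), read_u64 could come out as something like:

read_u64: # @read_u64
        local.get 0
        i64.load 0:p2align=0
        i64.bswap # hypothetical
        end_function

That's a 4-line body instead of the ~45 instructions above.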
Please reconsider adding these to the spec, or if there is an existing primitive that gets me close, please point me in the right direction.
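On the "existing primitive" front, the closest I can find is the SIMD byte shuffle: with -msimd128, the wasm_simd128.h intrinsics can express the swap as an i8x16.shuffle. A sketch (read_u64_shuffle is my own name, and I haven't measured whether engines turn this into a native movbe/bswap):

#include <stdint.h>
#include <wasm_simd128.h>

uint64_t read_u64_shuffle(const uint8_t* storage) {
  v128_t v = wasm_v128_load64_zero(storage);            /* v128.load64_zero: 8 bytes into lane 0 */
  v = wasm_i8x16_shuffle(v, v, 7, 6, 5, 4, 3, 2, 1, 0,  /* reverse the low 8 bytes */
                         8, 9, 10, 11, 12, 13, 14, 15);
  return wasm_u64x2_extract_lane(v, 0);
}

It works, but it drags in v128 for a scalar operation and requires SIMD support, so a dedicated bswap or big-endian load would still be both smaller and more obviously optimizable.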