Skip to content

Commit

Permalink
Simplify and clarify some value type parsing (#1601)
Browse files Browse the repository at this point in the history
Reading over #1600 I remembered that I have a difficult time
understanding how value types are encoded in wasm. There are a number of
overlapping concerns and a bit of duplication within `wasmparser`
itself. I've tried to leave an explanatory comment for my future self,
which will hopefully help others as well.

Along the way I've also managed to replace `ValType::is_valtype_byte`
with some simpler logic, avoiding duplication of all the value type
bytes that are supported.
  • Loading branch information
alexcrichton authored Jun 10, 2024
1 parent 10d2e21 commit 8dc6ddf
Show file tree
Hide file tree
Showing 2 changed files with 92 additions and 30 deletions.
29 changes: 21 additions & 8 deletions crates/wasmparser/src/binary_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -732,14 +732,27 @@ impl<'a> BinaryReader<'a> {
pub(crate) fn read_block_type(&mut self) -> Result<BlockType> {
let b = self.peek()?;

// Check for empty block
if b == 0x40 {
self.position += 1;
return Ok(BlockType::Empty);
}

// Check for a block type of form [] -> [t].
if ValType::is_valtype_byte(b) {
// Block types are encoded as either 0x40, a `valtype`, or `s33`. All
// current `valtype` encodings are negative numbers when encoded with
// sleb128, but it's also required that valtype encodings are in their
// canonical form. For example an overlong encoding of -1 as `0xff 0x7f`
// is not valid and it is required to be `0x7f`. This means that we
// can't simply match on the `s33` that pops out below since reading the
// whole `s33` might read an overlong encoding.
//
// To test for this the first byte `b` is inspected. The highest bit,
// the continuation bit in LEB128 encoding, must be clear, meaning the
// encoding is exactly one byte long. The next bit, the sign bit, must
// be set to indicate that the number is negative. If these two
// conditions hold then we're guaranteed that this is a single-byte, and
// therefore canonical, encoding of a negative number.
//
// After this a value type is read directly instead of looking for an
// indexed value type.
if b & 0x80 == 0 && b & 0x40 != 0 {
if b == 0x40 {
self.position += 1;
return Ok(BlockType::Empty);
}
return Ok(BlockType::Type(self.read()?));
}

Expand Down
93 changes: 71 additions & 22 deletions crates/wasmparser/src/readers/core/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1369,18 +1369,10 @@ pub enum HeapType {
NoExn,
}

impl ValType {
pub(crate) fn is_valtype_byte(byte: u8) -> bool {
match byte {
0x7F | 0x7E | 0x7D | 0x7C | 0x7B | 0x70 | 0x6F | 0x64 | 0x63 | 0x6E | 0x71 | 0x72
| 0x74 | 0x73 | 0x6D | 0x6B | 0x6A | 0x6C | 0x69 => true,
_ => false,
}
}
}

impl<'a> FromReader<'a> for StorageType {
fn from_reader(reader: &mut BinaryReader<'a>) -> Result<Self> {
// NB: See `FromReader<'a> for ValType` for a table of how this
// interacts with other value encodings.
match reader.peek()? {
0x78 => {
reader.read_u8()?;
Expand All @@ -1397,6 +1389,53 @@ impl<'a> FromReader<'a> for StorageType {

impl<'a> FromReader<'a> for ValType {
fn from_reader(reader: &mut BinaryReader<'a>) -> Result<Self> {
// Decoding value types is somewhat subtle because the space of what's
// being decoded here is actually spread out across a number of
// locations. This comment is intended to serve as a reference for
// what's being decoded here and how it interacts with those other
// locations.
//
// Note that all value types are encoded as canonical-form negative
// numbers in the sleb128 encoding scheme. Currently in the wasm spec
// sleb128 isn't actually used but it looks to be modelled to allow it
// one day. In the meantime the current values used are:
//
// | sleb128 | decimal | type | notes |
// |---------|---------|--------------|------------------------------|
// | 0x7F | -1 | i32 | |
// | 0x7E | -2 | i64 | |
// | 0x7D | -3 | f32 | |
// | 0x7C | -4 | f64 | |
// | 0x7B | -5 | v128 | simd proposal |
// | 0x78 | -8 | i8 | gc proposal, in `FieldType` |
// | 0x77 | -9 | i16 | gc proposal, in `FieldType` |
// | 0x74 | -12 | noexn | gc + exceptions proposal |
// | 0x73 | -13 | nofunc | gc proposal |
// | 0x72 | -14 | noextern | gc proposal |
// | 0x71 | -15 | nullref | gc proposal |
// | 0x70 | -16 | func | reference types proposal |
// | 0x6F | -17 | extern | reference types proposal |
// | 0x6E | -18 | any | gc proposal |
// | 0x6D | -19 | eq | gc proposal |
// | 0x6C | -20 | i31 | gc proposal |
// | 0x6B | -21 | struct | gc proposal |
// | 0x6A | -22 | array | gc proposal |
// | 0x69 | -23 | exnref | gc + exceptions proposal |
// | 0x64 | -28 | ref $t | gc proposal, prefix byte |
// | 0x63 | -29 | ref null $t | gc proposal, prefix byte |
// | 0x60 | -32 | func $t | prefix byte |
// | 0x5f | -33 | struct $t | gc proposal, prefix byte |
// | 0x5e | -34 | array $t | gc proposal, prefix byte |
// | 0x50 | -48 | sub $t | gc proposal, prefix byte |
// | 0x4F | -49 | sub final $t | gc proposal, prefix byte |
// | 0x4E | -50 | rec $t | gc proposal, prefix byte |
// | 0x40 | -64 | ε | empty block type |
//
// Note that not all of these encodings are parsed here. For example,
// 0x78 as the encoding for `i8` is parsed only in `FieldType`. The
// parsing of `FieldType` will delegate here without actually consuming
// anything, though, so the encoding 0x78 must remain disjoint from the
// encodings that are read here.
match reader.peek()? {
0x7F => {
reader.read_u8()?;
Expand Down Expand Up @@ -1427,32 +1466,36 @@ impl<'a> FromReader<'a> for ValType {

impl<'a> FromReader<'a> for RefType {
fn from_reader(reader: &mut BinaryReader<'a>) -> Result<Self> {
// NB: See `FromReader<'a> for ValType` for a table of how this
// interacts with other value encodings.
match reader.read()? {
0x70 => Ok(RefType::FUNC.nullable()),
0x6F => Ok(RefType::EXTERN.nullable()),
0x6E => Ok(RefType::ANY.nullable()),
0x71 => Ok(RefType::NONE.nullable()),
0x72 => Ok(RefType::NOEXTERN.nullable()),
0x73 => Ok(RefType::NOFUNC.nullable()),
0x6D => Ok(RefType::EQ.nullable()),
0x6B => Ok(RefType::STRUCT.nullable()),
0x6A => Ok(RefType::ARRAY.nullable()),
0x6C => Ok(RefType::I31.nullable()),
0x69 => Ok(RefType::EXN.nullable()),
0x74 => Ok(RefType::NOEXN.nullable()),
byte @ (0x63 | 0x64) => {
let nullable = byte == 0x63;
let pos = reader.original_position();
RefType::new(nullable, reader.read()?)
.ok_or_else(|| crate::BinaryReaderError::new("type index too large", pos))
}
0x69 => Ok(RefType::EXN.nullable()),
0x6A => Ok(RefType::ARRAY.nullable()),
0x6B => Ok(RefType::STRUCT.nullable()),
0x6C => Ok(RefType::I31.nullable()),
0x6D => Ok(RefType::EQ.nullable()),
0x6E => Ok(RefType::ANY.nullable()),
0x6F => Ok(RefType::EXTERN.nullable()),
0x70 => Ok(RefType::FUNC.nullable()),
0x71 => Ok(RefType::NONE.nullable()),
0x72 => Ok(RefType::NOEXTERN.nullable()),
0x73 => Ok(RefType::NOFUNC.nullable()),
0x74 => Ok(RefType::NOEXN.nullable()),
_ => bail!(reader.original_position(), "malformed reference type"),
}
}
}

impl<'a> FromReader<'a> for HeapType {
fn from_reader(reader: &mut BinaryReader<'a>) -> Result<Self> {
// NB: See `FromReader<'a> for ValType` for a table of how this
// interacts with other value encodings.
match reader.peek()? {
0x70 => {
reader.read_u8()?;
Expand Down Expand Up @@ -1669,6 +1712,8 @@ fn read_composite_type(
opcode: u8,
reader: &mut BinaryReader,
) -> Result<CompositeType, BinaryReaderError> {
// NB: See `FromReader<'a> for ValType` for a table of how this
// interacts with other value encodings.
Ok(match opcode {
0x60 => CompositeType::Func(reader.read()?),
0x5e => CompositeType::Array(reader.read()?),
Expand All @@ -1679,6 +1724,8 @@ fn read_composite_type(

impl<'a> FromReader<'a> for RecGroup {
fn from_reader(reader: &mut BinaryReader<'a>) -> Result<Self> {
// NB: See `FromReader<'a> for ValType` for a table of how this
// interacts with other value encodings.
match reader.peek()? {
0x4e => {
reader.read_u8()?;
Expand All @@ -1702,6 +1749,8 @@ impl<'a> FromReader<'a> for RecGroup {
impl<'a> FromReader<'a> for SubType {
fn from_reader(reader: &mut BinaryReader<'a>) -> Result<Self> {
let pos = reader.original_position();
// NB: See `FromReader<'a> for ValType` for a table of how this
// interacts with other value encodings.
Ok(match reader.read_u8()? {
opcode @ (0x4f | 0x50) => {
let idx_iter = reader.read_iter(MAX_WASM_SUPERTYPES, "supertype idxs")?;
Expand Down

0 comments on commit 8dc6ddf

Please sign in to comment.