Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add f16 formatting and parsing #127013

Draft
wants to merge 11 commits into
base: master
Choose a base branch
from
Prev Previous commit
Next Next commit
dec2flt: Refactor float traits
  • Loading branch information
tgross35 committed Dec 9, 2024
commit a67248cff1f13606d53cc328a28c65df1c605d52
250 changes: 160 additions & 90 deletions library/core/src/num/dec2flt/float.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,56 @@
//! Helper trait for generic float types.

use core::f64;

use crate::fmt::{Debug, LowerExp};
use crate::num::FpCategory;
use crate::ops::{Add, Div, Mul, Neg};
use crate::ops::{self, Add, Div, Mul, Neg};

pub trait CastInto<T: Copy>: Copy {
fn cast(self) -> T;
}

pub trait Integer:
Sized
+ Clone
+ Copy
+ Debug
+ ops::Shr<u32, Output = Self>
+ ops::Shl<u32, Output = Self>
+ ops::BitAnd<Output = Self>
+ ops::BitOr<Output = Self>
+ PartialEq
+ CastInto<i16>
{
const ZERO: Self;
const ONE: Self;
}

/// A helper trait to avoid duplicating basically all the conversion code for `f32` and `f64`.
macro_rules! int {
($($ty:ty),+) => {
$(
impl CastInto<i16> for $ty {
fn cast(self) -> i16 {
self as i16
}
}


impl Integer for $ty {
const ZERO: Self = 0;
const ONE: Self = 1;
}
)+
}
}

int!(u16, u32, u64);

/// A helper trait to avoid duplicating basically all the conversion code for IEEE floats.
///
/// See the parent module's doc comment for why this is necessary.
///
/// Should **never ever** be implemented for other types or be used outside the dec2flt module.
/// Should **never ever** be implemented for other types or be used outside the `dec2flt` module.
#[doc(hidden)]
pub trait RawFloat:
Sized
Expand All @@ -24,62 +66,93 @@ pub trait RawFloat:
+ Copy
+ Debug
{
/// The unsigned integer with the same size as the float
type Int: Integer + Into<u64>;

/* general constants */

const INFINITY: Self;
const NEG_INFINITY: Self;
const NAN: Self;
const NEG_NAN: Self;

/// Bit width of the float
const BITS: u32;

/// Mantissa digits including the hidden bit (provided by core)
const MANTISSA_BITS: u32;

const EXPONENT_MASK: Self::Int;
const MANTISSA_MASK: Self::Int;

/// The number of bits in the significand, *excluding* the hidden bit.
const MANTISSA_EXPLICIT_BITS: usize;

// Round-to-even only happens for negative values of q
// when q ≥ −4 in the 64-bit case and when q ≥ −17 in
// the 32-bitcase.
//
// When q ≥ 0,we have that 5^q ≤ 2m+1. In the 64-bit case,we
// have 5^q ≤ 2m+1 ≤ 2^54 or q ≤ 23. In the 32-bit case,we have
// 5^q ≤ 2m+1 ≤ 2^25 or q ≤ 10.
//
// When q < 0, we have w ≥ (2m+1)×5^−q. We must have that w < 2^64
// so (2m+1)×5^−q < 2^64. We have that 2m+1 > 2^53 (64-bit case)
// or 2m+1 > 2^24 (32-bit case). Hence,we must have 2^53×5^−q < 2^64
// (64-bit) and 2^24×5^−q < 2^64 (32-bit). Hence we have 5^−q < 2^11
// or q ≥ −4 (64-bit case) and 5^−q < 2^40 or q ≥ −17 (32-bitcase).
//
// Thus we have that we only need to round ties to even when
// we have that q ∈ [−4,23](in the 64-bit case) or q∈[−17,10]
// (in the 32-bit case). In both cases,the power of five(5^|q|)
// fits in a 64-bit word.
const MANTISSA_EXPLICIT_BITS: u32 = Self::MANTISSA_BITS - 1;

/// Bits for the exponent
const EXPONENT_BITS: u32 = Self::BITS - Self::MANTISSA_EXPLICIT_BITS - 1;

/// Minimum exponent value `-(1 << (EXP_BITS - 1)) + 1`.
const MINIMUM_EXPONENT: i32 = -(1 << (Self::EXPONENT_BITS - 1)) + 1;

/// Maximum exponent without overflowing to infinity
const MAXIMUM_EXPONENT: u32 = (1 << Self::EXPONENT_BITS) - 1;

/// The exponent bias value
const EXPONENT_BIAS: u32 = Self::MAXIMUM_EXPONENT >> 1;

/// Largest exponent value `(1 << EXP_BITS) - 1`.
const INFINITE_POWER: i32 = (1 << Self::EXPONENT_BITS) - 1;

/// Round-to-even only happens for negative values of q
/// when q ≥ −4 in the 64-bit case and when q ≥ −17 in
/// the 32-bitcase.
///
/// When q ≥ 0,we have that 5^q ≤ 2m+1. In the 64-bit case,we
/// have 5^q ≤ 2m+1 ≤ 2^54 or q ≤ 23. In the 32-bit case,we have
/// 5^q ≤ 2m+1 ≤ 2^25 or q ≤ 10.
///
/// When q < 0, we have w ≥ (2m+1)×5^−q. We must have that w < 2^64
/// so (2m+1)×5^−q < 2^64. We have that 2m+1 > 2^53 (64-bit case)
/// or 2m+1 > 2^24 (32-bit case). Hence,we must have 2^53×5^−q < 2^64
/// (64-bit) and 2^24×5^−q < 2^64 (32-bit). Hence we have 5^−q < 2^11
/// or q ≥ −4 (64-bit case) and 5^−q < 2^40 or q ≥ −17 (32-bitcase).
///
/// Thus we have that we only need to round ties to even when
/// we have that q ∈ [−4,23](in the 64-bit case) or q∈[−17,10]
/// (in the 32-bit case). In both cases,the power of five(5^|q|)
/// fits in a 64-bit word.
const MIN_EXPONENT_ROUND_TO_EVEN: i32;
const MAX_EXPONENT_ROUND_TO_EVEN: i32;

// Minimum exponent that for a fast path case, or `-⌊(MANTISSA_EXPLICIT_BITS+1)/log2(5)⌋`
const MIN_EXPONENT_FAST_PATH: i64;

// Maximum exponent that for a fast path case, or `⌊(MANTISSA_EXPLICIT_BITS+1)/log2(5)⌋`
const MAX_EXPONENT_FAST_PATH: i64;
/// Largest decimal exponent for a non-infinite value.
///
/// This is the max exponent in binary converted to the max exponent in decimal. Allows fast
/// pathing anything larger than `10^LARGEST_POWER_OF_TEN`, which will round to infinity.
const LARGEST_POWER_OF_TEN: i32 =
((Self::EXPONENT_BIAS as f64 + 1.0) / f64::consts::LOG2_10) as i32;

// Maximum exponent that can be represented for a disguised-fast path case.
// This is `MAX_EXPONENT_FAST_PATH + ⌊(MANTISSA_EXPLICIT_BITS+1)/log2(10)⌋`
const MAX_EXPONENT_DISGUISED_FAST_PATH: i64;
/// Smallest decimal exponent for a non-zero value. This allows for fast pathing anything
/// smaller than `10^SMALLEST_POWER_OF_TEN`.
const SMALLEST_POWER_OF_TEN: i32 =
-(((Self::EXPONENT_BIAS + Self::MANTISSA_BITS + 64) as f64) / f64::consts::LOG2_10) as i32;

// Minimum exponent value `-(1 << (EXP_BITS - 1)) + 1`.
const MINIMUM_EXPONENT: i32;
/* Fast pathing */

// Largest exponent value `(1 << EXP_BITS) - 1`.
const INFINITE_POWER: i32;
/// Maximum exponent for a fast path case, or `⌊(MANTISSA_EXPLICIT_BITS+1)/log2(5)⌋`
// assuming FLT_EVAL_METHOD = 0
const MAX_EXPONENT_FAST_PATH: i64 =
((Self::MANTISSA_BITS as f64) / (f64::consts::LOG2_10 - 1.0)) as i64;

// Index (in bits) of the sign.
const SIGN_INDEX: usize;
/// Minimum exponent for a fast path case, or `-⌊(MANTISSA_EXPLICIT_BITS+1)/log2(5)⌋`
const MIN_EXPONENT_FAST_PATH: i64 = -Self::MAX_EXPONENT_FAST_PATH;

// Smallest decimal exponent for a non-zero value.
const SMALLEST_POWER_OF_TEN: i32;
/// Maximum exponent that can be represented for a disguised-fast path case.
/// This is `MAX_EXPONENT_FAST_PATH + ⌊(MANTISSA_EXPLICIT_BITS+1)/log2(10)⌋`
const MAX_EXPONENT_DISGUISED_FAST_PATH: i64 =
Self::MAX_EXPONENT_FAST_PATH + (Self::MANTISSA_BITS as f64 / f64::consts::LOG2_10) as i64;

// Largest decimal exponent for a non-infinite value.
const LARGEST_POWER_OF_TEN: i32;

// Maximum mantissa for the fast-path (`1 << 53` for f64).
const MAX_MANTISSA_FAST_PATH: u64 = 2_u64 << Self::MANTISSA_EXPLICIT_BITS;
/// Maximum mantissa for the fast-path (`1 << 53` for f64).
const MAX_MANTISSA_FAST_PATH: u64 = 1 << Self::MANTISSA_BITS;

/// Converts integer into float through an as cast.
/// This is only called in the fast-path algorithm, and therefore
Expand All @@ -96,27 +169,45 @@ pub trait RawFloat:
/// Returns the category that this number falls into.
fn classify(self) -> FpCategory;

/// Transmute to the integer representation
fn to_bits(self) -> Self::Int;

/// Returns the mantissa, exponent and sign as integers.
fn integer_decode(self) -> (u64, i16, i8);
///
/// That is, this returns `(m, p, s)` such that `s * m * 2^p` represents the original float.
/// For 0, the exponent will be `-(EXPONENT_BIAS + MANTISSA_EXPLICIT_BITS`, which is the
/// minimum subnormal power.
fn integer_decode(self) -> (u64, i16, i8) {
let bits = self.to_bits();
let sign: i8 = if bits >> (Self::BITS - 1) == Self::Int::ZERO { 1 } else { -1 };
let mut exponent: i16 =
((bits & Self::EXPONENT_MASK) >> Self::MANTISSA_EXPLICIT_BITS).cast();
let mantissa = if exponent == 0 {
(bits & Self::MANTISSA_MASK) << 1
} else {
(bits & Self::MANTISSA_MASK) | (Self::Int::ONE << Self::MANTISSA_EXPLICIT_BITS)
};
// Exponent bias + mantissa shift
exponent -= (Self::EXPONENT_BIAS + Self::MANTISSA_EXPLICIT_BITS) as i16;
(mantissa.into(), exponent, sign)
}
}

impl RawFloat for f32 {
type Int = u32;

const INFINITY: Self = f32::INFINITY;
const NEG_INFINITY: Self = f32::NEG_INFINITY;
const NAN: Self = f32::NAN;
const NEG_NAN: Self = -f32::NAN;

const MANTISSA_EXPLICIT_BITS: usize = 23;
const BITS: u32 = 32;
const MANTISSA_BITS: u32 = Self::MANTISSA_DIGITS;
const EXPONENT_MASK: Self::Int = Self::EXP_MASK;
const MANTISSA_MASK: Self::Int = Self::MAN_MASK;

const MIN_EXPONENT_ROUND_TO_EVEN: i32 = -17;
const MAX_EXPONENT_ROUND_TO_EVEN: i32 = 10;
const MIN_EXPONENT_FAST_PATH: i64 = -10; // assuming FLT_EVAL_METHOD = 0
const MAX_EXPONENT_FAST_PATH: i64 = 10;
const MAX_EXPONENT_DISGUISED_FAST_PATH: i64 = 17;
const MINIMUM_EXPONENT: i32 = -127;
const INFINITE_POWER: i32 = 0xFF;
const SIGN_INDEX: usize = 31;
const SMALLEST_POWER_OF_TEN: i32 = -65;
const LARGEST_POWER_OF_TEN: i32 = 38;

#[inline]
fn from_u64(v: u64) -> Self {
Expand All @@ -136,16 +227,8 @@ impl RawFloat for f32 {
TABLE[exponent & 15]
}

/// Returns the mantissa, exponent and sign as integers.
fn integer_decode(self) -> (u64, i16, i8) {
let bits = self.to_bits();
let sign: i8 = if bits >> 31 == 0 { 1 } else { -1 };
let mut exponent: i16 = ((bits >> 23) & 0xff) as i16;
let mantissa =
if exponent == 0 { (bits & 0x7fffff) << 1 } else { (bits & 0x7fffff) | 0x800000 };
// Exponent bias + mantissa shift
exponent -= 127 + 23;
(mantissa as u64, exponent, sign)
fn to_bits(self) -> Self::Int {
self.to_bits()
}

fn classify(self) -> FpCategory {
Expand All @@ -154,22 +237,20 @@ impl RawFloat for f32 {
}

impl RawFloat for f64 {
const INFINITY: Self = f64::INFINITY;
const NEG_INFINITY: Self = f64::NEG_INFINITY;
const NAN: Self = f64::NAN;
const NEG_NAN: Self = -f64::NAN;
type Int = u64;

const INFINITY: Self = Self::INFINITY;
const NEG_INFINITY: Self = Self::NEG_INFINITY;
const NAN: Self = Self::NAN;
const NEG_NAN: Self = -Self::NAN;

const BITS: u32 = 64;
const MANTISSA_BITS: u32 = Self::MANTISSA_DIGITS;
const EXPONENT_MASK: Self::Int = Self::EXP_MASK;
const MANTISSA_MASK: Self::Int = Self::MAN_MASK;

const MANTISSA_EXPLICIT_BITS: usize = 52;
const MIN_EXPONENT_ROUND_TO_EVEN: i32 = -4;
const MAX_EXPONENT_ROUND_TO_EVEN: i32 = 23;
const MIN_EXPONENT_FAST_PATH: i64 = -22; // assuming FLT_EVAL_METHOD = 0
const MAX_EXPONENT_FAST_PATH: i64 = 22;
const MAX_EXPONENT_DISGUISED_FAST_PATH: i64 = 37;
const MINIMUM_EXPONENT: i32 = -1023;
const INFINITE_POWER: i32 = 0x7FF;
const SIGN_INDEX: usize = 63;
const SMALLEST_POWER_OF_TEN: i32 = -342;
const LARGEST_POWER_OF_TEN: i32 = 308;

#[inline]
fn from_u64(v: u64) -> Self {
Expand All @@ -190,19 +271,8 @@ impl RawFloat for f64 {
TABLE[exponent & 31]
}

/// Returns the mantissa, exponent and sign as integers.
fn integer_decode(self) -> (u64, i16, i8) {
let bits = self.to_bits();
let sign: i8 = if bits >> 63 == 0 { 1 } else { -1 };
let mut exponent: i16 = ((bits >> 52) & 0x7ff) as i16;
let mantissa = if exponent == 0 {
(bits & 0xfffffffffffff) << 1
} else {
(bits & 0xfffffffffffff) | 0x10000000000000
};
// Exponent bias + mantissa shift
exponent -= 1023 + 52;
(mantissa, exponent, sign)
fn to_bits(self) -> Self::Int {
self.to_bits()
}

fn classify(self) -> FpCategory {
Expand Down
4 changes: 2 additions & 2 deletions library/core/src/num/dec2flt/lemire.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ pub fn compute_float<F: RawFloat>(q: i64, mut w: u64) -> BiasedFp {
// Normalize our significant digits, so the most-significant bit is set.
let lz = w.leading_zeros();
w <<= lz;
let (lo, hi) = compute_product_approx(q, w, F::MANTISSA_EXPLICIT_BITS + 3);
let (lo, hi) = compute_product_approx(q, w, F::MANTISSA_EXPLICIT_BITS as usize + 3);
if lo == 0xFFFF_FFFF_FFFF_FFFF {
// If we have failed to approximate w x 5^-q with our 128-bit value.
// Since the addition of 1 could lead to an overflow which could then
Expand Down Expand Up @@ -89,7 +89,7 @@ pub fn compute_float<F: RawFloat>(q: i64, mut w: u64) -> BiasedFp {
if lo <= 1
&& q >= F::MIN_EXPONENT_ROUND_TO_EVEN as i64
&& q <= F::MAX_EXPONENT_ROUND_TO_EVEN as i64
&& mantissa & 3 == 1
&& mantissa & 0b11 == 0b01
&& (mantissa << (upperbit + 64 - F::MANTISSA_EXPLICIT_BITS as i32 - 3)) == hi
{
// Zero the lowest bit, so we don't round up.
Expand Down
2 changes: 1 addition & 1 deletion library/core/src/num/dec2flt/slow.rs
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ pub(crate) fn parse_long_mantissa<F: RawFloat>(s: &[u8]) -> BiasedFp {
}
// Shift the decimal to the hidden bit, and then round the value
// to get the high mantissa+1 bits.
d.left_shift(F::MANTISSA_EXPLICIT_BITS + 1);
d.left_shift(F::MANTISSA_EXPLICIT_BITS as usize + 1);
let mut mantissa = d.round();
if mantissa >= (1_u64 << (F::MANTISSA_EXPLICIT_BITS + 1)) {
// Rounding up overflowed to the carry bit, need to
Expand Down
6 changes: 3 additions & 3 deletions src/etc/test-float-parse/src/traits.rs
Original file line number Diff line number Diff line change
Expand Up @@ -147,12 +147,12 @@ pub trait Float:
}

macro_rules! impl_float {
($($fty:ty, $ity:ty, $bits:literal);+) => {
($($fty:ty, $ity:ty);+) => {
$(
impl Float for $fty {
type Int = $ity;
type SInt = <Self::Int as Int>::Signed;
const BITS: u32 = $bits;
const BITS: u32 = <$ity>::BITS;
const MAN_BITS: u32 = Self::MANTISSA_DIGITS - 1;
const MAN_MASK: Self::Int = (Self::Int::ONE << Self::MAN_BITS) - Self::Int::ONE;
const SIGN_MASK: Self::Int = Self::Int::ONE << (Self::BITS-1);
Expand All @@ -168,7 +168,7 @@ macro_rules! impl_float {
}
}

impl_float!(f32, u32, 32; f64, u64, 64);
impl_float!(f32, u32; f64, u64;

/// A test generator. Should provide an iterator that produces unique patterns to parse.
///
Expand Down