Skip to content

Improve accuracy of conversions from floating-point numbers (Fixes #102) #131

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Nov 2, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/FixedPointNumbers.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ import Base: ==, <, <=, -, +, *, /, ~, isapprox,
using Base: @pure

# T => BaseType
# f => Number of Bytes reserved for fractional part
# f => Number of bits reserved for fractional part
abstract type FixedPoint{T <: Integer, f} <: Real end


Expand All @@ -25,7 +25,7 @@ export
Normed,
floattype,
# "special" typealiases
# Q and U typealiases are exported in separate source files
# Q and N typealiases are exported in separate source files
# Functions
scaledual

Expand Down
62 changes: 48 additions & 14 deletions src/normed.jl
Original file line number Diff line number Diff line change
Expand Up @@ -46,22 +46,56 @@ function Normed{T,f}(x::Normed{T2}) where {T <: Unsigned,T2 <: Unsigned,f}
end
N0f16(x::N0f8) = reinterpret(N0f16, convert(UInt16, 0x0101*reinterpret(x)))

(::Type{U})(x::Real) where {U <: Normed} = _convert(U, rawtype(U), x)

function _convert(::Type{U}, ::Type{T}, x) where {U <: Normed,T}
y = round(widen1(rawone(U))*x)
(0 <= y) & (y <= typemax(T)) || throw_converterror(U, x)
U(_unsafe_trunc(T, y), 0)
(::Type{U})(x::Real) where {U <: Normed} = _convert(U, x)

function _convert(::Type{U}, x) where {T, f, U <: Normed{T,f}}
if T == UInt128 # for UInt128, we can't widen
# the upper limit is not exact
(0 <= x) & (x <= (typemax(T)/rawone(U))) || throw_converterror(U, x)
y = round(rawone(U)*x)
else
y = round(widen1(rawone(U))*x)
(0 <= y) & (y <= typemax(T)) || throw_converterror(U, x)
end
reinterpret(U, _unsafe_trunc(T, y))
end
# Prevent overflow (https://discourse.julialang.org/t/saving-greater-than-8-bit-images/6057)
_convert(::Type{U}, ::Type{T}, x::Float16) where {U <: Normed,T} =
_convert(U, T, Float32(x))
_convert(::Type{U}, ::Type{UInt128}, x::Float16) where {U <: Normed} =
_convert(U, UInt128, Float32(x))
function _convert(::Type{U}, ::Type{UInt128}, x) where {U <: Normed}
y = round(rawone(U)*x) # for UInt128, we can't widen
(0 <= y) & (y <= typemax(UInt128)) & (x <= Float64(typemax(U))) || throw_converterror(U, x)
U(_unsafe_trunc(UInt128, y), 0)
function _convert(::Type{U}, x::Float16) where {T, f, U <: Normed{T,f}}
if Float16(typemax(T)/rawone(U)) > Float32(typemax(T)/rawone(U))
x == Float16(typemax(T)/rawone(U)) && return typemax(U)
end
return _convert(U, Float32(x))
end
function _convert(::Type{U}, x::Tf) where {T, f, U <: Normed{T,f}, Tf <: Union{Float32, Float64}}
if T == UInt128 && f == 53
0 <= x <= Tf(3.777893186295717e22) || throw_converterror(U, x)
else
0 <= x <= Tf((typemax(T)-rawone(U))/rawone(U)+1) || throw_converterror(U, x)
end
Comment on lines +70 to +74
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(typemax(T)-rawone(U))/rawone(U)+1 may have slightly higher accuracy than typemax(T)/rawone(U) because the 1s in the dividend are reduced. However, this have no effect for UInt128. So, the specialization for Normed{UInt128, 53} is needed (f == 53 is one of singular points). FYI, using BigFloat hinders the constant pre-calculation and inline expansion.

Although this can be a temporary measure, I think the problem with pre-calculating constants is potentially troublesome.


significand_bits = Tf == Float64 ? 52 : 23
if f <= (significand_bits + 1) && sizeof(T) * 8 < significand_bits
return reinterpret(U, unsafe_trunc(T, round(rawone(U) * x)))
end
# cf. the implementation of `frexp`
Tw = f < sizeof(T) * 8 ? T : widen1(T)
bits = sizeof(Tw) * 8 - 1
xu = reinterpret(Tf == Float64 ? UInt64 : UInt32, x)
k = Int(xu >> significand_bits)
k == 0 && return zero(U) # neglect subnormal numbers
significand = xu | (one(xu) << significand_bits)
yh = unsafe_trunc(Tw, significand) << (bits - significand_bits)
exponent_bias = Tf == Float64 ? 1023 : 127
ex = exponent_bias - k + bits - f
yi = bits >= f ? yh - (yh >> f) : yh
if ex <= 0
ex == 0 && return reinterpret(U, unsafe_trunc(T, yi))
ex != -1 || signbit(signed(yi)) && return typemax(U)
return reinterpret(U, unsafe_trunc(T, yi + yi))
end
ex > bits && return reinterpret(U, ex == bits + 1 ? one(T) : zero(T))
yi += one(Tw)<<((ex - 1) & bits) # RoundNearestTiesUp
return reinterpret(U, unsafe_trunc(T, yi >> (ex & bits)))
end

rem(x::T, ::Type{T}) where {T <: Normed} = x
Expand Down
34 changes: 34 additions & 0 deletions test/normed.jl
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,40 @@ end
@test convert(Normed{UInt16,7}, Normed{UInt8,7}(0.504)) === Normed{UInt16,7}(0.504)
end

@testset "conversion from float" begin
# issue 102
for T in (UInt8, UInt16, UInt32, UInt64, UInt128)
for Tf in (Float16, Float32, Float64)
@testset "Normed{$T,$f}(::$Tf)" for f = 1:sizeof(T)*8
U = Normed{T,f}
r = FixedPointNumbers.rawone(U)

@test reinterpret(U(zero(Tf))) == 0x0

# TODO: fix issue #129
# input_typemax = Tf(typemax(U))
input_typemax = Tf(BigFloat(typemax(T)) / r)
if isinf(input_typemax)
@test reinterpret(U(floatmax(Tf))) >= round(T, floatmax(Tf))
else
@test reinterpret(U(input_typemax)) >= (typemax(T)>>1) # overflow check
end

input_upper = Tf(BigFloat(typemax(T)) / r, RoundDown)
isinf(input_upper) && continue # for Julia v0.7
@test reinterpret(U(input_upper)) == T(min(round(BigFloat(input_upper) * r), typemax(T)))

input_exp2 = Tf(exp2(sizeof(T) * 8 - f))
isinf(input_exp2) && continue
@test reinterpret(U(input_exp2)) == T(input_exp2) * r
end
end
end
@test N0f32(Float32(0x0.7FFFFFp-32)) == zero(N0f32)
@test N0f32(Float32(0x0.800000p-32)) <= eps(N0f32) # should be zero in RoundNearest mode
@test N0f32(Float32(0x0.800001p-32)) == eps(N0f32)
end

@testset "modulus" begin
@test N0f8(0.2) % N0f8 === N0f8(0.2)
@test N2f14(1.2) % N0f16 === N0f16(0.20002)
Expand Down