From f622800400208ecea446c3deac37d16a87c3a8e3 Mon Sep 17 00:00:00 2001
From: Tim Holy
Date: Fri, 1 Nov 2019 03:46:04 -0500
Subject: [PATCH 01/10] Change `scaledual` to match original intent. Fixes
 #130. (#132)

---
 src/FixedPointNumbers.jl | 23 +++++++++++++++--------
 test/normed.jl           | 40 +++++++++++++++++++---------------------
 2 files changed, 34 insertions(+), 29 deletions(-)

diff --git a/src/FixedPointNumbers.jl b/src/FixedPointNumbers.jl
index b008f4be..7f0c800a 100644
--- a/src/FixedPointNumbers.jl
+++ b/src/FixedPointNumbers.jl
@@ -188,14 +188,21 @@ for f in (:rem, :mod, :mod1, :min, :max)
     end
 end
 
-# When multiplying by a float, reduce two multiplies to one.
-# Particularly useful for arrays.
-scaledual(Tdual::Type, x) = oneunit(Tdual), x
-scaledual(b::Tdual, x) where {Tdual <: Number} = b, x
-scaledual(Tdual::Type, x::Union{T,AbstractArray{T}}) where {T <: FixedPoint} =
-    convert(Tdual, 1/oneunit(T)), reinterpret(rawtype(T), x)
-scaledual(b::Tdual, x::Union{T,AbstractArray{T}}) where {Tdual <: Number,T <: FixedPoint} =
-    convert(Tdual, b/oneunit(T)), reinterpret(rawtype(T), x)
+"""
+    sd, ad = scaledual(s::Number, a)
+
+Return `sd` and `ad` such that `sd * ad == s * a`.
+When `a` is an array of FixedPoint numbers, `sd*ad` might be faster to compute than `s*a`.
+"""
+scaledual(b::Number, x::Union{Number,AbstractArray{<:Number}}) = b, x
+scaledual(b::Number, x::FixedPoint) = b/rawone(x), reinterpret(x)
+scaledual(b::Number, x::AbstractArray{T}) where T <: FixedPoint =
+    b/rawone(T), reinterpret(rawtype(T), x)
+
+scaledual(::Type{Tdual}, x::Union{Number,AbstractArray{<:Number}}) where Tdual = oneunit(Tdual), x
+scaledual(::Type{Tdual}, x::FixedPoint) where Tdual = convert(Tdual, 1/rawone(x)), reinterpret(x)
+scaledual(::Type{Tdual}, x::AbstractArray{T}) where {Tdual, T <: FixedPoint} =
+    convert(Tdual, 1/rawone(T)), reinterpret(rawtype(T), x)
 
 @noinline function throw_converterror(::Type{T}, x) where {T <: FixedPoint}
     n = 2^(8*sizeof(T))
diff --git a/test/normed.jl b/test/normed.jl
index 185fde44..4513fc67 100644
--- a/test/normed.jl
+++ b/test/normed.jl
@@ -274,32 +274,30 @@ end
     @test eval(Meta.parse(str)) == x
 end
 
-# scaledual
-function generic_scale!(C::AbstractArray, X::AbstractArray, s::Number)
-    length(C) == length(X) || error("C must be the same length as X")
-    for i = 1:length(X)
-        @inbounds C[i] = X[i]*s
-    end
-    C
-end
-
 @testset "scaledual" begin
     a = rand(UInt8, 10)
-    rfloat = similar(a, Float32)
-    rfixed = similar(rfloat)
     af8 = reinterpret(N0f8, a)
-
     b = 0.5
+
     bd, eld = scaledual(b, af8[1])
-    @assert b*a[1] == bd*eld
-
-    b, ad = scaledual(0.5, a)
-    @test b == 0.5
-    @test ad == a
-    b, ad = scaledual(0.5, ad)
-    generic_scale!(rfloat, a, 0.5)
-    generic_scale!(rfixed, ad, b)
-    @test rfloat == rfixed
+    @test b*af8[1] == bd*eld
+    bd, ad = scaledual(b, af8)
+    @test b*af8 == bd*ad
+
+    bd, eld = scaledual(b, a[1])
+    @test b*a[1] == bd*eld
+    bd, ad = scaledual(b, a)
+    @test b*a == bd*ad
+
+    bd, eld = scaledual(Float64, af8[1])
+    @test 1.0*af8[1] == bd*eld
+    bd, ad = scaledual(Float64, af8)
+    @test 1.0*af8 == bd*ad
+
+    bd, eld = scaledual(Float64, a[1])
+    @test 1.0*a[1] == bd*eld
+    bd, ad = scaledual(Float64, a)
+    @test 1.0*a == bd*ad
 end
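As a usage sketch of the API this patch settles on (illustrative only, not part of the patch; the array `A` and scale `s` are hypothetical):

```julia
using FixedPointNumbers

A = reinterpret(N0f8, rand(UInt8, 1000))  # N0f8 data backed by raw UInt8s
s = 0.5
sd, Ad = scaledual(s, A)   # Ad is the raw UInt8 array; sd absorbs both s and 1/rawone(N0f8)
@assert sd .* Ad ≈ s .* A  # one float multiply per element instead of a divide plus a multiply
```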
From 28860708c032f7d2111043a4034339b378b90b1e Mon Sep 17 00:00:00 2001
From: kimikage
Date: Sat, 26 Oct 2019 23:26:58 +0900
Subject: [PATCH 02/10] Fix comments

---
 src/FixedPointNumbers.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/FixedPointNumbers.jl b/src/FixedPointNumbers.jl
index 7f0c800a..eb75e2c8 100644
--- a/src/FixedPointNumbers.jl
+++ b/src/FixedPointNumbers.jl
@@ -15,7 +15,7 @@ import Base: ==, <, <=, -, +, *, /, ~, isapprox,
 using Base: @pure
 
 # T => BaseType
-# f => Number of Bytes reserved for fractional part
+# f => Number of bits reserved for fractional part
 abstract type FixedPoint{T <: Integer, f} <: Real end
 
@@ -25,7 +25,7 @@ export
     Normed,
     floattype,
 # "special" typealiases
-    # Q and U typealiases are exported in separate source files
+    # Q and N typealiases are exported in separate source files
 # Functions
     scaledual
From 547eb9558f484a062f5654df63140c46f55d4505 Mon Sep 17 00:00:00 2001
From: kimikage
Date: Fri, 1 Nov 2019 09:59:33 +0900
Subject: [PATCH 03/10] Improve accuracy of conversions from floating-point
 numbers

This fixes a problem with the input range checking.
---
 src/normed.jl  | 62 ++++++++++++++++++++++++++++++++++++++------------
 test/normed.jl | 34 +++++++++++++++++++++++++++
 2 files changed, 82 insertions(+), 14 deletions(-)

diff --git a/src/normed.jl b/src/normed.jl
index bf1311c0..ec92ee5f 100644
--- a/src/normed.jl
+++ b/src/normed.jl
@@ -46,22 +46,56 @@ function Normed{T,f}(x::Normed{T2}) where {T <: Unsigned,T2 <: Unsigned,f}
 end
 N0f16(x::N0f8) = reinterpret(N0f16, convert(UInt16, 0x0101*reinterpret(x)))
 
-(::Type{U})(x::Real) where {U <: Normed} = _convert(U, rawtype(U), x)
-
-function _convert(::Type{U}, ::Type{T}, x) where {U <: Normed,T}
-    y = round(widen1(rawone(U))*x)
-    (0 <= y) & (y <= typemax(T)) || throw_converterror(U, x)
-    U(_unsafe_trunc(T, y), 0)
+(::Type{U})(x::Real) where {U <: Normed} = _convert(U, x)
+
+function _convert(::Type{U}, x) where {T, f, U <: Normed{T,f}}
+    if T == UInt128 # for UInt128, we can't widen
+        # the upper limit is not exact
+        (0 <= x) & (x <= (typemax(T)/rawone(U))) || throw_converterror(U, x)
+        y = round(rawone(U)*x)
+    else
+        y = round(widen1(rawone(U))*x)
+        (0 <= y) & (y <= typemax(T)) || throw_converterror(U, x)
+    end
+    reinterpret(U, _unsafe_trunc(T, y))
 end
 # Prevent overflow (https://discourse.julialang.org/t/saving-greater-than-8-bit-images/6057)
-_convert(::Type{U}, ::Type{T}, x::Float16) where {U <: Normed,T} =
-    _convert(U, T, Float32(x))
-_convert(::Type{U}, ::Type{UInt128}, x::Float16) where {U <: Normed} =
-    _convert(U, UInt128, Float32(x))
-function _convert(::Type{U}, ::Type{UInt128}, x) where {U <: Normed}
-    y = round(rawone(U)*x)   # for UInt128, we can't widen
-    (0 <= y) & (y <= typemax(UInt128)) & (x <= Float64(typemax(U))) || throw_converterror(U, x)
-    U(_unsafe_trunc(UInt128, y), 0)
+function _convert(::Type{U}, x::Float16) where {T, f, U <: Normed{T,f}}
+    if Float16(typemax(T)/rawone(U)) > Float32(typemax(T)/rawone(U))
+        x == Float16(typemax(T)/rawone(U)) && return typemax(U)
+    end
+    return _convert(U, Float32(x))
+end
+function _convert(::Type{U}, x::Tf) where {T, f, U <: Normed{T,f}, Tf <: Union{Float32, Float64}}
+    if T == UInt128 && f == 53
+        0 <= x <= Tf(3.777893186295717e22) || throw_converterror(U, x)
+    else
+        0 <= x <= Tf((typemax(T)-rawone(U))/rawone(U)+1) || throw_converterror(U, x)
+    end
+
+    significand_bits = Tf == Float64 ? 52 : 23
+    if f <= (significand_bits + 1) && sizeof(T) * 8 < significand_bits
+        return reinterpret(U, unsafe_trunc(T, round(rawone(U) * x)))
+    end
+    # cf. the implementation of `frexp`
+    Tw = f < sizeof(T) * 8 ? T : widen1(T)
+    bits = sizeof(Tw) * 8 - 1
+    xu = reinterpret(Tf == Float64 ? UInt64 : UInt32, x)
+    k = Int(xu >> significand_bits)
+    k == 0 && return zero(U) # neglect subnormal numbers
+    significand = xu | (one(xu) << significand_bits)
+    yh = unsafe_trunc(Tw, significand) << (bits - significand_bits)
+    exponent_bias = Tf == Float64 ? 1023 : 127
+    ex = exponent_bias - k + bits - f
+    yi = bits >= f ? yh - (yh >> f) : yh
+    if ex <= 0
+        ex == 0 && return reinterpret(U, unsafe_trunc(T, yi))
+        ex != -1 || signbit(signed(yi)) && return typemax(U)
+        return reinterpret(U, unsafe_trunc(T, yi + yi))
+    end
+    ex > bits && return reinterpret(U, ex == bits + 1 ? one(T) : zero(T))
+    yi += one(Tw)<<((ex - 1) & bits) # RoundNearestTiesUp
+    return reinterpret(U, unsafe_trunc(T, yi >> (ex & bits)))
 end
 
 rem(x::T, ::Type{T}) where {T <: Normed} = x
diff --git a/test/normed.jl b/test/normed.jl
index 4513fc67..4369fe75 100644
--- a/test/normed.jl
+++ b/test/normed.jl
@@ -102,6 +102,40 @@ end
     @test convert(Normed{UInt16,7}, Normed{UInt8,7}(0.504)) === Normed{UInt16,7}(0.504)
 end
 
+@testset "conversion from float" begin
+    # issue 102
+    for T in (UInt8, UInt16, UInt32, UInt64, UInt128)
+        for Tf in (Float16, Float32, Float64)
+            @testset "Normed{$T,$f}(::$Tf)" for f = 1:sizeof(T)*8
+                U = Normed{T,f}
+                r = FixedPointNumbers.rawone(U)
+
+                @test reinterpret(U(zero(Tf))) == 0x0
+
+                # TODO: fix issue #129
+                # input_typemax = Tf(typemax(U))
+                input_typemax = Tf(BigFloat(typemax(T)) / r)
+                if isinf(input_typemax)
+                    @test reinterpret(U(floatmax(Tf))) >= round(T, floatmax(Tf))
+                else
+                    @test reinterpret(U(input_typemax)) >= (typemax(T)>>1) # overflow check
+                end
+
+                input_upper = Tf(BigFloat(typemax(T)) / r, RoundDown)
+                isinf(input_upper) && continue # for Julia v0.7
+                @test reinterpret(U(input_upper)) == T(min(round(BigFloat(input_upper) * r), typemax(T)))
+
+                input_exp2 = Tf(exp2(sizeof(T) * 8 - f))
+                isinf(input_exp2) && continue
+                @test reinterpret(U(input_exp2)) == T(input_exp2) * r
+            end
+        end
+    end
+    @test N0f32(Float32(0x0.7FFFFFp-32)) == zero(N0f32)
+    @test N0f32(Float32(0x0.800000p-32)) <= eps(N0f32) # should be zero in RoundNearest mode
+    @test N0f32(Float32(0x0.800001p-32)) == eps(N0f32)
+end
+
 @testset "modulus" begin
     @test N0f8(0.2) % N0f8 === N0f8(0.2)
     @test N2f14(1.2) % N0f16 === N0f16(0.20002)
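To make the intended semantics concrete, a small sketch (not part of the patch; values chosen for illustration):

```julia
using FixedPointNumbers

# In-range inputs round to the nearest representable value:
@assert reinterpret(N0f8(0.998)) === round(UInt8, 0.998 * 255)
@assert N0f8(1.0) === reinterpret(N0f8, 0xff)
# Out-of-range inputs such as N0f8(1.01) now reliably reach throw_converterror.
```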
From ee5bd547bf73cf9ec6b976088c0c2565630f4ea5 Mon Sep 17 00:00:00 2001
From: kimikage
Date: Thu, 7 Nov 2019 13:48:26 +0900
Subject: [PATCH 04/10] Fix `rem` problem with negative float inputs on ARMv7

---
 src/normed.jl | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/normed.jl b/src/normed.jl
index ec92ee5f..e3375ab9 100644
--- a/src/normed.jl
+++ b/src/normed.jl
@@ -203,3 +203,9 @@ end
 
 _unsafe_trunc(::Type{T}, x::Integer) where {T} = x % T
 _unsafe_trunc(::Type{T}, x) where {T} = unsafe_trunc(T, x)
+if !signbit(signed(unsafe_trunc(UInt, -12.345)))
+    # a workaround for 32-bit ARMv7 (issue #134)
+    function _unsafe_trunc(::Type{T}, x::AbstractFloat) where {T}
+        unsafe_trunc(T, unsafe_trunc(typeof(signed(zero(T))), x))
+    end
+end
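For context, a sketch of the platform difference behind issue #134 (the direct unsigned truncation of a negative float is formally undefined; x86_64 typically wraps, while the ARMv7 conversion instruction saturates to zero):

```julia
x = -12.345
# unsafe_trunc(UInt32, x)        # platform-dependent: saturates to 0x00000000 on ARMv7
unsafe_trunc(Int32, x) % UInt32  # 0xfffffff4 on every platform (truncate, then wrap)
```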
From 8d177395f327abe8cf9725b81a3b302fab9c856b Mon Sep 17 00:00:00 2001
From: kimikage
Date: Tue, 19 Nov 2019 20:05:59 +0900
Subject: [PATCH 05/10] Remove `testapprox` dependency and reduce the number
 of test cases for display (#135)

---
 test/fixed.jl  |  7 +++++--
 test/normed.jl | 16 +++++-----------
 2 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/test/fixed.jl b/test/fixed.jl
index 3ffab5ff..cc8ea449 100644
--- a/test/fixed.jl
+++ b/test/fixed.jl
@@ -84,8 +84,11 @@ end
 end
 
 @testset "testapprox" begin
-    for T in [Fixed{Int8,7}, Fixed{Int16,8}, Fixed{Int16,10}]
-        testapprox(T) # defined in ufixed.jl
+    @testset "approx $T" for T in [Fixed{Int8,7}, Fixed{Int16,8}, Fixed{Int16,10}]
+        xs = typemin(T):eps(T):typemax(T)-eps(T)
+        @test all(x -> x ≈ x + eps(T), xs)
+        @test all(x -> x + eps(T) ≈ x, xs)
+        @test !any(x -> x - eps(T) ≈ x + eps(T), xs)
     end
 end
 
diff --git a/test/normed.jl b/test/normed.jl
index 4369fe75..e674e487 100644
--- a/test/normed.jl
+++ b/test/normed.jl
@@ -230,18 +230,12 @@ end
     end
 end
 
-function testapprox(::Type{T}) where {T}
-    for x = typemin(T):eps(T):typemax(T)-eps(T)
-        y = x+eps(T)
-        @test x ≈ y
-        @test y ≈ x
-        @test !(x ≈ y+eps(T))
-    end
-end
-
 @testset "approx" begin
-    for T in FixedPointNumbers.UF
-        testapprox(T)
+    @testset "approx $T" for T in FixedPointNumbers.UF
+        xs = typemin(T):eps(T):typemax(T)-eps(T)
+        @test all(x -> x ≈ x + eps(T), xs)
+        @test all(x -> x + eps(T) ≈ x, xs)
+        @test !any(x -> x - eps(T) ≈ x + eps(T), xs)
     end
 end
 
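Spelled out for a single value, the property these rewritten tests assert is (an illustrative sketch, not part of the patch):

```julia
using FixedPointNumbers

x = N0f8(0.5)
@assert x ≈ x + eps(N0f8)                 # neighbors one eps apart compare ≈
@assert !(x - eps(N0f8) ≈ x + eps(N0f8))  # values two eps apart do not
```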
From 4341fe59898dfb5ed3342ac10d1d42ea5cf7f8a7 Mon Sep 17 00:00:00 2001
From: kimikage
Date: Tue, 15 Oct 2019 09:18:58 +0900
Subject: [PATCH 06/10] Improve accuracy of conversions to floating-point
 numbers

---
 src/normed.jl  | 128 ++++++++++++++++++++++++++++++++++++++++++++++++-
 test/normed.jl |  63 +++++++++++++++++++-------
 2 files changed, 173 insertions(+), 18 deletions(-)

diff --git a/src/normed.jl b/src/normed.jl
index e3375ab9..737657b4 100644
--- a/src/normed.jl
+++ b/src/normed.jl
@@ -105,11 +105,135 @@ rem(x::Float16, ::Type{T}) where {T <: Normed} = rem(Float32(x), T)  # avoid ove
 
 float(x::Normed) = convert(floattype(x), x)
 
-Base.BigFloat(x::Normed) = reinterpret(x)*(1/BigFloat(rawone(x)))
+macro f32(x::Float64) # just for hexadecimal floating-point literals
+    :(Float32($x))
+end
+macro exp2(n)
+    :(_exp2(Val($(esc(n)))))
+end
+_exp2(::Val{N}) where {N} = exp2(N)
+
+# for Julia v1.0, which does not fold `div_float` before inlining
+inv_rawone(x) = (@generated) ? (y = 1.0 / rawone(x); :($y)) : 1.0 / rawone(x)
+
 function (::Type{T})(x::Normed) where {T <: AbstractFloat}
-    y = reinterpret(x)*(one(rawtype(x))/convert(T, rawone(x)))
+    # The following optimization for constant division may cause rounding errors.
+    # y = reinterpret(x)*(one(rawtype(x))/convert(T, rawone(x)))
+    # Therefore, we use a simple form here.
+    # If you prefer speed over accuracy, consider using `scaledual` instead.
+    y = reinterpret(x) / convert(promote_type(T, floattype(x)), rawone(x))
     convert(T, y)  # needed for types like Float16 which promote arithmetic to Float32
 end
+
+function Base.Float16(x::Normed{Ti,f}) where {Ti <: Union{UInt8, UInt16, UInt32}, f}
+    f == 1 ? Float16(x.i) : Float16(Float32(x))
+end
+function Base.Float16(x::Normed{Ti,f}) where {Ti <: Union{UInt64, UInt128}, f}
+    f == 1 ? Float16(x.i) : Float16(Float64(x))
+end
+
+function Base.Float32(x::Normed{UInt8,f}) where f
+    f == 1 && return Float32(x.i)
+    f == 2 && return Float32(Int32(x.i) * 0x101) * @f32(0x550055p-32)
+    f == 3 && return Float32(Int32(x.i) * 0x00b) * @f32(0xd4c77bp-30)
+    f == 4 && return Float32(Int32(x.i) * 0x101) * @f32(0x110011p-32)
+    f == 5 && return Float32(Int32(x.i) * 0x003) * @f32(0xb02c0bp-30)
+    f == 6 && return Float32(Int32(x.i) * 0x049) * @f32(0xe40039p-36)
+    f == 7 && return Float32(Int32(x.i) * 0x01f) * @f32(0x852b5fp-35)
+    f == 8 && return Float32(Int32(x.i) * 0x155) * @f32(0xc0f0fdp-40)
+    0.0f0
+end
+function Base.Float32(x::Normed{UInt16,f}) where f
+    f32 = Float32(x.i)
+    f == 1 && return f32
+    f == 2 && return f32 * @f32(0x55p-8) + f32 * @f32(0x555555p-32)
+    f == 3 && return f32 * @f32(0x49p-9) + f32 * @f32(0x249249p-33)
+    f == 4 && return f32 * @f32(0x11p-8) + f32 * @f32(0x111111p-32)
+    f == 5 && return f32 * @f32(0x21p-10) + f32 * @f32(0x108421p-35)
+    f == 6 && return f32 * @f32(0x41p-12) + f32 * @f32(0x041041p-36)
+    f == 7 && return f32 * @f32(0x81p-14) + f32 * @f32(0x204081p-42)
+    f == 16 && return f32 * @f32(0x01p-16) + f32 * @f32(0x010001p-48)
+    Float32(x.i / rawone(x))
+end
+function Base.Float32(x::Normed{UInt32,f}) where f
+    f == 1 && return Float32(x.i)
+    i32 = unsafe_trunc(Int32, x.i)
+    if f == 32
+        rh, rl = Float32(i32>>>16), Float32((i32&0xFFFF)<<8 | (i32>>>24))
+        return muladd(rh, @f32(0x1p-16), rl * @f32(0x1p-40))
+    elseif f >= 25
+        rh, rl = Float32(i32>>>16),Float32(((i32&0xFFFF)<<14) + (i32>>>(f-14)))
+        return muladd(rh, Float32(@exp2(16-f)), rl * Float32(@exp2(-14-f)))
+    end
+    # FIXME: avoid the branch in native x86_64 (non-SIMD) codes
+    m = ifelse(i32 < 0, 0x1p32 * inv_rawone(x), 0.0)
+    Float32(muladd(Float64(i32), inv_rawone(x), m))
+end
+function Base.Float32(x::Normed{Ti,f}) where {Ti <: Union{UInt64, UInt128}, f}
+    f == 1 ? Float32(x.i) : Float32(Float64(x))
+end
+
+function Base.Float64(x::Normed{Ti,f}) where {Ti <: Union{UInt8, UInt16}, f}
+    Float64(Normed{UInt32,f}(x))
+end
+function Base.Float64(x::Normed{UInt32,f}) where f
+    f64 = Float64(x.i)
+    f == 1 && return f64
+    f == 2 && return (f64 * 0x040001) * 0x15555000015555p-72
+    f == 3 && return (f64 * 0x108421) * 0x11b6db76924929p-75
+    f == 4 && return (f64 * 0x010101) * 0x11000011000011p-72
+    f == 5 && return (f64 * 0x108421) * 0x04000002000001p-75
+    f == 6 && return (f64 * 0x09dfb1) * 0x1a56b8e38e6d91p-78
+    f == 7 && return (f64 * 0x000899) * 0x0f01480001e029p-70
+    f == 8 && return (f64 * 0x0a5a5b) * 0x18d300000018d3p-80
+    f == 9 && return (f64 * 0x001001) * 0x080381c8e3f201p-72
+    f == 10 && return (f64 * 0x100001) * 0x04010000000401p-80
+    f == 11 && return (f64 * 0x000009) * 0x0e3aaae3955639p-66
+    f == 12 && return (f64 * 0x0a8055) * 0x186246e46e4cfdp-84
+    f == 13 && return (f64 * 0x002001) * 0x10000004000001p-78
+    f == 14 && return (f64 * 0x03400d) * 0x13b13b14ec4ec5p-84
+    f == 15 && return (f64 * 0x000259) * 0x06d0c5a4f3a5e9p-75
+    f == 16 && return (f64 * 0x011111) * 0x00f000ff00fff1p-80
+    f == 18 && return (f64 * 0x0b06d1) * 0x17377445dd1231p-90
+    f == 19 && return (f64 * 0x080001) * 0x00004000000001p-76
+    f == 20 && return (f64 * 0x000101) * 0x0ff010ef10ff01p-80
+    f == 21 && return (f64 * 0x004001) * 0x01fff8101fc001p-84
+    f == 22 && return (f64 * 0x002945) * 0x18d0000000018dp-88
+    f == 23 && return (f64 * 0x044819) * 0x07794a23729429p-92
+    f == 27 && return (f64 * 0x000a21) * 0x0006518c7df9e1p-81
+    f == 28 && return (f64 * 0x00000d) * 0x13b13b14ec4ec5p-84
+    f == 30 && return (f64 * 0x001041) * 0x00fc003f03ffc1p-90
+    f == 32 && return (f64 * 0x010101) * 0x00ff0000ffff01p-96
+    f64 / rawone(x)
+end
+function Base.Float64(x::Normed{UInt64,f}) where f
+    f == 1 && return Float64(x.i)
+    if f >= 53
+        rh = Float64(unsafe_trunc(Int64, x.i >> 16)) * @exp2(16-f) # upper 48 bits
+        rl = Float64(unsafe_trunc(Int32, x.i&0xFFFF)) * @exp2(-f) # lower 16 bits
+        return rh + muladd(rh, @exp2(-f), rl)
+    end
+    x.i / rawone(x)
+end
+function Base.Float64(x::Normed{UInt128,f}) where f
+    f == 1 && return Float64(x.i)
+    ih, il = unsafe_trunc(Int64, x.i>>64), unsafe_trunc(Int64, x.i)
+    rh = Float64(ih>>>16) * @exp2(f <= 53 ? 80 : 80 - f) # upper 48 bits
+    km = @exp2(f <= 53 ? 48 : 48 - f) # for middle 32 bits
+    rm = Float64(unsafe_trunc(Int32, ih&0xFFFF)) * (0x1p16 * km) +
+         Float64(unsafe_trunc(Int32, il>>>48)) * km
+    rl = Float64(il&0xFFFFFFFFFFFF) * @exp2(f <= 53 ? 0 : -f) # lower 48 bits
+    if f <= 53
+        return (rh + (rm + rl)) / unsafe_trunc(Int64, rawone(x))
+    elseif f < 76
+        return rh + (rm + muladd(rh, @exp2(-f), rl))
+    else
+        return rh + (rm + rl)
+    end
+end
+
+Base.BigFloat(x::Normed) = reinterpret(x)*(1/BigFloat(rawone(x)))
+
 Base.Bool(x::Normed) = x == zero(x) ? false : true
 Base.Integer(x::Normed) = convert(Integer, x*1.0)
 (::Type{T})(x::Normed) where {T <: Integer} = convert(T, x*(1/oneunit(T)))
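To see why the paired-constant products above can be exact, here is a brute-force check of the `f == 2` case of the `Normed{UInt8,f}` method against correctly rounded `BigFloat` division (a sketch mirroring the tests added in the second half of this patch; the constants are those from the method above):

```julia
for i in Int32(0):Int32(255)   # all raw values of Normed{UInt8,2}; rawone == 3
    exact = Float32(i / BigFloat(3))
    fast  = Float32(i * 0x101) * Float32(0x550055p-32)
    @assert fast == exact      # the two-constant product matches the correctly rounded result
end
```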
diff --git a/test/normed.jl b/test/normed.jl
index e674e487..6fb4d125 100644
--- a/test/normed.jl
+++ b/test/normed.jl
@@ -112,9 +112,7 @@ end
 
                 @test reinterpret(U(zero(Tf))) == 0x0
 
-                # TODO: fix issue #129
-                # input_typemax = Tf(typemax(U))
-                input_typemax = Tf(BigFloat(typemax(T)) / r)
+                input_typemax = Tf(typemax(U))
                 if isinf(input_typemax)
                     @test reinterpret(U(floatmax(Tf))) >= round(T, floatmax(Tf))
                 else
@@ -136,6 +134,46 @@ end
     @test N0f32(Float32(0x0.800001p-32)) == eps(N0f32)
 end
 
+@testset "conversions to float" begin
+    x = N0f8(0.3)
+    for T in (Float16, Float32, Float64, BigFloat)
+        y = convert(T, x)
+        @test isa(y, T)
+    end
+
+    for Tf in (Float16, Float32, Float64)
+        @testset "$Tf(::Normed{$Ti})" for Ti in (UInt8, UInt16)
+            @testset "$Tf(::Normed{$Ti,$f})" for f = 1:(sizeof(Ti)*8)
+                T = Normed{Ti,f}
+                float_err = 0.0
+                for i = typemin(Ti):typemax(Ti)
+                    f_expected = Tf(i / BigFloat(FixedPointNumbers.rawone(T)))
+                    isinf(f_expected) && break # for Float16(::Normed{UInt16,1})
+                    f_actual = Tf(reinterpret(T, i))
+                    float_err += abs(f_actual - f_expected)
+                end
+                @test float_err == 0.0
+            end
+        end
+        @testset "$Tf(::Normed{$Ti})" for Ti in (UInt32, UInt64, UInt128)
+            @testset "$Tf(::Normed{$Ti,$f})" for f = 1:(sizeof(Ti)*8)
+                T = Normed{Ti,f}
+                error_count = 0
+                for i in vcat(Ti(0x00):Ti(0xFF), (typemax(Ti)-0xFF):typemax(Ti))
+                    f_expected = Tf(i / BigFloat(FixedPointNumbers.rawone(T)))
+                    isinf(f_expected) && break # for Float16() and Float32()
+                    f_actual = Tf(reinterpret(T, i))
+                    f_actual == f_expected && continue
+                    f_actual == prevfloat(f_expected) && continue
+                    f_actual == nextfloat(f_expected) && continue
+                    error_count += 1
+                end
+                @test error_count == 0
+            end
+        end
+    end
+end
+
 @testset "modulus" begin
     @test N0f8(0.2) % N0f8 === N0f8(0.2)
     @test N2f14(1.2) % N0f16 === N0f16(0.20002)
@@ -305,25 +343,26 @@ end
 @testset "scaledual" begin
     a = rand(UInt8, 10)
     af8 = reinterpret(N0f8, a)
     b = 0.5
 
+    # LHSs of the following `@test`s with `af8` can be slightly more accurate
     bd, eld = scaledual(b, af8[1])
-    @test b*af8[1] == bd*eld
+    @test b*af8[1] ≈ bd*eld rtol=1e-15
     bd, ad = scaledual(b, af8)
-    @test b*af8 == bd*ad
+    @test b*af8 ≈ bd*ad rtol=1e-15
 
     bd, eld = scaledual(b, a[1])
     @test b*a[1] == bd*eld
     bd, ad = scaledual(b, a)
     @test b*a == bd*ad
 
     bd, eld = scaledual(Float64, af8[1])
-    @test 1.0*af8[1] == bd*eld
+    @test 1.0*af8[1] ≈ bd*eld rtol=1e-15
    bd, ad = scaledual(Float64, af8)
-    @test 1.0*af8 == bd*ad
+    @test 1.0*af8 ≈ bd*ad rtol=1e-15
 
     bd, eld = scaledual(Float64, a[1])
     @test 1.0*a[1] == bd*eld
     bd, ad = scaledual(Float64, a)
-    @test 1.0*a == bd*ad
+    @test 1.0*a == bd*ad
 end
 
 @testset "reductions" begin
     a = N0f8[0.2, 0.8, 0.4]
     acmp = Float64(a[1])*Float64(a[2])*Float64(a[3])
     @test sum(a) == 1.4
     @test sum(a, dims=1) == [1.4]
     @test prod(a) == acmp
     @test prod(a, dims=1) == [acmp]
 end
 
-@testset "convert" begin
-    x = N0f8(0.3)
-    for T in (Float16, Float32, Float64, BigFloat)
-        y = convert(T, x)
-        @test isa(y, T)
-    end
-end
-
 @testset "rand" begin
     for T in (Normed{UInt8,8}, Normed{UInt8,6},
               Normed{UInt16,16}, Normed{UInt16,14},
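The upshot, checkable by brute force for the 8-bit type (a sketch in the spirit of the new tests, not part of the patch):

```julia
using FixedPointNumbers

for i in 0x00:0xff
    x = reinterpret(N0f8, i)
    @assert Float32(x) == Float32(i / BigFloat(255))  # correctly rounded for every raw value
end
```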
v"0.7.0-beta2.199" && __precompile__() module FixedPointNumbers -using Base: reducedim_initarray - import Base: ==, <, <=, -, +, *, /, ~, isapprox, convert, promote_rule, show, isinteger, abs, decompose, isnan, isinf, isfinite, @@ -154,27 +152,12 @@ sizeof(::Type{T}) where {T <: FixedPoint} = sizeof(rawtype(T)) # Promotions for reductions const Treduce = Float64 -if isdefined(Base, :r_promote) - # Julia v0.6 - Base.r_promote(::typeof(+), x::FixedPoint{T}) where {T} = Treduce(x) - Base.r_promote(::typeof(*), x::FixedPoint{T}) where {T} = Treduce(x) - Base.reducedim_init(f::typeof(identity), - op::typeof(+), - A::AbstractArray{T}, region) where {T <: FixedPoint} = - Base.reducedim_initarray(A, region, zero(Treduce)) - Base.reducedim_init(f::typeof(identity), - op::typeof(*), - A::AbstractArray{T}, region) where {T <: FixedPoint} = - Base.reducedim_initarray(A, region, oneunit(Treduce)) -else - # Julia v0.7 - Base.add_sum(x::FixedPoint, y::FixedPoint) = Treduce(x) + Treduce(y) - Base.reduce_empty(::typeof(Base.add_sum), ::Type{F}) where {F<:FixedPoint} = zero(Treduce) - Base.reduce_first(::typeof(Base.add_sum), x::FixedPoint) = Treduce(x) - Base.mul_prod(x::FixedPoint, y::FixedPoint) = Treduce(x) * Treduce(y) - Base.reduce_empty(::typeof(Base.mul_prod), ::Type{F}) where {F<:FixedPoint} = one(Treduce) - Base.reduce_first(::typeof(Base.mul_prod), x::FixedPoint) = Treduce(x) -end +Base.add_sum(x::FixedPoint, y::FixedPoint) = Treduce(x) + Treduce(y) +Base.reduce_empty(::typeof(Base.add_sum), ::Type{F}) where {F<:FixedPoint} = zero(Treduce) +Base.reduce_first(::typeof(Base.add_sum), x::FixedPoint) = Treduce(x) +Base.mul_prod(x::FixedPoint, y::FixedPoint) = Treduce(x) * Treduce(y) +Base.reduce_empty(::typeof(Base.mul_prod), ::Type{F}) where {F<:FixedPoint} = one(Treduce) +Base.reduce_first(::typeof(Base.mul_prod), x::FixedPoint) = Treduce(x) for f in (:div, :fld, :fld1) From 49627e217c6fe125a0d2bba3ddb2f5432d3ac1cb Mon Sep 17 00:00:00 2001 From: kimikage Date: Sat, 30 Nov 2019 22:54:35 +0900 Subject: [PATCH 08/10] Fix performance regression with inlining failure on Julia v1.3.0 (Fixes #144) (#145) Julia 1.3.0 generates more redundant intermediate codes which are eventually optimized. This trivial change reduces the redundant intermediate codes to promote inlining. 
From 49627e217c6fe125a0d2bba3ddb2f5432d3ac1cb Mon Sep 17 00:00:00 2001
From: kimikage
Date: Sat, 30 Nov 2019 22:54:35 +0900
Subject: [PATCH 08/10] Fix performance regression with inlining failure on
 Julia v1.3.0 (Fixes #144) (#145)

Julia 1.3.0 generates more redundant intermediate code, which is eventually
optimized away. This trivial change reduces that redundant intermediate code
to promote inlining.
---
 src/normed.jl | 62 +++++++++++++++++++++++++--------------------------
 1 file changed, 31 insertions(+), 31 deletions(-)

diff --git a/src/normed.jl b/src/normed.jl
index 737657b4..f0fa8aad 100644
--- a/src/normed.jl
+++ b/src/normed.jl
@@ -159,10 +159,10 @@ function Base.Float32(x::Normed{UInt32,f}) where f
     f == 1 && return Float32(x.i)
     i32 = unsafe_trunc(Int32, x.i)
     if f == 32
-        rh, rl = Float32(i32>>>16), Float32((i32&0xFFFF)<<8 | (i32>>>24))
+        rh, rl = Float32(i32>>>0x10), Float32((i32&0xFFFF)<<0x8 | i32>>>0x18)
         return muladd(rh, @f32(0x1p-16), rl * @f32(0x1p-40))
     elseif f >= 25
-        rh, rl = Float32(i32>>>16),Float32(((i32&0xFFFF)<<14) + (i32>>>(f-14)))
+        rh, rl = Float32(i32>>>0x10), Float32((i32&0xFFFF)<<0xE + i32>>>UInt8(f-14))
         return muladd(rh, Float32(@exp2(16-f)), rl * Float32(@exp2(-14-f)))
     end
     # FIXME: avoid the branch in native x86_64 (non-SIMD) codes
@@ -179,37 +179,37 @@ end
 function Base.Float64(x::Normed{UInt32,f}) where f
     f64 = Float64(x.i)
     f == 1 && return f64
-    f == 2 && return (f64 * 0x040001) * 0x15555000015555p-72
-    f == 3 && return (f64 * 0x108421) * 0x11b6db76924929p-75
-    f == 4 && return (f64 * 0x010101) * 0x11000011000011p-72
-    f == 5 && return (f64 * 0x108421) * 0x04000002000001p-75
-    f == 6 && return (f64 * 0x09dfb1) * 0x1a56b8e38e6d91p-78
-    f == 7 && return (f64 * 0x000899) * 0x0f01480001e029p-70
-    f == 8 && return (f64 * 0x0a5a5b) * 0x18d300000018d3p-80
-    f == 9 && return (f64 * 0x001001) * 0x080381c8e3f201p-72
-    f == 10 && return (f64 * 0x100001) * 0x04010000000401p-80
-    f == 11 && return (f64 * 0x000009) * 0x0e3aaae3955639p-66
-    f == 12 && return (f64 * 0x0a8055) * 0x186246e46e4cfdp-84
-    f == 13 && return (f64 * 0x002001) * 0x10000004000001p-78
-    f == 14 && return (f64 * 0x03400d) * 0x13b13b14ec4ec5p-84
-    f == 15 && return (f64 * 0x000259) * 0x06d0c5a4f3a5e9p-75
-    f == 16 && return (f64 * 0x011111) * 0x00f000ff00fff1p-80
-    f == 18 && return (f64 * 0x0b06d1) * 0x17377445dd1231p-90
-    f == 19 && return (f64 * 0x080001) * 0x00004000000001p-76
-    f == 20 && return (f64 * 0x000101) * 0x0ff010ef10ff01p-80
-    f == 21 && return (f64 * 0x004001) * 0x01fff8101fc001p-84
-    f == 22 && return (f64 * 0x002945) * 0x18d0000000018dp-88
-    f == 23 && return (f64 * 0x044819) * 0x07794a23729429p-92
-    f == 27 && return (f64 * 0x000a21) * 0x0006518c7df9e1p-81
-    f == 28 && return (f64 * 0x00000d) * 0x13b13b14ec4ec5p-84
-    f == 30 && return (f64 * 0x001041) * 0x00fc003f03ffc1p-90
-    f == 32 && return (f64 * 0x010101) * 0x00ff0000ffff01p-96
+    f == 2 && return (f64 * 0x040001p0) * 0x15555000015555p-72
+    f == 3 && return (f64 * 0x108421p0) * 0x11b6db76924929p-75
+    f == 4 && return (f64 * 0x010101p0) * 0x11000011000011p-72
+    f == 5 && return (f64 * 0x108421p0) * 0x04000002000001p-75
+    f == 6 && return (f64 * 0x09dfb1p0) * 0x1a56b8e38e6d91p-78
+    f == 7 && return (f64 * 0x000899p0) * 0x0f01480001e029p-70
+    f == 8 && return (f64 * 0x0a5a5bp0) * 0x18d300000018d3p-80
+    f == 9 && return (f64 * 0x001001p0) * 0x080381c8e3f201p-72
+    f == 10 && return (f64 * 0x100001p0) * 0x04010000000401p-80
+    f == 11 && return (f64 * 0x000009p0) * 0x0e3aaae3955639p-66
+    f == 12 && return (f64 * 0x0a8055p0) * 0x186246e46e4cfdp-84
+    f == 13 && return (f64 * 0x002001p0) * 0x10000004000001p-78
+    f == 14 && return (f64 * 0x03400dp0) * 0x13b13b14ec4ec5p-84
+    f == 15 && return (f64 * 0x000259p0) * 0x06d0c5a4f3a5e9p-75
+    f == 16 && return (f64 * 0x011111p0) * 0x00f000ff00fff1p-80
+    f == 18 && return (f64 * 0x0b06d1p0) * 0x17377445dd1231p-90
+    f == 19 && return (f64 * 0x080001p0) * 0x00004000000001p-76
+    f == 20 && return (f64 * 0x000101p0) * 0x0ff010ef10ff01p-80
+    f == 21 && return (f64 * 0x004001p0) * 0x01fff8101fc001p-84
+    f == 22 && return (f64 * 0x002945p0) * 0x18d0000000018dp-88
+    f == 23 && return (f64 * 0x044819p0) * 0x07794a23729429p-92
+    f == 27 && return (f64 * 0x000a21p0) * 0x0006518c7df9e1p-81
+    f == 28 && return (f64 * 0x00000dp0) * 0x13b13b14ec4ec5p-84
+    f == 30 && return (f64 * 0x001041p0) * 0x00fc003f03ffc1p-90
+    f == 32 && return (f64 * 0x010101p0) * 0x00ff0000ffff01p-96
     f64 / rawone(x)
 end
 function Base.Float64(x::Normed{UInt64,f}) where f
     f == 1 && return Float64(x.i)
     if f >= 53
-        rh = Float64(unsafe_trunc(Int64, x.i >> 16)) * @exp2(16-f) # upper 48 bits
+        rh = Float64(unsafe_trunc(Int64, x.i>>0x10)) * @exp2(16-f) # upper 48 bits
         rl = Float64(unsafe_trunc(Int32, x.i&0xFFFF)) * @exp2(-f) # lower 16 bits
         return rh + muladd(rh, @exp2(-f), rl)
     end
@@ -217,11 +217,11 @@ end
 function Base.Float64(x::Normed{UInt128,f}) where f
     f == 1 && return Float64(x.i)
-    ih, il = unsafe_trunc(Int64, x.i>>64), unsafe_trunc(Int64, x.i)
-    rh = Float64(ih>>>16) * @exp2(f <= 53 ? 80 : 80 - f) # upper 48 bits
+    ih, il = unsafe_trunc(Int64, x.i>>0x40), unsafe_trunc(Int64, x.i)
+    rh = Float64(ih>>>0x10) * @exp2(f <= 53 ? 80 : 80 - f) # upper 48 bits
     km = @exp2(f <= 53 ? 48 : 48 - f) # for middle 32 bits
     rm = Float64(unsafe_trunc(Int32, ih&0xFFFF)) * (0x1p16 * km) +
-         Float64(unsafe_trunc(Int32, il>>>48)) * km
+         Float64(unsafe_trunc(Int32, il>>>0x30)) * km
     rl = Float64(il&0xFFFFFFFFFFFF) * @exp2(f <= 53 ? 0 : -f) # lower 48 bits
     if f <= 53
         return (rh + (rm + rl)) / unsafe_trunc(Int64, rawone(x))
     elseif f < 76
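One way to watch for this kind of regression (a sketch; it assumes the BenchmarkTools package, which this patch does not use):

```julia
using BenchmarkTools, FixedPointNumbers

a = reinterpret(N0f32, rand(UInt32, 10^6))
@btime Float64.($a)  # should not regress by an order of magnitude across Julia versions
```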
From f647feb0d48e49458fad11c965da637194a168d1 Mon Sep 17 00:00:00 2001
From: Tim Holy
Date: Thu, 12 Dec 2019 05:28:00 -0600
Subject: [PATCH 09/10] Precompile methods for common types (#148)

---
 .gitignore               |  1 +
 .travis.yml              |  2 +-
 src/FixedPointNumbers.jl |  5 +++--
 src/precompile.jl        | 30 ++++++++++++++++++++++++++++++
 4 files changed, 35 insertions(+), 3 deletions(-)
 create mode 100644 .gitignore
 create mode 100644 src/precompile.jl

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..ba39cc53
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+Manifest.toml
diff --git a/.travis.yml b/.travis.yml
index a22f5779..5c8dca12 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,7 +4,7 @@ os:
   - osx
 julia:
   - 1.0
-  - 1.1
+  - 1.2
   - nightly
 notifications:
   email: false
diff --git a/src/FixedPointNumbers.jl b/src/FixedPointNumbers.jl
index ccd86e50..60ec762d 100644
--- a/src/FixedPointNumbers.jl
+++ b/src/FixedPointNumbers.jl
@@ -1,5 +1,3 @@
-VERSION < v"0.7.0-beta2.199" && __precompile__()
-
 module FixedPointNumbers
 
 import Base: ==, <, <=, -, +, *, /, ~, isapprox,
@@ -199,4 +197,7 @@ end
 rand(::Type{T}) where {T <: FixedPoint} = reinterpret(T, rand(rawtype(T)))
 rand(::Type{T}, sz::Dims) where {T <: FixedPoint} = reinterpret(T, rand(rawtype(T), sz))
 
+include("precompile.jl")
+_precompile_()
+
 end # module
diff --git a/src/precompile.jl b/src/precompile.jl
new file mode 100644
index 00000000..178be83f
--- /dev/null
+++ b/src/precompile.jl
@@ -0,0 +1,30 @@
+function _precompile_()
+    ccall(:jl_generating_output, Cint, ()) == 1 || return nothing
+    normedtypes = (N0f8, N0f16)   # precompiled Normed types
+    realtypes = (Float16, Float32, Float64, Int)   # types for mixed Normed/Real operations
+    for T in normedtypes
+        for f in (+, -, abs, eps, rand)   # unary operations
+            @assert precompile(Tuple{typeof(f),T})
+        end
+        @assert precompile(Tuple{typeof(rand),T,Tuple{Int}})
+        @assert precompile(Tuple{typeof(rand),T,Tuple{Int,Int}})
+        for f in (trunc, floor, ceil, round)   # rounding operations
+            @assert precompile(Tuple{typeof(f),T})
+            @assert precompile(Tuple{typeof(f),Type{Int},T})
+        end
+        for f in (+, -, *, /, <, <=, ==)   # binary operations
+            @assert precompile(Tuple{typeof(f),T,T})
+            for S in realtypes
+                @assert precompile(Tuple{typeof(f),T,S})
+                @assert precompile(Tuple{typeof(f),S,T})
+            end
+        end
+        # conversions
+        for S in realtypes
+            @assert precompile(Tuple{Type{T},S})
+            @assert precompile(Tuple{Type{S},T})
+            @assert precompile(Tuple{typeof(convert),Type{T},S})
+            @assert precompile(Tuple{typeof(convert),Type{S},T})
+        end
+    end
+end
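A rough way to observe the effect (a sketch; timings are machine- and session-dependent): in a fresh Julia session, the first use of a precompiled signature should spend noticeably less time in compilation.

```julia
using FixedPointNumbers  # in a fresh session
@time N0f8(0.5) + 0.25   # +(::N0f8, ::Float64) is among the precompiled signatures
```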
From 7ad0f0c460961024191f672332fcf3d7cef655d1 Mon Sep 17 00:00:00 2001
From: Tim Holy
Date: Mon, 16 Dec 2019 18:10:16 -0600
Subject: [PATCH 10/10] Version 0.7 (#149)

Since it will likely be a while before we can release 1.0, how about a 0.7
release? I'm bumping the minor version because it is breaking (`scaledual`,
and the improved accuracy of conversions).
---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index 60fd45cd..4aecfcf2 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "FixedPointNumbers"
 uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93"
-version = "0.6.1"
+version = "0.7.0"
 
 [compat]
 julia = "1"