From f622800400208ecea446c3deac37d16a87c3a8e3 Mon Sep 17 00:00:00 2001
From: Tim Holy
Date: Fri, 1 Nov 2019 03:46:04 -0500
Subject: [PATCH 01/10] Change `scaledual` to match original intent. Fixes
 #130. (#132)

---
 src/FixedPointNumbers.jl | 23 +++++++++++++++--------
 test/normed.jl           | 40 +++++++++++++++++++---------------------
 2 files changed, 34 insertions(+), 29 deletions(-)

diff --git a/src/FixedPointNumbers.jl b/src/FixedPointNumbers.jl
index b008f4be..7f0c800a 100644
--- a/src/FixedPointNumbers.jl
+++ b/src/FixedPointNumbers.jl
@@ -188,14 +188,21 @@ for f in (:rem, :mod, :mod1, :min, :max)
     end
 end
 
-# When multiplying by a float, reduce two multiplies to one.
-# Particularly useful for arrays.
-scaledual(Tdual::Type, x) = oneunit(Tdual), x
-scaledual(b::Tdual, x) where {Tdual <: Number} = b, x
-scaledual(Tdual::Type, x::Union{T,AbstractArray{T}}) where {T <: FixedPoint} =
-    convert(Tdual, 1/oneunit(T)), reinterpret(rawtype(T), x)
-scaledual(b::Tdual, x::Union{T,AbstractArray{T}}) where {Tdual <: Number,T <: FixedPoint} =
-    convert(Tdual, b/oneunit(T)), reinterpret(rawtype(T), x)
+"""
+    sd, ad = scaledual(s::Number, a)
+
+Return `sd` and `ad` such that `sd * ad == s * a`.
+When `a` is an array of FixedPoint numbers, `sd*ad` might be faster to compute than `s*a`.
+"""
+scaledual(b::Number, x::Union{Number,AbstractArray{<:Number}}) = b, x
+scaledual(b::Number, x::FixedPoint) = b/rawone(x), reinterpret(x)
+scaledual(b::Number, x::AbstractArray{T}) where T <: FixedPoint =
+    b/rawone(T), reinterpret(rawtype(T), x)
+
+scaledual(::Type{Tdual}, x::Union{Number,AbstractArray{<:Number}}) where Tdual = oneunit(Tdual), x
+scaledual(::Type{Tdual}, x::FixedPoint) where Tdual = convert(Tdual, 1/rawone(x)), reinterpret(x)
+scaledual(::Type{Tdual}, x::AbstractArray{T}) where {Tdual, T <: FixedPoint} =
+    convert(Tdual, 1/rawone(T)), reinterpret(rawtype(T), x)
 
 @noinline function throw_converterror(::Type{T}, x) where {T <: FixedPoint}
     n = 2^(8*sizeof(T))
diff --git a/test/normed.jl b/test/normed.jl
index 185fde44..4513fc67 100644
--- a/test/normed.jl
+++ b/test/normed.jl
@@ -274,32 +274,30 @@ end
     @test eval(Meta.parse(str)) == x
 end
 
-# scaledual
-function generic_scale!(C::AbstractArray, X::AbstractArray, s::Number)
-    length(C) == length(X) || error("C must be the same length as X")
-    for i = 1:length(X)
-        @inbounds C[i] = X[i]*s
-    end
-    C
-end
-
 @testset "scaledual" begin
     a = rand(UInt8, 10)
-    rfloat = similar(a, Float32)
-    rfixed = similar(rfloat)
     af8 = reinterpret(N0f8, a)
-
     b = 0.5
+
     bd, eld = scaledual(b, af8[1])
-    @assert b*a[1] == bd*eld
-
-    b, ad = scaledual(0.5, a)
-    @test b == 0.5
-    @test ad == a
-    b, ad = scaledual(0.5, ad)
-    generic_scale!(rfloat, a, 0.5)
-    generic_scale!(rfixed, ad, b)
-    @test rfloat == rfixed
+    @test b*af8[1] == bd*eld
+    bd, ad = scaledual(b, af8)
+    @test b*af8 == bd*ad
+
+    bd, eld = scaledual(b, a[1])
+    @test b*a[1] == bd*eld
+    bd, ad = scaledual(b, a)
+    @test b*a == bd*ad
+
+    bd, eld = scaledual(Float64, af8[1])
+    @test 1.0*af8[1] == bd*eld
+    bd, ad = scaledual(Float64, af8)
+    @test 1.0*af8 == bd*ad
+
+    bd, eld = scaledual(Float64, a[1])
+    @test 1.0*a[1] == bd*eld
+    bd, ad = scaledual(Float64, a)
+    @test 1.0*a == bd*ad
 end
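As a usage sketch of the API this patch settles on (illustrative only, not part of the patch; the array `A` and scale `s` are hypothetical):

```julia
using FixedPointNumbers

A = reinterpret(N0f8, rand(UInt8, 1000))  # N0f8 data backed by raw UInt8s
s = 0.5
sd, Ad = scaledual(s, A)   # Ad is the raw UInt8 array; sd absorbs both s and 1/rawone(N0f8)
@assert sd .* Ad ≈ s .* A  # one float multiply per element instead of a divide plus a multiply
```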
From 28860708c032f7d2111043a4034339b378b90b1e Mon Sep 17 00:00:00 2001
From: kimikage
Date: Sat, 26 Oct 2019 23:26:58 +0900
Subject: [PATCH 02/10] Fix comments

---
 src/FixedPointNumbers.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/FixedPointNumbers.jl b/src/FixedPointNumbers.jl
index 7f0c800a..eb75e2c8 100644
--- a/src/FixedPointNumbers.jl
+++ b/src/FixedPointNumbers.jl
@@ -15,7 +15,7 @@ import Base: ==, <, <=, -, +, *, /, ~, isapprox,
 using Base: @pure
 
 # T => BaseType
-# f => Number of Bytes reserved for fractional part
+# f => Number of bits reserved for fractional part
 abstract type FixedPoint{T <: Integer, f} <: Real end
 
@@ -25,7 +25,7 @@ export
     Normed,
     floattype,
 # "special" typealiases
-    # Q and U typealiases are exported in separate source files
+    # Q and N typealiases are exported in separate source files
 # Functions
     scaledual
From 547eb9558f484a062f5654df63140c46f55d4505 Mon Sep 17 00:00:00 2001
From: kimikage
Date: Fri, 1 Nov 2019 09:59:33 +0900
Subject: [PATCH 03/10] Improve accuracy of conversions from floating-point
 numbers

This fixes a problem with the input range checking.
---
 src/normed.jl  | 62 ++++++++++++++++++++++++++++++++++++++------------
 test/normed.jl | 34 +++++++++++++++++++++++++++
 2 files changed, 82 insertions(+), 14 deletions(-)

diff --git a/src/normed.jl b/src/normed.jl
index bf1311c0..ec92ee5f 100644
--- a/src/normed.jl
+++ b/src/normed.jl
@@ -46,22 +46,56 @@ function Normed{T,f}(x::Normed{T2}) where {T <: Unsigned,T2 <: Unsigned,f}
 end
 N0f16(x::N0f8) = reinterpret(N0f16, convert(UInt16, 0x0101*reinterpret(x)))
 
-(::Type{U})(x::Real) where {U <: Normed} = _convert(U, rawtype(U), x)
-
-function _convert(::Type{U}, ::Type{T}, x) where {U <: Normed,T}
-    y = round(widen1(rawone(U))*x)
-    (0 <= y) & (y <= typemax(T)) || throw_converterror(U, x)
-    U(_unsafe_trunc(T, y), 0)
+(::Type{U})(x::Real) where {U <: Normed} = _convert(U, x)
+
+function _convert(::Type{U}, x) where {T, f, U <: Normed{T,f}}
+    if T == UInt128 # for UInt128, we can't widen
+        # the upper limit is not exact
+        (0 <= x) & (x <= (typemax(T)/rawone(U))) || throw_converterror(U, x)
+        y = round(rawone(U)*x)
+    else
+        y = round(widen1(rawone(U))*x)
+        (0 <= y) & (y <= typemax(T)) || throw_converterror(U, x)
+    end
+    reinterpret(U, _unsafe_trunc(T, y))
 end
 # Prevent overflow (https://discourse.julialang.org/t/saving-greater-than-8-bit-images/6057)
-_convert(::Type{U}, ::Type{T}, x::Float16) where {U <: Normed,T} =
-    _convert(U, T, Float32(x))
-_convert(::Type{U}, ::Type{UInt128}, x::Float16) where {U <: Normed} =
-    _convert(U, UInt128, Float32(x))
-function _convert(::Type{U}, ::Type{UInt128}, x) where {U <: Normed}
-    y = round(rawone(U)*x)   # for UInt128, we can't widen
-    (0 <= y) & (y <= typemax(UInt128)) & (x <= Float64(typemax(U))) || throw_converterror(U, x)
-    U(_unsafe_trunc(UInt128, y), 0)
+function _convert(::Type{U}, x::Float16) where {T, f, U <: Normed{T,f}}
+    if Float16(typemax(T)/rawone(U)) > Float32(typemax(T)/rawone(U))
+        x == Float16(typemax(T)/rawone(U)) && return typemax(U)
+    end
+    return _convert(U, Float32(x))
+end
+function _convert(::Type{U}, x::Tf) where {T, f, U <: Normed{T,f}, Tf <: Union{Float32, Float64}}
+    if T == UInt128 && f == 53
+        0 <= x <= Tf(3.777893186295717e22) || throw_converterror(U, x)
+    else
+        0 <= x <= Tf((typemax(T)-rawone(U))/rawone(U)+1) || throw_converterror(U, x)
+    end
+
+    significand_bits = Tf == Float64 ? 52 : 23
+    if f <= (significand_bits + 1) && sizeof(T) * 8 < significand_bits
+        return reinterpret(U, unsafe_trunc(T, round(rawone(U) * x)))
+    end
+    # cf. the implementation of `frexp`
+    Tw = f < sizeof(T) * 8 ? T : widen1(T)
+    bits = sizeof(Tw) * 8 - 1
+    xu = reinterpret(Tf == Float64 ? UInt64 : UInt32, x)
+    k = Int(xu >> significand_bits)
+    k == 0 && return zero(U) # neglect subnormal numbers
+    significand = xu | (one(xu) << significand_bits)
+    yh = unsafe_trunc(Tw, significand) << (bits - significand_bits)
+    exponent_bias = Tf == Float64 ? 1023 : 127
+    ex = exponent_bias - k + bits - f
+    yi = bits >= f ? yh - (yh >> f) : yh
+    if ex <= 0
+        ex == 0 && return reinterpret(U, unsafe_trunc(T, yi))
+        ex != -1 || signbit(signed(yi)) && return typemax(U)
+        return reinterpret(U, unsafe_trunc(T, yi + yi))
+    end
+    ex > bits && return reinterpret(U, ex == bits + 1 ? one(T) : zero(T))
+    yi += one(Tw)<<((ex - 1) & bits) # RoundNearestTiesUp
+    return reinterpret(U, unsafe_trunc(T, yi >> (ex & bits)))
 end
 
 rem(x::T, ::Type{T}) where {T <: Normed} = x
diff --git a/test/normed.jl b/test/normed.jl
index 4513fc67..4369fe75 100644
--- a/test/normed.jl
+++ b/test/normed.jl
@@ -102,6 +102,40 @@ end
     @test convert(Normed{UInt16,7}, Normed{UInt8,7}(0.504)) === Normed{UInt16,7}(0.504)
 end
 
+@testset "conversion from float" begin
+    # issue 102
+    for T in (UInt8, UInt16, UInt32, UInt64, UInt128)
+        for Tf in (Float16, Float32, Float64)
+            @testset "Normed{$T,$f}(::$Tf)" for f = 1:sizeof(T)*8
+                U = Normed{T,f}
+                r = FixedPointNumbers.rawone(U)
+
+                @test reinterpret(U(zero(Tf))) == 0x0
+
+                # TODO: fix issue #129
+                # input_typemax = Tf(typemax(U))
+                input_typemax = Tf(BigFloat(typemax(T)) / r)
+                if isinf(input_typemax)
+                    @test reinterpret(U(floatmax(Tf))) >= round(T, floatmax(Tf))
+                else
+                    @test reinterpret(U(input_typemax)) >= (typemax(T)>>1) # overflow check
+                end
+
+                input_upper = Tf(BigFloat(typemax(T)) / r, RoundDown)
+                isinf(input_upper) && continue # for Julia v0.7
+                @test reinterpret(U(input_upper)) == T(min(round(BigFloat(input_upper) * r), typemax(T)))
+
+                input_exp2 = Tf(exp2(sizeof(T) * 8 - f))
+                isinf(input_exp2) && continue
+                @test reinterpret(U(input_exp2)) == T(input_exp2) * r
+            end
+        end
+    end
+    @test N0f32(Float32(0x0.7FFFFFp-32)) == zero(N0f32)
+    @test N0f32(Float32(0x0.800000p-32)) <= eps(N0f32) # should be zero in RoundNearest mode
+    @test N0f32(Float32(0x0.800001p-32)) == eps(N0f32)
+end
+
 @testset "modulus" begin
     @test N0f8(0.2) % N0f8 === N0f8(0.2)
     @test N2f14(1.2) % N0f16 === N0f16(0.20002)
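To make the intended semantics concrete, a small sketch (not part of the patch; values chosen for illustration):

```julia
using FixedPointNumbers

# In-range inputs round to the nearest representable value:
@assert reinterpret(N0f8(0.998)) === round(UInt8, 0.998 * 255)
@assert N0f8(1.0) === reinterpret(N0f8, 0xff)
# Out-of-range inputs such as N0f8(1.01) now reliably reach throw_converterror.
```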
From ee5bd547bf73cf9ec6b976088c0c2565630f4ea5 Mon Sep 17 00:00:00 2001
From: kimikage
Date: Thu, 7 Nov 2019 13:48:26 +0900
Subject: [PATCH 04/10] Fix `rem` problem with negative float inputs on ARMv7

---
 src/normed.jl | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/normed.jl b/src/normed.jl
index ec92ee5f..e3375ab9 100644
--- a/src/normed.jl
+++ b/src/normed.jl
@@ -203,3 +203,9 @@ end
 
 _unsafe_trunc(::Type{T}, x::Integer) where {T} = x % T
 _unsafe_trunc(::Type{T}, x) where {T} = unsafe_trunc(T, x)
+if !signbit(signed(unsafe_trunc(UInt, -12.345)))
+    # a workaround for 32-bit ARMv7 (issue #134)
+    function _unsafe_trunc(::Type{T}, x::AbstractFloat) where {T}
+        unsafe_trunc(T, unsafe_trunc(typeof(signed(zero(T))), x))
+    end
+end
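For context, a sketch of the platform difference behind issue #134 (the direct unsigned truncation of a negative float is formally undefined; x86_64 typically wraps, while the ARMv7 conversion instruction saturates to zero):

```julia
x = -12.345
# unsafe_trunc(UInt32, x)        # platform-dependent: saturates to 0x00000000 on ARMv7
unsafe_trunc(Int32, x) % UInt32  # 0xfffffff4 on every platform (truncate, then wrap)
```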
From 8d177395f327abe8cf9725b81a3b302fab9c856b Mon Sep 17 00:00:00 2001
From: kimikage
Date: Tue, 19 Nov 2019 20:05:59 +0900
Subject: [PATCH 05/10] Remove `testapprox` dependency and reduce the number
 of test cases for display (#135)

---
 test/fixed.jl  |  7 +++++--
 test/normed.jl | 16 +++++-----------
 2 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/test/fixed.jl b/test/fixed.jl
index 3ffab5ff..cc8ea449 100644
--- a/test/fixed.jl
+++ b/test/fixed.jl
@@ -84,8 +84,11 @@ end
 end
 
 @testset "testapprox" begin
-    for T in [Fixed{Int8,7}, Fixed{Int16,8}, Fixed{Int16,10}]
-        testapprox(T) # defined in ufixed.jl
+    @testset "approx $T" for T in [Fixed{Int8,7}, Fixed{Int16,8}, Fixed{Int16,10}]
+        xs = typemin(T):eps(T):typemax(T)-eps(T)
+        @test all(x -> x ≈ x + eps(T), xs)
+        @test all(x -> x + eps(T) ≈ x, xs)
+        @test !any(x -> x - eps(T) ≈ x + eps(T), xs)
     end
 end
 
diff --git a/test/normed.jl b/test/normed.jl
index 4369fe75..e674e487 100644
--- a/test/normed.jl
+++ b/test/normed.jl
@@ -230,18 +230,12 @@ end
     end
 end
 
-function testapprox(::Type{T}) where {T}
-    for x = typemin(T):eps(T):typemax(T)-eps(T)
-        y = x+eps(T)
-        @test x ≈ y
-        @test y ≈ x
-        @test !(x ≈ y+eps(T))
-    end
-end
-
 @testset "approx" begin
-    for T in FixedPointNumbers.UF
-        testapprox(T)
+    @testset "approx $T" for T in FixedPointNumbers.UF
+        xs = typemin(T):eps(T):typemax(T)-eps(T)
+        @test all(x -> x ≈ x + eps(T), xs)
+        @test all(x -> x + eps(T) ≈ x, xs)
+        @test !any(x -> x - eps(T) ≈ x + eps(T), xs)
     end
 end
 
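Spelled out for a single value, the property these rewritten tests assert is (an illustrative sketch, not part of the patch):

```julia
using FixedPointNumbers

x = N0f8(0.5)
@assert x ≈ x + eps(N0f8)                 # neighbors one eps apart compare ≈
@assert !(x - eps(N0f8) ≈ x + eps(N0f8))  # values two eps apart do not
```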
From 4341fe59898dfb5ed3342ac10d1d42ea5cf7f8a7 Mon Sep 17 00:00:00 2001
From: kimikage
Date: Tue, 15 Oct 2019 09:18:58 +0900
Subject: [PATCH 06/10] Improve accuracy of conversions to floating-point
 numbers

---
 src/normed.jl  | 128 ++++++++++++++++++++++++++++++++++++++++++++++++-
 test/normed.jl |  63 +++++++++++++++++++-------
 2 files changed, 173 insertions(+), 18 deletions(-)

diff --git a/src/normed.jl b/src/normed.jl
index e3375ab9..737657b4 100644
--- a/src/normed.jl
+++ b/src/normed.jl
@@ -105,11 +105,135 @@ rem(x::Float16, ::Type{T}) where {T <: Normed} = rem(Float32(x), T)  # avoid ove
 
 float(x::Normed) = convert(floattype(x), x)
 
-Base.BigFloat(x::Normed) = reinterpret(x)*(1/BigFloat(rawone(x)))
+macro f32(x::Float64) # just for hexadecimal floating-point literals
+    :(Float32($x))
+end
+macro exp2(n)
+    :(_exp2(Val($(esc(n)))))
+end
+_exp2(::Val{N}) where {N} = exp2(N)
+
+# for Julia v1.0, which does not fold `div_float` before inlining
+inv_rawone(x) = (@generated) ? (y = 1.0 / rawone(x); :($y)) : 1.0 / rawone(x)
+
 function (::Type{T})(x::Normed) where {T <: AbstractFloat}
-    y = reinterpret(x)*(one(rawtype(x))/convert(T, rawone(x)))
+    # The following optimization for constant division may cause rounding errors.
+    # y = reinterpret(x)*(one(rawtype(x))/convert(T, rawone(x)))
+    # Therefore, we use a simple form here.
+    # If you prefer speed over accuracy, consider using `scaledual` instead.
+    y = reinterpret(x) / convert(promote_type(T, floattype(x)), rawone(x))
     convert(T, y)  # needed for types like Float16 which promote arithmetic to Float32
 end
+
+function Base.Float16(x::Normed{Ti,f}) where {Ti <: Union{UInt8, UInt16, UInt32}, f}
+    f == 1 ? Float16(x.i) : Float16(Float32(x))
+end
+function Base.Float16(x::Normed{Ti,f}) where {Ti <: Union{UInt64, UInt128}, f}
+    f == 1 ? Float16(x.i) : Float16(Float64(x))
+end
+
+function Base.Float32(x::Normed{UInt8,f}) where f
+    f == 1 && return Float32(x.i)
+    f == 2 && return Float32(Int32(x.i) * 0x101) * @f32(0x550055p-32)
+    f == 3 && return Float32(Int32(x.i) * 0x00b) * @f32(0xd4c77bp-30)
+    f == 4 && return Float32(Int32(x.i) * 0x101) * @f32(0x110011p-32)
+    f == 5 && return Float32(Int32(x.i) * 0x003) * @f32(0xb02c0bp-30)
+    f == 6 && return Float32(Int32(x.i) * 0x049) * @f32(0xe40039p-36)
+    f == 7 && return Float32(Int32(x.i) * 0x01f) * @f32(0x852b5fp-35)
+    f == 8 && return Float32(Int32(x.i) * 0x155) * @f32(0xc0f0fdp-40)
+    0.0f0
+end
+function Base.Float32(x::Normed{UInt16,f}) where f
+    f32 = Float32(x.i)
+    f == 1 && return f32
+    f == 2 && return f32 * @f32(0x55p-8) + f32 * @f32(0x555555p-32)
+    f == 3 && return f32 * @f32(0x49p-9) + f32 * @f32(0x249249p-33)
+    f == 4 && return f32 * @f32(0x11p-8) + f32 * @f32(0x111111p-32)
+    f == 5 && return f32 * @f32(0x21p-10) + f32 * @f32(0x108421p-35)
+    f == 6 && return f32 * @f32(0x41p-12) + f32 * @f32(0x041041p-36)
+    f == 7 && return f32 * @f32(0x81p-14) + f32 * @f32(0x204081p-42)
+    f == 16 && return f32 * @f32(0x01p-16) + f32 * @f32(0x010001p-48)
+    Float32(x.i / rawone(x))
+end
+function Base.Float32(x::Normed{UInt32,f}) where f
+    f == 1 && return Float32(x.i)
+    i32 = unsafe_trunc(Int32, x.i)
+    if f == 32
+        rh, rl = Float32(i32>>>16), Float32((i32&0xFFFF)<<8 | (i32>>>24))
+        return muladd(rh, @f32(0x1p-16), rl * @f32(0x1p-40))
+    elseif f >= 25
+        rh, rl = Float32(i32>>>16),Float32(((i32&0xFFFF)<<14) + (i32>>>(f-14)))
+        return muladd(rh, Float32(@exp2(16-f)), rl * Float32(@exp2(-14-f)))
+    end
+    # FIXME: avoid the branch in native x86_64 (non-SIMD) codes
+    m = ifelse(i32 < 0, 0x1p32 * inv_rawone(x), 0.0)
+    Float32(muladd(Float64(i32), inv_rawone(x), m))
+end
+function Base.Float32(x::Normed{Ti,f}) where {Ti <: Union{UInt64, UInt128}, f}
+    f == 1 ? Float32(x.i) : Float32(Float64(x))
+end
+
+function Base.Float64(x::Normed{Ti,f}) where {Ti <: Union{UInt8, UInt16}, f}
+    Float64(Normed{UInt32,f}(x))
+end
+function Base.Float64(x::Normed{UInt32,f}) where f
+    f64 = Float64(x.i)
+    f == 1 && return f64
+    f == 2 && return (f64 * 0x040001) * 0x15555000015555p-72
+    f == 3 && return (f64 * 0x108421) * 0x11b6db76924929p-75
+    f == 4 && return (f64 * 0x010101) * 0x11000011000011p-72
+    f == 5 && return (f64 * 0x108421) * 0x04000002000001p-75
+    f == 6 && return (f64 * 0x09dfb1) * 0x1a56b8e38e6d91p-78
+    f == 7 && return (f64 * 0x000899) * 0x0f01480001e029p-70
+    f == 8 && return (f64 * 0x0a5a5b) * 0x18d300000018d3p-80
+    f == 9 && return (f64 * 0x001001) * 0x080381c8e3f201p-72
+    f == 10 && return (f64 * 0x100001) * 0x04010000000401p-80
+    f == 11 && return (f64 * 0x000009) * 0x0e3aaae3955639p-66
+    f == 12 && return (f64 * 0x0a8055) * 0x186246e46e4cfdp-84
+    f == 13 && return (f64 * 0x002001) * 0x10000004000001p-78
+    f == 14 && return (f64 * 0x03400d) * 0x13b13b14ec4ec5p-84
+    f == 15 && return (f64 * 0x000259) * 0x06d0c5a4f3a5e9p-75
+    f == 16 && return (f64 * 0x011111) * 0x00f000ff00fff1p-80
+    f == 18 && return (f64 * 0x0b06d1) * 0x17377445dd1231p-90
+    f == 19 && return (f64 * 0x080001) * 0x00004000000001p-76
+    f == 20 && return (f64 * 0x000101) * 0x0ff010ef10ff01p-80
+    f == 21 && return (f64 * 0x004001) * 0x01fff8101fc001p-84
+    f == 22 && return (f64 * 0x002945) * 0x18d0000000018dp-88
+    f == 23 && return (f64 * 0x044819) * 0x07794a23729429p-92
+    f == 27 && return (f64 * 0x000a21) * 0x0006518c7df9e1p-81
+    f == 28 && return (f64 * 0x00000d) * 0x13b13b14ec4ec5p-84
+    f == 30 && return (f64 * 0x001041) * 0x00fc003f03ffc1p-90
+    f == 32 && return (f64 * 0x010101) * 0x00ff0000ffff01p-96
+    f64 / rawone(x)
+end
+function Base.Float64(x::Normed{UInt64,f}) where f
+    f == 1 && return Float64(x.i)
+    if f >= 53
+        rh = Float64(unsafe_trunc(Int64, x.i >> 16)) * @exp2(16-f) # upper 48 bits
+        rl = Float64(unsafe_trunc(Int32, x.i&0xFFFF)) * @exp2(-f) # lower 16 bits
+        return rh + muladd(rh, @exp2(-f), rl)
+    end
+    x.i / rawone(x)
+end
+function Base.Float64(x::Normed{UInt128,f}) where f
+    f == 1 && return Float64(x.i)
+    ih, il = unsafe_trunc(Int64, x.i>>64), unsafe_trunc(Int64, x.i)
+    rh = Float64(ih>>>16) * @exp2(f <= 53 ? 80 : 80 - f) # upper 48 bits
+    km = @exp2(f <= 53 ? 48 : 48 - f) # for middle 32 bits
+    rm = Float64(unsafe_trunc(Int32, ih&0xFFFF)) * (0x1p16 * km) +
+         Float64(unsafe_trunc(Int32, il>>>48)) * km
+    rl = Float64(il&0xFFFFFFFFFFFF) * @exp2(f <= 53 ? 0 : -f) # lower 48 bits
+    if f <= 53
+        return (rh + (rm + rl)) / unsafe_trunc(Int64, rawone(x))
+    elseif f < 76
+        return rh + (rm + muladd(rh, @exp2(-f), rl))
+    else
+        return rh + (rm + rl)
+    end
+end
+
+Base.BigFloat(x::Normed) = reinterpret(x)*(1/BigFloat(rawone(x)))
+
 Base.Bool(x::Normed) = x == zero(x) ? false : true
 Base.Integer(x::Normed) = convert(Integer, x*1.0)
 (::Type{T})(x::Normed) where {T <: Integer} = convert(T, x*(1/oneunit(T)))
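To see why the paired-constant products above can be exact, here is a brute-force check of the `f == 2` case of the `Normed{UInt8,f}` method against correctly rounded `BigFloat` division (a sketch mirroring the tests added in the second half of this patch; the constants are those from the method above):

```julia
for i in Int32(0):Int32(255)   # all raw values of Normed{UInt8,2}; rawone == 3
    exact = Float32(i / BigFloat(3))
    fast  = Float32(i * 0x101) * Float32(0x550055p-32)
    @assert fast == exact      # the two-constant product matches the correctly rounded result
end
```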
diff --git a/test/normed.jl b/test/normed.jl
index e674e487..6fb4d125 100644
--- a/test/normed.jl
+++ b/test/normed.jl
@@ -112,9 +112,7 @@ end
 
                 @test reinterpret(U(zero(Tf))) == 0x0
 
-                # TODO: fix issue #129
-                # input_typemax = Tf(typemax(U))
-                input_typemax = Tf(BigFloat(typemax(T)) / r)
+                input_typemax = Tf(typemax(U))
                 if isinf(input_typemax)
                     @test reinterpret(U(floatmax(Tf))) >= round(T, floatmax(Tf))
                 else
@@ -136,6 +134,46 @@ end
     @test N0f32(Float32(0x0.800001p-32)) == eps(N0f32)
 end
 
+@testset "conversions to float" begin
+    x = N0f8(0.3)
+    for T in (Float16, Float32, Float64, BigFloat)
+        y = convert(T, x)
+        @test isa(y, T)
+    end
+
+    for Tf in (Float16, Float32, Float64)
+        @testset "$Tf(::Normed{$Ti})" for Ti in (UInt8, UInt16)
+            @testset "$Tf(::Normed{$Ti,$f})" for f = 1:(sizeof(Ti)*8)
+                T = Normed{Ti,f}
+                float_err = 0.0
+                for i = typemin(Ti):typemax(Ti)
+                    f_expected = Tf(i / BigFloat(FixedPointNumbers.rawone(T)))
+                    isinf(f_expected) && break # for Float16(::Normed{UInt16,1})
+                    f_actual = Tf(reinterpret(T, i))
+                    float_err += abs(f_actual - f_expected)
+                end
+                @test float_err == 0.0
+            end
+        end
+        @testset "$Tf(::Normed{$Ti})" for Ti in (UInt32, UInt64, UInt128)
+            @testset "$Tf(::Normed{$Ti,$f})" for f = 1:(sizeof(Ti)*8)
+                T = Normed{Ti,f}
+                error_count = 0
+                for i in vcat(Ti(0x00):Ti(0xFF), (typemax(Ti)-0xFF):typemax(Ti))
+                    f_expected = Tf(i / BigFloat(FixedPointNumbers.rawone(T)))
+                    isinf(f_expected) && break # for Float16() and Float32()
+                    f_actual = Tf(reinterpret(T, i))
+                    f_actual == f_expected && continue
+                    f_actual == prevfloat(f_expected) && continue
+                    f_actual == nextfloat(f_expected) && continue
+                    error_count += 1
+                end
+                @test error_count == 0
+            end
+        end
+    end
+end
+
 @testset "modulus" begin
     @test N0f8(0.2) % N0f8 === N0f8(0.2)
     @test N2f14(1.2) % N0f16 === N0f16(0.20002)
@@ -305,25 +343,26 @@ end
 @testset "scaledual" begin
     a = rand(UInt8, 10)
     af8 = reinterpret(N0f8, a)
     b = 0.5
 
+    # LHSs of the following `@test`s with `af8` can be slightly more accurate
     bd, eld = scaledual(b, af8[1])
-    @test b*af8[1] == bd*eld
+    @test b*af8[1] ≈ bd*eld rtol=1e-15
     bd, ad = scaledual(b, af8)
-    @test b*af8 == bd*ad
+    @test b*af8 ≈ bd*ad rtol=1e-15
 
     bd, eld = scaledual(b, a[1])
     @test b*a[1] == bd*eld
     bd, ad = scaledual(b, a)
     @test b*a == bd*ad
 
     bd, eld = scaledual(Float64, af8[1])
-    @test 1.0*af8[1] == bd*eld
+    @test 1.0*af8[1] ≈ bd*eld rtol=1e-15
    bd, ad = scaledual(Float64, af8)
-    @test 1.0*af8 == bd*ad
+    @test 1.0*af8 ≈ bd*ad rtol=1e-15
 
     bd, eld = scaledual(Float64, a[1])
     @test 1.0*a[1] == bd*eld
     bd, ad = scaledual(Float64, a)
-    @test 1.0*a == bd*ad
+    @test 1.0*a == bd*ad
 end
 
 @testset "reductions" begin
     a = N0f8[0.2, 0.8, 0.4]
     acmp = Float64(a[1])*Float64(a[2])*Float64(a[3])
     @test sum(a) == 1.4
     @test sum(a, dims=1) == [1.4]
     @test prod(a) == acmp
     @test prod(a, dims=1) == [acmp]
 end
 
-@testset "convert" begin
-    x = N0f8(0.3)
-    for T in (Float16, Float32, Float64, BigFloat)
-        y = convert(T, x)
-        @test isa(y, T)
-    end
-end
-
 @testset "rand" begin
     for T in (Normed{UInt8,8}, Normed{UInt8,6},
               Normed{UInt16,16}, Normed{UInt16,14},
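The upshot, checkable by brute force for the 8-bit type (a sketch in the spirit of the new tests, not part of the patch):

```julia
using FixedPointNumbers

for i in 0x00:0xff
    x = reinterpret(N0f8, i)
    @assert Float32(x) == Float32(i / BigFloat(255))  # correctly rounded for every raw value
end
```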
v"0.7.0-beta2.199" && __precompile__() module FixedPointNumbers -using Base: reducedim_initarray - import Base: ==, <, <=, -, +, *, /, ~, isapprox, convert, promote_rule, show, isinteger, abs, decompose, isnan, isinf, isfinite, @@ -154,27 +152,12 @@ sizeof(::Type{T}) where {T <: FixedPoint} = sizeof(rawtype(T)) # Promotions for reductions const Treduce = Float64 -if isdefined(Base, :r_promote) - # Julia v0.6 - Base.r_promote(::typeof(+), x::FixedPoint{T}) where {T} = Treduce(x) - Base.r_promote(::typeof(*), x::FixedPoint{T}) where {T} = Treduce(x) - Base.reducedim_init(f::typeof(identity), - op::typeof(+), - A::AbstractArray{T}, region) where {T <: FixedPoint} = - Base.reducedim_initarray(A, region, zero(Treduce)) - Base.reducedim_init(f::typeof(identity), - op::typeof(*), - A::AbstractArray{T}, region) where {T <: FixedPoint} = - Base.reducedim_initarray(A, region, oneunit(Treduce)) -else - # Julia v0.7 - Base.add_sum(x::FixedPoint, y::FixedPoint) = Treduce(x) + Treduce(y) - Base.reduce_empty(::typeof(Base.add_sum), ::Type{F}) where {F<:FixedPoint} = zero(Treduce) - Base.reduce_first(::typeof(Base.add_sum), x::FixedPoint) = Treduce(x) - Base.mul_prod(x::FixedPoint, y::FixedPoint) = Treduce(x) * Treduce(y) - Base.reduce_empty(::typeof(Base.mul_prod), ::Type{F}) where {F<:FixedPoint} = one(Treduce) - Base.reduce_first(::typeof(Base.mul_prod), x::FixedPoint) = Treduce(x) -end +Base.add_sum(x::FixedPoint, y::FixedPoint) = Treduce(x) + Treduce(y) +Base.reduce_empty(::typeof(Base.add_sum), ::Type{F}) where {F<:FixedPoint} = zero(Treduce) +Base.reduce_first(::typeof(Base.add_sum), x::FixedPoint) = Treduce(x) +Base.mul_prod(x::FixedPoint, y::FixedPoint) = Treduce(x) * Treduce(y) +Base.reduce_empty(::typeof(Base.mul_prod), ::Type{F}) where {F<:FixedPoint} = one(Treduce) +Base.reduce_first(::typeof(Base.mul_prod), x::FixedPoint) = Treduce(x) for f in (:div, :fld, :fld1) From 49627e217c6fe125a0d2bba3ddb2f5432d3ac1cb Mon Sep 17 00:00:00 2001 From: kimikage Date: Sat, 30 Nov 2019 22:54:35 +0900 Subject: [PATCH 08/10] Fix performance regression with inlining failure on Julia v1.3.0 (Fixes #144) (#145) Julia 1.3.0 generates more redundant intermediate codes which are eventually optimized. This trivial change reduces the redundant intermediate codes to promote inlining. 
From 49627e217c6fe125a0d2bba3ddb2f5432d3ac1cb Mon Sep 17 00:00:00 2001
From: kimikage
Date: Sat, 30 Nov 2019 22:54:35 +0900
Subject: [PATCH 08/10] Fix performance regression with inlining failure on
 Julia v1.3.0 (Fixes #144) (#145)

Julia 1.3.0 generates more redundant intermediate code, which is eventually
optimized away. This trivial change reduces that redundant intermediate code
to promote inlining.
---
 src/normed.jl | 62 +++++++++++++++++++++++++--------------------------
 1 file changed, 31 insertions(+), 31 deletions(-)

diff --git a/src/normed.jl b/src/normed.jl
index 737657b4..f0fa8aad 100644
--- a/src/normed.jl
+++ b/src/normed.jl
@@ -159,10 +159,10 @@ function Base.Float32(x::Normed{UInt32,f}) where f
     f == 1 && return Float32(x.i)
     i32 = unsafe_trunc(Int32, x.i)
     if f == 32
-        rh, rl = Float32(i32>>>16), Float32((i32&0xFFFF)<<8 | (i32>>>24))
+        rh, rl = Float32(i32>>>0x10), Float32((i32&0xFFFF)<<0x8 | i32>>>0x18)
         return muladd(rh, @f32(0x1p-16), rl * @f32(0x1p-40))
     elseif f >= 25
-        rh, rl = Float32(i32>>>16),Float32(((i32&0xFFFF)<<14) + (i32>>>(f-14)))
+        rh, rl = Float32(i32>>>0x10), Float32((i32&0xFFFF)<<0xE + i32>>>UInt8(f-14))
         return muladd(rh, Float32(@exp2(16-f)), rl * Float32(@exp2(-14-f)))
     end
     # FIXME: avoid the branch in native x86_64 (non-SIMD) codes
@@ -179,37 +179,37 @@ end
 function Base.Float64(x::Normed{UInt32,f}) where f
     f64 = Float64(x.i)
     f == 1 && return f64
-    f == 2 && return (f64 * 0x040001) * 0x15555000015555p-72
-    f == 3 && return (f64 * 0x108421) * 0x11b6db76924929p-75
-    f == 4 && return (f64 * 0x010101) * 0x11000011000011p-72
-    f == 5 && return (f64 * 0x108421) * 0x04000002000001p-75
-    f == 6 && return (f64 * 0x09dfb1) * 0x1a56b8e38e6d91p-78
-    f == 7 && return (f64 * 0x000899) * 0x0f01480001e029p-70
-    f == 8 && return (f64 * 0x0a5a5b) * 0x18d300000018d3p-80
-    f == 9 && return (f64 * 0x001001) * 0x080381c8e3f201p-72
-    f == 10 && return (f64 * 0x100001) * 0x04010000000401p-80
-    f == 11 && return (f64 * 0x000009) * 0x0e3aaae3955639p-66
-    f == 12 && return (f64 * 0x0a8055) * 0x186246e46e4cfdp-84
-    f == 13 && return (f64 * 0x002001) * 0x10000004000001p-78
-    f == 14 && return (f64 * 0x03400d) * 0x13b13b14ec4ec5p-84
-    f == 15 && return (f64 * 0x000259) * 0x06d0c5a4f3a5e9p-75
-    f == 16 && return (f64 * 0x011111) * 0x00f000ff00fff1p-80
-    f == 18 && return (f64 * 0x0b06d1) * 0x17377445dd1231p-90
-    f == 19 && return (f64 * 0x080001) * 0x00004000000001p-76
-    f == 20 && return (f64 * 0x000101) * 0x0ff010ef10ff01p-80
-    f == 21 && return (f64 * 0x004001) * 0x01fff8101fc001p-84
-    f == 22 && return (f64 * 0x002945) * 0x18d0000000018dp-88
-    f == 23 && return (f64 * 0x044819) * 0x07794a23729429p-92
-    f == 27 && return (f64 * 0x000a21) * 0x0006518c7df9e1p-81
-    f == 28 && return (f64 * 0x00000d) * 0x13b13b14ec4ec5p-84
-    f == 30 && return (f64 * 0x001041) * 0x00fc003f03ffc1p-90
-    f == 32 && return (f64 * 0x010101) * 0x00ff0000ffff01p-96
+    f == 2 && return (f64 * 0x040001p0) * 0x15555000015555p-72
+    f == 3 && return (f64 * 0x108421p0) * 0x11b6db76924929p-75
+    f == 4 && return (f64 * 0x010101p0) * 0x11000011000011p-72
+    f == 5 && return (f64 * 0x108421p0) * 0x04000002000001p-75
+    f == 6 && return (f64 * 0x09dfb1p0) * 0x1a56b8e38e6d91p-78
+    f == 7 && return (f64 * 0x000899p0) * 0x0f01480001e029p-70
+    f == 8 && return (f64 * 0x0a5a5bp0) * 0x18d300000018d3p-80
+    f == 9 && return (f64 * 0x001001p0) * 0x080381c8e3f201p-72
+    f == 10 && return (f64 * 0x100001p0) * 0x04010000000401p-80
+    f == 11 && return (f64 * 0x000009p0) * 0x0e3aaae3955639p-66
+    f == 12 && return (f64 * 0x0a8055p0) * 0x186246e46e4cfdp-84
+    f == 13 && return (f64 * 0x002001p0) * 0x10000004000001p-78
+    f == 14 && return (f64 * 0x03400dp0) * 0x13b13b14ec4ec5p-84
+    f == 15 && return (f64 * 0x000259p0) * 0x06d0c5a4f3a5e9p-75
+    f == 16 && return (f64 * 0x011111p0) * 0x00f000ff00fff1p-80
+    f == 18 && return (f64 * 0x0b06d1p0) * 0x17377445dd1231p-90
+    f == 19 && return (f64 * 0x080001p0) * 0x00004000000001p-76
+    f == 20 && return (f64 * 0x000101p0) * 0x0ff010ef10ff01p-80
+    f == 21 && return (f64 * 0x004001p0) * 0x01fff8101fc001p-84
+    f == 22 && return (f64 * 0x002945p0) * 0x18d0000000018dp-88
+    f == 23 && return (f64 * 0x044819p0) * 0x07794a23729429p-92
+    f == 27 && return (f64 * 0x000a21p0) * 0x0006518c7df9e1p-81
+    f == 28 && return (f64 * 0x00000dp0) * 0x13b13b14ec4ec5p-84
+    f == 30 && return (f64 * 0x001041p0) * 0x00fc003f03ffc1p-90
+    f == 32 && return (f64 * 0x010101p0) * 0x00ff0000ffff01p-96
     f64 / rawone(x)
 end
 function Base.Float64(x::Normed{UInt64,f}) where f
     f == 1 && return Float64(x.i)
     if f >= 53
-        rh = Float64(unsafe_trunc(Int64, x.i >> 16)) * @exp2(16-f) # upper 48 bits
+        rh = Float64(unsafe_trunc(Int64, x.i>>0x10)) * @exp2(16-f) # upper 48 bits
         rl = Float64(unsafe_trunc(Int32, x.i&0xFFFF)) * @exp2(-f) # lower 16 bits
         return rh + muladd(rh, @exp2(-f), rl)
     end
@@ -217,11 +217,11 @@ end
 function Base.Float64(x::Normed{UInt128,f}) where f
     f == 1 && return Float64(x.i)
-    ih, il = unsafe_trunc(Int64, x.i>>64), unsafe_trunc(Int64, x.i)
-    rh = Float64(ih>>>16) * @exp2(f <= 53 ? 80 : 80 - f) # upper 48 bits
+    ih, il = unsafe_trunc(Int64, x.i>>0x40), unsafe_trunc(Int64, x.i)
+    rh = Float64(ih>>>0x10) * @exp2(f <= 53 ? 80 : 80 - f) # upper 48 bits
     km = @exp2(f <= 53 ? 48 : 48 - f) # for middle 32 bits
     rm = Float64(unsafe_trunc(Int32, ih&0xFFFF)) * (0x1p16 * km) +
-         Float64(unsafe_trunc(Int32, il>>>48)) * km
+         Float64(unsafe_trunc(Int32, il>>>0x30)) * km
     rl = Float64(il&0xFFFFFFFFFFFF) * @exp2(f <= 53 ? 0 : -f) # lower 48 bits
     if f <= 53
         return (rh + (rm + rl)) / unsafe_trunc(Int64, rawone(x))
     elseif f < 76
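One way to watch for this kind of regression (a sketch; it assumes the BenchmarkTools package, which this patch does not use):

```julia
using BenchmarkTools, FixedPointNumbers

a = reinterpret(N0f32, rand(UInt32, 10^6))
@btime Float64.($a)  # should not regress by an order of magnitude across Julia versions
```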
From f647feb0d48e49458fad11c965da637194a168d1 Mon Sep 17 00:00:00 2001
From: Tim Holy
Date: Thu, 12 Dec 2019 05:28:00 -0600
Subject: [PATCH 09/10] Precompile methods for common types (#148)

---
 .gitignore               |  1 +
 .travis.yml              |  2 +-
 src/FixedPointNumbers.jl |  5 +++--
 src/precompile.jl        | 30 ++++++++++++++++++++++++++++++
 4 files changed, 35 insertions(+), 3 deletions(-)
 create mode 100644 .gitignore
 create mode 100644 src/precompile.jl

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..ba39cc53
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+Manifest.toml
diff --git a/.travis.yml b/.travis.yml
index a22f5779..5c8dca12 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,7 +4,7 @@ os:
   - osx
 julia:
   - 1.0
-  - 1.1
+  - 1.2
   - nightly
 notifications:
   email: false
diff --git a/src/FixedPointNumbers.jl b/src/FixedPointNumbers.jl
index ccd86e50..60ec762d 100644
--- a/src/FixedPointNumbers.jl
+++ b/src/FixedPointNumbers.jl
@@ -1,5 +1,3 @@
-VERSION < v"0.7.0-beta2.199" && __precompile__()
-
 module FixedPointNumbers
 
 import Base: ==, <, <=, -, +, *, /, ~, isapprox,
@@ -199,4 +197,7 @@ end
 rand(::Type{T}) where {T <: FixedPoint} = reinterpret(T, rand(rawtype(T)))
 rand(::Type{T}, sz::Dims) where {T <: FixedPoint} = reinterpret(T, rand(rawtype(T), sz))
 
+include("precompile.jl")
+_precompile_()
+
 end # module
diff --git a/src/precompile.jl b/src/precompile.jl
new file mode 100644
index 00000000..178be83f
--- /dev/null
+++ b/src/precompile.jl
@@ -0,0 +1,30 @@
+function _precompile_()
+    ccall(:jl_generating_output, Cint, ()) == 1 || return nothing
+    normedtypes = (N0f8, N0f16)   # precompiled Normed types
+    realtypes = (Float16, Float32, Float64, Int)   # types for mixed Normed/Real operations
+    for T in normedtypes
+        for f in (+, -, abs, eps, rand)   # unary operations
+            @assert precompile(Tuple{typeof(f),T})
+        end
+        @assert precompile(Tuple{typeof(rand),T,Tuple{Int}})
+        @assert precompile(Tuple{typeof(rand),T,Tuple{Int,Int}})
+        for f in (trunc, floor, ceil, round)   # rounding operations
+            @assert precompile(Tuple{typeof(f),T})
+            @assert precompile(Tuple{typeof(f),Type{Int},T})
+        end
+        for f in (+, -, *, /, <, <=, ==)   # binary operations
+            @assert precompile(Tuple{typeof(f),T,T})
+            for S in realtypes
+                @assert precompile(Tuple{typeof(f),T,S})
+                @assert precompile(Tuple{typeof(f),S,T})
+            end
+        end
+        # conversions
+        for S in realtypes
+            @assert precompile(Tuple{Type{T},S})
+            @assert precompile(Tuple{Type{S},T})
+            @assert precompile(Tuple{typeof(convert),Type{T},S})
+            @assert precompile(Tuple{typeof(convert),Type{S},T})
+        end
+    end
+end
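A rough way to observe the effect (a sketch; timings are machine- and session-dependent): in a fresh Julia session, the first use of a precompiled signature should spend noticeably less time in compilation.

```julia
using FixedPointNumbers  # in a fresh session
@time N0f8(0.5) + 0.25   # +(::N0f8, ::Float64) is among the precompiled signatures
```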
From 7ad0f0c460961024191f672332fcf3d7cef655d1 Mon Sep 17 00:00:00 2001
From: Tim Holy
Date: Mon, 16 Dec 2019 18:10:16 -0600
Subject: [PATCH 10/10] Version 0.7 (#149)

Since it will likely be a while before we can release 1.0, how about a 0.7
release? I'm bumping the minor version because it is breaking (`scaledual`,
and the improved accuracy of conversions).
---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index 60fd45cd..4aecfcf2 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "FixedPointNumbers"
 uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93"
-version = "0.6.1"
+version = "0.7.0"
 
 [compat]
 julia = "1"