Skip to content

Commit

Permalink
Merge pull request #131 from kimikage/issue102
Browse files Browse the repository at this point in the history
Improve accuracy of conversions from floating-point numbers (Fixes #102)
  • Loading branch information
kimikage authored Nov 2, 2019
2 parents f622800 + 547eb95 commit 70ae1d6
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 16 deletions.
4 changes: 2 additions & 2 deletions src/FixedPointNumbers.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ import Base: ==, <, <=, -, +, *, /, ~, isapprox,
using Base: @pure

# T => BaseType
# f => Number of Bytes reserved for fractional part
# f => Number of bits reserved for fractional part
abstract type FixedPoint{T <: Integer, f} <: Real end


Expand All @@ -25,7 +25,7 @@ export
Normed,
floattype,
# "special" typealiases
# Q and U typealiases are exported in separate source files
# Q and N typealiases are exported in separate source files
# Functions
scaledual

Expand Down
62 changes: 48 additions & 14 deletions src/normed.jl
Original file line number Diff line number Diff line change
Expand Up @@ -46,22 +46,56 @@ function Normed{T,f}(x::Normed{T2}) where {T <: Unsigned,T2 <: Unsigned,f}
end
N0f16(x::N0f8) = reinterpret(N0f16, convert(UInt16, 0x0101*reinterpret(x)))

(::Type{U})(x::Real) where {U <: Normed} = _convert(U, rawtype(U), x)

function _convert(::Type{U}, ::Type{T}, x) where {U <: Normed,T}
y = round(widen1(rawone(U))*x)
(0 <= y) & (y <= typemax(T)) || throw_converterror(U, x)
U(_unsafe_trunc(T, y), 0)
(::Type{U})(x::Real) where {U <: Normed} = _convert(U, x)

function _convert(::Type{U}, x) where {T, f, U <: Normed{T,f}}
if T == UInt128 # for UInt128, we can't widen
# the upper limit is not exact
(0 <= x) & (x <= (typemax(T)/rawone(U))) || throw_converterror(U, x)
y = round(rawone(U)*x)
else
y = round(widen1(rawone(U))*x)
(0 <= y) & (y <= typemax(T)) || throw_converterror(U, x)
end
reinterpret(U, _unsafe_trunc(T, y))
end
# Prevent overflow (https://discourse.julialang.org/t/saving-greater-than-8-bit-images/6057)
_convert(::Type{U}, ::Type{T}, x::Float16) where {U <: Normed,T} =
_convert(U, T, Float32(x))
_convert(::Type{U}, ::Type{UInt128}, x::Float16) where {U <: Normed} =
_convert(U, UInt128, Float32(x))
function _convert(::Type{U}, ::Type{UInt128}, x) where {U <: Normed}
y = round(rawone(U)*x) # for UInt128, we can't widen
(0 <= y) & (y <= typemax(UInt128)) & (x <= Float64(typemax(U))) || throw_converterror(U, x)
U(_unsafe_trunc(UInt128, y), 0)
function _convert(::Type{U}, x::Float16) where {T, f, U <: Normed{T,f}}
if Float16(typemax(T)/rawone(U)) > Float32(typemax(T)/rawone(U))
x == Float16(typemax(T)/rawone(U)) && return typemax(U)
end
return _convert(U, Float32(x))
end
function _convert(::Type{U}, x::Tf) where {T, f, U <: Normed{T,f}, Tf <: Union{Float32, Float64}}
if T == UInt128 && f == 53
0 <= x <= Tf(3.777893186295717e22) || throw_converterror(U, x)
else
0 <= x <= Tf((typemax(T)-rawone(U))/rawone(U)+1) || throw_converterror(U, x)
end

significand_bits = Tf == Float64 ? 52 : 23
if f <= (significand_bits + 1) && sizeof(T) * 8 < significand_bits
return reinterpret(U, unsafe_trunc(T, round(rawone(U) * x)))
end
# cf. the implementation of `frexp`
Tw = f < sizeof(T) * 8 ? T : widen1(T)
bits = sizeof(Tw) * 8 - 1
xu = reinterpret(Tf == Float64 ? UInt64 : UInt32, x)
k = Int(xu >> significand_bits)
k == 0 && return zero(U) # neglect subnormal numbers
significand = xu | (one(xu) << significand_bits)
yh = unsafe_trunc(Tw, significand) << (bits - significand_bits)
exponent_bias = Tf == Float64 ? 1023 : 127
ex = exponent_bias - k + bits - f
yi = bits >= f ? yh - (yh >> f) : yh
if ex <= 0
ex == 0 && return reinterpret(U, unsafe_trunc(T, yi))
ex != -1 || signbit(signed(yi)) && return typemax(U)
return reinterpret(U, unsafe_trunc(T, yi + yi))
end
ex > bits && return reinterpret(U, ex == bits + 1 ? one(T) : zero(T))
yi += one(Tw)<<((ex - 1) & bits) # RoundNearestTiesUp
return reinterpret(U, unsafe_trunc(T, yi >> (ex & bits)))
end

rem(x::T, ::Type{T}) where {T <: Normed} = x
Expand Down
34 changes: 34 additions & 0 deletions test/normed.jl
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,40 @@ end
@test convert(Normed{UInt16,7}, Normed{UInt8,7}(0.504)) === Normed{UInt16,7}(0.504)
end

@testset "conversion from float" begin
# issue 102
for T in (UInt8, UInt16, UInt32, UInt64, UInt128)
for Tf in (Float16, Float32, Float64)
@testset "Normed{$T,$f}(::$Tf)" for f = 1:sizeof(T)*8
U = Normed{T,f}
r = FixedPointNumbers.rawone(U)

@test reinterpret(U(zero(Tf))) == 0x0

# TODO: fix issue #129
# input_typemax = Tf(typemax(U))
input_typemax = Tf(BigFloat(typemax(T)) / r)
if isinf(input_typemax)
@test reinterpret(U(floatmax(Tf))) >= round(T, floatmax(Tf))
else
@test reinterpret(U(input_typemax)) >= (typemax(T)>>1) # overflow check
end

input_upper = Tf(BigFloat(typemax(T)) / r, RoundDown)
isinf(input_upper) && continue # for Julia v0.7
@test reinterpret(U(input_upper)) == T(min(round(BigFloat(input_upper) * r), typemax(T)))

input_exp2 = Tf(exp2(sizeof(T) * 8 - f))
isinf(input_exp2) && continue
@test reinterpret(U(input_exp2)) == T(input_exp2) * r
end
end
end
@test N0f32(Float32(0x0.7FFFFFp-32)) == zero(N0f32)
@test N0f32(Float32(0x0.800000p-32)) <= eps(N0f32) # should be zero in RoundNearest mode
@test N0f32(Float32(0x0.800001p-32)) == eps(N0f32)
end

@testset "modulus" begin
@test N0f8(0.2) % N0f8 === N0f8(0.2)
@test N2f14(1.2) % N0f16 === N0f16(0.20002)
Expand Down

0 comments on commit 70ae1d6

Please sign in to comment.