diff --git a/base/float.jl b/base/float.jl index eda0865c9fbac..415364fc17f3d 100644 --- a/base/float.jl +++ b/base/float.jl @@ -537,19 +537,19 @@ end isequal(x::T, y::T) where {T<:IEEEFloat} = fpiseq(x, y) -# interpret as sign-magnitude integer -@inline function _fpint(x) - IntT = inttype(typeof(x)) - ix = reinterpret(IntT, x) - return ifelse(ix < zero(IntT), ix ⊻ typemax(IntT), ix) -end - @inline function isless(a::T, b::T) where T<:IEEEFloat - (isnan(a) || isnan(b)) && return !isnan(a) - - return _fpint(a) < _fpint(b) + botval = flipifneg(reinterpret(Signed, typemin(T))) + topval = flipifneg(reinterpret(Signed, typemax(T))) + offset = typemin(botval) - botval # adding offset will place typemin(T) at the bottom, wrapping NaN to the top + u = flipifneg(reinterpret(Signed, a)) + offset + v = flipifneg(reinterpret(Signed, b)) + offset + return (u < v) & (u <= topval+offset) # (a < b) & (a <= Inf), with the proper signed zero handling end +# if negative, flip all bits except the top bit +# used for permuting the order of IEEEFloat values +flipifneg(x::BitSigned) = xor(x, ifelse(signbit(x), typemax(x), zero(x))) + # Exact Float (Tf) vs Integer (Ti) comparisons # Assumes: # - typemax(Ti) == 2^n-1 diff --git a/base/reduce.jl b/base/reduce.jl index ae2671a2e746a..3b5f6e92dd3f2 100644 --- a/base/reduce.jl +++ b/base/reduce.jl @@ -252,6 +252,8 @@ foldr(op, itr; kw...) = mapfoldr(identity, op, itr; kw...) # certain `op` (e.g. `min` and `max`) may have their own specialized versions. @noinline function mapreduce_impl(f, op, A::AbstractArrayOrBroadcasted, ifirst::Integer, ilast::Integer, blksize::Int) + Eltype = _mapped_eltype(f, A) + pre, op_fast, post = _makefast_reduction(op, Eltype) if ifirst == ilast @inbounds a1 = A[ifirst] return mapreduce_first(f, op, a1) @@ -259,12 +261,12 @@ foldr(op, itr; kw...) = mapfoldr(identity, op, itr; kw...) # sequential portion @inbounds a1 = A[ifirst] @inbounds a2 = A[ifirst+1] - v = op(f(a1), f(a2)) + v = op_fast(pre(f(a1)), pre(f(a2))) @simd for i = ifirst + 2 : ilast @inbounds ai = A[i] - v = op(v, f(ai)) + v = op_fast(v, pre(f(ai))) end - return v + return post(v) else # pairwise portion imid = ifirst + (ilast - ifirst) >> 1 @@ -314,6 +316,9 @@ pairwise_blocksize(f, op) = 1024 # This combination appears to show a benefit from a larger block size pairwise_blocksize(::typeof(abs2), ::typeof(+)) = 4096 +# The following operations prefer non-pairwise reduction. +pairwise_blocksize(_, ::Union{typeof(min),typeof(max)}) = typemax(Int) +pairwise_blocksize(_, ::Union{typeof(|),typeof(&)}) = typemax(Int) # handling empty arrays _empty_reduce_error() = throw(ArgumentError("reducing over an empty collection is not allowed")) @@ -619,64 +624,6 @@ julia> prod(1:5; init = 1.0) """ prod(a; kw...) = mapreduce(identity, mul_prod, a; kw...) -## maximum, minimum, & extrema -_fast(::typeof(min),x,y) = min(x,y) -_fast(::typeof(max),x,y) = max(x,y) -function _fast(::typeof(max), x::AbstractFloat, y::AbstractFloat) - ifelse(isnan(x), - x, - ifelse(x > y, x, y)) -end - -function _fast(::typeof(min),x::AbstractFloat, y::AbstractFloat) - ifelse(isnan(x), - x, - ifelse(x < y, x, y)) -end - -isbadzero(::typeof(max), x::AbstractFloat) = (x == zero(x)) & signbit(x) -isbadzero(::typeof(min), x::AbstractFloat) = (x == zero(x)) & !signbit(x) -isbadzero(op, x) = false -isgoodzero(::typeof(max), x) = isbadzero(min, x) -isgoodzero(::typeof(min), x) = isbadzero(max, x) - -function mapreduce_impl(f, op::Union{typeof(max), typeof(min)}, - A::AbstractArrayOrBroadcasted, first::Int, last::Int) - a1 = @inbounds A[first] - v1 = mapreduce_first(f, op, a1) - v2 = v3 = v4 = v1 - chunk_len = 256 - start = first + 1 - simdstop = start + chunk_len - 4 - while simdstop <= last - 3 - @inbounds for i in start:4:simdstop - v1 = _fast(op, v1, f(A[i+0])) - v2 = _fast(op, v2, f(A[i+1])) - v3 = _fast(op, v3, f(A[i+2])) - v4 = _fast(op, v4, f(A[i+3])) - end - checkbounds(A, simdstop+3) - start += chunk_len - simdstop += chunk_len - end - v = op(op(v1,v2),op(v3,v4)) - for i in start:last - @inbounds ai = A[i] - v = op(v, f(ai)) - end - - # enforce correct order of 0.0 and -0.0 - # e.g. maximum([0.0, -0.0]) === 0.0 - # should hold - if isbadzero(op, v) - for i in first:last - x = @inbounds A[i] - isgoodzero(op,x) && return x - end - end - return v -end - """ maximum(f, itr; [init]) @@ -872,6 +819,8 @@ function _extrema_rf(x::NTuple{2,T}, y::NTuple{2,T}) where {T<:IEEEFloat} z1, z2 end +pairwise_blocksize(::ExtremaMap, ::typeof(_extrema_rf)) = typemax(Int) + ## findmax, findmin, argmax & argmin """ @@ -1379,3 +1328,58 @@ function _simple_count(::typeof(identity), x::Array{Bool}, init::T=0) where {T} end return n end + +_mapped_eltype(f, A) = _return_type(f, Tuple{@default_eltype(A)}) + +""" + Base._makefast_reduction(op, T) -> (pre, op_fast, post) + +Define a series of functions that can be used accelerate the computation of `op(x, y)`. +Incorrect results may be produced if it does not hold that `op(x::T, y::T)` and +`post(op_fast(pre(x::T), pre(y::T)))` produce equivalent results (and further, that this +holds for arbitrary-length reductions). This can be useful if `op_fast` is much faster +than `op` after the preprocessing provided by `pre`. + +The fallback return value for this method is `(identity, op, identity)`, corresponding +to no change to the processing. +""" +_makefast_reduction(op, T) = (identity, op, identity) + +# The idea behind the following functions is to transform `IEEEFloat` values to native `Signed` values that obey the same `min`/`max` behavior. +# Namely, the float values must match the semantics of `<` except that `-0.0 < +0.0` and `NaN` is preferred over any other value. +# In the case that both arguments are `NaN`, either value may be returned. +# If we reinterpret(Signed,::IEEEFloat), the signed-integer ordering is: +# -0.0, ... negative values in ascending magnitude ..., -Inf, -NaN, 0.0, ... positive values in ascending magnitude ..., +Inf, +NaN +# The desired ordering for min is: +# NaN (unspecified sign), -Inf, ... negative values in descending magnitude ..., -0.0, +0.0, ... positive values in ascending magnitude ..., +Inf +# The desired ordering for max is: +# -Inf, ... negative values in descending magnitude ..., -0.0, +0.0, ... positive values in ascending magnitude ..., +Inf, NaN (unspecified sign) +# Achieving this ordering requires two steps: +# 1) flip the ordering of all values with a negative sign +# 2) circularly shift the values so that NaNs are together at the correct end of the spectrum +# The following functions perform this transformation and reverse it. +# Under this scheme, we place a total order on every value (including NaNs), although the ordering of NaNs is an implementation detail. +for (T, S) in ((Float16, Int16), (Float32, Int32), (Float64, Int64)) + topval = flipifneg(reinterpret(S, typemax(T))) + botval = flipifneg(reinterpret(S, typemin(T))) + offset_min = typemax(topval) - topval # adding this to a int-interpreted float will place typemax(T) at the top + offset_max = typemin(botval) - botval # adding this to a int-interpreted float will place typemin(T) at the bottom + + for (op, offset) in ((min, offset_min), (max, offset_max)) + @eval function _makefast_reduction(::typeof($op), ::Type{$T}) + preprocess(x::$T) = reinterpret($T, flipifneg(reinterpret($S, x)) + $offset) + reduction(x::$T, y::$T) = reinterpret($T, $op(reinterpret($S, x), reinterpret($S, y))) + postprocess(x::$T) = reinterpret($T, flipifneg(reinterpret($S, x) - $offset)) + return preprocess, reduction, postprocess + end + end + + @eval function _makefast_reduction(::typeof(_extrema_rf), ::Type{NTuple{2,$T}}) # used for extrema + premin, reducemin, postmin = _makefast_reduction(min, $T) + premax, reducemax, postmax = _makefast_reduction(max, $T) + preprocess((x, y)::NTuple{2,$T}) = (premin(x), premax(y)) + reduction((x1, y1)::NTuple{2,$T}, (x2, y2)::NTuple{2,$T}) = (reducemin(x1, x2), reducemax(y1, y2)) + postprocess((x, y)::NTuple{2,$T}) = (postmin(x), postmax(y)) + return preprocess, reduction, postprocess + end +end diff --git a/base/reducedim.jl b/base/reducedim.jl index dc34b4feb1f6a..baa7549d05d45 100644 --- a/base/reducedim.jl +++ b/base/reducedim.jl @@ -301,14 +301,17 @@ function _mapreducedim!(f, op, R::AbstractArray, A::AbstractArrayOrBroadcasted) keep, Idefault = Broadcast.shapeindexer(indsRt) if reducedim1(R, A) # keep the accumulator as a local variable when reducing along the first dimension + Eltype = _mapped_eltype(f, A) + Eltype = Eltype === eltype(R) ? Eltype : Any + pre, op_fast, post = _makefast_reduction(op, Eltype) i1 = first(axes1(R)) @inbounds for IA in CartesianIndices(indsAt) IR = Broadcast.newindex(IA, keep, Idefault) - r = R[i1,IR] + r = pre(R[i1,IR]) @simd for i in axes(A, 1) - r = op(r, f(A[i, IA])) + r = op_fast(r, pre(f(A[i, IA]))) end - R[i1,IR] = r + R[i1,IR] = post(r) end else @inbounds for IA in CartesianIndices(indsAt) diff --git a/base/reinterpretarray.jl b/base/reinterpretarray.jl index 1fe0788a1739a..af8c6eeb4f718 100644 --- a/base/reinterpretarray.jl +++ b/base/reinterpretarray.jl @@ -757,23 +757,25 @@ end @noinline function mapreduce_impl(f::F, op::OP, A::AbstractArrayOrBroadcasted, ifirst::SCI, ilast::SCI, blksize::Int) where {F,OP,SCI<:SCartesianIndex2{K}} where K + Eltype = _mapped_eltype(f, A) + pre, op_fast, post = _makefast_reduction(op, Eltype) if ilast.j - ifirst.j < blksize # sequential portion @inbounds a1 = A[ifirst] @inbounds a2 = A[SCI(2,ifirst.j)] - v = op(f(a1), f(a2)) + v = op_fast(pre(f(a1)), pre(f(a2))) @simd for i = ifirst.i + 2 : K @inbounds ai = A[SCI(i,ifirst.j)] - v = op(v, f(ai)) + v = op_fast(v, pre(f(ai))) end # Remaining columns for j = ifirst.j+1 : ilast.j @simd for i = 1:K @inbounds ai = A[SCI(i,j)] - v = op(v, f(ai)) + v = op_fast(v, pre(f(ai))) end end - return v + return post(v) else # pairwise portion jmid = ifirst.j + (ilast.j - ifirst.j) >> 1