Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Faster minimum/maximum/extrema #45581

Draft
wants to merge 12 commits into
base: master
Choose a base branch
from
20 changes: 10 additions & 10 deletions base/float.jl
Original file line number Diff line number Diff line change
Expand Up @@ -537,19 +537,19 @@ end

isequal(x::T, y::T) where {T<:IEEEFloat} = fpiseq(x, y)

# interpret as sign-magnitude integer
@inline function _fpint(x)
IntT = inttype(typeof(x))
ix = reinterpret(IntT, x)
return ifelse(ix < zero(IntT), ix ⊻ typemax(IntT), ix)
end

@inline function isless(a::T, b::T) where T<:IEEEFloat
LilithHafner marked this conversation as resolved.
Show resolved Hide resolved
(isnan(a) || isnan(b)) && return !isnan(a)

return _fpint(a) < _fpint(b)
botval = flipifneg(reinterpret(Signed, typemin(T)))
topval = flipifneg(reinterpret(Signed, typemax(T)))
offset = typemin(botval) - botval # adding offset will place typemin(T) at the bottom, wrapping NaN to the top
u = flipifneg(reinterpret(Signed, a)) + offset
v = flipifneg(reinterpret(Signed, b)) + offset
return (u < v) & (u <= topval+offset) # (a < b) & (a <= Inf), with the proper signed zero handling
end

# if negative, flip all bits except the top bit
# used for permuting the order of IEEEFloat values
flipifneg(x::BitSigned) = xor(x, ifelse(signbit(x), typemax(x), zero(x)))

# Exact Float (Tf) vs Integer (Ti) comparisons
# Assumes:
# - typemax(Ti) == 2^n-1
Expand Down
126 changes: 65 additions & 61 deletions base/reduce.jl
Original file line number Diff line number Diff line change
Expand Up @@ -252,19 +252,21 @@ foldr(op, itr; kw...) = mapfoldr(identity, op, itr; kw...)
# certain `op` (e.g. `min` and `max`) may have their own specialized versions.
@noinline function mapreduce_impl(f, op, A::AbstractArrayOrBroadcasted,
ifirst::Integer, ilast::Integer, blksize::Int)
Eltype = _mapped_eltype(f, A)
pre, op_fast, post = _makefast_reduction(op, Eltype)
if ifirst == ilast
@inbounds a1 = A[ifirst]
return mapreduce_first(f, op, a1)
elseif ilast - ifirst < blksize
# sequential portion
@inbounds a1 = A[ifirst]
@inbounds a2 = A[ifirst+1]
v = op(f(a1), f(a2))
v = op_fast(pre(f(a1)), pre(f(a2)))
@simd for i = ifirst + 2 : ilast
@inbounds ai = A[i]
v = op(v, f(ai))
v = op_fast(v, pre(f(ai)))
end
return v
return post(v)
else
# pairwise portion
imid = ifirst + (ilast - ifirst) >> 1
Expand Down Expand Up @@ -314,6 +316,9 @@ pairwise_blocksize(f, op) = 1024
# This combination appears to show a benefit from a larger block size
pairwise_blocksize(::typeof(abs2), ::typeof(+)) = 4096

# The following operations prefer non-pairwise reduction.
pairwise_blocksize(_, ::Union{typeof(min),typeof(max)}) = typemax(Int)
pairwise_blocksize(_, ::Union{typeof(|),typeof(&)}) = typemax(Int)

# handling empty arrays
_empty_reduce_error() = throw(ArgumentError("reducing over an empty collection is not allowed"))
Expand Down Expand Up @@ -619,64 +624,6 @@ julia> prod(1:5; init = 1.0)
"""
prod(a; kw...) = mapreduce(identity, mul_prod, a; kw...)

## maximum, minimum, & extrema
_fast(::typeof(min),x,y) = min(x,y)
_fast(::typeof(max),x,y) = max(x,y)
function _fast(::typeof(max), x::AbstractFloat, y::AbstractFloat)
ifelse(isnan(x),
x,
ifelse(x > y, x, y))
end

function _fast(::typeof(min),x::AbstractFloat, y::AbstractFloat)
ifelse(isnan(x),
x,
ifelse(x < y, x, y))
end

isbadzero(::typeof(max), x::AbstractFloat) = (x == zero(x)) & signbit(x)
isbadzero(::typeof(min), x::AbstractFloat) = (x == zero(x)) & !signbit(x)
isbadzero(op, x) = false
isgoodzero(::typeof(max), x) = isbadzero(min, x)
isgoodzero(::typeof(min), x) = isbadzero(max, x)

function mapreduce_impl(f, op::Union{typeof(max), typeof(min)},
A::AbstractArrayOrBroadcasted, first::Int, last::Int)
a1 = @inbounds A[first]
v1 = mapreduce_first(f, op, a1)
v2 = v3 = v4 = v1
chunk_len = 256
start = first + 1
simdstop = start + chunk_len - 4
while simdstop <= last - 3
@inbounds for i in start:4:simdstop
v1 = _fast(op, v1, f(A[i+0]))
v2 = _fast(op, v2, f(A[i+1]))
v3 = _fast(op, v3, f(A[i+2]))
v4 = _fast(op, v4, f(A[i+3]))
end
checkbounds(A, simdstop+3)
start += chunk_len
simdstop += chunk_len
end
v = op(op(v1,v2),op(v3,v4))
for i in start:last
@inbounds ai = A[i]
v = op(v, f(ai))
end

# enforce correct order of 0.0 and -0.0
# e.g. maximum([0.0, -0.0]) === 0.0
# should hold
if isbadzero(op, v)
for i in first:last
x = @inbounds A[i]
isgoodzero(op,x) && return x
end
end
return v
end

"""
maximum(f, itr; [init])

Expand Down Expand Up @@ -872,6 +819,8 @@ function _extrema_rf(x::NTuple{2,T}, y::NTuple{2,T}) where {T<:IEEEFloat}
z1, z2
end

pairwise_blocksize(::ExtremaMap, ::typeof(_extrema_rf)) = typemax(Int)

## findmax, findmin, argmax & argmin

"""
Expand Down Expand Up @@ -1379,3 +1328,58 @@ function _simple_count(::typeof(identity), x::Array{Bool}, init::T=0) where {T}
end
return n
end

_mapped_eltype(f, A) = _return_type(f, Tuple{@default_eltype(A)})

"""
Base._makefast_reduction(op, T) -> (pre, op_fast, post)

Define a series of functions that can be used accelerate the computation of `op(x, y)`.
Incorrect results may be produced if it does not hold that `op(x::T, y::T)` and
`post(op_fast(pre(x::T), pre(y::T)))` produce equivalent results (and further, that this
holds for arbitrary-length reductions). This can be useful if `op_fast` is much faster
than `op` after the preprocessing provided by `pre`.

The fallback return value for this method is `(identity, op, identity)`, corresponding
to no change to the processing.
"""
_makefast_reduction(op, T) = (identity, op, identity)

# The idea behind the following functions is to transform `IEEEFloat` values to native `Signed` values that obey the same `min`/`max` behavior.
# Namely, the float values must match the semantics of `<` except that `-0.0 < +0.0` and `NaN` is preferred over any other value.
# In the case that both arguments are `NaN`, either value may be returned.
# If we reinterpret(Signed,::IEEEFloat), the signed-integer ordering is:
# -0.0, ... negative values in ascending magnitude ..., -Inf, -NaN, 0.0, ... positive values in ascending magnitude ..., +Inf, +NaN
# The desired ordering for min is:
# NaN (unspecified sign), -Inf, ... negative values in descending magnitude ..., -0.0, +0.0, ... positive values in ascending magnitude ..., +Inf
# The desired ordering for max is:
# -Inf, ... negative values in descending magnitude ..., -0.0, +0.0, ... positive values in ascending magnitude ..., +Inf, NaN (unspecified sign)
# Achieving this ordering requires two steps:
# 1) flip the ordering of all values with a negative sign
# 2) circularly shift the values so that NaNs are together at the correct end of the spectrum
# The following functions perform this transformation and reverse it.
# Under this scheme, we place a total order on every value (including NaNs), although the ordering of NaNs is an implementation detail.
for (T, S) in ((Float16, Int16), (Float32, Int32), (Float64, Int64))
topval = flipifneg(reinterpret(S, typemax(T)))
botval = flipifneg(reinterpret(S, typemin(T)))
offset_min = typemax(topval) - topval # adding this to a int-interpreted float will place typemax(T) at the top
offset_max = typemin(botval) - botval # adding this to a int-interpreted float will place typemin(T) at the bottom

for (op, offset) in ((min, offset_min), (max, offset_max))
@eval function _makefast_reduction(::typeof($op), ::Type{$T})
preprocess(x::$T) = reinterpret($T, flipifneg(reinterpret($S, x)) + $offset)
reduction(x::$T, y::$T) = reinterpret($T, $op(reinterpret($S, x), reinterpret($S, y)))
postprocess(x::$T) = reinterpret($T, flipifneg(reinterpret($S, x) - $offset))
return preprocess, reduction, postprocess
end
end

@eval function _makefast_reduction(::typeof(_extrema_rf), ::Type{NTuple{2,$T}}) # used for extrema
premin, reducemin, postmin = _makefast_reduction(min, $T)
premax, reducemax, postmax = _makefast_reduction(max, $T)
preprocess((x, y)::NTuple{2,$T}) = (premin(x), premax(y))
reduction((x1, y1)::NTuple{2,$T}, (x2, y2)::NTuple{2,$T}) = (reducemin(x1, x2), reducemax(y1, y2))
postprocess((x, y)::NTuple{2,$T}) = (postmin(x), postmax(y))
return preprocess, reduction, postprocess
end
end
9 changes: 6 additions & 3 deletions base/reducedim.jl
Original file line number Diff line number Diff line change
Expand Up @@ -301,14 +301,17 @@ function _mapreducedim!(f, op, R::AbstractArray, A::AbstractArrayOrBroadcasted)
keep, Idefault = Broadcast.shapeindexer(indsRt)
if reducedim1(R, A)
# keep the accumulator as a local variable when reducing along the first dimension
Eltype = _mapped_eltype(f, A)
Eltype = Eltype === eltype(R) ? Eltype : Any
pre, op_fast, post = _makefast_reduction(op, Eltype)
i1 = first(axes1(R))
@inbounds for IA in CartesianIndices(indsAt)
IR = Broadcast.newindex(IA, keep, Idefault)
r = R[i1,IR]
r = pre(R[i1,IR])
@simd for i in axes(A, 1)
r = op(r, f(A[i, IA]))
r = op_fast(r, pre(f(A[i, IA])))
end
R[i1,IR] = r
R[i1,IR] = post(r)
end
else
@inbounds for IA in CartesianIndices(indsAt)
Copy link
Contributor Author

@mikmoore mikmoore Jan 7, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't it be possible to apply the same series of transformations when reducing along non-first dimensions? Or am I missing where that has already been handled?

EDIT: maybe now I see. This was your comment about the destination array possibly being of a different type. Could this still be done if it has a same-size type (or at least the exact same type) so that they are reinterpret-able?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, exactly. And all the optimizations here are reinterpret-able.

Copy link
Member

@N5N3 N5N3 Jan 7, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I dont think non-dim1 cases would be accelerated a lot as we will have to add a map!(pre, dest, dest) berfore the kernal loop, and a map!(post, dest, dest) after it.
reinterpret might be 0-cost, but flipneg+add offset+mem access not.

And non dim1 minimum/maximum is vectorlized quite well

julia> A = randn(512, 512);

julia> @btime minimum($A, dims = 1);
  65.400 μs (7 allocations: 4.33 KiB)

julia> @btime minimum($A, dims = 2);
  56.500 μs (7 allocations: 4.33 KiB)

On my PC it's faster than dim1 with optimizaiton.

Copy link
Member

@N5N3 N5N3 Jan 7, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A simple bench

f(b, c) = begin
    pre, op, post = Base._makefast_reducution(min, Float64)
    Base.initarray!(b, identity, min, true, c)
    map!(pre, b, b)
    Base.mapreducedim!(pre, op, b, c);
    map!(post, b, b)
end

A = randn(512, 512); B = randn(512);
----------------------------------------
julia> @btime f($B, $A);
  58.500 μs (0 allocations: 0 bytes)

julia> @btime minimum!($B, $A);
  57.700 μs (0 allocations: 0 bytes)

EDIT: profile shows that the overhead of map!(pre, b, b) & map!(post, b, b) is neglible is this case.

Expand Down
10 changes: 6 additions & 4 deletions base/reinterpretarray.jl
Original file line number Diff line number Diff line change
Expand Up @@ -757,23 +757,25 @@ end

@noinline function mapreduce_impl(f::F, op::OP, A::AbstractArrayOrBroadcasted,
ifirst::SCI, ilast::SCI, blksize::Int) where {F,OP,SCI<:SCartesianIndex2{K}} where K
Eltype = _mapped_eltype(f, A)
pre, op_fast, post = _makefast_reduction(op, Eltype)
if ilast.j - ifirst.j < blksize
# sequential portion
@inbounds a1 = A[ifirst]
@inbounds a2 = A[SCI(2,ifirst.j)]
v = op(f(a1), f(a2))
v = op_fast(pre(f(a1)), pre(f(a2)))
@simd for i = ifirst.i + 2 : K
@inbounds ai = A[SCI(i,ifirst.j)]
v = op(v, f(ai))
v = op_fast(v, pre(f(ai)))
end
# Remaining columns
for j = ifirst.j+1 : ilast.j
@simd for i = 1:K
@inbounds ai = A[SCI(i,j)]
v = op(v, f(ai))
v = op_fast(v, pre(f(ai)))
end
end
return v
return post(v)
else
# pairwise portion
jmid = ifirst.j + (ilast.j - ifirst.j) >> 1
Expand Down