Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Iterator to loop over vector using VecRange #112

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,8 @@ using SIMD
function vadd!(xs::Vector{T}, ys::Vector{T}, ::Type{Vec{N,T}}) where {N, T}
@assert length(ys) == length(xs)
@assert length(xs) % N == 0
lane = VecRange{N}(0)
@inbounds for i in 1:N:length(xs)
xs[lane + i] += ys[lane + i]
@inbounds for lane in LoopVecRange{N}(xs)
xs[lane] += ys[lane]
end
end
```
Expand Down
2 changes: 1 addition & 1 deletion src/SIMD.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ using Base: @propagate_inbounds

export Vec, vload, vloada, vloadnt, vloadx, vstore, vstorea, vstorent, vstorec,
vgather, vgathera, vscatter, vscattera, shufflevector, vifelse, valloc,
VecRange
VecRange, LoopVecRange

# Short alias for `Base.VecElement`.
const VE = Base.VecElement
# `LVec{N, T}` is an `N`-tuple whose entries are all `VE{T}` (i.e. `Base.VecElement{T}`).
const LVec{N, T} = NTuple{N, VE{T}}
Expand Down
57 changes: 57 additions & 0 deletions src/arrayops.jl
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,63 @@ Base.checkindex(::Type{Bool}, inds::AbstractUnitRange, idx::VecRange) =
# Bounds check for indexing with a `Vec` of indices: every lane of `idx` must lie
# within the closed interval `[first(inds), last(inds)]`.
function Base.checkindex(::Type{Bool}, inds::AbstractUnitRange, idx::Vec)
    lo = first(inds)
    hi = last(inds)
    return all(lo <= idx) && all(idx <= hi)
end

export LoopVecRange

"""
    LoopVecRange{N}(start::Int, stop::Int; unsafe=false)

Analogous to `UnitRange`, but for iterating over a vector with SIMD vectors of
width `N`. `N` acts as the step of the range and must be non-zero.

# Keywords
- `unsafe=false`: when `false`, the number of indices spanned by `start` and `stop`
  (`abs(stop - start) + 1`) must be a multiple of `N`. The canonical empty range
  (`stop == start - sign(N)`, e.g. `1:0`) is always accepted. The check is wrapped in
  `@boundscheck`, so it can be elided when the constructor is inlined into an
  `@inbounds` context. When `true`, the check is skipped entirely.

# Throws
- `ArgumentError` if `N == 0`.
- `ArgumentError` if the length check described above fails.

# Examples
```julia
function vadd!(xs::Vector{T}, ys::Vector{T}, ::Type{Vec{N,T}}) where {N, T}
    for lane in LoopVecRange{N}(xs)
        xs[lane] += ys[lane]
    end
end
```
"""
struct LoopVecRange{N} <: AbstractRange{Int}
    start::Int
    stop::Int

    Base.@propagate_inbounds function LoopVecRange{N}(start::Int, stop::Int; unsafe=false) where N
        N == 0 && throw(ArgumentError("Step cannot be zero"))

        if !unsafe
            @boundscheck begin
                # `start:start-1` (or `start:start+1` when stepping backwards) is the
                # canonical empty range, e.g. what the indices of an empty vector
                # produce. It contains zero lanes, which is trivially a multiple of
                # the width, so it must not be rejected.
                canonical_empty = stop == start - sign(N)
                if !canonical_empty && (abs(stop - start) + 1) % N != 0
                    throw(ArgumentError("Length of range, has to be a multiple of the width"))
                end
            end
        end

        return new{N}(start, stop)
    end
end

# Convenience constructors. Each one extracts integer endpoints from its argument
# and forwards them, together with `unsafe`, to `LoopVecRange{N}(start, stop)`.

# From a `Base.OneTo`: indices run from 1 to `r.stop`.
Base.@propagate_inbounds function LoopVecRange{N}(r::Base.OneTo; unsafe=false) where N
    return LoopVecRange{N}(1, r.stop, unsafe=unsafe)
end

# From a `UnitRange`: indices run from `r.start` to `r.stop`.
Base.@propagate_inbounds function LoopVecRange{N}(r::UnitRange; unsafe=false) where N
    return LoopVecRange{N}(r.start, r.stop, unsafe=unsafe)
end

# From any other vector: iterate over its `eachindex`.
Base.@propagate_inbounds function LoopVecRange{N}(x::AbstractVector; unsafe=false) where N
    indices = eachindex(x)
    return LoopVecRange{N}(indices, unsafe=unsafe)
end

# A range is empty when its endpoints differ and `stop` lies on the wrong side of
# `start` for the sign of the step `N`. Equal endpoints are never empty.
function Base.isempty(r::LoopVecRange{N}) where N
    r.start == r.stop && return false
    ascending_step = N > zero(N)
    ascending_bounds = r.stop > r.start
    return ascending_step != ascending_bounds
end

# The step between consecutive lanes is the type parameter `N` itself.
function Base.step(r::LoopVecRange{N}) where N
    return N
end

# A `LoopVecRange` never reports offset axes.
function Base.has_offset_axes(::LoopVecRange)
    return false
end

# First lane. For a negative step the lane is shifted by `N + 1` so that it ends,
# rather than begins, at `r.start`; the lane width is always `abs(N)`.
function Base.first(r::LoopVecRange{N}) where N
    shift = N < 0 ? N + 1 : 0
    return VecRange{abs(N)}(r.start + shift)
end

# Last lane. For a positive step the lane is shifted by `-N + 1` so that it ends at
# `r.stop`; for a negative step it begins at `r.stop`.
function Base.last(r::LoopVecRange{N}) where N
    shift = N > 0 ? -N + 1 : 0
    return VecRange{abs(N)}(r.stop + shift)
end

# Start of iteration: an empty range yields nothing; otherwise the first lane is
# both the produced value and the iteration state.
function Base.iterate(r::LoopVecRange)
    isempty(r) && return nothing
    head = first(r)
    return (head, head)
end

# Advance from lane `i` by one step. The comparison against the last lane is
# inclusive (`>=` / `<=`), which prevents an infinite loop if the length of the
# range is not a multiple of the width.
function Base.iterate(r::LoopVecRange{N}, i::VecRange) where N
    @inline
    final = last(r).i
    finished = if N > zero(N)
        i.i >= final
    elseif N < zero(N)
        i.i <= final
    else
        false
    end
    finished && return nothing
    successor = i + step(r)
    return (successor, successor)
end

# Number of lanes produced by iterating `r`.
#
# The span is measured as `abs(stop - start) + 1` and divided by the lane width
# `abs(N)`, so forward and reverse ranges are counted the same way. `cld` rounds up
# because a range built with `unsafe=true` may end in a partial lane, which iteration
# still visits. Empty ranges report 0 rather than a negative or non-zero count.
function Base.length(r::LoopVecRange{N}) where N
    isempty(r) && return 0
    return cld(abs(r.stop - r.start) + 1, abs(N))
end
# Element type produced by iteration. `first`, `last` and `iterate` all build
# `VecRange{abs(N)}` values, so the width must be `abs(N)` here as well; otherwise
# the declared eltype is wrong for reverse ranges (negative `N`).
Base.eltype(::Type{LoopVecRange{N}}) where N = VecRange{abs(N)}

# Print the range as `<first lane>:<last lane>`, using `repr` for both lanes.
function Base.show(io::IO, r::LoopVecRange)
    head = repr(first(r))
    tail = repr(last(r))
    return print(io, head, ':', tail)
end

# Arity check: this method matches only when the number of trailing arguments
# equals the dimensionality `N` of the array, in which case it does nothing.
@inline function _checkarity(::AbstractArray{<:Any,N}, ::Vararg{Any,N}) where {N}
    return nothing
end

Expand Down
201 changes: 201 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -870,6 +870,207 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...))

end

# End-to-end tests that drive forward `LoopVecRange` iteration through small SIMD
# kernels and then inspect the generated LLVM IR for the expected vector instructions.
# NOTE(review): `L4`, `V4F64`, `valloc` and `llvm_ir` are defined outside this section.
@testset "LoopVecRange Real-world examples" begin

# In-place `xs .+= ys`, one lane of width N at a time.
# The asserts require equal lengths and a length divisible by N.
function vadd!(xs::AbstractArray{T,1}, ys::AbstractArray{T,1},
::Type{Vec{N,T}}) where {N,T}
@assert length(ys) == length(xs)
@assert length(xs) % N == 0
@inbounds for lane in LoopVecRange{N}(xs)
xs[lane] += ys[lane]
end
end

# xs holds 1, 2, ..., 4*L4 and ys holds all ones, so the expected result is i + 1.
let xs = valloc(Float64, L4, 4*L4) do i i end,
ys = valloc(Float64, L4, 4*L4) do i 1 end
vadd!(xs, ys, V4F64)
@test xs == Float64[i+1 for i in 1:(4*L4)]
# @code_native vadd!(xs, ys, V4F64)

# The IR must contain two 4-wide double loads, a 4-wide store and a 4-wide fadd.
ir = llvm_ir(vadd!, (xs, ys, V4F64))
@test occursin(r"( load <4 x double>.*){2}"s, ir)
@test occursin(" store <4 x double>", ir)
@test occursin(" fadd <4 x double>", ir)
end

# Sum of xs: accumulate lane-wise into a Vec, then reduce it with `sum`.
function vsum(xs::AbstractArray{T,1}, ::Type{Vec{N,T}}) where {N,T}
@assert length(xs) % N == 0
sv = Vec{N,T}(0)
@inbounds for lane in LoopVecRange{N}(xs)
sv += xs[lane]
end
sum(sv)
end

# Expected value is the closed form n(n+1)/2 for n = 4*L4.
let xs = valloc(Float64, L4, 4*L4) do i i end
s = vsum(xs, V4F64)
@test s === (x->(x^2+x)/2)(Float64(4*L4))
# @code_native vsum(xs, V4F64)

ir = llvm_ir(vsum, (xs, V4F64))
@test occursin(" load <4 x double>", ir)
@test occursin(" fadd <4 x double>", ir)
# @test occursin(r"( shufflevector <4 x double>.*){2}"s, ir)
end


# Like `vadd!`, but accepts lengths that are not a multiple of N:
# `unsafe=true` skips the constructor's length check, full lanes use plain
# indexing, and a lane that would run past the end uses masked indexing.
function vadd_masked!(xs::AbstractArray{T,1}, ys::AbstractArray{T,1},
::Type{Vec{N,T}}) where {N, T}
@assert length(ys) == length(xs)
# Largest lane start index for which a full lane of width N still fits.
limit = length(xs) - (N-1)
# Per-element limits (len, len-1, ..., len-N+1) compared against lane.i to build the mask.
vlimit = Vec(ntuple(i -> length(xs) - i + 1, Val(N)))
@inbounds for lane in LoopVecRange{N}(xs, unsafe=true)
if lane.i <= limit
xs[lane] += ys[lane]
else
mask = Vec{N,Int}(lane.i) <= vlimit
xs[lane, mask] = xs[lane, mask] + ys[lane, mask]
end
end
end

# Length 13 is not a multiple of 4, so the last lane goes through the masked branch.
let xs = valloc(Float64, 4, 13) do i i end,
ys = valloc(Float64, 4, 13) do i 1 end
vadd_masked!(xs, ys, V4F64)
@test xs == Float64[i+1 for i in 1:13]
# @code_native vadd!(xs, ys, V4F64)

# Both the masked and the unmasked code paths must show up in the IR.
ir = llvm_ir(vadd_masked!, (xs, ys, V4F64))
@test occursin(r"(masked.load.v4f64.*){2}"s, ir)
@test occursin("masked.store.v4f64", ir)
@test occursin(" store <4 x double>", ir)
@test occursin(" fadd <4 x double>", ir)
end

# Sum of xs for arbitrary length: every lane is read through a masked index.
function vsum_masked(xs::AbstractArray{T,1}, ::Type{Vec{N,T}}) where {N,T}
vlimit = Vec(ntuple(i -> length(xs) - i + 1, Val(N)))
sv = Vec{N,T}(0)
@inbounds for lane in LoopVecRange{N}(xs, unsafe=true)
mask = Vec{N,Int}(lane.i) <= vlimit
sv += xs[lane, mask]
end
sum(sv)
end

let xs = valloc(Float64, 4, 13) do i i end
s = vsum_masked(xs, V4F64)
# @code_llvm vsum(xs, V4F64)
# @code_native vsum(xs, V4F64)
@test s === sum(xs)

ir = llvm_ir(vsum_masked, (xs, V4F64))
@test occursin("masked.load.v4f64", ir)
@test occursin(" fadd <4 x double>", ir)
# @test occursin(r"( shufflevector <4 x double>.*){2}"s, ir)
end

end

# Constructor validation: a zero width and spans that are not a multiple of the
# width must be rejected with an `ArgumentError`, for both ascending and
# descending endpoints.
@testset "LoopVecRange" begin
    # (width, start, stop) combinations the constructor has to refuse.
    rejected = (
        (0, 1, 8),
        (2, 1, 3),
        (4, 3, 3),
        (0, 8, 1),
        (2, 3, 1),
    )
    for (width, start, stop) in rejected
        @test_throws ArgumentError LoopVecRange{width}(start, stop)
    end
end

# Same kernels as the forward "real-world" tests, but iterating from the end of the
# vector towards the start with a negative width parameter (`LoopVecRange{-N}`).
# NOTE(review): `L4`, `V4F64`, `valloc` and `llvm_ir` are defined outside this section.
@testset "Reverse LoopVecRange Real-world examples" begin

# In-place `xs .+= ys`, visiting lanes from index length(xs) down to 1.
function vadd!(xs::AbstractArray{T,1}, ys::AbstractArray{T,1},
::Type{Vec{N,T}}) where {N,T}
@assert length(ys) == length(xs)
@assert length(xs) % N == 0
@inbounds for lane in LoopVecRange{-N}(length(xs), 1)
xs[lane] += ys[lane]
end
end

# xs holds 1, 2, ..., 4*L4 and ys holds all ones, so the expected result is i + 1.
let xs = valloc(Float64, L4, 4*L4) do i i end,
ys = valloc(Float64, L4, 4*L4) do i 1 end
vadd!(xs, ys, V4F64)
@test xs == Float64[i+1 for i in 1:(4*L4)]
# @code_native vadd!(xs, ys, V4F64)

# The IR must contain two 4-wide double loads, a 4-wide store and a 4-wide fadd.
ir = llvm_ir(vadd!, (xs, ys, V4F64))
@test occursin(r"( load <4 x double>.*){2}"s, ir)
@test occursin(" store <4 x double>", ir)
@test occursin(" fadd <4 x double>", ir)
end

# Sum of xs, accumulated lane-wise in reverse order and reduced with `sum`.
function vsum(xs::AbstractArray{T,1}, ::Type{Vec{N,T}}) where {N,T}
@assert length(xs) % N == 0
sv = Vec{N,T}(0)
@inbounds for lane in LoopVecRange{-N}(length(xs), 1)
sv += xs[lane]
end
sum(sv)
end

# Expected value is the closed form n(n+1)/2 for n = 4*L4.
let xs = valloc(Float64, L4, 4*L4) do i i end
s = vsum(xs, V4F64)
@test s === (x->(x^2+x)/2)(Float64(4*L4))
# @code_native vsum(xs, V4F64)

ir = llvm_ir(vsum, (xs, V4F64))
@test occursin(" load <4 x double>", ir)
@test occursin(" fadd <4 x double>", ir)
# @test occursin(r"( shufflevector <4 x double>.*){2}"s, ir)
end


# Reverse masked add for lengths that are not a multiple of N.
# The start is padded to length(xs) + (N - 1); with a negative width, `first`
# subtracts N - 1 again, so the first lane begins at index length(xs) and is masked.
function vadd_masked!(xs::AbstractArray{T,1}, ys::AbstractArray{T,1},
::Type{Vec{N,T}}) where {N, T}
@assert length(ys) == length(xs)
# Largest lane start index for which a full lane of width N still fits.
limit = length(xs) - (N-1)
# Per-element limits (len, len-1, ..., len-N+1) compared against lane.i to build the mask.
vlimit = Vec(ntuple(i -> length(xs) - i + 1, Val(N)))
@inbounds for lane in LoopVecRange{-N}(length(xs) + (N - 1), 1, unsafe=true)
if lane.i <= limit
xs[lane] += ys[lane]
else
mask = Vec{N,Int}(lane.i) <= vlimit
xs[lane, mask] = xs[lane, mask] + ys[lane, mask]
end
end
end

# Length 13 is not a multiple of 4, so the masked branch is exercised.
let xs = valloc(Float64, 4, 13) do i i end,
ys = valloc(Float64, 4, 13) do i 1 end
vadd_masked!(xs, ys, V4F64)
@test xs == Float64[i+1 for i in 1:13]
# @code_native vadd!(xs, ys, V4F64)

# Both the masked and the unmasked code paths must show up in the IR.
ir = llvm_ir(vadd_masked!, (xs, ys, V4F64))
@test occursin(r"(masked.load.v4f64.*){2}"s, ir)
@test occursin("masked.store.v4f64", ir)
@test occursin(" store <4 x double>", ir)
@test occursin(" fadd <4 x double>", ir)
end

# Reverse masked sum: every lane is read through a masked index.
function vsum_masked(xs::AbstractArray{T,1}, ::Type{Vec{N,T}}) where {N,T}
vlimit = Vec(ntuple(i -> length(xs) - i + 1, Val(N)))
sv = Vec{N,T}(0)
@inbounds for lane in LoopVecRange{-N}(length(xs) + (N - 1), 1, unsafe=true)
mask = Vec{N,Int}(lane.i) <= vlimit
sv += xs[lane, mask]
end
sum(sv)
end

let xs = valloc(Float64, 4, 13) do i i end
s = vsum_masked(xs, V4F64)
# @code_llvm vsum(xs, V4F64)
# @code_native vsum(xs, V4F64)
@test s === sum(xs)

ir = llvm_ir(vsum_masked, (xs, V4F64))
@test occursin("masked.load.v4f64", ir)
@test occursin(" fadd <4 x double>", ir)
# @test occursin(r"( shufflevector <4 x double>.*){2}"s, ir)
end

end

@testset "Vector shuffles" begin

for T in (Int8,UInt8,Int16,UInt16,Int32,UInt32,Int64,UInt64,Float32,Float64)
Expand Down