From 4354d9cce3ff173a1e1162f0d7d2fb410fe8a166 Mon Sep 17 00:00:00 2001 From: Zentrik Date: Fri, 7 Apr 2023 15:40:14 +0100 Subject: [PATCH 1/6] Add LoopRangeVec --- src/SIMD.jl | 2 +- src/arrayops.jl | 55 +++++++++++++++++++++++ test/runtests.jl | 113 +++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 165 insertions(+), 5 deletions(-) diff --git a/src/SIMD.jl b/src/SIMD.jl index 12f9f97..ad02888 100644 --- a/src/SIMD.jl +++ b/src/SIMD.jl @@ -4,7 +4,7 @@ using Base: @propagate_inbounds export Vec, vload, vloada, vloadnt, vloadx, vstore, vstorea, vstorent, vstorec, vgather, vgathera, vscatter, vscattera, shufflevector, vifelse, valloc, - VecRange + VecRange, LoopVecRange const VE = Base.VecElement const LVec{N, T} = NTuple{N, VE{T}} diff --git a/src/arrayops.jl b/src/arrayops.jl index f696543..1de3795 100644 --- a/src/arrayops.jl +++ b/src/arrayops.jl @@ -228,6 +228,61 @@ Base.checkindex(::Type{Bool}, inds::AbstractUnitRange, idx::VecRange) = Base.checkindex(::Type{Bool}, inds::AbstractUnitRange, idx::Vec) = all(first(inds) <= idx) && all(idx <= last(inds)) +export LoopVecRange + +""" + LoopVecRange{N}(start::Int, stop::Int) +Analogous to `UnitRange` but for iterating over a vector with SIMD vectors of width `N`. 
+# Examples +```jldoctest +julia> xs = ones(4); +julia> xs[VecRange{4}(1)] # calls `vload(Vec{4,Float64}, xs, 1)` +<4 x Float64>[1.0, 1.0, 1.0, 1.0] +``` +""" +struct LoopVecRange{N} <: AbstractUnitRange{Int} + start::Int + stop::Int + + Base.@propagate_inbounds function LoopVecRange{N}(start::Int, stop::Int; unsafe=false) where N + N <= 0 && throw(ArgumentError("Width cannot be less than 1")) + + if !unsafe + @boundscheck (abs(stop - start) + 1) % N != 0 && throw(ArgumentError("Length of range, has to be a multiple of the width")) + + @boundscheck stop < start && throw(ArgumentError("Stop cannot be less than start")) + end + + return new{N}(start, stop) + end +end + +Base.@propagate_inbounds LoopVecRange{N}(r::Base.OneTo; unsafe=false) where N = LoopVecRange{N}(1, r.stop, unsafe=unsafe) +Base.@propagate_inbounds LoopVecRange{N}(r::UnitRange; unsafe=false) where N = LoopVecRange{N}(r.start, r.stop, unsafe=unsafe) +Base.@propagate_inbounds LoopVecRange{N}(x::AbstractVector; unsafe=false) where N = LoopVecRange{N}(eachindex(x), unsafe=unsafe) + +Base.isempty(r::LoopVecRange) = r.start > r.stop + +Base.step(r::LoopVecRange{N}) where N = N +Base.has_offset_axes(::LoopVecRange) = false + +Base.first(r::LoopVecRange{N}) where N = VecRange{N}(r.start) +Base.last(r::LoopVecRange{N}) where N = VecRange{N}(r.stop - N + 1) + +Base.iterate(r::LoopVecRange) = isempty(r) ? 
nothing : (first(r), first(r)) + +function Base.iterate(r::LoopVecRange{N}, i::VecRange{N}) where N + @inline + i.i >= (r.stop - N + 1) && return nothing # greater than or equal prevents infinite loop if length of range is not a multiple of width + next = i + step(r) + (next, next) +end + +Base.length(r::LoopVecRange{N}) where N = (r.stop - r.start + 1) ÷ N +Base.eltype(::Type{LoopVecRange{N}}) where N = VecRange{N} + +Base.show(io::IO, r::LoopVecRange) = print(io, repr(first(r)), ':', repr(last(r))) + @inline _checkarity(::AbstractArray{<:Any,N}, ::Vararg{Any,N}) where {N} = nothing diff --git a/test/runtests.jl b/test/runtests.jl index 9981e06..4cb74f3 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -819,8 +819,8 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...)) let xs = valloc(Float64, 4, 13) do i i end s = vsum_masked(xs, V4F64) - @code_llvm vsum(xs, V4F64) - @code_native vsum(xs, V4F64) + # @code_llvm vsum(xs, V4F64) + # @code_native vsum(xs, V4F64) @test s === sum(xs) ir = llvm_ir(vsum_masked, (xs, V4F64)) @@ -860,8 +860,8 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...)) vcompress!(dest, pred, src) @test dest[1:sum(pred)] == src[src .> 0] - @code_llvm vcompress!(dest, pred, src) - @code_native vcompress!(dest, pred, src) + # @code_llvm vcompress!(dest, pred, src) + # @code_native vcompress!(dest, pred, src) ir = llvm_ir(vcompress!, (dest, pred, src)) @test occursin("masked.compressstore.v4f64", ir) @@ -870,6 +870,111 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...)) end + @testset "LoopVecRange Real-world examples" begin + + function vadd!(xs::AbstractArray{T,1}, ys::AbstractArray{T,1}, + ::Type{Vec{N,T}}) where {N,T} + @assert length(ys) == length(xs) + @assert length(xs) % N == 0 + @inbounds for lane in LoopVecRange{N}(xs) + xs[lane] += ys[lane] + end + end + + let xs = valloc(Float64, L4, 4*L4) do i i end, + ys = valloc(Float64, L4, 4*L4) do i 1 end + vadd!(xs, ys, V4F64) + @test xs == Float64[i+1 
for i in 1:(4*L4)] + # @code_native vadd!(xs, ys, V4F64) + + ir = llvm_ir(vadd!, (xs, ys, V4F64)) + @test occursin(r"( load <4 x double>.*){2}"s, ir) + @test occursin(" store <4 x double>", ir) + @test occursin(" fadd <4 x double>", ir) + end + + function vsum(xs::AbstractArray{T,1}, ::Type{Vec{N,T}}) where {N,T} + @assert length(xs) % N == 0 + sv = Vec{N,T}(0) + @inbounds for lane in LoopVecRange{N}(xs) + sv += xs[lane] + end + sum(sv) + end + + let xs = valloc(Float64, L4, 4*L4) do i i end + s = vsum(xs, V4F64) + @test s === (x->(x^2+x)/2)(Float64(4*L4)) + # @code_native vsum(xs, V4F64) + + ir = llvm_ir(vsum, (xs, V4F64)) + @test occursin(" load <4 x double>", ir) + @test occursin(" fadd <4 x double>", ir) + # @test occursin(r"( shufflevector <4 x double>.*){2}"s, ir) + end + + + function vadd_masked!(xs::AbstractArray{T,1}, ys::AbstractArray{T,1}, + ::Type{Vec{N,T}}) where {N, T} + @assert length(ys) == length(xs) + limit = length(xs) - (N-1) + vlimit = Vec(ntuple(i -> length(xs) - i + 1, Val(N))) + @inbounds for lane in LoopVecRange{N}(xs, unsafe=true) + if lane.i <= limit + xs[lane] += ys[lane] + else + mask = Vec{N,Int}(lane.i) <= vlimit + xs[lane, mask] = xs[lane, mask] + ys[lane, mask] + end + end + end + + let xs = valloc(Float64, 4, 13) do i i end, + ys = valloc(Float64, 4, 13) do i 1 end + vadd_masked!(xs, ys, V4F64) + @test xs == Float64[i+1 for i in 1:13] + # @code_native vadd!(xs, ys, V4F64) + + ir = llvm_ir(vadd_masked!, (xs, ys, V4F64)) + @test occursin(r"(masked.load.v4f64.*){2}"s, ir) + @test occursin("masked.store.v4f64", ir) + @test occursin(" store <4 x double>", ir) + @test occursin(" fadd <4 x double>", ir) + end + + function vsum_masked(xs::AbstractArray{T,1}, ::Type{Vec{N,T}}) where {N,T} + vlimit = Vec(ntuple(i -> length(xs) - i + 1, Val(N))) + sv = Vec{N,T}(0) + @inbounds for lane in LoopVecRange{N}(xs, unsafe=true) + mask = Vec{N,Int}(lane.i) <= vlimit + sv += xs[lane, mask] + end + sum(sv) + end + + let xs = valloc(Float64, 4, 13) do i 
i end + s = vsum_masked(xs, V4F64) + # @code_llvm vsum(xs, V4F64) + # @code_native vsum(xs, V4F64) + @test s === sum(xs) + + ir = llvm_ir(vsum_masked, (xs, V4F64)) + @test occursin("masked.load.v4f64", ir) + @test occursin(" fadd <4 x double>", ir) + # @test occursin(r"( shufflevector <4 x double>.*){2}"s, ir) + end + + end + + @testset "LoopVecRange" begin + @test_throws ArgumentError LoopVecRange{-1}(1, 8) + @test_throws ArgumentError LoopVecRange{0}(1, 8) + @test_throws ArgumentError LoopVecRange{2}(1, 3) + @test_throws ArgumentError LoopVecRange{2}(3, 2) + @test_throws ArgumentError LoopVecRange{1}(3, 2) + @test_throws ArgumentError LoopVecRange{4}(3, 3) + end + @testset "Vector shuffles" begin for T in (Int8,UInt8,Int16,UInt16,Int32,UInt32,Int64,UInt64,Float32,Float64) From a6584c8d5b650b8bb617e1abec82dd6244ebaed6 Mon Sep 17 00:00:00 2001 From: Zentrik Date: Fri, 7 Apr 2023 17:02:02 +0100 Subject: [PATCH 2/6] Fix docs for LoopVecRange --- src/arrayops.jl | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/arrayops.jl b/src/arrayops.jl index 1de3795..9c4a5d4 100644 --- a/src/arrayops.jl +++ b/src/arrayops.jl @@ -235,9 +235,11 @@ export LoopVecRange Analogous to `UnitRange` but for iterating over a vector with SIMD vectors of width `N`. 
# Examples ```jldoctest -julia> xs = ones(4); -julia> xs[VecRange{4}(1)] # calls `vload(Vec{4,Float64}, xs, 1)` -<4 x Float64>[1.0, 1.0, 1.0, 1.0] +julia> function vadd!(xs::Vector{T}, ys::Vector{T}, ::Type{Vec{N,T}}) where {N, T} + for lane in LoopVecRange{N}(xs) + xs[lane] += ys[lane] + end +end ``` """ struct LoopVecRange{N} <: AbstractUnitRange{Int} From fcc1af0ab586e102bea4818b739abbb352adc33f Mon Sep 17 00:00:00 2001 From: Zentrik Date: Fri, 7 Apr 2023 18:40:52 +0100 Subject: [PATCH 3/6] Allow stop < start in LoopVecRange --- src/arrayops.jl | 16 ++++---- test/runtests.jl | 102 +++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 107 insertions(+), 11 deletions(-) diff --git a/src/arrayops.jl b/src/arrayops.jl index 9c4a5d4..e19f858 100644 --- a/src/arrayops.jl +++ b/src/arrayops.jl @@ -247,12 +247,10 @@ struct LoopVecRange{N} <: AbstractUnitRange{Int} stop::Int Base.@propagate_inbounds function LoopVecRange{N}(start::Int, stop::Int; unsafe=false) where N - N <= 0 && throw(ArgumentError("Width cannot be less than 1")) + N == 0 && throw(ArgumentError("Step cannot be zero")) if !unsafe @boundscheck (abs(stop - start) + 1) % N != 0 && throw(ArgumentError("Length of range, has to be a multiple of the width")) - - @boundscheck stop < start && throw(ArgumentError("Stop cannot be less than start")) end return new{N}(start, stop) @@ -263,19 +261,21 @@ Base.@propagate_inbounds LoopVecRange{N}(r::Base.OneTo; unsafe=false) where N = Base.@propagate_inbounds LoopVecRange{N}(r::UnitRange; unsafe=false) where N = LoopVecRange{N}(r.start, r.stop, unsafe=unsafe) Base.@propagate_inbounds LoopVecRange{N}(x::AbstractVector; unsafe=false) where N = LoopVecRange{N}(eachindex(x), unsafe=unsafe) -Base.isempty(r::LoopVecRange) = r.start > r.stop +Base.isempty(r::LoopVecRange{N}) where N = (r.start != r.stop) & ((N > zero(N)) != (r.stop > r.start)) Base.step(r::LoopVecRange{N}) where N = N Base.has_offset_axes(::LoopVecRange) = false -Base.first(r::LoopVecRange{N}) where 
N = VecRange{N}(r.start) -Base.last(r::LoopVecRange{N}) where N = VecRange{N}(r.stop - N + 1) +Base.first(r::LoopVecRange{N}) where N = VecRange{abs(N)}(r.start + (N < 0 ? N + 1 : 0)) +Base.last(r::LoopVecRange{N}) where N = VecRange{abs(N)}(r.stop + (N > 0 ? -N + 1 : 0)) Base.iterate(r::LoopVecRange) = isempty(r) ? nothing : (first(r), first(r)) -function Base.iterate(r::LoopVecRange{N}, i::VecRange{N}) where N +function Base.iterate(r::LoopVecRange{N}, i::VecRange) where N @inline - i.i >= (r.stop - N + 1) && return nothing # greater than or equal prevents infinite loop if length of range is not a multiple of width + if (N > zero(N) && i.i >= last(r).i) || (N < zero(N) && i.i <= last(r).i) # greater than or equal prevents infinite loop if length of range is not a multiple of width + return nothing + end next = i + step(r) (next, next) end diff --git a/test/runtests.jl b/test/runtests.jl index 4cb74f3..7d6e390 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -967,12 +967,108 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...)) end @testset "LoopVecRange" begin - @test_throws ArgumentError LoopVecRange{-1}(1, 8) @test_throws ArgumentError LoopVecRange{0}(1, 8) @test_throws ArgumentError LoopVecRange{2}(1, 3) - @test_throws ArgumentError LoopVecRange{2}(3, 2) - @test_throws ArgumentError LoopVecRange{1}(3, 2) @test_throws ArgumentError LoopVecRange{4}(3, 3) + + @test_throws ArgumentError LoopVecRange{0}(8, 1) + @test_throws ArgumentError LoopVecRange{2}(3, 1) + end + + @testset "Reverse LoopVecRange Real-world examples" begin + + function vadd!(xs::AbstractArray{T,1}, ys::AbstractArray{T,1}, + ::Type{Vec{N,T}}) where {N,T} + @assert length(ys) == length(xs) + @assert length(xs) % N == 0 + @inbounds for lane in LoopVecRange{-N}(length(xs), 1) + xs[lane] += ys[lane] + end + end + + let xs = valloc(Float64, L4, 4*L4) do i i end, + ys = valloc(Float64, L4, 4*L4) do i 1 end + vadd!(xs, ys, V4F64) + @test xs == Float64[i+1 for i in 1:(4*L4)] + # 
@code_native vadd!(xs, ys, V4F64) + + ir = llvm_ir(vadd!, (xs, ys, V4F64)) + @test occursin(r"( load <4 x double>.*){2}"s, ir) + @test occursin(" store <4 x double>", ir) + @test occursin(" fadd <4 x double>", ir) + end + + function vsum(xs::AbstractArray{T,1}, ::Type{Vec{N,T}}) where {N,T} + @assert length(xs) % N == 0 + sv = Vec{N,T}(0) + @inbounds for lane in LoopVecRange{-N}(length(xs), 1) + sv += xs[lane] + end + sum(sv) + end + + let xs = valloc(Float64, L4, 4*L4) do i i end + s = vsum(xs, V4F64) + @test s === (x->(x^2+x)/2)(Float64(4*L4)) + # @code_native vsum(xs, V4F64) + + ir = llvm_ir(vsum, (xs, V4F64)) + @test occursin(" load <4 x double>", ir) + @test occursin(" fadd <4 x double>", ir) + # @test occursin(r"( shufflevector <4 x double>.*){2}"s, ir) + end + + + function vadd_masked!(xs::AbstractArray{T,1}, ys::AbstractArray{T,1}, + ::Type{Vec{N,T}}) where {N, T} + @assert length(ys) == length(xs) + limit = length(xs) - (N-1) + vlimit = Vec(ntuple(i -> length(xs) - i + 1, Val(N))) + @inbounds for lane in LoopVecRange{-N}(length(xs) + (N - 1), 1, unsafe=true) + if lane.i <= limit + xs[lane] += ys[lane] + else + mask = Vec{N,Int}(lane.i) <= vlimit + xs[lane, mask] = xs[lane, mask] + ys[lane, mask] + end + end + end + + let xs = valloc(Float64, 4, 13) do i i end, + ys = valloc(Float64, 4, 13) do i 1 end + vadd_masked!(xs, ys, V4F64) + @test xs == Float64[i+1 for i in 1:13] + # @code_native vadd!(xs, ys, V4F64) + + ir = llvm_ir(vadd_masked!, (xs, ys, V4F64)) + @test occursin(r"(masked.load.v4f64.*){2}"s, ir) + @test occursin("masked.store.v4f64", ir) + @test occursin(" store <4 x double>", ir) + @test occursin(" fadd <4 x double>", ir) + end + + function vsum_masked(xs::AbstractArray{T,1}, ::Type{Vec{N,T}}) where {N,T} + vlimit = Vec(ntuple(i -> length(xs) - i + 1, Val(N))) + sv = Vec{N,T}(0) + @inbounds for lane in LoopVecRange{-N}(length(xs) + (N - 1), 1, unsafe=true) + mask = Vec{N,Int}(lane.i) <= vlimit + sv += xs[lane, mask] + end + sum(sv) + end + + let 
xs = valloc(Float64, 4, 13) do i i end + s = vsum_masked(xs, V4F64) + # @code_llvm vsum(xs, V4F64) + # @code_native vsum(xs, V4F64) + @test s === sum(xs) + + ir = llvm_ir(vsum_masked, (xs, V4F64)) + @test occursin("masked.load.v4f64", ir) + @test occursin(" fadd <4 x double>", ir) + # @test occursin(r"( shufflevector <4 x double>.*){2}"s, ir) + end + end @testset "Vector shuffles" begin From 91f9fcf983036d5088347da5c33ef19b2637fe70 Mon Sep 17 00:00:00 2001 From: Zentrik Date: Sun, 9 Apr 2023 15:27:27 +0100 Subject: [PATCH 4/6] Change readme --- README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 3a02847..23d12b1 100644 --- a/README.md +++ b/README.md @@ -29,9 +29,8 @@ using SIMD function vadd!(xs::Vector{T}, ys::Vector{T}, ::Type{Vec{N,T}}) where {N, T} @assert length(ys) == length(xs) @assert length(xs) % N == 0 - lane = VecRange{N}(0) - @inbounds for i in 1:N:length(xs) - xs[lane + i] += ys[lane + i] + @inbounds for lane in LoopVecRange{N}(xs) + xs[lane] += ys[lane] end end ``` From 73eba7962ff2ccad0288c894cac09f2607f2d2fe Mon Sep 17 00:00:00 2001 From: Zentrik Date: Sun, 9 Apr 2023 15:27:43 +0100 Subject: [PATCH 5/6] Fix range type --- src/arrayops.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/arrayops.jl b/src/arrayops.jl index e19f858..e43f84d 100644 --- a/src/arrayops.jl +++ b/src/arrayops.jl @@ -242,7 +242,7 @@ julia> function vadd!(xs::Vector{T}, ys::Vector{T}, ::Type{Vec{N,T}}) where {N, end ``` """ -struct LoopVecRange{N} <: AbstractUnitRange{Int} +struct LoopVecRange{N} <: AbstractRange{Int} start::Int stop::Int From 40cd698c555c1ad0f86a23f4f64529b975bedd3c Mon Sep 17 00:00:00 2001 From: Zentrik Date: Sun, 18 Jun 2023 18:49:10 +0100 Subject: [PATCH 6/6] Restore tests --- test/runtests.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index 7d6e390..bbaec35 100644 --- a/test/runtests.jl +++ b/test/runtests.jl 
@@ -819,8 +819,8 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...)) let xs = valloc(Float64, 4, 13) do i i end s = vsum_masked(xs, V4F64) - # @code_llvm vsum(xs, V4F64) - # @code_native vsum(xs, V4F64) + @code_llvm vsum(xs, V4F64) + @code_native vsum(xs, V4F64) @test s === sum(xs) ir = llvm_ir(vsum_masked, (xs, V4F64)) @@ -860,8 +860,8 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...)) vcompress!(dest, pred, src) @test dest[1:sum(pred)] == src[src .> 0] - # @code_llvm vcompress!(dest, pred, src) - # @code_native vcompress!(dest, pred, src) + @code_llvm vcompress!(dest, pred, src) + @code_native vcompress!(dest, pred, src) ir = llvm_ir(vcompress!, (dest, pred, src)) @test occursin("masked.compressstore.v4f64", ir)