eschnett · Zentrik · Apr 7, 2023 · Apr 7, 2023 · Apr 7, 2023 · Apr 9, 2023
diff --git a/README.md b/README.md
@@ -29,9 +29,8 @@ using SIMD
 function vadd!(xs::Vector{T}, ys::Vector{T}, ::Type{Vec{N,T}}) where {N, T}
     @assert length(ys) == length(xs)
     @assert length(xs) % N == 0
-    lane = VecRange{N}(0)
-    @inbounds for i in 1:N:length(xs)
-        xs[lane + i] += ys[lane + i]
+    @inbounds for lane in LoopVecRange{N}(xs)
+        xs[lane] += ys[lane]
     end
 end
 ```

diff --git a/src/SIMD.jl b/src/SIMD.jl
@@ -4,7 +4,7 @@ using Base: @propagate_inbounds
 
 export Vec, vload, vloada, vloadnt, vloadx, vstore, vstorea, vstorent, vstorec,
        vgather, vgathera, vscatter, vscattera, shufflevector, vifelse, valloc,
-       VecRange
+       VecRange, LoopVecRange
 
 const VE         = Base.VecElement
 const LVec{N, T} = NTuple{N, VE{T}}

diff --git a/src/arrayops.jl b/src/arrayops.jl
@@ -228,6 +228,63 @@ Base.checkindex(::Type{Bool}, inds::AbstractUnitRange, idx::VecRange) =
 Base.checkindex(::Type{Bool}, inds::AbstractUnitRange, idx::Vec) =
     all(first(inds) <= idx) && all(idx <= last(inds))
 
+export LoopVecRange
+
+"""
+    LoopVecRange{N}(start::Int, stop::Int; unsafe=false)
+    LoopVecRange{N}(r::Base.OneTo; unsafe=false)
+    LoopVecRange{N}(r::UnitRange; unsafe=false)
+    LoopVecRange{N}(x::AbstractVector; unsafe=false)
+
+Analogous to `UnitRange`, but for iterating over a vector in SIMD chunks of width
+`abs(N)`. Each iteration yields a `VecRange{abs(N)}` that can be used to index the
+vector. A negative `N` iterates downwards, from `start` towards `stop`.
+
+When built from a range, the endpoints of the range are used; when built from a vector
+`x`, the range covers `eachindex(x)`.
+
+# Keywords
+- `unsafe=false`: unless `unsafe=true`, the constructor checks that the number of
+  indices, `abs(stop - start) + 1`, is a multiple of `N`. The check is a `@boundscheck`,
+  so it may be elided when the constructor is called under `@inbounds`. Without the
+  check the final chunk may extend past the end of the range, and the caller has to
+  mask the out-of-range lanes.
+
+# Throws
+- `ArgumentError` if `N == 0`.
+- `ArgumentError` if the length check described above fails.
+
+# Examples
+```julia
+function vadd!(xs::Vector{T}, ys::Vector{T}, ::Type{Vec{N,T}}) where {N, T}
+    for lane in LoopVecRange{N}(xs)
+        xs[lane] += ys[lane]
+    end
+end
+```
+"""
+struct LoopVecRange{N} <: AbstractRange{Int}
+    # Index at which iteration begins (the larger endpoint when `N < 0`).
+    start::Int
+    # Index at which iteration ends (the smaller endpoint when `N < 0`).
+    stop::Int
+
+    Base.@propagate_inbounds function LoopVecRange{N}(start::Int, stop::Int; unsafe=false) where N
+        # A zero width would never advance the iteration.
+        N == 0 && throw(ArgumentError("Step cannot be zero"))
+
+        # Divisibility check; skipped on request (`unsafe=true`) and subject to
+        # bounds-check elision because it is wrapped in `@boundscheck`.
+        if !unsafe
+            @boundscheck (abs(stop - start) + 1) % N != 0 && throw(ArgumentError("Length of range, has to be a multiple of the width"))
+        end
+
+        return new{N}(start, stop)
+    end
+end
+
+# Convenience constructors: take the endpoints from an index range, or from the
+# indices of a vector. `unsafe` is forwarded unchanged to the next constructor.
+Base.@propagate_inbounds function LoopVecRange{N}(r::Base.OneTo; unsafe=false) where N
+    return LoopVecRange{N}(1, last(r); unsafe=unsafe)
+end
+Base.@propagate_inbounds function LoopVecRange{N}(r::UnitRange; unsafe=false) where N
+    return LoopVecRange{N}(first(r), last(r); unsafe=unsafe)
+end
+Base.@propagate_inbounds function LoopVecRange{N}(x::AbstractVector; unsafe=false) where N
+    return LoopVecRange{N}(eachindex(x); unsafe=unsafe)
+end
+
+# A range is empty when its endpoints differ and run against the direction of `N`:
+# ascending (`N > 0`) with `stop < start`, or descending (`N < 0`) with `stop > start`.
+function Base.isempty(r::LoopVecRange{N}) where N
+    ascending = N > zero(N)
+    return (r.start != r.stop) && xor(ascending, r.stop > r.start)
+end
+
+# The step between consecutive chunks is the signed width parameter `N`.
+function Base.step(::LoopVecRange{N}) where N
+    return N
+end
+# A `LoopVecRange` always reports that it has no offset axes.
+function Base.has_offset_axes(::LoopVecRange)
+    return false
+end
+
+# First and last chunk of the range, as `VecRange`s of width `abs(N)`.
+# For descending ranges (`N < 0`) the first chunk is anchored at `start + N + 1`;
+# for ascending ranges (`N > 0`) the last chunk is anchored at `stop - N + 1`.
+function Base.first(r::LoopVecRange{N}) where N
+    shift = N < 0 ? N + 1 : 0
+    return VecRange{abs(N)}(r.start + shift)
+end
+function Base.last(r::LoopVecRange{N}) where N
+    shift = N > 0 ? 1 - N : 0
+    return VecRange{abs(N)}(r.stop + shift)
+end
+
+# Start of iteration: nothing for an empty range, otherwise the first chunk, which
+# doubles as the iteration state.
+function Base.iterate(r::LoopVecRange)
+    isempty(r) && return nothing
+    head = first(r)
+    return (head, head)
+end
+
+# Advance the iteration: `i` is the chunk returned last; the next chunk is `i`
+# shifted by the step, until the last chunk has been reached or passed.
+@inline function Base.iterate(r::LoopVecRange{N}, i::VecRange) where N
+    bound = last(r).i
+    # Inclusive comparisons (`>=` / `<=`) rather than equality: prevents an infinite
+    # loop if the length of the range is not a multiple of the width.
+    finished = N > zero(N) ? i.i >= bound : (N < zero(N) && i.i <= bound)
+    finished && return nothing
+    following = i + step(r)
+    return (following, following)
+end
+
+# Number of chunks produced by iterating `r`.
+#
+# The index count is `abs(stop - start) + 1` and the chunk width is `abs(N)`, so the
+# result is also correct for descending ranges (`N < 0`, `start > stop`), where the
+# signed quotient `(stop - start + 1) ÷ N` undercounts (e.g. 1 instead of 2 for
+# `LoopVecRange{-4}(8, 1)`). Rounding up with `cld` matches `iterate` for ranges
+# built with `unsafe=true`, whose final, partial chunk is still yielded.
+# Empty ranges have length 0.
+function Base.length(r::LoopVecRange{N}) where N
+    isempty(r) && return 0
+    return cld(abs(r.stop - r.start) + 1, abs(N))
+end
+# Element type of the iteration. `first`, `last` and `iterate` all produce
+# `VecRange{abs(N)}`, so the width must be `abs(N)` here too; using `N` directly
+# reports a type that is never yielded when `N` is negative.
+Base.eltype(::Type{LoopVecRange{N}}) where N = VecRange{abs(N)}
+
+# Print the range as `<first chunk>:<last chunk>`, using the `repr` of each chunk.
+function Base.show(io::IO, r::LoopVecRange)
+    lo = repr(first(r))
+    hi = repr(last(r))
+    return print(io, lo, ':', hi)
+end
+
 @inline _checkarity(::AbstractArray{<:Any,N}, ::Vararg{Any,N}) where {N} =
     nothing
 

diff --git a/test/runtests.jl b/test/runtests.jl
@@ -870,6 +870,207 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...))
 
     end
 
+    @testset "LoopVecRange Real-world examples" begin
+
+        # Add `ys` to `xs` in place, `N` elements per iteration, with the chunk
+        # indices produced by `LoopVecRange`. Both arrays must have the same length
+        # and that length must be a multiple of `N` (asserted below).
+        function vadd!(xs::AbstractArray{T,1}, ys::AbstractArray{T,1},
+                        ::Type{Vec{N,T}}) where {N,T}
+            @assert length(ys) == length(xs)
+            @assert length(xs) % N == 0
+            # `lane` is one chunk of indices; indexing with it reads/writes a whole
+            # SIMD vector at once.
+            @inbounds for lane in LoopVecRange{N}(xs)
+                xs[lane] += ys[lane]
+            end
+        end
+
+        # Run `vadd!` on inputs filled with `xs[i] = i` and `ys[i] = 1` (as the
+        # expected result below implies) and check both the values and the
+        # generated code.
+        let xs = valloc(Float64, L4, 4*L4) do i i end,
+            ys = valloc(Float64, L4, 4*L4) do i 1 end
+            vadd!(xs, ys, V4F64)
+            @test xs == Float64[i+1 for i in 1:(4*L4)]
+            # Debugging aid: @code_native vadd!(xs, ys, V4F64)
+
+            # The LLVM IR must contain two 4-wide loads, a 4-wide store and a
+            # 4-wide floating-point add.
+            ir = llvm_ir(vadd!, (xs, ys, V4F64))
+            @test occursin(r"( load <4 x double>.*){2}"s, ir)
+            @test occursin(" store <4 x double>", ir)
+            @test occursin(" fadd <4 x double>", ir)
+        end
+
+        # Sum the elements of `xs`, accumulating `N` lanes at a time and reducing
+        # the lanes at the end. The length of `xs` must be a multiple of `N`
+        # (asserted below).
+        function vsum(xs::AbstractArray{T,1}, ::Type{Vec{N,T}}) where {N,T}
+            @assert length(xs) % N == 0
+            # Per-lane partial sums, starting from zero.
+            sv = Vec{N,T}(0)
+            @inbounds for lane in LoopVecRange{N}(xs)
+                sv += xs[lane]
+            end
+            # Horizontal reduction of the partial sums.
+            sum(sv)
+        end
+
+        # Run `vsum` on `xs[i] = i` and compare with the closed form n(n+1)/2 for
+        # n = 4*L4; then check the generated code.
+        let xs = valloc(Float64, L4, 4*L4) do i i end
+            s = vsum(xs, V4F64)
+            @test s === (x->(x^2+x)/2)(Float64(4*L4))
+            # Debugging aid: @code_native vsum(xs, V4F64)
+
+            # The LLVM IR must contain a 4-wide load and a 4-wide floating-point add.
+            ir = llvm_ir(vsum, (xs, V4F64))
+            @test occursin(" load <4 x double>", ir)
+            @test occursin(" fadd <4 x double>", ir)
+            # NOTE(review): disabled check kept for reference; the reason it is
+            # disabled is not visible here.
+            # @test occursin(r"( shufflevector <4 x double>.*){2}"s, ir)
+        end
+
+
+        # Add `ys` to `xs` in place for arrays whose length need not be a multiple
+        # of `N`: full chunks use plain vector loads/stores, and the trailing
+        # partial chunk uses masked loads/stores.
+        function vadd_masked!(xs::AbstractArray{T,1}, ys::AbstractArray{T,1},
+                                ::Type{Vec{N,T}}) where {N, T}
+            @assert length(ys) == length(xs)
+            # Largest chunk start index at which a full chunk of `N` still fits.
+            limit = length(xs) - (N-1)
+            # Lane `j` holds `length(xs) - j + 1`: the largest chunk start index for
+            # which lane `j` is still inside the array.
+            vlimit = Vec(ntuple(i -> length(xs) - i + 1, Val(N)))
+            # `unsafe=true` skips the multiple-of-`N` check of the constructor.
+            @inbounds for lane in LoopVecRange{N}(xs, unsafe=true)
+                if lane.i <= limit
+                    xs[lane] += ys[lane]
+                else
+                    # Only the lanes that are inside the array are touched.
+                    mask = Vec{N,Int}(lane.i) <= vlimit
+                    xs[lane, mask] = xs[lane, mask] + ys[lane, mask]
+                end
+            end
+        end
+
+        # Run `vadd_masked!` on 13 elements (not a multiple of the width 4) with
+        # `xs[i] = i` and `ys[i] = 1`, as the expected result below implies.
+        let xs = valloc(Float64, 4, 13) do i i end,
+            ys = valloc(Float64, 4, 13) do i 1 end
+            vadd_masked!(xs, ys, V4F64)
+            @test xs == Float64[i+1 for i in 1:13]
+            # Debugging aid: @code_native vadd_masked!(xs, ys, V4F64)
+
+            # The LLVM IR must contain two masked loads and a masked store (partial
+            # chunk) as well as a plain 4-wide store and add (full chunks).
+            ir = llvm_ir(vadd_masked!, (xs, ys, V4F64))
+            @test occursin(r"(masked.load.v4f64.*){2}"s, ir)
+            @test occursin("masked.store.v4f64", ir)
+            @test occursin(" store <4 x double>", ir)
+            @test occursin(" fadd <4 x double>", ir)
+        end
+
+        # Sum the elements of `xs` for any length: every chunk is loaded with a
+        # mask so that lanes past the end of the array are not read.
+        function vsum_masked(xs::AbstractArray{T,1}, ::Type{Vec{N,T}}) where {N,T}
+            # Lane `j` holds `length(xs) - j + 1`: the largest chunk start index for
+            # which lane `j` is still inside the array.
+            vlimit = Vec(ntuple(i -> length(xs) - i + 1, Val(N)))
+            # Per-lane partial sums, starting from zero.
+            sv = Vec{N,T}(0)
+            # `unsafe=true` skips the multiple-of-`N` check of the constructor.
+            @inbounds for lane in LoopVecRange{N}(xs, unsafe=true)
+                mask = Vec{N,Int}(lane.i) <= vlimit
+                sv += xs[lane, mask]
+            end
+            # Horizontal reduction of the partial sums.
+            sum(sv)
+        end
+
+        # Run `vsum_masked` on 13 elements (not a multiple of the width 4) and
+        # compare with `sum`; then check the generated code.
+        let xs = valloc(Float64, 4, 13) do i i end
+            s = vsum_masked(xs, V4F64)
+            # Debugging aids:
+            # @code_llvm vsum_masked(xs, V4F64)
+            # @code_native vsum_masked(xs, V4F64)
+            @test s === sum(xs)
+
+            # The LLVM IR must contain a masked load and a 4-wide floating-point add.
+            ir = llvm_ir(vsum_masked, (xs, V4F64))
+            @test occursin("masked.load.v4f64", ir)
+            @test occursin(" fadd <4 x double>", ir)
+            # NOTE(review): disabled check kept for reference; the reason it is
+            # disabled is not visible here.
+            # @test occursin(r"( shufflevector <4 x double>.*){2}"s, ir)
+        end
+
+    end
+
+    # Constructor validation: invalid widths and lengths must raise `ArgumentError`.
+    @testset "LoopVecRange" begin
+        # Zero width is rejected.
+        @test_throws ArgumentError LoopVecRange{0}(1, 8)
+        # 3 indices are not a multiple of the width 2.
+        @test_throws ArgumentError LoopVecRange{2}(1, 3)
+        # 1 index is not a multiple of the width 4.
+        @test_throws ArgumentError LoopVecRange{4}(3, 3)
+
+        # Same checks with the endpoints reversed (`start > stop`).
+        @test_throws ArgumentError LoopVecRange{0}(8, 1)
+        @test_throws ArgumentError LoopVecRange{2}(3, 1)
+    end
+
+    @testset "Reverse LoopVecRange Real-world examples" begin
+
+        # Add `ys` to `xs` in place, `N` elements per iteration, walking the chunks
+        # from the end of the array down to the start (negative width `-N`). Both
+        # arrays must have the same length, a multiple of `N` (asserted below).
+        function vadd!(xs::AbstractArray{T,1}, ys::AbstractArray{T,1},
+                        ::Type{Vec{N,T}}) where {N,T}
+            @assert length(ys) == length(xs)
+            @assert length(xs) % N == 0
+            # Descending range: starts at the last index and stops at index 1.
+            @inbounds for lane in LoopVecRange{-N}(length(xs), 1)
+                xs[lane] += ys[lane]
+            end
+        end
+
+        # Run the reverse `vadd!` on inputs filled with `xs[i] = i` and `ys[i] = 1`
+        # (as the expected result below implies) and check both the values and the
+        # generated code.
+        let xs = valloc(Float64, L4, 4*L4) do i i end,
+            ys = valloc(Float64, L4, 4*L4) do i 1 end
+            vadd!(xs, ys, V4F64)
+            @test xs == Float64[i+1 for i in 1:(4*L4)]
+            # Debugging aid: @code_native vadd!(xs, ys, V4F64)
+
+            # The LLVM IR must contain two 4-wide loads, a 4-wide store and a
+            # 4-wide floating-point add.
+            ir = llvm_ir(vadd!, (xs, ys, V4F64))
+            @test occursin(r"( load <4 x double>.*){2}"s, ir)
+            @test occursin(" store <4 x double>", ir)
+            @test occursin(" fadd <4 x double>", ir)
+        end
+
+        # Sum the elements of `xs`, accumulating `N` lanes at a time while walking
+        # the chunks from the end of the array down to the start. The length of
+        # `xs` must be a multiple of `N` (asserted below).
+        function vsum(xs::AbstractArray{T,1}, ::Type{Vec{N,T}}) where {N,T}
+            @assert length(xs) % N == 0
+            # Per-lane partial sums, starting from zero.
+            sv = Vec{N,T}(0)
+            # Descending range: starts at the last index and stops at index 1.
+            @inbounds for lane in LoopVecRange{-N}(length(xs), 1)
+                sv += xs[lane]
+            end
+            # Horizontal reduction of the partial sums.
+            sum(sv)
+        end
+
+        # Run the reverse `vsum` on `xs[i] = i` and compare with the closed form
+        # n(n+1)/2 for n = 4*L4; then check the generated code.
+        let xs = valloc(Float64, L4, 4*L4) do i i end
+            s = vsum(xs, V4F64)
+            @test s === (x->(x^2+x)/2)(Float64(4*L4))
+            # Debugging aid: @code_native vsum(xs, V4F64)
+
+            # The LLVM IR must contain a 4-wide load and a 4-wide floating-point add.
+            ir = llvm_ir(vsum, (xs, V4F64))
+            @test occursin(" load <4 x double>", ir)
+            @test occursin(" fadd <4 x double>", ir)
+            # NOTE(review): disabled check kept for reference; the reason it is
+            # disabled is not visible here.
+            # @test occursin(r"( shufflevector <4 x double>.*){2}"s, ir)
+        end
+
+
+        # Reverse-order variant of the masked in-place add: chunks are visited from
+        # the end of the array downwards; chunks that would run past the end use
+        # masked loads/stores, the others use plain vector loads/stores.
+        function vadd_masked!(xs::AbstractArray{T,1}, ys::AbstractArray{T,1},
+                                ::Type{Vec{N,T}}) where {N, T}
+            @assert length(ys) == length(xs)
+            # Largest chunk start index at which a full chunk of `N` still fits.
+            limit = length(xs) - (N-1)
+            # Lane `j` holds `length(xs) - j + 1`: the largest chunk start index for
+            # which lane `j` is still inside the array.
+            vlimit = Vec(ntuple(i -> length(xs) - i + 1, Val(N)))
+            # The start is padded by `N - 1`, which anchors the first chunk at
+            # index `length(xs)`; later chunks step down by `N`.
+            # NOTE(review): the mask only guards the upper bound, so this relies on
+            # the chunks landing exactly on index 1 (true for the 13-element,
+            # width-4 input used in this testset) -- confirm before reusing.
+            @inbounds for lane in LoopVecRange{-N}(length(xs) + (N - 1), 1, unsafe=true)
+                if lane.i <= limit
+                    xs[lane] += ys[lane]
+                else
+                    # Only the lanes that are inside the array are touched.
+                    mask = Vec{N,Int}(lane.i) <= vlimit
+                    xs[lane, mask] = xs[lane, mask] + ys[lane, mask]
+                end
+            end
+        end
+
+        # Run the reverse `vadd_masked!` on 13 elements (not a multiple of the
+        # width 4) with `xs[i] = i` and `ys[i] = 1`, as the expected result implies.
+        let xs = valloc(Float64, 4, 13) do i i end,
+            ys = valloc(Float64, 4, 13) do i 1 end
+            vadd_masked!(xs, ys, V4F64)
+            @test xs == Float64[i+1 for i in 1:13]
+            # Debugging aid: @code_native vadd_masked!(xs, ys, V4F64)
+
+            # The LLVM IR must contain two masked loads and a masked store (partial
+            # chunk) as well as a plain 4-wide store and add (full chunks).
+            ir = llvm_ir(vadd_masked!, (xs, ys, V4F64))
+            @test occursin(r"(masked.load.v4f64.*){2}"s, ir)
+            @test occursin("masked.store.v4f64", ir)
+            @test occursin(" store <4 x double>", ir)
+            @test occursin(" fadd <4 x double>", ir)
+        end
+
+        # Reverse-order variant of the masked sum: chunks are visited from the end
+        # of the array downwards and every chunk is loaded with a mask so that
+        # lanes past the end of the array are not read.
+        function vsum_masked(xs::AbstractArray{T,1}, ::Type{Vec{N,T}}) where {N,T}
+            # Lane `j` holds `length(xs) - j + 1`: the largest chunk start index for
+            # which lane `j` is still inside the array.
+            vlimit = Vec(ntuple(i -> length(xs) - i + 1, Val(N)))
+            # Per-lane partial sums, starting from zero.
+            sv = Vec{N,T}(0)
+            # The start is padded by `N - 1`, which anchors the first chunk at
+            # index `length(xs)`; later chunks step down by `N`.
+            # NOTE(review): the mask only guards the upper bound, so this relies on
+            # the chunks landing exactly on index 1 (true for the 13-element,
+            # width-4 input used in this testset) -- confirm before reusing.
+            @inbounds for lane in LoopVecRange{-N}(length(xs) + (N - 1), 1, unsafe=true)
+                mask = Vec{N,Int}(lane.i) <= vlimit
+                sv += xs[lane, mask]
+            end
+            # Horizontal reduction of the partial sums.
+            sum(sv)
+        end
+
+        # Run the reverse `vsum_masked` on 13 elements (not a multiple of the
+        # width 4) and compare with `sum`; then check the generated code.
+        let xs = valloc(Float64, 4, 13) do i i end
+            s = vsum_masked(xs, V4F64)
+            # Debugging aids:
+            # @code_llvm vsum_masked(xs, V4F64)
+            # @code_native vsum_masked(xs, V4F64)
+            @test s === sum(xs)
+
+            # The LLVM IR must contain a masked load and a 4-wide floating-point add.
+            ir = llvm_ir(vsum_masked, (xs, V4F64))
+            @test occursin("masked.load.v4f64", ir)
+            @test occursin(" fadd <4 x double>", ir)
+            # NOTE(review): disabled check kept for reference; the reason it is
+            # disabled is not visible here.
+            # @test occursin(r"( shufflevector <4 x double>.*){2}"s, ir)
+        end
+
+    end
+
     @testset "Vector shuffles" begin
 
         for T in (Int8,UInt8,Int16,UInt16,Int32,UInt32,Int64,UInt64,Float32,Float64)