From 4354d9cce3ff173a1e1162f0d7d2fb410fe8a166 Mon Sep 17 00:00:00 2001
From: Zentrik <Zentrik@users.noreply.github.com>
Date: Fri, 7 Apr 2023 15:40:14 +0100
Subject: [PATCH 1/6] Add LoopVecRange

---
 src/SIMD.jl      |   2 +-
 src/arrayops.jl  |  55 +++++++++++++++++++++++
 test/runtests.jl | 113 +++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 165 insertions(+), 5 deletions(-)

diff --git a/src/SIMD.jl b/src/SIMD.jl
index 12f9f97..ad02888 100644
--- a/src/SIMD.jl
+++ b/src/SIMD.jl
@@ -4,7 +4,7 @@ using Base: @propagate_inbounds
 
 export Vec, vload, vloada, vloadnt, vloadx, vstore, vstorea, vstorent, vstorec,
        vgather, vgathera, vscatter, vscattera, shufflevector, vifelse, valloc,
-       VecRange
+       VecRange, LoopVecRange
 
 const VE         = Base.VecElement
 const LVec{N, T} = NTuple{N, VE{T}}
diff --git a/src/arrayops.jl b/src/arrayops.jl
index f696543..1de3795 100644
--- a/src/arrayops.jl
+++ b/src/arrayops.jl
@@ -228,6 +228,61 @@ Base.checkindex(::Type{Bool}, inds::AbstractUnitRange, idx::VecRange) =
 Base.checkindex(::Type{Bool}, inds::AbstractUnitRange, idx::Vec) =
     all(first(inds) <= idx) && all(idx <= last(inds))
 
+export LoopVecRange
+
+"""
+    LoopVecRange{N}(start::Int, stop::Int; unsafe=false)
+Analogous to `UnitRange` but for iterating over a vector with SIMD vectors of width `N`.
+# Examples
+```jldoctest
+julia> xs = ones(4);
+julia> xs[VecRange{4}(1)]  # calls `vload(Vec{4,Float64}, xs, 1)`
+<4 x Float64>[1.0, 1.0, 1.0, 1.0]
+```
+"""
+struct LoopVecRange{N} <: AbstractUnitRange{Int}
+    start::Int
+    stop::Int
+
+    Base.@propagate_inbounds function LoopVecRange{N}(start::Int, stop::Int; unsafe=false) where N
+        N <= 0 && throw(ArgumentError("Width cannot be less than 1"))
+
+        if !unsafe
+            @boundscheck (abs(stop - start) + 1) % N != 0 && throw(ArgumentError("Length of range, has to be a multiple of the width"))
+
+            @boundscheck stop < start && throw(ArgumentError("Stop cannot be less than start"))
+        end
+
+        return new{N}(start, stop)
+    end
+end
+
+Base.@propagate_inbounds LoopVecRange{N}(r::Base.OneTo; unsafe=false) where N = LoopVecRange{N}(1, r.stop, unsafe=unsafe)
+Base.@propagate_inbounds LoopVecRange{N}(r::UnitRange; unsafe=false) where N = LoopVecRange{N}(r.start, r.stop, unsafe=unsafe)
+Base.@propagate_inbounds LoopVecRange{N}(x::AbstractVector; unsafe=false) where N = LoopVecRange{N}(eachindex(x), unsafe=unsafe)
+
+Base.isempty(r::LoopVecRange) = r.start > r.stop
+
+Base.step(r::LoopVecRange{N}) where N = N
+Base.has_offset_axes(::LoopVecRange) = false
+
+Base.first(r::LoopVecRange{N}) where N = VecRange{N}(r.start)
+Base.last(r::LoopVecRange{N}) where N = VecRange{N}(r.stop - N + 1)
+
+Base.iterate(r::LoopVecRange) = isempty(r) ? nothing : (first(r), first(r))
+
+function Base.iterate(r::LoopVecRange{N}, i::VecRange{N}) where N
+    @inline
+    i.i >= (r.stop - N + 1) && return nothing # greater than or equal prevents infinite loop if length of range is not a multiple of width
+    next = i + step(r)
+    (next, next)
+end
+
+Base.length(r::LoopVecRange{N}) where N = isempty(r) ? 0 : cld(abs(r.stop - r.start) + 1, abs(N)) # number of chunks `iterate` yields: 0 when empty, sign of N ignored, trailing partial chunk (unsafe=true) counted
+Base.eltype(::Type{LoopVecRange{N}}) where N = VecRange{N}
+
+Base.show(io::IO, r::LoopVecRange) = print(io, repr(first(r)), ':', repr(last(r)))
+
 @inline _checkarity(::AbstractArray{<:Any,N}, ::Vararg{Any,N}) where {N} =
     nothing
 
diff --git a/test/runtests.jl b/test/runtests.jl
index 9981e06..4cb74f3 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -819,8 +819,8 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...))
 
         let xs = valloc(Float64, 4, 13) do i i end
             s = vsum_masked(xs, V4F64)
-            @code_llvm vsum(xs, V4F64)
-            @code_native vsum(xs, V4F64)
+            # @code_llvm vsum(xs, V4F64)
+            # @code_native vsum(xs, V4F64)
             @test s === sum(xs)
 
             ir = llvm_ir(vsum_masked, (xs, V4F64))
@@ -860,8 +860,8 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...))
                 vcompress!(dest, pred, src)
                 @test dest[1:sum(pred)] == src[src .> 0]
 
-                @code_llvm vcompress!(dest, pred, src)
-                @code_native vcompress!(dest, pred, src)
+                # @code_llvm vcompress!(dest, pred, src)
+                # @code_native vcompress!(dest, pred, src)
 
                 ir = llvm_ir(vcompress!, (dest, pred, src))
                 @test occursin("masked.compressstore.v4f64", ir)
@@ -870,6 +870,111 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...))
 
     end
 
+    @testset "LoopVecRange Real-world examples" begin
+
+        function vadd!(xs::AbstractArray{T,1}, ys::AbstractArray{T,1},
+                        ::Type{Vec{N,T}}) where {N,T}
+            @assert length(ys) == length(xs)
+            @assert length(xs) % N == 0
+            @inbounds for lane in LoopVecRange{N}(xs)
+                xs[lane] += ys[lane]
+            end
+        end
+
+        let xs = valloc(Float64, L4, 4*L4) do i i end,
+            ys = valloc(Float64, L4, 4*L4) do i 1 end
+            vadd!(xs, ys, V4F64)
+            @test xs == Float64[i+1 for i in 1:(4*L4)]
+            # @code_native vadd!(xs, ys, V4F64)
+    
+            ir = llvm_ir(vadd!, (xs, ys, V4F64))
+            @test occursin(r"( load <4 x double>.*){2}"s, ir)
+            @test occursin(" store <4 x double>", ir)
+            @test occursin(" fadd <4 x double>", ir)
+        end
+    
+        function vsum(xs::AbstractArray{T,1}, ::Type{Vec{N,T}}) where {N,T}
+            @assert length(xs) % N == 0
+            sv = Vec{N,T}(0)
+            @inbounds for lane in LoopVecRange{N}(xs)
+                sv += xs[lane]
+            end
+            sum(sv)
+        end
+
+        let xs = valloc(Float64, L4, 4*L4) do i i end
+            s = vsum(xs, V4F64)
+            @test s === (x->(x^2+x)/2)(Float64(4*L4))
+            # @code_native vsum(xs, V4F64)
+
+            ir = llvm_ir(vsum, (xs, V4F64))
+            @test occursin(" load <4 x double>", ir)
+            @test occursin(" fadd <4 x double>", ir)
+            # @test occursin(r"( shufflevector <4 x double>.*){2}"s, ir)
+        end
+
+    
+        function vadd_masked!(xs::AbstractArray{T,1}, ys::AbstractArray{T,1},
+                                ::Type{Vec{N,T}}) where {N, T}
+            @assert length(ys) == length(xs)
+            limit = length(xs) - (N-1)
+            vlimit = Vec(ntuple(i -> length(xs) - i + 1, Val(N)))
+            @inbounds for lane in LoopVecRange{N}(xs, unsafe=true)
+                if lane.i <= limit
+                    xs[lane] += ys[lane]
+                else
+                    mask = Vec{N,Int}(lane.i) <= vlimit
+                    xs[lane, mask] = xs[lane, mask] + ys[lane, mask]
+                end
+            end
+        end
+
+        let xs = valloc(Float64, 4, 13) do i i end,
+            ys = valloc(Float64, 4, 13) do i 1 end
+            vadd_masked!(xs, ys, V4F64)
+            @test xs == Float64[i+1 for i in 1:13]
+            # @code_native vadd!(xs, ys, V4F64)
+    
+            ir = llvm_ir(vadd_masked!, (xs, ys, V4F64))
+            @test occursin(r"(masked.load.v4f64.*){2}"s, ir)
+            @test occursin("masked.store.v4f64", ir)
+            @test occursin(" store <4 x double>", ir)
+            @test occursin(" fadd <4 x double>", ir)
+        end
+        
+        function vsum_masked(xs::AbstractArray{T,1}, ::Type{Vec{N,T}}) where {N,T}
+            vlimit = Vec(ntuple(i -> length(xs) - i + 1, Val(N)))
+            sv = Vec{N,T}(0)
+            @inbounds for lane in LoopVecRange{N}(xs, unsafe=true)
+                mask = Vec{N,Int}(lane.i) <= vlimit
+                sv += xs[lane, mask]
+            end
+            sum(sv)
+        end
+
+        let xs = valloc(Float64, 4, 13) do i i end
+            s = vsum_masked(xs, V4F64)
+            # @code_llvm vsum(xs, V4F64)
+            # @code_native vsum(xs, V4F64)
+            @test s === sum(xs)
+
+            ir = llvm_ir(vsum_masked, (xs, V4F64))
+            @test occursin("masked.load.v4f64", ir)
+            @test occursin(" fadd <4 x double>", ir)
+            # @test occursin(r"( shufflevector <4 x double>.*){2}"s, ir)
+        end
+
+    end
+
+    @testset "LoopVecRange" begin
+        @test_throws ArgumentError LoopVecRange{-1}(1, 8)
+        @test_throws ArgumentError LoopVecRange{0}(1, 8)
+        @test_throws ArgumentError LoopVecRange{2}(1, 3)
+        @test_throws ArgumentError LoopVecRange{2}(3, 2)
+        @test_throws ArgumentError LoopVecRange{1}(3, 2)
+        @test_throws ArgumentError LoopVecRange{4}(3, 3)
+    end
+
     @testset "Vector shuffles" begin
 
         for T in (Int8,UInt8,Int16,UInt16,Int32,UInt32,Int64,UInt64,Float32,Float64)

From a6584c8d5b650b8bb617e1abec82dd6244ebaed6 Mon Sep 17 00:00:00 2001
From: Zentrik <Zentrik@users.noreply.github.com>
Date: Fri, 7 Apr 2023 17:02:02 +0100
Subject: [PATCH 2/6] Fix docs for LoopVecRange

---
 src/arrayops.jl | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/arrayops.jl b/src/arrayops.jl
index 1de3795..9c4a5d4 100644
--- a/src/arrayops.jl
+++ b/src/arrayops.jl
@@ -235,9 +235,11 @@ export LoopVecRange
 Analogous to `UnitRange` but for iterating over a vector with SIMD vectors of width `N`.
 # Examples
 ```jldoctest
-julia> xs = ones(4);
-julia> xs[VecRange{4}(1)]  # calls `vload(Vec{4,Float64}, xs, 1)`
-<4 x Float64>[1.0, 1.0, 1.0, 1.0]
+julia> function vadd!(xs::Vector{T}, ys::Vector{T}, ::Type{Vec{N,T}}) where {N, T}
+    for lane in LoopVecRange{N}(xs)
+        xs[lane] += ys[lane]
+    end
+end
 ```
 """
 struct LoopVecRange{N} <: AbstractUnitRange{Int}

From fcc1af0ab586e102bea4818b739abbb352adc33f Mon Sep 17 00:00:00 2001
From: Zentrik <Zentrik@users.noreply.github.com>
Date: Fri, 7 Apr 2023 18:40:52 +0100
Subject: [PATCH 3/6] Allow stop < start in LoopVecRange

---
 src/arrayops.jl  |  16 ++++----
 test/runtests.jl | 102 +++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 107 insertions(+), 11 deletions(-)

diff --git a/src/arrayops.jl b/src/arrayops.jl
index 9c4a5d4..e19f858 100644
--- a/src/arrayops.jl
+++ b/src/arrayops.jl
@@ -247,12 +247,10 @@ struct LoopVecRange{N} <: AbstractUnitRange{Int}
     stop::Int
 
     Base.@propagate_inbounds function LoopVecRange{N}(start::Int, stop::Int; unsafe=false) where N
-        N <= 0 && throw(ArgumentError("Width cannot be less than 1"))
+        N == 0 && throw(ArgumentError("Step cannot be zero"))
 
         if !unsafe
             @boundscheck (abs(stop - start) + 1) % N != 0 && throw(ArgumentError("Length of range, has to be a multiple of the width"))
-
-            @boundscheck stop < start && throw(ArgumentError("Stop cannot be less than start"))
         end
 
         return new{N}(start, stop)
@@ -263,19 +261,21 @@ Base.@propagate_inbounds LoopVecRange{N}(r::Base.OneTo; unsafe=false) where N =
 Base.@propagate_inbounds LoopVecRange{N}(r::UnitRange; unsafe=false) where N = LoopVecRange{N}(r.start, r.stop, unsafe=unsafe)
 Base.@propagate_inbounds LoopVecRange{N}(x::AbstractVector; unsafe=false) where N = LoopVecRange{N}(eachindex(x), unsafe=unsafe)
 
-Base.isempty(r::LoopVecRange) = r.start > r.stop
+Base.isempty(r::LoopVecRange{N}) where N = (r.start != r.stop) & ((N > zero(N)) != (r.stop > r.start))
 
 Base.step(r::LoopVecRange{N}) where N = N
 Base.has_offset_axes(::LoopVecRange) = false
 
-Base.first(r::LoopVecRange{N}) where N = VecRange{N}(r.start)
-Base.last(r::LoopVecRange{N}) where N = VecRange{N}(r.stop - N + 1)
+Base.first(r::LoopVecRange{N}) where N = VecRange{abs(N)}(r.start + (N < 0 ? N + 1 : 0))
+Base.last(r::LoopVecRange{N}) where N = VecRange{abs(N)}(r.stop + (N > 0 ? -N + 1 : 0))
 
 Base.iterate(r::LoopVecRange) = isempty(r) ? nothing : (first(r), first(r))
 
-function Base.iterate(r::LoopVecRange{N}, i::VecRange{N}) where N
+function Base.iterate(r::LoopVecRange{N}, i::VecRange) where N
     @inline
-    i.i >= (r.stop - N + 1) && return nothing # greater than or equal prevents infinite loop if length of range is not a multiple of width
+    if (N > zero(N) && i.i >= last(r).i) || (N < zero(N) && i.i <= last(r).i) # greater than or equal prevents infinite loop if length of range is not a multiple of width
+        return nothing 
+    end
     next = i + step(r)
     (next, next)
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index 4cb74f3..7d6e390 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -967,12 +967,108 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...))
     end
 
     @testset "LoopVecRange" begin
-        @test_throws ArgumentError LoopVecRange{-1}(1, 8)
         @test_throws ArgumentError LoopVecRange{0}(1, 8)
         @test_throws ArgumentError LoopVecRange{2}(1, 3)
-        @test_throws ArgumentError LoopVecRange{2}(3, 2)
-        @test_throws ArgumentError LoopVecRange{1}(3, 2)
         @test_throws ArgumentError LoopVecRange{4}(3, 3)
+
+        @test_throws ArgumentError LoopVecRange{0}(8, 1)
+        @test_throws ArgumentError LoopVecRange{2}(3, 1)
+    end
+
+    @testset "Reverse LoopVecRange Real-world examples" begin
+
+        function vadd!(xs::AbstractArray{T,1}, ys::AbstractArray{T,1},
+                        ::Type{Vec{N,T}}) where {N,T}
+            @assert length(ys) == length(xs)
+            @assert length(xs) % N == 0
+            @inbounds for lane in LoopVecRange{-N}(length(xs), 1)
+                xs[lane] += ys[lane]
+            end
+        end
+
+        let xs = valloc(Float64, L4, 4*L4) do i i end,
+            ys = valloc(Float64, L4, 4*L4) do i 1 end
+            vadd!(xs, ys, V4F64)
+            @test xs == Float64[i+1 for i in 1:(4*L4)]
+            # @code_native vadd!(xs, ys, V4F64)
+    
+            ir = llvm_ir(vadd!, (xs, ys, V4F64))
+            @test occursin(r"( load <4 x double>.*){2}"s, ir)
+            @test occursin(" store <4 x double>", ir)
+            @test occursin(" fadd <4 x double>", ir)
+        end
+    
+        function vsum(xs::AbstractArray{T,1}, ::Type{Vec{N,T}}) where {N,T}
+            @assert length(xs) % N == 0
+            sv = Vec{N,T}(0)
+            @inbounds for lane in LoopVecRange{-N}(length(xs), 1)
+                sv += xs[lane]
+            end
+            sum(sv)
+        end
+
+        let xs = valloc(Float64, L4, 4*L4) do i i end
+            s = vsum(xs, V4F64)
+            @test s === (x->(x^2+x)/2)(Float64(4*L4))
+            # @code_native vsum(xs, V4F64)
+
+            ir = llvm_ir(vsum, (xs, V4F64))
+            @test occursin(" load <4 x double>", ir)
+            @test occursin(" fadd <4 x double>", ir)
+            # @test occursin(r"( shufflevector <4 x double>.*){2}"s, ir)
+        end
+
+    
+        function vadd_masked!(xs::AbstractArray{T,1}, ys::AbstractArray{T,1},
+                                ::Type{Vec{N,T}}) where {N, T}
+            @assert length(ys) == length(xs)
+            limit = length(xs) - (N-1)
+            vlimit = Vec(ntuple(i -> length(xs) - i + 1, Val(N)))
+            @inbounds for lane in LoopVecRange{-N}(length(xs) + (N - 1), 1, unsafe=true)
+                if lane.i <= limit
+                    xs[lane] += ys[lane]
+                else
+                    mask = Vec{N,Int}(lane.i) <= vlimit
+                    xs[lane, mask] = xs[lane, mask] + ys[lane, mask]
+                end
+            end
+        end
+
+        let xs = valloc(Float64, 4, 13) do i i end,
+            ys = valloc(Float64, 4, 13) do i 1 end
+            vadd_masked!(xs, ys, V4F64)
+            @test xs == Float64[i+1 for i in 1:13]
+            # @code_native vadd!(xs, ys, V4F64)
+    
+            ir = llvm_ir(vadd_masked!, (xs, ys, V4F64))
+            @test occursin(r"(masked.load.v4f64.*){2}"s, ir)
+            @test occursin("masked.store.v4f64", ir)
+            @test occursin(" store <4 x double>", ir)
+            @test occursin(" fadd <4 x double>", ir)
+        end
+        
+        function vsum_masked(xs::AbstractArray{T,1}, ::Type{Vec{N,T}}) where {N,T}
+            vlimit = Vec(ntuple(i -> length(xs) - i + 1, Val(N)))
+            sv = Vec{N,T}(0)
+            @inbounds for lane in LoopVecRange{-N}(length(xs) + (N - 1), 1, unsafe=true)
+                mask = Vec{N,Int}(lane.i) <= vlimit
+                sv += xs[lane, mask]
+            end
+            sum(sv)
+        end
+
+        let xs = valloc(Float64, 4, 13) do i i end
+            s = vsum_masked(xs, V4F64)
+            # @code_llvm vsum(xs, V4F64)
+            # @code_native vsum(xs, V4F64)
+            @test s === sum(xs)
+
+            ir = llvm_ir(vsum_masked, (xs, V4F64))
+            @test occursin("masked.load.v4f64", ir)
+            @test occursin(" fadd <4 x double>", ir)
+            # @test occursin(r"( shufflevector <4 x double>.*){2}"s, ir)
+        end
+
     end
 
     @testset "Vector shuffles" begin

From 91f9fcf983036d5088347da5c33ef19b2637fe70 Mon Sep 17 00:00:00 2001
From: Zentrik <Zentrik@users.noreply.github.com>
Date: Sun, 9 Apr 2023 15:27:27 +0100
Subject: [PATCH 4/6] Change readme

---
 README.md | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 3a02847..23d12b1 100644
--- a/README.md
+++ b/README.md
@@ -29,9 +29,8 @@ using SIMD
 function vadd!(xs::Vector{T}, ys::Vector{T}, ::Type{Vec{N,T}}) where {N, T}
     @assert length(ys) == length(xs)
     @assert length(xs) % N == 0
-    lane = VecRange{N}(0)
-    @inbounds for i in 1:N:length(xs)
-        xs[lane + i] += ys[lane + i]
+    @inbounds for lane in LoopVecRange{N}(xs)
+        xs[lane] += ys[lane]
     end
 end
 ```

From 73eba7962ff2ccad0288c894cac09f2607f2d2fe Mon Sep 17 00:00:00 2001
From: Zentrik <Zentrik@users.noreply.github.com>
Date: Sun, 9 Apr 2023 15:27:43 +0100
Subject: [PATCH 5/6] Fix range type

---
 src/arrayops.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/arrayops.jl b/src/arrayops.jl
index e19f858..e43f84d 100644
--- a/src/arrayops.jl
+++ b/src/arrayops.jl
@@ -242,7 +242,7 @@ julia> function vadd!(xs::Vector{T}, ys::Vector{T}, ::Type{Vec{N,T}}) where {N,
 end
 ```
 """
-struct LoopVecRange{N} <: AbstractUnitRange{Int}
+struct LoopVecRange{N} <: AbstractRange{Int}
     start::Int
     stop::Int
 

From 40cd698c555c1ad0f86a23f4f64529b975bedd3c Mon Sep 17 00:00:00 2001
From: Zentrik <Zentrik@users.noreply.github.com>
Date: Sun, 18 Jun 2023 18:49:10 +0100
Subject: [PATCH 6/6] Restore tests

---
 test/runtests.jl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index 7d6e390..bbaec35 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -819,8 +819,8 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...))
 
         let xs = valloc(Float64, 4, 13) do i i end
             s = vsum_masked(xs, V4F64)
-            # @code_llvm vsum(xs, V4F64)
-            # @code_native vsum(xs, V4F64)
+            @code_llvm vsum(xs, V4F64)
+            @code_native vsum(xs, V4F64)
             @test s === sum(xs)
 
             ir = llvm_ir(vsum_masked, (xs, V4F64))
@@ -860,8 +860,8 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...))
                 vcompress!(dest, pred, src)
                 @test dest[1:sum(pred)] == src[src .> 0]
 
-                # @code_llvm vcompress!(dest, pred, src)
-                # @code_native vcompress!(dest, pred, src)
+                @code_llvm vcompress!(dest, pred, src)
+                @code_native vcompress!(dest, pred, src)
 
                 ir = llvm_ir(vcompress!, (dest, pred, src))
                 @test occursin("masked.compressstore.v4f64", ir)