WIP: support mixed Int/CartesianIndex indexing #84

Merged: 5 commits, Apr 8, 2020
45 changes: 27 additions & 18 deletions src/reconstruct_loopset.jl
@@ -1,3 +1,5 @@
const NOpsType = Union{Int,Vector{Int}}

function Loop(ls::LoopSet, ex::Expr, sym::Symbol, ::Type{<:AbstractUnitRange})
ssym = String(sym)
start = gensym(ssym*"_loopstart"); stop = gensym(ssym*"_loopstop"); loopsym = gensym(ssym * "_loop")
@@ -54,7 +56,7 @@ end

function ArrayReferenceMeta(
ls::LoopSet, @nospecialize(ar::ArrayRefStruct), arraysymbolinds::Vector{Symbol},
opsymbols::Vector{Symbol}, nopsv::Vector{Int}, expandedv::Vector{Bool}
opsymbols::Vector{Symbol}, nopsv::Vector{NOpsType}, expandedv::Vector{Bool}
)
index_types = ar.index_types
indices = ar.indices
@@ -73,7 +75,14 @@ function ArrayReferenceMeta(
elseif index_types == ComputedIndex
opsym = opsymbols[ind]
if expandedv[ind]
for j ∈ 0:nopsv[ind]-1
nops = nopsv[ind]
if isa(nops, Vector)
n = first(nops)
if all(isequal(n), nops)
nops = n
end
end
for j ∈ 0:nops-1
pushfirst!(index_vec, expandedopname(opsym, j))
pushfirst!(loopedindex, false)
end
@@ -144,7 +153,7 @@ function add_mref!(ls::LoopSet, ar::ArrayReferenceMeta, i::Int, ::Type{<:Abstrac
end
function create_mrefs!(
ls::LoopSet, arf::Vector{ArrayRefStruct}, as::Vector{Symbol}, os::Vector{Symbol},
nopsv::Vector{Int}, expanded::Vector{Bool}, vargs
nopsv::Vector{NOpsType}, expanded::Vector{Bool}, vargs
)
mrefs = Vector{ArrayReferenceMeta}(undef, length(arf))
for i ∈ eachindex(arf)
@@ -230,14 +239,11 @@ function calcnops(ls::LoopSet, os::OperationStruct)
offsets = ls.loopsymbol_offsets
idxs = loopindex(ls, os.loopdeps, 0x04) # FIXME DRY
iszero(length(idxs)) && return 1
Δidxs = map(i->offsets[i+1]-offsets[i], idxs)
nops = first(Δidxs)
@assert all(isequal(nops), Δidxs)
nops
return map(i->offsets[i+1]-offsets[i], idxs)
end
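
As an illustration of the change above (the offsets below are made up for the example, not taken from the package): for an operation depending on a CartesianIndex{2} loop and a plain Int loop, ls.loopsymbol_offsets might be [0, 2, 3], and calcnops now reports the expansion width of each loop index rather than a single Int:

offsets = [0, 2, 3]                        # assumed per-loop symbol offsets
idxs = [1, 2]                              # loop indices the operation depends on
map(i -> offsets[i+1] - offsets[i], idxs)  # returns [2, 1]
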
Member commented:
For now, only loads and stores are allowed to have a mix of nops as inputs.
nops for loads and stores are also always 1, so we don't need a vector.

function isexpanded(ls::LoopSet, ops::Vector{OperationStruct}, nopsv::Vector{Int}, i::Int)
function isexpanded(ls::LoopSet, ops::Vector{OperationStruct}, nopsv::Vector{NOpsType}, i::Int)
nops = nopsv[i]
isone(nops) && return false
(nops === 1 || nops == [1]) && return false
os = ops[i]
optyp = optype(os)
if optyp == compute
@@ -250,7 +256,7 @@ function isexpanded(ls::LoopSet, ops::Vector{OperationStruct}, nopsv::Vector{Int
end

function add_op!(
ls::LoopSet, instr::Instruction, ops::Vector{OperationStruct}, nopsv::Vector{Int}, expandedv::Vector{Bool}, i::Int,
ls::LoopSet, instr::Instruction, ops::Vector{OperationStruct}, nopsv::Vector{NOpsType}, expandedv::Vector{Bool}, i::Int,
mrefs::Vector{ArrayReferenceMeta}, opsymbol, elementbytes::Int
)
os = ops[i]
@@ -272,9 +278,15 @@ function add_op!(
push!(opoffsets, opoffsets[end] + 1)
return
end
if isa(nops, Vector)
n = first(nops)
if all(isequal(n), nops)
nops = n
end
end
# if expanded, optyp must be either loopvalue, or compute (with loopvalues in its ancestry, not cutoff by loads)
for offset = 0:nops-1
sym = nops == 1 ? opsymbol : expandedopname(opsymbol, offset)
sym = nops === 1 ? opsymbol : expandedopname(opsymbol, offset)
op = Operation(
length(operations(ls)), sym, elementbytes, instr,
optyp, loopdependencies(ls, os, false, offset), reduceddependencies(ls, os, false, offset),
@@ -295,7 +307,7 @@ function add_parents_to_op!(ls::LoopSet, vparents::Vector{Operation}, up::Unsign
for j ∈ offsets[i]+1:offsets[i+1] # if parents are expanded, add them all
pushfirst!(vparents, ops[j])
end
end
end
else#if isexpanded
# Do we want to require that all Δidxs are equal?
# Because `CartesianIndex((2,3)) - 1` results in a MethodError, I think this is reasonable for now
@@ -318,15 +330,15 @@ function add_parents_to_ops!(ls::LoopSet, ops::Vector{OperationStruct}, constoff
pushpreamble!(ls, Expr(:(=), instr.instr, Expr(:macrocall, Symbol("@inbounds"), LineNumberNode(@__LINE__, Symbol(@__FILE__)), Expr(:ref, :vargs, constoffset))))
end
elseif !isloopvalue(op)
add_parents_to_op!(ls, parents(op), ops[i].parents, k, Δ)
add_parents_to_op!(ls, parents(op), ops[i].parents, k, Δ)
end
end
end
constoffset
end
function add_ops!(
ls::LoopSet, instr::Vector{Instruction}, ops::Vector{OperationStruct}, mrefs::Vector{ArrayReferenceMeta},
opsymbols::Vector{Symbol}, constoffset::Int, nopsv::Vector{Int}, expandedv::Vector{Bool}, elementbytes::Int
opsymbols::Vector{Symbol}, constoffset::Int, nopsv::Vector{NOpsType}, expandedv::Vector{Bool}, elementbytes::Int
)
# @show ls.loopsymbols ls.loopsymbol_offsets
for i ∈ eachindex(ops)
@@ -378,7 +390,7 @@ function avx_loopset(instr, ops, arf, AM, LPSYM, LB, vargs)
resize!(ls.loop_order, ls.loopsymbol_offsets[end])
arraysymbolinds = gen_array_syminds(AM)
opsymbols = [gensym(:op) for _ ∈ eachindex(ops)]
nopsv = calcnops.(Ref(ls), ops)
nopsv = NOpsType[calcnops(ls, op) for op in ops]
expandedv = [isexpanded(ls, ops, nopsv, i) for i ∈ eachindex(ops)]
mrefs = create_mrefs!(ls, arf, arraysymbolinds, opsymbols, nopsv, expandedv, vargs)
pushpreamble!(ls, Expr(:(=), ls.T, Expr(:call, :promote_type, [Expr(:call, :eltype, vptr(mref)) for mref ∈ mrefs]...)))
@@ -417,6 +429,3 @@ end
ls = _avx_loopset(OPS.parameters, ARF.parameters, AM.parameters, LPSYM.parameters, LB.parameters, vargs)
avx_body(ls, UT)
end



64 changes: 59 additions & 5 deletions test/miscellaneous.jl
@@ -1,5 +1,9 @@
using LoopVectorization
using LinearAlgebra
using Test

@testset "Miscellaneous" begin

Unum, Tnum = LoopVectorization.VectorizationBase.REGISTER_COUNT == 16 ? (3, 4) : (4, 6)
dot3q = :(for m ∈ 1:M, n ∈ 1:N
s += x[m] * A[m,n] * y[n]
@@ -48,7 +52,7 @@
end
s
end

subcolq = :(for i ∈ 1:size(A,2), j ∈ eachindex(x)
B[j,i] = A[j,i] - x[j]
end)
@@ -220,10 +224,10 @@
# t = β₁ = β₂ = ρ = s = 0.0; weights = rand(1); nodes = rand(1); lomnibus(args...) = +(args...)
# LoopVectorization.@avx_debug for i ∈ eachindex(weights, nodes)
# s += weights[i] * lomnibus(nodes[i], t, β₁, β₂, ρ)
# end
# end
# @macroexpand @avx for i ∈ eachindex(weights, nodes)
# s += weights[i] * lomnibus(nodes[i], t, β₁, β₂, ρ)
# end
# end
function softmax3_core!(lse, qq, xx, tmpmax, maxk, nk)
for k in Base.OneTo(maxk)
@inbounds for i in eachindex(lse)
@@ -486,7 +490,7 @@
out[i] = 1 + i
end
end

for T ∈ (Float32, Float64)
@show T, @__LINE__
A = randn(T, 199, 498);
@@ -550,7 +554,7 @@
@test view(vec(B1), r) == view(vec(B2), r)
fill!(B2, NaN); test_for_with_different_index_avx!(B2, A, C, start_sample, num_samples)
@test view(vec(B1), r) == view(vec(B2), r)

ni, nj, nk = (127, 113, 13)
x = rand(T, ni, nj, nk);
q1 = similar(x);
@@ -652,4 +656,54 @@
one_plus_i_avx!(out2)
@test out1 == out2
end

@testset "Mixed CartesianIndex/Int indexing" begin
# A demo similar to the exponential filtering demo from https://julialang.org/blog/2016/02/iteration/,
# but with no loop-carried dependency.
function smoothdim!(s, x, α, Rpre, irng::AbstractUnitRange, Rpost)
ifirst, ilast = first(irng), last(irng)
ifirst > ilast && return s
for Ipost in Rpost
# Initialize the first value along the filtered dimension
for Ipre in Rpre
s[Ipre, ifirst, Ipost] = x[Ipre, ifirst, Ipost]
end
# Handle all other entries
for i = ifirst+1:ilast
for Ipre in Rpre
s[Ipre, i, Ipost] = α*x[Ipre, i, Ipost] + (1-α)*x[Ipre, i-1, Ipost]
end
end
end
s
end
function smoothdim_avx!(s, x, α, Rpre, irng::AbstractUnitRange, Rpost)
ifirst, ilast = first(irng), last(irng)
ifirst > ilast && return s
@avx for Ipost in Rpost
# Initialize the first value along the filtered dimension
for Ipre in Rpre
s[Ipre, ifirst, Ipost] = x[Ipre, ifirst, Ipost]
end
Contributor (author) commented:
At least for d=1 in the test-loop at the bottom of this file, the test failure is due to this loop failing to run for all of Rpost; it only runs for first(Rpost). (And I do mean Rpost rather than Rpre.) I am guessing this is a loop-ordering thing? I looked at the expanded code but struggled to figure out which pieces come from what parts of this code; if you get a chance, mind looking at that? For the 3d test below, test failures occur for d=1 and d=3, whereas it passes on d=2. Would probably be best to reenable the 5d (or at least, 4d) case before merging, since you need d=5 to test the case of indexing with x[::CI{2}, ::Int, ::CI{2}] (CI=CartesianIndex).
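
For concreteness, the x[::CI{2}, ::Int, ::CI{2}] case would arise from a 5-d array with the filtered dimension in the middle (sizes made up for illustration), mirroring the test loop at the bottom of the file:

x5 = rand(5, 5, 5, 5, 5)
d = 3
Rpre  = CartesianIndices(axes(x5)[1:d-1])    # CartesianIndices over dims 1:2
Rpost = CartesianIndices(axes(x5)[d+1:end])  # CartesianIndices over dims 4:5
# s[Ipre, i, Ipost] then indexes as x5[::CartesianIndex{2}, ::Int, ::CartesianIndex{2}]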

Member commented:
I think heterogeneous nops should either only be allowed for memload and memstore operations, or perhaps also be allowed for compute iff Set(Δidxs) == Set([1,n]) (i.e., broadcasting the nops == 1 entries to match the others).

I'd lean toward the former for now, for simplicity: maintain the old behavior, except for indexing operations.
That way, you could make the @assert immediate, but conditional on !accesses_memory(op).

I'll take a deeper look at the code in the next few hours and leave more comments.
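
A minimal sketch of the relaxed compute-op rule described above; heterogeneous_nops_ok is a hypothetical helper written for illustration, not part of the package:

# Accept heterogeneous expansion counts only when they are 1s plus a
# single other width n, i.e. Set(Δidxs) == Set([1, n]), so the 1s can
# be broadcast up to n.
function heterogeneous_nops_ok(Δidxs::Vector{Int})
    n = maximum(Δidxs)
    isone(n) || Set(Δidxs) == Set([1, n])
end

heterogeneous_nops_ok([1, 2, 1])  # true: the 1s broadcast up to 2
heterogeneous_nops_ok([2, 3])     # false: two distinct widths > 1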

@chriselrod (Member) commented on Apr 8, 2020:
I found a couple problems. I'll push when I've fixed them and leave a few comments.
Two of the highlights:

  1. subset_vptr! is broken with CartesianIndices, because it used the number of loops to find the axis. It also gets called when constructing the initial LoopSet, not in reconstruct_loopset, meaning we can't supply it the offset vector. I'll either drop it and trust LLVM to take care of that optimization, or adjust the function call to do the right thing anyway.
  2. I'm using const NOpsType = Int, and the calcnops function returns maximum(i->offsets[i+1]-offsets[i], idxs).

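Under that plan, point 2 would amount to something like the following sketch, which reuses the calcnops body shown in the diff above (an assumed reading of the planned change, not the pushed code):

const NOpsType = Int

function calcnops(ls::LoopSet, os::OperationStruct)
    offsets = ls.loopsymbol_offsets
    idxs = loopindex(ls, os.loopdeps, 0x04) # FIXME DRY
    iszero(length(idxs)) && return 1
    maximum(i -> offsets[i+1] - offsets[i], idxs)
end
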
@chriselrod (Member) commented on Apr 8, 2020:
The tile=(1,1) is unnecessary. I just added that while experimenting.
But I think @avx is being overly aggressive in unrolling this. I'll run benchmarks later.

LoopVectorization also doesn't support multiple loops at the same level in a loop nest yet.

for m in 1:M
    for n in 1:N
    end
    for n in 1:N
    end
end

Would be good to add eventually.

Contributor (author) replied:
But it's nice that I don't have to think about cache-friendliness when writing out those loops!

# Handle all other entries
for i = ifirst+1:ilast
for Ipre in Rpre
s[Ipre, i, Ipost] = α*x[Ipre, i, Ipost] + (1-α)*x[Ipre, i-1, Ipost]
end
end
Member commented:
Not as pretty, but another way to write the function.

I think this may risk crashing because of the read to x[Ipre, ifirst-1, Ipost], even though that value won't be used, so I'm not inclined to recommend this until I add explicit support for conditional loads, like the conditional store support it has now.
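
A plausible sketch of the kind of rewrite being discussed (an assumption about the suggestion, not the reviewer's exact code): fold the initialization into the main loop so there is a single loop nest, selecting the previous element only when it exists. Under @avx the load of x[Ipre, i-1, Ipost] at i == ifirst may still be issued unconditionally, which is the crash risk described above, so the sketch is shown without @avx.

function smoothdim_merged!(s, x, α, Rpre, irng::AbstractUnitRange, Rpost)
    ifirst, ilast = first(irng), last(irng)
    ifirst > ilast && return s
    for Ipost in Rpost, i in ifirst:ilast, Ipre in Rpre
        # At i == ifirst this reduces to s = x, since α*x + (1-α)*x == x.
        xprev = i == ifirst ? x[Ipre, i, Ipost] : x[Ipre, i-1, Ipost]
        s[Ipre, i, Ipost] = α*x[Ipre, i, Ipost] + (1-α)*xprev
    end
    s
end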

Member followed up:
This did cause crashes on Travis. I'll work on turning this into a conditional load.

end
s
end

x = rand(11,11,11) # ,11,11)
dest1, dest2 = similar(x), similar(x)
α = 0.3
for d = 1:ndims(x)
Rpre = CartesianIndices(axes(x)[1:d-1])
Rpost = CartesianIndices(axes(x)[d+1:end])
smoothdim!(dest1, x, α, Rpre, axes(x, d), Rpost)
smoothdim_avx!(dest2, x, α, Rpre, axes(x, d), Rpost)
@test dest1 ≈ dest2
end
end
end