Skip to content

Commit

Permalink
Permit vectorization of non-recursive atomic operations (halide#7346)
Browse files Browse the repository at this point in the history
* Vectorization of non-recursive atomic operations

* Remove dead Vars
  • Loading branch information
abadams authored and ardier committed Mar 3, 2024
1 parent 5692db6 commit e703f0d
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 1 deletion.
17 changes: 16 additions & 1 deletion src/VectorizeLoops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1079,12 +1079,27 @@ class VectorSubs : public IRMutator {
break;
}

// f[x] = f[x] <op> y
const Store *store = op->body.as<Store>();
if (!store) {
break;
}

// f[x] = y
if (!expr_uses_var(store->value, store->name) &&
!expr_uses_var(store->predicate, store->name)) {
// This can be naively vectorized just fine. If there are
// repeated values in the vectorized store index, the ordering
// of writes may be undetermined and backend-dependent, but
// they'll be atomic.
Stmt s = mutate(store);

// We may still need the atomic node, if there was more
// parallelism than just the vectorization.
s = Atomic::make(op->producer_name, op->mutex_name, s);
return s;
}

// f[x] = f[x] <op> y
VectorReduce::Operator reduce_op = VectorReduce::Add;
Expr a, b;
if (const Add *add = store->value.as<Add>()) {
Expand Down
1 change: 1 addition & 0 deletions test/correctness/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,7 @@ tests(GROUPS correctness
realize_condition_depends_on_tuple.cpp
realize_larger_than_two_gigs.cpp
realize_over_shifted_domain.cpp
recursive_box_filters.cpp
reduction_chain.cpp
reduction_predicate_racing.cpp
reduction_non_rectangular.cpp
Expand Down
49 changes: 49 additions & 0 deletions test/correctness/recursive_box_filters.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#include "Halide.h"

using namespace Halide;

int main(int argc, char **argv) {
    // Builds a two-tap and a four-tap box filter in a single tuple-valued
    // Func. The four-tap output is formed recursively: it is the current
    // two-tap sum plus the two-tap sum from two samples earlier.

    const int size = 1024;

    // Input signal: f(x) == x, materialized up front.
    Var x;
    Func f;
    f(x) = x;
    f.compute_root();

    // Tuple element 0 holds the two-tap sum, element 1 the four-tap sum.
    // The pure definition leaves everything undefined; the first two
    // update definitions seed positions 0 and 1 explicitly.
    Func h;
    h(x) = {undef<int>(), undef<int>()};
    h(0) = {f(0), f(0)};
    h(1) = {f(1) + f(0), f(1) + f(0)};

    // Third update definition (index 2): the recursive step over [2, size).
    RDom r(2, size - 2);
    Expr blur2 = f(r) + f(r - 1);
    h(r) = {blur2, blur2 + h(r - 2)[0]};

    // Vectorizing this update is safe, but the update is not
    // associative/commutative. Passing 'true' to atomic() tells it to skip
    // that check.
    h.update(2).atomic(true).vectorize(r, 16);

    Buffer<int> out2(size);
    Buffer<int> out4(size);
    h.realize({out2, out4});

    // Compare against the closed-form sums. The check starts at 3, the
    // first index where all four taps (i - 3 .. i) are non-negative.
    for (int i = 3; i < size; i++) {
        const int expected2 = i + (i - 1);
        const int expected4 = i + (i - 1) + (i - 2) + (i - 3);

        const int got2 = out2(i);
        if (got2 != expected2) {
            printf("r0[%d] = %d instead of %d\n", i, got2, expected2);
            return -1;
        }

        const int got4 = out4(i);
        if (got4 != expected4) {
            printf("r1[%d] = %d instead of %d\n", i, got4, expected4);
            return -1;
        }
    }

    printf("Success!\n");

    return 0;
}

0 comments on commit e703f0d

Please sign in to comment.