Commit 95e8982

Merge pull request #4628 from halide/atomic_vectorization

Within-vector reductions

abadams authored Jun 25, 2020
2 parents 95f154a + 015f650

Showing 50 changed files with 3,193 additions and 99 deletions.
1 change: 1 addition & 0 deletions python_bindings/src/PyEnums.cpp
@@ -142,6 +142,7 @@ void define_enums(py::module &m) {
.value("WasmSignExt", Target::Feature::WasmSignExt)
.value("SVE", Target::Feature::SVE)
.value("SVE2", Target::Feature::SVE2)
.value("ARMDotProd", Target::Feature::ARMDotProd)
.value("FeatureEnd", Target::Feature::FeatureEnd);

py::enum_<halide_type_code_t>(m, "TypeCode")
34 changes: 34 additions & 0 deletions src/Bounds.cpp
@@ -1403,6 +1403,40 @@ class Bounds : public IRVisitor {
interval = result;
}

void visit(const VectorReduce *op) override {
TRACK_BOUNDS_INTERVAL;
op->value.accept(this);
int factor = op->value.type().lanes() / op->type.lanes();
switch (op->op) {
case VectorReduce::Add:
if (interval.has_upper_bound()) {
interval.max *= factor;
}
if (interval.has_lower_bound()) {
interval.min *= factor;
}
break;
case VectorReduce::Mul:
// Technically there are some things we could say
// here. For example, if all the lanes are positive, we're
// bounded by the upper bound raised to the power of the
// factor. However, it's extremely unlikely that a mul
// reduce will ever make it into a bounds expression, so
// for now we bail.
interval = Interval::everything();
break;
case VectorReduce::Min:
case VectorReduce::Max:
// The bounds of a single lane are sufficient
break;
case VectorReduce::And:
case VectorReduce::Or:
// Don't try for now
interval = Interval::everything();
break;
}
}

void visit(const LetStmt *) override {
internal_error << "Bounds of statement\n";
}
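
For intuition, here is a minimal stand-alone sketch of the rule used above for VectorReduce::Add. The SimpleInterval type and sum_reduce_bounds function are hypothetical illustration names, not Halide's Interval class: if every lane of the reduced value lies in [min, max], then a sum over `factor` lanes lies in [factor * min, factor * max].

    // Minimal sketch (hypothetical names) of the VectorReduce::Add bounds rule.
    #include <cassert>
    #include <cstdint>

    struct SimpleInterval {
        int64_t min, max;
    };

    SimpleInterval sum_reduce_bounds(const SimpleInterval &lane, int factor) {
        // Each of the `factor` lanes contributes at least `lane.min` and at
        // most `lane.max`, so the bounds scale linearly with the factor.
        return {lane.min * factor, lane.max * factor};
    }

    int main() {
        // Reducing 4 lanes, each known to lie in [-3, 10]: the sum is in [-12, 40].
        SimpleInterval r = sum_reduce_bounds({-3, 10}, 4);
        assert(r.min == -12 && r.max == 40);
        return 0;
    }
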
187 changes: 183 additions & 4 deletions src/CodeGen_ARM.cpp
@@ -1,6 +1,7 @@
#include <iostream>
#include <sstream>

#include "CSE.h"
#include "CodeGen_ARM.h"
#include "CodeGen_Internal.h"
#include "ConciseCasts.h"
@@ -484,10 +485,6 @@ void CodeGen_ARM::visit(const Div *op) {
CodeGen_Posix::visit(op);
}

void CodeGen_ARM::visit(const Add *op) {
CodeGen_Posix::visit(op);
}

void CodeGen_ARM::visit(const Sub *op) {
if (neon_intrinsics_disabled()) {
CodeGen_Posix::visit(op);
@@ -1059,6 +1056,184 @@ void CodeGen_ARM::visit(const LE *op) {
CodeGen_Posix::visit(op);
}

void CodeGen_ARM::codegen_vector_reduce(const VectorReduce *op, const Expr &init) {
if (neon_intrinsics_disabled() ||
op->op == VectorReduce::Or ||
op->op == VectorReduce::And ||
op->op == VectorReduce::Mul) {
CodeGen_Posix::codegen_vector_reduce(op, init);
return;
}

// ARM has a variety of pairwise reduction ops for +, min,
// max. The versions that do not widen take two 64-bit args and
// return one 64-bit vector of the same type. The versions that
// widen take one arg and return something with half the vector
// lanes and double the bit-width.

int factor = op->value.type().lanes() / op->type.lanes();

// These are the types for which we have reduce intrinsics in the
// runtime.
bool have_reduce_intrinsic = (op->type.is_int() ||
op->type.is_uint() ||
op->type.is_float());

// We don't have 16-bit float or bfloat horizontal ops
if (op->type.is_bfloat() || (op->type.is_float() && op->type.bits() < 32)) {
have_reduce_intrinsic = false;
}

// Only aarch64 has float64 horizontal ops
if (target.bits == 32 && op->type.element_of() == Float(64)) {
have_reduce_intrinsic = false;
}

// For 64-bit integers, we only have addition, not min/max
if (op->type.bits() == 64 &&
!op->type.is_float() &&
op->op != VectorReduce::Add) {
have_reduce_intrinsic = false;
}

// We only have intrinsics that reduce by a factor of two
if (factor != 2) {
have_reduce_intrinsic = false;
}

if (have_reduce_intrinsic) {
Expr arg = op->value;
if (op->op == VectorReduce::Add &&
op->type.bits() >= 16 &&
!op->type.is_float()) {
Type narrower_type = arg.type().with_bits(arg.type().bits() / 2);
Expr narrower = lossless_cast(narrower_type, arg);
if (!narrower.defined() && arg.type().is_int()) {
// We can also safely accumulate from a uint into a
// wider int, because the addition uses at most one
// extra bit.
narrower = lossless_cast(narrower_type.with_code(Type::UInt), arg);
}
if (narrower.defined()) {
arg = narrower;
}
}
int output_bits;
if (target.bits == 32 && arg.type().bits() == op->type.bits()) {
// For the non-widening version, the output must be 64-bit
output_bits = 64;
} else if (op->type.bits() * op->type.lanes() <= 64) {
// No point using the 128-bit version of the instruction if the output is narrow.
output_bits = 64;
} else {
output_bits = 128;
}

const int output_lanes = output_bits / op->type.bits();
Type intrin_type = op->type.with_lanes(output_lanes);
Type arg_type = arg.type().with_lanes(output_lanes * 2);
if (op->op == VectorReduce::Add &&
arg.type().bits() == op->type.bits() &&
arg_type.is_uint()) {
// For non-widening additions, there is only a signed
// version (because it's equivalent).
arg_type = arg_type.with_code(Type::Int);
intrin_type = intrin_type.with_code(Type::Int);
} else if (arg.type().is_uint() && intrin_type.is_int()) {
// Use the uint version
intrin_type = intrin_type.with_code(Type::UInt);
}

std::stringstream ss;
vector<Expr> args;
ss << "pairwise_" << op->op << "_" << intrin_type << "_" << arg_type;
Expr accumulator = init;
if (op->op == VectorReduce::Add &&
accumulator.defined() &&
arg_type.bits() < intrin_type.bits()) {
// We can use the accumulating variant
ss << "_accumulate";
args.push_back(init);
accumulator = Expr();
}
args.push_back(arg);
value = call_intrin(op->type, output_lanes, ss.str(), args);

if (accumulator.defined()) {
// We still have an initial value to take care of
string n = unique_name('t');
sym_push(n, value);
Expr v = Variable::make(accumulator.type(), n);
switch (op->op) {
case VectorReduce::Add:
accumulator += v;
break;
case VectorReduce::Min:
accumulator = min(accumulator, v);
break;
case VectorReduce::Max:
accumulator = max(accumulator, v);
break;
default:
internal_error << "unreachable";
}
codegen(accumulator);
sym_pop(n);
}

return;
}

// Pattern-match 8-bit dot product instructions available on newer
// ARM cores.
if (target.has_feature(Target::ARMDotProd) &&
factor % 4 == 0 &&
op->op == VectorReduce::Add &&
target.bits == 64 &&
(op->type.element_of() == Int(32) ||
op->type.element_of() == UInt(32))) {
const Mul *mul = op->value.as<Mul>();
if (mul) {
const int input_lanes = mul->type.lanes();
Expr a = lossless_cast(UInt(8, input_lanes), mul->a);
Expr b = lossless_cast(UInt(8, input_lanes), mul->b);
if (!a.defined()) {
a = lossless_cast(Int(8, input_lanes), mul->a);
b = lossless_cast(Int(8, input_lanes), mul->b);
}
if (a.defined() && b.defined()) {
if (factor != 4) {
Expr equiv = VectorReduce::make(op->op, op->value, input_lanes / 4);
equiv = VectorReduce::make(op->op, equiv, op->type.lanes());
codegen_vector_reduce(equiv.as<VectorReduce>(), init);
return;
}
Expr i = init;
if (!i.defined()) {
i = make_zero(op->type);
}
vector<Expr> args{i, a, b};
if (op->type.lanes() <= 2) {
if (op->type.is_uint()) {
value = call_intrin(op->type, 2, "llvm.aarch64.neon.udot.v2i32.v8i8", args);
} else {
value = call_intrin(op->type, 2, "llvm.aarch64.neon.sdot.v2i32.v8i8", args);
}
} else {
if (op->type.is_uint()) {
value = call_intrin(op->type, 4, "llvm.aarch64.neon.udot.v4i32.v16i8", args);
} else {
value = call_intrin(op->type, 4, "llvm.aarch64.neon.sdot.v4i32.v16i8", args);
}
}
return;
}
}
}

CodeGen_Posix::codegen_vector_reduce(op, init);
}
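
The pairwise intrinsics this function targets (the NEON vpadd/vpaddl family, wrapped by the pairwise_* runtime helpers named above) reduce adjacent lane pairs. The following is a behavioral sketch only, assuming uint8_t lanes widening into uint16_t; the function name pairwise_widening_add is illustrative, not the actual runtime helper:

    // Behavioral sketch of a widening pairwise add (what a vpaddl.u8-style op
    // computes): out[i] = in[2*i] + in[2*i + 1], held at double the bit width.
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    std::vector<uint16_t> pairwise_widening_add(const std::vector<uint8_t> &in) {
        std::vector<uint16_t> out(in.size() / 2);
        for (std::size_t i = 0; i < out.size(); i++) {
            // Two 8-bit lanes sum to at most 510, which always fits in 16 bits,
            // so the widening variants never need saturation.
            out[i] = uint16_t(in[2 * i]) + uint16_t(in[2 * i + 1]);
        }
        return out;
    }
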

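Similarly, the udot/sdot pattern matched above performs a four-way multiply-accumulate per output lane. The function below is a scalar model of what one llvm.aarch64.neon.udot.v4i32.v16i8 call produces (a behavioral sketch with an illustrative name, not the intrinsic itself):

    // Behavioral sketch of the AArch64 udot instruction: each 32-bit output
    // lane accumulates the dot product of four adjacent 8-bit lanes from each
    // input, on top of an existing accumulator.
    #include <array>
    #include <cstdint>

    std::array<uint32_t, 4> udot_v4i32_v16i8(std::array<uint32_t, 4> acc,
                                             const std::array<uint8_t, 16> &a,
                                             const std::array<uint8_t, 16> &b) {
        for (int i = 0; i < 4; i++) {
            for (int j = 0; j < 4; j++) {
                acc[i] += uint32_t(a[4 * i + j]) * uint32_t(b[4 * i + j]);
            }
        }
        return acc;
    }

Because the instruction only ever reduces groups of four lanes, the code above handles larger reduction factors by first reducing by four (which the dot product can do) and then applying a second VectorReduce to the intermediate result.
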
string CodeGen_ARM::mcpu() const {
if (target.bits == 32) {
if (target.has_feature(Target::ARMv7s)) {
@@ -1094,6 +1269,10 @@ string CodeGen_ARM::mattrs() const {
arch_flags = "+sve";
}

if (target.has_feature(Target::ARMDotProd)) {
arch_flags += "+dotprod";
}

if (target.os == Target::IOS || target.os == Target::OSX) {
return arch_flags + "+reserve-x18";
} else {
2 changes: 1 addition & 1 deletion src/CodeGen_ARM.h
@@ -24,7 +24,6 @@ class CodeGen_ARM : public CodeGen_Posix {
/** Nodes for which we want to emit specific neon intrinsics */
// @{
void visit(const Cast *) override;
void visit(const Add *) override;
void visit(const Sub *) override;
void visit(const Div *) override;
void visit(const Mul *) override;
@@ -35,6 +34,7 @@ class CodeGen_ARM : public CodeGen_Posix {
void visit(const Call *) override;
void visit(const LT *) override;
void visit(const LE *) override;
void codegen_vector_reduce(const VectorReduce *, const Expr &) override;
// @}

/** Various patterns to peephole match against */
(Diffs for the remaining 46 changed files are not shown.)
