[CreatePrimFunc] Support multi-source ReduceNode (apache#64)
* initial

* assert structural equal test
hypercubestart authored and YuchenJin committed Mar 2, 2022
1 parent 9807ab8 commit 89f8867
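
For context, a "multi-source" ReduceNode is what TE produces for a tuple reduction built with `te.comm_reducer`: the combiner operates on tuples, so the resulting `ReduceNode` carries several `source` expressions and the `ComputeOp` has one output per source. Before this patch, `CreatePrimFunc` rejected such ops via `ICHECK_EQ(reduce->source.size(), 1)`. A minimal sketch of the pattern (adapted from the argmax test added in this commit; not part of the patch itself):

import tvm
from tvm import te

# Combiner over (index, value) pairs: keep the pair with the larger value.
def fcombine(x, y):
    idx = tvm.tir.Select(x[1] >= y[1], x[0], y[0])
    val = tvm.tir.Select(x[1] >= y[1], x[1], y[1])
    return idx, val

# The identity element is also a pair.
def fidentity(t0, t1):
    return tvm.tir.const(-1, t0), tvm.te.min_value(t1)

argmax = te.comm_reducer(fcombine, fidentity, name="argmax")

m, n = te.var("m"), te.var("n")
idx = te.placeholder((m, n), name="idx", dtype="int32")
val = te.placeholder((m, n), name="val", dtype="int32")
k = te.reduce_axis((0, n), "k")
# Two outputs backed by one reduction: this is the multi-source case.
T0, T1 = te.compute((m,), lambda i: argmax((idx[i, k], val[i, k]), axis=k), name="T")

# With this patch, both outputs are lowered into a single reduction block.
func = te.create_prim_func([idx, val, T0, T1])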
Showing 2 changed files with 141 additions and 23 deletions.
94 changes: 71 additions & 23 deletions src/te/operation/create_primfunc.cc
@@ -83,9 +83,10 @@ struct CreateFuncInfo {
}
};

BlockRealize GenerateBlockFromTensor(const te::ComputeOp& compute_op, const te::Tensor& tensor,
Array<PrimExpr> bindings, PrimExpr expr_body,
CreateFuncInfo* info, arith::Analyzer* analyzer) {
BlockRealize GenerateBlockFromTensors(const te::ComputeOp& compute_op,
const Array<te::Tensor>& tensors, Array<PrimExpr> bindings,
PrimExpr expr_body, CreateFuncInfo* info,
arith::Analyzer* analyzer) {
// Step 1. Push_back data_par axis and reduce_axis into block_vars.
Array<IterVar> iter_vars;
std::unordered_map<const VarNode*, PrimExpr> var_map;
@@ -108,12 +109,18 @@ BlockRealize GenerateBlockFromTensor(const te::ComputeOp& compute_op, const te::
f_push_block_vars(compute_op->reduce_axis);

// Step 2. Declare buffer and update op2buffers
Buffer buffer = decl_buffer(tensor->shape, tensor->dtype, tensor->GetNameHint(), "global");
info->tensor2buffers[tensor] = buffer;
Array<Buffer> buffers;
for (const te::Tensor& tensor : tensors) {
Buffer buffer = decl_buffer(tensor->shape, tensor->dtype, tensor->GetNameHint(), "global");
info->tensor2buffers[tensor] = buffer;
buffers.push_back(buffer);
}

// Step 3. Add Buffer to root_alloc
if (!info->IsArg(tensor)) {
info->root_alloc.push_back(buffer);
for (const te::Tensor& tensor : tensors) {
if (!info->IsArg(tensor)) {
info->root_alloc.push_back(info->tensor2buffers[tensor]);
}
}

// Step 4. Calculate indices for BufferStore
@@ -130,18 +137,35 @@ BlockRealize GenerateBlockFromTensor(const te::ComputeOp& compute_op, const te::
Stmt body;
if (const auto* reduce = expr_body.as<ReduceNode>()) {
// Case 1. Reduce compute
ICHECK_EQ(reduce->source.size(), 1);
const PrimExpr& lhs = BufferLoad(buffer, indices);
const PrimExpr& rhs = Substitute(info->transformer(reduce->source[0]), var_map);
ICHECK(lhs->dtype == rhs->dtype);
const PrimExpr& reduce_body = reduce->combiner.get()->operator()({lhs}, {rhs})[0];
const PrimExpr& init_body = reduce->combiner->identity_element[0];
body = BufferStore(buffer, analyzer->Simplify(reduce_body), indices);
init = BufferStore(buffer, analyzer->Simplify(init_body), indices);
Array<PrimExpr> lhs;
Array<PrimExpr> rhs;

for (size_t i = 0; i < buffers.size(); i++) {
const PrimExpr& left = BufferLoad(buffers[i], indices);
const PrimExpr& right = Substitute(info->transformer(reduce->source[i]), var_map);
lhs.push_back(left);
rhs.push_back(right);

ICHECK(left->dtype == right->dtype);
}

Array<Stmt> body_stmts;
Array<Stmt> init_stmts;

for (size_t i = 0; i < buffers.size(); i++) {
const Buffer& buffer = buffers[i];
body_stmts.push_back(
BufferStore(buffer, reduce->combiner.get()->operator()(lhs, rhs)[i], indices));
init_stmts.push_back(BufferStore(buffer, reduce->combiner->identity_element[i], indices));
}

body = SeqStmt::Flatten(body_stmts);
init = SeqStmt::Flatten(init_stmts);
} else {
// Case 2. Data parallel compute
const PrimExpr& compute_body = Substitute(info->transformer(expr_body), var_map);
body = BufferStore(buffer, analyzer->Simplify(compute_body), indices);
ICHECK_EQ(tensors.size(), 1);
body = BufferStore(info->tensor2buffers[tensors[0]],
Substitute(info->transformer(expr_body), var_map), indices);
}

// Step 6. Add script_parsing_detect_access attr for auto-completing the whole IR.
@@ -175,14 +199,20 @@ BlockRealize GenerateBlockFromTensor(const te::ComputeOp& compute_op, const te::
Block(/*iter_vars=*/std::move(iter_vars),
/*reads=*/{},
/*writes=*/{},
/*name_hint=*/info->GetUniqueName(tensor->GetNameHint()),
/*name_hint=*/info->GetUniqueName(tensors[0]->GetNameHint()),
/*body=*/std::move(body),
/*init=*/std::move(init),
/*alloc_buffers=*/{},
/*match_buffers=*/{},
/*annotations=*/std::move(annotations)));
}

inline bool ReduceEqual(const tir::ReduceNode* a, const tir::ReduceNode* b) {
return (a->combiner.same_as(b->combiner)) && (a->source.same_as(b->source)) &&
(a->axis.same_as(b->axis)) && (a->condition.same_as(b->condition)) &&
((a->init.empty() && b->init.empty()) || (a->init.same_as(b->init)));
}

Stmt GenerateStmtFromCompute(const te::ComputeOp& compute_op, CreateFuncInfo* info,
arith::Analyzer* analyzer) {
// Step 1. Creating loop vars for block bindings.
@@ -194,12 +224,30 @@ Stmt GenerateStmtFromCompute(const te::ComputeOp& compute_op, CreateFuncInfo* in
}
// Step 2. Generate block bodies.
Array<Stmt> seq_stmt;
for (int i = 0; i < compute_op->num_outputs(); ++i) {
const te::Tensor& tensor = compute_op.output(i);
PrimExpr expr_body = compute_op->body[i];
seq_stmt.push_back(GenerateBlockFromTensor(compute_op, tensor, bindings, std::move(expr_body),
info, analyzer));
if (compute_op->body[0]->IsInstance<ReduceNode>()) {
PrimExpr expr_body = compute_op->body[0];
Array<te::Tensor> tensors = {compute_op.output(0)};
const tir::ReduceNode* reduce = expr_body.as<tir::ReduceNode>();
// Specially handle reductions with multiple outputs (tuple reductions).
for (size_t k = 1; k < compute_op->body.size(); ++k) {
const tir::ReduceNode* reduce_ = compute_op->body[k].as<tir::ReduceNode>();
ICHECK(reduce_);
ICHECK(ReduceEqual(reduce_, reduce)) << "The Reduce inputs of ComputeOp should "
<< "have the same attribute except value_index";
tensors.push_back(compute_op.output(k));
}

seq_stmt.push_back(GenerateBlockFromTensors(compute_op, tensors, bindings, std::move(expr_body),
info, analyzer));
} else {
for (int i = 0; i < compute_op->num_outputs(); ++i) {
const te::Tensor& tensor = compute_op.output(i);
PrimExpr expr_body = compute_op->body[i];
seq_stmt.push_back(GenerateBlockFromTensors(compute_op, {tensor}, bindings,
std::move(expr_body), info, analyzer));
}
}

Stmt body = SeqStmt::Flatten(seq_stmt);

// Step 3. Generate loop nesting.
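A note on the new `ReduceEqual` helper above: each output of a tuple reduction carries its own `ReduceNode`, and those nodes are expected to share the combiner, sources, axes, condition, and init, differing only in `value_index` (which output of the combiner they select). That is why the helper compares everything with `same_as` except `value_index`. A small inspection sketch, assuming the `te_argmax()` construction from the test file below and relying on `te.comm_reducer` reusing the same combiner/source objects across outputs:

# Not part of the patch: inspect the Reduce nodes behind a tuple reduction.
idx, val, T0, T1 = te_argmax()
reduce0 = T0.op.body[0]  # tvm.tir.Reduce producing the index output
reduce1 = T0.op.body[1]  # tvm.tir.Reduce producing the value output
assert reduce0.combiner.same_as(reduce1.combiner)
assert reduce0.source.same_as(reduce1.source)
assert reduce0.value_index == 0 and reduce1.value_index == 1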
70 changes: 70 additions & 0 deletions tests/python/unittest/test_te_create_primfunc.py
@@ -398,6 +398,75 @@ def test_unbound_var():
tvm.testing.assert_allclose(a_np, b.numpy())


def te_argmax():
# x and y are the operands of the reduction; each is a tuple of (index, value).
def fcombine(x, y):
lhs = tvm.tir.Select((x[1] >= y[1]), x[0], y[0])
rhs = tvm.tir.Select((x[1] >= y[1]), x[1], y[1])
return lhs, rhs

# The identity element also needs to be a tuple, so `fidentity` accepts
# two dtypes as inputs.
def fidentity(t0, t1):
return tvm.tir.const(-1, t0), tvm.te.min_value(t1)

argmax = te.comm_reducer(fcombine, fidentity, name="argmax")

# describe the reduction computation
m = te.var("m")
n = te.var("n")
idx = te.placeholder((m, n), name="idx", dtype="int32")
val = te.placeholder((m, n), name="val", dtype="int32")
k = te.reduce_axis((0, n), "k")
T0, T1 = te.compute((m,), lambda i: argmax((idx[i, k], val[i, k]), axis=k), name="T")
return [idx, val, T0, T1]


@T.prim_func
def tir_argmax(
var_idx: T.handle, var_val: T.handle, var_T_v0: T.handle, var_T_v1: T.handle
) -> None:
m = T.var("int32")
n = T.var("int32")
idx = T.match_buffer(var_idx, [m, n], dtype="int32")
val = T.match_buffer(var_val, [m, n], dtype="int32")
T_v0 = T.match_buffer(var_T_v0, [m], dtype="int32")
T_v1 = T.match_buffer(var_T_v1, [m], dtype="int32")
# body
# with T.block("root")
for i0, i1 in T.grid(m, n):
with T.block("T.v0"):
i, k = T.axis.remap("SR", [i0, i1])
with T.init():
T_v0[i] = -1
T_v1[i] = -2147483648
T_v0[i] = T.Select(T_v1[i] >= val[i, k], T_v0[i], idx[i, k])
T_v1[i] = T.Select(T_v1[i] >= val[i, k], T_v1[i], val[i, k])


def test_argmax():
_check_workload(te_argmax, tir_argmax)

dtype = "int32"
func = te.create_prim_func(te_argmax())
assert len(func.params) == 4

func = tvm.build(func)

idx_np = np.arange(100, dtype=dtype).reshape((10, 10))
val_np = np.random.permutation(100).reshape((10, 10)).astype(dtype)
c = tvm.nd.array(np.zeros(10, dtype=dtype)) # argmax index
d = tvm.nd.array(np.zeros(10, dtype=dtype)) # max value
func(tvm.nd.array(idx_np), tvm.nd.array(val_np), c, d)

c_expected = idx_np[np.arange(10), np.argmax(val_np, axis=1)]
d_expected = np.amax(val_np, axis=1)

tvm.testing.assert_allclose(c_expected, c.numpy())
tvm.testing.assert_allclose(d_expected, d.numpy())


if __name__ == "__main__":
test_unique_name()
test_matmul()
@@ -412,3 +481,4 @@ def test_unbound_var():
test_tensor_attr()
test_loop_var_datatype()
test_unbound_var()
test_argmax()
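
As a usage note, the new coverage can be exercised on its own — e.g. `python tests/python/unittest/test_te_create_primfunc.py` or `pytest tests/python/unittest/test_te_create_primfunc.py -k argmax` — since `test_argmax` both asserts structural equality of the generated PrimFunc against the handwritten `tir_argmax` (via the `_check_workload` helper defined earlier in the file) and builds and runs it, comparing the results with NumPy's `argmax`/`amax`.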
