[USMP] Adding support for U4 usecase #10785

Merged: 3 commits, Apr 26, 2022
11 changes: 11 additions & 0 deletions include/tvm/tir/usmp/transform.h
@@ -56,6 +56,17 @@ TVM_DLL Pass ConvertPoolAllocationsToOffsets(const Map<tir::Stmt, PoolAllocation
*/
TVM_DLL Pass AssignPoolInfo();

/*!
* \brief This pass creates Allocate nodes for I/O tensors
*
* If the user wants to place the I/O tensors in the workspace, this pass is required to be
* run. It will create Allocate nodes for the I/O tensors so they can be planned, and remove
* them from the function arguments.
*
* \return the pass
*/
TVM_DLL Pass CreateAllocatesForIO();

} // namespace transform
} // namespace usmp
} // namespace tir
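
A rough usage sketch, not part of the patch: only the two config keys below come from this change; the tiny ReLU model, target, and executor flags are illustrative. Enabling tir.usmp.use_workspace_io (the PassContext option introduced in utils.h below) makes lowering run CreateAllocatesForIO, so the I/O tensors get planned into the workspace pools.

import tvm
import tvm.micro
from tvm import relay
from tvm.relay.backend import Executor, Runtime

# A minimal model to build (placeholder).
x = relay.var("input1", shape=(1, 8), dtype="float32")
mod = tvm.IRModule.from_expr(relay.Function([x], relay.nn.relu(x)))

executor = Executor("aot", {"interface-api": "c", "unpacked-api": True})
runtime = Runtime("crt")
with tvm.transform.PassContext(
    opt_level=3,
    config={
        "tir.usmp.enable": True,            # turn on the unified static memory planner
        "tir.usmp.use_workspace_io": True,  # plan I/O tensors into the workspace pools
    },
):
    factory = relay.build(mod, target="c", executor=executor, runtime=runtime)

tvm.micro.export_model_library_format(factory, "./module.tar")
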
47 changes: 42 additions & 5 deletions include/tvm/tir/usmp/utils.h
@@ -41,10 +41,20 @@ constexpr const char* kUSMPEnableOption = "tir.usmp.enable";
* \brief PassContext option to select the memory planning algorithm in USMP
*/
constexpr const char* kUSMPAlgorithmOption = "tir.usmp.algorithm";
/*!
* \brief PassContext option to enable placing I/O tensors in the workspace
*/
constexpr const char* kUSMPUseWorkspaceIO = "tir.usmp.use_workspace_io";

namespace tir {
namespace usmp {

/*!
* \brief A special kind to distinguish between the I/O tensors of the model
* and its intermediate tensors
*/
enum class BufferInfoKind { kIntermediate = 0, kInput = 1, kOutput = 2 };
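
When inspecting planner output from Python it can help to mirror these kind values; a small convenience sketch (the integer values are fixed by the enum above, but the Python class itself is not part of TVM):

from enum import IntEnum

class BufferInfoKind(IntEnum):
    INTERMEDIATE = 0  # intermediate tensor of the model
    INPUT = 1         # model input
    OUTPUT = 2        # model output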

/*!
* \brief Describes an abstract memory buffer that will get allocated inside a pool.
* The actual memory buffer is represented by PoolAllocationNode after static memory planning.
@@ -65,19 +75,22 @@ struct BufferInfoNode : public Object {
Integer alignment;
/*! \brief The liveness conflicting other buffer info objects */
Array<ObjectRef> conflicts;
/*! \brief Whether the BufferInfo object describes an I/O tensor or an intermediate tensor */
BufferInfoKind kind;

void VisitAttrs(tvm::AttrVisitor* v) {
v->Visit("name_hint", &name_hint);
v->Visit("size_bytes", &size_bytes);
v->Visit("pool_candidates", &pool_candidates);
v->Visit("alignment", &alignment);
v->Visit("conflicts", &conflicts);
v->Visit("kind", &kind);
}

bool SEqualReduce(const BufferInfoNode* other, SEqualReducer equal) const {
return equal(name_hint, other->name_hint) && equal(size_bytes, other->size_bytes) &&
equal(pool_candidates, other->pool_candidates) && equal(alignment, other->alignment) &&
equal(conflicts, other->conflicts);
equal(conflicts, other->conflicts) && equal(kind, other->kind);
}

void SHashReduce(SHashReducer hash_reduce) const {
@@ -86,6 +99,7 @@ struct BufferInfoNode : public Object {
hash_reduce(alignment);
hash_reduce(conflicts);
hash_reduce(pool_candidates);
hash_reduce(kind);
}
/*!
* \brief Set the liveness conflicts of this BufferInfo
@@ -101,7 +115,8 @@
class BufferInfo : public ObjectRef {
public:
TVM_DLL BufferInfo(String name_hint, Integer size_bytes, Array<PoolInfo> pool_candidates,
Integer alignment = runtime::kDefaultWorkspaceAlignment);
Integer alignment = runtime::kDefaultWorkspaceAlignment,
BufferInfoKind kind = BufferInfoKind::kIntermediate);
TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(BufferInfo, ObjectRef, BufferInfoNode);
};

@@ -237,6 +252,18 @@ Integer CalculateModuleWorkspaceSize(const IRModule& mod);
*/
static constexpr const char* kPoolCandidatesAllocateAttr = "candidate_memory_pools";

/*!
* \brief The allocate node attribute to indicate it is being used to hold
* an input tensor that needs to be initialized with the input data.
*/
static constexpr const char* kInputTensorAllocate = "input_tensor";

/*!
* \brief The allocate node attribute to indicate it is being used to hold
* an output tensor.
*/
static constexpr const char* kOutputTensorAllocate = "output_tensor";

/*!
* \brief Calculate the size of the extents in bytes
*
@@ -254,6 +281,16 @@ Map<Stmt, PoolAllocation> AssignStmtPoolAllocations(
const Map<BufferInfo, Stmt>& buffer_info_to_stmt,
const Map<BufferInfo, PoolAllocation>& buffer_info_to_pool_allocation);

/*!
* \brief Obtains a map of I/O tensor names to their PoolAllocation objects
*
* \param buffer_info_to_pool_allocation the map of BufferInfo objects to PoolAllocation objects
*
* This function returns the pool allocations of the I/O tensors if they have been planned
*/
Map<String, PoolAllocation> GetIOPoolAllocations(
const Map<BufferInfo, PoolAllocation>& buffer_info_to_pool_allocation);

} // namespace usmp
} // namespace tir
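
Conceptually, this helper filters the planner result down to the I/O entries and keys them by tensor name. A Python sketch of the idea, reusing the BufferInfoKind mirror from earlier (an illustration, not the actual C++ implementation):

def get_io_pool_allocations(buffer_info_to_pool_allocation):
    # Keep only buffers that describe model inputs/outputs, keyed by name.
    return {
        str(buf_info.name_hint): pool_allocation
        for buf_info, pool_allocation in buffer_info_to_pool_allocation.items()
        if buf_info.kind in (BufferInfoKind.INPUT, BufferInfoKind.OUTPUT)
    }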

@@ -265,10 +302,10 @@ namespace attr {
static constexpr const char* kPoolArgs = "pool_args";

/*!
* \brief This is an IRModule attribute that contains all the PoolInfo objects
* as an Array.
* \brief This is an IRModule attribute that maps I/O tensor names to pool
* allocations.
*/
static constexpr const char* kPoolInfoIRModuleAttr = "pool_infos";
static constexpr const char* kIOTensorPoolAllocations = "io_tensor_pool_allocations";

} // namespace attr

31 changes: 17 additions & 14 deletions python/tvm/micro/model_library_format.py
@@ -47,15 +47,15 @@ class UnsupportedInModelLibraryFormatError(Exception):


def generate_c_interface_header(
module_name, inputs, outputs, pools, devices, workspace_size, include_path
module_name, inputs, outputs, pools, io_pool_allocations, devices, workspace_size, include_path
):
"""Generate C Interface header to be included in MLF"""
mangled_name = to_c_variable_style(prefix_generated_name(module_name))
metadata_header = os.path.join(include_path, f"{mangled_name}.h")

interface_c_create = tvm._ffi.get_global_func("runtime.InterfaceCCreate")
interface_c_module = interface_c_create(
module_name, inputs, outputs, pools, devices, workspace_size
module_name, inputs, outputs, pools, io_pool_allocations, devices, workspace_size
)

with open(metadata_header, "w") as header_file:
@@ -281,24 +281,19 @@ def _convert_tuple_to_outputs(ret_type, offset=0):


def _get_inputs_and_outputs_from_module(mod):
main_func = _get_main_relay_func(mod)
inputs = [argument.name_hint for argument in main_func.params]

if "output_tensor_names" in main_func.attrs:
outputs = main_func.attrs["output_tensor_names"]
else:
if isinstance(main_func.ret_type, TupleType):
outputs = _convert_tuple_to_outputs(main_func.ret_type)
else:
outputs = ["output"]

inputs = [str(input_var.name) for input_var in mod.executor_codegen_metadata.inputs]
outputs = list(mod.executor_codegen_metadata.outputs)
return inputs, outputs


def _get_pools_from_module(mod):
return list(dict(mod.executor_codegen_metadata.pool_inputs).values())


def _get_io_pool_allocation_from_module(mod):
return dict(mod.executor_codegen_metadata.io_pool_allocations)


def _should_generate_interface_header(mod):
return "interface-api" in mod.executor and mod.executor["interface-api"] == "c"

@@ -369,9 +364,17 @@ def _export_graph_model_library_format(
inputs, outputs = _get_inputs_and_outputs_from_module(mod)
devices = mod.get_devices()
pools = _get_pools_from_module(mod)
io_pool_allocations = _get_io_pool_allocation_from_module(mod)
workspace_size = int(metadata["memory"]["functions"]["main"][0]["workspace_size_bytes"])
generate_c_interface_header(
mod.libmod_name, inputs, outputs, pools, devices, workspace_size, include_path
mod.libmod_name,
inputs,
outputs,
pools,
io_pool_allocations,
devices,
workspace_size,
include_path,
)

parameters_dir = tempdir / "parameters"
118 changes: 78 additions & 40 deletions src/relay/backend/aot_executor_codegen.cc
@@ -784,13 +784,18 @@ class AOTExecutorCodegen : public MixedModeVisitor {
* \brief Create tir::Var for input/output while updating
* the buffer_maps.
*/
void CreateIOVar(const Expr& expr, std::string name) {
void CreateIOVar(const Expr& expr, const std::string& original_name,
bool use_unique_name = true) {
if (expr->IsInstance<TupleNode>()) {
Tuple tuple = Downcast<Tuple>(expr);
for (unsigned i = 0; i < tuple->fields.size(); i++) {
CreateIOVar(tuple->fields[i], name + std::to_string(i) + "_");
CreateIOVar(tuple->fields[i], original_name);
}
} else {
std::string name = original_name;
if (use_unique_name) {
name = GetUniqueIOVarName(original_name);
}
tir::Var var = tir::Var(name, DataType::Handle());
main_signature_.push_back(var);
auto tensor_type = expr->checked_type().as<TensorTypeNode>();
@@ -804,6 +809,19 @@ class AOTExecutorCodegen : public MixedModeVisitor {
}
}

/*!
* \brief Create a unique name for an I/O Var
*/
std::string GetUniqueIOVarName(std::string name) {
if (io_var_names_.find(name) == io_var_names_.end()) {
io_var_names_[name] = 1;
return name;
} else {
io_var_names_[name] = io_var_names_[name] + 1;
return name + std::to_string(io_var_names_[name]);
}
}
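
The naming scheme is easiest to see in a Python mirror of the function above: the first use of a name is kept verbatim, and later uses append the running count. So a two-output model without output_tensor_names ends up with vars named output and output2.

_io_var_names = {}

def get_unique_io_var_name(name):
    # Mirrors GetUniqueIOVarName (illustrative sketch).
    if name not in _io_var_names:
        _io_var_names[name] = 1
        return name
    _io_var_names[name] += 1
    return name + str(_io_var_names[name])

# get_unique_io_var_name("output")  -> "output"
# get_unique_io_var_name("output")  -> "output2"
# get_unique_io_var_name("output")  -> "output3"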

/*!
* \brief Calculate workspace sizes for PrimFuncs in the IRModule
*/
@@ -945,6 +963,8 @@ class AOTExecutorCodegen : public MixedModeVisitor {
std::vector<tir::Stmt> stmts_;
/*! \brief the list of return sids (note that the function might return more than one output) */
std::vector<int> return_sid_;
/*! \brief A per-I/O-var-name counter to aid generating unique names */
std::unordered_map<std::string, int> io_var_names_;

public:
AOTExecutorCodegen(runtime::Module* mod, const tec::TargetMap& targets, Target target_host)
@@ -1032,7 +1052,10 @@ class AOTExecutorCodegen : public MixedModeVisitor {
for (auto input : lowered_main_func->params) {
input_vars_.push_back(input);
std::string input_name = SanitizeName(input->name_hint());
CreateIOVar(input, input_name);
// We don't want the compiler changing input names in the
// event of a sanitization collision. Therefore, we enforce
// that the created var uses input_name verbatim.
CreateIOVar(input, input_name, /*use_unique_name = */ false);
}

// Define the storage allocator ids
@@ -1052,7 +1075,27 @@ class AOTExecutorCodegen : public MixedModeVisitor {
// Retrieve the return sids
return_sid_ = final_aot_allocator.GetReturnIds();
// Insert outputs to main func signature
CreateIOVar(lowered_main_func->body, "output");
// If output tensor names were provided, use them
if (auto opt = func->GetAttr<Array<String>>("output_tensor_names")) {
Array<String> output_tensor_names = opt.value();
if (lowered_main_func->body->IsInstance<TupleNode>()) {
Tuple output_tuple = Downcast<Tuple>(lowered_main_func->body);
for (unsigned i = 0; i < output_tuple->fields.size(); i++) {
// AoT Executor Codegen does not create these names,
// so they should be used exactly as provided.
CreateIOVar(output_tuple->fields[i], output_tensor_names[i],
/*use_unique_name = */ false);
}
} else {
// AoT Executor Codegen does not create these names,
// so they should be used exactly as provided.
CreateIOVar(lowered_main_func->body, output_tensor_names[0], /*use_unique_name = */ false);
}
} else {
// If output tensor names are not provided, we generate output(x),
// where x is a counter used to create unique names.
CreateIOVar(lowered_main_func->body, "output");
}

CollectDeviceVariables(lowered_mod->GetAttr<Map<GlobalVar, String>>("device_contexts").value());
VisitExpr(lowered_main_func->body);
@@ -1071,8 +1114,27 @@ class AOTExecutorCodegen : public MixedModeVisitor {
// AoT Executor codegen works completely on TIR beyond this point, hence removing relay main
// function and replacing it with its TIR version. We should try to make this a Pass.
lowered_mod->Remove(lowered_mod->GetGlobalVar("main"));
auto prim_func = CreateMainFunc(mod_name, lowered_main_func->params.size());
lowered_mod->Update(GlobalVar(::tvm::runtime::symbol::tvm_module_main), prim_func);
auto tir_main_func = CreateMainFunc(mod_name, lowered_main_func->params.size());
// Extract additional information around main TIR PrimFunc arguments
Array<String> devices = ListDevices();
const auto main_func_params_end_iterator =
tir_main_func->params.begin() + tir_main_func->params.size();
const auto outputs_begin_iterator =
main_func_params_end_iterator - return_sid_.size() - devices.size();
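// At this point the main PrimFunc params are laid out as
//   [ inputs... | outputs... | device vars... ];
// the workspace pool params are only added later, when the USMP
// passes run (see the pool handling further down).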
Array<tir::Var> inputs = Array<tir::Var>(tir_main_func->params.begin(), outputs_begin_iterator);
Array<TensorType> input_tensor_types;
for (auto i : inputs) {
input_tensor_types.push_back(io_tensor_types_[i]);
}
Array<tir::Var> outputs =
Array<tir::Var>(outputs_begin_iterator, main_func_params_end_iterator - devices.size());
std::vector<String> output_var_names;
for (const tir::Var& output : outputs) {
output_var_names.push_back(output->name_hint);
}

Array<TensorType> output_tensor_types{final_aot_allocator.GetReturnTtypes()};
lowered_mod->Update(GlobalVar(::tvm::runtime::symbol::tvm_module_main), tir_main_func);
// Parallel for loops are not supported in AoT codegen.
lowered_mod = tir::transform::ConvertForLoopsToSerial()(lowered_mod);

@@ -1109,9 +1171,10 @@ class AOTExecutorCodegen : public MixedModeVisitor {

ret.external_mods = external_modules.value();

// Extract USMP metadata to pass onto metadata sources
Map<tir::Var, tir::usmp::AllocatedPoolInfo> pool_var_info;
std::vector<tir::Var> pool_vars;
tir::PrimFunc tir_main_func =
tir_main_func =
Downcast<tir::PrimFunc>(lowered_mod->Lookup(::tvm::runtime::symbol::tvm_module_main));
Optional<Array<tir::usmp::AllocatedPoolInfo>> allocated_pool_infos =
tir_main_func->GetAttr<Array<tir::usmp::AllocatedPoolInfo>>(tvm::attr::kPoolArgs);
@@ -1122,41 +1185,16 @@ class AOTExecutorCodegen : public MixedModeVisitor {
pool_var_info.Set(tir_main_func->params[pool_var_index], allocated_pool_info);
}
}
Array<String> devices = ListDevices();
Array<tir::Var> inputs =
Array<tir::Var>(tir_main_func->params.begin(),
tir_main_func->params.begin() + tir_main_func->params.size() -
return_sid_.size() - pool_vars.size() - devices.size());
Map<String, tir::usmp::PoolAllocation> io_pool_allocations =
lowered_mod
->GetAttr<Map<String, tir::usmp::PoolAllocation>>(tvm::attr::kIOTensorPoolAllocations)
.value_or({});

Array<TensorType> input_tensor_types;
for (auto i : inputs) {
input_tensor_types.push_back(io_tensor_types_[i]);
}

std::vector<String> output_var_names;
if (auto opt = func->GetAttr<Array<String>>("output_tensor_names")) {
Array<String> output_tensor_names = opt.value();
for (size_t i = 0; i < output_tensor_names.size(); ++i) {
output_var_names.push_back(output_tensor_names[i]);
}
}

// If output names have not been specified then generate default output names
if (output_var_names.size() == 0) {
if (return_sid_.size() == 1) {
output_var_names.push_back(String("output"));
} else {
for (size_t i = 0; i < return_sid_.size(); ++i) {
output_var_names.push_back(String("output" + std::to_string(i)));
}
}
}

Array<TensorType> output_tensor_types{final_aot_allocator.GetReturnTtypes()};
ret.metadata =
ExecutorCodegenMetadata(inputs, input_tensor_types, output_var_names, output_tensor_types,
pool_vars, devices, runtime::kTvmExecutorAot, mod_name,
interface_api, unpacked_api, pool_var_info, io_pool_allocations);

ret.metadata = ExecutorCodegenMetadata(
inputs, input_tensor_types, output_var_names, output_tensor_types, pool_vars, devices,
runtime::kTvmExecutorAot, mod_name, interface_api, unpacked_api, pool_var_info);
return ret;
}

4 changes: 3 additions & 1 deletion src/relay/backend/utils.cc
@@ -185,7 +185,8 @@ ExecutorCodegenMetadata::ExecutorCodegenMetadata(
Array<tir::Var> inputs, Array<TensorType> input_tensor_types, Array<String> outputs,
Array<TensorType> output_tensor_types, Array<tir::Var> pools, Array<String> devices,
String executor, String mod_name, String interface_api, bool unpacked_api,
Map<tir::Var, tir::usmp::AllocatedPoolInfo> pool_inputs) {
Map<tir::Var, tir::usmp::AllocatedPoolInfo> pool_inputs,
Map<String, tir::usmp::PoolAllocation> io_pool_allocations) {
auto n = make_object<ExecutorCodegenMetadataNode>();
n->inputs = inputs;
n->input_tensor_types = input_tensor_types;
Expand All @@ -198,6 +199,7 @@ ExecutorCodegenMetadata::ExecutorCodegenMetadata(
n->unpacked_api = unpacked_api;
n->mod_name = mod_name;
n->pool_inputs = pool_inputs;
n->io_pool_allocations = io_pool_allocations;
data_ = std::move(n);
}
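
On the Python side, the new field surfaces through the executor codegen metadata, which is how model_library_format.py above picks it up; a short sketch where `factory` is a hypothetical relay.build result:

metadata = factory.executor_codegen_metadata
io_allocs = dict(metadata.io_pool_allocations)  # tensor name -> PoolAllocation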
