From d8f639a84fd9a11f03a949eb751a18018a9c1033 Mon Sep 17 00:00:00 2001 From: Manupa Karunaratne Date: Wed, 23 Feb 2022 13:12:55 +0000 Subject: [PATCH] [AOT] BugFix of workspace calculation (#10337) Following an investigation from #10022, it turns out that the workspace calculation currently assumes a single lowered PrimFunc would be produced per primitive Relay Function. However, the exception turned out to be the CMSIS-NN codegen, which produces multiple calls/PrimFuncs in place of a single call to a single Relay PrimFunc. This commit changes the workspace calculation to be done on the lowered IRModule. Additionally, it changes the test utils not to generate any stack allocator code when USMP is used, to make the tests more strict. This change also removes the confusing "run_model", which has semantics identical to "__tvm_main__" in TIR. --- .../src/example_project/model.c | 2 +- apps/microtvm/zephyr_cmsisnn/src/main.c | 2 +- include/tvm/runtime/module.h | 2 - src/relay/backend/aot_executor_codegen.cc | 58 +++++++---- src/target/source/source_module.cc | 2 +- src/tir/usmp/transform/assign_pool_info.cc | 4 +- .../convert_pool_allocations_to_offsets.cc | 2 +- src/tir/usmp/unified_static_memory_planner.cc | 4 +- src/tir/usmp/utils.cc | 2 +- tests/python/contrib/test_ethosu/infra.py | 3 +- .../contrib/test_ethosu/test_networks.py | 2 +- tests/python/relay/aot/aot_test_utils.py | 97 +++++++++++++++---- tests/python/relay/aot/test_c_device_api.py | 4 +- tests/python/relay/aot/test_crt_aot.py | 80 +++++++++++++-- tests/python/relay/aot/test_crt_aot_usmp.py | 28 ++---- ...orm_convert_pool_allocations_to_offsets.py | 16 +-- tests/scripts/task_demo_microtvm.sh | 7 +- 17 files changed, 220 insertions(+), 95 deletions(-) diff --git a/apps/microtvm/arduino/template_project/src/example_project/model.c b/apps/microtvm/arduino/template_project/src/example_project/model.c index 553665191b14..25d609dacce1 100644 --- 
a/apps/microtvm/arduino/template_project/src/example_project/model.c +++ b/apps/microtvm/arduino/template_project/src/example_project/model.c @@ -86,7 +86,7 @@ tvm_crt_error_t TVMPlatformGenerateRandom(uint8_t* buffer, size_t num_bytes) { void TVMInitialize() { StackMemoryManager_Init(&app_workspace, g_aot_memory, WORKSPACE_SIZE); } void TVMExecute(void* input_data, void* output_data) { - int ret_val = tvmgen_default_run_model(input_data, output_data); + int ret_val = tvmgen_default___tvm_main__(input_data, output_data); if (ret_val != 0) { TVMPlatformAbort(kTvmErrorPlatformCheckFailure); } diff --git a/apps/microtvm/zephyr_cmsisnn/src/main.c b/apps/microtvm/zephyr_cmsisnn/src/main.c index 274bd63d3ea5..31f6cd0cc1d0 100644 --- a/apps/microtvm/zephyr_cmsisnn/src/main.c +++ b/apps/microtvm/zephyr_cmsisnn/src/main.c @@ -34,7 +34,7 @@ extern float output_storage[12]; extern const size_t output_len; -static uint8_t g_crt_workspace[TVMGEN_DEFAULT_WORKSPACE_SIZE + 512]; +static uint8_t g_crt_workspace[TVMGEN_DEFAULT_WORKSPACE_SIZE]; tvm_workspace_t app_workspace; void TVMLogf(const char* msg, ...) { diff --git a/include/tvm/runtime/module.h b/include/tvm/runtime/module.h index 7b5326a44921..2e2a79b1ca53 100644 --- a/include/tvm/runtime/module.h +++ b/include/tvm/runtime/module.h @@ -235,8 +235,6 @@ constexpr const char* tvm_module_main = "__tvm_main__"; constexpr const char* tvm_param_prefix = "__tvm_param__"; /*! \brief A PackedFunc that looks up linked parameters by storage_id. */ constexpr const char* tvm_lookup_linked_param = "_lookup_linked_param"; -/*! \brief The main AOT executor function generated from TIR */ -constexpr const char* tvm_run_func_suffix = "run_model"; /*! 
\brief Model entrypoint generated as an interface to the AOT function outside of TIR */ constexpr const char* tvm_entrypoint_suffix = "run"; } // namespace symbol diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index 2168ea74a0ff..a25ef458906c 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -658,8 +658,7 @@ class AOTExecutorCodegen : public MixedModeVisitor { // Define the PrimFunc attributes Map dict_attrs; - String run_func_name = - runtime::get_name_mangled(mod_name, runtime::symbol::tvm_run_func_suffix); + String run_func_name = runtime::get_name_mangled(mod_name, runtime::symbol::tvm_module_main); dict_attrs.Set("global_symbol", run_func_name); dict_attrs.Set("runner_function", Bool(true)); dict_attrs.Set(tvm::attr::kTarget, target_host_); @@ -702,6 +701,35 @@ class AOTExecutorCodegen : public MixedModeVisitor { } } + /*! + * brief Calculate workspace sizes for PrimFuncs in the IRModule + */ + Map CalculateWorkspaceSizes( + const IRModule& lowered_mod, const Map& function_metadata) { + Executor executor_config = lowered_mod->GetAttr(tvm::attr::kExecutor).value(); + Integer workspace_byte_alignment = + executor_config->GetAttr("workspace-byte-alignment").value_or(16); + Map updated_function_metadata; + for (const auto& kv : lowered_mod->functions) { + GlobalVar global_var = kv.first; + BaseFunc base_func = kv.second; + if (base_func->IsInstance()) { + tir::PrimFunc pfunc = Downcast(base_func); + Target tgt = pfunc->GetAttr(tvm::attr::kTarget).value(); + const auto& ws = CalculateWorkspaceBytes(pfunc, workspace_byte_alignment); + if (function_metadata.count(global_var->name_hint)) { + updated_function_metadata.Set(global_var->name_hint, + function_metadata[global_var->name_hint]); + updated_function_metadata[global_var->name_hint]->workspace_sizes.Set(tgt, ws); + } else { + FunctionInfo finfo{{{tgt, ws}}, {}, {}, {{tgt, pfunc}}, {}}; + 
updated_function_metadata.Set(global_var->name_hint, finfo); + } + } + } + return updated_function_metadata; + } + /*! * brief Run USMP to plan memory for lowered IRModule */ @@ -710,17 +738,8 @@ class AOTExecutorCodegen : public MixedModeVisitor { Integer workspace_byte_alignment = executor_config->GetAttr("workspace-byte-alignment").value_or(16); IRModule lowered_mod = mod->ShallowCopy(); + function_metadata_ = CalculateWorkspaceSizes(lowered_mod, function_metadata_); lowered_mod = tir::transform::UnifiedStaticMemoryPlanner()(lowered_mod); - // Update workspace size based on the pool allocations. - for (const auto& kv : function_metadata_) { - if (lowered_mod->ContainGlobalVar(kv.first) && - lowered_mod->Lookup(kv.first)->IsInstance()) { - tir::PrimFunc pfunc = Downcast(lowered_mod->Lookup(kv.first)); - Target tgt = pfunc->GetAttr(tvm::attr::kTarget).value(); - const auto& ws = CalculateWorkspaceBytes(pfunc, workspace_byte_alignment); - kv.second->workspace_sizes.Set(tgt, ws); - } - } Optional> allocated_pool_infos = lowered_mod->GetAttr>(tvm::attr::kPoolArgs); backend::FunctionInfo main_func_info = @@ -752,17 +771,18 @@ class AOTExecutorCodegen : public MixedModeVisitor { Integer workspace_byte_alignment = executor_config->GetAttr("workspace-byte-alignment").value_or(16); IRModule lowered_mod = mod->ShallowCopy(); + function_metadata_ = CalculateWorkspaceSizes(lowered_mod, function_metadata_); // Running StorageRewrite just on the main function tir::PrimFunc tir_main_func = - Downcast(lowered_mod->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)); + Downcast(lowered_mod->Lookup(::tvm::runtime::symbol::tvm_module_main)); IRModule main_func_mod; - main_func_mod->Update(lowered_mod->GetGlobalVar(::tvm::runtime::symbol::tvm_run_func_suffix), + main_func_mod->Update(lowered_mod->GetGlobalVar(::tvm::runtime::symbol::tvm_module_main), tir_main_func); main_func_mod = tir::transform::StorageRewrite()(main_func_mod); - 
lowered_mod->Update(lowered_mod->GetGlobalVar(::tvm::runtime::symbol::tvm_run_func_suffix), - main_func_mod->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)); + lowered_mod->Update(lowered_mod->GetGlobalVar(::tvm::runtime::symbol::tvm_module_main), + main_func_mod->Lookup(::tvm::runtime::symbol::tvm_module_main)); tir_main_func = - Downcast(lowered_mod->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)); + Downcast(lowered_mod->Lookup(::tvm::runtime::symbol::tvm_module_main)); // Use the PrimFunc to calculate the workspace required to service the allocates Integer main_workspace_size_bytes = CalculateWorkspaceBytes(tir_main_func, workspace_byte_alignment); @@ -920,7 +940,7 @@ class AOTExecutorCodegen : public MixedModeVisitor { // function and replacing it with its TIR version. We should try to make this a Pass. lowered_mod->Remove(lowered_mod->GetGlobalVar("main")); auto prim_func = CreateMainFunc(mod_name, lowered_main_func->params.size()); - lowered_mod->Update(GlobalVar(::tvm::runtime::symbol::tvm_run_func_suffix), prim_func); + lowered_mod->Update(GlobalVar(::tvm::runtime::symbol::tvm_module_main), prim_func); // Parallel for loops are not supported in AoT codegen. 
lowered_mod = tir::transform::ConvertForLoopsToSerial()(lowered_mod); @@ -960,7 +980,7 @@ class AOTExecutorCodegen : public MixedModeVisitor { Map pool_var_info; std::vector pool_vars; tir::PrimFunc tir_main_func = - Downcast(lowered_mod->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)); + Downcast(lowered_mod->Lookup(::tvm::runtime::symbol::tvm_module_main)); Optional> allocated_pool_infos = tir_main_func->GetAttr>(tvm::attr::kPoolArgs); if (allocated_pool_infos) { diff --git a/src/target/source/source_module.cc b/src/target/source/source_module.cc index 907eb061303f..97461ca2091f 100644 --- a/src/target/source/source_module.cc +++ b/src/target/source/source_module.cc @@ -474,7 +474,7 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { } void GenerateAOTDescriptor() { - const std::string run_func_suffix = ::tvm::runtime::symbol::tvm_run_func_suffix; + const std::string run_func_suffix = ::tvm::runtime::symbol::tvm_module_main; const std::string tvm_entrypoint_suffix = ::tvm::runtime::symbol::tvm_entrypoint_suffix; const std::string run_func_mangled = runtime::get_name_mangled(metadata_->mod_name, run_func_suffix); diff --git a/src/tir/usmp/transform/assign_pool_info.cc b/src/tir/usmp/transform/assign_pool_info.cc index 9d8e36137c37..a2304f3b9e3d 100644 --- a/src/tir/usmp/transform/assign_pool_info.cc +++ b/src/tir/usmp/transform/assign_pool_info.cc @@ -42,7 +42,7 @@ class PoolInfoAssigner : public StmtExprMutator { public: explicit PoolInfoAssigner(const IRModule& module) { PrimFunc main_func = - Downcast(module->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)); + Downcast(module->Lookup(::tvm::runtime::symbol::tvm_module_main)); ICHECK(main_func.defined()) << "main function is not in the module"; Optional target_host = main_func->GetAttr(tvm::attr::kTarget); ICHECK(target_host) << "main function does not have a target attr"; @@ -79,7 +79,7 @@ class PoolInfoAssigner : public StmtExprMutator { PoolInfo 
PoolInfoAssigner::CreateDefaultMemoryPool(const tvm::IRModule& module) { Map target_access; tir::PrimFunc tir_main_func = - Downcast(module->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)); + Downcast(module->Lookup(::tvm::runtime::symbol::tvm_module_main)); Target target_host = tir_main_func->GetAttr(tvm::attr::kTarget).value(); for (const auto& kv : module->functions) { BaseFunc func = kv.second; diff --git a/src/tir/usmp/transform/convert_pool_allocations_to_offsets.cc b/src/tir/usmp/transform/convert_pool_allocations_to_offsets.cc index 999ca37d2128..6abc48c31be0 100644 --- a/src/tir/usmp/transform/convert_pool_allocations_to_offsets.cc +++ b/src/tir/usmp/transform/convert_pool_allocations_to_offsets.cc @@ -331,7 +331,7 @@ PrimExpr PoolAllocationToOffsetConverter::VisitExpr_(const LoadNode* op) { } IRModule PoolAllocationToOffsetConverter::operator()() { - GlobalVar gv = module_->GetGlobalVar(::tvm::runtime::symbol::tvm_run_func_suffix); + GlobalVar gv = module_->GetGlobalVar(::tvm::runtime::symbol::tvm_module_main); PrimFunc main_func = Downcast(module_->Lookup(gv)); ScopeInfo si = UpdateFunctionScopeInfo(main_func); this->scope_stack.push(si); diff --git a/src/tir/usmp/unified_static_memory_planner.cc b/src/tir/usmp/unified_static_memory_planner.cc index 3b941d3cc021..e848440f029e 100644 --- a/src/tir/usmp/unified_static_memory_planner.cc +++ b/src/tir/usmp/unified_static_memory_planner.cc @@ -51,7 +51,7 @@ static std::unordered_map( IRModule PlanMemory(const IRModule& mod, String algo) { VLOG(1) << "workspace required = " << CalculateModuleWorkspaceSize(mod); - PrimFunc main_func = Downcast(mod->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)); + PrimFunc main_func = Downcast(mod->Lookup(::tvm::runtime::symbol::tvm_module_main)); BufferInfoAnalysis buffer_info_analysis = ExtractBufferInfo(main_func, mod); Array buffer_info_arr = CreateArrayBufferInfo(buffer_info_analysis->buffer_info_stmts); @@ -63,7 +63,7 @@ IRModule PlanMemory(const IRModule& mod, 
String algo) { buffer_info_analysis->buffer_info_stmts, buffer_info_pool_allocations); IRModule ret = transform::ConvertPoolAllocationsToOffsets(stmt_pool_allocations)(mod); tir::PrimFunc tir_main_func = - Downcast(ret->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)); + Downcast(ret->Lookup(::tvm::runtime::symbol::tvm_module_main)); Optional> allocated_pool_infos = tir_main_func->GetAttr>(tvm::attr::kPoolArgs); if (allocated_pool_infos) { diff --git a/src/tir/usmp/utils.cc b/src/tir/usmp/utils.cc index 5c95f7d7a7be..03fac325905c 100644 --- a/src/tir/usmp/utils.cc +++ b/src/tir/usmp/utils.cc @@ -181,7 +181,7 @@ class ModuleWorkspaceSizeCalculator : public StmtExprVisitor { for (const auto& gv_func : mod_->functions) { functions_.Set(gv_func.first->name_hint, Downcast(gv_func.second)); } - main_func_ = Downcast(module->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)); + main_func_ = Downcast(module->Lookup(::tvm::runtime::symbol::tvm_module_main)); ICHECK(main_func_.defined()) << "main function is not in the module"; Optional target_host = main_func_->GetAttr(tvm::attr::kTarget); ICHECK(target_host) << "main function does not have a target attr"; diff --git a/tests/python/contrib/test_ethosu/infra.py b/tests/python/contrib/test_ethosu/infra.py index 4bdaef7a74ca..b355c440e006 100644 --- a/tests/python/contrib/test_ethosu/infra.py +++ b/tests/python/contrib/test_ethosu/infra.py @@ -242,12 +242,13 @@ def build_source( def verify_source( models: List[AOTCompiledTestModel], accel="ethos-u55-256", + enable_usmp=True, ): """ This method verifies the generated source from an NPU module by building it and running on an FVP. 
""" interface_api = "c" - test_runner = create_test_runner(accel) + test_runner = create_test_runner(accel, enable_usmp) run_and_check( models, test_runner, diff --git a/tests/python/contrib/test_ethosu/test_networks.py b/tests/python/contrib/test_ethosu/test_networks.py index e9c6da5be18a..7e3140ff514a 100644 --- a/tests/python/contrib/test_ethosu/test_networks.py +++ b/tests/python/contrib/test_ethosu/test_networks.py @@ -71,7 +71,7 @@ def test_forward_mobilenet_v1(accel_type, enable_usmp): compiled_models = infra.build_source( mod, input_data, output_data, accel_type, output_tolerance=10, enable_usmp=enable_usmp ) - infra.verify_source(compiled_models, accel_type) + infra.verify_source(compiled_models, accel_type, enable_usmp=enable_usmp) if __name__ == "__main__": diff --git a/tests/python/relay/aot/aot_test_utils.py b/tests/python/relay/aot/aot_test_utils.py index b7021e5a8984..63817fc4b965 100644 --- a/tests/python/relay/aot/aot_test_utils.py +++ b/tests/python/relay/aot/aot_test_utils.py @@ -265,21 +265,29 @@ def emit_data_linkage(output_file, data_linkage): def emit_main_prologue( - main_file, custom_prologue, workspace_bytes, data_linkage, compiled_models, interface_api + main_file, + custom_prologue, + workspace_bytes, + data_linkage, + compiled_models, + interface_api, + use_stack_allocator=True, ): - # Add TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES because of memory alignment. 
- workspace_define = f"#define WORKSPACE_SIZE ({workspace_bytes}" - if interface_api == "c": - for compiled_model in compiled_models: - model = compiled_model.model - workspace_define += f" + TVMGEN_{model.name.upper()}_WORKSPACE_SIZE" - workspace_define += " + TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES)\n" - main_file.write(workspace_define) - emit_data_linkage(main_file, data_linkage) - main_file.write("static uint8_t g_aot_memory[WORKSPACE_SIZE];\n") - main_file.write("tvm_workspace_t app_workspace;\n") - main_file.write( - """ + if use_stack_allocator: + workspace_define = f"#define WORKSPACE_SIZE ({workspace_bytes}" + if interface_api == "c": + for compiled_model in compiled_models: + model = compiled_model.model + workspace_define += f" + TVMGEN_{model.name.upper()}_WORKSPACE_SIZE" + # Add TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES because of memory alignment. + workspace_define += " + TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES)\n" + main_file.write(workspace_define) + emit_data_linkage(main_file, data_linkage) + main_file.write("static uint8_t g_aot_memory[WORKSPACE_SIZE];\n") + main_file.write("tvm_workspace_t app_workspace;\n") + main_file.write( + """ + tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLDevice dev, void** out_ptr) { return StackMemoryManager_Allocate(&app_workspace, num_bytes, out_ptr); } @@ -287,7 +295,26 @@ def emit_main_prologue( tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLDevice dev) { return StackMemoryManager_Free(&app_workspace,ptr); } + """ + ) + else: + # An implementation is not needed for these if the stack allocator is not used + main_file.write( + """ + +tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLDevice dev, void** out_ptr) { + return kTvmErrorFunctionCallNotImplemented; +} +tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLDevice dev) { + return kTvmErrorFunctionCallNotImplemented; +} + + """ + ) + main_file.write( + """ + void TVMPlatformAbort(tvm_crt_error_t code) { exit(-1); } void TVMLogf(const char* msg, ...) 
{ @@ -296,10 +323,10 @@ def emit_main_prologue( vfprintf(stdout, msg, args); va_end(args); } - + TVM_DLL int TVMFuncRegisterGlobal(const char* name, TVMFunctionHandle f, int override) {} int main(){\n -""" + """ ) main_file.write(custom_prologue) @@ -511,6 +538,7 @@ def create_main( data_linkage, interface_api, workspace_bytes, + use_stack_allocator=True, ): file_path = pathlib.Path(f"{output_path}/" + test_name).resolve() # create header file @@ -533,8 +561,10 @@ def create_main( data_linkage, compiled_models, interface_api, + use_stack_allocator, ) - emit_main_init_memory_manager(main_file) + if use_stack_allocator: + emit_main_init_memory_manager(main_file) if interface_api == "c": for compiled_model in compiled_models: @@ -709,11 +739,14 @@ def run_and_check( t = tarfile.open(tar_file) t.extractall(base_path) - workspace_bytes = model.extra_memory_in_bytes - use_usmp = runner.pass_config.get("tir.usmp.enable", False) - if interface_api == "packed" and not use_usmp: + # Interface C APIs does not need compiler generated + # workspace to generate the test application, because + # workspace size is codegen'd as a macro to + # tvmgen_.h. 
+ if interface_api != "c": workspace_bytes += mlf_extract_workspace_size_bytes(tar_file) + workspace_bytes += model.extra_memory_in_bytes for key in model.inputs: sanitized_tensor_name = re.sub(r"\W", "_", key) create_header_file( @@ -738,6 +771,10 @@ def run_and_check( data_linkage, ) + use_usmp = runner.pass_config.get("tir.usmp.enable", False) + # We only need the stack allocator if USMP is not used + use_stack_allocator = not use_usmp + create_main( "test.c", models, @@ -748,6 +785,7 @@ def run_and_check( data_linkage, interface_api, workspace_bytes, + use_stack_allocator, ) # Verify that compiles fine @@ -868,3 +906,22 @@ def generate_ref_data(mod, input_data, params=None, target="llvm"): output_tensor_names = main.attrs["output_tensor_names"] return dict(zip(output_tensor_names, out)) + + +def create_relay_module_and_inputs_from_tflite_file(tflite_model_file): + """A helper function to create a Relay IRModule with inputs + and params from a tflite file""" + with open(tflite_model_file, "rb") as f: + tflite_model_buf = f.read() + mod, params = convert_to_relay(tflite_model_buf) + + inputs = dict() + for param in mod["main"].params: + name = str(param.name_hint) + data_shape = [int(i) for i in param.type_annotation.shape] + dtype = str(param.type_annotation.dtype) + in_min, in_max = (np.iinfo(dtype).min, np.iinfo(dtype).max) + data = np.random.randint(in_min, high=in_max, size=data_shape, dtype=dtype) + inputs[name] = data + + return mod, inputs, params diff --git a/tests/python/relay/aot/test_c_device_api.py b/tests/python/relay/aot/test_c_device_api.py index d369fd0a4a30..8252ee68ade8 100644 --- a/tests/python/relay/aot/test_c_device_api.py +++ b/tests/python/relay/aot/test_c_device_api.py @@ -93,7 +93,7 @@ def compile_to_main_func(interface_api="c", use_unpacked_api=True): pass_config=test_runner.pass_config, ) main_ir_module = compiled_models[0].executor_factory.lowered_ir_mods.items()[0][1] - main_func = main_ir_module["run_model"] + main_func = 
main_ir_module["__tvm_main__"] return main_func return compile_to_main_func @@ -124,7 +124,7 @@ def compile_to_main_func(interface_api="c", use_unpacked_api=True): pass_config=test_runner.pass_config, ) main_ir_module = list(compiled_models[0].executor_factory.lowered_ir_mods.values())[0] - main_func = main_ir_module["run_model"] + main_func = main_ir_module["__tvm_main__"] return main_func return compile_to_main_func diff --git a/tests/python/relay/aot/test_crt_aot.py b/tests/python/relay/aot/test_crt_aot.py index f4f0806dca52..0147b8cf755a 100644 --- a/tests/python/relay/aot/test_crt_aot.py +++ b/tests/python/relay/aot/test_crt_aot.py @@ -28,6 +28,7 @@ from tvm.relay.testing import byoc from tvm.relay.op.annotation import compiler_begin, compiler_end from tvm.relay.backend import Executor, Runtime +from tvm.micro import model_library_format as mlf from aot_test_utils import ( AOTTestModel, AOT_DEFAULT_RUNNER, @@ -36,6 +37,7 @@ compile_and_run, compile_models, parametrize_aot_options, + create_relay_module_and_inputs_from_tflite_file, ) @@ -541,13 +543,7 @@ def test_quant_mobilenet_tfl(): "models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz", "mobilenet_v1_1.0_224_quant.tflite", ) - with open(tflite_model_file, "rb") as f: - tflite_model_buf = f.read() - data_shape = (1, 224, 224, 3) - in_min, in_max = (0, 255) - data = np.random.randint(in_min, high=in_max, size=data_shape, dtype="uint8") - mod, params = convert_to_relay(tflite_model_buf) - inputs = {"input": data} + mod, inputs, params = create_relay_module_and_inputs_from_tflite_file(tflite_model_file) output_list = generate_ref_data(mod, inputs, params) compile_and_run( AOTTestModel(module=mod, inputs=inputs, outputs=output_list, params=params), @@ -843,5 +839,75 @@ def representative_dataset(): assert output_name in source +@pytest.mark.parametrize( + "workspace_byte_alignment,main_workspace_size", + [ + (8, 14880), + (16, 14880), + (256, 15616), + ], +) +def 
test_workspace_calculation(workspace_byte_alignment, main_workspace_size): + mod, params = tvm.relay.testing.synthetic.get_workload() + target = "c" + runtime = Runtime("crt") + executor = Executor( + "aot", + { + "workspace-byte-alignment": workspace_byte_alignment, + }, + ) + with tvm.transform.PassContext( + opt_level=3, + config={ + "tir.disable_vectorize": True, + }, + ): + lib = tvm.relay.build(mod, target, executor=executor, runtime=runtime, params=params) + + mlf_memory_map = mlf._build_function_memory_map(lib.function_metadata) + assert mlf_memory_map["main"][0]["workspace_size_bytes"] == main_workspace_size + + +@tvm.testing.requires_package("tflite") +@tvm.testing.requires_cmsisnn +def test_workspace_calculation_cmsis_nn(): + """This tests cmsis_nn codegen for workspace calculation. + This is tested specially because cmsis-nn codegen creates + multiple PrimFuncs per offloaded relay function in a non + -hierarchical manner.""" + pytest.importorskip("tflite") + + from tvm.relay.op.contrib import cmsisnn + from tvm.contrib.download import download_testdata + + target = "c" + runtime = Runtime("crt") + executor = Executor( + "aot", + { + "workspace-byte-alignment": 16, + "interface-api": "c", + "unpacked-api": True, + }, + ) + + base_url = "https://github.com/ARM-software/ML-zoo/raw/48a22ee22325d15d2371a6df24eb7d67e21dcc97/models/keyword_spotting/cnn_small/tflite_int8" + file_to_download = "cnn_s_quantized.tflite" + file_saved = "cnn_s_quantized_15Dec2021.tflite" + model_file = download_testdata("{}/{}".format(base_url, file_to_download), file_saved) + mod, _, params = create_relay_module_and_inputs_from_tflite_file(model_file) + mod = cmsisnn.partition_for_cmsisnn(mod, params) + with tvm.transform.PassContext( + opt_level=3, + config={ + "tir.disable_vectorize": True, + }, + ): + lib = tvm.relay.build(mod, target, executor=executor, runtime=runtime, params=params) + mlf_memory_map = mlf._build_function_memory_map(lib.function_metadata) + assert 
mlf_memory_map["main"][0]["workspace_size_bytes"] == 9904 + + if __name__ == "__main__": sys.exit(pytest.main([__file__] + sys.argv[1:])) diff --git a/tests/python/relay/aot/test_crt_aot_usmp.py b/tests/python/relay/aot/test_crt_aot_usmp.py index a27609cc07ad..73b34700ee27 100644 --- a/tests/python/relay/aot/test_crt_aot_usmp.py +++ b/tests/python/relay/aot/test_crt_aot_usmp.py @@ -39,6 +39,7 @@ compile_models, parametrize_aot_options, run_and_check, + create_relay_module_and_inputs_from_tflite_file, ) @@ -202,23 +203,6 @@ def test_byoc_microtvm(merge_compiler_regions): ) -def _get_relay_module_and_inputs_from_tflite_file(tflite_model_file): - with open(tflite_model_file, "rb") as f: - tflite_model_buf = f.read() - mod, params = convert_to_relay(tflite_model_buf) - - inputs = dict() - for param in mod["main"].params: - name = str(param.name_hint) - data_shape = [int(i) for i in param.type_annotation.shape] - dtype = str(param.type_annotation.dtype) - in_min, in_max = (np.iinfo(dtype).min, np.iinfo(dtype).max) - data = np.random.randint(in_min, high=in_max, size=data_shape, dtype=dtype) - inputs[name] = data - - return mod, inputs, params - - MOBILENET_V1_URL = ( "https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz", "mobilenet_v1_1.0_224_quant.tflite", @@ -253,7 +237,7 @@ def test_tflite_model_u1_usecase(model_url, usmp_algo, workspace_size): model_url[0], model_url[1], ) - mod, inputs, params = _get_relay_module_and_inputs_from_tflite_file(tflite_model_file) + mod, inputs, params = create_relay_module_and_inputs_from_tflite_file(tflite_model_file) output_list = generate_ref_data(mod, inputs, params) compiled_test_mods = compile_models( @@ -324,7 +308,7 @@ def test_tflite_model_u3_usecase_single_external_pool(model_url, usmp_algo): model_url[0], model_url[1], ) - mod, inputs, params = _get_relay_module_and_inputs_from_tflite_file(tflite_model_file) + mod, inputs, params = 
create_relay_module_and_inputs_from_tflite_file(tflite_model_file) output_list = generate_ref_data(mod, inputs, params) compiled_test_mods = compile_models( @@ -384,7 +368,7 @@ def test_tflite_model_u3_usecase_two_external_pools(model_url, usmp_algo): model_url[0], model_url[1], ) - mod, inputs, params = _get_relay_module_and_inputs_from_tflite_file(tflite_model_file) + mod, inputs, params = create_relay_module_and_inputs_from_tflite_file(tflite_model_file) output_list = generate_ref_data(mod, inputs, params) compiled_test_mods = compile_models( @@ -438,14 +422,14 @@ def test_tflite_model_u2_usecase_two_models_with_a_single_external_pool(model_ur model_urls[0][0], model_urls[0][1], ) - mod1, inputs1, params1 = _get_relay_module_and_inputs_from_tflite_file(tflite_model_file1) + mod1, inputs1, params1 = create_relay_module_and_inputs_from_tflite_file(tflite_model_file1) output_list1 = generate_ref_data(mod1, inputs1, params1) tflite_model_file2 = tf_testing.get_workload_official( model_urls[1][0], model_urls[1][1], ) - mod2, inputs2, params2 = _get_relay_module_and_inputs_from_tflite_file(tflite_model_file2) + mod2, inputs2, params2 = create_relay_module_and_inputs_from_tflite_file(tflite_model_file2) output_list2 = generate_ref_data(mod2, inputs2, params2) compiled_test_mods = compile_models( diff --git a/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py b/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py index ab40c646391c..07e31a989874 100644 --- a/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py +++ b/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py @@ -122,9 +122,9 @@ def tvmgen_default_fused_nn_max_pool2d_cast(placeholder_28: T.handle, T_cast_6: T.store(T_cast_7.data, (((ax0_ax1_fused_5*3584) + (ax2_5*64)) + ax3_3), T.cast(T.load("uint8", tensor_2, (((ax0_ax1_fused_5*3584) + (ax2_5*64)) + ax3_3)), "int16"), True) 
@T.prim_func - def run_model(input: T.handle, output: T.handle) -> None: + def __tvm_main__(input: T.handle, output: T.handle) -> None: # function attr dict - T.func_attr({"global_symbol": "run_model", "runner_function": True}) + T.func_attr({"global_symbol": "__tvm_main__", "runner_function": True}) # body T.attr("default", "device_id", 0) T.attr("default", "device_type", 1) @@ -140,7 +140,7 @@ def run_model(input: T.handle, output: T.handle) -> None: @tvm.script.ir_module class LinearStructurePlanned: @T.prim_func - def run_model(input: T.handle, fast_memory_0_var: T.handle, slow_memory_1_var: T.handle, output: T.handle) -> None: + def __tvm_main__(input: T.handle, fast_memory_0_var: T.handle, slow_memory_1_var: T.handle, output: T.handle) -> None: fast_memory_0_buffer_var = T.match_buffer(fast_memory_0_var, [200704], dtype="uint8", strides=[1], elem_offset=1, align=16) slow_memory_1_buffer_var = T.match_buffer(slow_memory_1_var, [1418528], dtype="uint8", strides=[1], elem_offset=1, align=16) # body @@ -217,7 +217,7 @@ def test_mobilenet_subgraph(): tir_mod = assign_poolinfos_to_allocates_in_irmodule( tir_mod, [fast_memory_pool, slow_memory_pool] ) - main_func = tir_mod["run_model"] + main_func = tir_mod["__tvm_main__"] buffer_analysis = tvm.tir.usmp.analysis.extract_buffer_info(main_func, tir_mod) buffer_info_map = buffer_analysis.buffer_info_stmts @@ -328,9 +328,9 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_s T.store(T_cast_7.data, ax0_ax1_fused_ax2_fused_3 * 256 + ax3_outer_2 * 64 + ax3_inner_4, T.cast(T.max(T.min(T.q_multiply_shift(T.cast(T.cast(T.max(T.min(T.q_multiply_shift(T.load("int32", Conv2dOutput_3, ax3_inner_4) + T.load("int32", placeholder_26.data, ax3_outer_2 * 64 + ax3_inner_4), 1343014664, 31, -8, dtype="int32") + 136, 255), 0), "uint8"), "int32") - 136, 1073903788, 31, 1, dtype="int32") + T.load("int32", placeholder_28.data, ax0_ax1_fused_ax2_fused_3 * 256 + ax3_outer_2 * 64 + ax3_inner_4), 255), 0), 
"uint8"), True) @T.prim_func - def run_model(input: T.handle, output: T.handle) -> None: + def __tvm_main__(input: T.handle, output: T.handle) -> None: # function attr dict - T.func_attr({"global_symbol": "run_model", "runner_function": True}) + T.func_attr({"global_symbol": "__tvm_main__", "runner_function": True}) # body T.attr("default", "device_id", 0) T.attr("default", "device_type", 1) @@ -464,7 +464,7 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1(pla T.store(T_cast_5.data, ax0_ax1_fused_ax2_fused_1 * 64 + ax3_inner_2, T.cast(T.cast(T.max(T.min(T.q_multiply_shift(T.load("int32", Conv2dOutput_1_let, ax3_inner_2) + T.load("int32", placeholder_15.data, ax3_inner_2), 1608879842, 31, -7, dtype="int32"), 255), 0), "uint8"), "int16"), True) @T.prim_func - def run_model(input: T.handle, global_workspace_0_var: T.handle, output: T.handle) -> None: + def __tvm_main__(input: T.handle, global_workspace_0_var: T.handle, output: T.handle) -> None: global_workspace_0_buffer_var = T.match_buffer(global_workspace_0_var, [7920256], dtype="uint8", strides=[1], elem_offset=1, align=16) # body T.attr("default", "device_id", 0) @@ -491,7 +491,7 @@ def test_resnet_subgraph(): tir_mod = ResnetStructure tir_mod = _assign_targets_to_primfuncs_irmodule(tir_mod, target) tir_mod = assign_poolinfos_to_allocates_in_irmodule(tir_mod, [global_workspace_pool]) - main_func = tir_mod["run_model"] + main_func = tir_mod["__tvm_main__"] buffer_analysis = tvm.tir.usmp.analysis.extract_buffer_info(main_func, tir_mod) buffer_info_map = buffer_analysis.buffer_info_stmts diff --git a/tests/scripts/task_demo_microtvm.sh b/tests/scripts/task_demo_microtvm.sh index 9ed9c671acc0..b5c18ec9e757 100755 --- a/tests/scripts/task_demo_microtvm.sh +++ b/tests/scripts/task_demo_microtvm.sh @@ -19,8 +19,7 @@ set -euxo pipefail pushd apps/microtvm/zephyr_cmsisnn -# Demo tests are disabled here due to https://github.com/apache/tvm/issues/10312 -# timeout 5m ./run_demo.sh + timeout 
5m ./run_demo.sh popd pushd apps/microtvm/ethosu @@ -28,6 +27,6 @@ FVP_PATH="/opt/arm/FVP_Corstone_SSE-300_Ethos-U55" CMAKE_PATH="/opt/arm/cmake/bin/cmake" FREERTOS_PATH="/opt/freertos/FreeRTOSv202112.00" -# timeout 5m ./run_demo.sh --fvp_path $FVP_PATH --cmake_path $CMAKE_PATH -# timeout 5m ./run_demo.sh --fvp_path $FVP_PATH --cmake_path $CMAKE_PATH --freertos_path $FREERTOS_PATH + timeout 5m ./run_demo.sh --fvp_path $FVP_PATH --cmake_path $CMAKE_PATH + timeout 5m ./run_demo.sh --fvp_path $FVP_PATH --cmake_path $CMAKE_PATH --freertos_path $FREERTOS_PATH popd