[MetaSchedule] Introduce Async Pipeline in MultiLevelTiling

This PR introduces an async pipeline into TVM's current MultiLevelTiling rules. This PR is blocked on apache#13966, since some conv2d workloads use `tir.if_then_else` to pad the input to the correct size, and this PR uses async copy in such copy statements.

1. Add a subrule in `src/meta_schedule/schedule_rule/multi_level_tiling.h/.cc` that annotates async copy for MLT.

On CUDA cores, this PR gives a perf boost of around 1 TFLOP/s in most Conv2d test cases and 1 ~ 2 TFLOP/s in most GEMM test cases.

All generated code, scripts, and traces are available at https://github.com/Rainy-Memory/tvm-async-rule-benchmark.

Currently tested on commit `afbfb7aa7e43732cb716f8e443df696110be6afc` with the conv2d NHWC workload, on an RTX 3080 GPU.

Workload: Conv2d NHWC

|Shape|Mainline TVM|Mainline TVM with Async|
|-|-|-|
|N=1_H=224_W=224_C=3_K=64_R=7_S=7_STR=2_PAD=3_DIL=1|13838.05219|14687.89452|
|N=1_H=56_W=56_C=64_K=64_R=1_S=1_STR=1_PAD=0_DIL=1|5398.305085|5613.892553|
|N=1_H=56_W=56_C=64_K=64_R=3_S=3_STR=1_PAD=1_DIL=1|11652.96825|13157.88249|
|N=1_H=56_W=56_C=64_K=256_R=1_S=1_STR=1_PAD=0_DIL=1|10638.8309|11674.68499|
|N=1_H=56_W=56_C=256_K=64_R=1_S=1_STR=1_PAD=0_DIL=1|8692.32829|9469.264089|
|N=1_H=56_W=56_C=256_K=128_R=1_S=1_STR=2_PAD=0_DIL=1|4685.767442|5698.19634|
|N=1_H=28_W=28_C=128_K=128_R=3_S=3_STR=1_PAD=1_DIL=1|9872.787087|10404.60405|
|N=1_H=28_W=28_C=128_K=512_R=1_S=1_STR=1_PAD=0_DIL=1|9974.281496|10073.31657|
|N=1_H=28_W=28_C=512_K=128_R=1_S=1_STR=1_PAD=0_DIL=1|7075.866932|8564.572712|
|N=1_H=28_W=28_C=512_K=256_R=1_S=1_STR=2_PAD=0_DIL=1|3648.330914|4021.923142|
|N=1_H=14_W=14_C=256_K=256_R=3_S=3_STR=1_PAD=1_DIL=1|8192.954618|9160.182054|
|N=1_H=14_W=14_C=256_K=1024_R=1_S=1_STR=1_PAD=0_DIL=1|8008.870153|9362.825279|
|N=1_H=14_W=14_C=1024_K=256_R=1_S=1_STR=1_PAD=0_DIL=1|5210.062241|6051.208379|
|N=1_H=14_W=14_C=1024_K=512_R=1_S=1_STR=2_PAD=0_DIL=1|2550.787202|3587.902938|
|N=1_H=7_W=7_C=512_K=512_R=3_S=3_STR=1_PAD=1_DIL=1|4350.626084|5432.788068|
|N=1_H=7_W=7_C=512_K=2048_R=1_S=1_STR=1_PAD=0_DIL=1|6672.068026|7663.725217|
|N=1_H=7_W=7_C=2048_K=512_R=1_S=1_STR=1_PAD=0_DIL=1|3142.564263|4297.988014|

Workload: GEMM NN

|Shape|Mainline TVM|Mainline TVM with Async|
|-|-|-|
|M=512_N=256_K=640|8678.46|10607.37|
|M=512_N=384_K=256|8109.13|10290.72|
|M=512_N=512_K=512|11419.83|14000.86|
|M=512_N=3072_K=768|19709.39|18351.61|
|M=512_N=768_K=3072|12844.59|13730.88|
|M=896_N=896_K=896|16149.91|16131.39|
|M=1024_N=1024_K=1024|18842.11|19662.8|
|M=1152_N=1152_K=1152|15386.79|16736.1|
|M=1536_N=1536_K=1536|18522.67|18872.06|
|M=2048_N=2048_K=2048|19515.42|18874.85|
|M=3072_N=3072_K=3072|19233.9|19291.42|
|M=4096_N=4096_K=4096|17122.17|19259.01|
cblmemo · Feb 17, 2023 · 9cc4620 · 9cc4620
1 parent 49b6c3a
commit 9cc4620
Show file tree

Hide file tree

Showing 116 changed files with 2,754 additions and 1,940 deletions.
diff --git a/apps/microtvm/arduino/template_project/microtvm_api_server.py b/apps/microtvm/arduino/template_project/microtvm_api_server.py
@@ -197,8 +197,8 @@ def _disassemble_mlf(self, mlf_tar_path, source_dir):
                 metadata = json.load(f)
         return metadata
 
-    def _template_model_header(self, source_dir, metadata):
-        with open(source_dir / "model.h", "r") as f:
+    def _template_model(self, source_dir, metadata):
+        with open(source_dir / "platform.c", "r") as f:
             model_h_template = Template(f.read())
 
         all_module_names = []
@@ -218,7 +218,7 @@ def _template_model_header(self, source_dir, metadata):
             "workspace_size_bytes": workspace_size_bytes,
         }
 
-        with open(source_dir / "model.h", "w") as f:
+        with open(source_dir / "platform.c", "w") as f:
             f.write(model_h_template.substitute(template_values))
 
     # Arduino ONLY recognizes .ino, .ccp, .c, .h
@@ -415,9 +415,9 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec
         metadata = self._disassemble_mlf(model_library_format_path, source_dir)
         shutil.copy2(model_library_format_path, project_dir / MODEL_LIBRARY_FORMAT_RELPATH)
 
-        # For AOT, template model.h with metadata to minimize space usage
+        # For AOT, template platform.c with metadata to minimize space usage
         if project_type == "example_project":
-            self._template_model_header(source_dir, metadata)
+            self._template_model(source_dir, metadata)
 
         self._change_cpp_file_extensions(source_dir)
 

diff --git a/...plate_project/src/example_project/model.c → ...te_project/src/example_project/platform.c b/...plate_project/src/example_project/model.c → ...te_project/src/example_project/platform.c
@@ -17,17 +17,22 @@
  * under the License.
  */
 
-#include "model.h"
+/*!
+ * \brief Implementation of TVMPlatform functions in tvm/runtime/crt/platform.h
+ */
 
 #include "Arduino.h"
 #include "standalone_crt/include/dlpack/dlpack.h"
 #include "standalone_crt/include/tvm/runtime/crt/stack_allocator.h"
 
+#define TVM_WORKSPACE_SIZE_BYTES $workspace_size_bytes
+
 // AOT memory array, stack allocator wants it aligned
-static uint8_t g_aot_memory[WORKSPACE_SIZE]
+static uint8_t g_aot_memory[TVM_WORKSPACE_SIZE_BYTES]
     __attribute__((aligned(TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES)));
 tvm_workspace_t app_workspace;
 
+// Called when an internal error occurs and execution cannot continue.
 // Blink code for debugging purposes
 void TVMPlatformAbort(tvm_crt_error_t error) {
   TVMLogf("TVMPlatformAbort: 0x%08x\n", error);
@@ -45,19 +50,23 @@ void TVMPlatformAbort(tvm_crt_error_t error) {
   }
 }
 
-void TVMLogf(const char* msg, ...) {}
-
+// Allocate memory for use by TVM.
 tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLDevice dev, void** out_ptr) {
   return StackMemoryManager_Allocate(&app_workspace, num_bytes, out_ptr);
 }
 
+// Free memory used by TVM.
 tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLDevice dev) {
   return StackMemoryManager_Free(&app_workspace, ptr);
 }
 
+// Internal logging API call implementation.
+void TVMLogf(const char* msg, ...) {}
+
 unsigned long g_utvm_start_time_micros;
 int g_utvm_timer_running = 0;
 
+// Start a device timer.
 tvm_crt_error_t TVMPlatformTimerStart() {
   if (g_utvm_timer_running) {
     return kTvmErrorPlatformTimerBadState;
@@ -67,6 +76,7 @@ tvm_crt_error_t TVMPlatformTimerStart() {
   return kTvmErrorNoError;
 }
 
+// Stop the running device timer and get the elapsed time (in microseconds).
 tvm_crt_error_t TVMPlatformTimerStop(double* elapsed_time_seconds) {
   if (!g_utvm_timer_running) {
     return kTvmErrorPlatformTimerBadState;
@@ -77,14 +87,19 @@ tvm_crt_error_t TVMPlatformTimerStop(double* elapsed_time_seconds) {
   return kTvmErrorNoError;
 }
 
+// Fill a buffer with random data.
 tvm_crt_error_t TVMPlatformGenerateRandom(uint8_t* buffer, size_t num_bytes) {
   for (size_t i = 0; i < num_bytes; i++) {
     buffer[i] = rand();
   }
   return kTvmErrorNoError;
 }
 
-void TVMInitialize() { StackMemoryManager_Init(&app_workspace, g_aot_memory, WORKSPACE_SIZE); }
+// Initialize TVM inference.
+tvm_crt_error_t TVMPlatformInitialize() {
+  StackMemoryManager_Init(&app_workspace, g_aot_memory, sizeof(g_aot_memory));
+  return kTvmErrorNoError;
+}
 
 void TVMExecute(void* input_data, void* output_data) {
   int ret_val = tvmgen_default___tvm_main__(input_data, output_data);

diff --git a/...plate_project/src/example_project/model.h → ...te_project/src/example_project/platform.h b/...plate_project/src/example_project/model.h → ...te_project/src/example_project/platform.h
@@ -17,14 +17,10 @@
  * under the License.
  */
 
-#define WORKSPACE_SIZE $workspace_size_bytes
-
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-void TVMInitialize();
-
 /* TODO template this function signature with the input and output
  * data types and sizes. For example:
  *

diff --git a/apps/microtvm/arduino/template_project/src/example_project/project.ino b/apps/microtvm/arduino/template_project/src/example_project/project.ino
@@ -17,10 +17,10 @@
  * under the License.
  */
 
-#include "src/model.h"
+#include "src/standalone_crt/include/tvm/runtime/crt/platform.h"
 
 void setup() {
-  TVMInitialize();
+  TVMPlatformInitialize();
   // If desired, initialize the RNG with random noise
   // randomSeed(analogRead(0));
 }

diff --git a/...e_project/src/host_driven/model_support.c → ...mplate_project/src/host_driven/platform.c b/...e_project/src/host_driven/model_support.c → ...mplate_project/src/host_driven/platform.c
@@ -17,22 +17,28 @@
  * under the License.
  */
 
+/*!
+ * \brief Implementation of TVMPlatform functions in tvm/runtime/crt/platform.h
+ */
+
 #include "standalone_crt/include/dlpack/dlpack.h"
 #include "standalone_crt/include/tvm/runtime/crt/error_codes.h"
 #include "stdarg.h"
 
-// Blink code for debugging purposes
+// Called when an internal error occurs and execution cannot continue.
 void TVMPlatformAbort(tvm_crt_error_t error) {
   TVMLogf("TVMPlatformAbort: 0x%08x\n", error);
   for (;;)
     ;
 }
 
+// Called by the microTVM RPC server to implement TVMLogf.
 size_t TVMPlatformFormatMessage(char* out_buf, size_t out_buf_size_bytes, const char* fmt,
                                 va_list args) {
   return vsnprintf(out_buf, out_buf_size_bytes, fmt, args);
 }
 
+// Allocate memory for use by TVM.
 tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLDevice dev, void** out_ptr) {
   if (num_bytes == 0) {
     num_bytes = sizeof(int);
@@ -41,6 +47,7 @@ tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLDevice dev, void**
   return (*out_ptr == NULL) ? kTvmErrorPlatformNoMemory : kTvmErrorNoError;
 }
 
+// Free memory used by TVM.
 tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLDevice dev) {
   free(ptr);
   return kTvmErrorNoError;
@@ -49,6 +56,7 @@ tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLDevice dev) {
 unsigned long g_utvm_start_time_micros;
 int g_utvm_timer_running = 0;
 
+// Start a device timer.
 tvm_crt_error_t TVMPlatformTimerStart() {
   if (g_utvm_timer_running) {
     return kTvmErrorPlatformTimerBadState;
@@ -58,6 +66,7 @@ tvm_crt_error_t TVMPlatformTimerStart() {
   return kTvmErrorNoError;
 }
 
+// Stop the running device timer and get the elapsed time (in microseconds).
 tvm_crt_error_t TVMPlatformTimerStop(double* elapsed_time_seconds) {
   if (!g_utvm_timer_running) {
     return kTvmErrorPlatformTimerBadState;
@@ -68,6 +77,7 @@ tvm_crt_error_t TVMPlatformTimerStop(double* elapsed_time_seconds) {
   return kTvmErrorNoError;
 }
 
+// Fill a buffer with random data.
 tvm_crt_error_t TVMPlatformGenerateRandom(uint8_t* buffer, size_t num_bytes) {
   for (size_t i = 0; i < num_bytes; i++) {
     buffer[i] = rand();

diff --git a/apps/microtvm/zephyr/template_project/CMakeLists.txt.template b/apps/microtvm/zephyr/template_project/CMakeLists.txt.template
@@ -83,4 +83,4 @@ endif()
 
 file(GLOB_RECURSE app_srcs src/**.c src/**.cc)
 target_sources(app PRIVATE ${app_srcs} ${cmsis_lib_srcs})
-target_include_directories(app PRIVATE crt_config ${CMAKE_SOURCE_DIR}/include crt/include ${cmsis_includes})
+target_include_directories(app PRIVATE crt_config include ${CMAKE_SOURCE_DIR}/include crt/include ${cmsis_includes})
diff --git a/apps/microtvm/zephyr/template_project/microtvm_api_server.py b/apps/microtvm/zephyr/template_project/microtvm_api_server.py
@@ -210,14 +210,14 @@ def _get_board_mem_size_bytes(zephyr_base: str, board: str):
     return None
 
 
-DEFAULT_HEAP_SIZE_BYTES = 216 * 1024
+DEFAULT_WORKSPACE_SIZE_BYTES = 216 * 1024
 
 
 def _get_recommended_heap_size_bytes(board: str):
     prop = BOARD_PROPERTIES[board]
     if "recommended_heap_size_bytes" in prop:
         return prop["recommended_heap_size_bytes"]
-    return DEFAULT_HEAP_SIZE_BYTES
+    return DEFAULT_WORKSPACE_SIZE_BYTES
 
 
 def generic_find_serial_port(serial_number: str = None):
@@ -358,11 +358,11 @@ def _get_nrf_device_args(serial_number: str = None) -> list:
         help="Run on the FVP emulator instead of hardware.",
     ),
     server.ProjectOption(
-        "heap_size_bytes",
+        "workspace_size_bytes",
         optional=["generate_project"],
         type="int",
         default=None,
-        help="Sets the value for HEAP_SIZE_BYTES passed to K_HEAP_DEFINE() to service TVM memory allocation requests.",
+        help="Sets the value for TVM_WORKSPACE_SIZE_BYTES passed to K_HEAP_DEFINE() to service TVM memory allocation requests.",
     ),
 ]
 
@@ -403,7 +403,13 @@ def server_info_query(self, tvm_version):
     }
 
     def _create_prj_conf(
-        self, project_dir: pathlib.Path, board: str, project_type: str, config_main_stack_size
+        self,
+        project_dir: pathlib.Path,
+        board: str,
+        project_type: str,
+        config_main_stack_size: int,
+        config_led: bool,
+        use_fvp: bool,
     ):
         with open(project_dir / "prj.conf", "w") as f:
             f.write(
@@ -413,6 +419,13 @@ def _create_prj_conf(
                 "CONFIG_UART_INTERRUPT_DRIVEN=y\n"
                 "\n"
             )
+            if (
+                config_led
+                and not self._is_qemu(board, use_fvp)
+                and not self._is_fvp(board, use_fvp)
+            ):
+                f.write("# For debugging.\n" "CONFIG_LED=y\n" "\n")
+
             f.write("# For TVMPlatformAbort().\n" "CONFIG_REBOOT=y\n" "\n")
 
             if project_type == "host_driven":
@@ -522,6 +535,18 @@ def _generate_cmake_args(
 
         return cmake_args
 
+    def _copy_src_and_header_files(self, src_dir: pathlib.Path, dst_dir: pathlib.Path):
+        """Copy content of src_dir from template project to dst_dir in separate
+        source and header sub-directories.
+        """
+        for file in os.listdir(src_dir):
+            file = src_dir / file
+            if file.is_file():
+                if file.suffix in [".cc", ".c"]:
+                    shutil.copy2(file, dst_dir / "src")
+                elif file.suffix in [".h"]:
+                    shutil.copy2(file, dst_dir / "include" / "tvm")
+
     def generate_project(self, model_library_format_path, standalone_crt_dir, project_dir, options):
         zephyr_board = options["board"]
         project_type = options["project_type"]
@@ -533,7 +558,7 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec
         verbose = options.get("verbose")
 
         recommended_heap_size = _get_recommended_heap_size_bytes(zephyr_board)
-        heap_size_bytes = options.get("heap_size_bytes") or recommended_heap_size
+        workspace_size_bytes = options.get("workspace_size_bytes") or recommended_heap_size
         board_mem_size = _get_board_mem_size_bytes(zephyr_base, zephyr_board)
 
         compile_definitions = options.get("compile_definitions")
@@ -602,7 +627,7 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec
             else:
                 shutil.copy2(src_path, dst_path)
 
-        # Populate Makefile.
+        # Populate CMakeLists.
         with open(project_dir / CMAKELIST_FILENAME, "w") as cmake_f:
             with open(API_SERVER_DIR / f"{CMAKELIST_FILENAME}.template", "r") as cmake_template_f:
                 for line in cmake_template_f:
@@ -629,10 +654,10 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec
 
                 if board_mem_size is not None:
                     assert (
-                        heap_size_bytes < board_mem_size
-                    ), f"Heap size {heap_size_bytes} is larger than memory size {board_mem_size} on this board."
+                        workspace_size_bytes < board_mem_size
+                    ), f"Workspace size {workspace_size_bytes} is larger than memory size {board_mem_size} on this board."
                 cmake_f.write(
-                    f"target_compile_definitions(app PUBLIC -DHEAP_SIZE_BYTES={heap_size_bytes})\n"
+                    f"target_compile_definitions(app PUBLIC -DTVM_WORKSPACE_SIZE_BYTES={workspace_size_bytes})\n"
                 )
 
                 if compile_definitions:
@@ -649,7 +674,9 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec
                 if self._is_fvp(zephyr_board, use_fvp):
                     cmake_f.write(f"target_compile_definitions(app PUBLIC -DFVP=1)\n")
 
-        self._create_prj_conf(project_dir, zephyr_board, project_type, config_main_stack_size)
+        self._create_prj_conf(
+            project_dir, zephyr_board, project_type, config_main_stack_size, verbose, use_fvp
+        )
 
         # Populate crt-config.h
         crt_config_dir = project_dir / "crt_config"
@@ -658,13 +685,19 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec
             API_SERVER_DIR / "crt_config" / "crt_config.h", crt_config_dir / "crt_config.h"
         )
 
-        # Populate src/
+        # Populate `src` and `include`
         src_dir = project_dir / "src"
-        if project_type != "host_driven" or self._is_fvp(zephyr_board, use_fvp):
-            shutil.copytree(API_SERVER_DIR / "src" / project_type, src_dir)
-        else:
-            src_dir.mkdir()
-            shutil.copy2(API_SERVER_DIR / "src" / project_type / "main.c", src_dir)
+        src_dir.mkdir()
+        include_dir = project_dir / "include" / "tvm"
+        include_dir.mkdir(parents=True)
+        src_project_type_dir = API_SERVER_DIR / "src" / project_type
+        self._copy_src_and_header_files(src_project_type_dir, project_dir)
+
+        if self._is_fvp(zephyr_board, use_fvp):
+            self._copy_src_and_header_files(src_project_type_dir / "fvp", project_dir)
+
+        if project_type == "mlperftiny":
+            shutil.copytree(src_project_type_dir / "api", src_dir / "api")
 
         # Populate extra_files
         if extra_files_tar: