Skip to content

Commit

Permalink
[MetaSchedule] Introduce Async Pipeline in MultiLevelTiling
Browse files Browse the repository at this point in the history
This PR introduces an async pipeline into TVM's current MultiLevelTiling rules. This PR is blocked on apache#13966, because some conv2d workloads use `tir.if_then_else` to pad the input to the correct size, and this PR uses async copy in such copy statements.

1. Add a subrule in `src/meta_schedule/schedule_rule/multi_level_tiling.h/.cc` that annotates async copy for MultiLevelTiling (mlt).

On CUDA cores, this PR gives a performance boost of around 1 TFLOP/s in most Conv2d test cases and 1–2 TFLOP/s in most GEMM test cases.

All generated codes, scripts, and traces are available at https://github.com/Rainy-Memory/tvm-async-rule-benchmark.

Currently tested on commit `afbfb7aa7e43732cb716f8e443df696110be6afc` with the conv2d NHWC workload, on an RTX 3080 GPU.

Workload: Conv2d NHWC

|Shape|Mainline TVM|Mainline TVM with Async|
|-|-|-|
|N=1_H=224_W=224_C=3_K=64_R=7_S=7_STR=2_PAD=3_DIL=1|13838.05219|14687.89452|
|N=1_H=56_W=56_C=64_K=64_R=1_S=1_STR=1_PAD=0_DIL=1|5398.305085|5613.892553|
|N=1_H=56_W=56_C=64_K=64_R=3_S=3_STR=1_PAD=1_DIL=1|11652.96825|13157.88249|
|N=1_H=56_W=56_C=64_K=256_R=1_S=1_STR=1_PAD=0_DIL=1|10638.8309|11674.68499|
|N=1_H=56_W=56_C=256_K=64_R=1_S=1_STR=1_PAD=0_DIL=1|8692.32829|9469.264089|
|N=1_H=56_W=56_C=256_K=128_R=1_S=1_STR=2_PAD=0_DIL=1|4685.767442|5698.19634|
|N=1_H=28_W=28_C=128_K=128_R=3_S=3_STR=1_PAD=1_DIL=1|9872.787087|10404.60405|
|N=1_H=28_W=28_C=128_K=512_R=1_S=1_STR=1_PAD=0_DIL=1|9974.281496|10073.31657|
|N=1_H=28_W=28_C=512_K=128_R=1_S=1_STR=1_PAD=0_DIL=1|7075.866932|8564.572712|
|N=1_H=28_W=28_C=512_K=256_R=1_S=1_STR=2_PAD=0_DIL=1|3648.330914|4021.923142|
|N=1_H=14_W=14_C=256_K=256_R=3_S=3_STR=1_PAD=1_DIL=1|8192.954618|9160.182054|
|N=1_H=14_W=14_C=256_K=1024_R=1_S=1_STR=1_PAD=0_DIL=1|8008.870153|9362.825279|
|N=1_H=14_W=14_C=1024_K=256_R=1_S=1_STR=1_PAD=0_DIL=1|5210.062241|6051.208379|
|N=1_H=14_W=14_C=1024_K=512_R=1_S=1_STR=2_PAD=0_DIL=1|2550.787202|3587.902938|
|N=1_H=7_W=7_C=512_K=512_R=3_S=3_STR=1_PAD=1_DIL=1|4350.626084|5432.788068|
|N=1_H=7_W=7_C=512_K=2048_R=1_S=1_STR=1_PAD=0_DIL=1|6672.068026|7663.725217|
|N=1_H=7_W=7_C=2048_K=512_R=1_S=1_STR=1_PAD=0_DIL=1|3142.564263|4297.988014|

Workload: GEMM NN

|Shape|Mainline TVM|Mainline TVM with Async|
|-|-|-|
|M=512_N=256_K=640|8678.46|10607.37|
|M=512_N=384_K=256|8109.13|10290.72|
|M=512_N=512_K=512|11419.83|14000.86|
|M=512_N=3072_K=768|19709.39|18351.61|
|M=512_N=768_K=3072|12844.59|13730.88|
|M=896_N=896_K=896|16149.91|16131.39|
|M=1024_N=1024_K=1024|18842.11|19662.8|
|M=1152_N=1152_K=1152|15386.79|16736.1|
|M=1536_N=1536_K=1536|18522.67|18872.06|
|M=2048_N=2048_K=2048|19515.42|18874.85|
|M=3072_N=3072_K=3072|19233.9|19291.42|
|M=4096_N=4096_K=4096|17122.17|19259.01|
  • Loading branch information
cblmemo committed Feb 17, 2023
1 parent 49b6c3a commit 9cc4620
Show file tree
Hide file tree
Showing 116 changed files with 2,754 additions and 1,940 deletions.
10 changes: 5 additions & 5 deletions apps/microtvm/arduino/template_project/microtvm_api_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,8 +197,8 @@ def _disassemble_mlf(self, mlf_tar_path, source_dir):
metadata = json.load(f)
return metadata

def _template_model_header(self, source_dir, metadata):
with open(source_dir / "model.h", "r") as f:
def _template_model(self, source_dir, metadata):
with open(source_dir / "platform.c", "r") as f:
model_h_template = Template(f.read())

all_module_names = []
Expand All @@ -218,7 +218,7 @@ def _template_model_header(self, source_dir, metadata):
"workspace_size_bytes": workspace_size_bytes,
}

with open(source_dir / "model.h", "w") as f:
with open(source_dir / "platform.c", "w") as f:
f.write(model_h_template.substitute(template_values))

# Arduino ONLY recognizes .ino, .cpp, .c, .h
Expand Down Expand Up @@ -415,9 +415,9 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec
metadata = self._disassemble_mlf(model_library_format_path, source_dir)
shutil.copy2(model_library_format_path, project_dir / MODEL_LIBRARY_FORMAT_RELPATH)

# For AOT, template model.h with metadata to minimize space usage
# For AOT, template platform.c with metadata to minimize space usage
if project_type == "example_project":
self._template_model_header(source_dir, metadata)
self._template_model(source_dir, metadata)

self._change_cpp_file_extensions(source_dir)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,22 @@
* under the License.
*/

#include "model.h"
/*!
* \brief Implementation of TVMPlatform functions in tvm/runtime/crt/platform.h
*/

#include "Arduino.h"
#include "standalone_crt/include/dlpack/dlpack.h"
#include "standalone_crt/include/tvm/runtime/crt/stack_allocator.h"

#define TVM_WORKSPACE_SIZE_BYTES $workspace_size_bytes

// AOT memory array, stack allocator wants it aligned
static uint8_t g_aot_memory[WORKSPACE_SIZE]
static uint8_t g_aot_memory[TVM_WORKSPACE_SIZE_BYTES]
__attribute__((aligned(TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES)));
tvm_workspace_t app_workspace;

// Called when an internal error occurs and execution cannot continue.
// Blink code for debugging purposes
void TVMPlatformAbort(tvm_crt_error_t error) {
TVMLogf("TVMPlatformAbort: 0x%08x\n", error);
Expand All @@ -45,19 +50,23 @@ void TVMPlatformAbort(tvm_crt_error_t error) {
}
}

void TVMLogf(const char* msg, ...) {}

// Allocate memory for use by TVM.
tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLDevice dev, void** out_ptr) {
return StackMemoryManager_Allocate(&app_workspace, num_bytes, out_ptr);
}

// Free memory used by TVM.
tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLDevice dev) {
return StackMemoryManager_Free(&app_workspace, ptr);
}

// Internal logging API call implementation.
void TVMLogf(const char* msg, ...) {}

unsigned long g_utvm_start_time_micros;
int g_utvm_timer_running = 0;

// Start a device timer.
tvm_crt_error_t TVMPlatformTimerStart() {
if (g_utvm_timer_running) {
return kTvmErrorPlatformTimerBadState;
Expand All @@ -67,6 +76,7 @@ tvm_crt_error_t TVMPlatformTimerStart() {
return kTvmErrorNoError;
}

// Stop the running device timer and get the elapsed time (in seconds).
tvm_crt_error_t TVMPlatformTimerStop(double* elapsed_time_seconds) {
if (!g_utvm_timer_running) {
return kTvmErrorPlatformTimerBadState;
Expand All @@ -77,14 +87,19 @@ tvm_crt_error_t TVMPlatformTimerStop(double* elapsed_time_seconds) {
return kTvmErrorNoError;
}

// Fill a buffer with random data.
tvm_crt_error_t TVMPlatformGenerateRandom(uint8_t* buffer, size_t num_bytes) {
for (size_t i = 0; i < num_bytes; i++) {
buffer[i] = rand();
}
return kTvmErrorNoError;
}

void TVMInitialize() { StackMemoryManager_Init(&app_workspace, g_aot_memory, WORKSPACE_SIZE); }
// Initialize TVM inference.
tvm_crt_error_t TVMPlatformInitialize() {
StackMemoryManager_Init(&app_workspace, g_aot_memory, sizeof(g_aot_memory));
return kTvmErrorNoError;
}

void TVMExecute(void* input_data, void* output_data) {
int ret_val = tvmgen_default___tvm_main__(input_data, output_data);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,10 @@
* under the License.
*/

#define WORKSPACE_SIZE $workspace_size_bytes

#ifdef __cplusplus
extern "C" {
#endif

void TVMInitialize();

/* TODO template this function signature with the input and output
* data types and sizes. For example:
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@
* under the License.
*/

#include "src/model.h"
#include "src/standalone_crt/include/tvm/runtime/crt/platform.h"

void setup() {
TVMInitialize();
TVMPlatformInitialize();
// If desired, initialize the RNG with random noise
// randomSeed(analogRead(0));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,22 +17,28 @@
* under the License.
*/

/*!
* \brief Implementation of TVMPlatform functions in tvm/runtime/crt/platform.h
*/

#include "standalone_crt/include/dlpack/dlpack.h"
#include "standalone_crt/include/tvm/runtime/crt/error_codes.h"
#include "stdarg.h"

// Blink code for debugging purposes
// Called when an internal error occurs and execution cannot continue.
void TVMPlatformAbort(tvm_crt_error_t error) {
TVMLogf("TVMPlatformAbort: 0x%08x\n", error);
for (;;)
;
}

// Called by the microTVM RPC server to implement TVMLogf.
size_t TVMPlatformFormatMessage(char* out_buf, size_t out_buf_size_bytes, const char* fmt,
va_list args) {
return vsnprintf(out_buf, out_buf_size_bytes, fmt, args);
}

// Allocate memory for use by TVM.
tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLDevice dev, void** out_ptr) {
if (num_bytes == 0) {
num_bytes = sizeof(int);
Expand All @@ -41,6 +47,7 @@ tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLDevice dev, void**
return (*out_ptr == NULL) ? kTvmErrorPlatformNoMemory : kTvmErrorNoError;
}

// Free memory used by TVM.
tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLDevice dev) {
free(ptr);
return kTvmErrorNoError;
Expand All @@ -49,6 +56,7 @@ tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLDevice dev) {
unsigned long g_utvm_start_time_micros;
int g_utvm_timer_running = 0;

// Start a device timer.
tvm_crt_error_t TVMPlatformTimerStart() {
if (g_utvm_timer_running) {
return kTvmErrorPlatformTimerBadState;
Expand All @@ -58,6 +66,7 @@ tvm_crt_error_t TVMPlatformTimerStart() {
return kTvmErrorNoError;
}

// Stop the running device timer and get the elapsed time (in seconds).
tvm_crt_error_t TVMPlatformTimerStop(double* elapsed_time_seconds) {
if (!g_utvm_timer_running) {
return kTvmErrorPlatformTimerBadState;
Expand All @@ -68,6 +77,7 @@ tvm_crt_error_t TVMPlatformTimerStop(double* elapsed_time_seconds) {
return kTvmErrorNoError;
}

// Fill a buffer with random data.
tvm_crt_error_t TVMPlatformGenerateRandom(uint8_t* buffer, size_t num_bytes) {
for (size_t i = 0; i < num_bytes; i++) {
buffer[i] = rand();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,4 +83,4 @@ endif()

file(GLOB_RECURSE app_srcs src/**.c src/**.cc)
target_sources(app PRIVATE ${app_srcs} ${cmsis_lib_srcs})
target_include_directories(app PRIVATE crt_config ${CMAKE_SOURCE_DIR}/include crt/include ${cmsis_includes})
target_include_directories(app PRIVATE crt_config include ${CMAKE_SOURCE_DIR}/include crt/include ${cmsis_includes})
67 changes: 50 additions & 17 deletions apps/microtvm/zephyr/template_project/microtvm_api_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,14 +210,14 @@ def _get_board_mem_size_bytes(zephyr_base: str, board: str):
return None


DEFAULT_HEAP_SIZE_BYTES = 216 * 1024
DEFAULT_WORKSPACE_SIZE_BYTES = 216 * 1024


def _get_recommended_heap_size_bytes(board: str):
prop = BOARD_PROPERTIES[board]
if "recommended_heap_size_bytes" in prop:
return prop["recommended_heap_size_bytes"]
return DEFAULT_HEAP_SIZE_BYTES
return DEFAULT_WORKSPACE_SIZE_BYTES


def generic_find_serial_port(serial_number: str = None):
Expand Down Expand Up @@ -358,11 +358,11 @@ def _get_nrf_device_args(serial_number: str = None) -> list:
help="Run on the FVP emulator instead of hardware.",
),
server.ProjectOption(
"heap_size_bytes",
"workspace_size_bytes",
optional=["generate_project"],
type="int",
default=None,
help="Sets the value for HEAP_SIZE_BYTES passed to K_HEAP_DEFINE() to service TVM memory allocation requests.",
help="Sets the value for TVM_WORKSPACE_SIZE_BYTES passed to K_HEAP_DEFINE() to service TVM memory allocation requests.",
),
]

Expand Down Expand Up @@ -403,7 +403,13 @@ def server_info_query(self, tvm_version):
}

def _create_prj_conf(
self, project_dir: pathlib.Path, board: str, project_type: str, config_main_stack_size
self,
project_dir: pathlib.Path,
board: str,
project_type: str,
config_main_stack_size: int,
config_led: bool,
use_fvp: bool,
):
with open(project_dir / "prj.conf", "w") as f:
f.write(
Expand All @@ -413,6 +419,13 @@ def _create_prj_conf(
"CONFIG_UART_INTERRUPT_DRIVEN=y\n"
"\n"
)
if (
config_led
and not self._is_qemu(board, use_fvp)
and not self._is_fvp(board, use_fvp)
):
f.write("# For debugging.\n" "CONFIG_LED=y\n" "\n")

f.write("# For TVMPlatformAbort().\n" "CONFIG_REBOOT=y\n" "\n")

if project_type == "host_driven":
Expand Down Expand Up @@ -522,6 +535,18 @@ def _generate_cmake_args(

return cmake_args

def _copy_src_and_header_files(self, src_dir: pathlib.Path, dst_dir: pathlib.Path):
    """Sort the template files found directly in ``src_dir`` into ``dst_dir``.

    Regular files ending in ``.c`` or ``.cc`` are copied to ``dst_dir/src``;
    regular files ending in ``.h`` are copied to ``dst_dir/include/tvm``.
    Sub-directories and files with any other suffix are ignored. The
    destination directories are not created here.
    """
    source_target = dst_dir / "src"
    header_target = dst_dir / "include" / "tvm"
    for entry_name in os.listdir(src_dir):
        candidate = src_dir / entry_name
        # Only plain files are copied; nested directories are skipped.
        if not candidate.is_file():
            continue
        if candidate.suffix in (".cc", ".c"):
            shutil.copy2(candidate, source_target)
        elif candidate.suffix == ".h":
            shutil.copy2(candidate, header_target)

def generate_project(self, model_library_format_path, standalone_crt_dir, project_dir, options):
zephyr_board = options["board"]
project_type = options["project_type"]
Expand All @@ -533,7 +558,7 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec
verbose = options.get("verbose")

recommended_heap_size = _get_recommended_heap_size_bytes(zephyr_board)
heap_size_bytes = options.get("heap_size_bytes") or recommended_heap_size
workspace_size_bytes = options.get("workspace_size_bytes") or recommended_heap_size
board_mem_size = _get_board_mem_size_bytes(zephyr_base, zephyr_board)

compile_definitions = options.get("compile_definitions")
Expand Down Expand Up @@ -602,7 +627,7 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec
else:
shutil.copy2(src_path, dst_path)

# Populate Makefile.
# Populate CMakeLists.
with open(project_dir / CMAKELIST_FILENAME, "w") as cmake_f:
with open(API_SERVER_DIR / f"{CMAKELIST_FILENAME}.template", "r") as cmake_template_f:
for line in cmake_template_f:
Expand All @@ -629,10 +654,10 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec

if board_mem_size is not None:
assert (
heap_size_bytes < board_mem_size
), f"Heap size {heap_size_bytes} is larger than memory size {board_mem_size} on this board."
workspace_size_bytes < board_mem_size
), f"Workspace size {workspace_size_bytes} is larger than memory size {board_mem_size} on this board."
cmake_f.write(
f"target_compile_definitions(app PUBLIC -DHEAP_SIZE_BYTES={heap_size_bytes})\n"
f"target_compile_definitions(app PUBLIC -DTVM_WORKSPACE_SIZE_BYTES={workspace_size_bytes})\n"
)

if compile_definitions:
Expand All @@ -649,7 +674,9 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec
if self._is_fvp(zephyr_board, use_fvp):
cmake_f.write(f"target_compile_definitions(app PUBLIC -DFVP=1)\n")

self._create_prj_conf(project_dir, zephyr_board, project_type, config_main_stack_size)
self._create_prj_conf(
project_dir, zephyr_board, project_type, config_main_stack_size, verbose, use_fvp
)

# Populate crt-config.h
crt_config_dir = project_dir / "crt_config"
Expand All @@ -658,13 +685,19 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec
API_SERVER_DIR / "crt_config" / "crt_config.h", crt_config_dir / "crt_config.h"
)

# Populate src/
# Populate `src` and `include`
src_dir = project_dir / "src"
if project_type != "host_driven" or self._is_fvp(zephyr_board, use_fvp):
shutil.copytree(API_SERVER_DIR / "src" / project_type, src_dir)
else:
src_dir.mkdir()
shutil.copy2(API_SERVER_DIR / "src" / project_type / "main.c", src_dir)
src_dir.mkdir()
include_dir = project_dir / "include" / "tvm"
include_dir.mkdir(parents=True)
src_project_type_dir = API_SERVER_DIR / "src" / project_type
self._copy_src_and_header_files(src_project_type_dir, project_dir)

if self._is_fvp(zephyr_board, use_fvp):
self._copy_src_and_header_files(src_project_type_dir / "fvp", project_dir)

if project_type == "mlperftiny":
shutil.copytree(src_project_type_dir / "api", src_dir / "api")

# Populate extra_files
if extra_files_tar:
Expand Down
Loading

0 comments on commit 9cc4620

Please sign in to comment.