From 49d63dc21a66bfc4447d0cf031c2fb6acd702e7e Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Fri, 21 Sep 2018 15:46:46 -0700 Subject: [PATCH] [HARDWARE] [AUTOTVM] Allowing AutoTVM to tune for different VTA designs and adding ZCU102 support (#19) * compilation support for ZCU102 * dll caching to save on dynamic reconfiguration time * introducing model signature to differentiate between VTA variants * disable tophub to avoid falling back on invalid schedules * reconfig when targeting an FPGA to reset the hardware * typo fix * addressing comments --- python/tvm/autotvm/measure/measure_methods.py | 4 ++ python/tvm/autotvm/task/nnvm_integration.py | 4 +- vta/hardware/xilinx/Makefile | 3 +- vta/hardware/xilinx/scripts/hls.tcl | 10 ++++- vta/hardware/xilinx/scripts/ultra96.tcl | 42 +++++++++++-------- vta/python/vta/environment.py | 17 +++++++- vta/python/vta/exec/rpc_server.py | 27 ++++++++---- vta/python/vta/pkg_config.py | 15 +++++++ vta/scripts/tune_conv.py | 5 +-- vta/scripts/tune_resnet.py | 9 ++-- 10 files changed, 98 insertions(+), 38 deletions(-) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index f2819d1dad369..36c793cd7f90f 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -461,6 +461,10 @@ def run_through_rpc(measure_input, build_result, try: # upload built module remote = request_remote(*remote_args) + if measure_input.target.device_name == 'vta': + from vta import program_fpga, reconfig_runtime + program_fpga(remote, None) + reconfig_runtime(remote) remote.upload(build_result.filename) func = remote.load_module(os.path.split(build_result.filename)[1]) ctx = remote.context(str(measure_input.target), 0) diff --git a/python/tvm/autotvm/task/nnvm_integration.py b/python/tvm/autotvm/task/nnvm_integration.py index 4d1551c57209f..eba9d35ab3276 100644 --- a/python/tvm/autotvm/task/nnvm_integration.py +++ b/python/tvm/autotvm/task/nnvm_integration.py @@ -12,6 +12,7 @@ from ..util import get_const_tuple from .task import create, register +from .dispatcher import ApplyHistoryBest logger = logging.getLogger('autotvm') @@ -240,7 +241,8 @@ def extract_from_graph(graph, shape, dtype, target, symbols, target_host=None): # run compiler to collect all TOPI calls during compilation nnvm.compiler.engine.clear_cache() - nnvm.compiler.build(graph, target=target, shape=shape, dtype=dtype) + with ApplyHistoryBest([]): + nnvm.compiler.build(graph, target=target, shape=shape, dtype=dtype) nnvm.compiler.engine.clear_cache() logger.disabled = old_state diff --git a/vta/hardware/xilinx/Makefile b/vta/hardware/xilinx/Makefile index c0f1024568791..22a1fcf1468cf 100644 --- a/vta/hardware/xilinx/Makefile +++ b/vta/hardware/xilinx/Makefile @@ -102,7 +102,8 @@ $(BIT_PATH): $(IP_PATH) mkdir -p $(HW_BUILD_PATH) cd $(HW_BUILD_PATH) && \ $(VIVADO) -mode tcl -source $(SCRIPT_DIR)/ultra96.tcl \ - -tclargs $(BUILD_DIR)/hls/$(CONF) $(VTA_HW_COMP_THREADS) $(VTA_CLOCK_FREQ) $(VTA_GEMM_II) \ + -tclargs $(VTA_TARGET) $(BUILD_DIR)/hls/$(CONF) $(VTA_HW_COMP_THREADS) \ + $(VTA_CLOCK_FREQ) $(VTA_GEMM_II) \ $(VTA_INP_WIDTH) $(VTA_WGT_WIDTH) $(VTA_OUT_WIDTH) \ $(VTA_BATCH) $(VTA_IN_BLOCK) $(VTA_OUT_BLOCK) \ $(VTA_INP_BUFF_SIZE) $(VTA_WGT_BUFF_SIZE) $(VTA_OUT_BUFF_SIZE) diff --git a/vta/hardware/xilinx/scripts/hls.tcl b/vta/hardware/xilinx/scripts/hls.tcl index c338795137cf1..a64b4bd8931db 100644 --- a/vta/hardware/xilinx/scripts/hls.tcl +++ b/vta/hardware/xilinx/scripts/hls.tcl @@ -93,13 +93,21 @@ proc init_design {target per g_ii a_ii inp_width wgt_width out_width acc_width b set_part {xc7z020clg484-1} } elseif {$target=="ultra96"} { set_part {xczu3eg-sbva484-1-e} + } elseif {$target=="zcu102"} { + set_part {xczu9eg-ffvb1156-2-e} } # Max bus width (supported by Vivado) set max_width 1024 # Set axi width (TODO derive from top level config) - set axi_width 128 + if {$target=="pynq"} { + set axi_width 64 + } elseif {$target=="ultra96"} { + set axi_width 128 + } elseif {$target=="zcu102"} { + set axi_width 128 + } # Set the clock frequency create_clock -period $per -name default diff --git a/vta/hardware/xilinx/scripts/ultra96.tcl b/vta/hardware/xilinx/scripts/ultra96.tcl index ff0c4050a7160..c2d8be4935900 100644 --- a/vta/hardware/xilinx/scripts/ultra96.tcl +++ b/vta/hardware/xilinx/scripts/ultra96.tcl @@ -21,23 +21,24 @@ if { [string first $scripts_vivado_version $current_vivado_version] == -1 } { } # Parse argument list, derive the clock to utilize -if { [llength $argv] eq 13 } { - set ip_path [lindex $argv 0] - set num_threads [lindex $argv 1] - set clock_freq [lindex $argv 2] - set gemm_ii [lindex $argv 3] - set inp_width [expr 1 << [lindex $argv 4]] - set wgt_width [expr 1 << [lindex $argv 5]] - set out_width [expr 1 << [lindex $argv 6]] - set batch [expr 1 << [lindex $argv 7]] - set out_block [expr 1 << [lindex $argv 8]] - set in_block [expr 1 << [lindex $argv 9]] - set inp_mem_size [expr 1 << [lindex $argv 10]] - set wgt_mem_size [expr 1 << [lindex $argv 11]] - set out_mem_size [expr 1 << [lindex $argv 12]] +if { [llength $argv] eq 14 } { + set target [lindex $argv 0] + set ip_path [lindex $argv 1] + set num_threads [lindex $argv 2] + set clock_freq [lindex $argv 3] + set gemm_ii [lindex $argv 4] + set inp_width [expr 1 << [lindex $argv 5]] + set wgt_width [expr 1 << [lindex $argv 6]] + set out_width [expr 1 << [lindex $argv 7]] + set batch [expr 1 << [lindex $argv 8]] + set out_block [expr 1 << [lindex $argv 9]] + set in_block [expr 1 << [lindex $argv 10]] + set inp_mem_size [expr 1 << [lindex $argv 11]] + set wgt_mem_size [expr 1 << [lindex $argv 12]] + set out_mem_size [expr 1 << [lindex $argv 13]] } else { - puts "Arg list incomplete: \ - \ + \ " return 1 } @@ -82,8 +83,13 @@ set compute_ip "${ip_path}/vta_compute/solution0/impl/ip/xilinx_com_hls_compute_ set store_ip "${ip_path}/vta_store/solution0/impl/ip/xilinx_com_hls_store_1_0.zip" # Create custom project -create_project -force $proj_name $proj_path -part xczu3eg-sbva484-1-e -set_property BOARD_PART em.avnet.com:ultra96:part0:1.0 [current_project] +if { ${target} eq "ultra96" } { + create_project -force $proj_name $proj_path -part xczu3eg-sbva484-1-e + set_property BOARD_PART em.avnet.com:ultra96:part0:1.0 [current_project] +} elseif { ${target} eq "zcu102" } { + create_project -force $proj_name $proj_path -part xczu9eg-ffvb1156-2-e + set_property BOARD_PART xilinx.com:zcu102:part0:3.2 [current_project] +} # Update IP repository with generated IP file mkdir $ip_lib diff --git a/vta/python/vta/environment.py b/vta/python/vta/environment.py index 362adf69686b8..86246cfad2d9d 100644 --- a/vta/python/vta/environment.py +++ b/vta/python/vta/environment.py @@ -149,7 +149,7 @@ def __init__(self, cfg): self._mock_env = None self._dev_ctx = None self._last_env = None - # derive bitstream name + # derive bitstream name self.BITSTREAM = "{}/{}/{}x{}x{}_a{}w{}o{}_{}_{}_{}_{}_{}MHz_{}ns_gii{}".format( self.HW_VER.replace('.', '_'), self.TARGET, @@ -171,6 +171,21 @@ def __init__(self, cfg): if self.MUL_EN and self.ALU_EN: self.BITSTREAM += "_mul" self.BITSTREAM += ".bit" + # model - autoTVM signature that identifies VTA configuration. + # This is WIP: knobs that could influence the efficacy of the + # schedule have been left out for now. + self.MODEL = "{}-{}x{}x{}_a{}w{}o{}_{}_{}_{}_{}".format( + self.TARGET, + self.BATCH, + self.BLOCK_IN, + self.BLOCK_OUT, + self.INP_WIDTH, + self.WGT_WIDTH, + self.OUT_WIDTH, + self.LOG_UOP_BUFF_SIZE, + self.LOG_INP_BUFF_SIZE, + self.LOG_WGT_BUFF_SIZE, + self.LOG_ACC_BUFF_SIZE) def __enter__(self): self._last_env = Environment.current diff --git a/vta/python/vta/exec/rpc_server.py b/vta/python/vta/exec/rpc_server.py index 54c3582f8be4d..3edc940e3ab2e 100644 --- a/vta/python/vta/exec/rpc_server.py +++ b/vta/python/vta/exec/rpc_server.py @@ -10,6 +10,7 @@ import ctypes import json import tvm +from shutil import copyfile from tvm._ffi.base import c_str from tvm import rpc from tvm.contrib import cc @@ -87,14 +88,24 @@ def reconfig_runtime(cfg_json): if pkg.same_config(old_cfg): logging.info("Skip reconfig_runtime due to same config.") return - cflags = ["-O2", "-std=c++11"] - cflags += pkg.cflags - ldflags = pkg.ldflags - lib_name = dll_path - source = pkg.lib_source - logging.info("Rebuild runtime:\n output=%s,\n cflags=%s,\n source=%s,\n ldflags=%s", - dll_path, '\n\t'.join(cflags), '\n\t'.join(source), '\n\t'.join(ldflags)) - cc.create_shared(lib_name, source, cflags + ldflags) + # check if a dll matching the configuration has been cached + dll_root, dll_ext = os.path.splitext(dll_path) + cached_dll_path = dll_root + '-' + pkg.signature + dll_ext + if os.path.isfile(cached_dll_path): + copyfile(cached_dll_path, dll_path) + logging.info("Swapping in cached dll: source=%s, destination=%s", + cached_dll_path, dll_path) + else: + cflags = ["-O2", "-std=c++11"] + cflags += pkg.cflags + ldflags = pkg.ldflags + lib_name = dll_path + source = pkg.lib_source + logging.info("Rebuild runtime:\n output=%s,\n cflags=%s,\n source=%s,\n ldflags=%s", + dll_path, '\n\t'.join(cflags), '\n\t'.join(source), '\n\t'.join(ldflags)) + cc.create_shared(lib_name, source, cflags + ldflags) + copyfile(dll_path, cached_dll_path) + logging.info("Caching dll to: %s", cached_dll_path) with open(cfg_path, "w") as outputfile: outputfile.write(pkg.cfg_json) diff --git a/vta/python/vta/pkg_config.py b/vta/python/vta/pkg_config.py index 417fefb4f12c1..af0eca7593f45 100644 --- a/vta/python/vta/pkg_config.py +++ b/vta/python/vta/pkg_config.py @@ -80,6 +80,21 @@ def __init__(self, cfg, proj_root): def cflags(self): return self.include_path + self.macro_defs + @property + def signature(self): + return "{}-{}_{}_{}_{}_{}_{}_{}_{}_{}_{}".format( + self.cfg_dict["TARGET"], + self.cfg_dict["LOG_BATCH"], + self.cfg_dict["LOG_BLOCK_IN"], + self.cfg_dict["LOG_BLOCK_OUT"], + self.cfg_dict["LOG_INP_WIDTH"], + self.cfg_dict["LOG_WGT_WIDTH"], + self.cfg_dict["LOG_OUT_WIDTH"], + self.cfg_dict["LOG_UOP_BUFF_SIZE"], + self.cfg_dict["LOG_INP_BUFF_SIZE"], + self.cfg_dict["LOG_WGT_BUFF_SIZE"], + self.cfg_dict["LOG_ACC_BUFF_SIZE"]) + @property def cfg_json(self): return json.dumps(self.cfg_dict, indent=2) diff --git a/vta/scripts/tune_conv.py b/vta/scripts/tune_conv.py index 6b1740092b160..516301e11173c 100644 --- a/vta/scripts/tune_conv.py +++ b/vta/scripts/tune_conv.py @@ -49,12 +49,11 @@ def conv2d(N, CI, H, W, CO, KH, KW, strides, padding, in_dtype, out_dtype): return s, [data, kernel, bias, res] if __name__ == '__main__': - model = env.TARGET N, CI, H, W, CO, KH, KW, strides, padding, in_dtype, out_dtype = \ 1, 64, 56, 56, 64, 3, 3, (1, 1), (1, 1), 'int8', 'int32' task = autotvm.task.create(conv2d, args=(N, CI, H, W, CO, KH, KW, strides, padding, in_dtype, out_dtype), - target=tvm.target.vta(model), target_host=env.target_host, template_key='direct') + target=tvm.target.vta(env.MODEL), target_host=env.target_host, template_key='direct') print(task.config_space) # logging config (for printing tuning log to the screen) @@ -63,7 +62,7 @@ def conv2d(N, CI, H, W, CO, KH, KW, strides, padding, in_dtype, out_dtype): measure_option = autotvm.measure_option( builder=autotvm.LocalBuilder(build_func=vta.vta_autotvm_build_func), - runner=autotvm.RPCRunner(model, 'fleet', 9190, number=4, repeat=3, timeout=30, + runner=autotvm.RPCRunner(env.TARGET, 'fleet', 9190, number=4, repeat=3, timeout=30, check_correctness=True)) tuner = autotvm.tuner.RandomTuner(task) diff --git a/vta/scripts/tune_resnet.py b/vta/scripts/tune_resnet.py index f3a64323723e7..7de60ec1462e9 100644 --- a/vta/scripts/tune_resnet.py +++ b/vta/scripts/tune_resnet.py @@ -159,10 +159,9 @@ def tune_tasks(tasks, os.remove(tmp_log_file) if __name__ == '__main__': - device_key = env.TARGET tuning_opt = { - 'log_filename': 'resnet-18.log', + 'log_filename': 'resnet-18-{}.log'.format(env.MODEL), 'tuner': 'random', 'n_trial': 1e9, @@ -170,7 +169,7 @@ def tune_tasks(tasks, 'measure_option': autotvm.measure_option( builder=autotvm.LocalBuilder(build_func=vta.vta_autotvm_build_func), - runner=autotvm.RPCRunner(device_key, 'fleet', 9190, + runner=autotvm.RPCRunner(env.TARGET, 'fleet', 9190, number=4, repeat=3, timeout=60, check_correctness=True)) } @@ -182,7 +181,7 @@ def tune_tasks(tasks, register_vta_tuning_tasks() print("Extract tasks...") - target = tvm.target.vta(device_key) + target = tvm.target.vta(env.MODEL) target_host = env.target_host tasks = extract_tasks(sym, params, target, target_host) @@ -203,7 +202,7 @@ def tune_tasks(tasks, # upload module to device print("Upload...") - remote = autotvm.measure.request_remote(device_key, 'fleet', 9190, timeout=10000) + remote = autotvm.measure.request_remote(env.TARGET, 'fleet', 9190, timeout=10000) remote.upload(tmp.relpath(filename)) rlib = remote.load_module(filename)