Skip to content

Commit

Permalink
[Hardware] Ultra-96 support (apache#16)
Browse files Browse the repository at this point in the history
* vta ALU fix

* allowing for more matrix shapes for batched inference at lower precisions, for narrower input channel vectors

* refactoring topi test so same tests can be used by autovta

* makefile changes: use simply expanded variables

* correctness checks are returned along with other stats

* moving to version 0.0.4

* fixing bias shape bug when relying on tensorization

* updating drivers

* being more specific

* preliminary ultra-96 support

* updating drivers to prevent memory leaks, ultra96 support tested in non-coherent mode

* adding support for dynamic runtime rebuilding on Ultra-96

* hls support for Ultra-96

* bug fix in old scheduler

* taking advantage of Ultra96 device width

* updated ultra96 drivers

* hardware compilation for ultra96 (WIP)

* elif fix

* adding target to path

* updated freq for ultra96

* not needed param

* simulator bug fixes

* ARM CPU operator support for Ultra-96 (aarch64 - Cortex-A53)

* fail elegantly when trying to program FPGA in sim mode

* wip - support for ultra96

* fix llvm cmd

* rpc server bitstream program cleanup

* fix ultra96 driver address map

* updating default ultra96 conf, and bitstream format
  • Loading branch information
tmoreau89 committed Jan 2, 2019
1 parent d370a98 commit fe8910a
Show file tree
Hide file tree
Showing 32 changed files with 3,196 additions and 477 deletions.
3 changes: 1 addition & 2 deletions apps/pynq_rpc/start_rpc_server.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,4 @@
PROJROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )/../../" && pwd )"

export PYTHONPATH=${PYTHONPATH}:${PROJROOT}/python:${PROJROOT}/vta/python
export PYTHONPATH=${PYTHONPATH}:/home/xilinx/pynq
python3 -m vta.exec.rpc_server
python3.6 -m vta.exec.rpc_server
5 changes: 5 additions & 0 deletions cmake/modules/VTA.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,11 @@ elseif(PYTHON)
find_library(__cma_lib NAMES cma PATH /usr/lib)
target_link_libraries(vta ${__cma_lib})
endif()
# Ultra96 rules: when VTA_TARGET is "ultra96", locate the sds_lib library and
# link the vta target against it.
# The expansion is quoted so an empty or unset VTA_TARGET yields a false
# comparison instead of a malformed if() expression.
if("${VTA_TARGET}" STREQUAL "ultra96")
  # PATHS is the find_library keyword for extra search directories; the
  # previous spelling PATH was parsed as just another library name.
  find_library(__sds_lib NAMES sds_lib PATHS /usr/lib)
  # Plain (keyword-less) signature kept to match the other link rules on vta.
  target_link_libraries(vta ${__sds_lib})
endif()
else()
message(STATUS "Cannot found python in env, VTA build is skipped..")
endif()
4 changes: 2 additions & 2 deletions vta/config/pynq_sample.json
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
{
"TARGET" : "pynq",
"HW_VER" : "0.0.2",
"HW_VER" : "0.0.4",
"HW_FREQ" : 100,
"HW_CLK_TARGET" : 7,
"ALU_EN" : true,
"MUL_EN" : true,
"MUL_EN" : false,
"GEMM_II" : 1,
"TALU_II" : 2,
"LOG_INP_WIDTH" : 3,
Expand Down
21 changes: 21 additions & 0 deletions vta/config/ultra96_sample.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
"TARGET" : "ultra96",
"HW_VER" : "0.0.4",
"HW_FREQ" : 333,
"HW_CLK_TARGET" : 2,
"ALU_EN" : true,
"MUL_EN" : false,
"GEMM_II" : 1,
"TALU_II" : 2,
"LOG_INP_WIDTH" : 3,
"LOG_WGT_WIDTH" : 3,
"LOG_ACC_WIDTH" : 5,
"LOG_OUT_WIDTH" : 3,
"LOG_BATCH" : 0,
"LOG_BLOCK_IN" : 4,
"LOG_BLOCK_OUT" : 4,
"LOG_UOP_BUFF_SIZE" : 15,
"LOG_INP_BUFF_SIZE" : 15,
"LOG_WGT_BUFF_SIZE" : 18,
"LOG_ACC_BUFF_SIZE" : 17
}
4 changes: 2 additions & 2 deletions vta/config/vta_config.json
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
{
"TARGET" : "sim",
"HW_VER" : "0.0.2",
"HW_VER" : "0.0.4",
"HW_FREQ" : 100,
"HW_CLK_TARGET" : 7,
"ALU_EN" : true,
"MUL_EN" : true,
"MUL_EN" : false,
"GEMM_II" : 1,
"TALU_II" : 2,
"LOG_INP_WIDTH" : 3,
Expand Down
5 changes: 4 additions & 1 deletion vta/config/vta_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,8 @@ def main():
cfg["LOG_OUT_BUFF_SIZE"] = cfg["LOG_ACC_BUFF_SIZE"] - cfg["LOG_ACC_WIDTH"] + cfg["LOG_OUT_WIDTH"]
# Generate bitstream config string.
# Needs to match the BITSTREAM string in python/vta/environment.py
cfg["BITSTREAM"] = "{}_{}x{}x{}_a{}w{}o{}_{}_{}_{}_{}_{}MHz_{}ns_gii{}".format(
cfg["BITSTREAM"] = "{}_{}_{}x{}x{}_a{}w{}o{}_{}_{}_{}_{}_{}MHz_{}ns_gii{}".format(
cfg["TARGET"],
cfg["HW_VER"].replace('.', '_'),
(1 << cfg["LOG_BATCH"]),
(1 << cfg["LOG_BLOCK_IN"]),
Expand Down Expand Up @@ -131,6 +132,8 @@ def main():
cflags_str = " ".join(pkg.cflags)
if cfg["TARGET"] == "pynq":
cflags_str += " -DVTA_TARGET_PYNQ"
if cfg["TARGET"] == "ultra96":
cflags_str += " -DVTA_TARGET_ULTRA96"
print(cflags_str)

if args.ldflags:
Expand Down
34 changes: 18 additions & 16 deletions vta/hardware/xilinx/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,14 @@ VIVADO = vivado
HSI = hsi

# HLS mode
MODE = all
MODE = skip_sim
# Debug flag
DEBUG = False
# SLURM
SLURM = False

# Process VTA JSON config
VTA_CONFIG = python $(CURDIR)/../../config/vta_config.py
VTA_CONFIG := python $(CURDIR)/../../config/vta_config.py
CFLAGS := $(shell ${VTA_CONFIG} --cflags)
VTA_TARGET := $(shell ${VTA_CONFIG} --target)

Expand All @@ -46,10 +46,10 @@ VTA_MUL_EN := $(shell ${VTA_CONFIG} --get-mulen)
#---------------------
# FPGA Parameters
#--------------------
VTA_CLOCK_FREQ = $(shell ${VTA_CONFIG} --get-fpgafreq)
VTA_TARGET_PER = $(shell ${VTA_CONFIG} --get-fpgaper)
VTA_GEMM_II = $(shell ${VTA_CONFIG} --get-gemmii)
VTA_TALU_II = $(shell ${VTA_CONFIG} --get-taluii)
VTA_CLOCK_FREQ := $(shell ${VTA_CONFIG} --get-fpgafreq)
VTA_TARGET_PER := $(shell ${VTA_CONFIG} --get-fpgaper)
VTA_GEMM_II := $(shell ${VTA_CONFIG} --get-gemmii)
VTA_TALU_II := $(shell ${VTA_CONFIG} --get-taluii)

#---------------------
# Compilation parameters
Expand All @@ -59,20 +59,21 @@ VTA_TALU_II = $(shell ${VTA_CONFIG} --get-taluii)
VTA_HW_COMP_THREADS = 8

# Derive config name
CONF = $(shell ${VTA_CONFIG} --cfg-str)
IP_BUILD_PATH = $(BUILD_DIR)/hls/$(CONF)
HW_BUILD_PATH = $(BUILD_DIR)/vivado/$(CONF)
CONF := $(shell ${VTA_CONFIG} --cfg-str)
IP_BUILD_PATH := $(BUILD_DIR)/hls/$(CONF)
HW_BUILD_PATH := $(BUILD_DIR)/vivado/$(CONF)

ifeq ($(SLURM), true)
# Build on local scratch drive when using cluster
ifeq ($(SLURM), True)
IP_BUILD_PATH = /scratch/hls/$(CONF)
HW_BUILD_PATH = /scratch/vivado/$(CONF)
endif

# IP file path
IP_PATH = $(BUILD_DIR)/hls/$(CONF)/solution0/impl/ip/xilinx_com_hls_vta_1_0.zip
IP_PATH := $(BUILD_DIR)/hls/$(CONF)/vta_compute/solution0/impl/ip/xilinx_com_hls_compute_1_0.zip

# Bitstream file path
BIT_PATH = $(BUILD_DIR)/vivado/$(CONF)/export/$(CONF).bit
BIT_PATH := $(BUILD_DIR)/vivado/$(CONF)/export/$(CONF).bit

.PHONY: all ip bit bsp clean clean_all

Expand All @@ -84,27 +85,28 @@ $(IP_PATH): $(SRC_DIR)/*
mkdir -p $(IP_BUILD_PATH)
cd $(IP_BUILD_PATH) && \
$(VIVADO_HLS) -f $(SCRIPT_DIR)/hls.tcl \
-tclargs $(SRC_DIR) $(SIM_DIR) $(TEST_DIR) $(INCLUDE_DIR) \
-tclargs $(VTA_TARGET) \
$(SRC_DIR) $(SIM_DIR) $(TEST_DIR) $(INCLUDE_DIR) \
$(MODE) $(DEBUG) $(VTA_ALU_EN) $(VTA_MUL_EN) \
$(VTA_TARGET_PER) $(VTA_GEMM_II) $(VTA_TALU_II) \
$(VTA_INP_WIDTH) $(VTA_WGT_WIDTH) $(VTA_ACC_WIDTH) $(VTA_OUT_WIDTH) \
$(VTA_BATCH) $(VTA_IN_BLOCK) $(VTA_OUT_BLOCK) \
$(VTA_UOP_BUFF_SIZE) $(VTA_INP_BUFF_SIZE) $(VTA_WGT_BUFF_SIZE) \
$(VTA_ACC_BUFF_SIZE) $(VTA_OUT_BUFF_SIZE)
ifeq ($(SLURM), true)
ifeq ($(SLURM), True)
mkdir -p $(BUILD_DIR)/hls
mv $(IP_BUILD_PATH) $(BUILD_DIR)/hls/.
endif

$(BIT_PATH): $(IP_PATH)
mkdir -p $(HW_BUILD_PATH)
cd $(HW_BUILD_PATH) && \
$(VIVADO) -mode tcl -source $(SCRIPT_DIR)/vivado.tcl \
$(VIVADO) -mode tcl -source $(SCRIPT_DIR)/ultra96.tcl \
-tclargs $(BUILD_DIR)/hls/$(CONF) $(VTA_HW_COMP_THREADS) $(VTA_CLOCK_FREQ) $(VTA_GEMM_II) \
$(VTA_INP_WIDTH) $(VTA_WGT_WIDTH) $(VTA_OUT_WIDTH) \
$(VTA_BATCH) $(VTA_IN_BLOCK) $(VTA_OUT_BLOCK) \
$(VTA_INP_BUFF_SIZE) $(VTA_WGT_BUFF_SIZE) $(VTA_OUT_BUFF_SIZE)
ifeq ($(SLURM), true)
ifeq ($(SLURM), True)
mkdir -p $(BUILD_DIR)/vivado
mv $(HW_BUILD_PATH) $(BUILD_DIR)/vivado/.
endif
Expand Down
119 changes: 63 additions & 56 deletions vta/hardware/xilinx/scripts/hls.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -5,55 +5,58 @@
#

# Command line arguments:
# Arg 1: path to design sources
# Arg 2: path to sim sources
# Arg 3: path to test sources
# Arg 4: path to include sources
# Arg 5: mode
# Arg 6: debug
# Arg 7: alu_ena
# Arg 8: mul_ena
# Arg 9: target clock period
# Arg 10: target II for GEMM
# Arg 11: target II for tensor ALU
# Arg 12: input type width (log)
# Arg 13: weight type width (log)
# Arg 14: accum type width (log)
# Arg 15: output type width (log)
# Arg 16: batch size (log)
# Arg 17: in block size (log)
# Arg 18: out block size (log)
# Arg 19: uop buffer size in B (log)
# Arg 20: inp buffer size in B (log)
# Arg 21: wgt buffer size in B (log)
# Arg 22: acc buffer size in B (log)
# Arg 23: out buffer size in B (log)

if { [llength $argv] eq 25 } {
set src_dir [lindex $argv 2]
set sim_dir [lindex $argv 3]
set test_dir [lindex $argv 4]
set include_dir [lindex $argv 5]
set mode [lindex $argv 6]
set debug [lindex $argv 7]
set alu_ena [lindex $argv 8]
set mul_ena [lindex $argv 9]
set target_period [lindex $argv 10]
set target_gemm_ii [lindex $argv 11]
set target_alu_ii [lindex $argv 12]
set inp_width [lindex $argv 13]
set wgt_width [lindex $argv 14]
set acc_width [lindex $argv 15]
set out_width [lindex $argv 16]
set batch [lindex $argv 17]
set block_in [lindex $argv 18]
set block_out [lindex $argv 19]
set uop_buff_size [lindex $argv 20]
set inp_buff_size [lindex $argv 21]
set wgt_buff_size [lindex $argv 22]
set acc_buff_size [lindex $argv 23]
set out_buff_size [lindex $argv 24]
# Arg 1: target (FPGA)
# Arg 2: path to design sources
# Arg 3: path to sim sources
# Arg 4: path to test sources
# Arg 5: path to include sources
# Arg 6: mode
# Arg 7: debug
# Arg 8: alu_ena
# Arg 9: mul_ena
# Arg 10: target clock period
# Arg 11: target II for GEMM
# Arg 12: target II for tensor ALU
# Arg 13: input type width (log)
# Arg 14: weight type width (log)
# Arg 15: accum type width (log)
# Arg 16: output type width (log)
# Arg 17: batch size (log)
# Arg 18: in block size (log)
# Arg 19: out block size (log)
# Arg 20: uop buffer size in B (log)
# Arg 21: inp buffer size in B (log)
# Arg 22: wgt buffer size in B (log)
# Arg 23: acc buffer size in B (log)
# Arg 24: out buffer size in B (log)

if { [llength $argv] eq 26 } {
set target [lindex $argv 2]
set src_dir [lindex $argv 3]
set sim_dir [lindex $argv 4]
set test_dir [lindex $argv 5]
set include_dir [lindex $argv 6]
set mode [lindex $argv 7]
set debug [lindex $argv 8]
set alu_ena [lindex $argv 9]
set mul_ena [lindex $argv 10]
set target_period [lindex $argv 11]
set target_gemm_ii [lindex $argv 12]
set target_alu_ii [lindex $argv 13]
set inp_width [lindex $argv 14]
set wgt_width [lindex $argv 15]
set acc_width [lindex $argv 16]
set out_width [lindex $argv 17]
set batch [lindex $argv 18]
set block_in [lindex $argv 19]
set block_out [lindex $argv 20]
set uop_buff_size [lindex $argv 21]
set inp_buff_size [lindex $argv 22]
set wgt_buff_size [lindex $argv 23]
set acc_buff_size [lindex $argv 24]
set out_buff_size [lindex $argv 25]
} else {
set target "pynq"
set src_dir "../src"
set sim_dir "../sim"
set test_dir "../../src/test"
Expand Down Expand Up @@ -83,16 +86,20 @@ if { [llength $argv] eq 25 } {
# Initializes the HLS design and sets HLS pragmas for memory partitioning.
# This is necessary because of a Vivado restriction that doesn't allow for
# buses wider than 1024 bits.
proc init_design {per g_ii a_ii inp_width wgt_width out_width acc_width batch block_in block_out alu_ena} {
proc init_design {target per g_ii a_ii inp_width wgt_width out_width acc_width batch block_in block_out alu_ena} {

# Set device number
set_part {xc7z020clg484-1}
if {$target=="pynq"} {
set_part {xc7z020clg484-1}
} elseif {$target=="ultra96"} {
set_part {xczu3eg-sbva484-1-e}
}

# Max bus width (supported by Vivado)
set max_width 1024

# Set axi width (TODO derive from top level config)
set axi_width 64
set axi_width 128

# Set the clock frequency
create_clock -period $per -name default
Expand Down Expand Up @@ -178,7 +185,7 @@ if {$mode=="all" || $mode=="sim"} {
add_files -tb $sim_dir/vta_test.cc -cflags $cflags
add_files -tb $test_dir/test_lib.cc -cflags $cflags
open_solution "solution0"
init_design $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
init_design $target $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
csim_design -clean
close_project
}
Expand All @@ -189,7 +196,7 @@ if {$mode=="all" || $mode=="skip_sim" || $mode=="fetch"} {
set_top fetch
add_files $src_dir/vta.cc -cflags $cflags
open_solution "solution0"
init_design $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
init_design $target $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
csynth_design
if {$mode=="all" || $mode=="skip_sim"} {
export_design -format ip_catalog
Expand All @@ -203,7 +210,7 @@ if {$mode=="all" || $mode=="skip_sim" || $mode=="load"} {
set_top load
add_files $src_dir/vta.cc -cflags $cflags
open_solution "solution0"
init_design $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
init_design $target $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
csynth_design
if {$mode=="all" || $mode=="skip_sim"} {
export_design -format ip_catalog
Expand All @@ -217,7 +224,7 @@ if {$mode=="all" || $mode=="skip_sim" || $mode=="compute"} {
set_top compute
add_files $src_dir/vta.cc -cflags $cflags
open_solution "solution0"
init_design $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
init_design $target $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
csynth_design
if {$mode=="all" || $mode=="skip_sim"} {
export_design -format ip_catalog
Expand All @@ -231,7 +238,7 @@ if {$mode=="all" || $mode=="skip_sim" || $mode=="store"} {
set_top store
add_files $src_dir/vta.cc -cflags $cflags
open_solution "solution0"
init_design $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
init_design $target $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
csynth_design
if {$mode=="all" || $mode=="skip_sim"} {
export_design -format ip_catalog
Expand Down
Loading

0 comments on commit fe8910a

Please sign in to comment.