From dae77cdb4cbf6b1ee1715266cad403229bd279b9 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 2 May 2018 18:13:33 -0700 Subject: [PATCH] [HARDWARE, TEST] Fixed hardware generation flow (#34) --- vta/apps/pynq_rpc/start_rpc_server.sh | 2 +- vta/hardware/xilinx/Makefile | 71 ++++-- vta/hardware/xilinx/scripts/hls.tcl | 83 +++--- vta/hardware/xilinx/scripts/vivado.tcl | 18 +- vta/hardware/xilinx/sim/vta_test.cc | 3 + vta/make/config.json | 4 +- vta/make/sim_sample.json | 4 +- vta/make/vta_config.py | 80 +++++- vta/python/vta/environment.py | 6 +- vta/python/vta/ir_pass.py | 8 +- vta/tests/hardware/common/test_lib.cc | 334 ++++++++++++++++++++++++- vta/tests/hardware/common/test_lib.h | 16 +- vta/tests/hardware/pynq/Makefile | 19 +- vta/tests/hardware/pynq/metal_test.cc | 100 +------- 14 files changed, 550 insertions(+), 198 deletions(-) diff --git a/vta/apps/pynq_rpc/start_rpc_server.sh b/vta/apps/pynq_rpc/start_rpc_server.sh index 445b72ea..fac12e82 100755 --- a/vta/apps/pynq_rpc/start_rpc_server.sh +++ b/vta/apps/pynq_rpc/start_rpc_server.sh @@ -1,4 +1,4 @@ #!/bin/bash -export PYTHONPATH=${PYTHONPATH}:/home/xilinx/tvm/python:/home/xilinx/vta/python +export PYTHONPATH=${PYTHONPATH}:/home/xilinx/vta/nnvm/tvm/python:/home/xilinx/vta/python export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/opt/python3.6/lib/python3.6/site-packages/pynq/drivers/ python -m vta.exec.rpc_server diff --git a/vta/hardware/xilinx/Makefile b/vta/hardware/xilinx/Makefile index 8208de7d..9ed7654e 100644 --- a/vta/hardware/xilinx/Makefile +++ b/vta/hardware/xilinx/Makefile @@ -13,8 +13,10 @@ VIVADO_HLS = vivado_hls VIVADO = vivado HSI = hsi -# HLS Mode -MODE = all +# HLS mode +MODE = skip_sim +# Debug flag +DEBUG = false # SLURM SLURM = false # Prevent generation of DSP @@ -22,15 +24,26 @@ NO_DSP = false # Prevent generation of ALU NO_ALU = false -# Include top-level config file -ifndef config -ifneq ("$(wildcard ../../config.mk)", "") - config = ../../config.mk -else - config = ../../make/config.mk -endif -endif -include $(config) +# Process VTA JSON config +VTA_CONFIG = python $(CURDIR)/../../make/vta_config.py +CFLAGS := $(shell ${VTA_CONFIG} --cflags) +VTA_TARGET := $(shell ${VTA_CONFIG} --target) + +#--------------------- +# VTA Parameters +#-------------------- +VTA_INP_WIDTH := $(shell ${VTA_CONFIG} --get-inpwidth) +VTA_WGT_WIDTH := $(shell ${VTA_CONFIG} --get-wgtwidth) +VTA_ACC_WIDTH := $(shell ${VTA_CONFIG} --get-accwidth) +VTA_OUT_WIDTH := $(shell ${VTA_CONFIG} --get-outwidth) +VTA_BATCH := $(shell ${VTA_CONFIG} --get-batch) +VTA_IN_BLOCK := $(shell ${VTA_CONFIG} --get-blockin) +VTA_OUT_BLOCK := $(shell ${VTA_CONFIG} --get-blockout) +VTA_UOP_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-uopbuffsize) +VTA_INP_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-inpbuffsize) +VTA_WGT_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-wgtbuffsize) +VTA_ACC_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-accbuffsize) +VTA_OUT_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-outbuffsize) #--------------------- # Compilation parameters @@ -50,8 +63,8 @@ TARGET_PER = \ $(shell echo "$$(( (1000 + $(VTA_HW_COMP_CLOCK_FREQ) - 1) / $(VTA_HW_COMP_CLOCK_FREQ) - $(VTA_HW_COMP_TIMING_COMP)))" ) # Derive config name -CONF = \ -$(VTA_BATCH)x$(VTA_IN_BLOCK)x$(VTA_OUT_BLOCK)_$(VTA_INP_WIDTH)bx$(VTA_WGT_WIDTH)b_$(VTA_LOG_UOP_BUFF_SIZE)_$(VTA_LOG_INP_BUFF_SIZE)_$(VTA_LOG_WGT_BUFF_SIZE)_$(VTA_LOG_ACC_BUFF_SIZE)_$(VTA_HW_COMP_CLOCK_FREQ)MHz_$(TARGET_PER)ns +CONF_ROOT = $(shell ${VTA_CONFIG} --cfg-str) +CONF = $(CONF_ROOT)_$(VTA_HW_COMP_CLOCK_FREQ)MHz_$(TARGET_PER)ns IP_BUILD_PATH = $(BUILD_DIR)/hls/$(CONF) HW_BUILD_PATH = $(BUILD_DIR)/vivado/$(CONF) @@ -60,26 +73,34 @@ ifeq ($(SLURM), true) HW_BUILD_PATH = /scratch/vivado/$(CONF) endif -.PHONY: all ip bit driver clean clean_all +# IP file path +IP_PATH = $(BUILD_DIR)/hls/$(CONF)/solution0/impl/ip/xilinx_com_hls_vta_1_0.zip -all: bit +# Bitstream file path +BIT_PATH = $(BUILD_DIR)/vivado/$(CONF)/export/$(CONF).bit -ip: +.PHONY: all ip bit bsp clean clean_all + +all: bsp +ip: $(IP_PATH) +bit: $(BIT_PATH) + +$(IP_PATH): $(SRC_DIR)/* mkdir -p $(IP_BUILD_PATH) cd $(IP_BUILD_PATH) && \ $(VIVADO_HLS) -f $(SCRIPT_DIR)/hls.tcl \ - -tclargs $(SRC_DIR) $(SIM_DIR) $(TEST_DIR) $(INCLUDE_DIR) $(TARGET_PER) \ - $(VTA_LOG_INP_WIDTH) $(VTA_LOG_WGT_WIDTH) $(VTA_LOG_ACC_WIDTH) $(VTA_LOG_OUT_WIDTH) \ - $(VTA_LOG_BATCH) $(VTA_LOG_BLOCK_OUT) $(VTA_LOG_BLOCK_IN) \ - $(VTA_LOG_UOP_BUFF_SIZE) $(VTA_LOG_INP_BUFF_SIZE) $(VTA_LOG_WGT_BUFF_SIZE) \ - $(VTA_LOG_ACC_BUFF_SIZE) $(VTA_LOG_OUT_BUFF_SIZE) \ - $(MODE) $(NO_DSP) $(NO_ALU) + -tclargs $(SRC_DIR) $(SIM_DIR) $(TEST_DIR) $(INCLUDE_DIR) \ + $(MODE) $(DEBUG) $(NO_DSP) $(NO_ALU) $(TARGET_PER) \ + $(VTA_INP_WIDTH) $(VTA_WGT_WIDTH) $(VTA_ACC_WIDTH) $(VTA_OUT_WIDTH) \ + $(VTA_BATCH) $(VTA_IN_BLOCK) $(VTA_OUT_BLOCK) \ + $(VTA_UOP_BUFF_SIZE) $(VTA_INP_BUFF_SIZE) $(VTA_WGT_BUFF_SIZE) \ + $(VTA_ACC_BUFF_SIZE) $(VTA_OUT_BUFF_SIZE) ifeq ($(SLURM), true) mkdir -p $(BUILD_DIR)/hls mv $(IP_BUILD_PATH) $(BUILD_DIR)/hls/. endif -bit: ip +$(BIT_PATH): $(IP_PATH) mkdir -p $(HW_BUILD_PATH) cd $(HW_BUILD_PATH) && \ $(VIVADO) -mode tcl -source $(SCRIPT_DIR)/vivado.tcl \ @@ -92,12 +113,12 @@ ifeq ($(SLURM), true) mv $(HW_BUILD_PATH) $(BUILD_DIR)/vivado/. endif -driver: bit +bsp: $(BIT_PATH) cd $(HW_BUILD_PATH) && $(HSI) -mode tcl -source $(SCRIPT_DIR)/hsi.tcl -nojournal -nolog cd $(HW_BUILD_PATH)/bsp && make clean: rm -rf *.out *.log *.sb figures -clean_all: clean +cleanall: clean rm -rf $(BUILD_DIR) diff --git a/vta/hardware/xilinx/scripts/hls.tcl b/vta/hardware/xilinx/scripts/hls.tcl index a1270189..57efe9c2 100644 --- a/vta/hardware/xilinx/scripts/hls.tcl +++ b/vta/hardware/xilinx/scripts/hls.tcl @@ -9,65 +9,69 @@ # Arg 2: path to sim sources # Arg 3: path to test sources # Arg 4: path to include sources -# Arg 5: target clock period -# Arg 6: input type width (log) -# Arg 7: weight type width (log) -# Arg 8: accum type width (log) -# Arg 9: output type width (log) -# Arg 10: batch size (log) -# Arg 11: in block size (log) -# Arg 12: out block size (log) -# Arg 13: uop buffer size in B (log) -# Arg 14: inp buffer size in B (log) -# Arg 15: wgt buffer size in B (log) -# Arg 16: acc buffer size in B (log) -# Arg 17: out buffer size in B (log) -# Arg 18: mode -# Arg 19: no_dsp -# Arg 20: no_alu +# Arg 5: mode +# Arg 6: debug +# Arg 7: no_dsp +# Arg 8: no_alu +# Arg 9: target clock period +# Arg 10: input type width (log) +# Arg 11: weight type width (log) +# Arg 12: accum type width (log) +# Arg 13: output type width (log) +# Arg 14: batch size (log) +# Arg 15: in block size (log) +# Arg 16: out block size (log) +# Arg 17: uop buffer size in B (log) +# Arg 18: inp buffer size in B (log) +# Arg 19: wgt buffer size in B (log) +# Arg 20: acc buffer size in B (log) +# Arg 21: out buffer size in B (log) -if { [llength $argv] eq 22 } { +if { [llength $argv] eq 23 } { set src_dir [lindex $argv 2] set sim_dir [lindex $argv 3] set test_dir [lindex $argv 4] set include_dir [lindex $argv 5] - set target_period [lindex $argv 6] - set inp_width [lindex $argv 7] - set wgt_width [lindex $argv 8] - set acc_width [lindex $argv 9] - set out_width [lindex $argv 10] - set batch [lindex $argv 11] - set block_in [lindex $argv 12] - set block_out [lindex $argv 13] - set uop_buff_size [lindex $argv 14] - set inp_buff_size [lindex $argv 15] - set wgt_buff_size [lindex $argv 16] - set acc_buff_size [lindex $argv 17] - set out_buff_size [lindex $argv 18] - set mode [lindex $argv 19] - set no_dsp [lindex $argv 20] - set no_alu [lindex $argv 21] + set mode [lindex $argv 6] + set debug [lindex $argv 7] + set no_dsp [lindex $argv 8] + set no_alu [lindex $argv 9] + set target_period [lindex $argv 10] + set inp_width [lindex $argv 11] + set wgt_width [lindex $argv 12] + set acc_width [lindex $argv 13] + set out_width [lindex $argv 14] + set batch [lindex $argv 15] + set block_in [lindex $argv 16] + set block_out [lindex $argv 17] + set uop_buff_size [lindex $argv 18] + set inp_buff_size [lindex $argv 19] + set wgt_buff_size [lindex $argv 20] + set acc_buff_size [lindex $argv 21] + set out_buff_size [lindex $argv 22] } else { set src_dir "../src" set sim_dir "../sim" set test_dir "../../src/test" set include_dir "../../include" + set mode "all" + set debug "false" + set no_dsp "true" + set no_alu "false" set target_period 10 set inp_width 3 set wgt_width 3 set acc_width 5 set out_width 3 set batch 1 - set block_out 4 set block_in 4 + set block_out 4 set uop_buff_size 15 set inp_buff_size 15 set wgt_buff_size 15 set acc_buff_size 17 set out_buff_size 15 - set mode "all" - set no_dsp "true" - set no_alu "false" + exit } # Initializes the HLS design and sets HLS pragmas for memory partitioning. @@ -124,12 +128,15 @@ proc init_design {per inp_width wgt_width out_width batch block_in block_out} { # C define flags to pass to compiler set cflags "-I $include_dir -I $src_dir -I $test_dir \ - -DVTA_DEBUG=0 -DVTA_LOG_WGT_WIDTH=$wgt_width -DVTA_LOG_INP_WIDTH=$inp_width \ + -DVTA_LOG_WGT_WIDTH=$wgt_width -DVTA_LOG_INP_WIDTH=$inp_width \ -DVTA_LOG_ACC_WIDTH=$acc_width -DVTA_LOG_OUT_WIDTH=$out_width \ -DVTA_LOG_BATCH=$batch -DVTA_LOG_BLOCK_OUT=$block_out -DVTA_LOG_BLOCK_IN=$block_in \ -DVTA_LOG_UOP_BUFF_SIZE=$uop_buff_size -DVTA_LOG_INP_BUFF_SIZE=$inp_buff_size \ -DVTA_LOG_WGT_BUFF_SIZE=$wgt_buff_size -DVTA_LOG_ACC_BUFF_SIZE=$acc_buff_size \ -DVTA_LOG_OUT_BUFF_SIZE=$out_buff_size" +if {$debug=="true"} { + append cflags " -DVTA_DEBUG=1" +} if {$no_dsp=="true"} { append cflags " -DNO_DSP" } diff --git a/vta/hardware/xilinx/scripts/vivado.tcl b/vta/hardware/xilinx/scripts/vivado.tcl index bdfd24c0..58a1f500 100644 --- a/vta/hardware/xilinx/scripts/vivado.tcl +++ b/vta/hardware/xilinx/scripts/vivado.tcl @@ -26,15 +26,15 @@ if { [llength $argv] eq 12 } { set ip_path [lindex $argv 0] set num_threads [lindex $argv 1] set clock_freq [lindex $argv 2] - set inp_width [lindex $argv 3] - set wgt_width [lindex $argv 4] - set out_width [lindex $argv 5] - set batch [lindex $argv 6] - set out_block [lindex $argv 7] - set in_block [lindex $argv 8] - set inp_mem_size [lindex $argv 9] - set wgt_mem_size [lindex $argv 10] - set out_mem_size [lindex $argv 11] + set inp_width [expr 1 << [lindex $argv 3]] + set wgt_width [expr 1 << [lindex $argv 4]] + set out_width [expr 1 << [lindex $argv 5]] + set batch [expr 1 << [lindex $argv 6]] + set out_block [expr 1 << [lindex $argv 7]] + set in_block [expr 1 << [lindex $argv 8]] + set inp_mem_size [expr 1 << [lindex $argv 9]] + set wgt_mem_size [expr 1 << [lindex $argv 10]] + set out_mem_size [expr 1 << [lindex $argv 11]] if {$clock_freq eq 100} { set clock_id 0 puts "Setting clock frequency to 100MHz" diff --git a/vta/hardware/xilinx/sim/vta_test.cc b/vta/hardware/xilinx/sim/vta_test.cc index 79f4a1f6..e1c28834 100644 --- a/vta/hardware/xilinx/sim/vta_test.cc +++ b/vta/hardware/xilinx/sim/vta_test.cc @@ -53,5 +53,8 @@ int main(void) { status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 1); status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 1); + // Simple GEMM unit test + status |= gemm_test(64, 64, 64, true); + return status; } diff --git a/vta/make/config.json b/vta/make/config.json index 8a7dbcf6..c64473e7 100644 --- a/vta/make/config.json +++ b/vta/make/config.json @@ -7,8 +7,8 @@ "LOG_BATCH" : 0, "LOG_BLOCK_IN" : 4, "LOG_BLOCK_OUT" : 4, - "LOG_UOP_BUFF_SIZE" : 15, + "LOG_UOP_BUFF_SIZE" : 14, "LOG_INP_BUFF_SIZE" : 15, - "LOG_WGT_BUFF_SIZE" : 15, + "LOG_WGT_BUFF_SIZE" : 18, "LOG_ACC_BUFF_SIZE" : 17 } diff --git a/vta/make/sim_sample.json b/vta/make/sim_sample.json index 57aa89ad..747e22fc 100644 --- a/vta/make/sim_sample.json +++ b/vta/make/sim_sample.json @@ -7,8 +7,8 @@ "LOG_BATCH" : 0, "LOG_BLOCK_IN" : 4, "LOG_BLOCK_OUT" : 4, - "LOG_UOP_BUFF_SIZE" : 15, + "LOG_UOP_BUFF_SIZE" : 14, "LOG_INP_BUFF_SIZE" : 15, - "LOG_WGT_BUFF_SIZE" : 15, + "LOG_WGT_BUFF_SIZE" : 18, "LOG_ACC_BUFF_SIZE" : 17 } diff --git a/vta/make/vta_config.py b/vta/make/vta_config.py index a25a9ca8..c64a11f6 100644 --- a/vta/make/vta_config.py +++ b/vta/make/vta_config.py @@ -28,6 +28,32 @@ def main(): help="print all the config json") parser.add_argument("--target", action="store_true", help="print the target") + parser.add_argument("--cfg-str", action="store_true", + help="print the configuration string") + parser.add_argument("--get-inpwidth", action="store_true", + help="returns log of input bitwidth") + parser.add_argument("--get-wgtwidth", action="store_true", + help="returns log of weight bitwidth") + parser.add_argument("--get-accwidth", action="store_true", + help="returns log of accum bitwidth") + parser.add_argument("--get-outwidth", action="store_true", + help="returns log of output bitwidth") + parser.add_argument("--get-batch", action="store_true", + help="returns log of tensor batch dimension") + parser.add_argument("--get-blockin", action="store_true", + help="returns log of tensor block in dimension") + parser.add_argument("--get-blockout", action="store_true", + help="returns log of tensor block out dimension") + parser.add_argument("--get-uopbuffsize", action="store_true", + help="returns log of micro-op buffer size in B") + parser.add_argument("--get-inpbuffsize", action="store_true", + help="returns log of input buffer size in B") + parser.add_argument("--get-wgtbuffsize", action="store_true", + help="returns log of weight buffer size in B") + parser.add_argument("--get-accbuffsize", action="store_true", + help="returns log of accum buffer size in B") + parser.add_argument("--get-outbuffsize", action="store_true", + help="returns log of output buffer size in B") args = parser.parse_args() if len(sys.argv) == 1: @@ -46,13 +72,17 @@ def main(): raise RuntimeError("Cannot find config in %s" % str(path_list)) cfg = json.load(open(ok_path_list[0])) cfg["LOG_OUT_WIDTH"] = cfg["LOG_INP_WIDTH"] + cfg["LOG_OUT_BUFF_SIZE"] = cfg["LOG_ACC_BUFF_SIZE"] + cfg["LOG_ACC_WIDTH"] - cfg["LOG_OUT_WIDTH"] pkg = get_pkg_config(cfg) if args.target: print(pkg.target) if args.cflags: - print(" ".join(pkg.cflags)) + cflags_str = " ".join(pkg.cflags) + if cfg["TARGET"] == "pynq": + cflags_str += " -DVTA_TARGET_PYNQ" + print(cflags_str) if args.ldflags: print(" ".join(pkg.ldflags)) @@ -60,6 +90,54 @@ def main(): if args.cfg_json: print(pkg.cfg_json) + if args.cfg_str: + cfg_str = "{}x{}x{}_{}bx{}b_{}_{}_{}_{}".format( + (1 << cfg["LOG_BATCH"]), + (1 << cfg["LOG_BLOCK_IN"]), + (1 << cfg["LOG_BLOCK_OUT"]), + (1 << cfg["LOG_INP_WIDTH"]), + (1 << cfg["LOG_WGT_WIDTH"]), + cfg["LOG_UOP_BUFF_SIZE"], + cfg["LOG_INP_BUFF_SIZE"], + cfg["LOG_WGT_BUFF_SIZE"], + cfg["LOG_ACC_BUFF_SIZE"]) + print cfg_str + + if args.get_inpwidth: + print(cfg["LOG_INP_WIDTH"]) + + if args.get_wgtwidth: + print(cfg["LOG_WGT_WIDTH"]) + + if args.get_accwidth: + print(cfg["LOG_ACC_WIDTH"]) + + if args.get_outwidth: + print(cfg["LOG_OUT_WIDTH"]) + + if args.get_batch: + print(cfg["LOG_BATCH"]) + + if args.get_blockin: + print(cfg["LOG_BLOCK_IN"]) + + if args.get_blockout: + print(cfg["LOG_BLOCK_OUT"]) + + if args.get_uopbuffsize: + print(cfg["LOG_UOP_BUFF_SIZE"]) + + if args.get_inpbuffsize: + print(cfg["LOG_INP_BUFF_SIZE"]) + + if args.get_wgtbuffsize: + print(cfg["LOG_WGT_BUFF_SIZE"]) + + if args.get_outbuffsize: + print(cfg["LOG_OUT_BUFF_SIZE"]) + + if args.get_accbuffsize: + print(cfg["LOG_ACC_BUFF_SIZE"]) if __name__ == "__main__": main() diff --git a/vta/python/vta/environment.py b/vta/python/vta/environment.py index 494e27c8..feaacbe4 100644 --- a/vta/python/vta/environment.py +++ b/vta/python/vta/environment.py @@ -130,11 +130,15 @@ class Environment(object): self.BLOCK_IN * self.WGT_WIDTH) self.ACC_ELEM_BITS = (self.BATCH * - self.BLOCK_IN * + self.BLOCK_OUT * self.ACC_WIDTH) + self.OUT_ELEM_BITS = (self.BATCH * + self.BLOCK_OUT * + self.OUT_WIDTH) self.INP_ELEM_BYTES = self.INP_ELEM_BITS // 8 self.WGT_ELEM_BYTES = self.WGT_ELEM_BITS // 8 self.ACC_ELEM_BYTES = self.ACC_ELEM_BITS // 8 + self.OUT_ELEM_BYTES = self.OUT_ELEM_BITS // 8 # dtypes self.acc_dtype = "int%d" % self.ACC_WIDTH self.inp_dtype = "int%d" % self.INP_WIDTH diff --git a/vta/python/vta/ir_pass.py b/vta/python/vta/ir_pass.py index ff18bf5b..07000e2c 100644 --- a/vta/python/vta/ir_pass.py +++ b/vta/python/vta/ir_pass.py @@ -339,7 +339,7 @@ def inject_dma_intrin(stmt_in): base = 0 for i in range(1, ndim + 1): if not util.equal_const_int(buf.strides[ndim - i] - x_size, 0): - raise RuntimeError("scope %s need need to have block=%d" % (scope, elem_block)) + raise RuntimeError("scope %s needs to have block=%d" % (scope, elem_block)) x_size = x_size * buf.shape[ndim - i] if util.equal_const_int(x_size - elem_block, 0): base = i + 1 @@ -469,10 +469,10 @@ def inject_dma_intrin(stmt_in): if pad_before or pad_after: raise RuntimeError("Do not support copy into DRAM with pad") if src.scope == env.acc_scope: - elem_width = env.INP_WIDTH # output compression to inp type - elem_bytes = env.INP_ELEM_BYTES # output compression to inp type + elem_width = env.OUT_WIDTH + elem_bytes = env.OUT_ELEM_BYTES mem_type = env.dev.MEM_ID_OUT - data_type = "int%d" % env.INP_WIDTH + data_type = "int%d" % env.OUT_WIDTH task_qid = env.dev.QID_STORE_OUT else: raise RuntimeError("Do not support copy %s->dram" % (src.scope)) diff --git a/vta/tests/hardware/common/test_lib.cc b/vta/tests/hardware/common/test_lib.cc index ab4e8fcc..1cb84576 100644 --- a/vta/tests/hardware/common/test_lib.cc +++ b/vta/tests/hardware/common/test_lib.cc @@ -1,11 +1,114 @@ /*! * Copyright (c) 2018 by Contributors - * \file vta_test_lib.cpp + * \file test_lib.cpp * \brief Test library for the VTA design simulation and driver tests. */ #include "./test_lib.h" +#ifdef NO_SIM +#ifdef VTA_TARGET_PYNQ + +uint64_t vta( + uint32_t insn_count, + VTAGenericInsn *insns, + VTAUop *uops, + inp_T *inputs, + wgt_T *weights, + acc_T *biases, + inp_T *outputs) { + // Performance counter variables + uint64_t t_fpga; + struct timespec start, stop; + + // Derive bitstream file + char bitstream[128]; + char str_batch_size[4]; + char str_block_out_size[4]; + char str_block_in_size[4]; + char str_block_bit_width[4]; + snprintf(str_batch_size, sizeof(str_batch_size), "%d", VTA_BATCH); + snprintf(str_block_out_size, sizeof(str_block_out_size), "%d", VTA_BLOCK_OUT); + snprintf(str_block_in_size, sizeof(str_block_in_size), "%d", VTA_BLOCK_IN); + snprintf(str_block_bit_width, sizeof(str_block_bit_width), "%d", VTA_WGT_WIDTH); + snprintf(bitstream, sizeof(bitstream), "%s", "vta.bit"); + +#if VTA_DEBUG == 1 + printf("INFO - Programming FPGA: %s!\n", bitstream); +#endif + + // Program VTA + VTAProgram(bitstream); + // Get VTA handles + void* vta_fetch_handle = VTAMapRegister(VTA_FETCH_ADDR, VTA_RANGE); + void* vta_load_handle = VTAMapRegister(VTA_LOAD_ADDR, VTA_RANGE); + void* vta_compute_handle = VTAMapRegister(VTA_COMPUTE_ADDR, VTA_RANGE); + void* vta_store_handle = VTAMapRegister(VTA_STORE_ADDR, VTA_RANGE); + + // Physical address pointers + uint32_t insn_phy = insns ? cma_get_phy_addr(insns) : 0; + uint32_t uop_phy = uops ? cma_get_phy_addr(uops) : 0; + uint32_t input_phy = inputs ? cma_get_phy_addr(inputs) : 0; + uint32_t weight_phy = weights ? cma_get_phy_addr(weights) : 0; + uint32_t bias_phy = biases ? cma_get_phy_addr(biases) : 0; + uint32_t output_phy = outputs ? cma_get_phy_addr(outputs) : 0; + +#if VTA_DEBUG == 1 + printf("INFO - Starting FPGA!\n"); +#endif + + clock_gettime(CLOCK_REALTIME, &start); + + // FETCH @ 0x10 : Data signal of insn_count_V + VTAWriteMappedReg(vta_fetch_handle, 0x10, insn_count); + // FETCH @ 0x18 : Data signal of insns_V + if (insns) VTAWriteMappedReg(vta_fetch_handle, 0x18, insn_phy); + // LOAD @ 0x10 : Data signal of inputs_V + if (inputs) VTAWriteMappedReg(vta_load_handle, 0x10, input_phy); + // LOAD @ 0x18 : Data signal of weight_V + if (weights) VTAWriteMappedReg(vta_load_handle, 0x18, weight_phy); + // COMPUTE @ 0x20 : Data signal of uops_V + if (uops) VTAWriteMappedReg(vta_compute_handle, 0x20, uop_phy); + // COMPUTE @ 0x28 : Data signal of biases_V + if (biases) VTAWriteMappedReg(vta_compute_handle, 0x28, bias_phy); + // STORE @ 0x10 : Data signal of outputs_V + if (outputs) VTAWriteMappedReg(vta_store_handle, 0x10, output_phy); + + // VTA start + VTAWriteMappedReg(vta_fetch_handle, 0x0, 0x1); + VTAWriteMappedReg(vta_load_handle, 0x0, 0x81); + VTAWriteMappedReg(vta_compute_handle, 0x0, 0x81); + VTAWriteMappedReg(vta_store_handle, 0x0, 0x81); + + int flag = 0, t = 0; + for (t = 0; t < 10000000; ++t) { + flag = VTAReadMappedReg(vta_compute_handle, 0x18); + if (flag & VTA_DONE) break; + } + + if (t == 10000000) { + printf("\tWARNING: VTA TIMEOUT!!!!\n"); +#if VTA_DEBUG == 1 + } else { + printf("INFO - FPGA Finished!\n"); +#endif + } + + clock_gettime(CLOCK_REALTIME, &stop); + t_fpga = 1000000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec); + + // Unmap VTA register + VTAUnmapRegister(vta_fetch_handle, VTA_RANGE); + VTAUnmapRegister(vta_load_handle, VTA_RANGE); + VTAUnmapRegister(vta_compute_handle, VTA_RANGE); + VTAUnmapRegister(vta_store_handle, VTA_RANGE); + + return t_fpga; +} + +#endif // VTA_TARGET_PYNQ +#endif // NO_SIM + uint32_t globalSeed; const char* getOpcodeString(int opcode, bool use_imm) { @@ -1122,3 +1225,232 @@ int blocked_gemm_test(int batch, int channels, int block, bool uop_compression, return -1; } } + + +int gemm_test(int batch, int in_channels, int out_channels, bool uop_compression) { + // Some assertions + assert(batch % VTA_BATCH == 0); + assert(in_channels % VTA_BLOCK_IN == 0); + assert(out_channels % VTA_BLOCK_OUT == 0); + + printf("=====================================================================================\n"); + printf("INFO - Blocked GEMM test: batch=%d, in_channels=%d, out_channels=%d, uop_comp=%d\n", + batch, in_channels, out_channels, uop_compression); + + // Derive number of elements that need to be loaded/stored + int ins_size = 7; + int uop_size = uop_compression ? + batch / VTA_BATCH : + batch / VTA_BATCH * in_channels / VTA_BLOCK_IN * out_channels / VTA_BLOCK_OUT; + int inp_size = batch / VTA_BATCH * in_channels / VTA_BLOCK_IN; + int wgt_size = in_channels / VTA_BLOCK_IN * out_channels / VTA_BLOCK_OUT; + int out_size = batch / VTA_BATCH * out_channels / VTA_BLOCK_OUT; + // Make sure we don't exceed buffer bounds + assert(uop_size <= VTA_UOP_BUFF_DEPTH); + assert(inp_size <= VTA_INP_BUFF_DEPTH); + assert(wgt_size <= VTA_WGT_BUFF_DEPTH); + assert(out_size <= VTA_ACC_BUFF_DEPTH); + + // Initialize instruction buffer + VTAGenericInsn *insn_buf = + static_cast(allocBuffer(sizeof(VTAGenericInsn) * ins_size)); + int insn_idx = 0; + + // Load uops + insn_buf[insn_idx++] = get1DLoadStoreInsn( + VTA_OPCODE_LOAD, + VTA_MEM_ID_UOP, + 0, + 0, + uop_size, + 0, + 0, + 0, + 0); + // Load bias + insn_buf[insn_idx++] = get1DLoadStoreInsn( + VTA_OPCODE_LOAD, // opcode + VTA_MEM_ID_ACC, // type + 0, // sram offset + 0, // dram offset + out_size, // size + 0, // pop prev dep + 0, // pop next dep + 1, // push prev dep + 0); // push next dep + // Load weight block (pop next) + insn_buf[insn_idx++] = get1DLoadStoreInsn( + VTA_OPCODE_LOAD, // opcode + VTA_MEM_ID_WGT, // type + 0, // sram offset + 0, // dram offset + wgt_size, // size + 0, // pop prev dep + 1, // pop next dep + 0, // push prev dep + 0); // push next dep + // Load input block (push next) + insn_buf[insn_idx++] = get1DLoadStoreInsn( + VTA_OPCODE_LOAD, // opcode + VTA_MEM_ID_INP, // type + 0, // sram offset + 0, // dram offset + inp_size, // size + 0, // pop prev dep + 0, // pop next dep + 0, // push prev dep + 1); // push next dep + // Perform GEMM (pop prev, push prev if not last, push next if last) + insn_buf[insn_idx++] = getGEMMInsn( + 0, // uop offset + batch / VTA_BATCH, // batch + in_channels / VTA_BLOCK_IN, // in_channels + out_channels / VTA_BLOCK_OUT, // out_channels + uop_compression, // uop_compression + 1, // pop_prev_dep + 0, // pop_next_dep + 0, // push prev dep + 1); // push_next_dep + // Store output block (pop prev, push prev if not last) + insn_buf[insn_idx++] = get1DLoadStoreInsn( + VTA_OPCODE_STORE, // opcode + VTA_MEM_ID_OUT, // type + 0, // sram offset + 0, // dram offset + out_size, // size + 1, // pop prev dep + 0, // pop next dep + 1, // push prev dep + 0); // push next dep + // Finish + insn_buf[insn_idx++] = getFinishInsn(0, 1); + + // Prepare the uop buffer + VTAUop * uop_buf = getGEMMUops( + batch / VTA_BATCH, + in_channels / VTA_BLOCK_IN, + out_channels / VTA_BLOCK_OUT, + uop_compression, + 0); + +#if VTA_DEBUG == 1 + printInstruction(ins_size, insn_buf); + printMicroOp(uop_size, uop_buf); +#endif + + // Initialize inputs + inp_T **inputs = allocInit2dArray(batch, in_channels); + // Initialize weights + wgt_T **weights = allocInit2dArray(out_channels, in_channels); + // Initialize biases + acc_T **biases = allocInit2dArray(batch, out_channels); + + // Reference GEMM implementation + out_T **outputs_ref = alloc2dArray(batch, out_channels); + for (int i = 0; i < batch; i++) { + for (int j = 0; j < out_channels; j++) { + acc_T sum = biases[i][j]; + for (int k = 0; k < in_channels; k++) { + sum += (acc_T) (inputs[i][k] * weights[j][k]); + } + // Set + outputs_ref[i][j] = (out_T) sum; + } + } + + // Prepare the input buffer + inp_T *input_buf = static_cast(allocBuffer(VTA_INP_ELEM_BYTES * inp_size)); + packBuffer(input_buf, + inputs, + batch, + in_channels, + VTA_BATCH, + VTA_BLOCK_IN); + // Prepare the weight buffer + wgt_T *weight_buf = static_cast(allocBuffer(VTA_WGT_ELEM_BYTES * wgt_size)); + packBuffer(weight_buf, + weights, + out_channels, + in_channels, + VTA_BLOCK_OUT, + VTA_BLOCK_IN); + // Prepare the bias buffer + acc_T *bias_buf = static_cast(allocBuffer(VTA_ACC_ELEM_BYTES * out_size)); + packBuffer(bias_buf, + biases, + batch, + out_channels, + VTA_BATCH, + VTA_BLOCK_OUT); + // Prepare the output buffer + out_T *output_buf = static_cast(allocBuffer(VTA_INP_ELEM_BYTES * out_size)); + +#ifdef NO_SIM + // Invoke the VTA + uint64_t t_fpga = vta(ins_size, + insn_buf, + uop_buf, + input_buf, + weight_buf, + bias_buf, + output_buf); + // Report on timining + printf("INFO - Synchronization time: %.3lfms\n", static_cast(t_fpga) / 1E6); + printf("INFO - Throughput: %.3lfGOPs/s\n", + static_cast(batch) * in_channels * out_channels * 2 / t_fpga); +#else + // Invoke the VTA + vta(ins_size, + (volatile insn_T *) insn_buf, + (volatile uop_T *) uop_buf, + (volatile inp_vec_T *) input_buf, + (volatile wgt_vec_T *) weight_buf, + (volatile acc_vec_T *) bias_buf, + (volatile out_vec_T *) output_buf); +#endif + + // Unpack output data + out_T **outputs = alloc2dArray(batch, out_channels); + unpackBuffer(outputs, + output_buf, + batch, + out_channels, + VTA_BATCH, + VTA_BLOCK_OUT); + + // Correctness checks + int err = 0; + for (int i = 0; i < batch; i++) { + for (int j = 0; j < out_channels; j++) { + if (outputs_ref[i][j] != outputs[i][j]) { + err++; +#if VTA_DEBUG == 1 + printf("DEBUG - %d, %d: expected 0x%x but got 0x%x\n", i, j, + static_cast(outputs_ref[i][j]), + static_cast(outputs[i][j])); +#endif + } + } + } + + // Free all allocated arrays + free2dArray(inputs, batch, in_channels); + free2dArray(weights, out_channels, in_channels); + free2dArray(biases, batch, out_channels); + free2dArray(outputs_ref, batch, out_channels); + free2dArray(outputs, batch, out_channels); + freeBuffer(insn_buf); + freeBuffer(uop_buf); + freeBuffer(input_buf); + freeBuffer(weight_buf); + freeBuffer(bias_buf); + freeBuffer(output_buf); + + if (err == 0) { + printf("INFO - Blocked GEMM test successful!\n"); + return 0; + } else { + printf("INFO - Blocked GEMM test failed, got %d errors!\n", err); + return -1; + } +} \ No newline at end of file diff --git a/vta/tests/hardware/common/test_lib.h b/vta/tests/hardware/common/test_lib.h index 458ff713..0e8f30df 100644 --- a/vta/tests/hardware/common/test_lib.h +++ b/vta/tests/hardware/common/test_lib.h @@ -1,6 +1,6 @@ /*! * Copyright (c) 2018 by Contributors - * \file vta_test_lib.cpp + * \file test_lib.cpp * \brief Test library for the VTA design simulation and driver tests. */ @@ -17,9 +17,9 @@ #include -#ifdef VTA_PYNQ_TARGET +#ifdef VTA_TARGET_PYNQ #include "../../../src/pynq/pynq_driver.h" -#endif // VTA_PYNQ_TARGET +#endif // VTA_TARGET_PYNQ typedef uint64_t axi_T; typedef uint32_t uop_T; @@ -300,4 +300,14 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp int blocked_gemm_test(int batch, int channels, int block, bool uop_compression, int virtual_threads); +/*! +* \brief VTA GEMM unit test. +* \param batch Batch size. +* \param in_channels Input channels. +* \param out_channels Output channels. +* \param uop_compression Apply micro-op compression. +* \return Number of errors from the test run. +*/ +int gemm_test(int batch, int in_channels, int out_channels, bool uop_compression); + #endif // TESTS_HARDWARE_COMMON_TEST_LIB_H_ diff --git a/vta/tests/hardware/pynq/Makefile b/vta/tests/hardware/pynq/Makefile index dabf55e2..7a862e22 100644 --- a/vta/tests/hardware/pynq/Makefile +++ b/vta/tests/hardware/pynq/Makefile @@ -1,7 +1,7 @@ CC ?= g++ CFLAGS = -Wall -O3 -std=c++11 -I/usr/include LDFLAGS = -L/usr/lib -L/opt/python3.6/lib/python3.6/site-packages/pynq/lib/ -LIBS = -l:libsds_lib.so -l:libdma.so +LIBS = -l:libsds_lib.so -l:libdma.so -lstdc++ INCLUDE_DIR = ../../../include DRIVER_DIR = ../../../src/pynq TESTLIB_DIR = ../common @@ -10,19 +10,14 @@ SOURCES = pynq_driver.cc test_lib.cc OBJECTS = pynq_driver.o test_lib.o metal_test.o EXECUTABLE = vta -# Include top-level config file -ifndef config -ifneq ("$(wildcard ../../../config.mk)", "") - config = ../../../config.mk -else - config = ../../../make/config.mk -endif -endif -include $(config) +# Include VTA config +VTA_CONFIG = python ../../../make/vta_config.py +CFLAGS += `${VTA_CONFIG} --cflags` +LDFLAGS += `${VTA_CONFIG} --ldflags` +VTA_TARGET := $(shell ${VTA_CONFIG} --target) # Define flags -CFLAGS += -I $(INCLUDE_DIR) -DNO_SIM -DDEBUG=0 -CFLAGS += $(ADD_CFLAGS) +CFLAGS += -I $(INCLUDE_DIR) -DNO_SIM -DVTA_DEBUG=0 # All Target all: $(EXECUTABLE) diff --git a/vta/tests/hardware/pynq/metal_test.cc b/vta/tests/hardware/pynq/metal_test.cc index 01e73f46..56be244b 100644 --- a/vta/tests/hardware/pynq/metal_test.cc +++ b/vta/tests/hardware/pynq/metal_test.cc @@ -1,6 +1,6 @@ /*! * Copyright (c) 2018 by Contributors - * \file driver_test.cpp + * \file metal_test.cpp * \brief Bare-metal test to test driver and VTA design. */ @@ -13,104 +13,6 @@ #include "../../../src/pynq/pynq_driver.h" #include "../common/test_lib.h" -// VTA invocation (present the same abstraction as in the simulation tests) -uint64_t vta( - uint32_t insn_count, - VTAGenericInsn *insns, - VTAUop *uops, - inp_T *inputs, - wgt_T *weights, - acc_T *biases, - inp_T *outputs) { - // Performance counter variables - uint64_t t_fpga; - struct timespec start, stop; - - // Derive bitstream file - char bitstream[128]; - char str_batch_size[4]; - char str_block_out_size[4]; - char str_block_in_size[4]; - char str_block_bit_width[4]; - snprintf(str_batch_size, sizeof(str_batch_size), "%d", VTA_BATCH); - snprintf(str_block_out_size, sizeof(str_block_out_size), "%d", VTA_BLOCK_OUT); - snprintf(str_block_in_size, sizeof(str_block_in_size), "%d", VTA_BLOCK_IN); - snprintf(str_block_bit_width, sizeof(str_block_bit_width), "%d", VTA_WGT_WIDTH); - snprintf(bitstream, sizeof(bitstream), "%s", "vta.bit"); - -#if VTA_DEBUG == 1 - printf("INFO - Programming FPGA: %s!\n", bitstream); -#endif - - // Program VTA - VTAProgram(bitstream); - // Get VTA handles - VTAHandle vta_fetch_handle = VTAMapRegister(VTA_FETCH_ADDR, VTA_RANGE); - VTAHandle vta_load_handle = VTAMapRegister(VTA_LOAD_ADDR, VTA_RANGE); - VTAHandle vta_compute_handle = VTAMapRegister(VTA_COMPUTE_ADDR, VTA_RANGE); - VTAHandle vta_store_handle = VTAMapRegister(VTA_STORE_ADDR, VTA_RANGE); - - // Physical address pointers - uint32_t insn_phy = insns ? cma_get_phy_addr(insns) : 0; - uint32_t uop_phy = uops ? cma_get_phy_addr(uops) : 0; - uint32_t input_phy = inputs ? cma_get_phy_addr(inputs) : 0; - uint32_t weight_phy = weights ? cma_get_phy_addr(weights) : 0; - uint32_t bias_phy = biases ? cma_get_phy_addr(biases) : 0; - uint32_t output_phy = outputs ? cma_get_phy_addr(outputs) : 0; - -#if VTA_DEBUG == 1 - printf("INFO - Starting FPGA!\n"); -#endif - - clock_gettime(CLOCK_REALTIME, &start); - - // FETCH @ 0x10 : Data signal of insn_count_V - VTAWriteMappedReg(vta_fetch_handle, 0x10, insn_count); - // FETCH @ 0x18 : Data signal of insns_V - if (insns) VTAWriteMappedReg(vta_fetch_handle, 0x18, insn_phy); - // LOAD @ 0x10 : Data signal of inputs_V - if (inputs) VTAWriteMappedReg(vta_load_handle, 0x10, input_phy); - // LOAD @ 0x18 : Data signal of weight_V - if (weights) VTAWriteMappedReg(vta_load_handle, 0x18, weight_phy); - // COMPUTE @ 0x20 : Data signal of uops_V - if (uops) VTAWriteMappedReg(vta_compute_handle, 0x20, uop_phy); - // COMPUTE @ 0x28 : Data signal of biases_V - if (biases) VTAWriteMappedReg(vta_compute_handle, 0x28, bias_phy); - // STORE @ 0x10 : Data signal of outputs_V - if (outputs) VTAWriteMappedReg(vta_store_handle, 0x10, output_phy); - - // VTA start - VTAWriteMappedReg(vta_fetch_handle, 0x0, 0x1); - VTAWriteMappedReg(vta_load_handle, 0x0, 0x81); - VTAWriteMappedReg(vta_compute_handle, 0x0, 0x81); - VTAWriteMappedReg(vta_store_handle, 0x0, 0x81); - - int flag = 0, t = 0; - for (t = 0; t < 10000000; ++t) { - flag = VTAReadMappedReg(vta_compute_handle, 0x18); - if (flag & VTA_DONE) break; - } - - if (t == 10000000) { - printf("\tWARNING: VTA TIMEOUT!!!!\n"); -#if VTA_DEBUG == 1 - } else { - printf("INFO - FPGA Finished!\n"); -#endif - } - - clock_gettime(CLOCK_REALTIME, &stop); - t_fpga = 1000000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec); - - // Unmap VTA register - VTAUnmapRegister(vta_fetch_handle, VTA_RANGE); - VTAUnmapRegister(vta_load_handle, VTA_RANGE); - VTAUnmapRegister(vta_compute_handle, VTA_RANGE); - VTAUnmapRegister(vta_store_handle, VTA_RANGE); - - return t_fpga; -} - int main(void) { #if VTA_DEBUG == 1 printParameters();