[HARDWARE, TEST] Fixed hardware generation flow (#34)
This commit is contained in:
Родитель
9f0e8ffe12
Коммит
dae77cdb4c
|
@ -1,4 +1,4 @@
|
|||
#!/bin/bash
|
||||
export PYTHONPATH=${PYTHONPATH}:/home/xilinx/tvm/python:/home/xilinx/vta/python
|
||||
export PYTHONPATH=${PYTHONPATH}:/home/xilinx/vta/nnvm/tvm/python:/home/xilinx/vta/python
|
||||
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/opt/python3.6/lib/python3.6/site-packages/pynq/drivers/
|
||||
python -m vta.exec.rpc_server
|
||||
|
|
|
@ -13,8 +13,10 @@ VIVADO_HLS = vivado_hls
|
|||
VIVADO = vivado
|
||||
HSI = hsi
|
||||
|
||||
# HLS Mode
|
||||
MODE = all
|
||||
# HLS mode
|
||||
MODE = skip_sim
|
||||
# Debug flag
|
||||
DEBUG = false
|
||||
# SLURM
|
||||
SLURM = false
|
||||
# Prevent generation of DSP
|
||||
|
@ -22,15 +24,26 @@ NO_DSP = false
|
|||
# Prevent generation of ALU
|
||||
NO_ALU = false
|
||||
|
||||
# Include top-level config file
|
||||
ifndef config
|
||||
ifneq ("$(wildcard ../../config.mk)", "")
|
||||
config = ../../config.mk
|
||||
else
|
||||
config = ../../make/config.mk
|
||||
endif
|
||||
endif
|
||||
include $(config)
|
||||
# Process VTA JSON config
|
||||
VTA_CONFIG = python $(CURDIR)/../../make/vta_config.py
|
||||
CFLAGS := $(shell ${VTA_CONFIG} --cflags)
|
||||
VTA_TARGET := $(shell ${VTA_CONFIG} --target)
|
||||
|
||||
#---------------------
|
||||
# VTA Parameters
|
||||
#--------------------
|
||||
VTA_INP_WIDTH := $(shell ${VTA_CONFIG} --get-inpwidth)
|
||||
VTA_WGT_WIDTH := $(shell ${VTA_CONFIG} --get-wgtwidth)
|
||||
VTA_ACC_WIDTH := $(shell ${VTA_CONFIG} --get-accwidth)
|
||||
VTA_OUT_WIDTH := $(shell ${VTA_CONFIG} --get-outwidth)
|
||||
VTA_BATCH := $(shell ${VTA_CONFIG} --get-batch)
|
||||
VTA_IN_BLOCK := $(shell ${VTA_CONFIG} --get-blockin)
|
||||
VTA_OUT_BLOCK := $(shell ${VTA_CONFIG} --get-blockout)
|
||||
VTA_UOP_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-uopbuffsize)
|
||||
VTA_INP_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-inpbuffsize)
|
||||
VTA_WGT_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-wgtbuffsize)
|
||||
VTA_ACC_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-accbuffsize)
|
||||
VTA_OUT_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-outbuffsize)
|
||||
|
||||
#---------------------
|
||||
# Compilation parameters
|
||||
|
@ -50,8 +63,8 @@ TARGET_PER = \
|
|||
$(shell echo "$$(( (1000 + $(VTA_HW_COMP_CLOCK_FREQ) - 1) / $(VTA_HW_COMP_CLOCK_FREQ) - $(VTA_HW_COMP_TIMING_COMP)))" )
|
||||
|
||||
# Derive config name
|
||||
CONF = \
|
||||
$(VTA_BATCH)x$(VTA_IN_BLOCK)x$(VTA_OUT_BLOCK)_$(VTA_INP_WIDTH)bx$(VTA_WGT_WIDTH)b_$(VTA_LOG_UOP_BUFF_SIZE)_$(VTA_LOG_INP_BUFF_SIZE)_$(VTA_LOG_WGT_BUFF_SIZE)_$(VTA_LOG_ACC_BUFF_SIZE)_$(VTA_HW_COMP_CLOCK_FREQ)MHz_$(TARGET_PER)ns
|
||||
CONF_ROOT = $(shell ${VTA_CONFIG} --cfg-str)
|
||||
CONF = $(CONF_ROOT)_$(VTA_HW_COMP_CLOCK_FREQ)MHz_$(TARGET_PER)ns
|
||||
IP_BUILD_PATH = $(BUILD_DIR)/hls/$(CONF)
|
||||
HW_BUILD_PATH = $(BUILD_DIR)/vivado/$(CONF)
|
||||
|
||||
|
@ -60,26 +73,34 @@ ifeq ($(SLURM), true)
|
|||
HW_BUILD_PATH = /scratch/vivado/$(CONF)
|
||||
endif
|
||||
|
||||
.PHONY: all ip bit driver clean clean_all
|
||||
# IP file path
|
||||
IP_PATH = $(BUILD_DIR)/hls/$(CONF)/solution0/impl/ip/xilinx_com_hls_vta_1_0.zip
|
||||
|
||||
all: bit
|
||||
# Bitstream file path
|
||||
BIT_PATH = $(BUILD_DIR)/vivado/$(CONF)/export/$(CONF).bit
|
||||
|
||||
ip:
|
||||
# Targets that are commands rather than files. `cleanall` is listed under the
# name its rule actually uses, so a stray file called `cleanall` cannot mask it.
.PHONY: all ip bit bsp clean cleanall
|
||||
|
||||
all: bsp
|
||||
ip: $(IP_PATH)
|
||||
bit: $(BIT_PATH)
|
||||
|
||||
$(IP_PATH): $(SRC_DIR)/*
|
||||
mkdir -p $(IP_BUILD_PATH)
|
||||
cd $(IP_BUILD_PATH) && \
|
||||
$(VIVADO_HLS) -f $(SCRIPT_DIR)/hls.tcl \
|
||||
-tclargs $(SRC_DIR) $(SIM_DIR) $(TEST_DIR) $(INCLUDE_DIR) $(TARGET_PER) \
|
||||
$(VTA_LOG_INP_WIDTH) $(VTA_LOG_WGT_WIDTH) $(VTA_LOG_ACC_WIDTH) $(VTA_LOG_OUT_WIDTH) \
|
||||
$(VTA_LOG_BATCH) $(VTA_LOG_BLOCK_OUT) $(VTA_LOG_BLOCK_IN) \
|
||||
$(VTA_LOG_UOP_BUFF_SIZE) $(VTA_LOG_INP_BUFF_SIZE) $(VTA_LOG_WGT_BUFF_SIZE) \
|
||||
$(VTA_LOG_ACC_BUFF_SIZE) $(VTA_LOG_OUT_BUFF_SIZE) \
|
||||
$(MODE) $(NO_DSP) $(NO_ALU)
|
||||
-tclargs $(SRC_DIR) $(SIM_DIR) $(TEST_DIR) $(INCLUDE_DIR) \
|
||||
$(MODE) $(DEBUG) $(NO_DSP) $(NO_ALU) $(TARGET_PER) \
|
||||
$(VTA_INP_WIDTH) $(VTA_WGT_WIDTH) $(VTA_ACC_WIDTH) $(VTA_OUT_WIDTH) \
|
||||
$(VTA_BATCH) $(VTA_IN_BLOCK) $(VTA_OUT_BLOCK) \
|
||||
$(VTA_UOP_BUFF_SIZE) $(VTA_INP_BUFF_SIZE) $(VTA_WGT_BUFF_SIZE) \
|
||||
$(VTA_ACC_BUFF_SIZE) $(VTA_OUT_BUFF_SIZE)
|
||||
ifeq ($(SLURM), true)
|
||||
mkdir -p $(BUILD_DIR)/hls
|
||||
mv $(IP_BUILD_PATH) $(BUILD_DIR)/hls/.
|
||||
endif
|
||||
|
||||
bit: ip
|
||||
$(BIT_PATH): $(IP_PATH)
|
||||
mkdir -p $(HW_BUILD_PATH)
|
||||
cd $(HW_BUILD_PATH) && \
|
||||
$(VIVADO) -mode tcl -source $(SCRIPT_DIR)/vivado.tcl \
|
||||
|
@ -92,12 +113,12 @@ ifeq ($(SLURM), true)
|
|||
mv $(HW_BUILD_PATH) $(BUILD_DIR)/vivado/.
|
||||
endif
|
||||
|
||||
driver: bit
|
||||
# Build the board support package once the bitstream is up to date.
# Step 1 runs hsi.tcl from inside $(HW_BUILD_PATH); presumably that script
# emits the bsp/ directory used by step 2 (confirm against hsi.tcl).
# Step 2 builds bsp/ through $(MAKE) rather than a literal `make`, so -n,
# -j/jobserver and command-line variable overrides reach the sub-make.
bsp: $(BIT_PATH)
	cd $(HW_BUILD_PATH) && $(HSI) -mode tcl -source $(SCRIPT_DIR)/hsi.tcl -nojournal -nolog
	cd $(HW_BUILD_PATH)/bsp && $(MAKE)
|
||||
|
||||
clean:
|
||||
rm -rf *.out *.log *.sb figures
|
||||
|
||||
clean_all: clean
|
||||
cleanall: clean
|
||||
rm -rf $(BUILD_DIR)
|
||||
|
|
|
@ -9,65 +9,69 @@
|
|||
# Arg 2: path to sim sources
|
||||
# Arg 3: path to test sources
|
||||
# Arg 4: path to include sources
|
||||
# Arg 5: target clock period
|
||||
# Arg 6: input type width (log)
|
||||
# Arg 7: weight type width (log)
|
||||
# Arg 8: accum type width (log)
|
||||
# Arg 9: output type width (log)
|
||||
# Arg 10: batch size (log)
|
||||
# Arg 11: in block size (log)
|
||||
# Arg 12: out block size (log)
|
||||
# Arg 13: uop buffer size in B (log)
|
||||
# Arg 14: inp buffer size in B (log)
|
||||
# Arg 15: wgt buffer size in B (log)
|
||||
# Arg 16: acc buffer size in B (log)
|
||||
# Arg 17: out buffer size in B (log)
|
||||
# Arg 18: mode
|
||||
# Arg 19: no_dsp
|
||||
# Arg 20: no_alu
|
||||
# Arg 5: mode
|
||||
# Arg 6: debug
|
||||
# Arg 7: no_dsp
|
||||
# Arg 8: no_alu
|
||||
# Arg 9: target clock period
|
||||
# Arg 10: input type width (log)
|
||||
# Arg 11: weight type width (log)
|
||||
# Arg 12: accum type width (log)
|
||||
# Arg 13: output type width (log)
|
||||
# Arg 14: batch size (log)
|
||||
# Arg 15: in block size (log)
|
||||
# Arg 16: out block size (log)
|
||||
# Arg 17: uop buffer size in B (log)
|
||||
# Arg 18: inp buffer size in B (log)
|
||||
# Arg 19: wgt buffer size in B (log)
|
||||
# Arg 20: acc buffer size in B (log)
|
||||
# Arg 21: out buffer size in B (log)
|
||||
|
||||
if { [llength $argv] eq 22 } {
|
||||
if { [llength $argv] eq 23 } {
|
||||
set src_dir [lindex $argv 2]
|
||||
set sim_dir [lindex $argv 3]
|
||||
set test_dir [lindex $argv 4]
|
||||
set include_dir [lindex $argv 5]
|
||||
set target_period [lindex $argv 6]
|
||||
set inp_width [lindex $argv 7]
|
||||
set wgt_width [lindex $argv 8]
|
||||
set acc_width [lindex $argv 9]
|
||||
set out_width [lindex $argv 10]
|
||||
set batch [lindex $argv 11]
|
||||
set block_in [lindex $argv 12]
|
||||
set block_out [lindex $argv 13]
|
||||
set uop_buff_size [lindex $argv 14]
|
||||
set inp_buff_size [lindex $argv 15]
|
||||
set wgt_buff_size [lindex $argv 16]
|
||||
set acc_buff_size [lindex $argv 17]
|
||||
set out_buff_size [lindex $argv 18]
|
||||
set mode [lindex $argv 19]
|
||||
set no_dsp [lindex $argv 20]
|
||||
set no_alu [lindex $argv 21]
|
||||
set mode [lindex $argv 6]
|
||||
set debug [lindex $argv 7]
|
||||
set no_dsp [lindex $argv 8]
|
||||
set no_alu [lindex $argv 9]
|
||||
set target_period [lindex $argv 10]
|
||||
set inp_width [lindex $argv 11]
|
||||
set wgt_width [lindex $argv 12]
|
||||
set acc_width [lindex $argv 13]
|
||||
set out_width [lindex $argv 14]
|
||||
set batch [lindex $argv 15]
|
||||
set block_in [lindex $argv 16]
|
||||
set block_out [lindex $argv 17]
|
||||
set uop_buff_size [lindex $argv 18]
|
||||
set inp_buff_size [lindex $argv 19]
|
||||
set wgt_buff_size [lindex $argv 20]
|
||||
set acc_buff_size [lindex $argv 21]
|
||||
set out_buff_size [lindex $argv 22]
|
||||
} else {
|
||||
set src_dir "../src"
|
||||
set sim_dir "../sim"
|
||||
set test_dir "../../src/test"
|
||||
set include_dir "../../include"
|
||||
set mode "all"
|
||||
set debug "false"
|
||||
set no_dsp "true"
|
||||
set no_alu "false"
|
||||
set target_period 10
|
||||
set inp_width 3
|
||||
set wgt_width 3
|
||||
set acc_width 5
|
||||
set out_width 3
|
||||
set batch 1
|
||||
set block_out 4
|
||||
set block_in 4
|
||||
set block_out 4
|
||||
set uop_buff_size 15
|
||||
set inp_buff_size 15
|
||||
set wgt_buff_size 15
|
||||
set acc_buff_size 17
|
||||
set out_buff_size 15
|
||||
set mode "all"
|
||||
set no_dsp "true"
|
||||
set no_alu "false"
|
||||
exit
|
||||
}
|
||||
|
||||
# Initializes the HLS design and sets HLS pragmas for memory partitioning.
|
||||
|
@ -124,12 +128,15 @@ proc init_design {per inp_width wgt_width out_width batch block_in block_out} {
|
|||
|
||||
# C define flags to pass to compiler
|
||||
set cflags "-I $include_dir -I $src_dir -I $test_dir \
|
||||
-DVTA_DEBUG=0 -DVTA_LOG_WGT_WIDTH=$wgt_width -DVTA_LOG_INP_WIDTH=$inp_width \
|
||||
-DVTA_LOG_WGT_WIDTH=$wgt_width -DVTA_LOG_INP_WIDTH=$inp_width \
|
||||
-DVTA_LOG_ACC_WIDTH=$acc_width -DVTA_LOG_OUT_WIDTH=$out_width \
|
||||
-DVTA_LOG_BATCH=$batch -DVTA_LOG_BLOCK_OUT=$block_out -DVTA_LOG_BLOCK_IN=$block_in \
|
||||
-DVTA_LOG_UOP_BUFF_SIZE=$uop_buff_size -DVTA_LOG_INP_BUFF_SIZE=$inp_buff_size \
|
||||
-DVTA_LOG_WGT_BUFF_SIZE=$wgt_buff_size -DVTA_LOG_ACC_BUFF_SIZE=$acc_buff_size \
|
||||
-DVTA_LOG_OUT_BUFF_SIZE=$out_buff_size"
|
||||
if {$debug=="true"} {
|
||||
append cflags " -DVTA_DEBUG=1"
|
||||
}
|
||||
if {$no_dsp=="true"} {
|
||||
append cflags " -DNO_DSP"
|
||||
}
|
||||
|
|
|
@ -26,15 +26,15 @@ if { [llength $argv] eq 12 } {
|
|||
set ip_path [lindex $argv 0]
|
||||
set num_threads [lindex $argv 1]
|
||||
set clock_freq [lindex $argv 2]
|
||||
set inp_width [lindex $argv 3]
|
||||
set wgt_width [lindex $argv 4]
|
||||
set out_width [lindex $argv 5]
|
||||
set batch [lindex $argv 6]
|
||||
set out_block [lindex $argv 7]
|
||||
set in_block [lindex $argv 8]
|
||||
set inp_mem_size [lindex $argv 9]
|
||||
set wgt_mem_size [lindex $argv 10]
|
||||
set out_mem_size [lindex $argv 11]
|
||||
set inp_width [expr 1 << [lindex $argv 3]]
|
||||
set wgt_width [expr 1 << [lindex $argv 4]]
|
||||
set out_width [expr 1 << [lindex $argv 5]]
|
||||
set batch [expr 1 << [lindex $argv 6]]
|
||||
set out_block [expr 1 << [lindex $argv 7]]
|
||||
set in_block [expr 1 << [lindex $argv 8]]
|
||||
set inp_mem_size [expr 1 << [lindex $argv 9]]
|
||||
set wgt_mem_size [expr 1 << [lindex $argv 10]]
|
||||
set out_mem_size [expr 1 << [lindex $argv 11]]
|
||||
if {$clock_freq eq 100} {
|
||||
set clock_id 0
|
||||
puts "Setting clock frequency to 100MHz"
|
||||
|
|
|
@ -53,5 +53,8 @@ int main(void) {
|
|||
status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 1);
|
||||
status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 1);
|
||||
|
||||
// Simple GEMM unit test
|
||||
status |= gemm_test(64, 64, 64, true);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
|
|
@ -7,8 +7,8 @@
|
|||
"LOG_BATCH" : 0,
|
||||
"LOG_BLOCK_IN" : 4,
|
||||
"LOG_BLOCK_OUT" : 4,
|
||||
"LOG_UOP_BUFF_SIZE" : 15,
|
||||
"LOG_UOP_BUFF_SIZE" : 14,
|
||||
"LOG_INP_BUFF_SIZE" : 15,
|
||||
"LOG_WGT_BUFF_SIZE" : 15,
|
||||
"LOG_WGT_BUFF_SIZE" : 18,
|
||||
"LOG_ACC_BUFF_SIZE" : 17
|
||||
}
|
||||
|
|
|
@ -7,8 +7,8 @@
|
|||
"LOG_BATCH" : 0,
|
||||
"LOG_BLOCK_IN" : 4,
|
||||
"LOG_BLOCK_OUT" : 4,
|
||||
"LOG_UOP_BUFF_SIZE" : 15,
|
||||
"LOG_UOP_BUFF_SIZE" : 14,
|
||||
"LOG_INP_BUFF_SIZE" : 15,
|
||||
"LOG_WGT_BUFF_SIZE" : 15,
|
||||
"LOG_WGT_BUFF_SIZE" : 18,
|
||||
"LOG_ACC_BUFF_SIZE" : 17
|
||||
}
|
||||
|
|
|
@ -28,6 +28,32 @@ def main():
|
|||
help="print all the config json")
|
||||
parser.add_argument("--target", action="store_true",
|
||||
help="print the target")
|
||||
parser.add_argument("--cfg-str", action="store_true",
|
||||
help="print the configuration string")
|
||||
parser.add_argument("--get-inpwidth", action="store_true",
|
||||
help="returns log of input bitwidth")
|
||||
parser.add_argument("--get-wgtwidth", action="store_true",
|
||||
help="returns log of weight bitwidth")
|
||||
parser.add_argument("--get-accwidth", action="store_true",
|
||||
help="returns log of accum bitwidth")
|
||||
parser.add_argument("--get-outwidth", action="store_true",
|
||||
help="returns log of output bitwidth")
|
||||
parser.add_argument("--get-batch", action="store_true",
|
||||
help="returns log of tensor batch dimension")
|
||||
parser.add_argument("--get-blockin", action="store_true",
|
||||
help="returns log of tensor block in dimension")
|
||||
parser.add_argument("--get-blockout", action="store_true",
|
||||
help="returns log of tensor block out dimension")
|
||||
parser.add_argument("--get-uopbuffsize", action="store_true",
|
||||
help="returns log of micro-op buffer size in B")
|
||||
parser.add_argument("--get-inpbuffsize", action="store_true",
|
||||
help="returns log of input buffer size in B")
|
||||
parser.add_argument("--get-wgtbuffsize", action="store_true",
|
||||
help="returns log of weight buffer size in B")
|
||||
parser.add_argument("--get-accbuffsize", action="store_true",
|
||||
help="returns log of accum buffer size in B")
|
||||
parser.add_argument("--get-outbuffsize", action="store_true",
|
||||
help="returns log of output buffer size in B")
|
||||
args = parser.parse_args()
|
||||
|
||||
if len(sys.argv) == 1:
|
||||
|
@ -46,13 +72,17 @@ def main():
|
|||
raise RuntimeError("Cannot find config in %s" % str(path_list))
|
||||
cfg = json.load(open(ok_path_list[0]))
|
||||
cfg["LOG_OUT_WIDTH"] = cfg["LOG_INP_WIDTH"]
|
||||
cfg["LOG_OUT_BUFF_SIZE"] = cfg["LOG_ACC_BUFF_SIZE"] + cfg["LOG_ACC_WIDTH"] - cfg["LOG_OUT_WIDTH"]
|
||||
pkg = get_pkg_config(cfg)
|
||||
|
||||
if args.target:
|
||||
print(pkg.target)
|
||||
|
||||
if args.cflags:
|
||||
print(" ".join(pkg.cflags))
|
||||
cflags_str = " ".join(pkg.cflags)
|
||||
if cfg["TARGET"] == "pynq":
|
||||
cflags_str += " -DVTA_TARGET_PYNQ"
|
||||
print(cflags_str)
|
||||
|
||||
if args.ldflags:
|
||||
print(" ".join(pkg.ldflags))
|
||||
|
@ -60,6 +90,54 @@ def main():
|
|||
if args.cfg_json:
|
||||
print(pkg.cfg_json)
|
||||
|
||||
if args.cfg_str:
    # Configuration string used to name build directories:
    #   <batch>x<block_in>x<block_out>_<inp bits>bx<wgt bits>b_<uop>_<inp>_<wgt>_<acc>
    # Tensor dimensions and bit widths are expanded from their log2 values;
    # the four buffer sizes are printed as stored in the config (log2).
    cfg_str = "{}x{}x{}_{}bx{}b_{}_{}_{}_{}".format(
        (1 << cfg["LOG_BATCH"]),
        (1 << cfg["LOG_BLOCK_IN"]),
        (1 << cfg["LOG_BLOCK_OUT"]),
        (1 << cfg["LOG_INP_WIDTH"]),
        (1 << cfg["LOG_WGT_WIDTH"]),
        cfg["LOG_UOP_BUFF_SIZE"],
        cfg["LOG_INP_BUFF_SIZE"],
        cfg["LOG_WGT_BUFF_SIZE"],
        cfg["LOG_ACC_BUFF_SIZE"])
    # Function-call form: valid on Python 2 and 3, and consistent with every
    # other print in this script (the bare `print x` statement is a
    # SyntaxError on Python 3).
    print(cfg_str)
|
||||
|
||||
if args.get_inpwidth:
|
||||
print(cfg["LOG_INP_WIDTH"])
|
||||
|
||||
if args.get_wgtwidth:
|
||||
print(cfg["LOG_WGT_WIDTH"])
|
||||
|
||||
if args.get_accwidth:
|
||||
print(cfg["LOG_ACC_WIDTH"])
|
||||
|
||||
if args.get_outwidth:
|
||||
print(cfg["LOG_OUT_WIDTH"])
|
||||
|
||||
if args.get_batch:
|
||||
print(cfg["LOG_BATCH"])
|
||||
|
||||
if args.get_blockin:
|
||||
print(cfg["LOG_BLOCK_IN"])
|
||||
|
||||
if args.get_blockout:
|
||||
print(cfg["LOG_BLOCK_OUT"])
|
||||
|
||||
if args.get_uopbuffsize:
|
||||
print(cfg["LOG_UOP_BUFF_SIZE"])
|
||||
|
||||
if args.get_inpbuffsize:
|
||||
print(cfg["LOG_INP_BUFF_SIZE"])
|
||||
|
||||
if args.get_wgtbuffsize:
|
||||
print(cfg["LOG_WGT_BUFF_SIZE"])
|
||||
|
||||
if args.get_outbuffsize:
|
||||
print(cfg["LOG_OUT_BUFF_SIZE"])
|
||||
|
||||
if args.get_accbuffsize:
|
||||
print(cfg["LOG_ACC_BUFF_SIZE"])
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
|
@ -130,11 +130,15 @@ class Environment(object):
|
|||
self.BLOCK_IN *
|
||||
self.WGT_WIDTH)
|
||||
self.ACC_ELEM_BITS = (self.BATCH *
|
||||
self.BLOCK_IN *
|
||||
self.BLOCK_OUT *
|
||||
self.ACC_WIDTH)
|
||||
self.OUT_ELEM_BITS = (self.BATCH *
|
||||
self.BLOCK_OUT *
|
||||
self.OUT_WIDTH)
|
||||
self.INP_ELEM_BYTES = self.INP_ELEM_BITS // 8
|
||||
self.WGT_ELEM_BYTES = self.WGT_ELEM_BITS // 8
|
||||
self.ACC_ELEM_BYTES = self.ACC_ELEM_BITS // 8
|
||||
self.OUT_ELEM_BYTES = self.OUT_ELEM_BITS // 8
|
||||
# dtypes
|
||||
self.acc_dtype = "int%d" % self.ACC_WIDTH
|
||||
self.inp_dtype = "int%d" % self.INP_WIDTH
|
||||
|
|
|
@ -339,7 +339,7 @@ def inject_dma_intrin(stmt_in):
|
|||
base = 0
|
||||
for i in range(1, ndim + 1):
|
||||
if not util.equal_const_int(buf.strides[ndim - i] - x_size, 0):
|
||||
raise RuntimeError("scope %s need need to have block=%d" % (scope, elem_block))
|
||||
raise RuntimeError("scope %s needs to have block=%d" % (scope, elem_block))
|
||||
x_size = x_size * buf.shape[ndim - i]
|
||||
if util.equal_const_int(x_size - elem_block, 0):
|
||||
base = i + 1
|
||||
|
@ -469,10 +469,10 @@ def inject_dma_intrin(stmt_in):
|
|||
if pad_before or pad_after:
|
||||
raise RuntimeError("Do not support copy into DRAM with pad")
|
||||
if src.scope == env.acc_scope:
|
||||
elem_width = env.INP_WIDTH # output compression to inp type
|
||||
elem_bytes = env.INP_ELEM_BYTES # output compression to inp type
|
||||
elem_width = env.OUT_WIDTH
|
||||
elem_bytes = env.OUT_ELEM_BYTES
|
||||
mem_type = env.dev.MEM_ID_OUT
|
||||
data_type = "int%d" % env.INP_WIDTH
|
||||
data_type = "int%d" % env.OUT_WIDTH
|
||||
task_qid = env.dev.QID_STORE_OUT
|
||||
else:
|
||||
raise RuntimeError("Do not support copy %s->dram" % (src.scope))
|
||||
|
|
|
@ -1,11 +1,114 @@
|
|||
/*!
|
||||
* Copyright (c) 2018 by Contributors
|
||||
* \file vta_test_lib.cpp
|
||||
* \file test_lib.cc
|
||||
* \brief Test library for the VTA design simulation and driver tests.
|
||||
*/
|
||||
|
||||
#include "./test_lib.h"
|
||||
|
||||
#ifdef NO_SIM
|
||||
#ifdef VTA_TARGET_PYNQ
|
||||
|
||||
uint64_t vta(
|
||||
uint32_t insn_count,
|
||||
VTAGenericInsn *insns,
|
||||
VTAUop *uops,
|
||||
inp_T *inputs,
|
||||
wgt_T *weights,
|
||||
acc_T *biases,
|
||||
inp_T *outputs) {
|
||||
// Performance counter variables
|
||||
uint64_t t_fpga;
|
||||
struct timespec start, stop;
|
||||
|
||||
// Derive bitstream file
|
||||
char bitstream[128];
|
||||
char str_batch_size[4];
|
||||
char str_block_out_size[4];
|
||||
char str_block_in_size[4];
|
||||
char str_block_bit_width[4];
|
||||
snprintf(str_batch_size, sizeof(str_batch_size), "%d", VTA_BATCH);
|
||||
snprintf(str_block_out_size, sizeof(str_block_out_size), "%d", VTA_BLOCK_OUT);
|
||||
snprintf(str_block_in_size, sizeof(str_block_in_size), "%d", VTA_BLOCK_IN);
|
||||
snprintf(str_block_bit_width, sizeof(str_block_bit_width), "%d", VTA_WGT_WIDTH);
|
||||
snprintf(bitstream, sizeof(bitstream), "%s", "vta.bit");
|
||||
|
||||
#if VTA_DEBUG == 1
|
||||
printf("INFO - Programming FPGA: %s!\n", bitstream);
|
||||
#endif
|
||||
|
||||
// Program VTA
|
||||
VTAProgram(bitstream);
|
||||
// Get VTA handles
|
||||
void* vta_fetch_handle = VTAMapRegister(VTA_FETCH_ADDR, VTA_RANGE);
|
||||
void* vta_load_handle = VTAMapRegister(VTA_LOAD_ADDR, VTA_RANGE);
|
||||
void* vta_compute_handle = VTAMapRegister(VTA_COMPUTE_ADDR, VTA_RANGE);
|
||||
void* vta_store_handle = VTAMapRegister(VTA_STORE_ADDR, VTA_RANGE);
|
||||
|
||||
// Physical address pointers
|
||||
uint32_t insn_phy = insns ? cma_get_phy_addr(insns) : 0;
|
||||
uint32_t uop_phy = uops ? cma_get_phy_addr(uops) : 0;
|
||||
uint32_t input_phy = inputs ? cma_get_phy_addr(inputs) : 0;
|
||||
uint32_t weight_phy = weights ? cma_get_phy_addr(weights) : 0;
|
||||
uint32_t bias_phy = biases ? cma_get_phy_addr(biases) : 0;
|
||||
uint32_t output_phy = outputs ? cma_get_phy_addr(outputs) : 0;
|
||||
|
||||
#if VTA_DEBUG == 1
|
||||
printf("INFO - Starting FPGA!\n");
|
||||
#endif
|
||||
|
||||
clock_gettime(CLOCK_REALTIME, &start);
|
||||
|
||||
// FETCH @ 0x10 : Data signal of insn_count_V
|
||||
VTAWriteMappedReg(vta_fetch_handle, 0x10, insn_count);
|
||||
// FETCH @ 0x18 : Data signal of insns_V
|
||||
if (insns) VTAWriteMappedReg(vta_fetch_handle, 0x18, insn_phy);
|
||||
// LOAD @ 0x10 : Data signal of inputs_V
|
||||
if (inputs) VTAWriteMappedReg(vta_load_handle, 0x10, input_phy);
|
||||
// LOAD @ 0x18 : Data signal of weight_V
|
||||
if (weights) VTAWriteMappedReg(vta_load_handle, 0x18, weight_phy);
|
||||
// COMPUTE @ 0x20 : Data signal of uops_V
|
||||
if (uops) VTAWriteMappedReg(vta_compute_handle, 0x20, uop_phy);
|
||||
// COMPUTE @ 0x28 : Data signal of biases_V
|
||||
if (biases) VTAWriteMappedReg(vta_compute_handle, 0x28, bias_phy);
|
||||
// STORE @ 0x10 : Data signal of outputs_V
|
||||
if (outputs) VTAWriteMappedReg(vta_store_handle, 0x10, output_phy);
|
||||
|
||||
// VTA start
|
||||
VTAWriteMappedReg(vta_fetch_handle, 0x0, 0x1);
|
||||
VTAWriteMappedReg(vta_load_handle, 0x0, 0x81);
|
||||
VTAWriteMappedReg(vta_compute_handle, 0x0, 0x81);
|
||||
VTAWriteMappedReg(vta_store_handle, 0x0, 0x81);
|
||||
|
||||
int flag = 0, t = 0;
|
||||
for (t = 0; t < 10000000; ++t) {
|
||||
flag = VTAReadMappedReg(vta_compute_handle, 0x18);
|
||||
if (flag & VTA_DONE) break;
|
||||
}
|
||||
|
||||
if (t == 10000000) {
|
||||
printf("\tWARNING: VTA TIMEOUT!!!!\n");
|
||||
#if VTA_DEBUG == 1
|
||||
} else {
|
||||
printf("INFO - FPGA Finished!\n");
|
||||
#endif
|
||||
}
|
||||
|
||||
clock_gettime(CLOCK_REALTIME, &stop);
|
||||
t_fpga = 1000000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec);
|
||||
|
||||
// Unmap VTA register
|
||||
VTAUnmapRegister(vta_fetch_handle, VTA_RANGE);
|
||||
VTAUnmapRegister(vta_load_handle, VTA_RANGE);
|
||||
VTAUnmapRegister(vta_compute_handle, VTA_RANGE);
|
||||
VTAUnmapRegister(vta_store_handle, VTA_RANGE);
|
||||
|
||||
return t_fpga;
|
||||
}
|
||||
|
||||
#endif // VTA_TARGET_PYNQ
|
||||
#endif // NO_SIM
|
||||
|
||||
uint32_t globalSeed;
|
||||
|
||||
const char* getOpcodeString(int opcode, bool use_imm) {
|
||||
|
@ -1122,3 +1225,232 @@ int blocked_gemm_test(int batch, int channels, int block, bool uop_compression,
|
|||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int gemm_test(int batch, int in_channels, int out_channels, bool uop_compression) {
|
||||
// Some assertions
|
||||
assert(batch % VTA_BATCH == 0);
|
||||
assert(in_channels % VTA_BLOCK_IN == 0);
|
||||
assert(out_channels % VTA_BLOCK_OUT == 0);
|
||||
|
||||
printf("=====================================================================================\n");
|
||||
printf("INFO - Blocked GEMM test: batch=%d, in_channels=%d, out_channels=%d, uop_comp=%d\n",
|
||||
batch, in_channels, out_channels, uop_compression);
|
||||
|
||||
// Derive number of elements that need to be loaded/stored
|
||||
int ins_size = 7;
|
||||
int uop_size = uop_compression ?
|
||||
batch / VTA_BATCH :
|
||||
batch / VTA_BATCH * in_channels / VTA_BLOCK_IN * out_channels / VTA_BLOCK_OUT;
|
||||
int inp_size = batch / VTA_BATCH * in_channels / VTA_BLOCK_IN;
|
||||
int wgt_size = in_channels / VTA_BLOCK_IN * out_channels / VTA_BLOCK_OUT;
|
||||
int out_size = batch / VTA_BATCH * out_channels / VTA_BLOCK_OUT;
|
||||
// Make sure we don't exceed buffer bounds
|
||||
assert(uop_size <= VTA_UOP_BUFF_DEPTH);
|
||||
assert(inp_size <= VTA_INP_BUFF_DEPTH);
|
||||
assert(wgt_size <= VTA_WGT_BUFF_DEPTH);
|
||||
assert(out_size <= VTA_ACC_BUFF_DEPTH);
|
||||
|
||||
// Initialize instruction buffer
|
||||
VTAGenericInsn *insn_buf =
|
||||
static_cast<VTAGenericInsn *>(allocBuffer(sizeof(VTAGenericInsn) * ins_size));
|
||||
int insn_idx = 0;
|
||||
|
||||
// Load uops
|
||||
insn_buf[insn_idx++] = get1DLoadStoreInsn(
|
||||
VTA_OPCODE_LOAD,
|
||||
VTA_MEM_ID_UOP,
|
||||
0,
|
||||
0,
|
||||
uop_size,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0);
|
||||
// Load bias
|
||||
insn_buf[insn_idx++] = get1DLoadStoreInsn(
|
||||
VTA_OPCODE_LOAD, // opcode
|
||||
VTA_MEM_ID_ACC, // type
|
||||
0, // sram offset
|
||||
0, // dram offset
|
||||
out_size, // size
|
||||
0, // pop prev dep
|
||||
0, // pop next dep
|
||||
1, // push prev dep
|
||||
0); // push next dep
|
||||
// Load weight block (pop next)
|
||||
insn_buf[insn_idx++] = get1DLoadStoreInsn(
|
||||
VTA_OPCODE_LOAD, // opcode
|
||||
VTA_MEM_ID_WGT, // type
|
||||
0, // sram offset
|
||||
0, // dram offset
|
||||
wgt_size, // size
|
||||
0, // pop prev dep
|
||||
1, // pop next dep
|
||||
0, // push prev dep
|
||||
0); // push next dep
|
||||
// Load input block (push next)
|
||||
insn_buf[insn_idx++] = get1DLoadStoreInsn(
|
||||
VTA_OPCODE_LOAD, // opcode
|
||||
VTA_MEM_ID_INP, // type
|
||||
0, // sram offset
|
||||
0, // dram offset
|
||||
inp_size, // size
|
||||
0, // pop prev dep
|
||||
0, // pop next dep
|
||||
0, // push prev dep
|
||||
1); // push next dep
|
||||
// Perform GEMM (pop prev, push prev if not last, push next if last)
|
||||
insn_buf[insn_idx++] = getGEMMInsn(
|
||||
0, // uop offset
|
||||
batch / VTA_BATCH, // batch
|
||||
in_channels / VTA_BLOCK_IN, // in_channels
|
||||
out_channels / VTA_BLOCK_OUT, // out_channels
|
||||
uop_compression, // uop_compression
|
||||
1, // pop_prev_dep
|
||||
0, // pop_next_dep
|
||||
0, // push prev dep
|
||||
1); // push_next_dep
|
||||
// Store output block (pop prev, push prev if not last)
|
||||
insn_buf[insn_idx++] = get1DLoadStoreInsn(
|
||||
VTA_OPCODE_STORE, // opcode
|
||||
VTA_MEM_ID_OUT, // type
|
||||
0, // sram offset
|
||||
0, // dram offset
|
||||
out_size, // size
|
||||
1, // pop prev dep
|
||||
0, // pop next dep
|
||||
1, // push prev dep
|
||||
0); // push next dep
|
||||
// Finish
|
||||
insn_buf[insn_idx++] = getFinishInsn(0, 1);
|
||||
|
||||
// Prepare the uop buffer
|
||||
VTAUop * uop_buf = getGEMMUops(
|
||||
batch / VTA_BATCH,
|
||||
in_channels / VTA_BLOCK_IN,
|
||||
out_channels / VTA_BLOCK_OUT,
|
||||
uop_compression,
|
||||
0);
|
||||
|
||||
#if VTA_DEBUG == 1
|
||||
printInstruction(ins_size, insn_buf);
|
||||
printMicroOp(uop_size, uop_buf);
|
||||
#endif
|
||||
|
||||
// Initialize inputs
|
||||
inp_T **inputs = allocInit2dArray<inp_T, VTA_INP_WIDTH>(batch, in_channels);
|
||||
// Initialize weights
|
||||
wgt_T **weights = allocInit2dArray<wgt_T, VTA_WGT_WIDTH>(out_channels, in_channels);
|
||||
// Initialize biases
|
||||
acc_T **biases = allocInit2dArray<acc_T, VTA_ACC_WIDTH>(batch, out_channels);
|
||||
|
||||
// Reference GEMM implementation
|
||||
out_T **outputs_ref = alloc2dArray<out_T>(batch, out_channels);
|
||||
for (int i = 0; i < batch; i++) {
|
||||
for (int j = 0; j < out_channels; j++) {
|
||||
acc_T sum = biases[i][j];
|
||||
for (int k = 0; k < in_channels; k++) {
|
||||
sum += (acc_T) (inputs[i][k] * weights[j][k]);
|
||||
}
|
||||
// Set
|
||||
outputs_ref[i][j] = (out_T) sum;
|
||||
}
|
||||
}
|
||||
|
||||
// Prepare the input buffer
|
||||
inp_T *input_buf = static_cast<inp_T *>(allocBuffer(VTA_INP_ELEM_BYTES * inp_size));
|
||||
packBuffer<inp_T, VTA_INP_WIDTH>(input_buf,
|
||||
inputs,
|
||||
batch,
|
||||
in_channels,
|
||||
VTA_BATCH,
|
||||
VTA_BLOCK_IN);
|
||||
// Prepare the weight buffer
|
||||
wgt_T *weight_buf = static_cast<wgt_T *>(allocBuffer(VTA_WGT_ELEM_BYTES * wgt_size));
|
||||
packBuffer<wgt_T, VTA_WGT_WIDTH>(weight_buf,
|
||||
weights,
|
||||
out_channels,
|
||||
in_channels,
|
||||
VTA_BLOCK_OUT,
|
||||
VTA_BLOCK_IN);
|
||||
// Prepare the bias buffer
|
||||
acc_T *bias_buf = static_cast<acc_T *>(allocBuffer(VTA_ACC_ELEM_BYTES * out_size));
|
||||
packBuffer<acc_T, VTA_ACC_WIDTH>(bias_buf,
|
||||
biases,
|
||||
batch,
|
||||
out_channels,
|
||||
VTA_BATCH,
|
||||
VTA_BLOCK_OUT);
|
||||
// Prepare the output buffer
|
||||
out_T *output_buf = static_cast<out_T *>(allocBuffer(VTA_INP_ELEM_BYTES * out_size));
|
||||
|
||||
#ifdef NO_SIM
|
||||
// Invoke the VTA
|
||||
uint64_t t_fpga = vta(ins_size,
|
||||
insn_buf,
|
||||
uop_buf,
|
||||
input_buf,
|
||||
weight_buf,
|
||||
bias_buf,
|
||||
output_buf);
|
||||
// Report on timining
|
||||
printf("INFO - Synchronization time: %.3lfms\n", static_cast<float>(t_fpga) / 1E6);
|
||||
printf("INFO - Throughput: %.3lfGOPs/s\n",
|
||||
static_cast<float>(batch) * in_channels * out_channels * 2 / t_fpga);
|
||||
#else
|
||||
// Invoke the VTA
|
||||
vta(ins_size,
|
||||
(volatile insn_T *) insn_buf,
|
||||
(volatile uop_T *) uop_buf,
|
||||
(volatile inp_vec_T *) input_buf,
|
||||
(volatile wgt_vec_T *) weight_buf,
|
||||
(volatile acc_vec_T *) bias_buf,
|
||||
(volatile out_vec_T *) output_buf);
|
||||
#endif
|
||||
|
||||
// Unpack output data
|
||||
out_T **outputs = alloc2dArray<out_T>(batch, out_channels);
|
||||
unpackBuffer<out_T, VTA_OUT_WIDTH>(outputs,
|
||||
output_buf,
|
||||
batch,
|
||||
out_channels,
|
||||
VTA_BATCH,
|
||||
VTA_BLOCK_OUT);
|
||||
|
||||
// Correctness checks
|
||||
int err = 0;
|
||||
for (int i = 0; i < batch; i++) {
|
||||
for (int j = 0; j < out_channels; j++) {
|
||||
if (outputs_ref[i][j] != outputs[i][j]) {
|
||||
err++;
|
||||
#if VTA_DEBUG == 1
|
||||
printf("DEBUG - %d, %d: expected 0x%x but got 0x%x\n", i, j,
|
||||
static_cast<int>(outputs_ref[i][j]),
|
||||
static_cast<int>(outputs[i][j]));
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Free all allocated arrays
|
||||
free2dArray<inp_T>(inputs, batch, in_channels);
|
||||
free2dArray<wgt_T>(weights, out_channels, in_channels);
|
||||
free2dArray<acc_T>(biases, batch, out_channels);
|
||||
free2dArray<out_T>(outputs_ref, batch, out_channels);
|
||||
free2dArray<out_T>(outputs, batch, out_channels);
|
||||
freeBuffer(insn_buf);
|
||||
freeBuffer(uop_buf);
|
||||
freeBuffer(input_buf);
|
||||
freeBuffer(weight_buf);
|
||||
freeBuffer(bias_buf);
|
||||
freeBuffer(output_buf);
|
||||
|
||||
if (err == 0) {
|
||||
printf("INFO - Blocked GEMM test successful!\n");
|
||||
return 0;
|
||||
} else {
|
||||
printf("INFO - Blocked GEMM test failed, got %d errors!\n", err);
|
||||
return -1;
|
||||
}
|
||||
}
|
|
@ -1,6 +1,6 @@
|
|||
/*!
|
||||
* Copyright (c) 2018 by Contributors
|
||||
* \file vta_test_lib.cpp
|
||||
* \file test_lib.h
|
||||
* \brief Test library for the VTA design simulation and driver tests.
|
||||
*/
|
||||
|
||||
|
@ -17,9 +17,9 @@
|
|||
|
||||
#include <vta/driver.h>
|
||||
|
||||
#ifdef VTA_PYNQ_TARGET
|
||||
#ifdef VTA_TARGET_PYNQ
|
||||
#include "../../../src/pynq/pynq_driver.h"
|
||||
#endif // VTA_PYNQ_TARGET
|
||||
#endif // VTA_TARGET_PYNQ
|
||||
|
||||
typedef uint64_t axi_T;
|
||||
typedef uint32_t uop_T;
|
||||
|
@ -300,4 +300,14 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
|
|||
int blocked_gemm_test(int batch, int channels, int block, bool uop_compression,
|
||||
int virtual_threads);
|
||||
|
||||
/*!
|
||||
* \brief VTA GEMM unit test.
|
||||
* \param batch Batch size.
|
||||
* \param in_channels Input channels.
|
||||
* \param out_channels Output channels.
|
||||
* \param uop_compression Apply micro-op compression.
|
||||
* \return Number of errors from the test run.
|
||||
*/
|
||||
int gemm_test(int batch, int in_channels, int out_channels, bool uop_compression);
|
||||
|
||||
#endif // TESTS_HARDWARE_COMMON_TEST_LIB_H_
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
CC ?= g++
|
||||
CFLAGS = -Wall -O3 -std=c++11 -I/usr/include
|
||||
LDFLAGS = -L/usr/lib -L/opt/python3.6/lib/python3.6/site-packages/pynq/lib/
|
||||
LIBS = -l:libsds_lib.so -l:libdma.so
|
||||
LIBS = -l:libsds_lib.so -l:libdma.so -lstdc++
|
||||
INCLUDE_DIR = ../../../include
|
||||
DRIVER_DIR = ../../../src/pynq
|
||||
TESTLIB_DIR = ../common
|
||||
|
@ -10,19 +10,14 @@ SOURCES = pynq_driver.cc test_lib.cc
|
|||
OBJECTS = pynq_driver.o test_lib.o metal_test.o
|
||||
EXECUTABLE = vta
|
||||
|
||||
# Include top-level config file
|
||||
ifndef config
|
||||
ifneq ("$(wildcard ../../../config.mk)", "")
|
||||
config = ../../../config.mk
|
||||
else
|
||||
config = ../../../make/config.mk
|
||||
endif
|
||||
endif
|
||||
include $(config)
|
||||
# Include VTA config
|
||||
VTA_CONFIG = python ../../../make/vta_config.py
|
||||
CFLAGS += `${VTA_CONFIG} --cflags`
|
||||
LDFLAGS += `${VTA_CONFIG} --ldflags`
|
||||
VTA_TARGET := $(shell ${VTA_CONFIG} --target)
|
||||
|
||||
# Define flags
|
||||
CFLAGS += -I $(INCLUDE_DIR) -DNO_SIM -DDEBUG=0
|
||||
CFLAGS += $(ADD_CFLAGS)
|
||||
CFLAGS += -I $(INCLUDE_DIR) -DNO_SIM -DVTA_DEBUG=0
|
||||
|
||||
# All Target
|
||||
all: $(EXECUTABLE)
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*!
|
||||
* Copyright (c) 2018 by Contributors
|
||||
* \file driver_test.cpp
|
||||
* \file metal_test.cpp
|
||||
* \brief Bare-metal test to test driver and VTA design.
|
||||
*/
|
||||
|
||||
|
@ -13,104 +13,6 @@
|
|||
#include "../../../src/pynq/pynq_driver.h"
|
||||
#include "../common/test_lib.h"
|
||||
|
||||
// VTA invocation (present the same abstraction as in the simulation tests)
|
||||
uint64_t vta(
|
||||
uint32_t insn_count,
|
||||
VTAGenericInsn *insns,
|
||||
VTAUop *uops,
|
||||
inp_T *inputs,
|
||||
wgt_T *weights,
|
||||
acc_T *biases,
|
||||
inp_T *outputs) {
|
||||
// Performance counter variables
|
||||
uint64_t t_fpga;
|
||||
struct timespec start, stop;
|
||||
|
||||
// Derive bitstream file
|
||||
char bitstream[128];
|
||||
char str_batch_size[4];
|
||||
char str_block_out_size[4];
|
||||
char str_block_in_size[4];
|
||||
char str_block_bit_width[4];
|
||||
snprintf(str_batch_size, sizeof(str_batch_size), "%d", VTA_BATCH);
|
||||
snprintf(str_block_out_size, sizeof(str_block_out_size), "%d", VTA_BLOCK_OUT);
|
||||
snprintf(str_block_in_size, sizeof(str_block_in_size), "%d", VTA_BLOCK_IN);
|
||||
snprintf(str_block_bit_width, sizeof(str_block_bit_width), "%d", VTA_WGT_WIDTH);
|
||||
snprintf(bitstream, sizeof(bitstream), "%s", "vta.bit");
|
||||
|
||||
#if VTA_DEBUG == 1
|
||||
printf("INFO - Programming FPGA: %s!\n", bitstream);
|
||||
#endif
|
||||
|
||||
// Program VTA
|
||||
VTAProgram(bitstream);
|
||||
// Get VTA handles
|
||||
VTAHandle vta_fetch_handle = VTAMapRegister(VTA_FETCH_ADDR, VTA_RANGE);
|
||||
VTAHandle vta_load_handle = VTAMapRegister(VTA_LOAD_ADDR, VTA_RANGE);
|
||||
VTAHandle vta_compute_handle = VTAMapRegister(VTA_COMPUTE_ADDR, VTA_RANGE);
|
||||
VTAHandle vta_store_handle = VTAMapRegister(VTA_STORE_ADDR, VTA_RANGE);
|
||||
|
||||
// Physical address pointers
|
||||
uint32_t insn_phy = insns ? cma_get_phy_addr(insns) : 0;
|
||||
uint32_t uop_phy = uops ? cma_get_phy_addr(uops) : 0;
|
||||
uint32_t input_phy = inputs ? cma_get_phy_addr(inputs) : 0;
|
||||
uint32_t weight_phy = weights ? cma_get_phy_addr(weights) : 0;
|
||||
uint32_t bias_phy = biases ? cma_get_phy_addr(biases) : 0;
|
||||
uint32_t output_phy = outputs ? cma_get_phy_addr(outputs) : 0;
|
||||
|
||||
#if VTA_DEBUG == 1
|
||||
printf("INFO - Starting FPGA!\n");
|
||||
#endif
|
||||
|
||||
clock_gettime(CLOCK_REALTIME, &start);
|
||||
|
||||
// FETCH @ 0x10 : Data signal of insn_count_V
|
||||
VTAWriteMappedReg(vta_fetch_handle, 0x10, insn_count);
|
||||
// FETCH @ 0x18 : Data signal of insns_V
|
||||
if (insns) VTAWriteMappedReg(vta_fetch_handle, 0x18, insn_phy);
|
||||
// LOAD @ 0x10 : Data signal of inputs_V
|
||||
if (inputs) VTAWriteMappedReg(vta_load_handle, 0x10, input_phy);
|
||||
// LOAD @ 0x18 : Data signal of weight_V
|
||||
if (weights) VTAWriteMappedReg(vta_load_handle, 0x18, weight_phy);
|
||||
// COMPUTE @ 0x20 : Data signal of uops_V
|
||||
if (uops) VTAWriteMappedReg(vta_compute_handle, 0x20, uop_phy);
|
||||
// COMPUTE @ 0x28 : Data signal of biases_V
|
||||
if (biases) VTAWriteMappedReg(vta_compute_handle, 0x28, bias_phy);
|
||||
// STORE @ 0x10 : Data signal of outputs_V
|
||||
if (outputs) VTAWriteMappedReg(vta_store_handle, 0x10, output_phy);
|
||||
|
||||
// VTA start
|
||||
VTAWriteMappedReg(vta_fetch_handle, 0x0, 0x1);
|
||||
VTAWriteMappedReg(vta_load_handle, 0x0, 0x81);
|
||||
VTAWriteMappedReg(vta_compute_handle, 0x0, 0x81);
|
||||
VTAWriteMappedReg(vta_store_handle, 0x0, 0x81);
|
||||
|
||||
int flag = 0, t = 0;
|
||||
for (t = 0; t < 10000000; ++t) {
|
||||
flag = VTAReadMappedReg(vta_compute_handle, 0x18);
|
||||
if (flag & VTA_DONE) break;
|
||||
}
|
||||
|
||||
if (t == 10000000) {
|
||||
printf("\tWARNING: VTA TIMEOUT!!!!\n");
|
||||
#if VTA_DEBUG == 1
|
||||
} else {
|
||||
printf("INFO - FPGA Finished!\n");
|
||||
#endif
|
||||
}
|
||||
|
||||
clock_gettime(CLOCK_REALTIME, &stop);
|
||||
t_fpga = 1000000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec);
|
||||
|
||||
// Unmap VTA register
|
||||
VTAUnmapRegister(vta_fetch_handle, VTA_RANGE);
|
||||
VTAUnmapRegister(vta_load_handle, VTA_RANGE);
|
||||
VTAUnmapRegister(vta_compute_handle, VTA_RANGE);
|
||||
VTAUnmapRegister(vta_store_handle, VTA_RANGE);
|
||||
|
||||
return t_fpga;
|
||||
}
|
||||
|
||||
int main(void) {
|
||||
#if VTA_DEBUG == 1
|
||||
printParameters();
|
||||
|
|
Загрузка…
Ссылка в новой задаче