[HARDWARE, TEST] Fixed hardware generation flow (#34)

This commit is contained in:
Thierry Moreau 2018-05-02 18:13:33 -07:00 коммит произвёл Tianqi Chen
Родитель 9f0e8ffe12
Коммит dae77cdb4c
14 изменённых файлов: 550 добавлений и 198 удалений

Просмотреть файл

@ -1,4 +1,4 @@
#!/bin/bash
export PYTHONPATH=${PYTHONPATH}:/home/xilinx/tvm/python:/home/xilinx/vta/python
export PYTHONPATH=${PYTHONPATH}:/home/xilinx/vta/nnvm/tvm/python:/home/xilinx/vta/python
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/opt/python3.6/lib/python3.6/site-packages/pynq/drivers/
python -m vta.exec.rpc_server

Просмотреть файл

@ -13,8 +13,10 @@ VIVADO_HLS = vivado_hls
VIVADO = vivado
HSI = hsi
# HLS Mode
MODE = all
# HLS mode
MODE = skip_sim
# Debug flag
DEBUG = false
# SLURM
SLURM = false
# Prevent generation of DSP
@ -22,15 +24,26 @@ NO_DSP = false
# Prevent generation of ALU
NO_ALU = false
# Include top-level config file
ifndef config
ifneq ("$(wildcard ../../config.mk)", "")
config = ../../config.mk
else
config = ../../make/config.mk
endif
endif
include $(config)
# Process VTA JSON config
VTA_CONFIG = python $(CURDIR)/../../make/vta_config.py
CFLAGS := $(shell ${VTA_CONFIG} --cflags)
VTA_TARGET := $(shell ${VTA_CONFIG} --target)
#---------------------
# VTA Parameters
#--------------------
VTA_INP_WIDTH := $(shell ${VTA_CONFIG} --get-inpwidth)
VTA_WGT_WIDTH := $(shell ${VTA_CONFIG} --get-wgtwidth)
VTA_ACC_WIDTH := $(shell ${VTA_CONFIG} --get-accwidth)
VTA_OUT_WIDTH := $(shell ${VTA_CONFIG} --get-outwidth)
VTA_BATCH := $(shell ${VTA_CONFIG} --get-batch)
VTA_IN_BLOCK := $(shell ${VTA_CONFIG} --get-blockin)
VTA_OUT_BLOCK := $(shell ${VTA_CONFIG} --get-blockout)
VTA_UOP_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-uopbuffsize)
VTA_INP_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-inpbuffsize)
VTA_WGT_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-wgtbuffsize)
VTA_ACC_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-accbuffsize)
VTA_OUT_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-outbuffsize)
#---------------------
# Compilation parameters
@ -50,8 +63,8 @@ TARGET_PER = \
$(shell echo "$$(( (1000 + $(VTA_HW_COMP_CLOCK_FREQ) - 1) / $(VTA_HW_COMP_CLOCK_FREQ) - $(VTA_HW_COMP_TIMING_COMP)))" )
# Derive config name
CONF = \
$(VTA_BATCH)x$(VTA_IN_BLOCK)x$(VTA_OUT_BLOCK)_$(VTA_INP_WIDTH)bx$(VTA_WGT_WIDTH)b_$(VTA_LOG_UOP_BUFF_SIZE)_$(VTA_LOG_INP_BUFF_SIZE)_$(VTA_LOG_WGT_BUFF_SIZE)_$(VTA_LOG_ACC_BUFF_SIZE)_$(VTA_HW_COMP_CLOCK_FREQ)MHz_$(TARGET_PER)ns
CONF_ROOT = $(shell ${VTA_CONFIG} --cfg-str)
CONF = $(CONF_ROOT)_$(VTA_HW_COMP_CLOCK_FREQ)MHz_$(TARGET_PER)ns
IP_BUILD_PATH = $(BUILD_DIR)/hls/$(CONF)
HW_BUILD_PATH = $(BUILD_DIR)/vivado/$(CONF)
@ -60,26 +73,34 @@ ifeq ($(SLURM), true)
HW_BUILD_PATH = /scratch/vivado/$(CONF)
endif
.PHONY: all ip bit driver clean clean_all
# IP file path
IP_PATH = $(BUILD_DIR)/hls/$(CONF)/solution0/impl/ip/xilinx_com_hls_vta_1_0.zip
all: bit
# Bitstream file path
BIT_PATH = $(BUILD_DIR)/vivado/$(CONF)/export/$(CONF).bit
ip:
# Command-style targets that never name a file on disk. The cleanup target is
# spelled `cleanall`, matching the rule that actually carries the recipe.
.PHONY: all ip bit bsp clean cleanall
all: bsp
ip: $(IP_PATH)
bit: $(BIT_PATH)
$(IP_PATH): $(SRC_DIR)/*
mkdir -p $(IP_BUILD_PATH)
cd $(IP_BUILD_PATH) && \
$(VIVADO_HLS) -f $(SCRIPT_DIR)/hls.tcl \
-tclargs $(SRC_DIR) $(SIM_DIR) $(TEST_DIR) $(INCLUDE_DIR) $(TARGET_PER) \
$(VTA_LOG_INP_WIDTH) $(VTA_LOG_WGT_WIDTH) $(VTA_LOG_ACC_WIDTH) $(VTA_LOG_OUT_WIDTH) \
$(VTA_LOG_BATCH) $(VTA_LOG_BLOCK_OUT) $(VTA_LOG_BLOCK_IN) \
$(VTA_LOG_UOP_BUFF_SIZE) $(VTA_LOG_INP_BUFF_SIZE) $(VTA_LOG_WGT_BUFF_SIZE) \
$(VTA_LOG_ACC_BUFF_SIZE) $(VTA_LOG_OUT_BUFF_SIZE) \
$(MODE) $(NO_DSP) $(NO_ALU)
-tclargs $(SRC_DIR) $(SIM_DIR) $(TEST_DIR) $(INCLUDE_DIR) \
$(MODE) $(DEBUG) $(NO_DSP) $(NO_ALU) $(TARGET_PER) \
$(VTA_INP_WIDTH) $(VTA_WGT_WIDTH) $(VTA_ACC_WIDTH) $(VTA_OUT_WIDTH) \
$(VTA_BATCH) $(VTA_IN_BLOCK) $(VTA_OUT_BLOCK) \
$(VTA_UOP_BUFF_SIZE) $(VTA_INP_BUFF_SIZE) $(VTA_WGT_BUFF_SIZE) \
$(VTA_ACC_BUFF_SIZE) $(VTA_OUT_BUFF_SIZE)
ifeq ($(SLURM), true)
mkdir -p $(BUILD_DIR)/hls
mv $(IP_BUILD_PATH) $(BUILD_DIR)/hls/.
endif
bit: ip
$(BIT_PATH): $(IP_PATH)
mkdir -p $(HW_BUILD_PATH)
cd $(HW_BUILD_PATH) && \
$(VIVADO) -mode tcl -source $(SCRIPT_DIR)/vivado.tcl \
@ -92,12 +113,12 @@ ifeq ($(SLURM), true)
mv $(HW_BUILD_PATH) $(BUILD_DIR)/vivado/.
endif
driver: bit
# Board support package: run the HSI script from the hardware build directory,
# then build inside its `bsp` sub-directory (presumably created by hsi.tcl).
# The sub-build is invoked through $(MAKE) rather than a literal `make` so that
# -j, -n and the jobserver are passed down to it.
bsp: $(BIT_PATH)
	cd $(HW_BUILD_PATH) && $(HSI) -mode tcl -source $(SCRIPT_DIR)/hsi.tcl -nojournal -nolog
	cd $(HW_BUILD_PATH)/bsp && $(MAKE)
clean:
rm -rf *.out *.log *.sb figures
clean_all: clean
cleanall: clean
rm -rf $(BUILD_DIR)

Просмотреть файл

@ -9,65 +9,69 @@
# Arg 2: path to sim sources
# Arg 3: path to test sources
# Arg 4: path to include sources
# Arg 5: target clock period
# Arg 6: input type width (log)
# Arg 7: weight type width (log)
# Arg 8: accum type width (log)
# Arg 9: output type width (log)
# Arg 10: batch size (log)
# Arg 11: in block size (log)
# Arg 12: out block size (log)
# Arg 13: uop buffer size in B (log)
# Arg 14: inp buffer size in B (log)
# Arg 15: wgt buffer size in B (log)
# Arg 16: acc buffer size in B (log)
# Arg 17: out buffer size in B (log)
# Arg 18: mode
# Arg 19: no_dsp
# Arg 20: no_alu
# Arg 5: mode
# Arg 6: debug
# Arg 7: no_dsp
# Arg 8: no_alu
# Arg 9: target clock period
# Arg 10: input type width (log)
# Arg 11: weight type width (log)
# Arg 12: accum type width (log)
# Arg 13: output type width (log)
# Arg 14: batch size (log)
# Arg 15: in block size (log)
# Arg 16: out block size (log)
# Arg 17: uop buffer size in B (log)
# Arg 18: inp buffer size in B (log)
# Arg 19: wgt buffer size in B (log)
# Arg 20: acc buffer size in B (log)
# Arg 21: out buffer size in B (log)
if { [llength $argv] eq 22 } {
if { [llength $argv] eq 23 } {
set src_dir [lindex $argv 2]
set sim_dir [lindex $argv 3]
set test_dir [lindex $argv 4]
set include_dir [lindex $argv 5]
set target_period [lindex $argv 6]
set inp_width [lindex $argv 7]
set wgt_width [lindex $argv 8]
set acc_width [lindex $argv 9]
set out_width [lindex $argv 10]
set batch [lindex $argv 11]
set block_in [lindex $argv 12]
set block_out [lindex $argv 13]
set uop_buff_size [lindex $argv 14]
set inp_buff_size [lindex $argv 15]
set wgt_buff_size [lindex $argv 16]
set acc_buff_size [lindex $argv 17]
set out_buff_size [lindex $argv 18]
set mode [lindex $argv 19]
set no_dsp [lindex $argv 20]
set no_alu [lindex $argv 21]
set mode [lindex $argv 6]
set debug [lindex $argv 7]
set no_dsp [lindex $argv 8]
set no_alu [lindex $argv 9]
set target_period [lindex $argv 10]
set inp_width [lindex $argv 11]
set wgt_width [lindex $argv 12]
set acc_width [lindex $argv 13]
set out_width [lindex $argv 14]
set batch [lindex $argv 15]
set block_in [lindex $argv 16]
set block_out [lindex $argv 17]
set uop_buff_size [lindex $argv 18]
set inp_buff_size [lindex $argv 19]
set wgt_buff_size [lindex $argv 20]
set acc_buff_size [lindex $argv 21]
set out_buff_size [lindex $argv 22]
} else {
set src_dir "../src"
set sim_dir "../sim"
set test_dir "../../src/test"
set include_dir "../../include"
set mode "all"
set debug "false"
set no_dsp "true"
set no_alu "false"
set target_period 10
set inp_width 3
set wgt_width 3
set acc_width 5
set out_width 3
set batch 1
set block_out 4
set block_in 4
set block_out 4
set uop_buff_size 15
set inp_buff_size 15
set wgt_buff_size 15
set acc_buff_size 17
set out_buff_size 15
set mode "all"
set no_dsp "true"
set no_alu "false"
exit
}
# Initializes the HLS design and sets HLS pragmas for memory partitioning.
@ -124,12 +128,15 @@ proc init_design {per inp_width wgt_width out_width batch block_in block_out} {
# C define flags to pass to compiler
set cflags "-I $include_dir -I $src_dir -I $test_dir \
-DVTA_DEBUG=0 -DVTA_LOG_WGT_WIDTH=$wgt_width -DVTA_LOG_INP_WIDTH=$inp_width \
-DVTA_LOG_WGT_WIDTH=$wgt_width -DVTA_LOG_INP_WIDTH=$inp_width \
-DVTA_LOG_ACC_WIDTH=$acc_width -DVTA_LOG_OUT_WIDTH=$out_width \
-DVTA_LOG_BATCH=$batch -DVTA_LOG_BLOCK_OUT=$block_out -DVTA_LOG_BLOCK_IN=$block_in \
-DVTA_LOG_UOP_BUFF_SIZE=$uop_buff_size -DVTA_LOG_INP_BUFF_SIZE=$inp_buff_size \
-DVTA_LOG_WGT_BUFF_SIZE=$wgt_buff_size -DVTA_LOG_ACC_BUFF_SIZE=$acc_buff_size \
-DVTA_LOG_OUT_BUFF_SIZE=$out_buff_size"
if {$debug=="true"} {
append cflags " -DVTA_DEBUG=1"
}
if {$no_dsp=="true"} {
append cflags " -DNO_DSP"
}

Просмотреть файл

@ -26,15 +26,15 @@ if { [llength $argv] eq 12 } {
set ip_path [lindex $argv 0]
set num_threads [lindex $argv 1]
set clock_freq [lindex $argv 2]
set inp_width [lindex $argv 3]
set wgt_width [lindex $argv 4]
set out_width [lindex $argv 5]
set batch [lindex $argv 6]
set out_block [lindex $argv 7]
set in_block [lindex $argv 8]
set inp_mem_size [lindex $argv 9]
set wgt_mem_size [lindex $argv 10]
set out_mem_size [lindex $argv 11]
set inp_width [expr 1 << [lindex $argv 3]]
set wgt_width [expr 1 << [lindex $argv 4]]
set out_width [expr 1 << [lindex $argv 5]]
set batch [expr 1 << [lindex $argv 6]]
set out_block [expr 1 << [lindex $argv 7]]
set in_block [expr 1 << [lindex $argv 8]]
set inp_mem_size [expr 1 << [lindex $argv 9]]
set wgt_mem_size [expr 1 << [lindex $argv 10]]
set out_mem_size [expr 1 << [lindex $argv 11]]
if {$clock_freq eq 100} {
set clock_id 0
puts "Setting clock frequency to 100MHz"

Просмотреть файл

@ -53,5 +53,8 @@ int main(void) {
status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 1);
status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 1);
// Simple GEMM unit test
status |= gemm_test(64, 64, 64, true);
return status;
}

Просмотреть файл

@ -7,8 +7,8 @@
"LOG_BATCH" : 0,
"LOG_BLOCK_IN" : 4,
"LOG_BLOCK_OUT" : 4,
"LOG_UOP_BUFF_SIZE" : 15,
"LOG_UOP_BUFF_SIZE" : 14,
"LOG_INP_BUFF_SIZE" : 15,
"LOG_WGT_BUFF_SIZE" : 15,
"LOG_WGT_BUFF_SIZE" : 18,
"LOG_ACC_BUFF_SIZE" : 17
}

Просмотреть файл

@ -7,8 +7,8 @@
"LOG_BATCH" : 0,
"LOG_BLOCK_IN" : 4,
"LOG_BLOCK_OUT" : 4,
"LOG_UOP_BUFF_SIZE" : 15,
"LOG_UOP_BUFF_SIZE" : 14,
"LOG_INP_BUFF_SIZE" : 15,
"LOG_WGT_BUFF_SIZE" : 15,
"LOG_WGT_BUFF_SIZE" : 18,
"LOG_ACC_BUFF_SIZE" : 17
}

Просмотреть файл

@ -28,6 +28,32 @@ def main():
help="print all the config json")
parser.add_argument("--target", action="store_true",
help="print the target")
parser.add_argument("--cfg-str", action="store_true",
help="print the configuration string")
parser.add_argument("--get-inpwidth", action="store_true",
help="returns log of input bitwidth")
parser.add_argument("--get-wgtwidth", action="store_true",
help="returns log of weight bitwidth")
parser.add_argument("--get-accwidth", action="store_true",
help="returns log of accum bitwidth")
parser.add_argument("--get-outwidth", action="store_true",
help="returns log of output bitwidth")
parser.add_argument("--get-batch", action="store_true",
help="returns log of tensor batch dimension")
parser.add_argument("--get-blockin", action="store_true",
help="returns log of tensor block in dimension")
parser.add_argument("--get-blockout", action="store_true",
help="returns log of tensor block out dimension")
parser.add_argument("--get-uopbuffsize", action="store_true",
help="returns log of micro-op buffer size in B")
parser.add_argument("--get-inpbuffsize", action="store_true",
help="returns log of input buffer size in B")
parser.add_argument("--get-wgtbuffsize", action="store_true",
help="returns log of weight buffer size in B")
parser.add_argument("--get-accbuffsize", action="store_true",
help="returns log of accum buffer size in B")
parser.add_argument("--get-outbuffsize", action="store_true",
help="returns log of output buffer size in B")
args = parser.parse_args()
if len(sys.argv) == 1:
@ -46,13 +72,17 @@ def main():
raise RuntimeError("Cannot find config in %s" % str(path_list))
cfg = json.load(open(ok_path_list[0]))
# The output element width is taken from the input element width.
cfg["LOG_OUT_WIDTH"] = cfg["LOG_INP_WIDTH"]
# Derive the output buffer size (log2 of bytes) from the accumulator buffer:
# same number of elements, each OUT_WIDTH instead of ACC_WIDTH bits wide, i.e.
# bytes_out = bytes_acc / acc_width * out_width. In log2 terms the accumulator
# width is subtracted and the output width added (e.g. 17 - 5 + 3 = 15).
cfg["LOG_OUT_BUFF_SIZE"] = (
    cfg["LOG_ACC_BUFF_SIZE"] - cfg["LOG_ACC_WIDTH"] + cfg["LOG_OUT_WIDTH"])
pkg = get_pkg_config(cfg)
if args.target:
print(pkg.target)
if args.cflags:
print(" ".join(pkg.cflags))
cflags_str = " ".join(pkg.cflags)
if cfg["TARGET"] == "pynq":
cflags_str += " -DVTA_TARGET_PYNQ"
print(cflags_str)
if args.ldflags:
print(" ".join(pkg.ldflags))
@ -60,6 +90,54 @@ def main():
if args.cfg_json:
print(pkg.cfg_json)
if args.cfg_str:
    # Configuration tag of the form
    #   <batch>x<block_in>x<block_out>_<inp bits>bx<wgt bits>b_<uop>_<inp>_<wgt>_<acc>
    # Tensor dimensions and bit widths are expanded from their log2 values;
    # the four buffer sizes are printed as stored (log2 of the size in bytes).
    cfg_str = "{}x{}x{}_{}bx{}b_{}_{}_{}_{}".format(
        (1 << cfg["LOG_BATCH"]),
        (1 << cfg["LOG_BLOCK_IN"]),
        (1 << cfg["LOG_BLOCK_OUT"]),
        (1 << cfg["LOG_INP_WIDTH"]),
        (1 << cfg["LOG_WGT_WIDTH"]),
        cfg["LOG_UOP_BUFF_SIZE"],
        cfg["LOG_INP_BUFF_SIZE"],
        cfg["LOG_WGT_BUFF_SIZE"],
        cfg["LOG_ACC_BUFF_SIZE"])
    # Function-call form: a bare `print cfg_str` statement is a SyntaxError
    # under Python 3 and would prevent the whole script from loading.
    print(cfg_str)
if args.get_inpwidth:
print(cfg["LOG_INP_WIDTH"])
if args.get_wgtwidth:
print(cfg["LOG_WGT_WIDTH"])
if args.get_accwidth:
print(cfg["LOG_ACC_WIDTH"])
if args.get_outwidth:
print(cfg["LOG_OUT_WIDTH"])
if args.get_batch:
print(cfg["LOG_BATCH"])
if args.get_blockin:
print(cfg["LOG_BLOCK_IN"])
if args.get_blockout:
print(cfg["LOG_BLOCK_OUT"])
if args.get_uopbuffsize:
print(cfg["LOG_UOP_BUFF_SIZE"])
if args.get_inpbuffsize:
print(cfg["LOG_INP_BUFF_SIZE"])
if args.get_wgtbuffsize:
print(cfg["LOG_WGT_BUFF_SIZE"])
if args.get_outbuffsize:
print(cfg["LOG_OUT_BUFF_SIZE"])
if args.get_accbuffsize:
print(cfg["LOG_ACC_BUFF_SIZE"])
if __name__ == "__main__":
main()

Просмотреть файл

@ -130,11 +130,15 @@ class Environment(object):
self.BLOCK_IN *
self.WGT_WIDTH)
self.ACC_ELEM_BITS = (self.BATCH *
self.BLOCK_IN *
self.BLOCK_OUT *
self.ACC_WIDTH)
self.OUT_ELEM_BITS = (self.BATCH *
self.BLOCK_OUT *
self.OUT_WIDTH)
self.INP_ELEM_BYTES = self.INP_ELEM_BITS // 8
self.WGT_ELEM_BYTES = self.WGT_ELEM_BITS // 8
self.ACC_ELEM_BYTES = self.ACC_ELEM_BITS // 8
self.OUT_ELEM_BYTES = self.OUT_ELEM_BITS // 8
# dtypes
self.acc_dtype = "int%d" % self.ACC_WIDTH
self.inp_dtype = "int%d" % self.INP_WIDTH

Просмотреть файл

@ -339,7 +339,7 @@ def inject_dma_intrin(stmt_in):
base = 0
for i in range(1, ndim + 1):
if not util.equal_const_int(buf.strides[ndim - i] - x_size, 0):
raise RuntimeError("scope %s need need to have block=%d" % (scope, elem_block))
raise RuntimeError("scope %s needs to have block=%d" % (scope, elem_block))
x_size = x_size * buf.shape[ndim - i]
if util.equal_const_int(x_size - elem_block, 0):
base = i + 1
@ -469,10 +469,10 @@ def inject_dma_intrin(stmt_in):
if pad_before or pad_after:
raise RuntimeError("Do not support copy into DRAM with pad")
if src.scope == env.acc_scope:
elem_width = env.INP_WIDTH # output compression to inp type
elem_bytes = env.INP_ELEM_BYTES # output compression to inp type
elem_width = env.OUT_WIDTH
elem_bytes = env.OUT_ELEM_BYTES
mem_type = env.dev.MEM_ID_OUT
data_type = "int%d" % env.INP_WIDTH
data_type = "int%d" % env.OUT_WIDTH
task_qid = env.dev.QID_STORE_OUT
else:
raise RuntimeError("Do not support copy %s->dram" % (src.scope))

Просмотреть файл

@ -1,11 +1,114 @@
/*!
* Copyright (c) 2018 by Contributors
* \file vta_test_lib.cpp
* \file test_lib.cpp
* \brief Test library for the VTA design simulation and driver tests.
*/
#include "./test_lib.h"
#ifdef NO_SIM
#ifdef VTA_TARGET_PYNQ
/*!
 * \brief Programs the FPGA with "vta.bit", starts the VTA modules on the given
 *  buffers and polls the compute module until it reports completion.
 * \param insn_count Number of instructions in the instruction buffer.
 * \param insns Instruction buffer (may be NULL; its address is then not written).
 * \param uops Micro-op buffer (may be NULL).
 * \param inputs Input buffer (may be NULL).
 * \param weights Weight buffer (may be NULL).
 * \param biases Bias/accumulator buffer (may be NULL).
 * \param outputs Output buffer (may be NULL).
 * \return Elapsed time in nanoseconds between writing the first control
 *  register and the end of the completion poll (or its timeout).
 */
uint64_t vta(
    uint32_t insn_count,
    VTAGenericInsn *insns,
    VTAUop *uops,
    inp_T *inputs,
    wgt_T *weights,
    acc_T *biases,
    inp_T *outputs) {
  // Upper bound on completion polls before the run is reported as timed out
  const int kMaxPolls = 10000000;

  // Performance counter variables
  uint64_t t_fpga;
  struct timespec start, stop;

  // Bitstream file (fixed name; no per-configuration suffix is derived)
  char bitstream[128];
  snprintf(bitstream, sizeof(bitstream), "%s", "vta.bit");

#if VTA_DEBUG == 1
  printf("INFO - Programming FPGA: %s!\n", bitstream);
#endif

  // Program VTA
  VTAProgram(bitstream);

  // Get VTA handles
  void* vta_fetch_handle = VTAMapRegister(VTA_FETCH_ADDR, VTA_RANGE);
  void* vta_load_handle = VTAMapRegister(VTA_LOAD_ADDR, VTA_RANGE);
  void* vta_compute_handle = VTAMapRegister(VTA_COMPUTE_ADDR, VTA_RANGE);
  void* vta_store_handle = VTAMapRegister(VTA_STORE_ADDR, VTA_RANGE);

  // Physical address pointers (0 for buffers that were not supplied)
  uint32_t insn_phy = insns ? cma_get_phy_addr(insns) : 0;
  uint32_t uop_phy = uops ? cma_get_phy_addr(uops) : 0;
  uint32_t input_phy = inputs ? cma_get_phy_addr(inputs) : 0;
  uint32_t weight_phy = weights ? cma_get_phy_addr(weights) : 0;
  uint32_t bias_phy = biases ? cma_get_phy_addr(biases) : 0;
  uint32_t output_phy = outputs ? cma_get_phy_addr(outputs) : 0;

#if VTA_DEBUG == 1
  printf("INFO - Starting FPGA!\n");
#endif

  // A monotonic clock is used for the interval so that wall-clock adjustments
  // during the run cannot distort (or make negative) the measured time.
  clock_gettime(CLOCK_MONOTONIC, &start);

  // FETCH @ 0x10 : Data signal of insn_count_V
  VTAWriteMappedReg(vta_fetch_handle, 0x10, insn_count);
  // FETCH @ 0x18 : Data signal of insns_V
  if (insns) VTAWriteMappedReg(vta_fetch_handle, 0x18, insn_phy);
  // LOAD @ 0x10 : Data signal of inputs_V
  if (inputs) VTAWriteMappedReg(vta_load_handle, 0x10, input_phy);
  // LOAD @ 0x18 : Data signal of weight_V
  if (weights) VTAWriteMappedReg(vta_load_handle, 0x18, weight_phy);
  // COMPUTE @ 0x20 : Data signal of uops_V
  if (uops) VTAWriteMappedReg(vta_compute_handle, 0x20, uop_phy);
  // COMPUTE @ 0x28 : Data signal of biases_V
  if (biases) VTAWriteMappedReg(vta_compute_handle, 0x28, bias_phy);
  // STORE @ 0x10 : Data signal of outputs_V
  if (outputs) VTAWriteMappedReg(vta_store_handle, 0x10, output_phy);

  // VTA start
  VTAWriteMappedReg(vta_fetch_handle, 0x0, 0x1);
  VTAWriteMappedReg(vta_load_handle, 0x0, 0x81);
  VTAWriteMappedReg(vta_compute_handle, 0x0, 0x81);
  VTAWriteMappedReg(vta_store_handle, 0x0, 0x81);

  // Poll the compute module's register at 0x18 until VTA_DONE is set
  int t = 0;
  for (t = 0; t < kMaxPolls; ++t) {
    int flag = VTAReadMappedReg(vta_compute_handle, 0x18);
    if (flag & VTA_DONE) break;
  }

  // Take the stop timestamp before any console output so that printing the
  // status message is not counted as FPGA time.
  clock_gettime(CLOCK_MONOTONIC, &stop);

  const bool timed_out = (t == kMaxPolls);
  if (timed_out) {
    printf("\tWARNING: VTA TIMEOUT!!!!\n");
  }
#if VTA_DEBUG == 1
  if (!timed_out) {
    printf("INFO - FPGA Finished!\n");
  }
#endif

  t_fpga = 1000000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec);

  // Unmap VTA register
  VTAUnmapRegister(vta_fetch_handle, VTA_RANGE);
  VTAUnmapRegister(vta_load_handle, VTA_RANGE);
  VTAUnmapRegister(vta_compute_handle, VTA_RANGE);
  VTAUnmapRegister(vta_store_handle, VTA_RANGE);

  return t_fpga;
}
#endif // VTA_TARGET_PYNQ
#endif // NO_SIM
uint32_t globalSeed;
const char* getOpcodeString(int opcode, bool use_imm) {
@ -1122,3 +1225,232 @@ int blocked_gemm_test(int batch, int channels, int block, bool uop_compression,
return -1;
}
}
/*!
 * \brief Simple (non-blocked) GEMM test: loads uops, biases, weights and inputs
 *  in one shot each, runs a single GEMM instruction, stores the result and
 *  compares it against a software reference.
 * \param batch Batch size; must be a multiple of VTA_BATCH.
 * \param in_channels Input channels; must be a multiple of VTA_BLOCK_IN.
 * \param out_channels Output channels; must be a multiple of VTA_BLOCK_OUT.
 * \param uop_compression Apply micro-op compression.
 * \return 0 if every output matches the reference, -1 otherwise.
 */
int gemm_test(int batch, int in_channels, int out_channels, bool uop_compression) {
  // The problem dimensions must tile evenly onto the hardware tensor shape
  assert(batch % VTA_BATCH == 0);
  assert(in_channels % VTA_BLOCK_IN == 0);
  assert(out_channels % VTA_BLOCK_OUT == 0);

  printf("=====================================================================================\n");
  printf("INFO - GEMM test: batch=%d, in_channels=%d, out_channels=%d, uop_comp=%d\n",
         batch, in_channels, out_channels, uop_compression);

  // Derive number of elements that need to be loaded/stored
  int ins_size = 7;
  int uop_size = uop_compression ?
      batch / VTA_BATCH :
      batch / VTA_BATCH * in_channels / VTA_BLOCK_IN * out_channels / VTA_BLOCK_OUT;
  int inp_size = batch / VTA_BATCH * in_channels / VTA_BLOCK_IN;
  int wgt_size = in_channels / VTA_BLOCK_IN * out_channels / VTA_BLOCK_OUT;
  int out_size = batch / VTA_BATCH * out_channels / VTA_BLOCK_OUT;

  // Make sure we don't exceed buffer bounds
  assert(uop_size <= VTA_UOP_BUFF_DEPTH);
  assert(inp_size <= VTA_INP_BUFF_DEPTH);
  assert(wgt_size <= VTA_WGT_BUFF_DEPTH);
  assert(out_size <= VTA_ACC_BUFF_DEPTH);

  // Initialize instruction buffer
  VTAGenericInsn *insn_buf =
      static_cast<VTAGenericInsn *>(allocBuffer(sizeof(VTAGenericInsn) * ins_size));
  int insn_idx = 0;

  // Load uops
  insn_buf[insn_idx++] = get1DLoadStoreInsn(
      VTA_OPCODE_LOAD,   // opcode
      VTA_MEM_ID_UOP,    // type
      0,                 // sram offset
      0,                 // dram offset
      uop_size,          // size
      0,                 // pop prev dep
      0,                 // pop next dep
      0,                 // push prev dep
      0);                // push next dep
  // Load bias
  insn_buf[insn_idx++] = get1DLoadStoreInsn(
      VTA_OPCODE_LOAD,   // opcode
      VTA_MEM_ID_ACC,    // type
      0,                 // sram offset
      0,                 // dram offset
      out_size,          // size
      0,                 // pop prev dep
      0,                 // pop next dep
      1,                 // push prev dep
      0);                // push next dep
  // Load weight block (pop next)
  insn_buf[insn_idx++] = get1DLoadStoreInsn(
      VTA_OPCODE_LOAD,   // opcode
      VTA_MEM_ID_WGT,    // type
      0,                 // sram offset
      0,                 // dram offset
      wgt_size,          // size
      0,                 // pop prev dep
      1,                 // pop next dep
      0,                 // push prev dep
      0);                // push next dep
  // Load input block (push next)
  insn_buf[insn_idx++] = get1DLoadStoreInsn(
      VTA_OPCODE_LOAD,   // opcode
      VTA_MEM_ID_INP,    // type
      0,                 // sram offset
      0,                 // dram offset
      inp_size,          // size
      0,                 // pop prev dep
      0,                 // pop next dep
      0,                 // push prev dep
      1);                // push next dep
  // Perform GEMM (pop prev, push next)
  insn_buf[insn_idx++] = getGEMMInsn(
      0,                             // uop offset
      batch / VTA_BATCH,             // batch
      in_channels / VTA_BLOCK_IN,    // in_channels
      out_channels / VTA_BLOCK_OUT,  // out_channels
      uop_compression,               // uop_compression
      1,                             // pop_prev_dep
      0,                             // pop_next_dep
      0,                             // push prev dep
      1);                            // push_next_dep
  // Store output block (pop prev, push prev)
  insn_buf[insn_idx++] = get1DLoadStoreInsn(
      VTA_OPCODE_STORE,  // opcode
      VTA_MEM_ID_OUT,    // type
      0,                 // sram offset
      0,                 // dram offset
      out_size,          // size
      1,                 // pop prev dep
      0,                 // pop next dep
      1,                 // push prev dep
      0);                // push next dep
  // Finish
  insn_buf[insn_idx++] = getFinishInsn(0, 1);

  // Every slot of the instruction buffer must have been filled, no more, no less
  assert(insn_idx == ins_size);

  // Prepare the uop buffer
  VTAUop * uop_buf = getGEMMUops(
      batch / VTA_BATCH,
      in_channels / VTA_BLOCK_IN,
      out_channels / VTA_BLOCK_OUT,
      uop_compression,
      0);

#if VTA_DEBUG == 1
  printInstruction(ins_size, insn_buf);
  printMicroOp(uop_size, uop_buf);
#endif

  // Initialize inputs
  inp_T **inputs = allocInit2dArray<inp_T, VTA_INP_WIDTH>(batch, in_channels);
  // Initialize weights
  wgt_T **weights = allocInit2dArray<wgt_T, VTA_WGT_WIDTH>(out_channels, in_channels);
  // Initialize biases
  acc_T **biases = allocInit2dArray<acc_T, VTA_ACC_WIDTH>(batch, out_channels);

  // Reference GEMM implementation
  out_T **outputs_ref = alloc2dArray<out_T>(batch, out_channels);
  for (int i = 0; i < batch; i++) {
    for (int j = 0; j < out_channels; j++) {
      acc_T sum = biases[i][j];
      for (int k = 0; k < in_channels; k++) {
        sum += (acc_T) (inputs[i][k] * weights[j][k]);
      }
      // Set
      outputs_ref[i][j] = (out_T) sum;
    }
  }

  // Prepare the input buffer
  inp_T *input_buf = static_cast<inp_T *>(allocBuffer(VTA_INP_ELEM_BYTES * inp_size));
  packBuffer<inp_T, VTA_INP_WIDTH>(input_buf,
                                   inputs,
                                   batch,
                                   in_channels,
                                   VTA_BATCH,
                                   VTA_BLOCK_IN);
  // Prepare the weight buffer
  wgt_T *weight_buf = static_cast<wgt_T *>(allocBuffer(VTA_WGT_ELEM_BYTES * wgt_size));
  packBuffer<wgt_T, VTA_WGT_WIDTH>(weight_buf,
                                   weights,
                                   out_channels,
                                   in_channels,
                                   VTA_BLOCK_OUT,
                                   VTA_BLOCK_IN);
  // Prepare the bias buffer
  acc_T *bias_buf = static_cast<acc_T *>(allocBuffer(VTA_ACC_ELEM_BYTES * out_size));
  packBuffer<acc_T, VTA_ACC_WIDTH>(bias_buf,
                                   biases,
                                   batch,
                                   out_channels,
                                   VTA_BATCH,
                                   VTA_BLOCK_OUT);
  // Prepare the output buffer. Each of the out_size elements is a
  // VTA_BATCH x VTA_BLOCK_OUT tile of VTA_OUT_WIDTH-bit values, so the size is
  // derived from the output shape rather than from the input element size
  // (which is BLOCK_IN / INP_WIDTH based and too small when BLOCK_OUT > BLOCK_IN).
  const int out_elem_bytes = VTA_BATCH * VTA_BLOCK_OUT * VTA_OUT_WIDTH / 8;
  out_T *output_buf = static_cast<out_T *>(allocBuffer(out_elem_bytes * out_size));

#ifdef NO_SIM
  // Invoke the VTA
  uint64_t t_fpga = vta(ins_size,
                        insn_buf,
                        uop_buf,
                        input_buf,
                        weight_buf,
                        bias_buf,
                        output_buf);
  // Report on timing (t_fpga is in nanoseconds, so ops/ns equals GOPs/s)
  printf("INFO - Synchronization time: %.3lfms\n", static_cast<float>(t_fpga) / 1E6);
  printf("INFO - Throughput: %.3lfGOPs/s\n",
         static_cast<float>(batch) * in_channels * out_channels * 2 / t_fpga);
#else
  // Invoke the VTA
  vta(ins_size,
      (volatile insn_T *) insn_buf,
      (volatile uop_T *) uop_buf,
      (volatile inp_vec_T *) input_buf,
      (volatile wgt_vec_T *) weight_buf,
      (volatile acc_vec_T *) bias_buf,
      (volatile out_vec_T *) output_buf);
#endif

  // Unpack output data
  out_T **outputs = alloc2dArray<out_T>(batch, out_channels);
  unpackBuffer<out_T, VTA_OUT_WIDTH>(outputs,
                                     output_buf,
                                     batch,
                                     out_channels,
                                     VTA_BATCH,
                                     VTA_BLOCK_OUT);

  // Correctness checks
  int err = 0;
  for (int i = 0; i < batch; i++) {
    for (int j = 0; j < out_channels; j++) {
      if (outputs_ref[i][j] != outputs[i][j]) {
        err++;
#if VTA_DEBUG == 1
        printf("DEBUG - %d, %d: expected 0x%x but got 0x%x\n", i, j,
               static_cast<int>(outputs_ref[i][j]),
               static_cast<int>(outputs[i][j]));
#endif
      }
    }
  }

  // Free all allocated arrays
  free2dArray<inp_T>(inputs, batch, in_channels);
  free2dArray<wgt_T>(weights, out_channels, in_channels);
  free2dArray<acc_T>(biases, batch, out_channels);
  free2dArray<out_T>(outputs_ref, batch, out_channels);
  free2dArray<out_T>(outputs, batch, out_channels);
  freeBuffer(insn_buf);
  freeBuffer(uop_buf);
  freeBuffer(input_buf);
  freeBuffer(weight_buf);
  freeBuffer(bias_buf);
  freeBuffer(output_buf);

  if (err == 0) {
    printf("INFO - GEMM test successful!\n");
    return 0;
  } else {
    printf("INFO - GEMM test failed, got %d errors!\n", err);
    return -1;
  }
}

Просмотреть файл

@ -1,6 +1,6 @@
/*!
* Copyright (c) 2018 by Contributors
* \file vta_test_lib.cpp
* \file test_lib.cpp
* \brief Test library for the VTA design simulation and driver tests.
*/
@ -17,9 +17,9 @@
#include <vta/driver.h>
#ifdef VTA_PYNQ_TARGET
#ifdef VTA_TARGET_PYNQ
#include "../../../src/pynq/pynq_driver.h"
#endif // VTA_PYNQ_TARGET
#endif // VTA_TARGET_PYNQ
typedef uint64_t axi_T;
typedef uint32_t uop_T;
@ -300,4 +300,14 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
int blocked_gemm_test(int batch, int channels, int block, bool uop_compression,
int virtual_threads);
/*!
 * \brief VTA GEMM unit test (single, non-blocked GEMM checked against a
 *  software reference).
 * \param batch Batch size; must be a multiple of VTA_BATCH.
 * \param in_channels Input channels; must be a multiple of VTA_BLOCK_IN.
 * \param out_channels Output channels; must be a multiple of VTA_BLOCK_OUT.
 * \param uop_compression Apply micro-op compression.
 * \return 0 if all outputs match the reference, -1 if any mismatch was found
 *  (the mismatch count is printed, not returned).
 */
int gemm_test(int batch, int in_channels, int out_channels, bool uop_compression);
#endif // TESTS_HARDWARE_COMMON_TEST_LIB_H_

Просмотреть файл

@ -1,7 +1,7 @@
CC ?= g++
CFLAGS = -Wall -O3 -std=c++11 -I/usr/include
LDFLAGS = -L/usr/lib -L/opt/python3.6/lib/python3.6/site-packages/pynq/lib/
LIBS = -l:libsds_lib.so -l:libdma.so
LIBS = -l:libsds_lib.so -l:libdma.so -lstdc++
INCLUDE_DIR = ../../../include
DRIVER_DIR = ../../../src/pynq
TESTLIB_DIR = ../common
@ -10,19 +10,14 @@ SOURCES = pynq_driver.cc test_lib.cc
OBJECTS = pynq_driver.o test_lib.o metal_test.o
EXECUTABLE = vta
# Include top-level config file
ifndef config
ifneq ("$(wildcard ../../../config.mk)", "")
config = ../../../config.mk
else
config = ../../../make/config.mk
endif
endif
include $(config)
# Include VTA config
VTA_CONFIG = python ../../../make/vta_config.py
CFLAGS += `${VTA_CONFIG} --cflags`
LDFLAGS += `${VTA_CONFIG} --ldflags`
VTA_TARGET := $(shell ${VTA_CONFIG} --target)
# Define flags
CFLAGS += -I $(INCLUDE_DIR) -DNO_SIM -DDEBUG=0
CFLAGS += $(ADD_CFLAGS)
CFLAGS += -I $(INCLUDE_DIR) -DNO_SIM -DVTA_DEBUG=0
# All Target
all: $(EXECUTABLE)

Просмотреть файл

@ -1,6 +1,6 @@
/*!
* Copyright (c) 2018 by Contributors
* \file driver_test.cpp
* \file metal_test.cpp
* \brief Bare-metal test to test driver and VTA design.
*/
@ -13,104 +13,6 @@
#include "../../../src/pynq/pynq_driver.h"
#include "../common/test_lib.h"
// VTA invocation (present the same abstraction as in the simulation tests)
uint64_t vta(
uint32_t insn_count,
VTAGenericInsn *insns,
VTAUop *uops,
inp_T *inputs,
wgt_T *weights,
acc_T *biases,
inp_T *outputs) {
// Performance counter variables
uint64_t t_fpga;
struct timespec start, stop;
// Derive bitstream file
char bitstream[128];
char str_batch_size[4];
char str_block_out_size[4];
char str_block_in_size[4];
char str_block_bit_width[4];
snprintf(str_batch_size, sizeof(str_batch_size), "%d", VTA_BATCH);
snprintf(str_block_out_size, sizeof(str_block_out_size), "%d", VTA_BLOCK_OUT);
snprintf(str_block_in_size, sizeof(str_block_in_size), "%d", VTA_BLOCK_IN);
snprintf(str_block_bit_width, sizeof(str_block_bit_width), "%d", VTA_WGT_WIDTH);
snprintf(bitstream, sizeof(bitstream), "%s", "vta.bit");
#if VTA_DEBUG == 1
printf("INFO - Programming FPGA: %s!\n", bitstream);
#endif
// Program VTA
VTAProgram(bitstream);
// Get VTA handles
VTAHandle vta_fetch_handle = VTAMapRegister(VTA_FETCH_ADDR, VTA_RANGE);
VTAHandle vta_load_handle = VTAMapRegister(VTA_LOAD_ADDR, VTA_RANGE);
VTAHandle vta_compute_handle = VTAMapRegister(VTA_COMPUTE_ADDR, VTA_RANGE);
VTAHandle vta_store_handle = VTAMapRegister(VTA_STORE_ADDR, VTA_RANGE);
// Physical address pointers
uint32_t insn_phy = insns ? cma_get_phy_addr(insns) : 0;
uint32_t uop_phy = uops ? cma_get_phy_addr(uops) : 0;
uint32_t input_phy = inputs ? cma_get_phy_addr(inputs) : 0;
uint32_t weight_phy = weights ? cma_get_phy_addr(weights) : 0;
uint32_t bias_phy = biases ? cma_get_phy_addr(biases) : 0;
uint32_t output_phy = outputs ? cma_get_phy_addr(outputs) : 0;
#if VTA_DEBUG == 1
printf("INFO - Starting FPGA!\n");
#endif
clock_gettime(CLOCK_REALTIME, &start);
// FETCH @ 0x10 : Data signal of insn_count_V
VTAWriteMappedReg(vta_fetch_handle, 0x10, insn_count);
// FETCH @ 0x18 : Data signal of insns_V
if (insns) VTAWriteMappedReg(vta_fetch_handle, 0x18, insn_phy);
// LOAD @ 0x10 : Data signal of inputs_V
if (inputs) VTAWriteMappedReg(vta_load_handle, 0x10, input_phy);
// LOAD @ 0x18 : Data signal of weight_V
if (weights) VTAWriteMappedReg(vta_load_handle, 0x18, weight_phy);
// COMPUTE @ 0x20 : Data signal of uops_V
if (uops) VTAWriteMappedReg(vta_compute_handle, 0x20, uop_phy);
// COMPUTE @ 0x28 : Data signal of biases_V
if (biases) VTAWriteMappedReg(vta_compute_handle, 0x28, bias_phy);
// STORE @ 0x10 : Data signal of outputs_V
if (outputs) VTAWriteMappedReg(vta_store_handle, 0x10, output_phy);
// VTA start
VTAWriteMappedReg(vta_fetch_handle, 0x0, 0x1);
VTAWriteMappedReg(vta_load_handle, 0x0, 0x81);
VTAWriteMappedReg(vta_compute_handle, 0x0, 0x81);
VTAWriteMappedReg(vta_store_handle, 0x0, 0x81);
int flag = 0, t = 0;
for (t = 0; t < 10000000; ++t) {
flag = VTAReadMappedReg(vta_compute_handle, 0x18);
if (flag & VTA_DONE) break;
}
if (t == 10000000) {
printf("\tWARNING: VTA TIMEOUT!!!!\n");
#if VTA_DEBUG == 1
} else {
printf("INFO - FPGA Finished!\n");
#endif
}
clock_gettime(CLOCK_REALTIME, &stop);
t_fpga = 1000000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec);
// Unmap VTA register
VTAUnmapRegister(vta_fetch_handle, VTA_RANGE);
VTAUnmapRegister(vta_load_handle, VTA_RANGE);
VTAUnmapRegister(vta_compute_handle, VTA_RANGE);
VTAUnmapRegister(vta_store_handle, VTA_RANGE);
return t_fpga;
}
int main(void) {
#if VTA_DEBUG == 1
printParameters();