[REFACTOR] Code base refactoring (#5)
This commit is contained in:
Родитель
0979e9aaf5
Коммит
28a10b6904
12
vta/Makefile
12
vta/Makefile
|
@ -54,9 +54,13 @@ endif
|
|||
|
||||
all: lib/libvta.$(SHARED_LIBRARY_SUFFIX)
|
||||
|
||||
SRC = $(wildcard src/*.cc src/*.cc)
|
||||
ALL_OBJ = $(patsubst %.cc, build/%.o, $(SRC))
|
||||
ALL_DEP = $(ALL_OBJ)
|
||||
VTA_LIB_SRC = $(wildcard src/*.cc src/tvm/*.cc)
|
||||
ifeq ($(TARGET), PYNQ_TARGET)
|
||||
VTA_LIB_SRC += $(wildcard src/pynq/*.cc)
|
||||
LDFLAGS += -L/usr/lib -lsds_lib
|
||||
LDFLAGS += -L/opt/python3.6/lib/python3.6/site-packages/pynq/drivers/ -l:libdma.so
|
||||
endif
|
||||
VTA_LIB_OBJ = $(patsubst %.cc, build/%.o, $(VTA_LIB_SRC))
|
||||
|
||||
test: $(TEST)
|
||||
|
||||
|
@ -65,7 +69,7 @@ build/src/%.o: src/%.cc
|
|||
$(CXX) $(CFLAGS) -MM -MT build/src/$*.o $< >build/src/$*.d
|
||||
$(CXX) -c $(CFLAGS) -c $< -o $@
|
||||
|
||||
lib/libvta.$(SHARED_LIBRARY_SUFFIX): $(ALL_DEP)
|
||||
lib/libvta.$(SHARED_LIBRARY_SUFFIX): $(VTA_LIB_OBJ)
|
||||
@mkdir -p $(@D)
|
||||
$(CXX) $(CFLAGS) -shared -o $@ $(filter %.o, $^) $(LDFLAGS)
|
||||
|
||||
|
|
|
@ -0,0 +1,4 @@
|
|||
#!/bin/bash
|
||||
export PYTHONPATH=${PYTHONPATH}:/home/xilinx/tvm/python
|
||||
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/opt/python3.6/lib/python3.6/site-packages/pynq/drivers/
|
||||
python -m tvm.exec.rpc_server --load-library /home/xilinx/vta/lib/libvta.so
|
|
@ -2,9 +2,9 @@
|
|||
ROOTDIR = $(CURDIR)
|
||||
BUILD_DIR = $(ROOTDIR)/build
|
||||
SCRIPT_DIR = $(ROOTDIR)/scripts
|
||||
SRC_DIR = $(ROOTDIR)/../../src/hardware/hls
|
||||
SRC_DIR = $(ROOTDIR)/src
|
||||
SIM_DIR = $(ROOTDIR)/sim
|
||||
TEST_DIR = $(ROOTDIR)/../../src/test
|
||||
TEST_DIR = $(ROOTDIR)/../../tests/hardware/common
|
||||
INCLUDE_DIR = $(ROOTDIR)/../../include
|
||||
|
||||
# Executables
|
||||
|
@ -12,59 +12,28 @@ VIVADO_HLS = vivado_hls
|
|||
VIVADO = vivado
|
||||
HSI = hsi
|
||||
|
||||
# Build parameters:
|
||||
# Include top-level config file
|
||||
ifndef config
|
||||
ifneq ("$(wildcard ../../config.mk)", "")
|
||||
config = ../../config.mk
|
||||
else
|
||||
config = ../../make/config.mk
|
||||
endif
|
||||
endif
|
||||
include $(config)
|
||||
|
||||
#---------------------
|
||||
# Compilation parameters
|
||||
#--------------------
|
||||
|
||||
# Number of threads during compilation
|
||||
NUM_THREADS = 8
|
||||
|
||||
# Target Frequency
|
||||
CLOCK_FREQ = 100
|
||||
# Log of input width in bits
|
||||
LOG_INP_WIDTH = 3
|
||||
# Log of weight width in bits
|
||||
LOG_WGT_WIDTH = 3
|
||||
# Log of accum width in bits
|
||||
LOG_ACC_WIDTH = 5
|
||||
# Log of output width in bits
|
||||
LOG_OUT_WIDTH = $(LOG_INP_WIDTH)
|
||||
# Log of tensor batch size (A in (A,B)x(B,C) matrix multiplication)
|
||||
LOG_BATCH = 0
|
||||
# Log of tensor inner block size (B in (A,B)x(B,C) matrix multiplication)
|
||||
LOG_IN_BLOCK = 4
|
||||
# Log of tensor outer block size (C in (A,B)x(B,C) matrix multiplication)
|
||||
LOG_OUT_BLOCK = 4
|
||||
# Log of uop buffer size in Bytes
|
||||
LOG_UOP_BUFF_SIZE = 15
|
||||
# Log of inp buffer size in Bytes
|
||||
LOG_INP_BUFF_SIZE = 15
|
||||
# Log of wgt buffer size in Bytes
|
||||
LOG_WGT_BUFF_SIZE = 15
|
||||
# Log of acc buffer size in Bytes
|
||||
LOG_ACC_BUFF_SIZE = 17
|
||||
# Log of out buffer size in Bytes
|
||||
LOG_OUT_BUFF_SIZE = $(shell echo "$$(( $(LOG_ACC_BUFF_SIZE)+$(LOG_OUT_WIDTH)-$(LOG_ACC_WIDTH) ))" )
|
||||
|
||||
# Derived parameter
|
||||
# Input width in bits
|
||||
INP_WIDTH = $(shell echo "$$(( 1 << $(LOG_INP_WIDTH) ))" )
|
||||
# Weight width in bits
|
||||
WGT_WIDTH = $(shell echo "$$(( 1 << $(LOG_WGT_WIDTH) ))" )
|
||||
# Output width in bits
|
||||
OUT_WIDTH = $(shell echo "$$(( 1 << $(LOG_OUT_WIDTH) ))" )
|
||||
# Tensor batch size
|
||||
BATCH = $(shell echo "$$(( 1 << $(LOG_BATCH) ))" )
|
||||
# Tensor inner block size
|
||||
IN_BLOCK = $(shell echo "$$(( 1 << $(LOG_IN_BLOCK) ))" )
|
||||
# Tensor outer block size
|
||||
OUT_BLOCK = $(shell echo "$$(( 1 << $(LOG_OUT_BLOCK) ))" )
|
||||
# Uop buffer size in Bytes
|
||||
UOP_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_UOP_BUFF_SIZE) ))" )
|
||||
# Inp buffer size in Bytes
|
||||
INP_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_INP_BUFF_SIZE) ))" )
|
||||
# Wgt buffer size in Bytes
|
||||
WGT_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_WGT_BUFF_SIZE) ))" )
|
||||
# Acc buffer size in Bytes
|
||||
ACC_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_ACC_BUFF_SIZE) ))" )
|
||||
# Out buffer size in Bytes
|
||||
OUT_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_OUT_BUFF_SIZE) ))" )
|
||||
# Timing closure compensation (0 for none, 3 for highest)
|
||||
TIMING_CLOSURE_COMP = 0
|
||||
|
||||
# Derive clock target period
|
||||
TARGET_PER = $(shell echo "$$(( (1000 + $(CLOCK_FREQ) - 1) / $(CLOCK_FREQ) - 0))" )
|
||||
|
@ -85,7 +54,7 @@ ip:
|
|||
$(VIVADO_HLS) -f $(SCRIPT_DIR)/hls.tcl \
|
||||
-tclargs $(SRC_DIR) $(SIM_DIR) $(TEST_DIR) $(INCLUDE_DIR) $(TARGET_PER) \
|
||||
$(LOG_INP_WIDTH) $(LOG_WGT_WIDTH) $(LOG_ACC_WIDTH) $(LOG_OUT_WIDTH) \
|
||||
$(LOG_BATCH) $(LOG_OUT_BLOCK) $(LOG_IN_BLOCK) \
|
||||
$(LOG_BATCH) $(LOG_BLOCK_OUT) $(LOG_BLOCK_IN) \
|
||||
$(LOG_UOP_BUFF_SIZE) $(LOG_INP_BUFF_SIZE) $(LOG_WGT_BUFF_SIZE) \
|
||||
$(LOG_ACC_BUFF_SIZE) $(LOG_OUT_BUFF_SIZE)
|
||||
|
||||
|
|
|
@ -62,7 +62,7 @@ if { [llength $argv] eq 19 } {
|
|||
}
|
||||
|
||||
# C define flags to pass to compiler
|
||||
set cflags "-I $include_dir -I $include_dir/hardware/hls \
|
||||
set cflags "-I $include_dir -I $src_dir -I $test_dir \
|
||||
-DDEBUG=0 -DLOG_WGT_WIDTH=$wgt_width -DLOG_INP_WIDTH=$inp_width \
|
||||
-DLOG_ACC_WIDTH=$acc_width -DLOG_OUT_WIDTH=$out_width \
|
||||
-DLOG_BATCH=$batch -DLOG_BLOCK_OUT=$block_out -DLOG_BLOCK_IN=$block_in \
|
||||
|
@ -127,7 +127,7 @@ open_project vta_sim
|
|||
set_top vta
|
||||
add_files $src_dir/vta.cc -cflags $cflags
|
||||
add_files -tb $sim_dir/vta_test.cc -cflags $cflags
|
||||
add_files -tb $test_dir/vta_test_lib.cc -cflags $cflags
|
||||
add_files -tb $test_dir/test_lib.cc -cflags $cflags
|
||||
open_solution "solution0"
|
||||
init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out
|
||||
csim_design -clean
|
||||
|
|
|
@ -8,8 +8,8 @@
|
|||
#include <stdlib.h>
|
||||
#include <iostream>
|
||||
|
||||
#include "vta.h"
|
||||
#include "vta_test_lib.h"
|
||||
#include "../src/vta.h"
|
||||
#include "../../../tests/hardware/common/test_lib.h"
|
||||
|
||||
int main(void)
|
||||
{
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "vta.h"
|
||||
#include "./vta.h"
|
||||
|
||||
void fetch (
|
||||
uint32_t insn_count,
|
|
@ -11,8 +11,88 @@
|
|||
#include <ap_int.h>
|
||||
#include <hls_stream.h>
|
||||
|
||||
#include "vta_typedefs.h"
|
||||
#include "vta_params.h"
|
||||
#include <vta/hw_spec.h>
|
||||
|
||||
/* \typedef uop_T Micro-op datatype*/
|
||||
typedef ap_uint<UOP_WIDTH> uop_T;
|
||||
|
||||
/* \typedef inp_T Input datatype*/
|
||||
typedef ap_int<INP_WIDTH> inp_T;
|
||||
|
||||
/* \typedef wgt_T Weight datatype*/
|
||||
typedef ap_int<WGT_WIDTH> wgt_T;
|
||||
|
||||
/* \typedef out_T Output datatype*/
|
||||
typedef ap_int<OUT_WIDTH> out_T;
|
||||
|
||||
/* \typedef acc_T Accumulator datatype*/
|
||||
typedef ap_int<ACC_WIDTH> acc_T;
|
||||
|
||||
/* \typedef mul_T Multiplier output datatype*/
|
||||
typedef ap_int<WGT_WIDTH+INP_WIDTH+1> mul_T;
|
||||
|
||||
/* \typedef sum_T GEMM accumulator datatype*/
|
||||
typedef ap_int<WGT_WIDTH+INP_WIDTH+LOG_BLOCK_IN+1> sum_T;
|
||||
|
||||
/* \typedef inp_vec_T Input vector datatype*/
|
||||
typedef ap_uint<INP_WIDTH*BLOCK_IN> inp_vec_T;
|
||||
|
||||
/* \typedef wgt_vec_T Weight vector datatype*/
|
||||
typedef ap_uint<WGT_WIDTH*BLOCK_IN> wgt_vec_T;
|
||||
|
||||
/* \typedef acc_vec_T Accumulator vector datatype*/
|
||||
typedef ap_uint<ACC_WIDTH*BLOCK_OUT> acc_vec_T;
|
||||
|
||||
/* \typedef out_vec_T Output vector datatype*/
|
||||
typedef ap_uint<OUT_WIDTH*BLOCK_OUT> out_vec_T;
|
||||
|
||||
/* \typedef uop_idx_T Micro-op SRAM index datatype*/
|
||||
typedef ap_uint<LOG_UOP_BUFF_DEPTH+1> uop_idx_T;
|
||||
|
||||
/* \typedef inp_idx_T Input SRAM index datatype*/
|
||||
typedef ap_uint<LOG_INP_BUFF_DEPTH+1> inp_idx_T;
|
||||
|
||||
/* \typedef wgt_idx_T Weight SRAM index datatype*/
|
||||
typedef ap_uint<LOG_WGT_BUFF_DEPTH+1> wgt_idx_T;
|
||||
|
||||
/* \typedef acc_idx_T Accumulator SRAM index datatype*/
|
||||
typedef ap_uint<LOG_ACC_BUFF_DEPTH+1> acc_idx_T;
|
||||
|
||||
/* \typedef opcode_T Opcode datatype*/
|
||||
typedef ap_uint<OPCODE_BIT_WIDTH> opcode_T;
|
||||
|
||||
/* \typedef insn_T Instruction datatype*/
|
||||
typedef ap_uint<INS_WIDTH> insn_T;
|
||||
|
||||
/* \typedef loop_T Loop bound datatype*/
|
||||
typedef ap_uint<LOOP_ITER_WIDTH> loop_T;
|
||||
|
||||
/* \typedef memop_id_T Memory operation ID datatype*/
|
||||
typedef ap_uint<MEMOP_ID_BIT_WIDTH> memop_id_T;
|
||||
|
||||
/* \typedef memop_sram_T Memory operation SRAM index datatype*/
|
||||
typedef ap_uint<MEMOP_SRAM_ADDR_BIT_WIDTH> memop_sram_T;
|
||||
|
||||
/* \typedef memop_dram_T Memory operation DRAM index datatype*/
|
||||
typedef ap_uint<MEMOP_DRAM_ADDR_BIT_WIDTH> memop_dram_T;
|
||||
|
||||
/* \typedef memop_size_T Memory operation range datatype*/
|
||||
typedef ap_uint<MEMOP_SIZE_BIT_WIDTH> memop_size_T;
|
||||
|
||||
/* \typedef memop_stride_T Memory operation stride datatype*/
|
||||
typedef ap_uint<MEMOP_STRIDE_BIT_WIDTH> memop_stride_T;
|
||||
|
||||
/* \typedef memop_pad_T Memory operation pad width datatype*/
|
||||
typedef ap_uint<MEMOP_PAD_BIT_WIDTH> memop_pad_T;
|
||||
|
||||
/* \typedef aluop_opcode_T ALU operation opcode datatype*/
|
||||
typedef ap_uint<ALU_OPCODE_BIT_WIDTH> aluop_opcode_T;
|
||||
|
||||
/* \typedef aluop_imm_T ALU operation immediate datatype*/
|
||||
typedef ap_int<ALUOP_IMM_BIT_WIDTH> aluop_imm_T;
|
||||
|
||||
/* \typedef aluop_sh_imm_T ALU operation shift immediate datatype*/
|
||||
typedef ap_uint<LOG_ACC_WIDTH> aluop_sh_imm_T;
|
||||
|
||||
/*!
|
||||
* \brief Fetch module.
|
|
@ -1,97 +0,0 @@
|
|||
/*!
|
||||
* Copyright (c) 2018 by Contributors
|
||||
* \file vta_typedefs.h
|
||||
* \brief Type definitions for VTA HLS design.
|
||||
*/
|
||||
#ifndef VTA_TYPEDEFS_H_
|
||||
#define VTA_TYPEDEFS_H_
|
||||
|
||||
#include <assert.h>
|
||||
#include <ap_axi_sdata.h>
|
||||
#include <ap_int.h>
|
||||
#include <hls_stream.h>
|
||||
|
||||
#include "vta_params.h"
|
||||
|
||||
/* \typedef uop_T Micro-op datatype*/
|
||||
typedef ap_uint<UOP_WIDTH> uop_T;
|
||||
|
||||
/* \typedef inp_T Input datatype*/
|
||||
typedef ap_int<INP_WIDTH> inp_T;
|
||||
|
||||
/* \typedef wgt_T Weight datatype*/
|
||||
typedef ap_int<WGT_WIDTH> wgt_T;
|
||||
|
||||
/* \typedef out_T Output datatype*/
|
||||
typedef ap_int<OUT_WIDTH> out_T;
|
||||
|
||||
/* \typedef acc_T Accumulator datatype*/
|
||||
typedef ap_int<ACC_WIDTH> acc_T;
|
||||
|
||||
/* \typedef mul_T Multiplier output datatype*/
|
||||
typedef ap_int<WGT_WIDTH+INP_WIDTH+1> mul_T;
|
||||
|
||||
/* \typedef sum_T GEMM accumulator datatype*/
|
||||
typedef ap_int<WGT_WIDTH+INP_WIDTH+LOG_BLOCK_IN+1> sum_T;
|
||||
|
||||
/* \typedef inp_vec_T Input vector datatype*/
|
||||
typedef ap_uint<INP_WIDTH*BLOCK_IN> inp_vec_T;
|
||||
|
||||
/* \typedef wgt_vec_T Weight vector datatype*/
|
||||
typedef ap_uint<WGT_WIDTH*BLOCK_IN> wgt_vec_T;
|
||||
|
||||
/* \typedef acc_vec_T Accumulator vector datatype*/
|
||||
typedef ap_uint<ACC_WIDTH*BLOCK_OUT> acc_vec_T;
|
||||
|
||||
/* \typedef out_vec_T Output vector datatype*/
|
||||
typedef ap_uint<OUT_WIDTH*BLOCK_OUT> out_vec_T;
|
||||
|
||||
/* \typedef uop_idx_T Micro-op SRAM index datatype*/
|
||||
typedef ap_uint<LOG_UOP_BUFF_DEPTH+1> uop_idx_T;
|
||||
|
||||
/* \typedef inp_idx_T Input SRAM index datatype*/
|
||||
typedef ap_uint<LOG_INP_BUFF_DEPTH+1> inp_idx_T;
|
||||
|
||||
/* \typedef wgt_idx_T Weight SRAM index datatype*/
|
||||
typedef ap_uint<LOG_WGT_BUFF_DEPTH+1> wgt_idx_T;
|
||||
|
||||
/* \typedef acc_idx_T Accumulator SRAM index datatype*/
|
||||
typedef ap_uint<LOG_ACC_BUFF_DEPTH+1> acc_idx_T;
|
||||
|
||||
/* \typedef opcode_T Opcode datatype*/
|
||||
typedef ap_uint<OPCODE_BIT_WIDTH> opcode_T;
|
||||
|
||||
/* \typedef insn_T Instruction datatype*/
|
||||
typedef ap_uint<INS_WIDTH> insn_T;
|
||||
|
||||
/* \typedef loop_T Loop bound datatype*/
|
||||
typedef ap_uint<LOOP_ITER_WIDTH> loop_T;
|
||||
|
||||
/* \typedef memop_id_T Memory operation ID datatype*/
|
||||
typedef ap_uint<MEMOP_ID_BIT_WIDTH> memop_id_T;
|
||||
|
||||
/* \typedef memop_sram_T Memory operation SRAM index datatype*/
|
||||
typedef ap_uint<MEMOP_SRAM_ADDR_BIT_WIDTH> memop_sram_T;
|
||||
|
||||
/* \typedef memop_dram_T Memory operation DRAM index datatype*/
|
||||
typedef ap_uint<MEMOP_DRAM_ADDR_BIT_WIDTH> memop_dram_T;
|
||||
|
||||
/* \typedef memop_size_T Memory operation range datatype*/
|
||||
typedef ap_uint<MEMOP_SIZE_BIT_WIDTH> memop_size_T;
|
||||
|
||||
/* \typedef memop_stride_T Memory operation stride datatype*/
|
||||
typedef ap_uint<MEMOP_STRIDE_BIT_WIDTH> memop_stride_T;
|
||||
|
||||
/* \typedef memop_pad_T Memory operation pad width datatype*/
|
||||
typedef ap_uint<MEMOP_PAD_BIT_WIDTH> memop_pad_T;
|
||||
|
||||
/* \typedef aluop_opcode_T ALU operation opcode datatype*/
|
||||
typedef ap_uint<ALU_OPCODE_BIT_WIDTH> aluop_opcode_T;
|
||||
|
||||
/* \typedef aluop_imm_T ALU operation immediate datatype*/
|
||||
typedef ap_int<ALUOP_IMM_BIT_WIDTH> aluop_imm_T;
|
||||
|
||||
/* \typedef aluop_sh_imm_T ALU operation shift immediate datatype*/
|
||||
typedef ap_uint<LOG_ACC_WIDTH> aluop_sh_imm_T;
|
||||
|
||||
#endif // VTA_TYPEDEFS_H_
|
|
@ -0,0 +1,100 @@
|
|||
/*!
|
||||
* Copyright (c) 2018 by Contributors
|
||||
* \file vta_driver.h
|
||||
* \brief General driver interface.
|
||||
*/
|
||||
|
||||
#ifndef VTA_DRIVER_H_
|
||||
#define VTA_DRIVER_H_
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
|
||||
/*! \brief Memory management constants with libxlnk_cma */
|
||||
#define CACHED 1
|
||||
/*! \brief Memory management constants with libxlnk_cma */
|
||||
#define NOT_CACHED 0
|
||||
|
||||
/*! \brief VTA command handle */
|
||||
typedef void * VTAHandle;
|
||||
|
||||
/*!
|
||||
* \brief Allocates physically contiguous region in memory (limited by MAX_XFER).
|
||||
* \param size Size of the region in Bytes.
|
||||
* \param cached Region can be set to not cached (write-back) if set to 0.
|
||||
* \return A pointer to the allocated region.
|
||||
*/
|
||||
void* VTAMemAlloc(size_t size, int cached);
|
||||
|
||||
/*!
|
||||
* \brief Frees a physically contiguous region in memory.
|
||||
* \param buf Buffer to free.
|
||||
*/
|
||||
void VTAMemFree(void* buf);
|
||||
|
||||
/*!
|
||||
* \brief Returns a physical address to the region of memory allocated with VTAMemAlloc.
|
||||
* \param buf Pointer to memory region allocated with VTAMemAlloc.
|
||||
* \return The physical address of the memory region.
|
||||
*/
|
||||
uint32_t VTAGetMemPhysAddr(void* buf);
|
||||
|
||||
/*!
|
||||
* \brief Flushes the region of memory out of the CPU cache to DRAM.
|
||||
* \param buf Pointer to memory region allocated with VTAMemAlloc to be flushed.
|
||||
* \param size Size of the region to flush in Bytes.
|
||||
*/
|
||||
void VTAFlushCache(void* buf, int size);
|
||||
|
||||
/*!
|
||||
* \brief Invalidates the region of memory that is cached.
|
||||
* \param buf Pointer to memory region allocated with VTAMemAlloc to be invalidated.
|
||||
* \param size Size of the region to invalidate in Bytes.
|
||||
*/
|
||||
void VTAInvalidateCache(void* buf, int size);
|
||||
|
||||
/*!
|
||||
* \brief Returns a memory map to FPGA configuration registers.
|
||||
* \param addr The base physical address of the configuration registers.
|
||||
* \param length The size of the memory mapped region in bytes.
|
||||
* \return A pointer to the memory mapped region.
|
||||
*/
|
||||
void *VTAMapRegister(unsigned addr, size_t length);
|
||||
|
||||
/*!
|
||||
* \brief Deletes the configuration register memory map.
|
||||
* \param vta The memory mapped region.
|
||||
* \param length The size of the memory mapped region in bytes.
|
||||
*/
|
||||
void VTAUnmapRegister(void *vta, size_t length);
|
||||
|
||||
/*!
|
||||
* \brief Writes to a memory mapped configuration register.
|
||||
* \param vta_base The handle to the memory mapped configuration registers.
|
||||
* \param offset The offset of the register to write to.
|
||||
* \param val The value to be written to the memory mapped register.
|
||||
*/
|
||||
void VTAWriteMappedReg(VTAHandle vta_base, unsigned offset, unsigned val);
|
||||
|
||||
/*!
|
||||
* \brief Reads from the memory mapped configuration register.
|
||||
* \param vta_base The handle to the memory mapped configuration registers.
|
||||
* \param offset The offset of the register to read from.
|
||||
* \return The value read from the memory mapped register.
|
||||
*/
|
||||
unsigned VTAReadMappedReg(VTAHandle vta_base, unsigned offset);
|
||||
|
||||
/*!
|
||||
* \brief Programming the bit stream on the FPGA.
|
||||
* \param bitstream The path to the bit stream file.
|
||||
*/
|
||||
void VTAProgram(const char* bitstream);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
#endif // VTA_DRIVER_H_
|
|
@ -3,8 +3,13 @@
|
|||
* \file hw_spec.h
|
||||
* \brief Preprocessor definitions for VTA HLS design and runtime.
|
||||
*/
|
||||
#ifndef VTA_DEFINES_H_
|
||||
#define VTA_DEFINES_H_
|
||||
|
||||
#ifndef VTA_HW_SPEC_H_
|
||||
#define VTA_HW_SPEC_H_
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
|
@ -556,4 +561,7 @@ typedef struct {
|
|||
uint32_t wgt_idx : LOG_WGT_BUFF_DEPTH;
|
||||
} VTAUop;
|
||||
|
||||
#endif // VTA_DEFINES_H_
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
#endif // VTA_HW_SPEC_H_
|
|
@ -0,0 +1,274 @@
|
|||
/*!
|
||||
* Copyright (c) 2018 by Contributors
|
||||
* \file runtime.h
|
||||
* \brief VTA runtime library.
|
||||
*/
|
||||
|
||||
#ifndef VTA_RUNTIME_H_
|
||||
#define VTA_RUNTIME_H_
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include "./driver.h"
|
||||
|
||||
#define VTA_MEMCPY_H2D 1
|
||||
#define VTA_MEMCPY_D2H 2
|
||||
#define VTA_MEMCPY_D2D 3
|
||||
|
||||
#define VTA_DEBUG_DUMP_INSN (1 << 1)
|
||||
#define VTA_DEBUG_DUMP_UOP (1 << 2)
|
||||
#define VTA_DEBUG_SKIP_READ_BARRIER (1 << 3)
|
||||
#define VTA_DEBUG_SKIP_WRITE_BARRIER (1 << 4)
|
||||
#define VTA_DEBUG_FORCE_SERIAL (1 << 5)
|
||||
|
||||
/*! \brief VTA command handle */
|
||||
typedef void * VTACommandHandle;
|
||||
|
||||
/*! \brief Shutdown hook of VTA to cleanup resources */
|
||||
void VTARuntimeShutdown();
|
||||
|
||||
/*!
|
||||
* \brief Get thread local command handle.
|
||||
* \return A thread local command handle.
|
||||
*/
|
||||
VTACommandHandle VTATLSCommandHandle();
|
||||
|
||||
/*!
|
||||
* \brief Allocate data buffer.
|
||||
* \param cmd The VTA command handle.
|
||||
* \param size Buffer size.
|
||||
* \return A pointer to the allocated buffer.
|
||||
*/
|
||||
void* VTABufferAlloc(VTACommandHandle cmd, size_t size);
|
||||
|
||||
/*!
|
||||
* \brief Free data buffer.
|
||||
* \param cmd The VTA command handle.
|
||||
* \param buffer The data buffer to be freed.
|
||||
*/
|
||||
void VTABufferFree(VTACommandHandle cmd, void* buffer);
|
||||
|
||||
/*!
|
||||
* \brief Get the buffer access pointer on CPU.
|
||||
* \param cmd The VTA command handle.
|
||||
* \param buffer The data buffer.
|
||||
* \return The pointer that can be accessed by the CPU.
|
||||
*/
|
||||
void* VTABufferCPUPtr(VTACommandHandle cmd, void* buffer);
|
||||
|
||||
/*!
|
||||
* \brief Copy data buffer from one location to another.
|
||||
* \param cmd The VTA command handle.
|
||||
* \param from The source buffer base address.
|
||||
* \param from_offset The offset of the source buffer.
|
||||
* \param to The target buffer base address.
|
||||
* \param to_offset The offset of the target buffer.
|
||||
* \param size Size of copy.
|
||||
* \param kind_mask The memory copy kind.
|
||||
*/
|
||||
void VTABufferCopy(VTACommandHandle cmd,
|
||||
const void* from,
|
||||
size_t from_offset,
|
||||
void* to,
|
||||
size_t to_offset,
|
||||
size_t size,
|
||||
int kind_mask);
|
||||
|
||||
/*!
|
||||
* \brief Set debug mode on the command handle.
|
||||
* \param cmd The VTA command handle.
|
||||
* \param debug_flag The debug flag.
|
||||
*/
|
||||
void VTASetDebugMode(VTACommandHandle cmd, int debug_flag);
|
||||
|
||||
/*!
|
||||
* \brief Perform a write barrier to make a memory region visible to the CPU.
|
||||
* \param cmd The VTA command handle.
|
||||
* \param buffer The head buffer pointer.
|
||||
* \param elem_bits The size in bits of each element.
|
||||
* \param start The start of the region (in elements).
|
||||
* \param extent The end of the region (in elements).
|
||||
*/
|
||||
void VTAWriteBarrier(VTACommandHandle cmd,
|
||||
void* buffer, uint32_t elem_bits,
|
||||
uint32_t start, uint32_t extent);
|
||||
|
||||
/*!
|
||||
* \brief Perform a read barrier to a memory region visible to VTA.
|
||||
* \param cmd The VTA command handle.
|
||||
* \param buffer The head buffer pointer.
|
||||
* \param elem_bits The size in bits of each element.
|
||||
* \param start The start of the region (in elements).
|
||||
* \param extent The end of the region (in elements).
|
||||
*/
|
||||
void VTAReadBarrier(VTACommandHandle cmd,
|
||||
void* buffer, uint32_t elem_bits,
|
||||
uint32_t start, uint32_t extent);
|
||||
|
||||
/*!
|
||||
* \brief Perform a 2D data load from DRAM.
|
||||
* Sizes are measured in units of vector elements.
|
||||
* \param cmd The VTA command handle.
|
||||
* \param src_dram_addr Source DRAM address.
|
||||
* \param src_elem_offset The source DRAM offset in number of unit elements.
|
||||
* \param x_size The lowest dimension (x axis) size in number of unit elements.
|
||||
* \param y_size The number of rows (y axis).
|
||||
* \param x_stride The x axis stride.
|
||||
* \param x_pad_before The start padding on x axis.
|
||||
* \param y_pad_before The start padding on y axis.
|
||||
* \param x_pad_after The end padding on x axis.
|
||||
* \param y_pad_after The end padding of y axis.
|
||||
* \param dst_sram_index Destination SRAM index.
|
||||
* \param dst_memory_type Destination memory type.
|
||||
*/
|
||||
void VTALoadBuffer2D(VTACommandHandle cmd,
|
||||
void* src_dram_addr,
|
||||
uint32_t src_elem_offset,
|
||||
uint32_t x_size,
|
||||
uint32_t y_size,
|
||||
uint32_t x_stride,
|
||||
uint32_t x_pad_before,
|
||||
uint32_t y_pad_before,
|
||||
uint32_t x_pad_after,
|
||||
uint32_t y_pad_after,
|
||||
uint32_t dst_sram_index,
|
||||
uint32_t dst_memory_type);
|
||||
|
||||
/*!
|
||||
* \brief Perform a 2D data store into DRAM.
|
||||
* Sizes are measured in units of vector elements.
|
||||
* \param cmd The VTA command handle.
|
||||
* \param src_sram_index Source SRAM index.
|
||||
* \param src_memory_type Source memory type.
|
||||
* \param dst_dram_addr Destination DRAM address.
|
||||
* \param x_size The lowest dimension (x axis) size in number of unit elements.
|
||||
* \param y_size The number of rows.
|
||||
* \param x_stride The x axis stride.
|
||||
*/
|
||||
void VTAStoreBuffer2D(VTACommandHandle cmd,
|
||||
uint32_t src_sram_index,
|
||||
uint32_t src_memory_type,
|
||||
void* dst_dram_addr,
|
||||
uint32_t dst_elem_offset,
|
||||
uint32_t x_size,
|
||||
uint32_t y_size,
|
||||
uint32_t x_stride);
|
||||
|
||||
/*!
|
||||
* \brief Push uop into kernel buffer.
|
||||
* In GEMM mode, do a blocked GEMM with 2d access pattern.
|
||||
* In ALU mode, do a vectorized ALU operation with 2d access pattern.
|
||||
*
|
||||
* \code
|
||||
*
|
||||
* DType accum[INP_BUFF_DEPTH][l][n];
|
||||
* DType weight[WGT_BUFF_DEPTH][n][m];
|
||||
* DType input[INP_BUFF_DEPTH][l][m];
|
||||
* if reset_out == 1
|
||||
* accum[dst_index] = 0
|
||||
* elif mode == 0
|
||||
* accum[dst_index] += GEMM(input[src_index], weight[wgt_index]);
|
||||
* else
|
||||
* if (use_imm)
|
||||
* accum[dst_index] = opcode(accum[dst_index], imm_val);
|
||||
* else
|
||||
* accum[dst_index] = opcode(accum[dst_index], accum[src_index]);
|
||||
*
|
||||
* \endcode
|
||||
*
|
||||
* \param mode Set to GEMM mode if set to 0, ALU mode if set to 1.
|
||||
* \param reset_out Resets the accum to 0.
|
||||
* \param dst_index The accum memory index.
|
||||
* \param src_index The input memory (gemm) / accum memory (alu) index.
|
||||
* \param wgt_index The weight memory index.
|
||||
* \param opcode The ALU opcode.
|
||||
* \param use_imm Use immediate in ALU mode if set to true.
|
||||
* \param imm_val Immediate value in ALU mode.
|
||||
*/
|
||||
void VTAUopPush(uint32_t mode,
|
||||
uint32_t reset_out,
|
||||
uint32_t dst_index,
|
||||
uint32_t src_index,
|
||||
uint32_t wgt_index,
|
||||
uint32_t opcode,
|
||||
uint32_t use_imm,
|
||||
uint32_t imm_val);
|
||||
|
||||
/*!
|
||||
* \brief Mark start of a micro op loop.
|
||||
* \param extent The extent of the loop.
|
||||
* \param dst_factor The accum factor.
|
||||
* \param src_factor The input factor.
|
||||
* \param wgt_factor The weight factor.
|
||||
*/
|
||||
void VTAUopLoopBegin(uint32_t extent,
|
||||
uint32_t dst_factor,
|
||||
uint32_t src_factor,
|
||||
uint32_t wgt_factor);
|
||||
|
||||
/*!
|
||||
* \brief Mark end of a micro op loop.
|
||||
*/
|
||||
void VTAUopLoopEnd();
|
||||
|
||||
/*!
|
||||
* \brief Push GEMM uop kernel into the command handle.
|
||||
* \param uop_handle The uop cache handle.
|
||||
* \param finit The initialization function to initialize uop.
|
||||
* \param signature The closure arguments of the finit.
|
||||
* \param nbytes Number of bytes in the closure arguments.
|
||||
* \return 0 if success.
|
||||
*/
|
||||
int VTAPushGEMMOp(void** uop_handle,
|
||||
int (*finit)(void*),
|
||||
void* signature,
|
||||
int nbytes);
|
||||
|
||||
/*!
|
||||
* \brief Push ALU uop kernel into the command handle.
|
||||
* \param uop_handle The uop cache handle.
|
||||
* \param finit The initialization function to initialize uop.
|
||||
* \param signature The closure arguments of the finit.
|
||||
* \param nbytes Number of bytes in the closure arguments.
|
||||
* \return 0 if success.
|
||||
*/
|
||||
int VTAPushALUOp(void** uop_handle,
|
||||
int (*finit)(void*),
|
||||
void* signature,
|
||||
int nbytes);
|
||||
|
||||
/*!
|
||||
* \brief Push dependence token.
|
||||
* \param cmd The VTA command handle.
|
||||
* \param from_qid The source queue.
|
||||
* \param to_qid The destination queue.
|
||||
* \return 0 if success.
|
||||
*/
|
||||
int VTADepPush(VTACommandHandle cmd, int from_qid, int to_qid);
|
||||
|
||||
/*!
|
||||
* \brief Pop dependence signal.
|
||||
* \param cmd The VTA command handle.
|
||||
* \param from_qid The source queue.
|
||||
* \param to_qid The destination queue.
|
||||
* \return 0 if success.
|
||||
*/
|
||||
int VTADepPop(VTACommandHandle cmd, int from_qid, int to_qid);
|
||||
|
||||
/*!
|
||||
* \brief Synchronize the command handle.
|
||||
* Commit all the instructions to VTA and wait until
|
||||
* the accelerator finishes its job.
|
||||
* Perform all of the out-of-order DRAM stores.
|
||||
* \param cmd The VTA command handle.
|
||||
* \param wait_cycles The limit of poll cycles.
|
||||
*
|
||||
*/
|
||||
void VTASynchronize(VTACommandHandle cmd, uint32_t wait_cycles);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
#endif // VTA_RUNTIME_H_
|
|
@ -1,152 +0,0 @@
|
|||
/*!
|
||||
* Copyright (c) 2018 by Contributors
|
||||
* \file vta_pynq_driver.h
|
||||
* \brief VTA driver for Pynq board.
|
||||
*/
|
||||
|
||||
#ifndef VTA_PYNQ_DRIVER_H_
|
||||
#define VTA_PYNQ_DRIVER_H_
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
#include <assert.h>
|
||||
#include <fcntl.h>
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <sys/mman.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#ifdef __arm__
|
||||
#include "libxlnk_cma.h"
|
||||
#else
|
||||
void* cma_alloc(size_t size, int cached);
|
||||
void cma_free(void* buf);
|
||||
uint32_t cma_get_phy_addr(void* buf);
|
||||
void xlnkFlushCache(void* buf, int size);
|
||||
void xlnkInvalidateCache(void* buf, int size);
|
||||
#endif
|
||||
|
||||
/*! \brief VTA command handle */
|
||||
typedef void * VTAHandle;
|
||||
|
||||
/*! \brief DMA command handle */
|
||||
typedef struct {
|
||||
/*! \brief Register map to the AXI DMA control registers*/
|
||||
void *dma_register_map;
|
||||
/*! \brief Transmit data descriptor*/
|
||||
void *mm2s_descriptor_register_map;
|
||||
/*! \brief Receive data descriptor*/
|
||||
void *s2mm_descriptor_register_map;
|
||||
/*! \brief Transmit data descriptor physical address*/
|
||||
uint32_t mm2s_descriptor_phy;
|
||||
/*! \brief Receive data descriptor physical address*/
|
||||
uint32_t s2mm_descriptor_phy;
|
||||
/*! \brief Descriptor size */
|
||||
uint32_t descriptor_size;
|
||||
/*! \brief Transaction count for tx channel */
|
||||
uint32_t mm2s_count;
|
||||
/*! \brief Transaction count for rx channel */
|
||||
uint32_t s2mm_count;
|
||||
/*! \brief Multi-channel mode enable */
|
||||
int multichannel_en;
|
||||
} DMAHandle;
|
||||
|
||||
/*! \brief partial bitstream status file path */
|
||||
#define BS_IS_PARTIAL "/sys/devices/soc0/amba/f8007000.devcfg/is_partial_bitstream"
|
||||
/*! \brief bitstream destination file path */
|
||||
#define BS_XDEVCFG "/dev/xdevcfg"
|
||||
|
||||
/*! \brief Path to /dev/mem */
|
||||
#define DEV_MEM_PATH "/dev/mem"
|
||||
/*! \brief MMIO driver constant */
|
||||
#define MMIO_WORD_LENGTH 4
|
||||
/*! \brief MMIO driver constant */
|
||||
#define MMIO_WORD_MASK (~(MMIO_WORD_LENGTH - 1))
|
||||
|
||||
/*! \brief VTA configuration register address range */
|
||||
#define VTA_RANGE 0x100
|
||||
/*! \brief VTA configuration register start value */
|
||||
#define VTA_START 0x1
|
||||
/*! \brief VTA configuration register auto-restart value */
|
||||
#define VTA_AUTORESTART 0x81
|
||||
/*! \brief VTA configuration register done value */
|
||||
#define VTA_DONE 0x1
|
||||
|
||||
/*! \brief VTA fetch stage configuration register address
|
||||
* from auto-generated XPAR_FETCH_0_S_AXI_CONTROL_BUS_BASEADDR define
|
||||
* in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
|
||||
*/
|
||||
#define VTA_FETCH_ADDR 0x43C00000
|
||||
/*! \brief VTA compute stage configuration register address
|
||||
* from auto-generated XPAR_COMPUTE_0_S_AXI_CONTROL_BUS_BASEADDR define
|
||||
* in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
|
||||
*/
|
||||
#define VTA_COMPUTE_ADDR 0x43C10000
|
||||
/*! \brief VTA load stage configuration register address
|
||||
* from auto-generated XPAR_LOAD_0_S_AXI_CONTROL_BUS_BASEADDR define
|
||||
* in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
|
||||
*/
|
||||
#define VTA_LOAD_ADDR 0x43C20000
|
||||
/*! \brief VTA store stage configuration register address
|
||||
* from auto-generated XPAR_STORE_0_S_AXI_CONTROL_BUS_BASEADDR define
|
||||
* in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
|
||||
*/
|
||||
#define VTA_STORE_ADDR 0x43C30000
|
||||
|
||||
/*! \brief Memory management constants with libxlnk_cma */
|
||||
#define CACHED 1
|
||||
/*! \brief Memory management constants with libxlnk_cma */
|
||||
#define NOT_CACHED 0
|
||||
|
||||
/*! \brief log2 of SDS buffer size limit */
|
||||
#define LOG_MAX_XFER 22
|
||||
/*! \brief SDS buffer size limit */
|
||||
#define MAX_XFER (1<<LOG_MAX_XFER)
|
||||
|
||||
/*!
|
||||
* \brief Returns a memory map to FPGA configuration registers.
|
||||
* \param addr The base physical address of the configuration registers.
|
||||
* \param length The size of the memory mapped region in bytes.
|
||||
* \return A pointer to the memory mapped region.
|
||||
*/
|
||||
void *MapRegister(unsigned addr, size_t length);
|
||||
|
||||
/*!
|
||||
* \brief Deletes the configuration register memory map.
|
||||
* \param vta The memory mapped region.
|
||||
* \param length The size of the memory mapped region in bytes.
|
||||
*/
|
||||
void UnmapRegister(void *vta, size_t length);
|
||||
|
||||
/*!
|
||||
* \brief Writes to a memory mapped configuration register.
|
||||
* \param vta_base The handle to the memory mapped configuration registers.
|
||||
* \param offset The offset of the register to write to.
|
||||
* \param val The value to be written to the memory mapped register.
|
||||
*/
|
||||
void WriteMappedReg(VTAHandle vta_base, unsigned offset, unsigned val);
|
||||
|
||||
/*!
|
||||
* \brief Reads from the memory mapped configuration register.
|
||||
* \param vta_base The handle to the memory mapped configuration registers.
|
||||
* \param offset The offset of the register to read from.
|
||||
* \return The value read from the memory mapped register.
|
||||
*/
|
||||
unsigned ReadMappedReg(VTAHandle vta_base, unsigned offset);
|
||||
|
||||
/*!
|
||||
* \brief Programs the bitstream onto the FPGA.
|
||||
* \param bitstream The path to the bit stream file.
|
||||
*/
|
||||
void ProgramVTA(const char* bitstream);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
#endif // VTA_PYNQ_DRIVER_H_
|
|
@ -25,3 +25,72 @@ ADD_LDFLAGS=
|
|||
|
||||
# the additional compile flags you want to add
|
||||
ADD_CFLAGS=
|
||||
|
||||
# the hardware target
|
||||
TARGET=PYNQ_TARGET
|
||||
|
||||
#---------------------
|
||||
# VTA hardware parameters
|
||||
#--------------------
|
||||
|
||||
# Log of input/activation width in bits (default 3 -> 8 bits)
|
||||
LOG_INP_WIDTH = 3
|
||||
# Log of kernel weight width in bits (default 3 -> 8 bits)
|
||||
LOG_WGT_WIDTH = 3
|
||||
# Log of accum width in bits (default 5 -> 32 bits)
|
||||
LOG_ACC_WIDTH = 5
|
||||
# Log of tensor batch size (A in (A,B)x(B,C) matrix multiplication)
|
||||
LOG_BATCH = 0
|
||||
# Log of tensor inner block size (B in (A,B)x(B,C) matrix multiplication)
|
||||
LOG_BLOCK_IN = 4
|
||||
# Log of tensor outer block size (C in (A,B)x(B,C) matrix multiplication)
|
||||
LOG_BLOCK_OUT = 4
|
||||
# Log of uop buffer size in Bytes
|
||||
LOG_UOP_BUFF_SIZE = 15
|
||||
# Log of inp buffer size in Bytes
|
||||
LOG_INP_BUFF_SIZE = 15
|
||||
# Log of wgt buffer size in Bytes
|
||||
LOG_WGT_BUFF_SIZE = 15
|
||||
# Log of acc buffer size in Bytes
|
||||
LOG_ACC_BUFF_SIZE = 17
|
||||
|
||||
#---------------------
|
||||
# Derived VTA hardware parameters
|
||||
#--------------------
|
||||
|
||||
# Derived values are assigned with `:=` so that each $(shell ...) call runs
# exactly once, when this file is parsed, instead of on every expansion of
# ADD_CFLAGS. The LOG_* inputs must therefore be set before this point
# (above in this file, in the environment, or on the make command line).

# Input width in bits
INP_WIDTH := $(shell echo "$$(( 1 << $(LOG_INP_WIDTH) ))" )
# Weight width in bits
WGT_WIDTH := $(shell echo "$$(( 1 << $(LOG_WGT_WIDTH) ))" )
# Log of output width in bits (the output uses the same width as the input)
LOG_OUT_WIDTH := $(LOG_INP_WIDTH)
# Output width in bits
OUT_WIDTH := $(shell echo "$$(( 1 << $(LOG_OUT_WIDTH) ))" )
# Tensor batch size (A in (A,B)x(B,C) matrix multiplication)
BATCH := $(shell echo "$$(( 1 << $(LOG_BATCH) ))" )
# Tensor inner block size (B in (A,B)x(B,C) matrix multiplication)
IN_BLOCK := $(shell echo "$$(( 1 << $(LOG_BLOCK_IN) ))" )
# Tensor outer block size (C in (A,B)x(B,C) matrix multiplication)
OUT_BLOCK := $(shell echo "$$(( 1 << $(LOG_BLOCK_OUT) ))" )
# Uop buffer size in Bytes
UOP_BUFF_SIZE := $(shell echo "$$(( 1 << $(LOG_UOP_BUFF_SIZE) ))" )
# Inp buffer size in Bytes
INP_BUFF_SIZE := $(shell echo "$$(( 1 << $(LOG_INP_BUFF_SIZE) ))" )
# Wgt buffer size in Bytes
WGT_BUFF_SIZE := $(shell echo "$$(( 1 << $(LOG_WGT_BUFF_SIZE) ))" )
# Acc buffer size in Bytes
ACC_BUFF_SIZE := $(shell echo "$$(( 1 << $(LOG_ACC_BUFF_SIZE) ))" )
# Log of out buffer size in Bytes: the acc buffer size scaled down by the
# ratio of accumulator width to output width
LOG_OUT_BUFF_SIZE := $(shell echo "$$(( $(LOG_ACC_BUFF_SIZE)+$(LOG_OUT_WIDTH)-$(LOG_ACC_WIDTH) ))" )
# Out buffer size in Bytes
OUT_BUFF_SIZE := $(shell echo "$$(( 1 << $(LOG_OUT_BUFF_SIZE) ))" )
|
||||
|
||||
# Update ADD_CFLAGS
|
||||
ADD_CFLAGS += \
|
||||
-D$(TARGET) \
|
||||
-DLOG_WGT_WIDTH=$(LOG_WGT_WIDTH) -DLOG_INP_WIDTH=$(LOG_INP_WIDTH) \
|
||||
-DLOG_ACC_WIDTH=$(LOG_ACC_WIDTH) -DLOG_OUT_WIDTH=$(LOG_OUT_WIDTH) \
|
||||
-DLOG_BATCH=$(LOG_BATCH) -DLOG_BLOCK_IN=$(LOG_BLOCK_IN) -DLOG_BLOCK_OUT=$(LOG_BLOCK_OUT) \
|
||||
-DLOG_UOP_BUFF_SIZE=$(LOG_UOP_BUFF_SIZE) -DLOG_INP_BUFF_SIZE=$(LOG_INP_BUFF_SIZE) \
|
||||
-DLOG_WGT_BUFF_SIZE=$(LOG_WGT_BUFF_SIZE) -DLOG_ACC_BUFF_SIZE=$(LOG_ACC_BUFF_SIZE) \
|
||||
-DLOG_OUT_BUFF_SIZE=$(LOG_OUT_BUFF_SIZE)
|
|
@ -4,15 +4,31 @@
|
|||
* \brief VTA driver for Pynq board.
|
||||
*/
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
#include "vta_pynq_driver.h"
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
#include <vta/driver.h>
|
||||
#include "./pynq_driver.h"
|
||||
|
||||
void *MapRegister(uint32_t addr, size_t length) {
|
||||
|
||||
void* VTAMemAlloc(size_t size, int cached) {
|
||||
return cma_alloc(size, cached);
|
||||
}
|
||||
|
||||
void VTAMemFree(void* buf) {
|
||||
cma_free(buf);
|
||||
}
|
||||
|
||||
uint32_t VTAGetMemPhysAddr(void* buf) {
|
||||
return cma_get_phy_addr(buf);
|
||||
}
|
||||
|
||||
void VTAFlushCache(void* buf, int size) {
|
||||
xlnkFlushCache(buf, size);
|
||||
}
|
||||
|
||||
void VTAInvalidateCache(void* buf, int size) {
|
||||
xlnkInvalidateCache(buf, size);
|
||||
}
|
||||
|
||||
void *VTAMapRegister(uint32_t addr, size_t length) {
|
||||
|
||||
// Align the base address with the pages
|
||||
uint32_t virt_base = addr & ~(getpagesize() - 1);
|
||||
|
@ -24,21 +40,21 @@ void *MapRegister(uint32_t addr, size_t length) {
|
|||
return mmap(NULL, (length+virt_offset), PROT_READ|PROT_WRITE, MAP_SHARED, mmap_file, virt_base);
|
||||
}
|
||||
|
||||
void UnmapRegister(void *vta, size_t length) {
|
||||
void VTAUnmapRegister(void *vta, size_t length) {
|
||||
// Unmap memory
|
||||
int status = munmap(vta, length);
|
||||
assert(status==0);
|
||||
}
|
||||
|
||||
void WriteMappedReg(void* base_addr, uint32_t offset, uint32_t val) {
|
||||
void VTAWriteMappedReg(void* base_addr, uint32_t offset, uint32_t val) {
|
||||
*((volatile uint32_t *) (((char *) base_addr) + offset)) = val;
|
||||
}
|
||||
|
||||
uint32_t ReadMappedReg(void* base_addr, uint32_t offset) {
|
||||
uint32_t VTAReadMappedReg(void* base_addr, uint32_t offset) {
|
||||
return *((volatile uint32_t *) (((char *) base_addr) + offset));
|
||||
}
|
||||
|
||||
void ProgramVTA(const char* bitstream) {
|
||||
void VTAProgram(const char* bitstream) {
|
||||
|
||||
int elem;
|
||||
FILE *src, *dst, *partial;
|
|
@ -0,0 +1,83 @@
|
|||
/*!
|
||||
* Copyright (c) 2018 by Contributors
|
||||
* \file vta_pynq_driver.h
|
||||
* \brief VTA driver for Pynq board.
|
||||
*/
|
||||
|
||||
#ifndef VTA_PYNQ_DRIVER_H_
|
||||
#define VTA_PYNQ_DRIVER_H_
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
#include <assert.h>
|
||||
#include <fcntl.h>
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <sys/mman.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#ifdef __arm__
|
||||
#include <libxlnk_cma.h>
|
||||
#else
|
||||
void* cma_alloc(size_t size, int cached);
|
||||
void cma_free(void* buf);
|
||||
uint32_t cma_get_phy_addr(void* buf);
|
||||
void xlnkFlushCache(void* buf, int size);
|
||||
void xlnkInvalidateCache(void* buf, int size);
|
||||
#endif
|
||||
|
||||
/*! \brief partial bitstream status file path */
|
||||
#define BS_IS_PARTIAL "/sys/devices/soc0/amba/f8007000.devcfg/is_partial_bitstream"
|
||||
/*! \brief bitstream destination file path */
|
||||
#define BS_XDEVCFG "/dev/xdevcfg"
|
||||
|
||||
/*! \brief Path to /dev/mem */
|
||||
#define DEV_MEM_PATH "/dev/mem"
|
||||
/*! \brief MMIO driver constant */
|
||||
#define MMIO_WORD_LENGTH 4
|
||||
/*! \brief MMIO driver constant */
|
||||
#define MMIO_WORD_MASK (~(MMIO_WORD_LENGTH - 1))
|
||||
|
||||
/*! \brief VTA configuration register address range */
|
||||
#define VTA_RANGE 0x100
|
||||
/*! \brief VTA configuration register start value */
|
||||
#define VTA_START 0x1
|
||||
/*! \brief VTA configuration register auto-restart value */
|
||||
#define VTA_AUTORESTART 0x81
|
||||
/*! \brief VTA configuration register done value */
|
||||
#define VTA_DONE 0x1
|
||||
|
||||
/*! \brief VTA fetch stage configuration register address
|
||||
* from auto-generated XPAR_FETCH_0_S_AXI_CONTROL_BUS_BASEADDR define
|
||||
* in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
|
||||
*/
|
||||
#define VTA_FETCH_ADDR 0x43C00000
|
||||
/*! \brief VTA compute stage configuration register address
|
||||
* from auto-generated XPAR_COMPUTE_0_S_AXI_CONTROL_BUS_BASEADDR define
|
||||
* in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
|
||||
*/
|
||||
#define VTA_COMPUTE_ADDR 0x43C10000
|
||||
/*! \brief VTA load stage configuration register address
|
||||
* from auto-generated XPAR_LOAD_0_S_AXI_CONTROL_BUS_BASEADDR define
|
||||
* in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
|
||||
*/
|
||||
#define VTA_LOAD_ADDR 0x43C20000
|
||||
/*! \brief VTA store stage configuration register address
|
||||
* from auto-generated XPAR_STORE_0_S_AXI_CONTROL_BUS_BASEADDR define
|
||||
* in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
|
||||
*/
|
||||
#define VTA_STORE_ADDR 0x43C30000
|
||||
|
||||
/*! \brief Buffer size limit */
|
||||
#define MAX_XFER (1<<22)
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
#endif // VTA_PYNQ_DRIVER_H_
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -0,0 +1,106 @@
|
|||
// simply include the driver for now.
|
||||
#include <tvm/runtime/registry.h>
|
||||
#include <dmlc/thread_local.h>
|
||||
#include <vta/runtime.h>
|
||||
#include "../../tvm/src/runtime/workspace_pool.h"
|
||||
|
||||
namespace tvm {
|
||||
namespace runtime {
|
||||
|
||||
std::string VTARPCGetPath(const std::string& name) {
|
||||
static const PackedFunc* f =
|
||||
runtime::Registry::Get("tvm.contrib.rpc.server.workpath");
|
||||
CHECK(f != nullptr) << "require tvm.contrib.rpc.server.workpath";
|
||||
return (*f)(name);
|
||||
}
|
||||
|
||||
// Global functions that can be called
|
||||
// Registers the global function "tvm.contrib.vta.init".
// args[0] is a name that VTARPCGetPath resolves to a path through the
// "tvm.contrib.rpc.server.workpath" function; the resulting path is handed
// to VTAProgram as the bitstream file.
TVM_REGISTER_GLOBAL("tvm.contrib.vta.init")
.set_body([](TVMArgs args, TVMRetValue* rv) {
    std::string path = VTARPCGetPath(args[0]);
    VTAProgram(path.c_str());
    // Log message spelling fixed: "bistream" -> "bitstream".
    LOG(INFO) << "VTA initialization end with bitstream " << path;
  });
|
||||
|
||||
TVM_REGISTER_GLOBAL("tvm.contrib.rpc.server.shutdown")
|
||||
.set_body([](TVMArgs args, TVMRetValue* rv) {
|
||||
VTARuntimeShutdown();
|
||||
});
|
||||
|
||||
class VTADeviceAPI final : public DeviceAPI {
|
||||
public:
|
||||
void SetDevice(TVMContext ctx) final {}
|
||||
|
||||
void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final {
|
||||
if (kind == kExist) {
|
||||
*rv = 1;
|
||||
}
|
||||
}
|
||||
|
||||
void* AllocDataSpace(TVMContext ctx,
|
||||
size_t size, size_t alignment,
|
||||
TVMType type_hint) final {
|
||||
return VTABufferAlloc(VTATLSCommandHandle(), size);
|
||||
}
|
||||
|
||||
void FreeDataSpace(TVMContext ctx, void* ptr) final {
|
||||
VTABufferFree(VTATLSCommandHandle(), ptr);
|
||||
}
|
||||
|
||||
void CopyDataFromTo(const void* from,
|
||||
size_t from_offset,
|
||||
void* to,
|
||||
size_t to_offset,
|
||||
size_t size,
|
||||
TVMContext ctx_from,
|
||||
TVMContext ctx_to,
|
||||
TVMStreamHandle stream) final {
|
||||
int kind_mask = 0;
|
||||
if (ctx_from.device_type != kDLCPU) {
|
||||
kind_mask |= 2;
|
||||
}
|
||||
if (ctx_to.device_type != kDLCPU) {
|
||||
kind_mask |= 1;
|
||||
}
|
||||
VTABufferCopy(VTATLSCommandHandle(),
|
||||
from, from_offset,
|
||||
to, to_offset,
|
||||
size, kind_mask);
|
||||
}
|
||||
|
||||
void StreamSync(TVMContext ctx, TVMStreamHandle stream) final {
|
||||
}
|
||||
|
||||
void* AllocWorkspace(TVMContext ctx, size_t size, TVMType type_hint) final;
|
||||
|
||||
void FreeWorkspace(TVMContext ctx, void* data) final;
|
||||
|
||||
static const std::shared_ptr<VTADeviceAPI>& Global() {
|
||||
static std::shared_ptr<VTADeviceAPI> inst =
|
||||
std::make_shared<VTADeviceAPI>();
|
||||
return inst;
|
||||
}
|
||||
};
|
||||
|
||||
struct VTAWorkspacePool : public WorkspacePool {
|
||||
VTAWorkspacePool() :
|
||||
WorkspacePool(static_cast<DLDeviceType>(kExtDev),
|
||||
VTADeviceAPI::Global()) {}
|
||||
};
|
||||
|
||||
void* VTADeviceAPI::AllocWorkspace(TVMContext ctx, size_t size, TVMType type_hint) {
|
||||
return dmlc::ThreadLocalStore<VTAWorkspacePool>::Get()
|
||||
->AllocWorkspace(ctx, size);
|
||||
}
|
||||
|
||||
void VTADeviceAPI::FreeWorkspace(TVMContext ctx, void* data) {
|
||||
dmlc::ThreadLocalStore<VTAWorkspacePool>::Get()->FreeWorkspace(ctx, data);
|
||||
}
|
||||
|
||||
TVM_REGISTER_GLOBAL("device_api.ext_dev")
|
||||
.set_body([](TVMArgs args, TVMRetValue* rv) {
|
||||
DeviceAPI* ptr = VTADeviceAPI::Global().get();
|
||||
*rv = static_cast<void*>(ptr);
|
||||
});
|
||||
} // namespace runtime
|
||||
} // namespace tvm
|
|
@ -1,59 +0,0 @@
|
|||
CC ?= g++
|
||||
CFLAGS = -Wall -O3 -std=c++11 -I/usr/include
|
||||
LDFLAGS = -L/usr/lib -L/home/xilinx/pynq/drivers
|
||||
LIBS = -l:libsds_lib.so -l:libdma.so
|
||||
SRC_DIR = ../../src
|
||||
INCLUDE_DIR = ../../include
|
||||
DRIVER_DIR = $(SRC_DIR)/driver/pynq
|
||||
TESTLIB_DIR = $(SRC_DIR)/test
|
||||
VPATH = $(DRIVER_DIR):$(TESTLIB_DIR)
|
||||
SOURCES = vta_pynq_driver.c vta_test_lib.cc
|
||||
OBJECTS = vta_pynq_driver.o vta_test_lib.o driver_test.o
|
||||
EXECUTABLE = vta
|
||||
|
||||
# VTA Parameters
|
||||
# Log of input width in bits
|
||||
LOG_INP_WIDTH = 3
|
||||
# Log of weight width in bits
|
||||
LOG_WGT_WIDTH = 3
|
||||
# Log of accum width in bits
|
||||
LOG_ACC_WIDTH = 5
|
||||
# Log of output width in bits
|
||||
LOG_OUT_WIDTH = $(LOG_INP_WIDTH)
|
||||
# Log of tensor batch size (A in (A,B)x(B,C) matrix multiplication)
|
||||
LOG_BATCH = 0
|
||||
# Log of tensor inner block size (B in (A,B)x(B,C) matrix multiplication)
|
||||
LOG_IN_BLOCK = 4
|
||||
# Log of tensor outer block size (C in (A,B)x(B,C) matrix multiplication)
|
||||
LOG_OUT_BLOCK = 4
|
||||
# Log of uop buffer size in Bytes
|
||||
LOG_UOP_BUFF_SIZE = 15
|
||||
# Log of inp buffer size in Bytes
|
||||
LOG_INP_BUFF_SIZE = 15
|
||||
# Log of wgt buffer size in Bytes
|
||||
LOG_WGT_BUFF_SIZE = 15
|
||||
# Log of acc buffer size in Bytes
|
||||
LOG_ACC_BUFF_SIZE = 17
|
||||
# Log of out buffer size in Bytes
|
||||
LOG_OUT_BUFF_SIZE = $(shell echo "$$(( $(LOG_ACC_BUFF_SIZE)+$(LOG_OUT_WIDTH)-$(LOG_ACC_WIDTH) ))" )
|
||||
|
||||
# Define flags
|
||||
CFLAGS += -I $(INCLUDE_DIR) -DNO_SIM \
|
||||
-DDEBUG=0 -DLOG_WGT_WIDTH=$(LOG_WGT_WIDTH) -DLOG_INP_WIDTH=$(LOG_INP_WIDTH) \
|
||||
-DLOG_ACC_WIDTH=$(LOG_ACC_WIDTH) -DLOG_OUT_WIDTH=$(LOG_OUT_WIDTH) \
|
||||
-DLOG_BATCH=$(LOG_BATCH) -DLOG_BLOCK_IN=$(LOG_IN_BLOCK) -DLOG_BLOCK_OUT=$(LOG_OUT_BLOCK) \
|
||||
-DLOG_UOP_BUFF_SIZE=$(LOG_UOP_BUFF_SIZE) -DLOG_INP_BUFF_SIZE=$(LOG_INP_BUFF_SIZE) \
|
||||
-DLOG_WGT_BUFF_SIZE=$(LOG_WGT_BUFF_SIZE) -DLOG_ACC_BUFF_SIZE=$(LOG_ACC_BUFF_SIZE) \
|
||||
-DLOG_OUT_BUFF_SIZE=$(LOG_OUT_BUFF_SIZE)
|
||||
|
||||
# All Target
|
||||
all: $(EXECUTABLE)
|
||||
|
||||
%.o: %.cc $(SOURCES)
|
||||
$(CC) -c -o $@ $< $(CFLAGS)
|
||||
|
||||
$(EXECUTABLE): $(OBJECTS)
|
||||
$(CC) $(LDFLAGS) $(OBJECTS) -o $@ $(LIBS)
|
||||
|
||||
clean:
|
||||
rm -rf *.o $(EXECUTABLE)
|
|
@ -4,7 +4,7 @@
|
|||
* \brief Test library for the VTA design simulation and driver tests.
|
||||
*/
|
||||
|
||||
#include "vta_test_lib.h"
|
||||
#include "./test_lib.h"
|
||||
|
||||
const char* getOpcodeString(int opcode, bool use_imm) {
|
||||
// Returns string name
|
||||
|
@ -153,7 +153,7 @@ void free3dArray(T *** array, int rows, int cols, int depth) {
|
|||
|
||||
void * allocBuffer(size_t num_bytes) {
|
||||
#ifdef NO_SIM
|
||||
return cma_alloc(num_bytes, CACHED);
|
||||
return VTAMemAlloc(num_bytes, CACHED);
|
||||
#else
|
||||
return malloc(num_bytes);
|
||||
#endif
|
||||
|
@ -161,7 +161,7 @@ void * allocBuffer(size_t num_bytes) {
|
|||
|
||||
void freeBuffer(void * buffer) {
|
||||
#ifdef NO_SIM
|
||||
return cma_free(buffer);
|
||||
return VTAMemFree(buffer);
|
||||
#else
|
||||
return free(buffer);
|
||||
#endif
|
||||
|
@ -353,7 +353,7 @@ VTAUop * getCopyUops(int y_size, int x_size, int uop_compression) {
|
|||
|
||||
// Allocate buffer
|
||||
#ifdef NO_SIM
|
||||
VTAUop *uop_buf = (VTAUop *) cma_alloc(sizeof(VTAUop) * uop_size, CACHED);
|
||||
VTAUop *uop_buf = (VTAUop *) VTAMemAlloc(sizeof(VTAUop) * uop_size, CACHED);
|
||||
#else
|
||||
VTAUop *uop_buf = (VTAUop *) malloc(sizeof(VTAUop) * uop_size);
|
||||
#endif
|
||||
|
@ -388,7 +388,7 @@ VTAUop * getGEMMUops(int batch, int in_feat, int out_feat, bool uop_compression,
|
|||
|
||||
// Allocate buffer
|
||||
#ifdef NO_SIM
|
||||
VTAUop *uop_buf = (VTAUop *) cma_alloc(sizeof(VTAUop) * uop_size, CACHED);
|
||||
VTAUop *uop_buf = (VTAUop *) VTAMemAlloc(sizeof(VTAUop) * uop_size, CACHED);
|
||||
#else
|
||||
VTAUop *uop_buf = (VTAUop *) malloc(sizeof(VTAUop) * uop_size);
|
||||
#endif
|
||||
|
@ -449,7 +449,7 @@ VTAUop * getMapALUUops(int vector_size, bool uop_compression) {
|
|||
|
||||
// Allocate buffer
|
||||
#ifdef NO_SIM
|
||||
VTAUop *uop_buf = (VTAUop *) cma_alloc(sizeof(VTAUop) * uop_size, CACHED);
|
||||
VTAUop *uop_buf = (VTAUop *) VTAMemAlloc(sizeof(VTAUop) * uop_size, CACHED);
|
||||
#else
|
||||
VTAUop *uop_buf = (VTAUop *) malloc(sizeof(VTAUop) * uop_size);
|
||||
#endif
|
||||
|
@ -762,7 +762,7 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
|
|||
}
|
||||
|
||||
// Compute reference output
|
||||
inp_T **outputs_ref = alloc2dArray<inp_T>(batch, vector_size);
|
||||
out_T **outputs_ref = alloc2dArray<out_T>(batch, vector_size);
|
||||
for (int i = 0; i < batch; i ++) {
|
||||
for (int j = 0; j < vector_size; j ++) {
|
||||
acc_T tmp = 0;
|
||||
|
@ -802,7 +802,7 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
|
|||
tmp = inputs[i][j] >> immediate[i / BATCH];
|
||||
}
|
||||
// Set
|
||||
outputs_ref[i][j] = (inp_T) tmp;
|
||||
outputs_ref[i][j] = (out_T) tmp;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -811,7 +811,7 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
|
|||
packBuffer<acc_T, ACC_WIDTH>(bias_buf, inputs, batch, vector_size * input_sets, BATCH, BLOCK_OUT);
|
||||
|
||||
// Prepare output buffer
|
||||
inp_T *output_buf = (inp_T *) allocBuffer(INP_ELEM_BYTES * batch * tx_size * input_sets);
|
||||
out_T *output_buf = (out_T *) allocBuffer(INP_ELEM_BYTES * batch * tx_size * input_sets);
|
||||
|
||||
#ifdef NO_SIM
|
||||
// Invoke the VTA
|
||||
|
@ -833,8 +833,8 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
|
|||
#endif
|
||||
|
||||
// Unpack output buffer
|
||||
inp_T **outputs = alloc2dArray<inp_T>(batch, vector_size);
|
||||
unpackBuffer<inp_T, INP_WIDTH>(outputs, output_buf, batch, vector_size, BATCH, BLOCK_OUT);
|
||||
out_T **outputs = alloc2dArray<out_T>(batch, vector_size);
|
||||
unpackBuffer<out_T, OUT_WIDTH>(outputs, output_buf, batch, vector_size, BATCH, BLOCK_OUT);
|
||||
|
||||
// Correctness checks
|
||||
int err = 0;
|
||||
|
@ -853,8 +853,8 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
|
|||
// Free all allocated arrays
|
||||
free(immediate);
|
||||
free2dArray<acc_T>(inputs, batch, vector_size * input_sets);
|
||||
free2dArray<inp_T>(outputs_ref, batch, vector_size);
|
||||
free2dArray<inp_T>(outputs, batch, vector_size);
|
||||
free2dArray<out_T>(outputs_ref, batch, vector_size);
|
||||
free2dArray<out_T>(outputs, batch, vector_size);
|
||||
freeBuffer(insn_buf);
|
||||
freeBuffer(uop_buf);
|
||||
freeBuffer(bias_buf);
|
||||
|
@ -891,17 +891,17 @@ virtual_threads=%d\n",
|
|||
int ins_size = batch / block * out_feat / block * (2 + in_feat / block * 3) + 2;
|
||||
int uop_size = uop_compression ? block / BATCH * virtual_threads :
|
||||
block / BATCH * block / BLOCK_IN * block / BLOCK_OUT * virtual_threads;
|
||||
int wgt_size = in_feat / BLOCK_IN * out_feat / BLOCK_OUT;
|
||||
int inp_size = batch / BATCH * in_feat / BLOCK_IN;
|
||||
int wgt_size = in_feat / BLOCK_IN * out_feat / BLOCK_OUT;
|
||||
int out_size = batch / BATCH * out_feat / BLOCK_OUT;
|
||||
// Blocked buffer sizes (in terms of elements)
|
||||
int wgt_block_size = block / BLOCK_IN * block / BLOCK_OUT;
|
||||
int inp_block_size = block / BATCH * block / BLOCK_IN;
|
||||
int wgt_block_size = block / BLOCK_IN * block / BLOCK_OUT;
|
||||
int out_block_size = block / BATCH * block / BLOCK_OUT;
|
||||
// Make sure we don't exceed buffer bounds
|
||||
assert(uop_size <= UOP_BUFF_DEPTH);
|
||||
assert(wgt_block_size <= WGT_BUFF_DEPTH);
|
||||
assert(inp_block_size <= INP_BUFF_DEPTH);
|
||||
assert(wgt_block_size <= WGT_BUFF_DEPTH);
|
||||
assert(out_block_size <= ACC_BUFF_DEPTH);
|
||||
|
||||
// Initialize instruction buffer
|
||||
|
@ -1017,15 +1017,15 @@ virtual_threads=%d\n",
|
|||
printMicroOp(uop_size, uop_buf);
|
||||
#endif
|
||||
|
||||
// Initialize weights
|
||||
wgt_T **weights = allocInit2dArray<wgt_T, WGT_WIDTH>(out_feat, in_feat);
|
||||
// Initialize inputs
|
||||
inp_T **inputs = allocInit2dArray<inp_T, INP_WIDTH>(batch, in_feat);
|
||||
// Initialize weights
|
||||
wgt_T **weights = allocInit2dArray<wgt_T, WGT_WIDTH>(out_feat, in_feat);
|
||||
// Initialize biases
|
||||
acc_T **biases = allocInit2dArray<acc_T, ACC_WIDTH>(batch, out_feat);
|
||||
|
||||
// Reference GEMM implementation
|
||||
inp_T **outputs_ref = alloc2dArray<inp_T>(batch, out_feat);
|
||||
out_T **outputs_ref = alloc2dArray<out_T>(batch, out_feat);
|
||||
for (int i = 0; i < batch; i ++) {
|
||||
for (int j = 0; j < out_feat; j ++) {
|
||||
acc_T sum = biases[i][j];
|
||||
|
@ -1033,21 +1033,21 @@ virtual_threads=%d\n",
|
|||
sum += (acc_T) (inputs[i][k] * weights[j][k]);
|
||||
}
|
||||
// Set
|
||||
outputs_ref[i][j] = (inp_T) sum;
|
||||
outputs_ref[i][j] = (out_T) sum;
|
||||
}
|
||||
}
|
||||
|
||||
// Prepare the weight buffer
|
||||
wgt_T *weight_buf = (wgt_T *) allocBuffer(WGT_ELEM_BYTES * wgt_size);
|
||||
packBuffer<wgt_T, WGT_WIDTH>(weight_buf, weights, out_feat, in_feat, BLOCK_OUT, BLOCK_IN);
|
||||
// Prepare the input buffer
|
||||
inp_T *input_buf = (inp_T *) allocBuffer(INP_ELEM_BYTES * inp_size);
|
||||
packBuffer<inp_T, INP_WIDTH>(input_buf, inputs, batch, in_feat, BATCH, BLOCK_IN);
|
||||
// Prepare the weight buffer
|
||||
wgt_T *weight_buf = (wgt_T *) allocBuffer(WGT_ELEM_BYTES * wgt_size);
|
||||
packBuffer<wgt_T, WGT_WIDTH>(weight_buf, weights, out_feat, in_feat, BLOCK_OUT, BLOCK_IN);
|
||||
// Prepare the bias buffer
|
||||
acc_T *bias_buf = (acc_T *) allocBuffer(ACC_ELEM_BYTES * out_size);
|
||||
packBuffer<acc_T, ACC_WIDTH>(bias_buf, biases, batch, out_feat, BATCH, BLOCK_OUT);
|
||||
// Prepare the output buffer
|
||||
inp_T *output_buf = (inp_T *) allocBuffer(INP_ELEM_BYTES * out_size);
|
||||
out_T *output_buf = (out_T *) allocBuffer(INP_ELEM_BYTES * out_size);
|
||||
|
||||
#ifdef NO_SIM
|
||||
// Invoke the VTA
|
||||
|
@ -1069,8 +1069,8 @@ virtual_threads=%d\n",
|
|||
#endif
|
||||
|
||||
// Unpack output data
|
||||
inp_T **outputs = alloc2dArray<inp_T>(batch, out_feat);
|
||||
unpackBuffer<inp_T, INP_WIDTH>(outputs, output_buf, batch, out_feat, BATCH, BLOCK_OUT);
|
||||
out_T **outputs = alloc2dArray<out_T>(batch, out_feat);
|
||||
unpackBuffer<out_T, OUT_WIDTH>(outputs, output_buf, batch, out_feat, BATCH, BLOCK_OUT);
|
||||
|
||||
// Correctness checks
|
||||
int err = 0;
|
||||
|
@ -1087,15 +1087,15 @@ virtual_threads=%d\n",
|
|||
}
|
||||
|
||||
// Free all allocated arrays
|
||||
free2dArray<wgt_T>(weights, out_feat, in_feat);
|
||||
free2dArray<inp_T>(inputs, batch, in_feat);
|
||||
free2dArray<wgt_T>(weights, out_feat, in_feat);
|
||||
free2dArray<acc_T>(biases, batch, out_feat);
|
||||
free2dArray<inp_T>(outputs_ref, batch, out_feat);
|
||||
free2dArray<inp_T>(outputs, batch, out_feat);
|
||||
free2dArray<out_T>(outputs_ref, batch, out_feat);
|
||||
free2dArray<out_T>(outputs, batch, out_feat);
|
||||
freeBuffer((void *) insn_buf);
|
||||
freeBuffer((void *) uop_buf);
|
||||
freeBuffer((void *) weight_buf);
|
||||
freeBuffer((void *) input_buf);
|
||||
freeBuffer((void *) weight_buf);
|
||||
freeBuffer((void *) bias_buf);
|
||||
freeBuffer((void *) output_buf);
|
||||
|
|
@ -7,21 +7,25 @@
|
|||
#ifndef VTA_TESTLIB_H_
|
||||
#define VTA_TESTLIB_H_
|
||||
|
||||
#include "vta_params.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <vta/hw_spec.h>
|
||||
|
||||
#ifdef NO_SIM
|
||||
|
||||
#include "vta_pynq_driver.h"
|
||||
#include <vta/driver.h>
|
||||
|
||||
#ifdef PYNQ_TARGET
|
||||
#include "../../../src/pynq/pynq_driver.h"
|
||||
#endif //PYNQ_TARGET
|
||||
|
||||
typedef uint64_t axi_T;
|
||||
typedef uint32_t uop_T;
|
||||
typedef int8_t wgt_T;
|
||||
typedef int8_t inp_T;
|
||||
typedef int8_t out_T;
|
||||
typedef int32_t acc_T;
|
||||
|
||||
uint64_t vta (
|
||||
|
@ -35,8 +39,7 @@ uint64_t vta (
|
|||
|
||||
#else //NO_SIM
|
||||
|
||||
#include "vta.h"
|
||||
#include "vta_typedefs.h"
|
||||
#include "../../../hardware/vivado/src/vta.h"
|
||||
|
||||
#endif //NO_SIM
|
||||
|
|
@ -0,0 +1,37 @@
|
|||
# Makefile for the bare-metal VTA driver test on the Pynq board.
# Builds the `vta` executable from the Pynq driver, the shared test library
# and metal_test.cc, using the hardware parameters from the top-level config.

# GNU make predefines CC (as `cc`), so `CC ?= g++` never takes effect and the
# C++ objects would be linked with the C driver. Replace only the built-in
# default; a CC set in the environment or on the command line still wins.
ifeq ($(origin CC),default)
CC = g++
endif
CFLAGS = -Wall -O3 -std=c++11 -I/usr/include
LDFLAGS = -L/usr/lib -L/home/xilinx/pynq/drivers
LIBS = -l:libsds_lib.so -l:libdma.so
INCLUDE_DIR = ../../../include
DRIVER_DIR = ../../../src/pynq
TESTLIB_DIR = ../common
# Let make find the driver and test-library sources outside this directory
VPATH = $(DRIVER_DIR):$(TESTLIB_DIR)
SOURCES = pynq_driver.cc test_lib.cc
OBJECTS = pynq_driver.o test_lib.o metal_test.o
EXECUTABLE = vta

# Include top-level config file: prefer a user-provided config.mk at the
# project root and fall back to the default in make/.
ifndef config
ifneq ("$(wildcard ../../../config.mk)", "")
  config = ../../../config.mk
else
  config = ../../../make/config.mk
endif
endif
include $(config)

# Define flags
CFLAGS += -I $(INCLUDE_DIR) -DNO_SIM -DDEBUG=0
# ADD_CFLAGS comes from the included config file
CFLAGS += $(ADD_CFLAGS)

# `all` and `clean` are commands, not files
.PHONY: all clean

# All Target
all: $(EXECUTABLE)

# Every object also lists $(SOURCES) as prerequisites, so a change to any
# listed source rebuilds all objects.
%.o: %.cc $(SOURCES)
	$(CC) -c -o $@ $< $(CFLAGS)

$(EXECUTABLE): $(OBJECTS)
	$(CC) $(LDFLAGS) $(OBJECTS) -o $@ $(LIBS)

clean:
	rm -rf *.o $(EXECUTABLE)
|
|
@ -9,8 +9,9 @@
|
|||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
#include "vta_test_lib.h"
|
||||
#include "vta_pynq_driver.h"
|
||||
#include <vta/driver.h>
|
||||
#include "../../../src/pynq/pynq_driver.h"
|
||||
#include "../common/test_lib.h"
|
||||
|
||||
// VTA invocation (present the same abstraction as in the simulation tests)
|
||||
uint64_t vta (
|
||||
|
@ -43,18 +44,18 @@ uint64_t vta (
|
|||
#endif
|
||||
|
||||
// Program VTA
|
||||
ProgramVTA(bitstream);
|
||||
VTAProgram(bitstream);
|
||||
// Get VTA handles
|
||||
VTAHandle vta_fetch_handle = MapRegister(VTA_FETCH_ADDR, VTA_RANGE);
|
||||
VTAHandle vta_load_handle = MapRegister(VTA_LOAD_ADDR, VTA_RANGE);
|
||||
VTAHandle vta_compute_handle = MapRegister(VTA_COMPUTE_ADDR, VTA_RANGE);
|
||||
VTAHandle vta_store_handle = MapRegister(VTA_STORE_ADDR, VTA_RANGE);
|
||||
VTAHandle vta_fetch_handle = VTAMapRegister(VTA_FETCH_ADDR, VTA_RANGE);
|
||||
VTAHandle vta_load_handle = VTAMapRegister(VTA_LOAD_ADDR, VTA_RANGE);
|
||||
VTAHandle vta_compute_handle = VTAMapRegister(VTA_COMPUTE_ADDR, VTA_RANGE);
|
||||
VTAHandle vta_store_handle = VTAMapRegister(VTA_STORE_ADDR, VTA_RANGE);
|
||||
|
||||
// Physical address pointers
|
||||
uint32_t insn_phy = insns ? cma_get_phy_addr(insns) : 0;
|
||||
uint32_t uop_phy = uops ? cma_get_phy_addr(uops) : 0;
|
||||
uint32_t weight_phy = weights ? cma_get_phy_addr(weights) : 0;
|
||||
uint32_t input_phy = inputs ? cma_get_phy_addr(inputs) : 0;
|
||||
uint32_t weight_phy = weights ? cma_get_phy_addr(weights) : 0;
|
||||
uint32_t bias_phy = biases ? cma_get_phy_addr(biases) : 0;
|
||||
uint32_t output_phy = outputs ? cma_get_phy_addr(outputs) : 0;
|
||||
|
||||
|
@ -65,29 +66,29 @@ uint64_t vta (
|
|||
clock_gettime(CLOCK_REALTIME, &start);
|
||||
|
||||
// FETCH @ 0x10 : Data signal of insn_count_V
|
||||
WriteMappedReg(vta_fetch_handle, 0x10, insn_count);
|
||||
VTAWriteMappedReg(vta_fetch_handle, 0x10, insn_count);
|
||||
// FETCH @ 0x18 : Data signal of insns_V
|
||||
if (insns) WriteMappedReg(vta_fetch_handle, 0x18, insn_phy);
|
||||
// LOAD @ 0x10 : Data signal of weight_V
|
||||
if (weights) WriteMappedReg(vta_load_handle, 0x10, weight_phy);
|
||||
// LOAD @ 0x18 : Data signal of inputs_V
|
||||
if (inputs) WriteMappedReg(vta_load_handle, 0x18, input_phy);
|
||||
if (insns) VTAWriteMappedReg(vta_fetch_handle, 0x18, insn_phy);
|
||||
// LOAD @ 0x10 : Data signal of inputs_V
|
||||
if (inputs) VTAWriteMappedReg(vta_load_handle, 0x10, input_phy);
|
||||
// LOAD @ 0x18 : Data signal of weight_V
|
||||
if (weights) VTAWriteMappedReg(vta_load_handle, 0x18, weight_phy);
|
||||
// COMPUTE @ 0x20 : Data signal of uops_V
|
||||
if (uops) WriteMappedReg(vta_compute_handle, 0x20, uop_phy);
|
||||
if (uops) VTAWriteMappedReg(vta_compute_handle, 0x20, uop_phy);
|
||||
// COMPUTE @ 0x28 : Data signal of biases_V
|
||||
if (biases) WriteMappedReg(vta_compute_handle, 0x28, bias_phy);
|
||||
if (biases) VTAWriteMappedReg(vta_compute_handle, 0x28, bias_phy);
|
||||
// STORE @ 0x10 : Data signal of outputs_V
|
||||
if (outputs) WriteMappedReg(vta_store_handle, 0x10, output_phy);
|
||||
if (outputs) VTAWriteMappedReg(vta_store_handle, 0x10, output_phy);
|
||||
|
||||
// VTA start
|
||||
WriteMappedReg(vta_fetch_handle, 0x0, 0x1);
|
||||
WriteMappedReg(vta_load_handle, 0x0, 0x81);
|
||||
WriteMappedReg(vta_compute_handle, 0x0, 0x81);
|
||||
WriteMappedReg(vta_store_handle, 0x0, 0x81);
|
||||
VTAWriteMappedReg(vta_fetch_handle, 0x0, 0x1);
|
||||
VTAWriteMappedReg(vta_load_handle, 0x0, 0x81);
|
||||
VTAWriteMappedReg(vta_compute_handle, 0x0, 0x81);
|
||||
VTAWriteMappedReg(vta_store_handle, 0x0, 0x81);
|
||||
|
||||
int flag = 0, t = 0;
|
||||
for (t = 0; t < 10000000; ++t) {
|
||||
flag = ReadMappedReg(vta_compute_handle, 0x18);
|
||||
flag = VTAReadMappedReg(vta_compute_handle, 0x18);
|
||||
if (flag & VTA_DONE) break;
|
||||
}
|
||||
|
||||
|
@ -104,10 +105,10 @@ uint64_t vta (
|
|||
t_fpga = 1000000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec);
|
||||
|
||||
// Unmap VTA register
|
||||
UnmapRegister(vta_fetch_handle, VTA_RANGE);
|
||||
UnmapRegister(vta_load_handle, VTA_RANGE);
|
||||
UnmapRegister(vta_compute_handle, VTA_RANGE);
|
||||
UnmapRegister(vta_store_handle, VTA_RANGE);
|
||||
VTAUnmapRegister(vta_fetch_handle, VTA_RANGE);
|
||||
VTAUnmapRegister(vta_load_handle, VTA_RANGE);
|
||||
VTAUnmapRegister(vta_compute_handle, VTA_RANGE);
|
||||
VTAUnmapRegister(vta_store_handle, VTA_RANGE);
|
||||
|
||||
return t_fpga;
|
||||
};
|
Загрузка…
Ссылка в новой задаче