From 28a10b690429434da0c827e7995f689985c0aa78 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Sun, 18 Mar 2018 00:21:54 -0700 Subject: [PATCH] [REFACTOR] Code base refactoring (#5) --- vta/Makefile | 12 +- vta/apps/pynq_rpc/start_rpc_server.sh | 4 + vta/hardware/vivado/Makefile | 71 +- vta/hardware/vivado/scripts/hls.tcl | 4 +- vta/hardware/vivado/sim/vta_test.cc | 4 +- .../hls => hardware/vivado/src}/vta.cc | 2 +- .../hls => hardware/vivado/src}/vta.h | 84 +- vta/include/hardware/hls/vta_typedefs.h | 97 -- vta/include/vta/driver.h | 100 ++ vta/include/{vta_params.h => vta/hw_spec.h} | 14 +- vta/include/vta/runtime.h | 274 ++++ vta/include/vta_pynq_driver.h | 152 -- vta/make/config.mk | 69 + .../vta_pynq_driver.c => pynq/pynq_driver.cc} | 40 +- vta/src/pynq/pynq_driver.h | 83 + vta/src/runtime.cc | 1410 +++++++++++++++++ vta/src/tvm/vta_device_api.cc | 106 ++ vta/tests/driver/Makefile | 59 - .../hardware/common/test_lib.cc} | 60 +- .../hardware/common/test_lib.h} | 13 +- vta/tests/hardware/pynq/Makefile | 37 + .../pynq/metal_test.cc} | 53 +- 22 files changed, 2302 insertions(+), 446 deletions(-) create mode 100755 vta/apps/pynq_rpc/start_rpc_server.sh rename vta/{src/hardware/hls => hardware/vivado/src}/vta.cc (99%) rename vta/{include/hardware/hls => hardware/vivado/src}/vta.h (65%) delete mode 100644 vta/include/hardware/hls/vta_typedefs.h create mode 100644 vta/include/vta/driver.h rename vta/include/{vta_params.h => vta/hw_spec.h} (99%) create mode 100644 vta/include/vta/runtime.h delete mode 100644 vta/include/vta_pynq_driver.h rename vta/src/{driver/pynq/vta_pynq_driver.c => pynq/pynq_driver.cc} (67%) create mode 100644 vta/src/pynq/pynq_driver.h create mode 100644 vta/src/runtime.cc create mode 100644 vta/src/tvm/vta_device_api.cc delete mode 100644 vta/tests/driver/Makefile rename vta/{src/test/vta_test_lib.cc => tests/hardware/common/test_lib.cc} (97%) rename vta/{include/vta_test_lib.h => tests/hardware/common/test_lib.h} (97%) create mode 100644 
vta/tests/hardware/pynq/Makefile rename vta/tests/{driver/driver_test.cc => hardware/pynq/metal_test.cc} (71%) diff --git a/vta/Makefile b/vta/Makefile index 20414cde..6007ed2e 100644 --- a/vta/Makefile +++ b/vta/Makefile @@ -54,9 +54,13 @@ endif all: lib/libvta.$(SHARED_LIBRARY_SUFFIX) -SRC = $(wildcard src/*.cc src/*.cc) -ALL_OBJ = $(patsubst %.cc, build/%.o, $(SRC)) -ALL_DEP = $(ALL_OBJ) +VTA_LIB_SRC = $(wildcard src/*.cc src/tvm/*.cc) +ifeq ($(TARGET), PYNQ_TARGET) + VTA_LIB_SRC += $(wildcard src/pynq/*.cc) + LDFLAGS += -L/usr/lib -lsds_lib + LDFLAGS += -L/opt/python3.6/lib/python3.6/site-packages/pynq/drivers/ -l:libdma.so +endif +VTA_LIB_OBJ = $(patsubst %.cc, build/%.o, $(VTA_LIB_SRC)) test: $(TEST) @@ -65,7 +69,7 @@ build/src/%.o: src/%.cc $(CXX) $(CFLAGS) -MM -MT build/src/$*.o $< >build/src/$*.d $(CXX) -c $(CFLAGS) -c $< -o $@ -lib/libvta.$(SHARED_LIBRARY_SUFFIX): $(ALL_DEP) +lib/libvta.$(SHARED_LIBRARY_SUFFIX): $(VTA_LIB_OBJ) @mkdir -p $(@D) $(CXX) $(CFLAGS) -shared -o $@ $(filter %.o, $^) $(LDFLAGS) diff --git a/vta/apps/pynq_rpc/start_rpc_server.sh b/vta/apps/pynq_rpc/start_rpc_server.sh new file mode 100755 index 00000000..d5a1202a --- /dev/null +++ b/vta/apps/pynq_rpc/start_rpc_server.sh @@ -0,0 +1,4 @@ +#!/bin/bash +export PYTHONPATH=${PYTHONPATH}:/home/xilinx/tvm/python +export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/opt/python3.6/lib/python3.6/site-packages/pynq/drivers/ +python -m tvm.exec.rpc_server --load-library /home/xilinx/vta/lib/libvta.so diff --git a/vta/hardware/vivado/Makefile b/vta/hardware/vivado/Makefile index b52fb628..dfcb0631 100644 --- a/vta/hardware/vivado/Makefile +++ b/vta/hardware/vivado/Makefile @@ -2,9 +2,9 @@ ROOTDIR = $(CURDIR) BUILD_DIR = $(ROOTDIR)/build SCRIPT_DIR = $(ROOTDIR)/scripts -SRC_DIR = $(ROOTDIR)/../../src/hardware/hls +SRC_DIR = $(ROOTDIR)/src SIM_DIR = $(ROOTDIR)/sim -TEST_DIR = $(ROOTDIR)/../../src/test +TEST_DIR = $(ROOTDIR)/../../tests/hardware/common INCLUDE_DIR = $(ROOTDIR)/../../include # Executables @@ 
-12,59 +12,28 @@ VIVADO_HLS = vivado_hls VIVADO = vivado HSI = hsi -# Build parameters: +# Include top-level config file +ifndef config +ifneq ("$(wildcard ../../config.mk)", "") + config = ../../config.mk +else + config = ../../make/config.mk +endif +endif +include $(config) + +#--------------------- +# Compilation parameters +#-------------------- + # Number of threads during compilation NUM_THREADS = 8 + # Target Frequency CLOCK_FREQ = 100 -# Log of input width in bits -LOG_INP_WIDTH = 3 -# Log of weight width in bits -LOG_WGT_WIDTH = 3 -# Log of accum width in bits -LOG_ACC_WIDTH = 5 -# Log of output width in bits -LOG_OUT_WIDTH = $(LOG_INP_WIDTH) -# Log of tensor batch size (A in (A,B)x(B,C) matrix multiplication) -LOG_BATCH = 0 -# Log of tensor inner block size (B in (A,B)x(B,C) matrix multiplication) -LOG_IN_BLOCK = 4 -# Log of tensor outer block size (C in (A,B)x(B,C) matrix multiplication) -LOG_OUT_BLOCK = 4 -# Log of uop buffer size in Bytes -LOG_UOP_BUFF_SIZE = 15 -# Log of inp buffer size in Bytes -LOG_INP_BUFF_SIZE = 15 -# Log of wgt buffer size in Bytes -LOG_WGT_BUFF_SIZE = 15 -# Log of acc buffer size in Bytes -LOG_ACC_BUFF_SIZE = 17 -# Log of out buffer size in Bytes -LOG_OUT_BUFF_SIZE = $(shell echo "$$(( $(LOG_ACC_BUFF_SIZE)+$(LOG_OUT_WIDTH)-$(LOG_ACC_WIDTH) ))" ) -# Derived parameter -# Input width in bits -INP_WIDTH = $(shell echo "$$(( 1 << $(LOG_INP_WIDTH) ))" ) -# Weight width in bits -WGT_WIDTH = $(shell echo "$$(( 1 << $(LOG_WGT_WIDTH) ))" ) -# Output width in bits -OUT_WIDTH = $(shell echo "$$(( 1 << $(LOG_OUT_WIDTH) ))" ) -# Tensor batch size -BATCH = $(shell echo "$$(( 1 << $(LOG_BATCH) ))" ) -# Tensor outer block size -IN_BLOCK = $(shell echo "$$(( 1 << $(LOG_IN_BLOCK) ))" ) -# Tensor inner block size -OUT_BLOCK = $(shell echo "$$(( 1 << $(LOG_OUT_BLOCK) ))" ) -# Uop buffer size in Bytes -UOP_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_UOP_BUFF_SIZE) ))" ) -# Inp buffer size in Bytes -INP_BUFF_SIZE = $(shell echo "$$(( 1 << 
$(LOG_INP_BUFF_SIZE) ))" ) -# Wgt buffer size in Bytes -WGT_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_WGT_BUFF_SIZE) ))" ) -# Acc buffer size in Bytes -ACC_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_ACC_BUFF_SIZE) ))" ) -# Out buffer size in Bytes -OUT_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_OUT_BUFF_SIZE) ))" ) +# Timing closure compensation (0 for none, 3 for highest) +TIMING_CLOSURE_COMP = 0 # Derive clock target period TARGET_PER = $(shell echo "$$(( (1000 + $(CLOCK_FREQ) - 1) / $(CLOCK_FREQ) - 0))" ) @@ -85,7 +54,7 @@ ip: $(VIVADO_HLS) -f $(SCRIPT_DIR)/hls.tcl \ -tclargs $(SRC_DIR) $(SIM_DIR) $(TEST_DIR) $(INCLUDE_DIR) $(TARGET_PER) \ $(LOG_INP_WIDTH) $(LOG_WGT_WIDTH) $(LOG_ACC_WIDTH) $(LOG_OUT_WIDTH) \ - $(LOG_BATCH) $(LOG_OUT_BLOCK) $(LOG_IN_BLOCK) \ + $(LOG_BATCH) $(LOG_BLOCK_OUT) $(LOG_BLOCK_IN) \ $(LOG_UOP_BUFF_SIZE) $(LOG_INP_BUFF_SIZE) $(LOG_WGT_BUFF_SIZE) \ $(LOG_ACC_BUFF_SIZE) $(LOG_OUT_BUFF_SIZE) diff --git a/vta/hardware/vivado/scripts/hls.tcl b/vta/hardware/vivado/scripts/hls.tcl index 55b3c01d..220c8f3b 100644 --- a/vta/hardware/vivado/scripts/hls.tcl +++ b/vta/hardware/vivado/scripts/hls.tcl @@ -62,7 +62,7 @@ if { [llength $argv] eq 19 } { } # C define flags to pass to compiler -set cflags "-I $include_dir -I $include_dir/hardware/hls \ +set cflags "-I $include_dir -I $src_dir -I $test_dir \ -DDEBUG=0 -DLOG_WGT_WIDTH=$wgt_width -DLOG_INP_WIDTH=$inp_width \ -DLOG_ACC_WIDTH=$acc_width -DLOG_OUT_WIDTH=$out_width \ -DLOG_BATCH=$batch -DLOG_BLOCK_OUT=$block_out -DLOG_BLOCK_IN=$block_in \ @@ -127,7 +127,7 @@ open_project vta_sim set_top vta add_files $src_dir/vta.cc -cflags $cflags add_files -tb $sim_dir/vta_test.cc -cflags $cflags -add_files -tb $test_dir/vta_test_lib.cc -cflags $cflags +add_files -tb $test_dir/test_lib.cc -cflags $cflags open_solution "solution0" init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out csim_design -clean diff --git a/vta/hardware/vivado/sim/vta_test.cc 
b/vta/hardware/vivado/sim/vta_test.cc index 858926aa..2031186f 100644 --- a/vta/hardware/vivado/sim/vta_test.cc +++ b/vta/hardware/vivado/sim/vta_test.cc @@ -8,8 +8,8 @@ #include #include -#include "vta.h" -#include "vta_test_lib.h" +#include "../src/vta.h" +#include "../../../tests/hardware/common/test_lib.h" int main(void) { diff --git a/vta/src/hardware/hls/vta.cc b/vta/hardware/vivado/src/vta.cc similarity index 99% rename from vta/src/hardware/hls/vta.cc rename to vta/hardware/vivado/src/vta.cc index 206fb5fe..7a0f9ded 100644 --- a/vta/src/hardware/hls/vta.cc +++ b/vta/hardware/vivado/src/vta.cc @@ -8,7 +8,7 @@ #include #include -#include "vta.h" +#include "./vta.h" void fetch ( uint32_t insn_count, diff --git a/vta/include/hardware/hls/vta.h b/vta/hardware/vivado/src/vta.h similarity index 65% rename from vta/include/hardware/hls/vta.h rename to vta/hardware/vivado/src/vta.h index b959d8a2..5dd4d953 100644 --- a/vta/include/hardware/hls/vta.h +++ b/vta/hardware/vivado/src/vta.h @@ -11,8 +11,88 @@ #include #include -#include "vta_typedefs.h" -#include "vta_params.h" +#include + +/* \typedef uop_T Micro-op datatype*/ +typedef ap_uint uop_T; + +/* \typedef inp_T Input datatype*/ +typedef ap_int inp_T; + +/* \typedef wgt_T Weight datatype*/ +typedef ap_int wgt_T; + +/* \typedef out_T Output datatype*/ +typedef ap_int out_T; + +/* \typedef acc_T Accumulator datatype*/ +typedef ap_int acc_T; + +/* \typedef mul_T Multiplier output datatype*/ +typedef ap_int mul_T; + +/* \typedef sum_T GEMM accumulator datatype*/ +typedef ap_int sum_T; + +/* \typedef inp_vec_T Input vector datatype*/ +typedef ap_uint inp_vec_T; + +/* \typedef wgt_vec_T Weight vector datatype*/ +typedef ap_uint wgt_vec_T; + +/* \typedef acc_vec_T Accumulator vector datatype*/ +typedef ap_uint acc_vec_T; + +/* \typedef out_vec_T Output vector datatype*/ +typedef ap_uint out_vec_T; + +/* \typedef uop_idx_T Micro-op SRAM index datatype*/ +typedef ap_uint uop_idx_T; + +/* \typedef inp_idx_T Input SRAM 
index datatype*/ +typedef ap_uint inp_idx_T; + +/* \typedef wgt_idx_T Weight SRAM index datatype*/ +typedef ap_uint wgt_idx_T; + +/* \typedef acc_idx_T Accumulator SRAM index datatype*/ +typedef ap_uint acc_idx_T; + +/* \typedef opcode_T Opcode datatype*/ +typedef ap_uint opcode_T; + +/* \typedef insn_T Instruction datatype*/ +typedef ap_uint insn_T; + +/* \typedef loop_T Loop bound datatype*/ +typedef ap_uint loop_T; + +/* \typedef memop_id_T Memory operation ID datatype*/ +typedef ap_uint memop_id_T; + +/* \typedef memop_sram_T Memory operation SRAM index datatype*/ +typedef ap_uint memop_sram_T; + +/* \typedef memop_dram_T Memory operation DRAM index datatype*/ +typedef ap_uint memop_dram_T; + +/* \typedef memop_size_T Memory operation range datatype*/ +typedef ap_uint memop_size_T; + +/* \typedef memop_stride_T Memory operation stride datatype*/ +typedef ap_uint memop_stride_T; + +/* \typedef memop_pad_T Memory operation pad width datatype*/ +typedef ap_uint memop_pad_T; + +/* \typedef aluop_opcode_T ALU operation opcode datatype*/ +typedef ap_uint aluop_opcode_T; + +/* \typedef aluop_opcode_T ALU operation immediate datatype*/ +typedef ap_int aluop_imm_T; + +/* \typedef aluop_opcode_T ALU operation shift immediate datatype*/ +typedef ap_uint aluop_sh_imm_T; /*! * \brief Fetch module. diff --git a/vta/include/hardware/hls/vta_typedefs.h b/vta/include/hardware/hls/vta_typedefs.h deleted file mode 100644 index b2e90e23..00000000 --- a/vta/include/hardware/hls/vta_typedefs.h +++ /dev/null @@ -1,97 +0,0 @@ -/*! - * Copyright (c) 2018 by Contributors - * \file vta_typedefs.h - * \brief Type definitions for VTA HLS design. 
- */ -#ifndef VTA_TYPEDEFS_H_ -#define VTA_TYPEDEFS_H_ - -#include -#include -#include -#include - -#include "vta_params.h" - -/* \typedef uop_T Micro-op datatype*/ -typedef ap_uint uop_T; - -/* \typedef inp_T Input datatype*/ -typedef ap_int inp_T; - -/* \typedef wgt_T Weight datatype*/ -typedef ap_int wgt_T; - -/* \typedef out_T Output datatype*/ -typedef ap_int out_T; - -/* \typedef acc_T Accumulator datatype*/ -typedef ap_int acc_T; - -/* \typedef mul_T Multiplier output datatype*/ -typedef ap_int mul_T; - -/* \typedef sum_T GEMM accumulator datatype*/ -typedef ap_int sum_T; - -/* \typedef inp_vec_T Input vector datatype*/ -typedef ap_uint inp_vec_T; - -/* \typedef wgt_vec_T Weight vector datatype*/ -typedef ap_uint wgt_vec_T; - -/* \typedef acc_vec_T Accumulator vector datatype*/ -typedef ap_uint acc_vec_T; - -/* \typedef out_vec_T Output vector datatype*/ -typedef ap_uint out_vec_T; - -/* \typedef uop_idx_T Micro-op SRAM index datatype*/ -typedef ap_uint uop_idx_T; - -/* \typedef inp_idx_T Input SRAM index datatype*/ -typedef ap_uint inp_idx_T; - -/* \typedef wgt_idx_T Weight SRAM index datatype*/ -typedef ap_uint wgt_idx_T; - -/* \typedef acc_idx_T Accumulator SRAM index datatype*/ -typedef ap_uint acc_idx_T; - -/* \typedef opcode_T Opcode datatype*/ -typedef ap_uint opcode_T; - -/* \typedef insn_T Instruction datatype*/ -typedef ap_uint insn_T; - -/* \typedef loop_T Loop bound datatype*/ -typedef ap_uint loop_T; - -/* \typedef memop_id_T Memory operation ID datatype*/ -typedef ap_uint memop_id_T; - -/* \typedef memop_sram_T Memory operation SRAM index datatype*/ -typedef ap_uint memop_sram_T; - -/* \typedef memop_dram_T Memory operation DRAM index datatype*/ -typedef ap_uint memop_dram_T; - -/* \typedef memop_size_T Memory operation range datatype*/ -typedef ap_uint memop_size_T; - -/* \typedef memop_stride_T Memory operation stride datatype*/ -typedef ap_uint memop_stride_T; - -/* \typedef memop_pad_T Memory operation pad width datatype*/ -typedef ap_uint 
memop_pad_T; - -/* \typedef aluop_opcode_T ALU operation opcode datatype*/ -typedef ap_uint aluop_opcode_T; - -/* \typedef aluop_opcode_T ALU operation immediate datatype*/ -typedef ap_int aluop_imm_T; - -/* \typedef aluop_opcode_T ALU operation shift immediate datatype*/ -typedef ap_uint aluop_sh_imm_T; - -#endif // VTA_TYPEDEFS_H_ diff --git a/vta/include/vta/driver.h b/vta/include/vta/driver.h new file mode 100644 index 00000000..2b5e0ea9 --- /dev/null +++ b/vta/include/vta/driver.h @@ -0,0 +1,100 @@ +/*! + * Copyright (c) 2018 by Contributors + * \file vta_driver.h + * \brief General driver interface. + */ + +#ifndef VTA_DRIVER_H_ +#define VTA_DRIVER_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +/*! \brief Memory management constants with libxlnk_cma */ +#define CACHED 1 +/*! \brief Memory management constants with libxlnk_cma */ +#define NOT_CACHED 0 + +/*! \brief VTA command handle */ +typedef void * VTAHandle; + +/*! + * \brief Allocates physically contiguous region in memory (limited by MAX_XFER). + * \param size Size of the region in Bytes. + * \param cached Region can be set to not cached (write-back) if set to 0. + * \return A pointer to the allocated region. + */ +void* VTAMemAlloc(size_t size, int cached); + +/*! + * \brief Frees a physically contiguous region in memory. + * \param buf Buffer to free. + */ +void VTAMemFree(void* buf); + +/*! + * \brief Returns a physical address to the region of memory allocated with VTAMemAlloc. + * \param buf Pointer to memory region allocated with VTAMemAlloc. + * \return The physical address of the memory region. + */ +uint32_t VTAGetMemPhysAddr(void* buf); + +/*! + * \brief Flushes the region of memory out of the CPU cache to DRAM. + * \param buf Pointer to memory region allocated with VTAMemAlloc to be flushed. + * \param size Size of the region to flush in Bytes. + */ +void VTAFlushCache(void* buf, int size); + +/*! + * \brief Invalidates the region of memory that is cached. 
+ * \param buf Pointer to memory region allocated with VTAMemAlloc to be invalidated. + * \param size Size of the region to invalidate in Bytes. + */ +void VTAInvalidateCache(void* buf, int size); + +/*! + * \brief Returns a memory map to FPGA configuration registers. + * \param addr The base physical address of the configuration registers. + * \param length The size of the memory mapped region in bytes. + * \return A pointer to the memory mapped region. + */ +void *VTAMapRegister(unsigned addr, size_t length); + +/*! + * \brief Deletes the configuration register memory map. + * \param vta The memory mapped region. + * \param length The size of the memory mapped region in bytes. + */ +void VTAUnmapRegister(void *vta, size_t length); + +/*! + * \brief Writes to a memory mapped configuration register. + * \param vta_base The handle to the memory mapped configuration registers. + * \param offset The offset of the register to write to. + * \param val The value to be written to the memory mapped register. + */ +void VTAWriteMappedReg(VTAHandle vta_base, unsigned offset, unsigned val); + +/*! + * \brief Reads from the memory mapped configuration register. + * \param vta_base The handle to the memory mapped configuration registers. + * \param offset The offset of the register to read from. + * \return The value read from the memory mapped register. + */ +unsigned VTAReadMappedReg(VTAHandle vta_base, unsigned offset); + +/*! + * \brief Programming the bit stream on the FPGA. + * \param bitstream The path to the bit stream file. 
+ */ +void VTAProgram(const char* bitstream); + +#ifdef __cplusplus +} +#endif +#endif // VTA_DRIVER_H_ diff --git a/vta/include/vta_params.h b/vta/include/vta/hw_spec.h similarity index 99% rename from vta/include/vta_params.h rename to vta/include/vta/hw_spec.h index 748e77d9..b18e94e6 100644 --- a/vta/include/vta_params.h +++ b/vta/include/vta/hw_spec.h @@ -3,8 +3,13 @@ * \file vta_defines.h * \brief Preprocessor definitions for VTA HLS design and runtime. */ -#ifndef VTA_DEFINES_H_ -#define VTA_DEFINES_H_ + +#ifndef VTA_HW_SPEC_H_ +#define VTA_HW_SPEC_H_ + +#ifdef __cplusplus +extern "C" { +#endif #include @@ -556,4 +561,7 @@ typedef struct { uint32_t wgt_idx : LOG_WGT_BUFF_DEPTH; } VTAUop; -#endif // VTA_DEFINES_H_ +#ifdef __cplusplus +} +#endif +#endif // VTA_HW_SPEC_H_ diff --git a/vta/include/vta/runtime.h b/vta/include/vta/runtime.h new file mode 100644 index 00000000..e1aae32f --- /dev/null +++ b/vta/include/vta/runtime.h @@ -0,0 +1,274 @@ +/*! + * Copyright (c) 2018 by Contributors + * \file runtime.h + * \brief VTA runtime library. + */ + +#ifndef VTA_RUNTIME_H_ +#define VTA_RUNTIME_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "./driver.h" + +#define VTA_MEMCPY_H2D 1 +#define VTA_MEMCPY_D2H 2 +#define VTA_MEMCPY_D2D 3 + +#define VTA_DEBUG_DUMP_INSN (1 << 1) +#define VTA_DEBUG_DUMP_UOP (1 << 2) +#define VTA_DEBUG_SKIP_READ_BARRIER (1 << 3) +#define VTA_DEBUG_SKIP_WRITE_BARRIER (1 << 4) +#define VTA_DEBUG_FORCE_SERIAL (1 << 5) + +/*! \brief VTA command handle */ +typedef void * VTACommandHandle; + +/*! \brief Shutdown hook of VTA to cleanup resources */ +void VTARuntimeShutdown(); + +/*! + * \brief Get thread local command handle. + * \return A thread local command handle. + */ +VTACommandHandle VTATLSCommandHandle(); + +/*! + * \brief Allocate data buffer. + * \param cmd The VTA command handle. + * \param size Buffer size. + * \return A pointer to the allocated buffer. + */ +void* VTABufferAlloc(VTACommandHandle cmd, size_t size); + +/*! 
+ * \brief Free data buffer. + * \param cmd The VTA command handle. + * \param buffer The data buffer to be freed. + */ +void VTABufferFree(VTACommandHandle cmd, void* buffer); + +/*! + * \brief Get the buffer access pointer on CPU. + * \param cmd The VTA command handle. + * \param buffer The data buffer. + * \return The pointer that can be accessed by the CPU. + */ +void* VTABufferCPUPtr(VTACommandHandle cmd, void* buffer); + +/*! + * \brief Copy data buffer from one location to another. + * \param cmd The VTA command handle. + * \param from The source buffer base address. + * \param from_offset The offset of the source buffer. + * \param to The target buffer base address. + * \param to_offset The offset of the target buffer. + * \param size Size of copy. + * \param kind_mask The memory copy kind. + */ +void VTABufferCopy(VTACommandHandle cmd, + const void* from, + size_t from_offset, + void* to, + size_t to_offset, + size_t size, + int kind_mask); + +/*! + * \brief Set debug mode on the command handle. + * \param cmd The VTA command handle. + * \param debug_flag The debug flag. + */ +void VTASetDebugMode(VTACommandHandle cmd, int debug_flag); + +/*! + * \brief Perform a write barrier to make a memory region visible to the CPU. + * \param cmd The VTA command handle. + * \param buffer The head buffer pointer. + * \param elem_bits The size in bits of each element. + * \param start The start of the region (in elements). + * \param extent The end of the region (in elements). + */ +void VTAWriteBarrier(VTACommandHandle cmd, + void* buffer, uint32_t elem_bits, + uint32_t start, uint32_t extent); + +/*! + * \brief Perform a read barrier to a memory region visible to VTA. + * \param cmd The VTA command handle. + * \param buffer The head buffer pointer. + * \param elem_bits The unit bits of each elements. + * \param start The start of the region (in elements). + * \param extent The end of the region (in elements). 
+ */ +void VTAReadBarrier(VTACommandHandle cmd, + void* buffer, uint32_t elem_bits, + uint32_t start, uint32_t extent); + +/*! + * \brief Perform a 2D data load from DRAM. + * Sizes are measured in units of vector elements. + * \param cmd The VTA command handle. + * \param src_dram_addr Source DRAM address. + * \param src_elem_offset The source DRAM offset in number of unit elements. + * \param x_size The lowest dimension (x axis) size in number of unit elements. + * \param y_size The number of rows (y axis). + * \param x_stride The x axis stride. + * \param x_pad_before The start padding on x axis. + * \param y_pad_before The start padding on y axis. + * \param x_pad_after The end padding on x axis. + * \param y_pad_after The end padding of y axis. + * \param dst_sram_index Destination SRAM index. + * \param dst_memory_type Destination memory type. + */ +void VTALoadBuffer2D(VTACommandHandle cmd, + void* src_dram_addr, + uint32_t src_elem_offset, + uint32_t x_size, + uint32_t y_size, + uint32_t x_stride, + uint32_t x_pad_before, + uint32_t y_pad_before, + uint32_t x_pad_after, + uint32_t y_pad_after, + uint32_t dst_sram_index, + uint32_t dst_memory_type); + +/*! + * \brief Perform a 2D data store into DRAM + * Sizes are measured in units of vector elements. + * \param cmd The VTA command handle. + * \param src_sram_index Source SRAM index. + * \param src_memory_type Source memory type. + * \param dst_dram_addr Destination DRAM address. + * \param x_size The lowest dimension (x axis) size in number of unit elements. + * \param y_size The number of rows. + * \param x_stride The x axis stride. + */ +void VTAStoreBuffer2D(VTACommandHandle cmd, + uint32_t src_sram_index, + uint32_t src_memory_type, + void* dst_dram_addr, + uint32_t dst_elem_offset, + uint32_t x_size, + uint32_t y_size, + uint32_t x_stride); + +/*! + * \brief Push uop into kernel buffer. + * In GEMM mode, do a blocked GEMM with 2d access pattern. 
+ * In ALU mode, do a vectorized ALU operation with 2d access pattern. + * + * \code + * + * DType accum[INP_BUFF_DEPTH][l][n]; + * DType weight[WGT_BUFF_DEPTH][n][m]; + * DType input[INP_BUFF_DEPTH][l][m]; + * if reset_out == 1 + * accum[dst_index] = 0 + * elif mode == 0 + * accum[dst_index] += GEMM(input[src_index], weight[wgt_index]); + * else + * if (use_imm) + * accum[dst_index] = opcode(accum[dst_index], imm_val); + * else + * accum[dst_index] = opcode(accum[dst_index], accum[src_index]); + * + * \endcode + * + * \param mode Set to GEMM mode if set to 0, ALU mode is set to 1. + * \param reset_out Resets the accum to 0. + * \param dst_index The accum memory index. + * \param src_index The input memory (gemm) / accum memory (alu) index. + * \param wgt_index The weight memory index. + * \param opcode The ALU opcode. + * \param use_imm Use immediate in ALU mode if set to true. + * \param imm_val Immediate value in ALU mode. + */ +void VTAUopPush(uint32_t mode, + uint32_t reset_out, + uint32_t dst_index, + uint32_t src_index, + uint32_t wgt_index, + uint32_t opcode, + uint32_t use_imm, + uint32_t imm_val); + +/*! + * \brief Mark start of a micro op loop. + * \param extent The extent of the loop. + * \param dst_factor The accum factor. + * \param src_factor The input factor. + * \param wgt_factor The weight factor. + */ +void VTAUopLoopBegin(uint32_t extent, + uint32_t dst_factor, + uint32_t src_factor, + uint32_t wgt_factor); + +/*! + * \brief Mark end of a micro op loop. + */ +void VTAUopLoopEnd(); + +/*! + * \brief Push GEMM uop kernel into the command handle. + * \param uop_handle The uop cache handle. + * \param finit The initalization function to initialize uop. + * \param signature The closure arguments of the finit. + * \param nbytes Number of bytes to in the closure arguments. + * \return 0 if success. + */ +int VTAPushGEMMOp(void** uop_handle, + int (*finit)(void*), + void* signature, + int nbytes); + +/*! 
+ * \brief Push ALU uop kernel into the command handle. + * \param uop_handle The uop cache handle. + * \param finit The initalization function to initialize uop. + * \param signature The closure arguments of the finit. + * \param nbytes Number of bytes to in the closure arguments. + * \return 0 if success. + */ +int VTAPushALUOp(void** uop_handle, + int (*finit)(void*), + void* signature, + int nbytes); + +/*! + * \brief Push dependence token. + * \param cmd The VTA command handle. + * \param from_qid The source queue. + * \param to_qid The destination queue. + * \return 0 if success. + */ +int VTADepPush(VTACommandHandle cmd, int from_qid, int to_qid); + +/*! + * \brief Pop dependence signal. + * \param cmd The VTA command handle. + * \param from_qid The source queue. + * \param to_qid The destination queue. + * \return 0 if success. + */ +int VTADepPop(VTACommandHandle cmd, int from_qid, int to_qid); + +/*! + * \brief Synchronize the command handle. + * Commit all the instructions to VTA and wait until + * the accelerator finishes its job. + * Perform all of the out-of-order DRAM stores. + * \param cmd The VTA command handle. + * \param wait_cycles The limit of poll cycles. + * + */ +void VTASynchronize(VTACommandHandle cmd, uint32_t wait_cycles); + +#ifdef __cplusplus +} +#endif +#endif // VTA_RUNTIME_H_ diff --git a/vta/include/vta_pynq_driver.h b/vta/include/vta_pynq_driver.h deleted file mode 100644 index 9a078468..00000000 --- a/vta/include/vta_pynq_driver.h +++ /dev/null @@ -1,152 +0,0 @@ -/*! - * Copyright (c) 2018 by Contributors - * \file vta_pynq_driver.h - * \brief VTA driver for Pynq board. 
- */ - -#ifndef VTA_PYNQ_DRIVER_H_ -#define VTA_PYNQ_DRIVER_H_ - -#ifdef __cplusplus -extern "C" { -#endif -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef __arm__ -#include "libxlnk_cma.h" -#else -void* cma_alloc(size_t size, int cached); -void cma_free(void* buf); -uint32_t cma_get_phy_addr(void* buf); -void xlnkFlushCache(void* buf, int size); -void xlnkInvalidateCache(void* buf, int size); -#endif - -/*! \brief VTA command handle */ -typedef void * VTAHandle; - -/*! \brief DMA command handle */ -typedef struct { - /*! \brief Register map to the AXI DMA control registers*/ - void *dma_register_map; - /*! \brief Transmit data descriptor*/ - void *mm2s_descriptor_register_map; - /*! \brief Receive data descriptor*/ - void *s2mm_descriptor_register_map; - /*! \brief Transmit data descriptor physical address*/ - uint32_t mm2s_descriptor_phy; - /*! \brief Receive data descriptor physical address*/ - uint32_t s2mm_descriptor_phy; - /*! \brief Descriptor size */ - uint32_t descriptor_size; - /*! \brief Transaction count for tx channel */ - uint32_t mm2s_count; - /*! \brief Transaction count for rx channel */ - uint32_t s2mm_count; - /*! \brief Multi-channel mode enable */ - int multichannel_en; -} DMAHandle; - -/*! \brief partial bitstream status file path */ -#define BS_IS_PARTIAL "/sys/devices/soc0/amba/f8007000.devcfg/is_partial_bitstream" -/*! \brief bitstream destination file path */ -#define BS_XDEVCFG "/dev/xdevcfg" - -/*! \brief Path to /dev/mem */ -#define DEV_MEM_PATH "/dev/mem" -/*! \brief MMIO driver constant */ -#define MMIO_WORD_LENGTH 4 -/*! \brief MMIO driver constant */ -#define MMIO_WORD_MASK (~(MMIO_WORD_LENGTH - 1)) - -/*! \brief VTA configuration register address range */ -#define VTA_RANGE 0x100 -/*! \brief VTA configuration register start value */ -#define VTA_START 0x1 -/*! \brief VTA configuration register auto-restart value */ -#define VTA_AUTORESTART 0x81 -/*! 
\brief VTA configuration register done value */ -#define VTA_DONE 0x1 - -/*! \brief VTA fetch stage configuration register address -* from auto-generated XPAR_FETCH_0_S_AXI_CONTROL_BUS_BASEADDR define -* in xparameters.h (under build/vivado//export/bsp/ps7_cortexa9_0/include) -*/ -#define VTA_FETCH_ADDR 0x43C00000 -/*! \brief VTA compute stage configuration register address -* from auto-generated XPAR_COMPUTE_0_S_AXI_CONTROL_BUS_BASEADDR define -* in xparameters.h (under build/vivado//export/bsp/ps7_cortexa9_0/include) -*/ -#define VTA_COMPUTE_ADDR 0x43C10000 -/*! \brief VTA compute stage configuration register address -* from auto-generated XPAR_LOAD_0_S_AXI_CONTROL_BUS_BASEADDR define -* in xparameters.h (under build/vivado//export/bsp/ps7_cortexa9_0/include) -*/ -#define VTA_LOAD_ADDR 0x43C20000 -/*! \brief VTA store stage configuration register address -* from auto-generated XPAR_STORE_0_S_AXI_CONTROL_BUS_BASEADDR define -* in xparameters.h (under build/vivado//export/bsp/ps7_cortexa9_0/include) -*/ -#define VTA_STORE_ADDR 0x43C30000 - -/*! \brief Memory management constants with libxlnk_cma */ -#define CACHED 1 -/*! \brief Memory management constants with libxlnk_cma */ -#define NOT_CACHED 0 - -/*! \brief log2 of SDS buffer size limit */ -#define LOG_MAX_XFER 22 -/*! 
\brief SDS buffer size limit */ -#define MAX_XFER (1< 8 bits) +LOG_INP_WIDTH = 3 +# Log of kernel weight width in bits (default 3 -> 8 bits) +LOG_WGT_WIDTH = 3 +# Log of accum width in bits (default 5 -> 32 bits) +LOG_ACC_WIDTH = 5 +# Log of tensor batch size (A in (A,B)x(B,C) matrix multiplication) +LOG_BATCH = 0 +# Log of tensor inner block size (B in (A,B)x(B,C) matrix multiplication) +LOG_BLOCK_IN = 4 +# Log of tensor outer block size (C in (A,B)x(B,C) matrix multiplication) +LOG_BLOCK_OUT = 4 +# Log of uop buffer size in Bytes +LOG_UOP_BUFF_SIZE = 15 +# Log of inp buffer size in Bytes +LOG_INP_BUFF_SIZE = 15 +# Log of wgt buffer size in Bytes +LOG_WGT_BUFF_SIZE = 15 +# Log of acc buffer size in Bytes +LOG_ACC_BUFF_SIZE = 17 + +#--------------------- +# Derived VTA hardware parameters +#-------------------- + +# Input width in bits +INP_WIDTH = $(shell echo "$$(( 1 << $(LOG_INP_WIDTH) ))" ) +# Weight width in bits +WGT_WIDTH = $(shell echo "$$(( 1 << $(LOG_WGT_WIDTH) ))" ) +# Log of output width in bits +LOG_OUT_WIDTH = $(LOG_INP_WIDTH) +# Output width in bits +OUT_WIDTH = $(shell echo "$$(( 1 << $(LOG_OUT_WIDTH) ))" ) +# Tensor batch size +BATCH = $(shell echo "$$(( 1 << $(LOG_BATCH) ))" ) +# Tensor outer block size +IN_BLOCK = $(shell echo "$$(( 1 << $(LOG_BLOCK_IN) ))" ) +# Tensor inner block size +OUT_BLOCK = $(shell echo "$$(( 1 << $(LOG_BLOCK_OUT) ))" ) +# Uop buffer size in Bytes +UOP_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_UOP_BUFF_SIZE) ))" ) +# Inp buffer size in Bytes +INP_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_INP_BUFF_SIZE) ))" ) +# Wgt buffer size in Bytes +WGT_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_WGT_BUFF_SIZE) ))" ) +# Acc buffer size in Bytes +ACC_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_ACC_BUFF_SIZE) ))" ) +# Log of out buffer size in Bytes +LOG_OUT_BUFF_SIZE = $(shell echo "$$(( $(LOG_ACC_BUFF_SIZE)+$(LOG_OUT_WIDTH)-$(LOG_ACC_WIDTH) ))" ) +# Out buffer size in Bytes +OUT_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_OUT_BUFF_SIZE) ))" 
) + +# Update ADD_CFLAGS +ADD_CFLAGS += \ + -D$(TARGET) \ + -DLOG_WGT_WIDTH=$(LOG_WGT_WIDTH) -DLOG_INP_WIDTH=$(LOG_INP_WIDTH) \ + -DLOG_ACC_WIDTH=$(LOG_ACC_WIDTH) -DLOG_OUT_WIDTH=$(LOG_OUT_WIDTH) \ + -DLOG_BATCH=$(LOG_BATCH) -DLOG_BLOCK_IN=$(LOG_BLOCK_IN) -DLOG_BLOCK_OUT=$(LOG_BLOCK_OUT) \ + -DLOG_UOP_BUFF_SIZE=$(LOG_UOP_BUFF_SIZE) -DLOG_INP_BUFF_SIZE=$(LOG_INP_BUFF_SIZE) \ + -DLOG_WGT_BUFF_SIZE=$(LOG_WGT_BUFF_SIZE) -DLOG_ACC_BUFF_SIZE=$(LOG_ACC_BUFF_SIZE) \ + -DLOG_OUT_BUFF_SIZE=$(LOG_OUT_BUFF_SIZE) \ No newline at end of file diff --git a/vta/src/driver/pynq/vta_pynq_driver.c b/vta/src/pynq/pynq_driver.cc similarity index 67% rename from vta/src/driver/pynq/vta_pynq_driver.c rename to vta/src/pynq/pynq_driver.cc index ca9464cf..b4f78db0 100644 --- a/vta/src/driver/pynq/vta_pynq_driver.c +++ b/vta/src/pynq/pynq_driver.cc @@ -4,15 +4,31 @@ * \brief VTA driver for Pynq board. */ -#ifdef __cplusplus -extern "C" { -#endif -#include "vta_pynq_driver.h" -#ifdef __cplusplus -} -#endif +#include +#include "./pynq_driver.h" -void *MapRegister(uint32_t addr, size_t length) { + +void* VTAMemAlloc(size_t size, int cached) { + return cma_alloc(size, cached); +} + +void VTAMemFree(void* buf) { + cma_free(buf); +} + +uint32_t VTAGetMemPhysAddr(void* buf) { + return cma_get_phy_addr(buf); +} + +void VTAFlushCache(void* buf, int size) { + xlnkFlushCache(buf, size); +} + +void VTAInvalidateCache(void* buf, int size) { + xlnkInvalidateCache(buf, size); +} + +void *VTAMapRegister(uint32_t addr, size_t length) { // Align the base address with the pages uint32_t virt_base = addr & ~(getpagesize() - 1); @@ -24,21 +40,21 @@ void *MapRegister(uint32_t addr, size_t length) { return mmap(NULL, (length+virt_offset), PROT_READ|PROT_WRITE, MAP_SHARED, mmap_file, virt_base); } -void UnmapRegister(void *vta, size_t length) { +void VTAUnmapRegister(void *vta, size_t length) { // Unmap memory int status = munmap(vta, length); assert(status==0); } -void WriteMappedReg(void* base_addr, uint32_t 
offset, uint32_t val) { +void VTAWriteMappedReg(void* base_addr, uint32_t offset, uint32_t val) { *((volatile uint32_t *) (((char *) base_addr) + offset)) = val; } -uint32_t ReadMappedReg(void* base_addr, uint32_t offset) { +uint32_t VTAReadMappedReg(void* base_addr, uint32_t offset) { return *((volatile uint32_t *) (((char *) base_addr) + offset)); } -void ProgramVTA(const char* bitstream) { +void VTAProgram(const char* bitstream) { int elem; FILE *src, *dst, *partial; diff --git a/vta/src/pynq/pynq_driver.h b/vta/src/pynq/pynq_driver.h new file mode 100644 index 00000000..9e948282 --- /dev/null +++ b/vta/src/pynq/pynq_driver.h @@ -0,0 +1,83 @@ +/*! + * Copyright (c) 2018 by Contributors + * \file vta_pynq_driver.h + * \brief VTA driver for Pynq board. + */ + +#ifndef VTA_PYNQ_DRIVER_H_ +#define VTA_PYNQ_DRIVER_H_ + +#ifdef __cplusplus +extern "C" { +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __arm__ +#include +#else +void* cma_alloc(size_t size, int cached); +void cma_free(void* buf); +uint32_t cma_get_phy_addr(void* buf); +void xlnkFlushCache(void* buf, int size); +void xlnkInvalidateCache(void* buf, int size); +#endif + +/*! \brief partial bitstream status file path */ +#define BS_IS_PARTIAL "/sys/devices/soc0/amba/f8007000.devcfg/is_partial_bitstream" +/*! \brief bitstream destination file path */ +#define BS_XDEVCFG "/dev/xdevcfg" + +/*! \brief Path to /dev/mem */ +#define DEV_MEM_PATH "/dev/mem" +/*! \brief MMIO driver constant */ +#define MMIO_WORD_LENGTH 4 +/*! \brief MMIO driver constant */ +#define MMIO_WORD_MASK (~(MMIO_WORD_LENGTH - 1)) + +/*! \brief VTA configuration register address range */ +#define VTA_RANGE 0x100 +/*! \brief VTA configuration register start value */ +#define VTA_START 0x1 +/*! \brief VTA configuration register auto-restart value */ +#define VTA_AUTORESTART 0x81 +/*! \brief VTA configuration register done value */ +#define VTA_DONE 0x1 + +/*! 
\brief VTA fetch stage configuration register address +* from auto-generated XPAR_FETCH_0_S_AXI_CONTROL_BUS_BASEADDR define +* in xparameters.h (under build/vivado//export/bsp/ps7_cortexa9_0/include) +*/ +#define VTA_FETCH_ADDR 0x43C00000 +/*! \brief VTA compute stage configuration register address +* from auto-generated XPAR_COMPUTE_0_S_AXI_CONTROL_BUS_BASEADDR define +* in xparameters.h (under build/vivado//export/bsp/ps7_cortexa9_0/include) +*/ +#define VTA_COMPUTE_ADDR 0x43C10000 +/*! \brief VTA load stage configuration register address +* from auto-generated XPAR_LOAD_0_S_AXI_CONTROL_BUS_BASEADDR define +* in xparameters.h (under build/vivado//export/bsp/ps7_cortexa9_0/include) +*/ +#define VTA_LOAD_ADDR 0x43C20000 +/*! \brief VTA store stage configuration register address +* from auto-generated XPAR_STORE_0_S_AXI_CONTROL_BUS_BASEADDR define +* in xparameters.h (under build/vivado//export/bsp/ps7_cortexa9_0/include) +*/ +#define VTA_STORE_ADDR 0x43C30000 + +/*! \brief Buffer size limit */ +#define MAX_XFER (1<<22) + +#ifdef __cplusplus +} +#endif +#endif // VTA_PYNQ_DRIVER_H_ \ No newline at end of file diff --git a/vta/src/runtime.cc b/vta/src/runtime.cc new file mode 100644 index 00000000..570816e5 --- /dev/null +++ b/vta/src/runtime.cc @@ -0,0 +1,1410 @@ +/*! + * Copyright (c) 2018 by Contributors + * \file runtime.cc + * \brief VTA runtime for PYNQ in C++11 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef PYNQ_TARGET +#include "./pynq/pynq_driver.h" +#endif //PYNQ_TARGET + +namespace vta { + +/*! \brief Enable coherent access between VTA and CPU. */ +static const bool kBufferCoherent = true; + +/*! + * \brief Data buffer represents data on CMA. + */ +struct DataBuffer { + /*! \return Virtual address of the data. */ + void* virt_addr() const { + return data_; + } + /*! \return Physical address of the data. */ + uint32_t phy_addr() const { + return phy_addr_; + } + /*! 
+ * \brief Invalidate the cache of given location in data buffer. + * \param offset The offset to the data. + * \param size The size of the data. + */ + void InvalidateCache(size_t offset, size_t size) { + if (!kBufferCoherent) { + VTAInvalidateCache(reinterpret_cast(phy_addr_ + offset), size); + } + } + /*! + * \brief Flush the cache of certain location in data buffer. + * \param offset The offset to the data. + * \param size The size of the data. + */ + void FlushCache(size_t offset, size_t size) { + if (!kBufferCoherent) { + VTAFlushCache(reinterpret_cast(phy_addr_ + offset), size); + } + } + /*! + * \brief Allocate a buffer of a given size. + * \param size The size of the buffer. + */ + static DataBuffer* Alloc(size_t size) { + void* data = VTAMemAlloc(size, 1); + assert(data != nullptr); + DataBuffer* buffer = new DataBuffer(); + buffer->data_ = data; + buffer->phy_addr_ = VTAGetMemPhysAddr(data); + return buffer; + } + /*! + * \brief Free the data buffer. + * \param buffer The buffer to be freed. + */ + static void Free(DataBuffer* buffer) { + VTAMemFree(buffer->data_); + delete buffer; + } + /*! + * \brief Create data buffer header from buffer ptr. + * \param buffer The buffer pointer. + * \return The corresponding data buffer header. + */ + static DataBuffer* FromHandle(const void* buffer) { + return const_cast( + reinterpret_cast(buffer)); + } + + private: + /*! \brief The internal data. */ + void* data_; + /*! \brief The physical address of the buffer, excluding header. */ + uint32_t phy_addr_; +}; + +/*! + * \brief Micro op kernel. + * Contains functions to construct the kernel with prefix Push. + */ +class UopKernel { + public: + /*! \brief Loop information. */ + struct LoopEntry { + uint32_t extent; + uint32_t dst_factor; + uint32_t src_factor; + uint32_t wgt_factor; + }; + /*! + * \brief Construct UopKernel with signature. + * \param signature The pointer to signature. + * \param nbytes Number of bytes. 
+ */ + UopKernel(const char* signature, int nbytes) + : signature_(signature, signature + nbytes) { + } + /*! + * \brief Verify if the signature is correct. + * \param signature Signature ptr. + * \param nbytes Number of bytes. + */ + bool MatchSignature(void* signature, int nbytes) const { + if (static_cast(nbytes) != signature_.size()) return false; + return memcmp(signature, signature_.data(), nbytes) == 0; + } + /*! \return Whether the kernel is cached in SRAM. */ + bool cached() const { + return sram_begin_ != sram_end_; + } + /*! \return The length of the micro op sequence. */ + size_t size() const { + return seq_.size(); + } + /*! \return The micro-op data. */ + const VTAUop* data() const { + return seq_.data(); + } + /*! \return The loop structure. */ + const std::vector& loop() const { + return loop_; + } + /*! + * \brief Declare loop start. + * \param extent The loop extent. + * \param dst_factor Loop factor of accum index. + * \param src_factor Loop factor of input index + * \param wgt_factor Loop factor of weight index. + */ + void PushLoopBegin(uint32_t extent, + uint32_t dst_factor, + uint32_t src_factor, + uint32_t wgt_factor) { + LoopEntry le; + le.extent = extent; + le.dst_factor = dst_factor; + le.src_factor = src_factor; + le.wgt_factor = wgt_factor; + assert(seq_.size() == 0); + assert(loop_.size() < 2); + loop_.push_back(le); + ++loop_ptr_; + } + /*! + * \brief Declare loop end. + */ + void PushLoopEnd() { + --loop_ptr_; + } + /*! + * \brief Push micro op into kernel. + * \param mode Set to GEMM mode if set to 0, ALU mode is set to 1. + * \param reset_out Resets the accum to 0. + * \param dst_index The accum memory index. + * \param src_index The input memory (gemm) / accum memory (alu) index. + * \param wgt_index The weight memory index. + * \param opcode The ALU opcode. + * \param use_imm Use immediate in ALU mode if set to true. + * \param imm_val Immediate value in ALU mode. 
+ */ + void Push(uint32_t mode, + uint32_t reset_out, + uint32_t dst_index, + uint32_t src_index, + uint32_t wgt_index, + uint32_t opcode, + uint32_t use_imm, + uint32_t imm_val) { + // The loop nest structure + VerifyDep(dst_index); + VTAUop op; + op.reset_out = reset_out; + op.dst_idx = dst_index; + op.src_idx = src_index; + op.wgt_idx = wgt_index; + seq_.push_back(op); + // Ensure that mode is consistent if set + if (mode_==0xFFFFFFFF) { + mode_ = mode; + } else { + assert(mode_==mode); + } + // Check kernel op and imm/imm_val in ALU mode + if (mode==1) { + if (opcode_==0xFFFFFFFF) { + opcode_=opcode; + use_imm_=use_imm; + imm_val_=imm_val; + } else { + assert(opcode_==opcode); + assert(use_imm_==use_imm); + assert(imm_val_==imm_val); + } + } + } + /*! \brief Dump kernel micro ops to stdout. */ + void Dump() { + uint32_t size = seq_.size(); + printf("There are %u uops\n", size); + for (uint32_t i = 0; i < size; ++i) { + printf("[%04u]\t acc=%u, inp=%u, wgt=%u, reset_out=%u\n", + i, + seq_[i].dst_idx, + seq_[i].src_idx, + seq_[i].wgt_idx, + seq_[i].reset_out); + + } + printf("\n"); + } + + public: + // The kernel's mode, opcode, immediate setting and value + uint32_t mode_{0xFFFFFFFF}; // UOP type: 0xFFFFFFFF - unset, 0 - GEMM, 1 - ALU + uint32_t opcode_{0xFFFFFFFF}; + bool use_imm_{false}; + uint16_t imm_val_{0}; + private: + // Verify that we don't write to the same acc_mem index two cycles in a row + void VerifyDep(uint32_t dst_index) { + size_t step = std::min(static_cast(2U), seq_.size()); + for (size_t i = seq_.size() - step; i < seq_.size(); ++i) { + assert(seq_[i].dst_idx != dst_index); + } + } + // The uop buffer + template + friend class UopQueue; + friend class CommandQueue; + // SRAM location if begin != end. 
+ uint32_t sram_begin_{0}; + uint32_t sram_end_{0}; + // The signature used for verification + std::vector signature_; + // Internal sequence + std::vector seq_; + // The loop nest structure specific to ALU instructions + std::vector loop_; + // The loop pointer + size_t loop_ptr_{0}; +}; + +/*! + * \brief Base class of all queues to send and recv serial data. + * \param kElemBytes Element unit bytes. + * \param kMaxBytes Maximum number of bytes. + * \param kCoherent Whether we have coherent access to the buffer. + * \param kAlwaysCache Whether we should use cached memory. + */ +class BaseQueue { + public: + ~BaseQueue() { + if (dram_buffer_ != nullptr) { + VTAMemFree(dram_buffer_); + } + } + /*! \return Content of DRAM buffer. */ + char* dram_buffer() const { + return dram_buffer_; + } + /*! \return Physical address of DRAM. */ + uint32_t dram_phy_addr() const { + return dram_phy_addr_; + } + /*! \return Whether there is pending information. */ + bool pending() const { + return sram_begin_ != sram_end_; + } + /*! \brief Initialize the space of the buffer. */ + void InitSpace(uint32_t elem_bytes, uint32_t max_bytes, bool coherent, bool always_cache) { + coherent_ = coherent; + always_cache_ = always_cache; + elem_bytes_ = elem_bytes; + dram_buffer_ = static_cast(VTAMemAlloc( + max_bytes, coherent || always_cache_)); + assert(dram_buffer_ != nullptr); + dram_phy_addr_ = VTAGetMemPhysAddr(dram_buffer_); + } + /*! + * \brief Reset the pointer of the buffer. + * Set SRAM pointer to be the current end. + */ + void Reset() { + dram_begin_ = dram_end_ = 0; + sram_begin_ = sram_end_; + } + void AutoReadBarrier() { + ReadBarrier(elem_bytes_ * 8, 0, dram_end_); + } + /*! \brief Writer barrier to make sure that data written by CPU is visible to VTA. 
*/ + void ReadBarrier(uint32_t elem_bits, uint32_t dram_begin, uint32_t dram_extent) { + if (!coherent_ && always_cache_ && dram_extent != 0) { + dram_begin = dram_begin * elem_bits / 8; + dram_extent = dram_extent * elem_bits / 8; + VTAFlushCache(reinterpret_cast(dram_phy_addr_ + dram_begin), + dram_extent); + } + } + /*! \brief Read barrier to make sure that data written by VTA is visible to CPU. */ + void WriteBarrier(uint32_t elem_bits, uint32_t dram_begin, uint32_t dram_extent) { + if (!coherent_ && always_cache_ && dram_extent != 0) { + dram_begin = dram_begin * elem_bits / 8; + dram_extent = dram_extent * elem_bits / 8; + VTAInvalidateCache(reinterpret_cast(dram_phy_addr_ + dram_begin), + dram_extent); + } + } + + protected: + // Cache coherence access + bool coherent_{false}; + // Make the buffer cacheable + bool always_cache_{false}; + // Element bytes + uint32_t elem_bytes_{0}; + // Begin location of current SRAM read in FIFO mode + uint32_t sram_begin_{0}; + // End location of current SRAM write in FIFO mode + uint32_t sram_end_{0}; + // The current pending offset in DRAM in FIFO mode + uint32_t dram_begin_{0}; + // The current pending offset in DRAM in FIFO mode + uint32_t dram_end_{0}; + // The buffer in DRAM + char* dram_buffer_{nullptr}; + // Physical address of the buffer + uint32_t dram_phy_addr_; +}; + +/*! + * \brief Micro op buffer that manages the micro op cache. 
+ */ +template +class UopQueue : public BaseQueue { + public: + void InitSpace() { + BaseQueue::InitSpace(kElemBytes, kMaxBytes, kCoherent, kAlwaysCache); + } + // Push data to the queue + template + void Push(UopKernel* kernel, FAutoSync fautosync) { + if (kernel->cached()) return; + size_t num_op = kernel->size(); + if (dram_end_ + num_op > kMaxElems) { + fautosync(); + assert(dram_end_ <= kMaxElems); + } + assert(num_op <= kMaxNumUop); + uint32_t uop_begin = 0; + if (sram_end_ + num_op > kMaxElems) { + // Need to evict + cache_ptr_ = 0; + sram_end_ = num_op; + } else { + uop_begin = sram_end_; + sram_end_ += num_op; + } + // Simple eviction policy + uint32_t evict_begin = cache_ptr_; + for (;cache_ptr_ < cache_.size(); ++cache_ptr_) { + if (cache_[cache_ptr_]->sram_begin_ >= sram_end_) break; + cache_[cache_ptr_]->sram_begin_ = 0; + cache_[cache_ptr_]->sram_end_ = 0; + } + memcpy(dram_buffer_ + dram_end_ * kElemBytes, + kernel->data(), + num_op * kElemBytes); + dram_end_ += num_op; + kernel->sram_begin_ = uop_begin; + kernel->sram_end_ = sram_end_; + assert(uop_begin != sram_end_); + cache_.insert(cache_.begin() + cache_ptr_, kernel); + cache_.erase(cache_.begin() + evict_begin, cache_.begin() + cache_ptr_); + cache_ptr_ = evict_begin + 1; + } + // Flush as weight load + void FlushUopLoad(VTAMemInsn* insn) { + if (sram_begin_ != sram_end_) { + assert((dram_end_ - dram_begin_) == (sram_end_ - sram_begin_)); + insn->memory_type = MEM_ID_UOP; + insn->sram_base = sram_begin_; + insn->dram_base = dram_phy_addr_ / kElemBytes + dram_begin_; + insn->y_size = 1; + insn->x_size = (dram_end_ - dram_begin_); + insn->x_stride = (dram_end_ - dram_begin_); + insn->y_pad_0 = 0; + insn->y_pad_1 = 0; + insn->x_pad_0 = 0; + insn->x_pad_1 = 0; + // Reset indices + sram_begin_ = sram_end_; + dram_begin_ = dram_end_; + } + } + + private: + // Cache pointer + uint32_t cache_ptr_{0}; + // Cached ring, sorted by sram_begin + std::vector cache_; + // Constants + static constexpr int 
kElemBytes = sizeof(VTAUop); + static constexpr int kMaxNumUop = UOP_BUFF_DEPTH; + static constexpr int kMaxElems = kMaxBytes / kElemBytes; +}; + +// Internal kernel structure +class UopKernelMap { + public: + // Simple hash map + UopKernel** Get(void* signature, + int nbytes) { + uint32_t key = 0; + assert(nbytes == 0 || nbytes == sizeof(int)); + if (nbytes == sizeof(int)) { + memcpy(&key, signature, sizeof(int)); + key = key + 1; + } + assert(key < 100); + if (kmap_.size() <= key) { + kmap_.resize(key + 1, nullptr); + } + return &(kmap_[key]); + } + + private: + std::vector kmap_; +}; + +enum PipelineStage : int { + kNoneStage = 0, + kLoadStage = 1, + kComputeStage = 2, + kStoreStage = 3 +}; + +// Instruction Queue +template +class InsnQueue : public BaseQueue { + public: + /*! \brief Initialize the space. */ + void InitSpace() { + BaseQueue::InitSpace(kElemBytes, kMaxBytes, kCoherent, kAlwaysCache); + // Initialize the stage + std::fill(pending_pop_prev_, pending_pop_prev_ + 4, 0); + std::fill(pending_pop_next_, pending_pop_next_ + 4, 0); + } + /*! \return The data pointer. */ + VTAGenericInsn* data() { + return reinterpret_cast(dram_buffer_); + } + /*! \return Number of instructions. 
*/ + uint32_t count() { + return dram_end_; + } + // Insert dependency pop + void DepPop(int from, int to) { + // NOTE: This instruction executes on queue[to] + if (from < to) { + if (pending_pop_prev_[to]) { + this->CommitPendingPop(to); + } + pending_pop_prev_[to] = 1; + } else { + if (pending_pop_next_[to]) { + this->CommitPendingPop(to); + } + pending_pop_next_[to] = 1; + } + // Impossible condition + assert(from != kLoadStage || to != kStoreStage); + assert(to != kLoadStage || to != kComputeStage); + } + // Insert dependency push of load + void DepPush(int from, int to) { + // NOTE: this instruction executes on queue[from] + this->CommitPendingPop(from); + if (dram_end_ != 0) { + VTAMemInsn* mptr = + reinterpret_cast(dram_buffer_) + dram_end_ - 1; + if (GetPipelineStage(mptr) == from) { + if (from < to && !mptr->push_next_dep) { + // push(LD->C) or push(C->ST) + mptr->push_next_dep = true; return; + } else if (from > to && !mptr->push_prev_dep) { + // push(C->LD) or push(ST->C) + mptr->push_prev_dep = true; return; + } + } + } + if (from < to) { + // Push next dep + PushNoop(from, false, true, false, false); + } else { + // Push prev dep + PushNoop(from, true, false, false, false); + } + } + // Create a new instruction for a GEMM stage + VTAGemInsn* CreateGemInsn() { + return reinterpret_cast( + Create(kComputeStage)); + } + // Create a new instruction for an ALU stage + VTAAluInsn* CreateAluInsn() { + return reinterpret_cast( + Create(kComputeStage)); + } + // Create a new instruction for a memory stage + VTAMemInsn* CreateMemInsn(int memory_type) { + return reinterpret_cast( + Create(GetMemPipelineStage(memory_type))); + } + // create a new instruction for a store stage + VTAMemInsn* CreateStoreInsn() { + return reinterpret_cast( + Create(kStoreStage)); + } + // Rewrite instruction stream to force serial execution + void RewriteForceSerial() { + int insn_count = count(); + VTAMemInsn* mem_ptr = reinterpret_cast(data()); + for (int i = 1; i < 
insn_count; ++i) { + PipelineStage prev = GetPipelineStage(mem_ptr + i - 1); + PipelineStage now = GetPipelineStage(mem_ptr + i); + if (prev==kLoadStage && now==kComputeStage) { + mem_ptr[i - 1].push_prev_dep = false; + mem_ptr[i - 1].push_next_dep = true; + mem_ptr[i].pop_prev_dep = true; + mem_ptr[i].pop_next_dep = false; + } else if (prev==kComputeStage && now==kLoadStage) { + mem_ptr[i - 1].push_prev_dep = true; + mem_ptr[i - 1].push_next_dep = false; + mem_ptr[i].pop_prev_dep = false; + mem_ptr[i].pop_next_dep = true; + } else if (prev==kStoreStage && now==kComputeStage) { + mem_ptr[i - 1].push_prev_dep = true; + mem_ptr[i - 1].push_next_dep = false; + mem_ptr[i].pop_prev_dep = false; + mem_ptr[i].pop_next_dep = true; + } else if (prev==kComputeStage && now==kStoreStage) { + mem_ptr[i - 1].push_prev_dep = false; + mem_ptr[i - 1].push_next_dep = true; + mem_ptr[i].pop_prev_dep = true; + mem_ptr[i].pop_next_dep = false; + } else { + mem_ptr[i - 1].push_prev_dep = false; + mem_ptr[i - 1].push_next_dep = false; + mem_ptr[i].pop_prev_dep = false; + mem_ptr[i].pop_next_dep = false; + } + } + } + + // Helper function: Get Opcode string + const char* getOpcodeString(int opcode, bool use_imm) { + // The string name + if (opcode==ALU_OPCODE_MIN) { + if (use_imm) { + return "min imm"; + } else { + return "min"; + } + } else if (opcode==ALU_OPCODE_MAX) { + if (use_imm) { + return "max imm"; + } else { + return "max"; + } + } else if (opcode==ALU_OPCODE_ADD) { + if (use_imm) { + return "add imm"; + } else { + return "add"; + } + } else if (opcode==ALU_OPCODE_SUB) { + if (use_imm) { + return "sub imm"; + } else { + return "sub"; + } + } else if (opcode==ALU_OPCODE_MUL) { + if (use_imm) { + return "mul imm"; + } else { + return "mul"; + } + } else if (opcode==ALU_OPCODE_SHL) { + return "shl"; + } else if (opcode==ALU_OPCODE_SHR) { + return "shr"; + } + + return "unknown op"; + } + + // Dump instructions in the queue + void DumpInsn() { + // Keep tabs on dependence queues + 
int l2g_queue = 0; + int g2l_queue = 0; + int s2g_queue = 0; + int g2s_queue = 0; + // Converter + union VTAInsn c; + // Iterate over all instructions + int insn_count = count(); + const VTAGenericInsn* insn = data(); + printf("There are %u instructions\n", insn_count); + for (int i = 0; i < insn_count; ++i) { + // Fetch instruction and decode opcode + c.generic = insn[i]; + printf("INSTRUCTION %u: ", i); + if (c.mem.opcode == OPCODE_LOAD || c.mem.opcode == OPCODE_STORE) { + if (c.mem.x_size == 0) { + if (c.mem.opcode == OPCODE_STORE) { + printf("NOP-STORE-STAGE\n"); + } + else if (GetMemPipelineStage(c.mem.memory_type) == kComputeStage) { + printf("NOP-COMPUTE-STAGE\n"); + } else { + printf("NOP-MEMORY-STAGE\n"); + } + printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n", + static_cast(c.mem.pop_prev_dep), + static_cast(c.mem.pop_next_dep), + static_cast(c.mem.push_prev_dep), + static_cast(c.mem.push_next_dep)); + // Count status in queues + if (c.mem.opcode == OPCODE_LOAD || c.mem.opcode == OPCODE_STORE) { + if (c.mem.opcode == OPCODE_STORE) { + assert(c.mem.pop_next_dep == false); + assert(c.mem.push_next_dep == false); + if (c.mem.pop_prev_dep) g2s_queue--; + if (c.mem.push_prev_dep) s2g_queue++; + } else if (c.mem.opcode == OPCODE_LOAD && + (c.mem.memory_type == MEM_ID_INP || + c.mem.memory_type == MEM_ID_WGT) ) { + assert(c.mem.pop_prev_dep == false); + assert(c.mem.push_prev_dep == false); + if (c.mem.pop_next_dep) g2l_queue--; + if (c.mem.push_next_dep) l2g_queue++; + } else { + if (c.mem.pop_prev_dep) l2g_queue--; + if (c.mem.push_prev_dep) g2l_queue++; + if (c.mem.pop_next_dep) s2g_queue--; + if (c.mem.push_next_dep) g2s_queue++; + } + } else if (c.mem.opcode == OPCODE_GEMM) { + // Print instruction field information + if (c.gemm.pop_prev_dep) l2g_queue--; + if (c.gemm.push_prev_dep) g2l_queue++; + if (c.gemm.pop_next_dep) s2g_queue--; + if (c.gemm.push_next_dep) g2s_queue++; + } + printf("\tl2g_queue = %d, g2l_queue = %d\n", 
l2g_queue, g2l_queue); + printf("\ts2g_queue = %d, g2s_queue = %d\n", s2g_queue, g2s_queue); + continue; + } + // Print instruction field information + if (c.mem.opcode==OPCODE_LOAD) { + printf("LOAD "); + if (c.mem.memory_type == MEM_ID_UOP) printf("UOP\n"); + if (c.mem.memory_type == MEM_ID_WGT) printf("WGT\n"); + if (c.mem.memory_type == MEM_ID_INP) printf("INP\n"); + if (c.mem.memory_type == MEM_ID_ACC) printf("ACC\n"); + } + if (c.mem.opcode==OPCODE_STORE) { + printf("STORE\n"); + } + printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n", + static_cast(c.mem.pop_prev_dep), + static_cast(c.mem.pop_next_dep), + static_cast(c.mem.push_prev_dep), + static_cast(c.mem.push_next_dep)); + printf("\tDRAM: 0x%08x, SRAM:0x%04x\n", + static_cast(c.mem.dram_base), + static_cast(c.mem.sram_base)); + printf("\ty: size=%d, pad=[%d, %d]\n", + static_cast(c.mem.y_size), + static_cast(c.mem.y_pad_0), + static_cast(c.mem.y_pad_1)); + printf("\tx: size=%d, stride=%d, pad=[%d, %d]\n", + static_cast(c.mem.x_size), + static_cast(c.mem.x_stride), + static_cast(c.mem.x_pad_0), + static_cast(c.mem.x_pad_1)); + } else if (c.mem.opcode==OPCODE_GEMM) { + // Print instruction field information + printf("GEMM\n"); + + printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n", + static_cast(c.mem.pop_prev_dep), + static_cast(c.mem.pop_next_dep), + static_cast(c.mem.push_prev_dep), + static_cast(c.mem.push_next_dep)); + printf("\trange (%d, %d)\n", + static_cast(c.gemm.uop_bgn), + static_cast(c.gemm.uop_end)); + printf("\touter loop - iter: %d, wgt: %d, inp: %d, acc: %d\n", + static_cast(c.gemm.iter_out), + static_cast(c.gemm.wgt_factor_out), + static_cast(c.gemm.src_factor_out), + static_cast(c.gemm.dst_factor_out)); + printf("\tinner loop - iter: %d, wgt: %d, inp: %d, acc: %d\n", + static_cast(c.gemm.iter_in), + static_cast(c.gemm.wgt_factor_in), + static_cast(c.gemm.src_factor_in), + static_cast(c.gemm.dst_factor_in)); + } else if (c.mem.opcode == 
OPCODE_ALU) { + // Print instruction field information + printf("ALU - %s\n", getOpcodeString(c.alu.alu_opcode, c.alu.use_imm)); + printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n", + static_cast(c.mem.pop_prev_dep), + static_cast(c.mem.pop_next_dep), + static_cast(c.mem.push_prev_dep), + static_cast(c.mem.push_next_dep)); + printf("\trange (%d, %d)\n", + static_cast(c.alu.uop_bgn), + static_cast(c.alu.uop_end)); + printf("\touter loop - iter: %d, dst: %d, src: %d\n", + static_cast(c.alu.iter_out), + static_cast(c.alu.dst_factor_out), + static_cast(c.alu.src_factor_out)); + printf("\tinner loop - iter: %d, dst: %d, src: %d\n", + static_cast(c.alu.iter_in), + static_cast(c.alu.dst_factor_in), + static_cast(c.alu.src_factor_in)); + } else if (c.mem.opcode == OPCODE_FINISH) { + printf("FINISH\n"); + } + + // Count status in queues + if (c.mem.opcode == OPCODE_LOAD || c.mem.opcode == OPCODE_STORE) { + if (c.mem.opcode == OPCODE_STORE) { + assert(c.mem.pop_next_dep == false); + assert(c.mem.push_next_dep == false); + if (c.mem.pop_prev_dep) g2s_queue--; + if (c.mem.push_prev_dep) s2g_queue++; + } else if (c.mem.opcode == OPCODE_LOAD && + (c.mem.memory_type == MEM_ID_INP || + c.mem.memory_type == MEM_ID_WGT) ) { + assert(c.mem.pop_prev_dep == false); + assert(c.mem.push_prev_dep == false); + if (c.mem.pop_next_dep) g2l_queue--; + if (c.mem.push_next_dep) l2g_queue++; + } else { + if (c.mem.pop_prev_dep) l2g_queue--; + if (c.mem.push_prev_dep) g2l_queue++; + if (c.mem.pop_next_dep) s2g_queue--; + if (c.mem.push_next_dep) g2s_queue++; + } + } else if (c.mem.opcode == OPCODE_GEMM || + c.mem.opcode == OPCODE_ALU) { + // Print instruction field information + if (c.gemm.pop_prev_dep) l2g_queue--; + if (c.gemm.push_prev_dep) g2l_queue++; + if (c.gemm.pop_next_dep) s2g_queue--; + if (c.gemm.push_next_dep) g2s_queue++; + } + printf("\tl2g_queue = %d, g2l_queue = %d\n", l2g_queue, g2l_queue); + printf("\ts2g_queue = %d, g2s_queue = %d\n", s2g_queue, 
g2s_queue); + } + } + + // Commit all pending pop of corresponding stage + void CommitPendingPop(int stage) { + // Handle the LD<->compute queue + // NOTE: pop executes on target(stage) + assert(stage > 0 && stage < 4); + if (pending_pop_prev_[stage] || + pending_pop_next_[stage]) { + PushNoop(stage, false, false, + pending_pop_prev_[stage], + pending_pop_next_[stage]); + pending_pop_prev_[stage] = 0; + pending_pop_next_[stage] = 0; + } + } + + void CommitPending() { + for (int i = kLoadStage; i <= kStoreStage; ++i) { + CommitPendingPop(i); + } + } + + bool PendingPop() { + for (int i = kLoadStage; i <= kStoreStage; ++i) { + if (pending_pop_prev_[i]) return true; + if (pending_pop_next_[i]) return true; + } + return false; + } + + protected: + /*! \return Add new instruction to the buffer. */ + VTAGenericInsn* NextInsn() { + VTAGenericInsn* insn = data() + dram_end_; + ++dram_end_; + assert(dram_end_ < kMaxElems); + return insn; + } + // Create a new instruction for a given stage + VTAGenericInsn* Create(PipelineStage stage) { + VTAGenericInsn* gptr = NextInsn(); + VTAMemInsn* mptr = reinterpret_cast(gptr); + mptr->pop_prev_dep = pending_pop_prev_[stage]; + mptr->pop_next_dep = pending_pop_next_[stage]; + mptr->push_prev_dep = false; + mptr->push_next_dep = false; + pending_pop_prev_[stage] = 0; + pending_pop_next_[stage] = 0; + return gptr; + } + // Get stage of the memory + static PipelineStage GetMemPipelineStage(int memory_type) { + if (memory_type == MEM_ID_ACC) return kComputeStage; + if (memory_type == MEM_ID_UOP) return kComputeStage; + return kLoadStage; + } + // Get stage of the computation + static PipelineStage GetPipelineStage(VTAMemInsn* insn) { + if (insn->opcode == OPCODE_GEMM) return kComputeStage; + if (insn->opcode == OPCODE_ALU) return kComputeStage; + if (insn->opcode == OPCODE_LOAD) { + if (insn->x_size == 0) return kNoneStage; + if (insn->memory_type == MEM_ID_ACC) return kComputeStage; + if (insn->memory_type == MEM_ID_UOP) return 
kComputeStage; + return kLoadStage; + } + if (insn->opcode == OPCODE_STORE) { + // FIXME: Right now memory_type is a 2-bit field which means that MEM_ID_OUT will appear as 0 + // For now we'll refrain from checking the memory_type to avoid an assertion error... + return kStoreStage; + } + assert(false); + return kNoneStage; + } + // Push no-op + void PushNoop(int stage, + bool push_prev_dep, bool push_next_dep, + bool pop_prev_dep, bool pop_next_dep) { + VTAMemInsn* insn = reinterpret_cast(NextInsn()); + insn->opcode = (stage==kStoreStage ? OPCODE_STORE : OPCODE_LOAD); + insn->push_prev_dep = push_prev_dep; + insn->push_next_dep = push_next_dep; + insn->pop_prev_dep = pop_prev_dep; + insn->pop_next_dep = pop_next_dep; + insn->sram_base = 0; + insn->dram_base = 0; + insn->y_size = 0; + insn->x_size = 0; + insn->x_stride = 0; + insn->y_pad_0 = 0; + insn->y_pad_1 = 0; + insn->x_pad_0 = 0; + insn->x_pad_1 = 0; + insn->memory_type = (stage == kLoadStage ? MEM_ID_INP : MEM_ID_UOP); + } + + private: + // Pending pop of each instruction queue, qid=0 is not used + int pending_pop_prev_[4]; + int pending_pop_next_[4]; + static constexpr int kElemBytes = sizeof(VTAGenericInsn); + static constexpr int kMaxElems = kMaxBytes / kElemBytes; +}; + +/*! + * \brief The command queue object that handles the request. 
+ */ +class CommandQueue { + public: + CommandQueue() { + this->InitSpace(); + } + void InitSpace() { + uop_queue_.InitSpace(); + insn_queue_.InitSpace(); + // VTA stage handles + vta_fetch_handle_ = VTAMapRegister(VTA_FETCH_ADDR, VTA_RANGE); + vta_load_handle_ = VTAMapRegister(VTA_LOAD_ADDR, VTA_RANGE); + vta_compute_handle_ = VTAMapRegister(VTA_COMPUTE_ADDR, VTA_RANGE); + vta_store_handle_ = VTAMapRegister(VTA_STORE_ADDR, VTA_RANGE); + printf("Initialize VTACommandHandle...\n"); + } + + ~CommandQueue() { + // Close VTA stage handle + VTAUnmapRegister(vta_fetch_handle_, VTA_RANGE); + VTAUnmapRegister(vta_load_handle_, VTA_RANGE); + VTAUnmapRegister(vta_compute_handle_, VTA_RANGE); + VTAUnmapRegister(vta_store_handle_, VTA_RANGE); + printf("Close VTACommandhandle...\n"); + } + + uint32_t GetElemBytes(uint32_t memory_id) { + switch (memory_id){ + case MEM_ID_UOP: return UOP_ELEM_BYTES; + case MEM_ID_INP: return INP_ELEM_BYTES; + case MEM_ID_WGT: return WGT_ELEM_BYTES; + case MEM_ID_ACC: return ACC_ELEM_BYTES; + case MEM_ID_OUT: return INP_ELEM_BYTES; + default: break; + } + printf("Memory id not recognized: %d\n", memory_id); + assert(false); + return 0; + } + + void LoadBuffer2D(void* src_dram_addr, + uint32_t src_elem_offset, + uint32_t x_size, + uint32_t y_size, + uint32_t x_stride, + uint32_t x_pad_before, + uint32_t y_pad_before, + uint32_t x_pad_after, + uint32_t y_pad_after, + uint32_t dst_sram_index, + uint32_t dst_memory_type) { + VTAMemInsn* insn = insn_queue_.CreateMemInsn(dst_memory_type); + insn->opcode = OPCODE_LOAD; + insn->memory_type = dst_memory_type; + insn->sram_base = dst_sram_index; + DataBuffer* src = DataBuffer::FromHandle(src_dram_addr); + insn->dram_base = src->phy_addr() / GetElemBytes(dst_memory_type) + src_elem_offset; + insn->y_size = y_size; + insn->x_size = x_size; + insn->x_stride = x_stride; + insn->y_pad_0 = y_pad_before; + insn->y_pad_1 = y_pad_after; + insn->x_pad_0 = x_pad_before; + insn->x_pad_1 = x_pad_after; + 
this->CheckInsnOverFlow(); + } + + void StoreBuffer2D(uint32_t src_sram_index, + uint32_t src_memory_type, + void* dst_dram_addr, + uint32_t dst_elem_offset, + uint32_t x_size, + uint32_t y_size, + uint32_t x_stride) { + VTAMemInsn* insn = insn_queue_.CreateStoreInsn(); + insn->opcode = OPCODE_STORE; + insn->memory_type = src_memory_type; + insn->sram_base = src_sram_index; + DataBuffer* dst = DataBuffer::FromHandle(dst_dram_addr); + insn->dram_base = dst->phy_addr() / GetElemBytes(src_memory_type) + dst_elem_offset; + insn->y_size = y_size; + insn->x_size = x_size; + insn->x_stride = x_stride; + insn->y_pad_0 = 0; + insn->y_pad_1 = 0; + insn->x_pad_0 = 0; + insn->x_pad_1 = 0; + this->CheckInsnOverFlow(); + } + + void DepPush(int from_qid, int to_qid) { + insn_queue_.DepPush(from_qid, to_qid); + } + + void DepPop(int from_qid, int to_qid) { + insn_queue_.DepPop(from_qid, to_qid); + } + + void ReadBarrier(void* buffer, uint32_t elem_bits, uint32_t start, uint32_t extent) { + if (!(debug_flag_ & VTA_DEBUG_SKIP_READ_BARRIER)) { + uint32_t elem_bytes = (elem_bits + 8 - 1) / 8; + DataBuffer::FromHandle(buffer)->FlushCache( + elem_bytes * start, elem_bytes * extent); + } + } + + void WriteBarrier(void* buffer, uint32_t elem_bits, uint32_t start, uint32_t extent) { + if (!(debug_flag_ & VTA_DEBUG_SKIP_WRITE_BARRIER)) { + uint32_t elem_bytes = (elem_bits + 8 - 1) / 8; + DataBuffer::FromHandle(buffer)->InvalidateCache( + elem_bytes * start, elem_bytes * extent); + } + } + + void Synchronize(uint32_t wait_cycles) { + // Insert dependences to force serialization + if (debug_flag_ & VTA_DEBUG_FORCE_SERIAL) { + insn_queue_.RewriteForceSerial(); + } + // This will issue finish after last store finishes + insn_queue_.DepPush(kStoreStage, kComputeStage); + insn_queue_.DepPush(kLoadStage, kComputeStage); + insn_queue_.DepPop(kStoreStage, kComputeStage); + insn_queue_.DepPop(kLoadStage, kComputeStage); + insn_queue_.CommitPendingPop(kComputeStage); + // NOTE: FINISH cannot contain 
pop + VTAGemInsn* insn = insn_queue_.CreateGemInsn(); + insn->opcode = OPCODE_FINISH; + assert(!insn_queue_.PendingPop()); + // Check if there are no instruction to execute at all + if (insn_queue_.count() == 0) return; + // Synchronization for the queues + uop_queue_.AutoReadBarrier(); + insn_queue_.AutoReadBarrier(); + // Dump instructions if debug enabled + if (debug_flag_ & VTA_DEBUG_DUMP_INSN) { + insn_queue_.DumpInsn(); + } + // Make sure that the last instruction is a finish instruction + assert(reinterpret_cast( + insn_queue_.data())[insn_queue_.count()-1].opcode == OPCODE_FINISH); + +#ifdef PYNQ_TARGET + // Make sure that we don't exceed contiguous physical memory limits + assert(insn_queue_.count() < MAX_XFER); + + // NOTE: Register address map is derived from the auto-generated + // driver files available under hardware/build/vivado//export/driver + // FETCH @ 0x10 : Data signal of insn_count_V + VTAWriteMappedReg(vta_fetch_handle_, 0x10, insn_queue_.count()); + // FETCH @ 0x18 : Data signal of insns_V + VTAWriteMappedReg(vta_fetch_handle_, 0x18, insn_queue_.dram_phy_addr()); + // LOAD @ 0x10 : Data signal of inputs_V + VTAWriteMappedReg(vta_load_handle_, 0x10, 0); + // LOAD @ 0x18 : Data signal of weight_V + VTAWriteMappedReg(vta_load_handle_, 0x18, 0); + // COMPUTE @ 0x10 : Data signal of uops_V + VTAWriteMappedReg(vta_compute_handle_, 0x20, 0); + // COMPUTE @ 0x18 : Data signal of biases_V + VTAWriteMappedReg(vta_compute_handle_, 0x28, 0); + // STORE @ 0x10 : Data signal of outputs_V + VTAWriteMappedReg(vta_store_handle_, 0x10, 0); + + // VTA start + VTAWriteMappedReg(vta_fetch_handle_, 0x0, VTA_START); + VTAWriteMappedReg(vta_load_handle_, 0x0, VTA_AUTORESTART); + VTAWriteMappedReg(vta_compute_handle_, 0x0, VTA_AUTORESTART); + VTAWriteMappedReg(vta_store_handle_, 0x0, VTA_AUTORESTART); + + // Loop until the VTA is done + unsigned t, flag = 0; + for (t = 0; t < wait_cycles; ++t) { + flag = VTAReadMappedReg(vta_compute_handle_, 0x18); + if (flag == 
VTA_DONE) break; + std::this_thread::yield(); + } + // Report error if timeout + assert(t < wait_cycles); +#endif //PYNQ_TARGET + + // Reset buffers + uop_queue_.Reset(); + insn_queue_.Reset(); + } + + // Get record kernel + UopKernel* record_kernel() const { + assert(record_kernel_ != nullptr); + return record_kernel_; + } + + // Set debug flag + void SetDebugFlag(int debug_flag) { + debug_flag_ = debug_flag; + } + + void PushGEMMOp(void** uop_handle, + int (*finit)(void*), + void* signature, + int nbytes) { + UopKernelMap** uptr = reinterpret_cast(uop_handle); + if (uptr[0] == nullptr) { + uptr[0] = new UopKernelMap(); + } + UopKernel** kptr = uptr[0]->Get(signature, nbytes); + if (kptr[0] == nullptr) { + record_kernel_ = new UopKernel(static_cast(signature), nbytes); + assert((*finit)(signature) == 0); + kptr[0] = static_cast(record_kernel_); + if (debug_flag_ & VTA_DEBUG_DUMP_UOP) { + record_kernel_->Dump(); + } + record_kernel_ = nullptr; + } + this->PushGEMMOp(static_cast(kptr[0])); + this->CheckInsnOverFlow(); + } + + void PushALUUop(void** uop_handle, + int (*finit)(void*), + void* signature, + int nbytes) { + UopKernelMap** uptr = reinterpret_cast(uop_handle); + if (uptr[0] == nullptr) { + uptr[0] = new UopKernelMap(); + } + UopKernel** kptr = uptr[0]->Get(signature, nbytes); + if (kptr[0] == nullptr) { + record_kernel_ = new UopKernel(static_cast(signature), nbytes); + assert((*finit)(signature) == 0); + kptr[0] = static_cast(record_kernel_); + if (debug_flag_ & VTA_DEBUG_DUMP_UOP) { + record_kernel_->Dump(); + } + record_kernel_ = nullptr; + } + this->PushALUUop(static_cast(kptr[0])); + this->CheckInsnOverFlow(); + } + + static std::shared_ptr& ThreadLocal() { + static std::shared_ptr inst = + std::make_shared(); + return inst; + } + + static void Shutdown() { + ThreadLocal().reset(); + } + + private: + // Push GEMM uop to the command buffer + void PushGEMMOp(UopKernel* kernel) { + uop_queue_.Push(kernel, + [this]() { this->AutoSync(); }); + if 
(uop_queue_.pending()) { + VTAMemInsn* insn = insn_queue_.CreateMemInsn(MEM_ID_UOP); + insn->opcode = OPCODE_LOAD; + uop_queue_.FlushUopLoad(insn); + } + VTAGemInsn* insn = insn_queue_.CreateGemInsn(); + insn->opcode = OPCODE_GEMM; + insn->uop_bgn = kernel->sram_begin_; + insn->uop_end = kernel->sram_end_; + const std::vector &loop = kernel->loop(); + if (loop.size() > 0) { + insn->iter_out = loop[0].extent; + insn->wgt_factor_out = loop[0].wgt_factor; + insn->src_factor_out = loop[0].src_factor; + insn->dst_factor_out = loop[0].dst_factor; + } else { + insn->iter_out = 1; + insn->wgt_factor_out = 0; + insn->src_factor_out = 0; + insn->dst_factor_out = 0; + } + if (loop.size() > 1) { + insn->iter_in = loop[1].extent; + insn->wgt_factor_in = loop[1].wgt_factor; + insn->src_factor_in = loop[1].src_factor; + insn->dst_factor_in = loop[1].dst_factor; + } else { + insn->iter_in = 1; + insn->wgt_factor_in = 0; + insn->src_factor_in = 0; + insn->dst_factor_in = 0; + } + } + + // Push ALU uop to the command buffer + void PushALUUop(UopKernel* kernel) { + uop_queue_.Push(kernel, + [this]() { this->AutoSync(); }); + if (uop_queue_.pending()) { + VTAMemInsn* insn = insn_queue_.CreateMemInsn(MEM_ID_UOP); + insn->opcode = OPCODE_LOAD; + uop_queue_.FlushUopLoad(insn); + } + VTAAluInsn* insn = insn_queue_.CreateAluInsn(); + insn->opcode = OPCODE_ALU; + insn->uop_bgn = kernel->sram_begin_; + insn->uop_end = kernel->sram_end_; + insn->alu_opcode = kernel->opcode_; + insn->use_imm = kernel->use_imm_; + insn->imm = kernel->imm_val_; + const std::vector &loop = kernel->loop(); + if (loop.size() == 0) { + insn->iter_out = 1; + insn->dst_factor_out = 0; + insn->src_factor_out = 0; + insn->iter_in = 1; + insn->dst_factor_in = 0; + insn->src_factor_in = 0; + } else if (loop.size() == 1) { + insn->iter_out = 1; + insn->dst_factor_out = 0; + insn->src_factor_out = 0; + insn->iter_in = loop[0].extent; + insn->dst_factor_in = loop[0].dst_factor; + insn->src_factor_in = loop[0].src_factor; + } 
else { + insn->iter_out = loop[0].extent; + insn->dst_factor_out = loop[0].dst_factor; + insn->src_factor_out = loop[0].src_factor; + insn->iter_in = loop[1].extent; + insn->dst_factor_in = loop[1].dst_factor; + insn->src_factor_in = loop[1].src_factor; + } + } + + void CheckInsnOverFlow() { + // At each API call, we can at most commit: + // one pending store, one pending load, and one uop + if (insn_queue_.count() >= MAX_XFER) { + this->AutoSync(); + } + } + // Auto sync when instruction overflow + void AutoSync() { + this->Synchronize(1 << 31); + } + // VTA handles (register maps) + VTAHandle vta_fetch_handle_{nullptr}; + VTAHandle vta_load_handle_{nullptr}; + VTAHandle vta_compute_handle_{nullptr}; + VTAHandle vta_store_handle_{nullptr}; + // Internal debug flag + int debug_flag_{0}; + // The kernel we currently recording + UopKernel* record_kernel_{nullptr}; + // Micro op queue + UopQueue uop_queue_; + // instruction queue + InsnQueue insn_queue_; +}; + +} // namespace vta + + +VTACommandHandle VTATLSCommandHandle() { + return vta::CommandQueue::ThreadLocal().get(); +} + +void VTARuntimeShutdown() { + vta::CommandQueue::Shutdown(); +} + +void* VTABufferAlloc(VTACommandHandle cmd, size_t size) { + return vta::DataBuffer::Alloc(size); +} + +void VTABufferFree(VTACommandHandle cmd, void* buffer) { + vta::DataBuffer::Free(vta::DataBuffer::FromHandle(buffer)); +} + +void* VTABufferCPUPtr(VTACommandHandle cmd, void* buffer) { + return vta::DataBuffer::FromHandle(buffer)->virt_addr(); +} + +void VTABufferCopy(VTACommandHandle cmd, + const void* from, + size_t from_offset, + void* to, + size_t to_offset, + size_t size, + int kind_mask) { + vta::DataBuffer* from_buffer = nullptr; + vta::DataBuffer* to_buffer = nullptr; + + if (kind_mask & 2) { + from_buffer = vta::DataBuffer::FromHandle(from); + from = from_buffer->virt_addr(); + } + if (kind_mask & 1) { + to_buffer = vta::DataBuffer::FromHandle(to); + to = to_buffer->virt_addr(); + } + if (from_buffer) { + 
from_buffer->InvalidateCache(from_offset, size); + } + + memcpy(static_cast(to) + to_offset, + static_cast(from) + from_offset, + size); + if (to_buffer) { + to_buffer->FlushCache(to_offset, size); + } +} + +void VTASetDebugMode(VTACommandHandle cmd, int debug_flag) { + static_cast(cmd)-> + SetDebugFlag(debug_flag); +} + +void VTAWriteBarrier(VTACommandHandle cmd, + void* buffer, uint32_t elem_bits, + uint32_t start, uint32_t extent) { + static_cast(cmd)-> + WriteBarrier(buffer, elem_bits, start, extent); +} + +void VTAReadBarrier(VTACommandHandle cmd, + void* buffer, uint32_t elem_bits, + uint32_t start, uint32_t extent) { + static_cast(cmd)-> + ReadBarrier(buffer, elem_bits, start, extent); +} + +void VTALoadBuffer2D(VTACommandHandle cmd, + void* src_dram_addr, + uint32_t src_elem_offset, + uint32_t x_size, + uint32_t y_size, + uint32_t x_stride, + uint32_t x_pad_before, + uint32_t y_pad_before, + uint32_t x_pad_after, + uint32_t y_pad_after, + uint32_t dst_sram_index, + uint32_t dst_memory_type) { + static_cast(cmd)-> + LoadBuffer2D(src_dram_addr, src_elem_offset, + x_size, y_size, x_stride, + x_pad_before, y_pad_before, + x_pad_after, y_pad_after, + dst_sram_index, dst_memory_type); +} + +void VTAStoreBuffer2D(VTACommandHandle cmd, + uint32_t src_sram_index, + uint32_t src_memory_type, + void* dst_dram_addr, + uint32_t dst_elem_offset, + uint32_t x_size, + uint32_t y_size, + uint32_t x_stride) { + static_cast(cmd)-> + StoreBuffer2D(src_sram_index, src_memory_type, + dst_dram_addr, dst_elem_offset, + x_size, y_size, x_stride); +} + +void VTAUopPush(uint32_t mode, + uint32_t reset_out, + uint32_t dst_index, + uint32_t src_index, + uint32_t wgt_index, + uint32_t opcode, + uint32_t use_imm, + uint32_t imm_val) { + vta::CommandQueue::ThreadLocal()->record_kernel() + ->Push(mode, reset_out, dst_index, src_index, + wgt_index, opcode, use_imm, imm_val); +} + +void VTAUopLoopBegin(uint32_t extent, + uint32_t dst_factor, + uint32_t src_factor, + uint32_t wgt_factor) { + 
vta::CommandQueue::ThreadLocal()->record_kernel() + ->PushLoopBegin(extent, dst_factor, src_factor, wgt_factor); +} + +void VTAUopLoopEnd() { + vta::CommandQueue::ThreadLocal()->record_kernel() + ->PushLoopEnd(); +} + +int VTAPushGEMMOp(void** uop_handle, + int (*finit)(void*), + void* signature, + int nbytes) { + vta::CommandQueue::ThreadLocal()-> + PushGEMMOp(uop_handle, finit, signature, nbytes); + return 0; +} + +int VTAPushALUOp(void** uop_handle, + int (*finit)(void*), + void* signature, + int nbytes) { + vta::CommandQueue::ThreadLocal()-> + PushALUUop(uop_handle, finit, signature, nbytes); + return 0; +} + +int VTADepPush(VTACommandHandle cmd, int from_qid, int to_qid) { + static_cast(cmd)-> + DepPush(from_qid, to_qid); + return 0; +} + +int VTADepPop(VTACommandHandle cmd, int from_qid, int to_qid) { + static_cast(cmd)-> + DepPop(from_qid, to_qid); + return 0; +} + +void VTASynchronize(VTACommandHandle cmd, uint32_t wait_cycles) { + static_cast(cmd)-> + Synchronize(wait_cycles); +} diff --git a/vta/src/tvm/vta_device_api.cc b/vta/src/tvm/vta_device_api.cc new file mode 100644 index 00000000..b686b65f --- /dev/null +++ b/vta/src/tvm/vta_device_api.cc @@ -0,0 +1,106 @@ +// simply include the driver for now. 
+#include +#include +#include +#include "../../tvm/src/runtime/workspace_pool.h" + +namespace tvm { +namespace runtime { + +std::string VTARPCGetPath(const std::string& name) { + static const PackedFunc* f = + runtime::Registry::Get("tvm.contrib.rpc.server.workpath"); + CHECK(f != nullptr) << "require tvm.contrib.rpc.server.workpath"; + return (*f)(name); +} + +// Global functions that can be called +TVM_REGISTER_GLOBAL("tvm.contrib.vta.init") +.set_body([](TVMArgs args, TVMRetValue* rv) { + std::string path = VTARPCGetPath(args[0]); + VTAProgram(path.c_str()); + LOG(INFO) << "VTA initialization end with bistream " << path; + }); + +TVM_REGISTER_GLOBAL("tvm.contrib.rpc.server.shutdown") +.set_body([](TVMArgs args, TVMRetValue* rv) { + VTARuntimeShutdown(); + }); + +class VTADeviceAPI final : public DeviceAPI { + public: + void SetDevice(TVMContext ctx) final {} + + void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final { + if (kind == kExist) { + *rv = 1; + } + } + + void* AllocDataSpace(TVMContext ctx, + size_t size, size_t alignment, + TVMType type_hint) final { + return VTABufferAlloc(VTATLSCommandHandle(), size); + } + + void FreeDataSpace(TVMContext ctx, void* ptr) final { + VTABufferFree(VTATLSCommandHandle(), ptr); + } + + void CopyDataFromTo(const void* from, + size_t from_offset, + void* to, + size_t to_offset, + size_t size, + TVMContext ctx_from, + TVMContext ctx_to, + TVMStreamHandle stream) final { + int kind_mask = 0; + if (ctx_from.device_type != kDLCPU) { + kind_mask |= 2; + } + if (ctx_to.device_type != kDLCPU) { + kind_mask |= 1; + } + VTABufferCopy(VTATLSCommandHandle(), + from, from_offset, + to, to_offset, + size, kind_mask); + } + + void StreamSync(TVMContext ctx, TVMStreamHandle stream) final { + } + + void* AllocWorkspace(TVMContext ctx, size_t size, TVMType type_hint) final; + + void FreeWorkspace(TVMContext ctx, void* data) final; + + static const std::shared_ptr& Global() { + static std::shared_ptr inst = + 
std::make_shared(); + return inst; + } +}; + +struct VTAWorkspacePool : public WorkspacePool { + VTAWorkspacePool() : + WorkspacePool(static_cast(kExtDev), + VTADeviceAPI::Global()) {} +}; + +void* VTADeviceAPI::AllocWorkspace(TVMContext ctx, size_t size, TVMType type_hint) { + return dmlc::ThreadLocalStore::Get() + ->AllocWorkspace(ctx, size); +} + +void VTADeviceAPI::FreeWorkspace(TVMContext ctx, void* data) { + dmlc::ThreadLocalStore::Get()->FreeWorkspace(ctx, data); +} + +TVM_REGISTER_GLOBAL("device_api.ext_dev") +.set_body([](TVMArgs args, TVMRetValue* rv) { + DeviceAPI* ptr = VTADeviceAPI::Global().get(); + *rv = static_cast(ptr); + }); +} // namespace runtime +} // namespace tvm diff --git a/vta/tests/driver/Makefile b/vta/tests/driver/Makefile deleted file mode 100644 index dad8dbed..00000000 --- a/vta/tests/driver/Makefile +++ /dev/null @@ -1,59 +0,0 @@ -CC ?= g++ -CFLAGS = -Wall -O3 -std=c++11 -I/usr/include -LDFLAGS = -L/usr/lib -L/home/xilinx/pynq/drivers -LIBS = -l:libsds_lib.so -l:libdma.so -SRC_DIR = ../../src -INCLUDE_DIR = ../../include -DRIVER_DIR = $(SRC_DIR)/driver/pynq -TESTLIB_DIR = $(SRC_DIR)/test -VPATH = $(DRIVER_DIR):$(TESTLIB_DIR) -SOURCES = vta_pynq_driver.c vta_test_lib.cc -OBJECTS = vta_pynq_driver.o vta_test_lib.o driver_test.o -EXECUTABLE = vta - -# VTA Parameters -# Log of input width in bits -LOG_INP_WIDTH = 3 -# Log of weight width in bits -LOG_WGT_WIDTH = 3 -# Log of accum width in bits -LOG_ACC_WIDTH = 5 -# Log of output width in bits -LOG_OUT_WIDTH = $(LOG_INP_WIDTH) -# Log of tensor batch size (A in (A,B)x(B,C) matrix multiplication) -LOG_BATCH = 0 -# Log of tensor inner block size (B in (A,B)x(B,C) matrix multiplication) -LOG_IN_BLOCK = 4 -# Log of tensor outer block size (C in (A,B)x(B,C) matrix multiplication) -LOG_OUT_BLOCK = 4 -# Log of uop buffer size in Bytes -LOG_UOP_BUFF_SIZE = 15 -# Log of inp buffer size in Bytes -LOG_INP_BUFF_SIZE = 15 -# Log of wgt buffer size in Bytes -LOG_WGT_BUFF_SIZE = 15 -# Log of acc buffer 
size in Bytes -LOG_ACC_BUFF_SIZE = 17 -# Log of out buffer size in Bytes -LOG_OUT_BUFF_SIZE = $(shell echo "$$(( $(LOG_ACC_BUFF_SIZE)+$(LOG_OUT_WIDTH)-$(LOG_ACC_WIDTH) ))" ) - -# Define flags -CFLAGS += -I $(INCLUDE_DIR) -DNO_SIM \ - -DDEBUG=0 -DLOG_WGT_WIDTH=$(LOG_WGT_WIDTH) -DLOG_INP_WIDTH=$(LOG_INP_WIDTH) \ - -DLOG_ACC_WIDTH=$(LOG_ACC_WIDTH) -DLOG_OUT_WIDTH=$(LOG_OUT_WIDTH) \ - -DLOG_BATCH=$(LOG_BATCH) -DLOG_BLOCK_IN=$(LOG_IN_BLOCK) -DLOG_BLOCK_OUT=$(LOG_OUT_BLOCK) \ - -DLOG_UOP_BUFF_SIZE=$(LOG_UOP_BUFF_SIZE) -DLOG_INP_BUFF_SIZE=$(LOG_INP_BUFF_SIZE) \ - -DLOG_WGT_BUFF_SIZE=$(LOG_WGT_BUFF_SIZE) -DLOG_ACC_BUFF_SIZE=$(LOG_ACC_BUFF_SIZE) \ - -DLOG_OUT_BUFF_SIZE=$(LOG_OUT_BUFF_SIZE) - -# All Target -all: $(EXECUTABLE) - -%.o: %.cc $(SOURCES) - $(CC) -c -o $@ $< $(CFLAGS) - -$(EXECUTABLE): $(OBJECTS) - $(CC) $(LDFLAGS) $(OBJECTS) -o $@ $(LIBS) - -clean: - rm -rf *.o $(EXECUTABLE) diff --git a/vta/src/test/vta_test_lib.cc b/vta/tests/hardware/common/test_lib.cc similarity index 97% rename from vta/src/test/vta_test_lib.cc rename to vta/tests/hardware/common/test_lib.cc index df8fce0d..d203b2aa 100644 --- a/vta/src/test/vta_test_lib.cc +++ b/vta/tests/hardware/common/test_lib.cc @@ -4,7 +4,7 @@ * \brief Test library for the VTA design simulation and driver tests. 
*/ -#include "vta_test_lib.h" +#include "./test_lib.h" const char* getOpcodeString(int opcode, bool use_imm) { // Returns string name @@ -153,7 +153,7 @@ void free3dArray(T *** array, int rows, int cols, int depth) { void * allocBuffer(size_t num_bytes) { #ifdef NO_SIM - return cma_alloc(num_bytes, CACHED); + return VTAMemAlloc(num_bytes, CACHED); #else return malloc(num_bytes); #endif @@ -161,7 +161,7 @@ void * allocBuffer(size_t num_bytes) { void freeBuffer(void * buffer) { #ifdef NO_SIM - return cma_free(buffer); + return VTAMemFree(buffer); #else return free(buffer); #endif @@ -353,7 +353,7 @@ VTAUop * getCopyUops(int y_size, int x_size, int uop_compression) { // Allocate buffer #ifdef NO_SIM - VTAUop *uop_buf = (VTAUop *) cma_alloc(sizeof(VTAUop) * uop_size, CACHED); + VTAUop *uop_buf = (VTAUop *) VTAMemAlloc(sizeof(VTAUop) * uop_size, CACHED); #else VTAUop *uop_buf = (VTAUop *) malloc(sizeof(VTAUop) * uop_size); #endif @@ -388,7 +388,7 @@ VTAUop * getGEMMUops(int batch, int in_feat, int out_feat, bool uop_compression, // Allocate buffer #ifdef NO_SIM - VTAUop *uop_buf = (VTAUop *) cma_alloc(sizeof(VTAUop) * uop_size, CACHED); + VTAUop *uop_buf = (VTAUop *) VTAMemAlloc(sizeof(VTAUop) * uop_size, CACHED); #else VTAUop *uop_buf = (VTAUop *) malloc(sizeof(VTAUop) * uop_size); #endif @@ -449,7 +449,7 @@ VTAUop * getMapALUUops(int vector_size, bool uop_compression) { // Allocate buffer #ifdef NO_SIM - VTAUop *uop_buf = (VTAUop *) cma_alloc(sizeof(VTAUop) * uop_size, CACHED); + VTAUop *uop_buf = (VTAUop *) VTAMemAlloc(sizeof(VTAUop) * uop_size, CACHED); #else VTAUop *uop_buf = (VTAUop *) malloc(sizeof(VTAUop) * uop_size); #endif @@ -762,7 +762,7 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp } // Compute reference output - inp_T **outputs_ref = alloc2dArray(batch, vector_size); + out_T **outputs_ref = alloc2dArray(batch, vector_size); for (int i = 0; i < batch; i ++) { for (int j = 0; j < vector_size; j ++) { acc_T tmp = 0; @@ 
-802,7 +802,7 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp tmp = inputs[i][j] >> immediate[i / BATCH]; } // Set - outputs_ref[i][j] = (inp_T) tmp; + outputs_ref[i][j] = (out_T) tmp; } } @@ -811,7 +811,7 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp packBuffer(bias_buf, inputs, batch, vector_size * input_sets, BATCH, BLOCK_OUT); // Prepare output buffer - inp_T *output_buf = (inp_T *) allocBuffer(INP_ELEM_BYTES * batch * tx_size * input_sets); + out_T *output_buf = (out_T *) allocBuffer(INP_ELEM_BYTES * batch * tx_size * input_sets); #ifdef NO_SIM // Invoke the VTA @@ -833,8 +833,8 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp #endif // Unpack output buffer - inp_T **outputs = alloc2dArray(batch, vector_size); - unpackBuffer(outputs, output_buf, batch, vector_size, BATCH, BLOCK_OUT); + out_T **outputs = alloc2dArray(batch, vector_size); + unpackBuffer(outputs, output_buf, batch, vector_size, BATCH, BLOCK_OUT); // Correctness checks int err = 0; @@ -853,8 +853,8 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp // Free all allocated arrays free(immediate); free2dArray(inputs, batch, vector_size * input_sets); - free2dArray(outputs_ref, batch, vector_size); - free2dArray(outputs, batch, vector_size); + free2dArray(outputs_ref, batch, vector_size); + free2dArray(outputs, batch, vector_size); freeBuffer(insn_buf); freeBuffer(uop_buf); freeBuffer(bias_buf); @@ -891,17 +891,17 @@ virtual_threads=%d\n", int ins_size = batch / block * out_feat / block * (2 + in_feat / block * 3) + 2; int uop_size = uop_compression ? 
block / BATCH * virtual_threads : block / BATCH * block / BLOCK_IN * block / BLOCK_OUT * virtual_threads; - int wgt_size = in_feat / BLOCK_IN * out_feat / BLOCK_OUT; int inp_size = batch / BATCH * in_feat / BLOCK_IN; + int wgt_size = in_feat / BLOCK_IN * out_feat / BLOCK_OUT; int out_size = batch / BATCH * out_feat / BLOCK_OUT; // Blocked buffer sizes (in terms of elements) - int wgt_block_size = block / BLOCK_IN * block / BLOCK_OUT; int inp_block_size = block / BATCH * block / BLOCK_IN; + int wgt_block_size = block / BLOCK_IN * block / BLOCK_OUT; int out_block_size = block / BATCH * block / BLOCK_OUT; // Make sure we don't exceed buffer bounds assert(uop_size <= UOP_BUFF_DEPTH); - assert(wgt_block_size <= WGT_BUFF_DEPTH); assert(inp_block_size <= INP_BUFF_DEPTH); + assert(wgt_block_size <= WGT_BUFF_DEPTH); assert(out_block_size <= ACC_BUFF_DEPTH); // Initialize instruction buffer @@ -1017,15 +1017,15 @@ virtual_threads=%d\n", printMicroOp(uop_size, uop_buf); #endif - // Initialize weights - wgt_T **weights = allocInit2dArray(out_feat, in_feat); // Initialize inputs inp_T **inputs = allocInit2dArray(batch, in_feat); + // Initialize weights + wgt_T **weights = allocInit2dArray(out_feat, in_feat); // Initialize biases acc_T **biases = allocInit2dArray(batch, out_feat); // Reference GEMM implementation - inp_T **outputs_ref = alloc2dArray(batch, out_feat); + out_T **outputs_ref = alloc2dArray(batch, out_feat); for (int i = 0; i < batch; i ++) { for (int j = 0; j < out_feat; j ++) { acc_T sum = biases[i][j]; @@ -1033,21 +1033,21 @@ virtual_threads=%d\n", sum += (acc_T) (inputs[i][k] * weights[j][k]); } // Set - outputs_ref[i][j] = (inp_T) sum; + outputs_ref[i][j] = (out_T) sum; } } - // Prepare the weight buffer - wgt_T *weight_buf = (wgt_T *) allocBuffer(WGT_ELEM_BYTES * wgt_size); - packBuffer(weight_buf, weights, out_feat, in_feat, BLOCK_OUT, BLOCK_IN); // Prepare the input buffer inp_T *input_buf = (inp_T *) allocBuffer(INP_ELEM_BYTES * inp_size); 
packBuffer(input_buf, inputs, batch, in_feat, BATCH, BLOCK_IN); + // Prepare the weight buffer + wgt_T *weight_buf = (wgt_T *) allocBuffer(WGT_ELEM_BYTES * wgt_size); + packBuffer(weight_buf, weights, out_feat, in_feat, BLOCK_OUT, BLOCK_IN); // Prepare the bias buffer acc_T *bias_buf = (acc_T *) allocBuffer(ACC_ELEM_BYTES * out_size); packBuffer(bias_buf, biases, batch, out_feat, BATCH, BLOCK_OUT); // Prepare the output buffer - inp_T *output_buf = (inp_T *) allocBuffer(INP_ELEM_BYTES * out_size); + out_T *output_buf = (out_T *) allocBuffer(INP_ELEM_BYTES * out_size); #ifdef NO_SIM // Invoke the VTA @@ -1069,8 +1069,8 @@ virtual_threads=%d\n", #endif // Unpack output data - inp_T **outputs = alloc2dArray(batch, out_feat); - unpackBuffer(outputs, output_buf, batch, out_feat, BATCH, BLOCK_OUT); + out_T **outputs = alloc2dArray(batch, out_feat); + unpackBuffer(outputs, output_buf, batch, out_feat, BATCH, BLOCK_OUT); // Correctness checks int err = 0; @@ -1087,15 +1087,15 @@ virtual_threads=%d\n", } // Free all allocated arrays - free2dArray(weights, out_feat, in_feat); free2dArray(inputs, batch, in_feat); + free2dArray(weights, out_feat, in_feat); free2dArray(biases, batch, out_feat); - free2dArray(outputs_ref, batch, out_feat); - free2dArray(outputs, batch, out_feat); + free2dArray(outputs_ref, batch, out_feat); + free2dArray(outputs, batch, out_feat); freeBuffer((void *) insn_buf); freeBuffer((void *) uop_buf); - freeBuffer((void *) weight_buf); freeBuffer((void *) input_buf); + freeBuffer((void *) weight_buf); freeBuffer((void *) bias_buf); freeBuffer((void *) output_buf); diff --git a/vta/include/vta_test_lib.h b/vta/tests/hardware/common/test_lib.h similarity index 97% rename from vta/include/vta_test_lib.h rename to vta/tests/hardware/common/test_lib.h index b4eb1684..fad2e4da 100644 --- a/vta/include/vta_test_lib.h +++ b/vta/tests/hardware/common/test_lib.h @@ -7,21 +7,25 @@ #ifndef VTA_TESTLIB_H_ #define VTA_TESTLIB_H_ -#include "vta_params.h" - #include 
#include #include #include +#include #ifdef NO_SIM -#include "vta_pynq_driver.h" +#include + +#ifdef PYNQ_TARGET +#include "../../../src/pynq/pynq_driver.h" +#endif //PYNQ_TARGET typedef uint64_t axi_T; typedef uint32_t uop_T; typedef int8_t wgt_T; typedef int8_t inp_T; +typedef int8_t out_T; typedef int32_t acc_T; uint64_t vta ( @@ -35,8 +39,7 @@ uint64_t vta ( #else //NO_SIM -#include "vta.h" -#include "vta_typedefs.h" +#include "../../../hardware/vivado/src/vta.h" #endif //NO_SIM diff --git a/vta/tests/hardware/pynq/Makefile b/vta/tests/hardware/pynq/Makefile new file mode 100644 index 00000000..7e70366f --- /dev/null +++ b/vta/tests/hardware/pynq/Makefile @@ -0,0 +1,37 @@ +CC ?= g++ +CFLAGS = -Wall -O3 -std=c++11 -I/usr/include +LDFLAGS = -L/usr/lib -L/home/xilinx/pynq/drivers +LIBS = -l:libsds_lib.so -l:libdma.so +INCLUDE_DIR = ../../../include +DRIVER_DIR = ../../../src/pynq +TESTLIB_DIR = ../common +VPATH = $(DRIVER_DIR):$(TESTLIB_DIR) +SOURCES = pynq_driver.cc test_lib.cc +OBJECTS = pynq_driver.o test_lib.o metal_test.o +EXECUTABLE = vta + +# Include top-level config file +ifndef config +ifneq ("$(wildcard ../../../config.mk)", "") + config = ../../../config.mk +else + config = ../../../make/config.mk +endif +endif +include $(config) + +# Define flags +CFLAGS += -I $(INCLUDE_DIR) -DNO_SIM -DDEBUG=0 +CFLAGS += $(ADD_CFLAGS) + +# All Target +all: $(EXECUTABLE) + +%.o: %.cc $(SOURCES) + $(CC) -c -o $@ $< $(CFLAGS) + +$(EXECUTABLE): $(OBJECTS) + $(CC) $(LDFLAGS) $(OBJECTS) -o $@ $(LIBS) + +clean: + rm -rf *.o $(EXECUTABLE) diff --git a/vta/tests/driver/driver_test.cc b/vta/tests/hardware/pynq/metal_test.cc similarity index 71% rename from vta/tests/driver/driver_test.cc rename to vta/tests/hardware/pynq/metal_test.cc index 6cdc32a9..b5147399 100644 --- a/vta/tests/driver/driver_test.cc +++ b/vta/tests/hardware/pynq/metal_test.cc @@ -9,8 +9,9 @@ #include #include #include -#include "vta_test_lib.h" -#include "vta_pynq_driver.h" +#include +#include 
"../../../src/pynq/pynq_driver.h" +#include "../common/test_lib.h" // VTA invocation (present the same abstraction as in the simulation tests) uint64_t vta ( @@ -43,18 +44,18 @@ uint64_t vta ( #endif // Program VTA - ProgramVTA(bitstream); + VTAProgram(bitstream); // Get VTA handles - VTAHandle vta_fetch_handle = MapRegister(VTA_FETCH_ADDR, VTA_RANGE); - VTAHandle vta_load_handle = MapRegister(VTA_LOAD_ADDR, VTA_RANGE); - VTAHandle vta_compute_handle = MapRegister(VTA_COMPUTE_ADDR, VTA_RANGE); - VTAHandle vta_store_handle = MapRegister(VTA_STORE_ADDR, VTA_RANGE); + VTAHandle vta_fetch_handle = VTAMapRegister(VTA_FETCH_ADDR, VTA_RANGE); + VTAHandle vta_load_handle = VTAMapRegister(VTA_LOAD_ADDR, VTA_RANGE); + VTAHandle vta_compute_handle = VTAMapRegister(VTA_COMPUTE_ADDR, VTA_RANGE); + VTAHandle vta_store_handle = VTAMapRegister(VTA_STORE_ADDR, VTA_RANGE); // Physical address pointers uint32_t insn_phy = insns ? cma_get_phy_addr(insns) : 0; uint32_t uop_phy = uops ? cma_get_phy_addr(uops) : 0; - uint32_t weight_phy = weights ? cma_get_phy_addr(weights) : 0; uint32_t input_phy = inputs ? cma_get_phy_addr(inputs) : 0; + uint32_t weight_phy = weights ? cma_get_phy_addr(weights) : 0; uint32_t bias_phy = biases ? cma_get_phy_addr(biases) : 0; uint32_t output_phy = outputs ? 
cma_get_phy_addr(outputs) : 0; @@ -65,29 +66,29 @@ uint64_t vta ( clock_gettime(CLOCK_REALTIME, &start); // FETCH @ 0x10 : Data signal of insn_count_V - WriteMappedReg(vta_fetch_handle, 0x10, insn_count); + VTAWriteMappedReg(vta_fetch_handle, 0x10, insn_count); // FETCH @ 0x18 : Data signal of insns_V - if (insns) WriteMappedReg(vta_fetch_handle, 0x18, insn_phy); - // LOAD @ 0x10 : Data signal of weight_V - if (weights) WriteMappedReg(vta_load_handle, 0x10, weight_phy); - // LOAD @ 0x18 : Data signal of inputs_V - if (inputs) WriteMappedReg(vta_load_handle, 0x18, input_phy); + if (insns) VTAWriteMappedReg(vta_fetch_handle, 0x18, insn_phy); + // LOAD @ 0x10 : Data signal of inputs_V + if (inputs) VTAWriteMappedReg(vta_load_handle, 0x10, input_phy); + // LOAD @ 0x18 : Data signal of weight_V + if (weights) VTAWriteMappedReg(vta_load_handle, 0x18, weight_phy); // COMPUTE @ 0x20 : Data signal of uops_V - if (uops) WriteMappedReg(vta_compute_handle, 0x20, uop_phy); + if (uops) VTAWriteMappedReg(vta_compute_handle, 0x20, uop_phy); // COMPUTE @ 0x28 : Data signal of biases_V - if (biases) WriteMappedReg(vta_compute_handle, 0x28, bias_phy); + if (biases) VTAWriteMappedReg(vta_compute_handle, 0x28, bias_phy); // STORE @ 0x10 : Data signal of outputs_V - if (outputs) WriteMappedReg(vta_store_handle, 0x10, output_phy); + if (outputs) VTAWriteMappedReg(vta_store_handle, 0x10, output_phy); // VTA start - WriteMappedReg(vta_fetch_handle, 0x0, 0x1); - WriteMappedReg(vta_load_handle, 0x0, 0x81); - WriteMappedReg(vta_compute_handle, 0x0, 0x81); - WriteMappedReg(vta_store_handle, 0x0, 0x81); + VTAWriteMappedReg(vta_fetch_handle, 0x0, 0x1); + VTAWriteMappedReg(vta_load_handle, 0x0, 0x81); + VTAWriteMappedReg(vta_compute_handle, 0x0, 0x81); + VTAWriteMappedReg(vta_store_handle, 0x0, 0x81); int flag = 0, t = 0; for (t = 0; t < 10000000; ++t) { - flag = ReadMappedReg(vta_compute_handle, 0x18); + flag = VTAReadMappedReg(vta_compute_handle, 0x18); if (flag & VTA_DONE) break; } @@ -104,10 
+105,10 @@ uint64_t vta ( t_fpga = 1000000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec); // Unmap VTA register - UnmapRegister(vta_fetch_handle, VTA_RANGE); - UnmapRegister(vta_load_handle, VTA_RANGE); - UnmapRegister(vta_compute_handle, VTA_RANGE); - UnmapRegister(vta_store_handle, VTA_RANGE); + VTAUnmapRegister(vta_fetch_handle, VTA_RANGE); + VTAUnmapRegister(vta_load_handle, VTA_RANGE); + VTAUnmapRegister(vta_compute_handle, VTA_RANGE); + VTAUnmapRegister(vta_store_handle, VTA_RANGE); return t_fpga; };