hardware compilation flow, and driver tests
This commit is contained in:
Родитель
b8d8e5b6e8
Коммит
470018503f
|
@ -1,2 +1,12 @@
|
|||
# vta
|
||||
Open Hardware/Software Stack for Vertical Deep Learning System Optimization
|
||||
==============================================
|
||||
|
||||
[![GitHub license](http://dmlc.github.io/img/apache2.svg)](./LICENSE)
|
||||
|
||||
VTA is an open hardware/software co-design stack for deep learning systems.
|
||||
It provides a customizable hardware accelerator template for deep learning inference workloads,
|
||||
combined with a fully functional compiler stack built with TVM.
|
||||
|
||||
License
|
||||
-------
|
||||
© Contributors, 2018. Licensed under an [Apache-2.0](https://github.com/tmoreau89/vta/blob/master/LICENSE) license.
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
doxygen
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -0,0 +1 @@
|
|||
build
|
|
@ -0,0 +1,106 @@
|
|||
# Directories
|
||||
ROOTDIR = $(CURDIR)
|
||||
BUILD_DIR = $(ROOTDIR)/build
|
||||
SCRIPT_DIR = $(ROOTDIR)/scripts
|
||||
SRC_DIR = $(ROOTDIR)/../../src/hardware/hls
|
||||
SIM_DIR = $(ROOTDIR)/sim
|
||||
TEST_DIR = $(ROOTDIR)/../../src/test
|
||||
INCLUDE_DIR = $(ROOTDIR)/../../include
|
||||
|
||||
# Executables
|
||||
VIVADO_HLS = vivado_hls
|
||||
VIVADO = vivado
|
||||
HSI = hsi
|
||||
|
||||
# Build parameters:
|
||||
# Number of threads during compilation
|
||||
NUM_THREADS = 8
|
||||
# Target Frequency
|
||||
CLOCK_FREQ = 100
|
||||
# Log of input width in bits
|
||||
LOG_INP_WIDTH = 3
|
||||
# Log of weight width in bits
|
||||
LOG_WGT_WIDTH = 3
|
||||
# Log of accum width in bits
|
||||
LOG_ACC_WIDTH = 5
|
||||
# Log of output width in bits
|
||||
LOG_OUT_WIDTH = $(LOG_INP_WIDTH)
|
||||
# Log of tensor batch size (A in (A,B)x(B,C) matrix multiplication)
|
||||
LOG_BATCH = 0
|
||||
# Log of tensor inner block size (B in (A,B)x(B,C) matrix multiplication)
|
||||
LOG_IN_BLOCK = 4
|
||||
# Log of tensor outer block size (C in (A,B)x(B,C) matrix multiplication)
|
||||
LOG_OUT_BLOCK = 4
|
||||
# Log of uop buffer size in Bytes
|
||||
LOG_UOP_BUFF_SIZE = 15
|
||||
# Log of inp buffer size in Bytes
|
||||
LOG_INP_BUFF_SIZE = 15
|
||||
# Log of wgt buffer size in Bytes
|
||||
LOG_WGT_BUFF_SIZE = 15
|
||||
# Log of acc buffer size in Bytes
|
||||
LOG_ACC_BUFF_SIZE = 17
|
||||
# Log of out buffer size in Bytes
|
||||
LOG_OUT_BUFF_SIZE = $(shell echo "$$(( $(LOG_ACC_BUFF_SIZE)+$(LOG_OUT_WIDTH)-$(LOG_ACC_WIDTH) ))" )
|
||||
|
||||
# Derived parameter
|
||||
# Input width in bits
|
||||
INP_WIDTH = $(shell echo "$$(( 1 << $(LOG_INP_WIDTH) ))" )
|
||||
# Weight width in bits
|
||||
WGT_WIDTH = $(shell echo "$$(( 1 << $(LOG_WGT_WIDTH) ))" )
|
||||
# Output width in bits
|
||||
OUT_WIDTH = $(shell echo "$$(( 1 << $(LOG_OUT_WIDTH) ))" )
|
||||
# Tensor batch size
|
||||
BATCH = $(shell echo "$$(( 1 << $(LOG_BATCH) ))" )
|
||||
# Tensor inner block size
|
||||
IN_BLOCK = $(shell echo "$$(( 1 << $(LOG_IN_BLOCK) ))" )
|
||||
# Tensor outer block size
|
||||
OUT_BLOCK = $(shell echo "$$(( 1 << $(LOG_OUT_BLOCK) ))" )
|
||||
# Uop buffer size in Bytes
|
||||
UOP_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_UOP_BUFF_SIZE) ))" )
|
||||
# Inp buffer size in Bytes
|
||||
INP_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_INP_BUFF_SIZE) ))" )
|
||||
# Wgt buffer size in Bytes
|
||||
WGT_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_WGT_BUFF_SIZE) ))" )
|
||||
# Acc buffer size in Bytes
|
||||
ACC_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_ACC_BUFF_SIZE) ))" )
|
||||
# Out buffer size in Bytes
|
||||
OUT_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_OUT_BUFF_SIZE) ))" )
|
||||
|
||||
# Derive clock target period
|
||||
TARGET_PER = $(shell echo "$$(( (1000 + $(CLOCK_FREQ) - 1) / $(CLOCK_FREQ) - 0))" )
|
||||
|
||||
# Derive config name
|
||||
CONF = \
|
||||
$(BATCH)x$(IN_BLOCK)x$(OUT_BLOCK)_$(INP_WIDTH)bx$(WGT_WIDTH)b_$(CLOCK_FREQ)MHz_$(TARGET_PER)ns
|
||||
IP_BUILD_PATH = $(BUILD_DIR)/hls/$(CONF)
|
||||
HW_BUILD_PATH = $(BUILD_DIR)/vivado/$(CONF)
|
||||
|
||||
.PHONY: all ip bit driver clean
|
||||
|
||||
all: driver
|
||||
|
||||
ip:
|
||||
mkdir -p $(IP_BUILD_PATH)
|
||||
cd $(IP_BUILD_PATH) && \
|
||||
$(VIVADO_HLS) -f $(SCRIPT_DIR)/hls.tcl \
|
||||
-tclargs $(SRC_DIR) $(SIM_DIR) $(TEST_DIR) $(INCLUDE_DIR) $(TARGET_PER) \
|
||||
$(LOG_INP_WIDTH) $(LOG_WGT_WIDTH) $(LOG_ACC_WIDTH) $(LOG_OUT_WIDTH) \
|
||||
$(LOG_BATCH) $(LOG_IN_BLOCK) $(LOG_OUT_BLOCK) \
|
||||
$(LOG_UOP_BUFF_SIZE) $(LOG_INP_BUFF_SIZE) $(LOG_WGT_BUFF_SIZE) \
|
||||
$(LOG_ACC_BUFF_SIZE) $(LOG_OUT_BUFF_SIZE)
|
||||
|
||||
bit: ip
|
||||
mkdir -p $(HW_BUILD_PATH)
|
||||
cd $(HW_BUILD_PATH) && \
|
||||
$(VIVADO) -mode tcl -source $(SCRIPT_DIR)/vivado.tcl \
|
||||
-tclargs $(IP_BUILD_PATH) $(NUM_THREADS) $(CLOCK_FREQ) \
|
||||
$(INP_WIDTH) $(WGT_WIDTH) $(OUT_WIDTH) \
|
||||
$(BATCH) $(IN_BLOCK) $(OUT_BLOCK) \
|
||||
$(INP_BUFF_SIZE) $(WGT_BUFF_SIZE) $(OUT_BUFF_SIZE)
|
||||
|
||||
driver: bit
|
||||
cd $(HW_BUILD_PATH) && $(HSI) -mode tcl -source $(SCRIPT_DIR)/hsi.tcl -nojournal -nolog
|
||||
cd $(HW_BUILD_PATH)/bsp && $(MAKE)
|
||||
|
||||
clean:
|
||||
rm -rf $(BUILD_DIR)
|
|
@ -0,0 +1,177 @@
|
|||
#
|
||||
# Copyright (c) 2018 by Contributors
|
||||
# file: hls.tcl
|
||||
# brief: HLS generation script.
|
||||
#
|
||||
|
||||
# Command line arguments:
|
||||
# Arg 1: path to design sources
|
||||
# Arg 2: path to sim sources
|
||||
# Arg 3: path to test sources
|
||||
# Arg 4: path to include sources
|
||||
# Arg 5: target clock period
|
||||
# Arg 6: input type width (log)
|
||||
# Arg 7: weight type width (log)
|
||||
# Arg 8: accum type width (log)
|
||||
# Arg 9: output type width (log)
|
||||
# Arg 10: batch size (log)
|
||||
# Arg 11: in block size (log)
|
||||
# Arg 12: out block size (log)
|
||||
# Arg 13: uop buffer size in B (log)
|
||||
# Arg 14: inp buffer size in B (log)
|
||||
# Arg 15: wgt buffer size in B (log)
|
||||
# Arg 16: acc buffer size in B (log)
|
||||
# Arg 17: out buffer size in B (log)
|
||||
|
||||
if { [llength $argv] eq 19 } {
|
||||
set src_dir [lindex $argv 2]
|
||||
set sim_dir [lindex $argv 3]
|
||||
set test_dir [lindex $argv 4]
|
||||
set include_dir [lindex $argv 5]
|
||||
set target_period [lindex $argv 6]
|
||||
set inp_width [lindex $argv 7]
|
||||
set wgt_width [lindex $argv 8]
|
||||
set acc_width [lindex $argv 9]
|
||||
set out_width [lindex $argv 10]
|
||||
set batch [lindex $argv 11]
|
||||
set block_in [lindex $argv 12]
|
||||
set block_out [lindex $argv 13]
|
||||
set uop_buff_size [lindex $argv 14]
|
||||
set inp_buff_size [lindex $argv 15]
|
||||
set wgt_buff_size [lindex $argv 16]
|
||||
set acc_buff_size [lindex $argv 17]
|
||||
set out_buff_size [lindex $argv 18]
|
||||
} else {
|
||||
set src_dir "../src/"
|
||||
set sim_dir "../sim/"
|
||||
set test_dir "../../src/test/"
|
||||
set include_dir "../../include"
|
||||
set target_period 10
|
||||
set inp_width 3
|
||||
set wgt_width 3
|
||||
set acc_width 5
|
||||
set out_width 3
|
||||
set batch 0 ;# log2 of the batch size: 0 selects a batch of 1
|
||||
set block_out 4
|
||||
set block_in 4
|
||||
set uop_buff_size 15
|
||||
set inp_buff_size 15
|
||||
set wgt_buff_size 15
|
||||
set acc_buff_size 17
|
||||
set out_buff_size 15
|
||||
}
|
||||
|
||||
# C define flags to pass to compiler
|
||||
set cflags "-I $include_dir -I $include_dir/hardware/hls \
|
||||
-DDEBUG=0 -DLOG_WGT_WIDTH=$wgt_width -DLOG_INP_WIDTH=$inp_width \
|
||||
-DLOG_ACC_WIDTH=$acc_width -DLOG_OUT_WIDTH=$out_width \
|
||||
-DLOG_BATCH=$batch -DLOG_BLOCK_OUT=$block_out -DLOG_BLOCK_IN=$block_in \
|
||||
-DLOG_UOP_BUFF_SIZE=$uop_buff_size -DLOG_INP_BUFF_SIZE=$inp_buff_size \
|
||||
-DLOG_WGT_BUFF_SIZE=$wgt_buff_size -DLOG_ACC_BUFF_SIZE=$acc_buff_size \
|
||||
-DLOG_OUT_BUFF_SIZE=$out_buff_size"
|
||||
|
||||
# Initializes the HLS design and sets HLS pragmas for memory partitioning.
|
||||
# This is necessary because of a Vivado restriction that doesn't allow for
|
||||
# buses wider than 1024 bits.
|
||||
proc init_design {per inp_width wgt_width out_width batch block_in block_out} {
|
||||
|
||||
# Set device number
|
||||
set_part {xc7z020clg484-1}
|
||||
|
||||
# Set the clock frequency
|
||||
create_clock -period $per -name default
|
||||
|
||||
# Set input partition factor to (INP_VECTOR_WIDTH*BATCH/1024)
|
||||
set inp_partition_factor [expr {(1 << ($inp_width + $block_in + $batch)) / 1024}]
|
||||
if {$inp_partition_factor == 0} {
|
||||
set_directive_array_reshape -type complete -dim 2 "load" inp_mem
|
||||
set_directive_array_reshape -type complete -dim 2 "compute" inp_mem
|
||||
} else {
|
||||
# Set input reshaping factor below to (1024/INP_VECTOR_WIDTH)
|
||||
set inp_reshape_factor [expr {1024 / (1 << ($inp_width + $block_in))}]
|
||||
set_directive_array_partition -type block -factor $inp_partition_factor -dim 2 "load" inp_mem
|
||||
set_directive_array_partition -type block -factor $inp_partition_factor -dim 2 "compute" inp_mem
|
||||
set_directive_array_reshape -type block -factor $inp_reshape_factor -dim 2 "load" inp_mem
|
||||
set_directive_array_reshape -type block -factor $inp_reshape_factor -dim 2 "compute" inp_mem
|
||||
}
|
||||
# Set weight partition factor to (WGT_VECTOR_WIDTH*BLOCK_OUT/1024)
|
||||
set wgt_partition_factor [expr {(1 << ($wgt_width + $block_in + $block_out)) / 1024}]
|
||||
if {$wgt_partition_factor == 0} {
|
||||
set_directive_array_reshape -type complete -dim 2 "load" wgt_mem
|
||||
set_directive_array_reshape -type complete -dim 2 "compute" wgt_mem
|
||||
} else {
|
||||
# Set weight reshaping factor below to (1024/WGT_VECTOR_WIDTH)
|
||||
set wgt_reshape_factor [expr {1024 / (1 << ($wgt_width + $block_in))}]
|
||||
set_directive_array_partition -type block -factor $wgt_partition_factor -dim 2 "load" wgt_mem
|
||||
set_directive_array_partition -type block -factor $wgt_partition_factor -dim 2 "compute" wgt_mem
|
||||
set_directive_array_reshape -type block -factor $wgt_reshape_factor -dim 2 "load" wgt_mem
|
||||
set_directive_array_reshape -type block -factor $wgt_reshape_factor -dim 2 "compute" wgt_mem
|
||||
}
|
||||
# Set output partition factor to (OUT_VECTOR_WIDTH*BATCH/1024)
|
||||
set out_partition_factor [expr {(1 << ($out_width + $block_out + $batch)) / 1024}]
|
||||
if {$out_partition_factor == 0} {
|
||||
set_directive_array_reshape -type complete -dim 2 "compute" out_mem
|
||||
set_directive_array_reshape -type complete -dim 2 "store" out_mem
|
||||
} else {
|
||||
# Set output reshaping factor below to (1024/OUT_VECTOR_WIDTH)
|
||||
set out_reshape_factor [expr {1024 / (1 << ($out_width + $block_out))}]
|
||||
set_directive_array_partition -type block -factor $out_partition_factor -dim 2 "compute" out_mem
|
||||
set_directive_array_partition -type block -factor $out_partition_factor -dim 2 "store" out_mem
|
||||
set_directive_array_reshape -type block -factor $out_reshape_factor -dim 2 "compute" out_mem
|
||||
set_directive_array_reshape -type block -factor $out_reshape_factor -dim 2 "store" out_mem
|
||||
}
|
||||
}
|
||||
|
||||
# HLS behavioral sim
|
||||
open_project vta_sim
|
||||
set_top vta
|
||||
add_files $src_dir/vta.cc -cflags $cflags
|
||||
add_files -tb $sim_dir/vta_test.cc -cflags $cflags
|
||||
add_files -tb $test_dir/vta_test_lib.cc -cflags $cflags
|
||||
open_solution "solution0"
|
||||
init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out
|
||||
csim_design -clean
|
||||
close_project
|
||||
|
||||
# Generate fetch stage
|
||||
open_project vta_fetch
|
||||
set_top fetch
|
||||
add_files $src_dir/vta.cc -cflags $cflags
|
||||
open_solution "solution0"
|
||||
init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out
|
||||
csynth_design
|
||||
export_design -format ip_catalog
|
||||
close_project
|
||||
|
||||
# Generate load stage
|
||||
open_project vta_load
|
||||
set_top load
|
||||
add_files $src_dir/vta.cc -cflags $cflags
|
||||
open_solution "solution0"
|
||||
init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out
|
||||
csynth_design
|
||||
export_design -format ip_catalog
|
||||
close_project
|
||||
|
||||
# Generate compute stage
|
||||
open_project vta_compute
|
||||
set_top compute
|
||||
add_files $src_dir/vta.cc -cflags $cflags
|
||||
open_solution "solution0"
|
||||
init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out
|
||||
csynth_design
|
||||
export_design -format ip_catalog
|
||||
close_project
|
||||
|
||||
# Generate store stage
|
||||
open_project vta_store
|
||||
set_top store
|
||||
add_files $src_dir/vta.cc -cflags $cflags
|
||||
open_solution "solution0"
|
||||
init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out
|
||||
csynth_design
|
||||
export_design -format ip_catalog
|
||||
close_project
|
||||
|
||||
exit
|
||||
|
|
@ -0,0 +1,11 @@
|
|||
#
|
||||
# Copyright (c) 2018 by Contributors
|
||||
# file: hsi.tcl
|
||||
# brief: Driver generation script for ARMv7 driver libraries.
|
||||
#
|
||||
|
||||
open_hw_design export/vta.hdf
|
||||
create_sw_design swdesign -proc ps7_cortexa9_0 -os standalone
|
||||
generate_bsp -dir bsp
|
||||
|
||||
exit
|
|
@ -0,0 +1,946 @@
|
|||
#
|
||||
# Copyright (c) 2018 by Xilinx, Contributors
|
||||
# file: vivado.tcl
|
||||
# brief: Vivado compilation script. Partially automatically generated
|
||||
# by Vivado.
|
||||
#
|
||||
|
||||
# Check if script is running in correct Vivado version.
|
||||
set scripts_vivado_version 2017.1
|
||||
set current_vivado_version [version -short]
|
||||
|
||||
if { [string first $scripts_vivado_version $current_vivado_version] == -1 } {
|
||||
puts ""
|
||||
catch {common::send_msg_id "BD_TCL-109" "ERROR" "This script was generated using Vivado \
|
||||
<$scripts_vivado_version> and is being run in <$current_vivado_version> of Vivado. \
|
||||
Please run the script in Vivado <$scripts_vivado_version> then open the design in Vivado \
|
||||
<$current_vivado_version>. Upgrade the design by running \"Tools => Report => Report IP \
|
||||
Status...\", then run write_bd_tcl to create an updated script."}
|
||||
|
||||
return 1
|
||||
}
|
||||
|
||||
# Parse argument list, derive the clock to utilize
|
||||
set clock_id 0
|
||||
if { [llength $argv] eq 12 } {
|
||||
set ip_path [lindex $argv 0]
|
||||
set num_threads [lindex $argv 1]
|
||||
set clock_freq [lindex $argv 2]
|
||||
set inp_width [lindex $argv 3]
|
||||
set wgt_width [lindex $argv 4]
|
||||
set out_width [lindex $argv 5]
|
||||
set batch [lindex $argv 6]
|
||||
set in_block [lindex $argv 7]
|
||||
set out_block [lindex $argv 8]
|
||||
set inp_mem_size [lindex $argv 9]
|
||||
set wgt_mem_size [lindex $argv 10]
|
||||
set out_mem_size [lindex $argv 11]
|
||||
if {$clock_freq eq 100} {
|
||||
set clock_id 0
|
||||
puts "Setting clock frequency to 100MHz"
|
||||
} elseif {$clock_freq eq 142} {
|
||||
set clock_id 1
|
||||
puts "Setting clock frequency to 142MHz"
|
||||
} elseif {$clock_freq eq 167} {
|
||||
set clock_id 3
|
||||
puts "Setting clock frequency to 167MHz"
|
||||
} elseif {$clock_freq eq 200} {
|
||||
set clock_id 2
|
||||
puts "Setting clock frequency to 200MHz"
|
||||
} else {
|
||||
set clock_id 0
|
||||
puts "Unrecognized clock frequency, setting clock to 100MHz"
|
||||
}
|
||||
} else {
|
||||
puts "Arg list incomplete: <path to ip dir> <num threads> <clock freq> \
|
||||
<inp width> <wgt_width> <out_width> <batch> <in_block> <out_block> <inp_mem_size> <wgt_mem_size> <out_mem_size>"
|
||||
return 1
|
||||
}
|
||||
|
||||
# Derive input mem parameters
|
||||
set inp_mem_width [expr $inp_width * $batch * $in_block]
|
||||
set inp_mem_depth [expr $inp_mem_size * 8 / $inp_mem_width]
|
||||
set inp_bus_width 1024
|
||||
set inp_part [expr $inp_mem_width / $inp_bus_width]
|
||||
if {[expr $inp_part == 0]} {
|
||||
set inp_part 1
|
||||
set inp_bus_width $inp_mem_width
|
||||
}
|
||||
# Derive weight mem parameters
|
||||
set wgt_mem_width [expr $wgt_width * $out_block * $in_block]
|
||||
set wgt_mem_depth [expr $wgt_mem_size * 8 / $wgt_mem_width]
|
||||
set wgt_bus_width 1024
|
||||
set wgt_part [expr $wgt_mem_width / $wgt_bus_width]
|
||||
if {[expr $wgt_part == 0]} {
|
||||
set wgt_part 1
|
||||
set wgt_bus_width $wgt_mem_width
|
||||
}
|
||||
# Derive output mem parameters
|
||||
set out_mem_width [expr $out_width * $batch * $out_block]
|
||||
set out_mem_depth [expr $out_mem_size * 8 / $out_mem_width]
|
||||
set out_bus_width 1024
|
||||
set out_part [expr $out_mem_width / $out_bus_width]
|
||||
if {[expr $out_part == 0]} {
|
||||
set out_part 1
|
||||
set out_bus_width $out_mem_width
|
||||
}
|
||||
|
||||
puts $inp_mem_width
|
||||
puts $inp_mem_depth
|
||||
puts $inp_bus_width
|
||||
puts $inp_part
|
||||
puts $wgt_mem_width
|
||||
puts $wgt_mem_depth
|
||||
puts $wgt_bus_width
|
||||
puts $wgt_part
|
||||
puts $out_mem_width
|
||||
puts $out_mem_depth
|
||||
puts $out_bus_width
|
||||
puts $out_part
|
||||
|
||||
# User defined paths
|
||||
set proj_name vta
|
||||
set proj_path "."
|
||||
set ip_lib "ip_lib"
|
||||
set fetch_ip "${ip_path}/vta_fetch/solution0/impl/ip/xilinx_com_hls_fetch_1_0.zip"
|
||||
set load_ip "${ip_path}/vta_load/solution0/impl/ip/xilinx_com_hls_load_1_0.zip"
|
||||
set compute_ip "${ip_path}/vta_compute/solution0/impl/ip/xilinx_com_hls_compute_1_0.zip"
|
||||
set store_ip "${ip_path}/vta_store/solution0/impl/ip/xilinx_com_hls_store_1_0.zip"
|
||||
|
||||
# Create custom project
|
||||
create_project -force $proj_name $proj_path -part xc7z020clg484-1
|
||||
|
||||
# Update IP repository with generated IP
|
||||
file mkdir $ip_lib
|
||||
set_property ip_repo_paths $ip_lib [current_project]
|
||||
update_ip_catalog
|
||||
update_ip_catalog -add_ip $fetch_ip -repo_path $ip_lib
|
||||
update_ip_catalog -add_ip $load_ip -repo_path $ip_lib
|
||||
update_ip_catalog -add_ip $compute_ip -repo_path $ip_lib
|
||||
update_ip_catalog -add_ip $store_ip -repo_path $ip_lib
|
||||
|
||||
# CHANGE DESIGN NAME HERE
|
||||
set design_name $proj_name
|
||||
|
||||
# Creating design if needed
|
||||
set errMsg ""
|
||||
set nRet 0
|
||||
|
||||
set cur_design [current_bd_design -quiet]
|
||||
set list_cells [get_bd_cells -quiet]
|
||||
|
||||
if { ${design_name} eq "" } {
|
||||
# USE CASES:
|
||||
# 1) Design_name not set
|
||||
|
||||
set errMsg "Please set the variable <design_name> to a non-empty value."
|
||||
set nRet 1
|
||||
|
||||
} elseif { ${cur_design} ne "" && ${list_cells} eq "" } {
|
||||
# USE CASES:
|
||||
# 2): Current design opened AND is empty AND names same.
|
||||
# 3): Current design opened AND is empty AND names diff; design_name NOT in project.
|
||||
# 4): Current design opened AND is empty AND names diff; design_name exists in project.
|
||||
|
||||
if { $cur_design ne $design_name } {
|
||||
common::send_msg_id "BD_TCL-001" "INFO" "Changing value of <design_name> from <$design_name> \
|
||||
to <$cur_design> since current design is empty."
|
||||
set design_name [get_property NAME $cur_design]
|
||||
}
|
||||
common::send_msg_id "BD_TCL-002" "INFO" "Constructing design in IPI design <$cur_design>..."
|
||||
|
||||
} elseif { ${cur_design} ne "" && $list_cells ne "" && $cur_design eq $design_name } {
|
||||
# USE CASES:
|
||||
# 5) Current design opened AND has components AND same names.
|
||||
|
||||
set errMsg "Design <$design_name> already exists in your project, please set the variable \
|
||||
<design_name> to another value."
|
||||
set nRet 1
|
||||
} elseif { [get_files -quiet ${design_name}.bd] ne "" } {
|
||||
# USE CASES:
|
||||
# 6) Current opened design, has components, but diff names, design_name exists in project.
|
||||
# 7) No opened design, design_name exists in project.
|
||||
|
||||
set errMsg "Design <$design_name> already exists in your project, please set the variable \
|
||||
<design_name> to another value."
|
||||
set nRet 2
|
||||
|
||||
} else {
|
||||
# USE CASES:
|
||||
# 8) No opened design, design_name not in project.
|
||||
# 9) Current opened design, has components, but diff names, design_name not in project.
|
||||
|
||||
common::send_msg_id "BD_TCL-003" "INFO" "Currently there is no design <$design_name> in \
|
||||
project, so creating one..."
|
||||
|
||||
create_bd_design $design_name
|
||||
|
||||
common::send_msg_id "BD_TCL-004" "INFO" "Making design <$design_name> as current_bd_design."
|
||||
current_bd_design $design_name
|
||||
|
||||
}
|
||||
|
||||
common::send_msg_id "BD_TCL-005" "INFO" "Currently the variable <design_name> is equal \
|
||||
to \"$design_name\"."
|
||||
|
||||
if { $nRet != 0 } {
|
||||
catch {common::send_msg_id "BD_TCL-114" "ERROR" $errMsg}
|
||||
return $nRet
|
||||
}
|
||||
|
||||
##################################################################
|
||||
# DESIGN PROCs
|
||||
##################################################################
|
||||
|
||||
|
||||
|
||||
# Procedure to create entire design; Provide argument to make
|
||||
# procedure reusable. If parentCell is "", will use root.
|
||||
proc create_root_design { parentCell clk inp_part wgt_part out_part inp_bus_width inp_mem_depth wgt_bus_width wgt_mem_depth out_bus_width out_mem_depth} {
|
||||
|
||||
variable script_folder
|
||||
|
||||
if { $parentCell eq "" } {
|
||||
set parentCell [get_bd_cells /]
|
||||
}
|
||||
|
||||
# Get object for parentCell
|
||||
set parentObj [get_bd_cells $parentCell]
|
||||
if { $parentObj == "" } {
|
||||
catch {common::send_msg_id "BD_TCL-100" "ERROR" "Unable to find parent cell <$parentCell>!"}
|
||||
return
|
||||
}
|
||||
|
||||
# Make sure parentObj is hier blk
|
||||
set parentType [get_property TYPE $parentObj]
|
||||
if { $parentType ne "hier" } {
|
||||
catch {common::send_msg_id "BD_TCL-101" "ERROR" "Parent <$parentObj> has TYPE = \
|
||||
<$parentType>. Expected to be <hier>."}
|
||||
return
|
||||
}
|
||||
|
||||
# Save current instance; Restore later
|
||||
set oldCurInst [current_bd_instance .]
|
||||
|
||||
# Set parent object as current
|
||||
current_bd_instance $parentObj
|
||||
|
||||
|
||||
# Create interface ports
|
||||
set DDR [ create_bd_intf_port -mode Master -vlnv xilinx.com:interface:ddrx_rtl:1.0 DDR ]
|
||||
set FIXED_IO [ create_bd_intf_port -mode Master \
|
||||
-vlnv xilinx.com:display_processing_system7:fixedio_rtl:1.0 FIXED_IO ]
|
||||
|
||||
# Create ports
|
||||
|
||||
# Create instance: axi_interconnect_1, and set properties
|
||||
set axi_interconnect_1 \
|
||||
[ create_bd_cell -type ip -vlnv xilinx.com:ip:axi_interconnect:2.1 axi_interconnect_1 ]
|
||||
set_property -dict [ list \
|
||||
CONFIG.NUM_MI {5} \
|
||||
] $axi_interconnect_1
|
||||
|
||||
# Create instance: axi_smc, and set properties
|
||||
set axi_smc [ create_bd_cell -type ip -vlnv xilinx.com:ip:smartconnect:1.0 axi_smc ]
|
||||
set_property -dict [ list \
|
||||
CONFIG.NUM_SI {5} \
|
||||
] $axi_smc
|
||||
|
||||
# Create instance: axi_timer_1, and set properties
|
||||
set axi_timer_1 [ create_bd_cell -type ip -vlnv xilinx.com:ip:axi_timer:2.0 axi_timer_1 ]
|
||||
|
||||
# Create instance: compute_0, and set properties
|
||||
set compute_0 [ create_bd_cell -type ip -vlnv xilinx.com:hls:compute:1.0 compute_0 ]
|
||||
set_property -dict [ list \
|
||||
CONFIG.C_M_AXI_DATA_PORT_CACHE_VALUE {"1111"} \
|
||||
CONFIG.C_M_AXI_DATA_PORT_DATA_WIDTH {64} \
|
||||
CONFIG.C_M_AXI_UOP_PORT_CACHE_VALUE {"1111"} \
|
||||
] $compute_0
|
||||
|
||||
# Create instance: fetch_0, and set properties
|
||||
set fetch_0 [ create_bd_cell -type ip -vlnv xilinx.com:hls:fetch:1.0 fetch_0 ]
|
||||
set_property -dict [ list \
|
||||
CONFIG.C_M_AXI_INS_PORT_CACHE_VALUE {"1111"} \
|
||||
CONFIG.C_M_AXI_INS_PORT_DATA_WIDTH {64} \
|
||||
] $fetch_0
|
||||
|
||||
# Create instance: g2l_queue, and set properties
|
||||
set g2l_queue [ create_bd_cell -type ip -vlnv xilinx.com:ip:fifo_generator:13.1 g2l_queue ]
|
||||
set_property -dict [ list \
|
||||
CONFIG.Empty_Threshold_Assert_Value_axis {1022} \
|
||||
CONFIG.Empty_Threshold_Assert_Value_rach {14} \
|
||||
CONFIG.Empty_Threshold_Assert_Value_wach {14} \
|
||||
CONFIG.Empty_Threshold_Assert_Value_wrch {14} \
|
||||
CONFIG.FIFO_Implementation_rach {Common_Clock_Distributed_RAM} \
|
||||
CONFIG.FIFO_Implementation_wach {Common_Clock_Distributed_RAM} \
|
||||
CONFIG.FIFO_Implementation_wrch {Common_Clock_Distributed_RAM} \
|
||||
CONFIG.Full_Flags_Reset_Value {1} \
|
||||
CONFIG.Full_Threshold_Assert_Value_axis {1023} \
|
||||
CONFIG.Full_Threshold_Assert_Value_rach {15} \
|
||||
CONFIG.Full_Threshold_Assert_Value_wach {15} \
|
||||
CONFIG.Full_Threshold_Assert_Value_wrch {15} \
|
||||
CONFIG.INTERFACE_TYPE {AXI_STREAM} \
|
||||
CONFIG.Input_Depth_axis {1024} \
|
||||
CONFIG.Reset_Type {Asynchronous_Reset} \
|
||||
CONFIG.TUSER_WIDTH {0} \
|
||||
] $g2l_queue
|
||||
|
||||
# Create instance: g2s_queue, and set properties
|
||||
set g2s_queue [ create_bd_cell -type ip -vlnv xilinx.com:ip:fifo_generator:13.1 g2s_queue ]
|
||||
set_property -dict [ list \
|
||||
CONFIG.Empty_Threshold_Assert_Value_axis {1022} \
|
||||
CONFIG.Empty_Threshold_Assert_Value_rach {14} \
|
||||
CONFIG.Empty_Threshold_Assert_Value_wach {14} \
|
||||
CONFIG.Empty_Threshold_Assert_Value_wrch {14} \
|
||||
CONFIG.FIFO_Implementation_rach {Common_Clock_Distributed_RAM} \
|
||||
CONFIG.FIFO_Implementation_wach {Common_Clock_Distributed_RAM} \
|
||||
CONFIG.FIFO_Implementation_wrch {Common_Clock_Distributed_RAM} \
|
||||
CONFIG.Full_Flags_Reset_Value {1} \
|
||||
CONFIG.Full_Threshold_Assert_Value_axis {1023} \
|
||||
CONFIG.Full_Threshold_Assert_Value_rach {15} \
|
||||
CONFIG.Full_Threshold_Assert_Value_wach {15} \
|
||||
CONFIG.Full_Threshold_Assert_Value_wrch {15} \
|
||||
CONFIG.INTERFACE_TYPE {AXI_STREAM} \
|
||||
CONFIG.Input_Depth_axis {1024} \
|
||||
CONFIG.Reset_Type {Asynchronous_Reset} \
|
||||
CONFIG.TUSER_WIDTH {0} \
|
||||
] $g2s_queue
|
||||
|
||||
# Create instance: gemm_queue, and set properties
|
||||
set gemm_queue [ create_bd_cell -type ip -vlnv xilinx.com:ip:fifo_generator:13.1 gemm_queue ]
|
||||
set_property -dict [ list \
|
||||
CONFIG.Empty_Threshold_Assert_Value_axis {510} \
|
||||
CONFIG.Empty_Threshold_Assert_Value_rach {14} \
|
||||
CONFIG.Empty_Threshold_Assert_Value_wach {14} \
|
||||
CONFIG.Empty_Threshold_Assert_Value_wrch {14} \
|
||||
CONFIG.FIFO_Implementation_rach {Common_Clock_Distributed_RAM} \
|
||||
CONFIG.FIFO_Implementation_wach {Common_Clock_Distributed_RAM} \
|
||||
CONFIG.FIFO_Implementation_wrch {Common_Clock_Distributed_RAM} \
|
||||
CONFIG.Full_Flags_Reset_Value {1} \
|
||||
CONFIG.Full_Threshold_Assert_Value_axis {511} \
|
||||
CONFIG.Full_Threshold_Assert_Value_rach {15} \
|
||||
CONFIG.Full_Threshold_Assert_Value_wach {15} \
|
||||
CONFIG.Full_Threshold_Assert_Value_wrch {15} \
|
||||
CONFIG.INTERFACE_TYPE {AXI_STREAM} \
|
||||
CONFIG.Input_Depth_axis {512} \
|
||||
CONFIG.Reset_Type {Asynchronous_Reset} \
|
||||
CONFIG.TDATA_NUM_BYTES {16} \
|
||||
CONFIG.TKEEP_WIDTH {16} \
|
||||
CONFIG.TSTRB_WIDTH {16} \
|
||||
CONFIG.TUSER_WIDTH {0} \
|
||||
] $gemm_queue
|
||||
|
||||
# Create instance: l2g_queue, and set properties
|
||||
set l2g_queue [ create_bd_cell -type ip -vlnv xilinx.com:ip:fifo_generator:13.1 l2g_queue ]
|
||||
set_property -dict [ list \
|
||||
CONFIG.Empty_Threshold_Assert_Value_axis {1022} \
|
||||
CONFIG.Empty_Threshold_Assert_Value_rach {14} \
|
||||
CONFIG.Empty_Threshold_Assert_Value_wach {14} \
|
||||
CONFIG.Empty_Threshold_Assert_Value_wrch {14} \
|
||||
CONFIG.FIFO_Implementation_rach {Common_Clock_Distributed_RAM} \
|
||||
CONFIG.FIFO_Implementation_wach {Common_Clock_Distributed_RAM} \
|
||||
CONFIG.FIFO_Implementation_wrch {Common_Clock_Distributed_RAM} \
|
||||
CONFIG.Full_Flags_Reset_Value {1} \
|
||||
CONFIG.Full_Threshold_Assert_Value_axis {1023} \
|
||||
CONFIG.Full_Threshold_Assert_Value_rach {15} \
|
||||
CONFIG.Full_Threshold_Assert_Value_wach {15} \
|
||||
CONFIG.Full_Threshold_Assert_Value_wrch {15} \
|
||||
CONFIG.INTERFACE_TYPE {AXI_STREAM} \
|
||||
CONFIG.Input_Depth_axis {1024} \
|
||||
CONFIG.Reset_Type {Asynchronous_Reset} \
|
||||
CONFIG.TUSER_WIDTH {0} \
|
||||
] $l2g_queue
|
||||
|
||||
# Create instance: load_0, and set properties
|
||||
set load_0 [ create_bd_cell -type ip -vlnv xilinx.com:hls:load:1.0 load_0 ]
|
||||
set_property -dict [ list \
|
||||
CONFIG.C_M_AXI_DATA_PORT_CACHE_VALUE {"1111"} \
|
||||
] $load_0
|
||||
|
||||
# Create instance: load_queue, and set properties
|
||||
set load_queue [ create_bd_cell -type ip -vlnv xilinx.com:ip:fifo_generator:13.1 load_queue ]
|
||||
set_property -dict [ list \
|
||||
CONFIG.Empty_Threshold_Assert_Value_axis {510} \
|
||||
CONFIG.Empty_Threshold_Assert_Value_rach {14} \
|
||||
CONFIG.Empty_Threshold_Assert_Value_wach {14} \
|
||||
CONFIG.Empty_Threshold_Assert_Value_wrch {14} \
|
||||
CONFIG.FIFO_Implementation_rach {Common_Clock_Distributed_RAM} \
|
||||
CONFIG.FIFO_Implementation_wach {Common_Clock_Distributed_RAM} \
|
||||
CONFIG.FIFO_Implementation_wrch {Common_Clock_Distributed_RAM} \
|
||||
CONFIG.Full_Flags_Reset_Value {1} \
|
||||
CONFIG.Full_Threshold_Assert_Value_axis {511} \
|
||||
CONFIG.Full_Threshold_Assert_Value_rach {15} \
|
||||
CONFIG.Full_Threshold_Assert_Value_wach {15} \
|
||||
CONFIG.Full_Threshold_Assert_Value_wrch {15} \
|
||||
CONFIG.INTERFACE_TYPE {AXI_STREAM} \
|
||||
CONFIG.Input_Depth_axis {512} \
|
||||
CONFIG.Reset_Type {Asynchronous_Reset} \
|
||||
CONFIG.TDATA_NUM_BYTES {16} \
|
||||
CONFIG.TKEEP_WIDTH {16} \
|
||||
CONFIG.TSTRB_WIDTH {16} \
|
||||
CONFIG.TUSER_WIDTH {0} \
|
||||
] $load_queue
|
||||
|
||||
# Create instance: proc_sys_reset, and set properties
|
||||
set proc_sys_reset \
|
||||
[ create_bd_cell -type ip -vlnv xilinx.com:ip:proc_sys_reset:5.0 proc_sys_reset ]
|
||||
|
||||
# Create instance: processing_system7_1, and set properties
|
||||
set processing_system7_1 \
|
||||
[ create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 processing_system7_1 ]
|
||||
set_property -dict [ list \
|
||||
CONFIG.PCW_CAN0_PERIPHERAL_ENABLE {0} \
|
||||
CONFIG.PCW_ENET0_PERIPHERAL_ENABLE {0} \
|
||||
CONFIG.PCW_EN_CLK0_PORT {1} \
|
||||
CONFIG.PCW_EN_CLK1_PORT {1} \
|
||||
CONFIG.PCW_EN_CLK2_PORT {1} \
|
||||
CONFIG.PCW_EN_CLK3_PORT {1} \
|
||||
CONFIG.PCW_FPGA0_PERIPHERAL_FREQMHZ {100} \
|
||||
CONFIG.PCW_FPGA1_PERIPHERAL_FREQMHZ {142.86} \
|
||||
CONFIG.PCW_FPGA2_PERIPHERAL_FREQMHZ {200} \
|
||||
CONFIG.PCW_FPGA3_PERIPHERAL_FREQMHZ {167} \
|
||||
CONFIG.PCW_GPIO_MIO_GPIO_ENABLE {0} \
|
||||
CONFIG.PCW_I2C0_PERIPHERAL_ENABLE {0} \
|
||||
CONFIG.PCW_IMPORT_BOARD_PRESET {None} \
|
||||
CONFIG.PCW_IRQ_F2P_INTR {1} \
|
||||
CONFIG.PCW_QSPI_GRP_SINGLE_SS_ENABLE {0} \
|
||||
CONFIG.PCW_QSPI_PERIPHERAL_ENABLE {0} \
|
||||
CONFIG.PCW_SD0_PERIPHERAL_ENABLE {0} \
|
||||
CONFIG.PCW_USB0_PERIPHERAL_ENABLE {0} \
|
||||
CONFIG.PCW_USE_DEFAULT_ACP_USER_VAL {1} \
|
||||
CONFIG.PCW_USE_FABRIC_INTERRUPT {1} \
|
||||
CONFIG.PCW_USE_HIGH_OCM {1} \
|
||||
CONFIG.PCW_USE_S_AXI_ACP {1} \
|
||||
CONFIG.PCW_USE_S_AXI_HP0 {0} \
|
||||
CONFIG.PCW_USE_S_AXI_HP1 {0} \
|
||||
CONFIG.PCW_USE_S_AXI_HP2 {0} \
|
||||
CONFIG.PCW_USE_S_AXI_HP3 {0} \
|
||||
CONFIG.preset {ZC702} \
|
||||
] $processing_system7_1
|
||||
|
||||
# Create instance: s2g_queue, and set properties
|
||||
set s2g_queue [ create_bd_cell -type ip -vlnv xilinx.com:ip:fifo_generator:13.1 s2g_queue ]
|
||||
set_property -dict [ list \
|
||||
CONFIG.Empty_Threshold_Assert_Value_axis {1022} \
|
||||
CONFIG.Empty_Threshold_Assert_Value_rach {14} \
|
||||
CONFIG.Empty_Threshold_Assert_Value_wach {14} \
|
||||
CONFIG.Empty_Threshold_Assert_Value_wrch {14} \
|
||||
CONFIG.FIFO_Implementation_rach {Common_Clock_Distributed_RAM} \
|
||||
CONFIG.FIFO_Implementation_wach {Common_Clock_Distributed_RAM} \
|
||||
CONFIG.FIFO_Implementation_wrch {Common_Clock_Distributed_RAM} \
|
||||
CONFIG.Full_Flags_Reset_Value {1} \
|
||||
CONFIG.Full_Threshold_Assert_Value_axis {1023} \
|
||||
CONFIG.Full_Threshold_Assert_Value_rach {15} \
|
||||
CONFIG.Full_Threshold_Assert_Value_wach {15} \
|
||||
CONFIG.Full_Threshold_Assert_Value_wrch {15} \
|
||||
CONFIG.INTERFACE_TYPE {AXI_STREAM} \
|
||||
CONFIG.Input_Depth_axis {1024} \
|
||||
CONFIG.Reset_Type {Asynchronous_Reset} \
|
||||
CONFIG.TUSER_WIDTH {0} \
|
||||
] $s2g_queue
|
||||
|
||||
# Create instance: store_0, and set properties
|
||||
set store_0 [ create_bd_cell -type ip -vlnv xilinx.com:hls:store:1.0 store_0 ]
|
||||
set_property -dict [ list \
|
||||
CONFIG.C_M_AXI_DATA_PORT_CACHE_VALUE {"1111"} \
|
||||
] $store_0
|
||||
|
||||
# Create instance: store_queue, and set properties
|
||||
set store_queue [ create_bd_cell -type ip -vlnv xilinx.com:ip:fifo_generator:13.1 store_queue ]
|
||||
set_property -dict [ list \
|
||||
CONFIG.Empty_Threshold_Assert_Value_axis {510} \
|
||||
CONFIG.Empty_Threshold_Assert_Value_rach {14} \
|
||||
CONFIG.Empty_Threshold_Assert_Value_wach {14} \
|
||||
CONFIG.Empty_Threshold_Assert_Value_wrch {14} \
|
||||
CONFIG.FIFO_Implementation_rach {Common_Clock_Distributed_RAM} \
|
||||
CONFIG.FIFO_Implementation_wach {Common_Clock_Distributed_RAM} \
|
||||
CONFIG.FIFO_Implementation_wrch {Common_Clock_Distributed_RAM} \
|
||||
CONFIG.Full_Flags_Reset_Value {1} \
|
||||
CONFIG.Full_Threshold_Assert_Value_axis {511} \
|
||||
CONFIG.Full_Threshold_Assert_Value_rach {15} \
|
||||
CONFIG.Full_Threshold_Assert_Value_wach {15} \
|
||||
CONFIG.Full_Threshold_Assert_Value_wrch {15} \
|
||||
CONFIG.INTERFACE_TYPE {AXI_STREAM} \
|
||||
CONFIG.Input_Depth_axis {512} \
|
||||
CONFIG.Reset_Type {Asynchronous_Reset} \
|
||||
CONFIG.TDATA_NUM_BYTES {16} \
|
||||
CONFIG.TKEEP_WIDTH {16} \
|
||||
CONFIG.TSTRB_WIDTH {16} \
|
||||
CONFIG.TUSER_WIDTH {0} \
|
||||
] $store_queue
|
||||
|
||||
# Create instance: xlconcat_1, and set properties
|
||||
set xlconcat_1 [ create_bd_cell -type ip -vlnv xilinx.com:ip:xlconcat:2.1 xlconcat_1 ]
|
||||
set_property -dict [ list \
|
||||
CONFIG.NUM_PORTS {5} \
|
||||
] $xlconcat_1
|
||||
|
||||
# Create and connect inp_mem partitions
|
||||
if {${inp_part} > 1} {
|
||||
for {set i 0} {$i < ${inp_part}} {incr i} {
|
||||
# Create instance: inp_mem, and set properties
|
||||
set inp_mem [ create_bd_cell -type ip -vlnv xilinx.com:ip:blk_mem_gen:8.3 inp_mem_${i} ]
|
||||
set_property -dict [ list \
|
||||
CONFIG.Byte_Size {8} \
|
||||
CONFIG.Enable_32bit_Address {true} \
|
||||
CONFIG.Enable_B {Use_ENB_Pin} \
|
||||
CONFIG.Memory_Type {True_Dual_Port_RAM} \
|
||||
CONFIG.Read_Width_A $inp_bus_width \
|
||||
CONFIG.Read_Width_B $inp_bus_width \
|
||||
CONFIG.Register_PortA_Output_of_Memory_Primitives {false} \
|
||||
CONFIG.Register_PortB_Output_of_Memory_Primitives {false} \
|
||||
CONFIG.Use_Byte_Write_Enable {true} \
|
||||
CONFIG.Use_RSTA_Pin {true} \
|
||||
CONFIG.Use_RSTB_Pin {true} \
|
||||
CONFIG.Write_Depth_A $inp_mem_depth \
|
||||
CONFIG.Write_Width_A $inp_bus_width \
|
||||
CONFIG.Write_Width_B $inp_bus_width \
|
||||
CONFIG.use_bram_block {BRAM_Controller} \
|
||||
] $inp_mem
|
||||
# Create interface connections
|
||||
connect_bd_intf_net -intf_net load_0_inp_mem_${i}_V_PORTA \
|
||||
[get_bd_intf_pins $inp_mem/BRAM_PORTA] \
|
||||
[get_bd_intf_pins load_0/inp_mem_${i}_V_PORTA]
|
||||
connect_bd_intf_net -intf_net compute_0_inp_mem_${i}_V_PORTA \
|
||||
[get_bd_intf_pins compute_0/inp_mem_${i}_V_PORTA] \
|
||||
[get_bd_intf_pins $inp_mem/BRAM_PORTB]
|
||||
}
|
||||
} else {
|
||||
# Create instance: inp_mem, and set properties
|
||||
set inp_mem [ create_bd_cell -type ip -vlnv xilinx.com:ip:blk_mem_gen:8.3 inp_mem ]
|
||||
set_property -dict [ list \
|
||||
CONFIG.Byte_Size {8} \
|
||||
CONFIG.Enable_32bit_Address {true} \
|
||||
CONFIG.Enable_B {Use_ENB_Pin} \
|
||||
CONFIG.Memory_Type {True_Dual_Port_RAM} \
|
||||
CONFIG.Read_Width_A $inp_bus_width \
|
||||
CONFIG.Read_Width_B $inp_bus_width \
|
||||
CONFIG.Register_PortA_Output_of_Memory_Primitives {false} \
|
||||
CONFIG.Register_PortB_Output_of_Memory_Primitives {false} \
|
||||
CONFIG.Use_Byte_Write_Enable {true} \
|
||||
CONFIG.Use_RSTA_Pin {true} \
|
||||
CONFIG.Use_RSTB_Pin {true} \
|
||||
CONFIG.Write_Depth_A $inp_mem_depth \
|
||||
CONFIG.Write_Width_A $inp_bus_width \
|
||||
CONFIG.Write_Width_B $inp_bus_width \
|
||||
CONFIG.use_bram_block {BRAM_Controller} \
|
||||
] $inp_mem
|
||||
# Create interface connections
|
||||
connect_bd_intf_net -intf_net load_0_inp_mem_V_PORTA \
|
||||
[get_bd_intf_pins $inp_mem/BRAM_PORTA] \
|
||||
[get_bd_intf_pins load_0/inp_mem_V_PORTA]
|
||||
connect_bd_intf_net -intf_net compute_0_inp_mem_V_PORTA \
|
||||
[get_bd_intf_pins compute_0/inp_mem_V_PORTA] \
|
||||
[get_bd_intf_pins $inp_mem/BRAM_PORTB]
|
||||
}
|
||||
|
||||
# Create and connect wgt_mem partitions
|
||||
if {${wgt_part} > 1} {
|
||||
for {set i 0} {$i < ${wgt_part}} {incr i} {
|
||||
# Create instance: wgt_mem, and set properties
|
||||
set wgt_mem [ create_bd_cell -type ip -vlnv xilinx.com:ip:blk_mem_gen:8.3 wgt_mem_${i} ]
|
||||
set_property -dict [ list \
|
||||
CONFIG.Assume_Synchronous_Clk {true} \
|
||||
CONFIG.Byte_Size {8} \
|
||||
CONFIG.Enable_32bit_Address {true} \
|
||||
CONFIG.Enable_B {Use_ENB_Pin} \
|
||||
CONFIG.Memory_Type {True_Dual_Port_RAM} \
|
||||
CONFIG.Read_Width_A $wgt_bus_width \
|
||||
CONFIG.Read_Width_B $wgt_bus_width \
|
||||
CONFIG.Register_PortA_Output_of_Memory_Primitives {false} \
|
||||
CONFIG.Register_PortB_Output_of_Memory_Primitives {false} \
|
||||
CONFIG.Use_Byte_Write_Enable {true} \
|
||||
CONFIG.Use_RSTA_Pin {true} \
|
||||
CONFIG.Use_RSTB_Pin {true} \
|
||||
CONFIG.Write_Depth_A $wgt_mem_depth \
|
||||
CONFIG.Write_Width_A $wgt_bus_width \
|
||||
CONFIG.Write_Width_B $wgt_bus_width \
|
||||
] $wgt_mem
|
||||
# Create interface connections
|
||||
connect_bd_intf_net -intf_net load_0_wgt_mem_${i}_V_PORTA \
|
||||
[get_bd_intf_pins load_0/wgt_mem_${i}_V_PORTA] \
|
||||
[get_bd_intf_pins $wgt_mem/BRAM_PORTA]
|
||||
connect_bd_intf_net -intf_net compute_0_wgt_mem_${i}_V_PORTA \
|
||||
[get_bd_intf_pins compute_0/wgt_mem_${i}_V_PORTA] \
|
||||
[get_bd_intf_pins $wgt_mem/BRAM_PORTB]
|
||||
}
|
||||
} else {
|
||||
# Create instance: wgt_mem, and set properties
|
||||
set wgt_mem [ create_bd_cell -type ip -vlnv xilinx.com:ip:blk_mem_gen:8.3 wgt_mem ]
|
||||
set_property -dict [ list \
|
||||
CONFIG.Assume_Synchronous_Clk {true} \
|
||||
CONFIG.Byte_Size {8} \
|
||||
CONFIG.Enable_32bit_Address {true} \
|
||||
CONFIG.Enable_B {Use_ENB_Pin} \
|
||||
CONFIG.Memory_Type {True_Dual_Port_RAM} \
|
||||
CONFIG.Read_Width_A $wgt_bus_width \
|
||||
CONFIG.Read_Width_B $wgt_bus_width \
|
||||
CONFIG.Register_PortA_Output_of_Memory_Primitives {false} \
|
||||
CONFIG.Register_PortB_Output_of_Memory_Primitives {false} \
|
||||
CONFIG.Use_Byte_Write_Enable {true} \
|
||||
CONFIG.Use_RSTA_Pin {true} \
|
||||
CONFIG.Use_RSTB_Pin {true} \
|
||||
CONFIG.Write_Depth_A $wgt_mem_depth \
|
||||
CONFIG.Write_Width_A $wgt_bus_width \
|
||||
CONFIG.Write_Width_B $wgt_bus_width \
|
||||
] $wgt_mem
|
||||
# Create interface connections
|
||||
connect_bd_intf_net -intf_net load_0_wgt_mem_V_PORTA \
|
||||
[get_bd_intf_pins load_0/wgt_mem_V_PORTA] \
|
||||
[get_bd_intf_pins $wgt_mem/BRAM_PORTA]
|
||||
connect_bd_intf_net -intf_net compute_0_wgt_mem_V_PORTA \
|
||||
[get_bd_intf_pins compute_0/wgt_mem_V_PORTA] \
|
||||
[get_bd_intf_pins $wgt_mem/BRAM_PORTB]
|
||||
}
|
||||
|
||||
# Create and connect out_mem partitions
|
||||
if {${out_part} > 1} {
|
||||
for {set i 0} {$i < ${out_part}} {incr i} {
|
||||
# Create instance: out_mem, and set properties
|
||||
set out_mem [ create_bd_cell -type ip -vlnv xilinx.com:ip:blk_mem_gen:8.3 out_mem_${i} ]
|
||||
set_property -dict [ list \
|
||||
CONFIG.Byte_Size {8} \
|
||||
CONFIG.Enable_32bit_Address {true} \
|
||||
CONFIG.Enable_B {Use_ENB_Pin} \
|
||||
CONFIG.Memory_Type {True_Dual_Port_RAM} \
|
||||
CONFIG.Read_Width_A $out_bus_width \
|
||||
CONFIG.Read_Width_B $out_bus_width \
|
||||
CONFIG.Register_PortA_Output_of_Memory_Primitives {false} \
|
||||
CONFIG.Register_PortB_Output_of_Memory_Primitives {false} \
|
||||
CONFIG.Use_Byte_Write_Enable {true} \
|
||||
CONFIG.Use_RSTA_Pin {true} \
|
||||
CONFIG.Use_RSTB_Pin {true} \
|
||||
CONFIG.Write_Depth_A $out_mem_depth \
|
||||
CONFIG.Write_Width_A $out_bus_width \
|
||||
CONFIG.Write_Width_B $out_bus_width \
|
||||
CONFIG.use_bram_block {BRAM_Controller} \
|
||||
] $out_mem
|
||||
# Create interface connections
|
||||
connect_bd_intf_net -intf_net compute_0_out_mem_${i}_V_PORTA \
|
||||
[get_bd_intf_pins compute_0/out_mem_${i}_V_PORTA] \
|
||||
[get_bd_intf_pins $out_mem/BRAM_PORTA]
|
||||
connect_bd_intf_net -intf_net store_0_out_mem_${i}_V_PORTA \
|
||||
[get_bd_intf_pins $out_mem/BRAM_PORTB] \
|
||||
[get_bd_intf_pins store_0/out_mem_${i}_V_PORTA]
|
||||
}
|
||||
} else {
|
||||
# Create instance: out_mem, and set properties
|
||||
set out_mem [ create_bd_cell -type ip -vlnv xilinx.com:ip:blk_mem_gen:8.3 out_mem ]
|
||||
set_property -dict [ list \
|
||||
CONFIG.Byte_Size {8} \
|
||||
CONFIG.Enable_32bit_Address {true} \
|
||||
CONFIG.Enable_B {Use_ENB_Pin} \
|
||||
CONFIG.Memory_Type {True_Dual_Port_RAM} \
|
||||
CONFIG.Read_Width_A $out_bus_width \
|
||||
CONFIG.Read_Width_B $out_bus_width \
|
||||
CONFIG.Register_PortA_Output_of_Memory_Primitives {false} \
|
||||
CONFIG.Register_PortB_Output_of_Memory_Primitives {false} \
|
||||
CONFIG.Use_Byte_Write_Enable {true} \
|
||||
CONFIG.Use_RSTA_Pin {true} \
|
||||
CONFIG.Use_RSTB_Pin {true} \
|
||||
CONFIG.Write_Depth_A $out_mem_depth \
|
||||
CONFIG.Write_Width_A $out_bus_width \
|
||||
CONFIG.Write_Width_B $out_bus_width \
|
||||
CONFIG.use_bram_block {BRAM_Controller} \
|
||||
] $out_mem
|
||||
# Create interface connections
|
||||
connect_bd_intf_net -intf_net compute_0_out_mem_V_PORTA \
|
||||
[get_bd_intf_pins compute_0/out_mem_V_PORTA] \
|
||||
[get_bd_intf_pins $out_mem/BRAM_PORTA]
|
||||
connect_bd_intf_net -intf_net store_0_out_mem_V_PORTA \
|
||||
[get_bd_intf_pins $out_mem/BRAM_PORTB] \
|
||||
[get_bd_intf_pins store_0/out_mem_V_PORTA]
|
||||
}
|
||||
|
||||
# Create interface connections
|
||||
connect_bd_intf_net -intf_net axi_interconnect_1_M01_AXI \
|
||||
[get_bd_intf_pins axi_interconnect_1/M01_AXI] \
|
||||
[get_bd_intf_pins fetch_0/s_axi_CONTROL_BUS]
|
||||
connect_bd_intf_net -intf_net axi_interconnect_1_M02_AXI \
|
||||
[get_bd_intf_pins axi_interconnect_1/M02_AXI] \
|
||||
[get_bd_intf_pins load_0/s_axi_CONTROL_BUS]
|
||||
connect_bd_intf_net -intf_net axi_interconnect_1_M03_AXI \
|
||||
[get_bd_intf_pins axi_interconnect_1/M03_AXI] \
|
||||
[get_bd_intf_pins compute_0/s_axi_CONTROL_BUS]
|
||||
connect_bd_intf_net -intf_net axi_interconnect_1_M04_AXI \
|
||||
[get_bd_intf_pins axi_interconnect_1/M04_AXI] \
|
||||
[get_bd_intf_pins store_0/s_axi_CONTROL_BUS]
|
||||
connect_bd_intf_net -intf_net axi_smc_M00_AXI \
|
||||
[get_bd_intf_pins axi_smc/M00_AXI] \
|
||||
[get_bd_intf_pins processing_system7_1/S_AXI_ACP]
|
||||
connect_bd_intf_net -intf_net compute_0_g2l_dep_queue_V \
|
||||
[get_bd_intf_pins compute_0/g2l_dep_queue_V] \
|
||||
[get_bd_intf_pins g2l_queue/S_AXIS]
|
||||
connect_bd_intf_net -intf_net compute_0_g2s_dep_queue_V \
|
||||
[get_bd_intf_pins compute_0/g2s_dep_queue_V] \
|
||||
[get_bd_intf_pins g2s_queue/S_AXIS]
|
||||
connect_bd_intf_net -intf_net compute_0_m_axi_data_port \
|
||||
[get_bd_intf_pins axi_smc/S02_AXI] \
|
||||
[get_bd_intf_pins compute_0/m_axi_data_port]
|
||||
connect_bd_intf_net -intf_net compute_0_m_axi_uop_port \
|
||||
[get_bd_intf_pins axi_smc/S01_AXI] \
|
||||
[get_bd_intf_pins compute_0/m_axi_uop_port]
|
||||
connect_bd_intf_net -intf_net fetch_0_gemm_queue_V_V \
|
||||
[get_bd_intf_pins fetch_0/gemm_queue_V_V] \
|
||||
[get_bd_intf_pins gemm_queue/S_AXIS]
|
||||
connect_bd_intf_net -intf_net fetch_0_l2g_dep_queue_V \
|
||||
[get_bd_intf_pins l2g_queue/S_AXIS] \
|
||||
[get_bd_intf_pins load_0/l2g_dep_queue_V]
|
||||
connect_bd_intf_net -intf_net fetch_0_load_queue_V_V \
|
||||
[get_bd_intf_pins fetch_0/load_queue_V_V] \
|
||||
[get_bd_intf_pins load_queue/S_AXIS]
|
||||
connect_bd_intf_net -intf_net fetch_0_m_axi_ins_port \
|
||||
[get_bd_intf_pins axi_smc/S00_AXI] \
|
||||
[get_bd_intf_pins fetch_0/m_axi_ins_port]
|
||||
connect_bd_intf_net -intf_net fetch_0_store_queue_V_V \
|
||||
[get_bd_intf_pins fetch_0/store_queue_V_V] \
|
||||
[get_bd_intf_pins store_queue/S_AXIS]
|
||||
connect_bd_intf_net -intf_net g2l_queue_M_AXIS \
|
||||
[get_bd_intf_pins g2l_queue/M_AXIS] \
|
||||
[get_bd_intf_pins load_0/g2l_dep_queue_V]
|
||||
connect_bd_intf_net -intf_net g2s_queue_M_AXIS \
|
||||
[get_bd_intf_pins g2s_queue/M_AXIS] \
|
||||
[get_bd_intf_pins store_0/g2s_dep_queue_V]
|
||||
connect_bd_intf_net -intf_net gemm_queue_M_AXIS \
|
||||
[get_bd_intf_pins compute_0/gemm_queue_V_V] \
|
||||
[get_bd_intf_pins gemm_queue/M_AXIS]
|
||||
connect_bd_intf_net -intf_net l2g_queue_M_AXIS \
|
||||
[get_bd_intf_pins compute_0/l2g_dep_queue_V] \
|
||||
[get_bd_intf_pins l2g_queue/M_AXIS]
|
||||
connect_bd_intf_net -intf_net load_0_m_axi_data_port \
|
||||
[get_bd_intf_pins axi_smc/S03_AXI] \
|
||||
[get_bd_intf_pins load_0/m_axi_data_port]
|
||||
connect_bd_intf_net -intf_net load_queue_M_AXIS \
|
||||
[get_bd_intf_pins load_0/load_queue_V_V] \
|
||||
[get_bd_intf_pins load_queue/M_AXIS]
|
||||
connect_bd_intf_net -intf_net processing_system7_1_axi_periph_m00_axi \
|
||||
[get_bd_intf_pins axi_interconnect_1/M00_AXI] \
|
||||
[get_bd_intf_pins axi_timer_1/S_AXI]
|
||||
connect_bd_intf_net -intf_net processing_system7_1_ddr \
|
||||
[get_bd_intf_ports DDR] \
|
||||
[get_bd_intf_pins processing_system7_1/DDR]
|
||||
connect_bd_intf_net -intf_net processing_system7_1_fixed_io \
|
||||
[get_bd_intf_ports FIXED_IO] \
|
||||
[get_bd_intf_pins processing_system7_1/FIXED_IO]
|
||||
connect_bd_intf_net -intf_net processing_system7_1_m_axi_gp0 \
|
||||
[get_bd_intf_pins axi_interconnect_1/S00_AXI] \
|
||||
[get_bd_intf_pins processing_system7_1/M_AXI_GP0]
|
||||
connect_bd_intf_net -intf_net s2g_queue_M_AXIS \
|
||||
[get_bd_intf_pins compute_0/s2g_dep_queue_V] \
|
||||
[get_bd_intf_pins s2g_queue/M_AXIS]
|
||||
connect_bd_intf_net -intf_net store_0_m_axi_data_port \
|
||||
[get_bd_intf_pins axi_smc/S04_AXI] \
|
||||
[get_bd_intf_pins store_0/m_axi_data_port]
|
||||
connect_bd_intf_net -intf_net store_0_s2g_dep_queue_V \
|
||||
[get_bd_intf_pins s2g_queue/S_AXIS] \
|
||||
[get_bd_intf_pins store_0/s2g_dep_queue_V]
|
||||
connect_bd_intf_net -intf_net store_queue_M_AXIS \
|
||||
[get_bd_intf_pins store_0/store_queue_V_V] \
|
||||
[get_bd_intf_pins store_queue/M_AXIS]
|
||||
|
||||
# Create port connections
|
||||
connect_bd_net -net axi_timer_1_interrupt \
|
||||
[get_bd_pins axi_timer_1/interrupt] \
|
||||
[get_bd_pins xlconcat_1/In0]
|
||||
connect_bd_net -net compute_0_interrupt \
|
||||
[get_bd_pins compute_0/interrupt] \
|
||||
[get_bd_pins xlconcat_1/In3]
|
||||
connect_bd_net -net fetch_0_interrupt \
|
||||
[get_bd_pins fetch_0/interrupt] \
|
||||
[get_bd_pins xlconcat_1/In1]
|
||||
connect_bd_net -net load_0_interrupt \
|
||||
[get_bd_pins load_0/interrupt] \
|
||||
[get_bd_pins xlconcat_1/In2]
|
||||
connect_bd_net -net proc_sys_reset_interconnect_aresetn \
|
||||
[get_bd_pins axi_interconnect_1/ARESETN] \
|
||||
[get_bd_pins proc_sys_reset/interconnect_aresetn]
|
||||
connect_bd_net -net proc_sys_reset_peripheral_aresetn \
|
||||
[get_bd_pins axi_interconnect_1/M00_ARESETN] \
|
||||
[get_bd_pins axi_interconnect_1/M01_ARESETN] \
|
||||
[get_bd_pins axi_interconnect_1/M02_ARESETN] \
|
||||
[get_bd_pins axi_interconnect_1/M03_ARESETN] \
|
||||
[get_bd_pins axi_interconnect_1/M04_ARESETN] \
|
||||
[get_bd_pins axi_interconnect_1/S00_ARESETN] \
|
||||
[get_bd_pins axi_smc/aresetn] \
|
||||
[get_bd_pins axi_timer_1/s_axi_aresetn] \
|
||||
[get_bd_pins compute_0/ap_rst_n] \
|
||||
[get_bd_pins fetch_0/ap_rst_n] \
|
||||
[get_bd_pins g2l_queue/s_aresetn] \
|
||||
[get_bd_pins g2s_queue/s_aresetn] \
|
||||
[get_bd_pins gemm_queue/s_aresetn] \
|
||||
[get_bd_pins l2g_queue/s_aresetn] \
|
||||
[get_bd_pins load_0/ap_rst_n] \
|
||||
[get_bd_pins load_queue/s_aresetn] \
|
||||
[get_bd_pins proc_sys_reset/peripheral_aresetn] \
|
||||
[get_bd_pins s2g_queue/s_aresetn] \
|
||||
[get_bd_pins store_0/ap_rst_n] \
|
||||
[get_bd_pins store_queue/s_aresetn]
|
||||
connect_bd_net -net processing_system7_1_FCLK_CLK \
|
||||
[get_bd_pins axi_interconnect_1/ACLK] \
|
||||
[get_bd_pins axi_interconnect_1/M00_ACLK] \
|
||||
[get_bd_pins axi_interconnect_1/M01_ACLK] \
|
||||
[get_bd_pins axi_interconnect_1/M02_ACLK] \
|
||||
[get_bd_pins axi_interconnect_1/M03_ACLK] \
|
||||
[get_bd_pins axi_interconnect_1/M04_ACLK] \
|
||||
[get_bd_pins axi_interconnect_1/S00_ACLK] \
|
||||
[get_bd_pins axi_smc/aclk] \
|
||||
[get_bd_pins axi_timer_1/s_axi_aclk] \
|
||||
[get_bd_pins compute_0/ap_clk] \
|
||||
[get_bd_pins fetch_0/ap_clk] \
|
||||
[get_bd_pins g2l_queue/s_aclk] \
|
||||
[get_bd_pins g2s_queue/s_aclk] \
|
||||
[get_bd_pins gemm_queue/s_aclk] \
|
||||
[get_bd_pins l2g_queue/s_aclk] \
|
||||
[get_bd_pins load_0/ap_clk] \
|
||||
[get_bd_pins load_queue/s_aclk] \
|
||||
[get_bd_pins proc_sys_reset/slowest_sync_clk] \
|
||||
[get_bd_pins processing_system7_1/FCLK_CLK${clk}] \
|
||||
[get_bd_pins processing_system7_1/M_AXI_GP0_ACLK] \
|
||||
[get_bd_pins processing_system7_1/S_AXI_ACP_ACLK] \
|
||||
[get_bd_pins s2g_queue/s_aclk] \
|
||||
[get_bd_pins store_0/ap_clk] \
|
||||
[get_bd_pins store_queue/s_aclk]
|
||||
connect_bd_net -net processing_system7_1_fclk_reset0_n \
|
||||
[get_bd_pins proc_sys_reset/ext_reset_in] \
|
||||
[get_bd_pins processing_system7_1/FCLK_RESET0_N]
|
||||
connect_bd_net -net store_0_interrupt \
|
||||
[get_bd_pins store_0/interrupt] \
|
||||
[get_bd_pins xlconcat_1/In4]
|
||||
connect_bd_net -net xlconcat_1_dout \
|
||||
[get_bd_pins processing_system7_1/IRQ_F2P] \
|
||||
[get_bd_pins xlconcat_1/dout]
|
||||
|
||||
# Create address segments
|
||||
create_bd_addr_seg -range 0x40000000 -offset 0x00000000 \
|
||||
[get_bd_addr_spaces compute_0/Data_m_axi_uop_port] \
|
||||
[get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_DDR_LOWOCM] \
|
||||
SEG_processing_system7_1_ACP_DDR_LOWOCM
|
||||
create_bd_addr_seg -range 0x40000000 -offset 0x00000000 \
|
||||
[get_bd_addr_spaces compute_0/Data_m_axi_data_port] \
|
||||
[get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_DDR_LOWOCM] \
|
||||
SEG_processing_system7_1_ACP_DDR_LOWOCM
|
||||
create_bd_addr_seg -range 0x00040000 -offset 0xFFFC0000 \
|
||||
[get_bd_addr_spaces compute_0/Data_m_axi_uop_port] \
|
||||
[get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_HIGH_OCM] \
|
||||
SEG_processing_system7_1_ACP_HIGH_OCM
|
||||
create_bd_addr_seg -range 0x00040000 -offset 0xFFFC0000 \
|
||||
[get_bd_addr_spaces compute_0/Data_m_axi_data_port] \
|
||||
[get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_HIGH_OCM] \
|
||||
SEG_processing_system7_1_ACP_HIGH_OCM
|
||||
create_bd_addr_seg -range 0x00400000 -offset 0xE0000000 \
|
||||
[get_bd_addr_spaces compute_0/Data_m_axi_uop_port] \
|
||||
[get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_IOP] \
|
||||
SEG_processing_system7_1_ACP_IOP
|
||||
create_bd_addr_seg -range 0x00400000 -offset 0xE0000000 \
|
||||
[get_bd_addr_spaces compute_0/Data_m_axi_data_port] \
|
||||
[get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_IOP] \
|
||||
SEG_processing_system7_1_ACP_IOP
|
||||
create_bd_addr_seg -range 0x40000000 -offset 0x40000000 \
|
||||
[get_bd_addr_spaces compute_0/Data_m_axi_uop_port] \
|
||||
[get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_M_AXI_GP0] \
|
||||
SEG_processing_system7_1_ACP_M_AXI_GP0
|
||||
create_bd_addr_seg -range 0x40000000 -offset 0x40000000 \
|
||||
[get_bd_addr_spaces compute_0/Data_m_axi_data_port] \
|
||||
[get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_M_AXI_GP0] \
|
||||
SEG_processing_system7_1_ACP_M_AXI_GP0
|
||||
create_bd_addr_seg -range 0x40000000 -offset 0x00000000 \
|
||||
[get_bd_addr_spaces fetch_0/Data_m_axi_ins_port] \
|
||||
[get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_DDR_LOWOCM] \
|
||||
SEG_processing_system7_1_ACP_DDR_LOWOCM
|
||||
create_bd_addr_seg -range 0x00040000 -offset 0xFFFC0000 \
|
||||
[get_bd_addr_spaces fetch_0/Data_m_axi_ins_port] \
|
||||
[get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_HIGH_OCM] \
|
||||
SEG_processing_system7_1_ACP_HIGH_OCM
|
||||
create_bd_addr_seg -range 0x00400000 -offset 0xE0000000 \
|
||||
[get_bd_addr_spaces fetch_0/Data_m_axi_ins_port] \
|
||||
[get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_IOP] \
|
||||
SEG_processing_system7_1_ACP_IOP
|
||||
create_bd_addr_seg -range 0x40000000 -offset 0x40000000 \
|
||||
[get_bd_addr_spaces fetch_0/Data_m_axi_ins_port] \
|
||||
[get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_M_AXI_GP0] \
|
||||
SEG_processing_system7_1_ACP_M_AXI_GP0
|
||||
create_bd_addr_seg -range 0x40000000 -offset 0x00000000 \
|
||||
[get_bd_addr_spaces load_0/Data_m_axi_data_port] \
|
||||
[get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_DDR_LOWOCM] \
|
||||
SEG_processing_system7_1_ACP_DDR_LOWOCM
|
||||
create_bd_addr_seg -range 0x00040000 -offset 0xFFFC0000 \
|
||||
[get_bd_addr_spaces load_0/Data_m_axi_data_port] \
|
||||
[get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_HIGH_OCM] \
|
||||
SEG_processing_system7_1_ACP_HIGH_OCM
|
||||
create_bd_addr_seg -range 0x00400000 -offset 0xE0000000 \
|
||||
[get_bd_addr_spaces load_0/Data_m_axi_data_port] \
|
||||
[get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_IOP] \
|
||||
SEG_processing_system7_1_ACP_IOP
|
||||
create_bd_addr_seg -range 0x40000000 -offset 0x40000000 \
|
||||
[get_bd_addr_spaces load_0/Data_m_axi_data_port] \
|
||||
[get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_M_AXI_GP0] \
|
||||
SEG_processing_system7_1_ACP_M_AXI_GP0
|
||||
create_bd_addr_seg -range 0x00010000 -offset 0x42800000 \
|
||||
[get_bd_addr_spaces processing_system7_1/Data] \
|
||||
[get_bd_addr_segs axi_timer_1/S_AXI/Reg] SEG_axi_timer_1_Reg
|
||||
create_bd_addr_seg -range 0x00010000 -offset 0x43C10000 \
|
||||
[get_bd_addr_spaces processing_system7_1/Data] \
|
||||
[get_bd_addr_segs compute_0/s_axi_CONTROL_BUS/Reg] SEG_compute_0_Reg
|
||||
create_bd_addr_seg -range 0x00010000 -offset 0x43C00000 \
|
||||
[get_bd_addr_spaces processing_system7_1/Data] \
|
||||
[get_bd_addr_segs fetch_0/s_axi_CONTROL_BUS/Reg] SEG_fetch_0_Reg
|
||||
create_bd_addr_seg -range 0x00010000 -offset 0x43C20000 \
|
||||
[get_bd_addr_spaces processing_system7_1/Data] \
|
||||
[get_bd_addr_segs load_0/s_axi_CONTROL_BUS/Reg] SEG_load_0_Reg
|
||||
create_bd_addr_seg -range 0x00010000 -offset 0x43C30000 \
|
||||
[get_bd_addr_spaces processing_system7_1/Data] \
|
||||
[get_bd_addr_segs store_0/s_axi_CONTROL_BUS/Reg] SEG_store_0_Reg
|
||||
create_bd_addr_seg -range 0x40000000 -offset 0x00000000 \
|
||||
[get_bd_addr_spaces store_0/Data_m_axi_data_port] \
|
||||
[get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_DDR_LOWOCM] \
|
||||
SEG_processing_system7_1_ACP_DDR_LOWOCM
|
||||
create_bd_addr_seg -range 0x00040000 -offset 0xFFFC0000 \
|
||||
[get_bd_addr_spaces store_0/Data_m_axi_data_port] \
|
||||
[get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_HIGH_OCM] \
|
||||
SEG_processing_system7_1_ACP_HIGH_OCM
|
||||
create_bd_addr_seg -range 0x00400000 -offset 0xE0000000 \
|
||||
[get_bd_addr_spaces store_0/Data_m_axi_data_port] \
|
||||
[get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_IOP] \
|
||||
SEG_processing_system7_1_ACP_IOP
|
||||
create_bd_addr_seg -range 0x40000000 -offset 0x40000000 \
|
||||
[get_bd_addr_spaces store_0/Data_m_axi_data_port] \
|
||||
[get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_M_AXI_GP0] \
|
||||
SEG_processing_system7_1_ACP_M_AXI_GP0
|
||||
|
||||
|
||||
# Restore current instance
|
||||
current_bd_instance $oldCurInst
|
||||
|
||||
save_bd_design
|
||||
}
|
||||
# End of create_root_design()
|
||||
|
||||
|
||||
##################################################################
|
||||
# MAIN FLOW
|
||||
##################################################################
|
||||
|
||||
create_root_design "" $clock_id $inp_part $wgt_part $out_part $inp_bus_width \
|
||||
$inp_mem_depth $wgt_bus_width $wgt_mem_depth $out_bus_width $out_mem_depth
|
||||
|
||||
# Create top-level wrapper file
|
||||
make_wrapper -files \
|
||||
[get_files $proj_path/$proj_name.srcs/sources_1/bd/$proj_name/$proj_name.bd] -top
|
||||
add_files -norecurse $proj_path/$proj_name.srcs/sources_1/bd/$proj_name/hdl/${proj_name}_wrapper.v
|
||||
update_compile_order -fileset sources_1
|
||||
update_compile_order -fileset sim_1
|
||||
|
||||
# Run bitstream generation on $num_threads threads; the performance oriented P&R strategy below is currently commented out
|
||||
# create_run impl_1 -parent_run synth_1 -flow {Vivado Implementation 2017} \
|
||||
# -strategy "Performance_ExplorePostRoutePhysOpt"
|
||||
launch_runs impl_1 -to_step write_bitstream -jobs $num_threads
|
||||
wait_on_run impl_1
|
||||
|
||||
# Export hardware description file and bitstream files to export/ dir
|
||||
file mkdir $proj_path/export
|
||||
file copy -force $proj_path/$proj_name.runs/impl_1/${proj_name}_wrapper.sysdef \
|
||||
$proj_path/export/vta.hdf
|
||||
file copy -force $proj_path/$proj_name.runs/impl_1/${proj_name}_wrapper.bit \
|
||||
$proj_path/export/vta.bit
|
||||
|
||||
exit
|
|
@ -0,0 +1,62 @@
|
|||
/*!
|
||||
* Copyright (c) 2018 by Contributors
|
||||
* \file vta_test.cpp
|
||||
* \brief Simulation tests for the VTA design.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <iostream>
|
||||
|
||||
#include "vta.h"
|
||||
#include "vta_test_lib.h"
|
||||
|
||||
/*!
 * \brief Simulation test entry point.
 * When DEBUG is 1, prints the design parameters first. Then asserts the
 * expected relationships between the configuration constants (buffer depth,
 * micro-op field bounds, instruction field alignment and bounds), and runs
 * the ALU tests followed by the blocked GEMM tests.
 * \return Bitwise OR of all test return values: 0 only if every test
 * returned 0.
 */
int main(void)
|
||||
{
|
||||
|
||||
#if DEBUG==1
|
||||
printParameters();
|
||||
#endif
|
||||
|
||||
// Buffer indexing
|
||||
assert(LOG_ACC_BUFF_DEPTH>=LOG_INP_BUFF_DEPTH);
|
||||
// Micro op bound
|
||||
assert(UOP_GEM_3_1<UOP_WIDTH);
|
||||
assert(UOP_ALU_3_1<UOP_WIDTH);
|
||||
// Instruction alignment checks
|
||||
assert(INSN_MEM_7_1<INSN_MEM_8_0);
|
||||
assert(INSN_GEM_8_1<INSN_GEM_9_0);
|
||||
// Instruction bounds
|
||||
assert(INSN_MEM_E_1<INS_WIDTH);
|
||||
assert(INSN_GEM_E_1<INS_WIDTH);
|
||||
assert(INSN_ALU_F_1<INS_WIDTH);
|
||||
|
||||
int status = 0;
|
||||
|
||||
// Run ALU test (vector-scalar operators)
|
||||
status |= alu_test(ALU_OPCODE_MIN, true, 16, 128, true);
|
||||
status |= alu_test(ALU_OPCODE_MIN, true, 16, 128, false);
|
||||
status |= alu_test(ALU_OPCODE_MAX, true, 16, 128, true);
|
||||
status |= alu_test(ALU_OPCODE_MAX, true, 16, 128, false);
|
||||
status |= alu_test(ALU_OPCODE_ADD, true, 16, 128, true);
|
||||
status |= alu_test(ALU_OPCODE_ADD, true, 16, 128, false);
|
||||
status |= alu_test(ALU_OPCODE_SHR, true, 16, 128, true);
|
||||
status |= alu_test(ALU_OPCODE_SHR, true, 16, 128, false);
|
||||
|
||||
// Run ALU test (vector-vector operators)
|
||||
status |= alu_test(ALU_OPCODE_MIN, false, 16, 128, true);
|
||||
status |= alu_test(ALU_OPCODE_MIN, false, 16, 128, false);
|
||||
status |= alu_test(ALU_OPCODE_MAX, false, 16, 128, true);
|
||||
status |= alu_test(ALU_OPCODE_MAX, false, 16, 128, false);
|
||||
status |= alu_test(ALU_OPCODE_ADD, false, 16, 128, true);
|
||||
status |= alu_test(ALU_OPCODE_ADD, false, 16, 128, false);
|
||||
|
||||
// Run blocked GEMM test
|
||||
status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, true, 2);
|
||||
status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, false, 2);
|
||||
status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, true, 1);
|
||||
status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, false, 1);
|
||||
|
||||
return status;
|
||||
|
||||
}
|
|
@ -0,0 +1,137 @@
|
|||
/*!
|
||||
* Copyright (c) 2018 by Contributors
|
||||
* \file vta.h
|
||||
* \brief Type definitions and prototypes for the VTA HLS design.
|
||||
*/
|
||||
#ifndef VTA_MAIN_H_
|
||||
#define VTA_MAIN_H_
|
||||
|
||||
#include <assert.h>
|
||||
#include <ap_axi_sdata.h>
|
||||
#include <ap_int.h>
|
||||
#include <hls_stream.h>
|
||||
|
||||
#include "vta_typedefs.h"
|
||||
#include "vta_params.h"
|
||||
|
||||
/*!
|
||||
* \brief Fetch module.
|
||||
* Reads in \a insn_count instructions via DMA and pushes them to the
|
||||
* appropriate load, gemm or store queue.
|
||||
* \param insns Instruction data base address in DRAM. AXI-4 master port.
|
||||
* \param insn_count Total instruction count. AXI-lite memory mapped register.
|
||||
* \param load_queue Load instruction queue. AXI-stream FIFO.
|
||||
* \param gemm_queue GEMM instruction queue. AXI-stream FIFO.
|
||||
* \param store_queue Store instruction queue. AXI-stream FIFO.
|
||||
*/
|
||||
void fetch (
|
||||
uint32_t insn_count,
|
||||
volatile insn_T *insns,
|
||||
hls::stream<insn_T> &load_queue,
|
||||
hls::stream<insn_T> &gemm_queue,
|
||||
hls::stream<insn_T> &store_queue);
|
||||
|
||||
/*!
|
||||
* \brief Load module.
|
||||
* Reads in load instructions from the load queue, and performs appropriate
|
||||
* DMA load operation to the \a wgt_mem and \a inp_mem SRAM buffers from DRAM.
|
||||
* Updates dependence queues accordingly.
|
||||
* \param inputs Input data base address in DRAM. AXI-4 master port.
|
||||
* \param weights Weight data base address in DRAM. AXI-4 master port.
|
||||
* \param load_queue Load instruction queue. AXI-stream FIFO.
|
||||
* \param g2l_dep_queue Dependence queue from GEMM to load stage.
|
||||
* AXI-stream FIFO.
|
||||
* \param l2g_dep_queue Dependence queue from load to GEMM stage.
|
||||
* AXI-stream FIFO.
|
||||
* \param inp_mem Local input SRAM buffer. Write only single port BRAM.
|
||||
* \param wgt_mem Local weight SRAM buffer. Write only single port BRAM.
|
||||
*/
|
||||
void load (
|
||||
volatile inp_vec_T *inputs,
|
||||
volatile wgt_vec_T *weights,
|
||||
hls::stream<insn_T> &load_queue,
|
||||
hls::stream<bool> &g2l_dep_queue,
|
||||
hls::stream<bool> &l2g_dep_queue,
|
||||
inp_vec_T inp_mem[INP_BUFF_DEPTH][BATCH],
|
||||
wgt_vec_T wgt_mem[WGT_BUFF_DEPTH][BLOCK_OUT]
|
||||
);
|
||||
|
||||
/*!
|
||||
* \brief Compute module.
|
||||
* Reads in GEMM instructions from the gemm queue, and performs appropriate
|
||||
* GEMM/ALU instructions. Reads in data from the \a wgt_mem and \a inp_mem,
|
||||
* and writes computation results into the \a out_mem. Updates dependence
|
||||
* queues accordingly.
|
||||
* \param done Signal that indicates that VTA is done. AXI-lite memory mapped
|
||||
* register.
|
||||
* \param uops Micro-op data base address in DRAM. AXI-4 master port.
|
||||
* \param biases Bias data base address in DRAM. AXI-4 master port.
|
||||
* \param gemm_queue GEMM instruction queue. AXI-stream FIFO.
|
||||
* \param l2g_dep_queue Dependence queue from load to gemm stage.
|
||||
* AXI-stream FIFO.
|
||||
* \param s2g_dep_queue Dependence queue from store to gemm stage.
|
||||
* AXI-stream FIFO.
|
||||
* \param g2l_dep_queue Dependence queue from gemm to load stage.
|
||||
* AXI-stream FIFO.
|
||||
* \param g2s_dep_queue Dependence queue from gemm to store stage.
|
||||
* AXI-stream FIFO.
|
||||
* \param inp_mem Local input SRAM buffer. Read only single port BRAM.
|
||||
* \param wgt_mem Local weight SRAM buffer. Read only single port BRAM.
|
||||
* \param out_mem Local output SRAM buffer. Write only single port BRAM.
|
||||
*/
|
||||
void compute (
|
||||
volatile uint32_t &done,
|
||||
volatile uop_T *uops,
|
||||
volatile acc_vec_T *biases,
|
||||
hls::stream<insn_T> &gemm_queue,
|
||||
hls::stream<bool> &l2g_dep_queue,
|
||||
hls::stream<bool> &s2g_dep_queue,
|
||||
hls::stream<bool> &g2l_dep_queue,
|
||||
hls::stream<bool> &g2s_dep_queue,
|
||||
out_vec_T inp_mem[INP_BUFF_DEPTH][BATCH],
|
||||
wgt_vec_T wgt_mem[WGT_BUFF_DEPTH][BLOCK_OUT],
|
||||
out_vec_T out_mem[ACC_BUFF_DEPTH][BATCH]
|
||||
);
|
||||
|
||||
/*!
|
||||
* \brief Store module.
|
||||
* Reads in store instructions from the store queue, and performs appropriate
|
||||
* store instructions from the output buffer in SRAM to DRAM. Updates dependence
|
||||
* queues accordingly.
|
||||
* \param outputs Output data base address in DRAM. AXI-4 master port.
|
||||
* \param store_queue Store instruction queue. AXI-stream FIFO.
|
||||
* \param g2s_dep_queue Dependence queue from gemm to store stage.
|
||||
* AXI-stream FIFO.
|
||||
* \param s2g_dep_queue Dependence queue from store to gemm stage.
|
||||
* AXI-stream FIFO.
|
||||
* \param out_mem Local output SRAM buffer. Read only single port BRAM.
|
||||
*/
|
||||
void store (
|
||||
volatile out_vec_T *outputs,
|
||||
hls::stream<insn_T> &store_queue,
|
||||
hls::stream<bool> &g2s_dep_queue,
|
||||
hls::stream<bool> &s2g_dep_queue,
|
||||
out_vec_T out_mem[ACC_BUFF_DEPTH][BATCH]
|
||||
);
|
||||
|
||||
/*!
|
||||
* \brief VTA wrapper for simulation purpose only.
|
||||
* Orchestrates dataflow execution of the fetch, load, GEMM and store stages.
|
||||
* \param insn_count Total instruction count. AXI-lite memory mapped register.
|
||||
* \param insns Instruction data base address in DRAM. AXI-4 master port.
|
||||
* \param uops Micro-op data base address in DRAM. AXI-4 master port.
|
||||
* \param inputs Input data base address in DRAM. AXI-4 master port.
|
||||
* \param weights Weight data base address in DRAM. AXI-4 master port.
|
||||
* \param biases Bias data base address in DRAM. AXI-4 master port.
|
||||
* \param outputs Output data base address in DRAM. AXI-4 master port.
|
||||
*/
|
||||
void vta (
|
||||
uint32_t insn_count,
|
||||
volatile insn_T *insns,
|
||||
volatile uop_T *uops,
|
||||
volatile inp_vec_T *inputs,
|
||||
volatile wgt_vec_T *weights,
|
||||
volatile acc_vec_T *biases,
|
||||
volatile out_vec_T *outputs);
|
||||
|
||||
#endif // VTA_MAIN_H_
|
|
@ -0,0 +1,97 @@
|
|||
/*!
|
||||
* Copyright (c) 2018 by Contributors
|
||||
* \file vta_typedefs.h
|
||||
* \brief Type definitions for VTA HLS design.
|
||||
*/
|
||||
#ifndef VTA_TYPEDEFS_H_
|
||||
#define VTA_TYPEDEFS_H_
|
||||
|
||||
#include <assert.h>
|
||||
#include <ap_axi_sdata.h>
|
||||
#include <ap_int.h>
|
||||
#include <hls_stream.h>
|
||||
|
||||
#include "vta_params.h"
|
||||
|
||||
/* \typedef uop_T Micro-op datatype*/
|
||||
typedef ap_uint<UOP_WIDTH> uop_T;
|
||||
|
||||
/* \typedef inp_T Input datatype*/
|
||||
typedef ap_int<INP_WIDTH> inp_T;
|
||||
|
||||
/* \typedef wgt_T Weight datatype*/
|
||||
typedef ap_int<WGT_WIDTH> wgt_T;
|
||||
|
||||
/* \typedef out_T Output datatype*/
|
||||
typedef ap_int<OUT_WIDTH> out_T;
|
||||
|
||||
/* \typedef acc_T Accumulator datatype*/
|
||||
typedef ap_int<ACC_WIDTH> acc_T;
|
||||
|
||||
/* \typedef mul_T Multiplier output datatype*/
|
||||
typedef ap_int<WGT_WIDTH+INP_WIDTH+1> mul_T;
|
||||
|
||||
/* \typedef sum_T GEMM accumulator datatype*/
|
||||
typedef ap_int<WGT_WIDTH+INP_WIDTH+LOG_BLOCK_IN+1> sum_T;
|
||||
|
||||
/* \typedef inp_vec_T Input vector datatype*/
|
||||
typedef ap_uint<INP_WIDTH*BLOCK_IN> inp_vec_T;
|
||||
|
||||
/* \typedef wgt_vec_T Weight vector datatype*/
|
||||
typedef ap_uint<WGT_WIDTH*BLOCK_IN> wgt_vec_T;
|
||||
|
||||
/* \typedef acc_vec_T Accumulator vector datatype*/
|
||||
typedef ap_uint<ACC_WIDTH*BLOCK_OUT> acc_vec_T;
|
||||
|
||||
/* \typedef out_vec_T Output vector datatype*/
|
||||
typedef ap_uint<OUT_WIDTH*BLOCK_OUT> out_vec_T;
|
||||
|
||||
/* \typedef uop_idx_T Micro-op SRAM index datatype*/
|
||||
typedef ap_uint<LOG_UOP_BUFF_DEPTH+1> uop_idx_T;
|
||||
|
||||
/* \typedef inp_idx_T Input SRAM index datatype*/
|
||||
typedef ap_uint<LOG_INP_BUFF_DEPTH+1> inp_idx_T;
|
||||
|
||||
/* \typedef wgt_idx_T Weight SRAM index datatype*/
|
||||
typedef ap_uint<LOG_WGT_BUFF_DEPTH+1> wgt_idx_T;
|
||||
|
||||
/* \typedef acc_idx_T Accumulator SRAM index datatype*/
|
||||
typedef ap_uint<LOG_ACC_BUFF_DEPTH+1> acc_idx_T;
|
||||
|
||||
/* \typedef opcode_T Opcode datatype*/
|
||||
typedef ap_uint<OPCODE_BIT_WIDTH> opcode_T;
|
||||
|
||||
/* \typedef insn_T Instruction datatype*/
|
||||
typedef ap_uint<INS_WIDTH> insn_T;
|
||||
|
||||
/* \typedef loop_T Loop bound datatype*/
|
||||
typedef ap_uint<LOOP_ITER_WIDTH> loop_T;
|
||||
|
||||
/* \typedef memop_id_T Memory operation ID datatype*/
|
||||
typedef ap_uint<MEMOP_ID_BIT_WIDTH> memop_id_T;
|
||||
|
||||
/* \typedef memop_sram_T Memory operation SRAM index datatype*/
|
||||
typedef ap_uint<MEMOP_SRAM_ADDR_BIT_WIDTH> memop_sram_T;
|
||||
|
||||
/* \typedef memop_dram_T Memory operation DRAM index datatype*/
|
||||
typedef ap_uint<MEMOP_DRAM_ADDR_BIT_WIDTH> memop_dram_T;
|
||||
|
||||
/* \typedef memop_size_T Memory operation range datatype*/
|
||||
typedef ap_uint<MEMOP_SIZE_BIT_WIDTH> memop_size_T;
|
||||
|
||||
/* \typedef memop_stride_T Memory operation stride datatype*/
|
||||
typedef ap_uint<MEMOP_STRIDE_BIT_WIDTH> memop_stride_T;
|
||||
|
||||
/* \typedef memop_pad_T Memory operation pad width datatype*/
|
||||
typedef ap_uint<MEMOP_PAD_BIT_WIDTH> memop_pad_T;
|
||||
|
||||
/* \typedef aluop_opcode_T ALU operation opcode datatype*/
|
||||
typedef ap_uint<ALU_OPCODE_BIT_WIDTH> aluop_opcode_T;
|
||||
|
||||
/* \typedef aluop_imm_T ALU operation immediate datatype*/
|
||||
typedef ap_int<ALUOP_IMM_BIT_WIDTH> aluop_imm_T;
|
||||
|
||||
/* \typedef aluop_sh_imm_T ALU operation shift immediate datatype*/
|
||||
typedef ap_uint<LOG_ACC_WIDTH> aluop_sh_imm_T;
|
||||
|
||||
#endif // VTA_TYPEDEFS_H_
|
|
@ -0,0 +1,559 @@
|
|||
/*!
|
||||
* Copyright (c) 2018 by Contributors
|
||||
* \file vta_defines.h
|
||||
* \brief Preprocessor definitions for VTA HLS design and runtime.
|
||||
*/
|
||||
#ifndef VTA_DEFINES_H_
|
||||
#define VTA_DEFINES_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
/*! log2 of instruction data type width */
|
||||
#define LOG_INS_WIDTH 7
|
||||
/*! Instruction data type width */
|
||||
#define INS_WIDTH (1<<LOG_INS_WIDTH)
|
||||
/*! log2 of micro op data type width */
|
||||
#define LOG_UOP_WIDTH 5
|
||||
/*! Micro Op data type width */
|
||||
#define UOP_WIDTH (1<<LOG_UOP_WIDTH)
|
||||
/*! Weight data type width */
|
||||
#define WGT_WIDTH (1<<LOG_WGT_WIDTH)
|
||||
/*! Input data type width */
|
||||
#define INP_WIDTH (1<<LOG_INP_WIDTH)
|
||||
/*! Output data type width */
|
||||
#define OUT_WIDTH (1<<LOG_OUT_WIDTH)
|
||||
/*! Accumulator data type width */
|
||||
#define ACC_WIDTH (1<<LOG_ACC_WIDTH)
|
||||
/*! log2 of ALU data type width */
|
||||
#define LOG_ALU_WIDTH (LOG_ACC_WIDTH-1)
|
||||
/*! ALU data type width */
|
||||
#define ALU_WIDTH (1<<LOG_ALU_WIDTH)
|
||||
|
||||
/*! Batch size (corresponds to A in (A,B)x(B,C) mat mult)*/
|
||||
#define BATCH (1<<LOG_BATCH)
|
||||
/*! Blocking factor of inner most loop (corresponds to B in (A,B)x(B,C) mat mult) */
|
||||
#define BLOCK_IN (1<<LOG_BLOCK_IN)
|
||||
/*! Blocking factor of the outer loop (corresponds to C in (A,B)x(B,C) mat mult) */
|
||||
#define BLOCK_OUT (1<<LOG_BLOCK_OUT)
|
||||
|
||||
/*! Weight vector width */
|
||||
#define WGT_VECTOR_WIDTH (WGT_WIDTH*BLOCK_IN)
|
||||
/*! Input vector width */
|
||||
#define INP_VECTOR_WIDTH (INP_WIDTH*BLOCK_IN)
|
||||
/*! Accumulator vector width */
|
||||
#define ACC_VECTOR_WIDTH (ACC_WIDTH*BLOCK_OUT)
|
||||
/*! Output vector width */
|
||||
#define OUT_VECTOR_WIDTH (OUT_WIDTH*BLOCK_OUT)
|
||||
|
||||
/*! On-chip micro-op buffer size in B */
|
||||
#define UOP_BUFF_SIZE (1<<LOG_UOP_BUFF_SIZE)
|
||||
/*! On-chip weight buffer size in B */
|
||||
#define WGT_BUFF_SIZE (1<<LOG_WGT_BUFF_SIZE)
|
||||
/*! On-chip activation buffer size in B */
|
||||
#define INP_BUFF_SIZE (1<<LOG_INP_BUFF_SIZE)
|
||||
/*! On-chip accumulator buffer size in B */
|
||||
#define ACC_BUFF_SIZE (1<<LOG_ACC_BUFF_SIZE)
|
||||
|
||||
/*! Size of instruction buffer element in B */
|
||||
#define INS_ELEM_BYTES (INS_WIDTH/8)
|
||||
/*! Size of uop buffer element in B*/
|
||||
#define UOP_ELEM_BYTES (UOP_WIDTH/8)
|
||||
/*! Size of activation buffer element in B*/
|
||||
#define INP_ELEM_BYTES (BATCH*BLOCK_IN*INP_WIDTH/8)
|
||||
/*! Size of weight buffer element in B*/
|
||||
#define WGT_ELEM_BYTES (BLOCK_OUT*BLOCK_IN*WGT_WIDTH/8)
|
||||
/*! Size of accumulator buffer element in B*/
|
||||
#define ACC_ELEM_BYTES (BATCH*BLOCK_OUT*ACC_WIDTH/8)
|
||||
|
||||
/*! On-chip micro-op buffer depth */
|
||||
#define UOP_BUFF_DEPTH (UOP_BUFF_SIZE/UOP_ELEM_BYTES)
|
||||
/*! log2 of on-chip micro-op buffer depth */
|
||||
#define LOG_UOP_BUFF_DEPTH (LOG_UOP_BUFF_SIZE-LOG_UOP_WIDTH+3)
|
||||
/*! On-chip weight buffer depth */
|
||||
#define WGT_BUFF_DEPTH (WGT_BUFF_SIZE/WGT_ELEM_BYTES)
|
||||
/*! log2 of on-chip weight buffer depth */
|
||||
#define LOG_WGT_BUFF_DEPTH (LOG_WGT_BUFF_SIZE-LOG_BLOCK_OUT-LOG_BLOCK_IN-LOG_WGT_WIDTH+3)
|
||||
/*! On-chip activation buffer depth */
|
||||
#define INP_BUFF_DEPTH (INP_BUFF_SIZE/INP_ELEM_BYTES)
|
||||
/*! log2 of on-chip activation buffer depth */
|
||||
#define LOG_INP_BUFF_DEPTH (LOG_INP_BUFF_SIZE-LOG_BATCH-LOG_BLOCK_IN-LOG_INP_WIDTH+3)
|
||||
/*! On-chip accumulator buffer depth */
|
||||
#define ACC_BUFF_DEPTH (ACC_BUFF_SIZE/ACC_ELEM_BYTES)
|
||||
/*! log2 of on-chip accumulator buffer depth */
|
||||
#define LOG_ACC_BUFF_DEPTH (LOG_ACC_BUFF_SIZE-LOG_BATCH-LOG_BLOCK_OUT-LOG_ACC_WIDTH+3)
|
||||
|
||||
/*! Instruction opcode field bitwidth */
|
||||
#define OPCODE_BIT_WIDTH 3
|
||||
/*! ALU opcode field bitwidth */
|
||||
#define ALU_OPCODE_BIT_WIDTH 3
|
||||
/*! ALU instruction reset mode bitwidth */
|
||||
#define ALU_RESET_BIT_WIDTH 2
|
||||
|
||||
/*! Opcode: load encoding */
|
||||
#define OPCODE_LOAD 0
|
||||
/*! Opcode: store encoding */
|
||||
#define OPCODE_STORE 1
|
||||
/*! Opcode: GEMM encoding */
|
||||
#define OPCODE_GEMM 2
|
||||
/*! Opcode: finish encoding */
|
||||
#define OPCODE_FINISH 3
|
||||
/*! Opcode: ALU encoding */
|
||||
#define OPCODE_ALU 4
|
||||
|
||||
/*! ALU opcode: binary min op */
|
||||
#define ALU_OPCODE_MIN 0
|
||||
/*! ALU opcode: binary max op */
|
||||
#define ALU_OPCODE_MAX 1
|
||||
/*! ALU opcode: binary add op */
|
||||
#define ALU_OPCODE_ADD 2
|
||||
/*! ALU opcode: binary sub op [NOT IMPLEMENTED] */
|
||||
#define ALU_OPCODE_SUB 3
|
||||
/*! ALU opcode: binary mul op [NOT IMPLEMENTED] */
|
||||
#define ALU_OPCODE_MUL 4
|
||||
/*! ALU opcode: shift left by immediate op */
|
||||
#define ALU_OPCODE_SHL 5
|
||||
/*! ALU opcode: shift right by immediate op [NOT IMPLEMENTED] */
|
||||
#define ALU_OPCODE_SHR 6
|
||||
|
||||
/*! ALU instruction reset mode: set to min */
|
||||
#define ALU_RESET_MIN 3
|
||||
/*! ALU instruction reset mode: set to zero */
|
||||
#define ALU_RESET_ZERO 0
|
||||
/*! ALU instruction reset mode: no reset */
|
||||
#define ALU_NO_RESET 2
|
||||
/*! ALU instruction reset mode: set to max */
|
||||
#define ALU_RESET_MAX 1
|
||||
|
||||
/*! Memory type field bitwidth */
|
||||
#define MEMOP_ID_BIT_WIDTH 2
|
||||
/*! Load/Store Instruction: SRAM address width*/
|
||||
#define MEMOP_SRAM_ADDR_BIT_WIDTH 16
|
||||
/*! Load/Store Instruction: DRAM address width*/
|
||||
#define MEMOP_DRAM_ADDR_BIT_WIDTH 32
|
||||
/*! Load/Store Instruction: transfer size width*/
|
||||
#define MEMOP_SIZE_BIT_WIDTH 16
|
||||
/*! Load/Store Instruction: stride size width*/
|
||||
#define MEMOP_STRIDE_BIT_WIDTH 16
|
||||
/*! Load/Store Instruction: padding width*/
|
||||
#define MEMOP_PAD_BIT_WIDTH 4
|
||||
/*! Load/Store Instruction: padding value encoding width*/
|
||||
#define MEMOP_PAD_VAL_BIT_WIDTH 2
|
||||
/*! ALU Instruction: immediate bitwidth*/
|
||||
#define ALUOP_IMM_BIT_WIDTH 16
|
||||
/*! GEMM/ALU Instruction: loop max iter bits */
|
||||
#define LOOP_ITER_WIDTH 15
|
||||
|
||||
/*! Mem ID constant: uop memory */
|
||||
#define MEM_ID_UOP 0
|
||||
/*! Mem ID constant: weight memory */
|
||||
#define MEM_ID_WGT 1
|
||||
/*! Mem ID constant: input memory */
|
||||
#define MEM_ID_INP 2
|
||||
/*! Mem ID constant: accumulator/bias memory */
|
||||
#define MEM_ID_ACC 3
|
||||
/*! Mem ID constant: output store buffer */
|
||||
#define MEM_ID_OUT 4
|
||||
|
||||
// Instruction organization layout:
|
||||
//
|
||||
// LOAD/STORE
|
||||
// _____________________________|_type______________|
|
||||
// arg 0: opcode | opcode_T |
|
||||
// arg 1: pop_prev_dependence | bool |
|
||||
// arg 2: pop_next_dependence | bool |
|
||||
// arg 3: push_prev_dependence | bool |
|
||||
// arg 4: push_next_dependence | bool |
|
||||
// arg 5: memory_type | memop_id_T |
|
||||
// arg 6: pad_value | memop_pad_val_T |
|
||||
// arg 7: sram_base | memop_sram_T |
|
||||
// arg 8: dram_base | memop_dram_T |
|
||||
// arg 9: y_size | memop_size_T |
|
||||
// arg a: x_size | memop_size_T |
|
||||
// arg b: x_stride | memop_stride_T |
|
||||
// arg c: y_pad_0 | memop_pad_T |
|
||||
// arg d: y_pad_1 | memop_pad_T |
|
||||
// arg e: x_pad_0 | memop_pad_T |
|
||||
// arg f: x_pad_1 | memop_pad_T |
|
||||
//
|
||||
// GEMM
|
||||
// _____________________________|_type______________|
|
||||
// arg 0: opcode | opcode_T |
|
||||
// arg 1: pop_prev_dependence | bool |
|
||||
// arg 2: pop_next_dependence | bool |
|
||||
// arg 3: push_prev_dependence | bool |
|
||||
// arg 4: push_next_dependence | bool |
|
||||
// arg 5: uop_bgn | uop_idx_T |
|
||||
// arg 6: uop_end | uop_idx_T |
|
||||
// arg 7: iteration count ax0 | loop_T |
|
||||
// arg 8: iteration count ax1 | loop_T |
|
||||
// arg 9: accum idx factor ax0 | acc_idx_T |
|
||||
// arg a: accum idx factor ax1 | acc_idx_T |
|
||||
// arg b: input idx factor ax0 | acc_idx_T |
|
||||
// arg c: input idx factor ax1 | acc_idx_T |
|
||||
// arg d: weight idx factor ax0 | wgt_idx_T |
|
||||
// arg e: weight idx factor ax1 | wgt_idx_T |
|
||||
//
|
||||
// ALU
|
||||
// _____________________________|_type______________|
|
||||
// arg 0: opcode | opcode_T |
|
||||
// arg 1: pop_prev_dependence | bool |
|
||||
// arg 2: pop_next_dependence | bool |
|
||||
// arg 3: push_prev_dependence | bool |
|
||||
// arg 4: push_next_dependence | bool |
|
||||
// arg 5: uop_bgn | uop_idx_T |
|
||||
// arg 6: uop_end | uop_idx_T |
|
||||
// arg 7: iteration count ax0 | loop_T |
|
||||
// arg 8: iteration count ax1 | loop_T |
|
||||
// arg 9: dst idx factor ax0 | acc_idx_T |
|
||||
// arg a: dst idx factor ax1 | acc_idx_T |
|
||||
// arg b: src idx factor ax0 | acc_idx_T |
|
||||
// arg c: src idx factor ax1 | acc_idx_T |
|
||||
// arg d: alu_opcode | aluop_opcode_T |
|
||||
// arg e: use_imm | bool |
|
||||
// arg f: imm | aluop_imm_T |
|
||||
|
||||
/*! Load/Store instruction start position of the opcode field */
|
||||
#define INSN_MEM_0_0 0
|
||||
/*! Load/Store instruction end position of the opcode field */
|
||||
#define INSN_MEM_0_1 (INSN_MEM_0_0+OPCODE_BIT_WIDTH-1)
|
||||
/*! Load/Store instruction position of the pop_prev_dep field */
|
||||
#define INSN_MEM_1 (INSN_MEM_0_1+1)
|
||||
/*! Load/Store instruction position of the pop_next_dep field */
|
||||
#define INSN_MEM_2 (INSN_MEM_1+1)
|
||||
/*! Load/Store instruction position of the push_prev_dependence field */
|
||||
#define INSN_MEM_3 (INSN_MEM_2+1)
|
||||
/*! Load/Store instruction position of the push_next_dependence field */
|
||||
#define INSN_MEM_4 (INSN_MEM_3+1)
|
||||
/*! Load/Store instruction start position of the memory_type field */
|
||||
#define INSN_MEM_5_0 (INSN_MEM_4+1)
|
||||
/*! Load/Store instruction end position of the memory_type field */
|
||||
#define INSN_MEM_5_1 (INSN_MEM_5_0+MEMOP_ID_BIT_WIDTH-1)
|
||||
/*! Load/Store instruction start position of the sram_base field */
|
||||
#define INSN_MEM_6_0 (INSN_MEM_5_1+1)
|
||||
/*! Load/Store instruction end position of the sram_base field */
|
||||
#define INSN_MEM_6_1 (INSN_MEM_6_0+MEMOP_SRAM_ADDR_BIT_WIDTH-1)
|
||||
/*! Load/Store instruction start position of the dram_base field */
|
||||
#define INSN_MEM_7_0 (INSN_MEM_6_1+1)
|
||||
/*! Load/Store instruction end position of the dram_base field */
|
||||
#define INSN_MEM_7_1 (INSN_MEM_7_0+MEMOP_DRAM_ADDR_BIT_WIDTH-1)
|
||||
/*! Load/Store instruction start position of the y_size field */
|
||||
#define INSN_MEM_8_0 64
|
||||
/*! Load/Store instruction end position of the y_size field */
|
||||
#define INSN_MEM_8_1 (INSN_MEM_8_0+MEMOP_SIZE_BIT_WIDTH-1)
|
||||
/*! Load/Store instruction start position of the x_size field */
|
||||
#define INSN_MEM_9_0 (INSN_MEM_8_1+1)
|
||||
/*! Load/Store instruction end position of the x_size field */
|
||||
#define INSN_MEM_9_1 (INSN_MEM_9_0+MEMOP_SIZE_BIT_WIDTH-1)
|
||||
/*! Load/Store instruction start position of the x_stride field */
|
||||
#define INSN_MEM_A_0 (INSN_MEM_9_1+1)
|
||||
/*! Load/Store instruction end position of the x_stride field */
|
||||
#define INSN_MEM_A_1 (INSN_MEM_A_0+MEMOP_STRIDE_BIT_WIDTH-1)
|
||||
/*! Load/Store instruction start position of the y_pad_0 field */
|
||||
#define INSN_MEM_B_0 (INSN_MEM_A_1+1)
|
||||
/*! Load/Store instruction end position of the y_pad_0 field */
|
||||
#define INSN_MEM_B_1 (INSN_MEM_B_0+MEMOP_PAD_BIT_WIDTH-1)
|
||||
/*! Load/Store instruction start position of the y_pad_1 field */
|
||||
#define INSN_MEM_C_0 (INSN_MEM_B_1+1)
|
||||
/*! Load/Store instruction end position of the y_pad_1 field */
|
||||
#define INSN_MEM_C_1 (INSN_MEM_C_0+MEMOP_PAD_BIT_WIDTH-1)
|
||||
/*! Load/Store instruction start position of the x_pad_0 field */
|
||||
#define INSN_MEM_D_0 (INSN_MEM_C_1+1)
|
||||
/*! Load/Store instruction end position of the x_pad_0 field */
|
||||
#define INSN_MEM_D_1 (INSN_MEM_D_0+MEMOP_PAD_BIT_WIDTH-1)
|
||||
/*! Load/Store instruction start position of the x_pad_1 field */
|
||||
#define INSN_MEM_E_0 (INSN_MEM_D_1+1)
|
||||
/*! Load/Store instruction end position of the x_pad_1 field */
|
||||
#define INSN_MEM_E_1 (INSN_MEM_E_0+MEMOP_PAD_BIT_WIDTH-1)
|
||||
|
||||
/*! GEMM instruction start position of the opcode field */
|
||||
#define INSN_GEM_0_0 0
|
||||
/*! GEMM instruction end position of the opcode field */
|
||||
#define INSN_GEM_0_1 (INSN_GEM_0_0+OPCODE_BIT_WIDTH-1)
|
||||
/*! GEMM instruction position of the pop_prev_dep field */
|
||||
#define INSN_GEM_1 (INSN_GEM_0_1+1)
|
||||
/*! GEMM instruction position of the pop_next_dep field */
|
||||
#define INSN_GEM_2 (INSN_GEM_1+1)
|
||||
/*! GEMM instruction position of the push_prev_dependence field */
|
||||
#define INSN_GEM_3 (INSN_GEM_2+1)
|
||||
/*! GEMM instruction position of the push_next_dependence field */
|
||||
#define INSN_GEM_4 (INSN_GEM_3+1)
|
||||
/*! GEMM instruction start position of the uop_bgn field */
|
||||
#define INSN_GEM_5_0 (INSN_GEM_4+1)
|
||||
/*! GEMM instruction end position of the uop_bgn field */
|
||||
#define INSN_GEM_5_1 (INSN_GEM_5_0+LOG_UOP_BUFF_DEPTH-1)
|
||||
/*! GEMM instruction start position of the uop_end field */
|
||||
#define INSN_GEM_6_0 (INSN_GEM_5_1+1)
|
||||
/*! GEMM instruction end position of the uop_end field */
|
||||
#define INSN_GEM_6_1 (INSN_GEM_6_0+LOG_UOP_BUFF_DEPTH+1-1)
|
||||
/*! GEMM instruction start position of the iter_out field */
|
||||
#define INSN_GEM_7_0 (INSN_GEM_6_1+1)
|
||||
/*! GEMM instruction end position of the iter_out field */
|
||||
#define INSN_GEM_7_1 (INSN_GEM_7_0+LOOP_ITER_WIDTH-1)
|
||||
/*! GEMM instruction start position of the iter_in field */
|
||||
#define INSN_GEM_8_0 (INSN_GEM_7_1+1)
|
||||
/*! GEMM instruction end position of the iter_in field */
|
||||
#define INSN_GEM_8_1 (INSN_GEM_8_0+LOOP_ITER_WIDTH-1)
|
||||
/*! GEMM instruction start position of the dst_factor_out field */
|
||||
#define INSN_GEM_9_0 64
|
||||
/*! GEMM instruction end position of the dst_factor_out field */
|
||||
#define INSN_GEM_9_1 (INSN_GEM_9_0+LOG_ACC_BUFF_DEPTH-1)
|
||||
/*! GEMM instruction start position of the dst_factor_in field */
|
||||
#define INSN_GEM_A_0 (INSN_GEM_9_1+1)
|
||||
/*! GEMM instruction end position of the dst_factor_in field */
|
||||
#define INSN_GEM_A_1 (INSN_GEM_A_0+LOG_ACC_BUFF_DEPTH-1)
|
||||
/*! GEMM instruction start position of the src_factor_out field */
|
||||
#define INSN_GEM_B_0 (INSN_GEM_A_1+1)
|
||||
/*! GEMM instruction end position of the src_factor_out field */
|
||||
#define INSN_GEM_B_1 (INSN_GEM_B_0+LOG_ACC_BUFF_DEPTH-1)
|
||||
/*! GEMM instruction start position of the src_factor_in field */
|
||||
#define INSN_GEM_C_0 (INSN_GEM_B_1+1)
|
||||
/*! GEMM instruction end position of the src_factor_in field */
|
||||
#define INSN_GEM_C_1 (INSN_GEM_C_0+LOG_ACC_BUFF_DEPTH-1)
|
||||
|
||||
/*! GEMM instruction start position of the wgt_factor_out field */
|
||||
#define INSN_GEM_D_0 (INSN_GEM_C_1+1)
|
||||
/*! GEMM instruction end position of the wgt_factor_out field */
|
||||
#define INSN_GEM_D_1 (INSN_GEM_D_0+LOG_WGT_BUFF_DEPTH-1)
|
||||
/*! GEMM instruction start position of the wgt_factor_in field */
|
||||
#define INSN_GEM_E_0 (INSN_GEM_D_1+1)
|
||||
/*! GEMM instruction end position of the wgt_factor_in field */
|
||||
#define INSN_GEM_E_1 (INSN_GEM_E_0+LOG_WGT_BUFF_DEPTH-1)
|
||||
|
||||
/*! ALU instruction start position of the alu_opcode field */
|
||||
#define INSN_ALU_D_0 (INSN_GEM_C_1+1)
|
||||
/*! ALU instruction end position of the alu_opcode field */
|
||||
#define INSN_ALU_D_1 (INSN_ALU_D_0+ALU_OPCODE_BIT_WIDTH-1)
|
||||
/*! ALU instruction position of the use_imm field */
|
||||
#define INSN_ALU_E (INSN_ALU_D_1+1)
|
||||
/*! ALU instruction start position of the immediate field */
|
||||
#define INSN_ALU_F_0 (INSN_ALU_E+1)
|
||||
/*! ALU instruction end position of the immediate field */
|
||||
#define INSN_ALU_F_1 (INSN_ALU_F_0+ALUOP_IMM_BIT_WIDTH-1)
|
||||
|
||||
/*! GEMM Micro-op position of the reset_out field */
|
||||
#define UOP_GEM_0 0
|
||||
/*! GEMM Micro-op start position of the acc_idx field */
|
||||
#define UOP_GEM_1_0 (UOP_GEM_0+1)
|
||||
/*! GEMM Micro-op end position of the acc_idx field */
|
||||
#define UOP_GEM_1_1 (UOP_GEM_1_0+LOG_ACC_BUFF_DEPTH-1)
|
||||
/*! GEMM Micro-op start position of the inp_idx field */
|
||||
#define UOP_GEM_2_0 (UOP_GEM_1_1+1)
|
||||
/*! GEMM Micro-op end position of the inp_idx field */
|
||||
#define UOP_GEM_2_1 (UOP_GEM_2_0+LOG_ACC_BUFF_DEPTH-1)
|
||||
/*! GEMM Micro-op start position of the wgt_idx field */
|
||||
#define UOP_GEM_3_0 (UOP_GEM_2_1+1)
|
||||
/*! GEMM Micro-op end position of the wgt_idx field */
|
||||
#define UOP_GEM_3_1 (UOP_GEM_3_0+LOG_WGT_BUFF_DEPTH-1)
|
||||
|
||||
/*! ALU Micro-op position of the reset_out field */
|
||||
#define UOP_ALU_0 0
|
||||
/*! ALU Micro-op start position of the dst_idx field */
|
||||
#define UOP_ALU_1_0 (UOP_ALU_0+1)
|
||||
/*! ALU Micro-op end position of the dst_idx field */
|
||||
#define UOP_ALU_1_1 (UOP_ALU_1_0+LOG_ACC_BUFF_DEPTH-1)
|
||||
/*! ALU Micro-op start position of the src_idx field */
|
||||
#define UOP_ALU_2_0 (UOP_ALU_1_1+1)
|
||||
/*! ALU Micro-op end position of the src_idx field */
|
||||
#define UOP_ALU_2_1 (UOP_ALU_2_0+LOG_ACC_BUFF_DEPTH-1)
|
||||
/*! ALU Micro-op start position of the wgt_idx field */
|
||||
#define UOP_ALU_3_0 (UOP_ALU_2_1+1)
|
||||
/*! ALU Micro-op end position of the wgt_idx field */
|
||||
#define UOP_ALU_3_1 (UOP_ALU_3_0+LOG_WGT_BUFF_DEPTH-1)
|
||||
|
||||
/*! \brief VTA generic instruction */
|
||||
typedef struct {
|
||||
uint64_t word_0 : 64;
|
||||
uint64_t word_1 : 64;
|
||||
} VTAGenericInsn;
|
||||
|
||||
/*! \brief VTA load/store instruction
|
||||
* Load/store instruction can describe a 2D strided access pattern
|
||||
* with padding, which can be useful to perform spatial padding
|
||||
* on the fly on a tensor on which to perform 2D convolution.
|
||||
* For instance if we try to load a 4x4 spatial tile from a 16x16
|
||||
* matrix with padding of size 1 on all dimensions:
|
||||
* y_size = 4, x_size = 4, x_stride = 16, y_pad_0 = 1, y_pad_1 = 1,
|
||||
* x_pad_0 = 1, x_pad_1 = 1.
|
||||
*/
|
||||
typedef struct {
|
||||
/*! \brief The instruction opcode */
|
||||
uint64_t opcode : OPCODE_BIT_WIDTH;
|
||||
/*! \brief Unused in this instruction */
|
||||
uint64_t pop_prev_dep : 1;
|
||||
/*! \brief Pop dependence token from GEMM stage */
|
||||
uint64_t pop_next_dep : 1;
|
||||
/*! \brief Unused in this instruction */
|
||||
uint64_t push_prev_dep : 1;
|
||||
/*! \brief Push dependence token to GEMM stage */
|
||||
uint64_t push_next_dep : 1;
|
||||
/*! \brief Source/destination SRAM for store/load instruction */
|
||||
uint64_t memory_type : MEMOP_ID_BIT_WIDTH;
|
||||
/*! \brief SRAM base address (pointer to memory elem type) */
|
||||
uint64_t sram_base : MEMOP_SRAM_ADDR_BIT_WIDTH;
|
||||
/*! \brief DRAM base address (pointer to memory elem type) */
|
||||
uint64_t dram_base : MEMOP_DRAM_ADDR_BIT_WIDTH;
|
||||
/*! \brief 2D access pattern: y-size */
|
||||
uint64_t y_size : MEMOP_SIZE_BIT_WIDTH;
|
||||
/*! \brief 2D access pattern: x-size (in terms of memory elements) */
|
||||
uint64_t x_size : MEMOP_SIZE_BIT_WIDTH;
|
||||
/*! \brief 2D access pattern: x-stride (in terms of memory elements) */
|
||||
uint64_t x_stride : MEMOP_STRIDE_BIT_WIDTH;
|
||||
/*! \brief 2D access pattern: start padding along y dimension */
|
||||
uint64_t y_pad_0 : MEMOP_PAD_BIT_WIDTH;
|
||||
/*! \brief 2D access pattern: end padding along y dimension */
|
||||
uint64_t y_pad_1 : MEMOP_PAD_BIT_WIDTH;
|
||||
/*! \brief 2D access pattern: start padding along x dimension */
|
||||
uint64_t x_pad_0 : MEMOP_PAD_BIT_WIDTH;
|
||||
/*! \brief 2D access pattern: end padding along x dimension */
|
||||
uint64_t x_pad_1 : MEMOP_PAD_BIT_WIDTH;
|
||||
} VTAMemInsn;
|
||||
|
||||
/*! \brief VTA GEMM instruction
|
||||
* GEMM instruction is implemented by executing a sequence of micro-operations
|
||||
* that is read in the local micro-op memory, delimited by \a uop_bgn and
|
||||
* \a uop_end. For improved storage-efficiency, the micro-operations can be
|
||||
* executed in a 2-level nested loop as follows:
|
||||
* \code{.cpp}
|
||||
* for (i = 0; i < iter_out; i++) {
|
||||
* for (j = 0; j < iter_in; j++) {
|
||||
* for (k = uop_bgn; k < uop_end; k++) {
|
||||
* // Read micro op
|
||||
* uop_T uop = uop_mem[k];
|
||||
* // Read in memory indices
|
||||
* acc_idx_T acc_idx = uop.dst_idx;
|
||||
* inp_idx_T inp_idx = uop.src_idx;
|
||||
* wgt_idx_T wgt_idx = uop.wgt_idx;
|
||||
* // Update those indices with the following affine functions
|
||||
* acc_idx += j * dst_factor_in + i * dst_factor_out;
|
||||
* inp_idx += j * src_factor_in + i * src_factor_out;
|
||||
* wgt_idx += j * wgt_factor_in + i * wgt_factor_out;
|
||||
* // Perform GEMM operation
|
||||
* acc_mem[acc_idx] += dot(inp_mem[inp_idx], wgt_mem[wgt_idx]);
|
||||
* }
|
||||
* }
|
||||
* }
|
||||
* \endcode
|
||||
*
|
||||
*/
|
||||
typedef struct {
|
||||
/*! \brief The instruction opcode */
|
||||
uint64_t opcode : OPCODE_BIT_WIDTH;
|
||||
/*! \brief Pop dependence token from load stage */
|
||||
uint64_t pop_prev_dep : 1;
|
||||
/*! \brief Pop dependence token from store stage */
|
||||
uint64_t pop_next_dep : 1;
|
||||
/*! \brief Push dependence token to load stage */
|
||||
uint64_t push_prev_dep : 1;
|
||||
/*! \brief Push dependence token to store stage */
|
||||
uint64_t push_next_dep : 1;
|
||||
/*! \brief Micro-op begin address */
|
||||
uint64_t uop_bgn : LOG_UOP_BUFF_DEPTH;
|
||||
/*! \brief Micro-op end address */
|
||||
uint64_t uop_end : LOG_UOP_BUFF_DEPTH+1;
|
||||
/*! \brief Iterations in the outer uop execution loop */
|
||||
uint64_t iter_out : LOOP_ITER_WIDTH;
|
||||
/*! \brief Iterations in the inner uop execution loop */
|
||||
uint64_t iter_in : LOOP_ITER_WIDTH;
|
||||
/*! \brief Outer loop accumulator memory index factor */
|
||||
uint64_t dst_factor_out : LOG_ACC_BUFF_DEPTH;
|
||||
/*! \brief Inner loop accumulator memory index factor */
|
||||
uint64_t dst_factor_in : LOG_ACC_BUFF_DEPTH;
|
||||
/*! \brief Outer loop input memory index factor */
|
||||
uint64_t src_factor_out : LOG_ACC_BUFF_DEPTH;
|
||||
/*! \brief Inner loop input memory index factor */
|
||||
uint64_t src_factor_in : LOG_ACC_BUFF_DEPTH;
|
||||
/*! \brief Outer loop weight memory index factor */
|
||||
uint64_t wgt_factor_out : LOG_WGT_BUFF_DEPTH;
|
||||
/*! \brief Inner loop weight memory index factor */
|
||||
uint64_t wgt_factor_in : LOG_WGT_BUFF_DEPTH;
|
||||
} VTAGemInsn;
|
||||
|
||||
/*! \brief VTA ALU instruction
|
||||
* ALU instruction is implemented by executing a sequence of micro-operations
|
||||
* that is read in the local micro-op memory, delimited by \a uop_bgn and
|
||||
* \a uop_end. For improved storage-efficiency, the micro-operations can be
|
||||
* executed in a 2-level nested loop as follows:
|
||||
* \code{.cpp}
|
||||
* for (i = 0; i < iter_out; i++) {
|
||||
* for (j = 0; j < iter_in; j++) {
|
||||
* for (k = uop_bgn; k < uop_end; k++) {
|
||||
* // Read micro op
|
||||
* uop_T uop = uop_mem[k];
|
||||
* // Read in memory indices
|
||||
* acc_idx_T dst_idx = uop.dst_idx;
|
||||
* inp_idx_T src_idx = uop.src_idx;
|
||||
* // Update those indices with the following affine functions
|
||||
* dst_idx += j * dst_factor_in + i * dst_factor_out;
|
||||
* src_idx += j * src_factor_in + i * src_factor_out;
|
||||
* // Perform ALU operation
|
||||
* if (use_imm) {
|
||||
* acc_mem[dst_idx] = alu_op(alu_opcode, acc_mem[dst_idx], imm);
|
||||
* } else {
|
||||
* acc_mem[dst_idx] = alu_op(alu_opcode, acc_mem[dst_idx], acc_mem[src_idx]);
|
||||
* }
|
||||
* }
|
||||
* }
|
||||
* }
|
||||
* \endcode
|
||||
*
|
||||
*/
|
||||
typedef struct {
|
||||
/*! \brief The instruction opcode */
|
||||
uint64_t opcode : OPCODE_BIT_WIDTH;
|
||||
/*! \brief Pop dependence token from load stage */
|
||||
uint64_t pop_prev_dep : 1;
|
||||
/*! \brief Pop dependence token from store stage */
|
||||
uint64_t pop_next_dep : 1;
|
||||
/*! \brief Push dependence token to load stage */
|
||||
uint64_t push_prev_dep : 1;
|
||||
/*! \brief Push dependence token to store stage */
|
||||
uint64_t push_next_dep : 1;
|
||||
/*! \brief Micro-op begin address */
|
||||
uint64_t uop_bgn : LOG_UOP_BUFF_DEPTH;
|
||||
/*! \brief Micro-op end address */
|
||||
uint64_t uop_end : LOG_UOP_BUFF_DEPTH+1;
|
||||
/*! \brief Iterations in the outer uop execution loop */
|
||||
uint64_t iter_out : LOOP_ITER_WIDTH;
|
||||
/*! \brief Iterations in the inner uop execution loop */
|
||||
uint64_t iter_in : LOOP_ITER_WIDTH;
|
||||
/*! \brief Outer loop accumulator memory destination index factor */
|
||||
uint64_t dst_factor_out : LOG_ACC_BUFF_DEPTH;
|
||||
/*! \brief Inner loop accumulator memory destination index factor */
|
||||
uint64_t dst_factor_in : LOG_ACC_BUFF_DEPTH;
|
||||
/*! \brief Outer loop accumulator memory source index factor */
|
||||
uint64_t src_factor_out : LOG_ACC_BUFF_DEPTH;
|
||||
/*! \brief Inner loop accumulator memory source index factor */
|
||||
uint64_t src_factor_in : LOG_ACC_BUFF_DEPTH;
|
||||
/*! \brief ALU opcode */
|
||||
uint64_t alu_opcode : ALU_OPCODE_BIT_WIDTH;
|
||||
/*! \brief Use immediate is true */
|
||||
uint64_t use_imm : 1;
|
||||
/*! \brief Immediate value */
|
||||
uint64_t imm : ALUOP_IMM_BIT_WIDTH;
|
||||
} VTAAluInsn;
|
||||
|
||||
/*! \brief VTA instruction converter */
|
||||
union VTAInsn {
|
||||
/*! \brief VTA generic instruction */
|
||||
VTAGenericInsn generic;
|
||||
/*! \brief VTA load/store instruction */
|
||||
VTAMemInsn mem;
|
||||
/*! \brief VTA GEMM instruction */
|
||||
VTAGemInsn gemm;
|
||||
/*! \brief VTA ALU instruction */
|
||||
VTAAluInsn alu;
|
||||
};
|
||||
|
||||
/*! \brief VTA micro-op for GEMM/ALU instruction */
|
||||
typedef struct {
|
||||
/*! \brief Initialize acc_mem at index dst_idx to 0*/
|
||||
uint32_t reset_out : 1;
|
||||
/*! \brief Destination index (indexes accum buffer) */
|
||||
uint32_t dst_idx : LOG_ACC_BUFF_DEPTH;
|
||||
/*! \brief Source index (indexes input buffer for GEMM or accum buffer for ALU) */
|
||||
uint32_t src_idx : LOG_ACC_BUFF_DEPTH;
|
||||
/*! \brief Weight index (indexes weight buffer) */
|
||||
uint32_t wgt_idx : LOG_WGT_BUFF_DEPTH;
|
||||
} VTAUop;
|
||||
|
||||
#endif // VTA_DEFINES_H_
|
|
@ -0,0 +1,152 @@
|
|||
/*!
|
||||
* Copyright (c) 2018 by Contributors
|
||||
* \file vta_pynq_driver.h
|
||||
* \brief VTA driver for Pynq board.
|
||||
*/
|
||||
|
||||
#ifndef VTA_PYNQ_DRIVER_H_
|
||||
#define VTA_PYNQ_DRIVER_H_
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
#include <assert.h>
|
||||
#include <fcntl.h>
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <sys/mman.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#ifdef __arm__
|
||||
#include "libxlnk_cma.h"
|
||||
#else
|
||||
void* cma_alloc(size_t size, int cached);
|
||||
void cma_free(void* buf);
|
||||
uint32_t cma_get_phy_addr(void* buf);
|
||||
void xlnkFlushCache(void* buf, int size);
|
||||
void xlnkInvalidateCache(void* buf, int size);
|
||||
#endif
|
||||
|
||||
/*! \brief VTA command handle */
|
||||
typedef void * VTAHandle;
|
||||
|
||||
/*! \brief DMA command handle.
 *
 *  Groups the register maps, descriptor addresses and per-channel transaction
 *  counters of one DMA engine.
 *  NOTE(review): no function declared in this header takes or returns a DMAHandle;
 *  confirm whether it is still used elsewhere.
 */
|
||||
typedef struct {
|
||||
/*! \brief Register map to the AXI DMA control registers */
|
||||
void *dma_register_map;
|
||||
/*! \brief Transmit data descriptor */
|
||||
void *mm2s_descriptor_register_map;
|
||||
/*! \brief Receive data descriptor */
|
||||
void *s2mm_descriptor_register_map;
|
||||
/*! \brief Transmit data descriptor physical address */
|
||||
uint32_t mm2s_descriptor_phy;
|
||||
/*! \brief Receive data descriptor physical address */
|
||||
uint32_t s2mm_descriptor_phy;
|
||||
/*! \brief Descriptor size */
|
||||
uint32_t descriptor_size;
|
||||
/*! \brief Transaction count for tx channel */
|
||||
uint32_t mm2s_count;
|
||||
/*! \brief Transaction count for rx channel */
|
||||
uint32_t s2mm_count;
|
||||
/*! \brief Multi-channel mode enable */
|
||||
int multichannel_en;
|
||||
} DMAHandle;
|
||||
|
||||
/*! \brief partial bitstream status file path */
|
||||
#define BS_IS_PARTIAL "/sys/devices/soc0/amba/f8007000.devcfg/is_partial_bitstream"
|
||||
/*! \brief bitstream destination file path */
|
||||
#define BS_XDEVCFG "/dev/xdevcfg"
|
||||
|
||||
/*! \brief Path to /dev/mem */
|
||||
#define DEV_MEM_PATH "/dev/mem"
|
||||
/*! \brief MMIO driver constant */
|
||||
#define MMIO_WORD_LENGTH 4
|
||||
/*! \brief MMIO driver constant */
|
||||
#define MMIO_WORD_MASK (~(MMIO_WORD_LENGTH - 1))
|
||||
|
||||
/*! \brief VTA configuration register address range */
|
||||
#define VTA_RANGE 0x100
|
||||
/*! \brief VTA configuration register start value */
|
||||
#define VTA_START 0x1
|
||||
/*! \brief VTA configuration register auto-restart value */
|
||||
#define VTA_AUTORESTART 0x81
|
||||
/*! \brief VTA configuration register done value */
|
||||
#define VTA_DONE 0x1
|
||||
|
||||
/*! \brief VTA fetch stage configuration register address
|
||||
* from auto-generated XPAR_FETCH_0_S_AXI_CONTROL_BUS_BASEADDR define
|
||||
* in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
|
||||
*/
|
||||
#define VTA_FETCH_ADDR 0x43C00000
|
||||
/*! \brief VTA compute stage configuration register address
|
||||
* from auto-generated XPAR_COMPUTE_0_S_AXI_CONTROL_BUS_BASEADDR define
|
||||
* in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
|
||||
*/
|
||||
#define VTA_COMPUTE_ADDR 0x43C10000
|
||||
/*! \brief VTA load stage configuration register address
|
||||
* from auto-generated XPAR_LOAD_0_S_AXI_CONTROL_BUS_BASEADDR define
|
||||
* in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
|
||||
*/
|
||||
#define VTA_LOAD_ADDR 0x43C20000
|
||||
/*! \brief VTA store stage configuration register address
|
||||
* from auto-generated XPAR_STORE_0_S_AXI_CONTROL_BUS_BASEADDR define
|
||||
* in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
|
||||
*/
|
||||
#define VTA_STORE_ADDR 0x43C30000
|
||||
|
||||
/*! \brief Memory management constants with libxlnk_cma */
|
||||
#define CACHED 1
|
||||
/*! \brief Memory management constants with libxlnk_cma */
|
||||
#define NOT_CACHED 0
|
||||
|
||||
/*! \brief log2 of SDS buffer size limit */
|
||||
#define LOG_MAX_XFER 22
|
||||
/*! \brief SDS buffer size limit */
|
||||
#define MAX_XFER (1<<LOG_MAX_XFER)
|
||||
|
||||
/*!
|
||||
* \brief Returns a memory map to FPGA configuration registers.
|
||||
* \param addr The base physical address of the configuration registers.
|
||||
* \param length The size of the memory mapped region in bytes.
|
||||
* \return A pointer to the memory mapped region.
|
||||
*/
|
||||
void *MapRegister(unsigned addr, size_t length);
|
||||
|
||||
/*!
|
||||
* \brief Deletes the configuration register memory map.
|
||||
* \param vta The memory mapped region.
|
||||
* \param length The size of the memory mapped region in bytes.
|
||||
*/
|
||||
void UnmapRegister(void *vta, size_t length);
|
||||
|
||||
/*!
|
||||
* \brief Writes to a memory mapped configuration register.
|
||||
* \param vta_base The handle to the memory mapped configuration registers.
|
||||
* \param offset The offset of the register to write to.
|
||||
* \param val The value to be written to the memory mapped register.
|
||||
*/
|
||||
void WriteMappedReg(VTAHandle vta_base, unsigned offset, unsigned val);
|
||||
|
||||
/*!
|
||||
* \brief Reads from the memory mapped configuration register.
|
||||
* \param vta_base The handle to the memory mapped configuration registers.
|
||||
* \param offset The offset of the register to read from.
|
||||
* \return The value read from the memory mapped register.
|
||||
*/
|
||||
unsigned ReadMappedReg(VTAHandle vta_base, unsigned offset);
|
||||
|
||||
/*!
|
||||
* \brief Programming the bit stream on the FPGA.
|
||||
* \param bitstream The path to the bit stream file.
|
||||
*/
|
||||
void ProgramVTA(const char* bitstream);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
#endif // VTA_PYNQ_DRIVER_H_
|
|
@ -0,0 +1,300 @@
|
|||
/*!
|
||||
* Copyright (c) 2018 by Contributors
|
||||
* \file vta_test_lib.h
|
||||
* \brief Test library for the VTA design simulation and driver tests.
|
||||
*/
|
||||
|
||||
#ifndef VTA_TESTLIB_H_
|
||||
#define VTA_TESTLIB_H_
|
||||
|
||||
#include "vta_params.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#ifdef NO_SIM
|
||||
|
||||
#include "vta_pynq_driver.h"
|
||||
|
||||
typedef uint64_t axi_T;
|
||||
typedef uint32_t uop_T;
|
||||
typedef int8_t wgt_T;
|
||||
typedef int8_t inp_T;
|
||||
typedef int32_t acc_T;
|
||||
|
||||
uint64_t vta (
|
||||
uint32_t insn_count,
|
||||
VTAGenericInsn *insns,
|
||||
VTAUop *uops,
|
||||
inp_T *inputs,
|
||||
wgt_T *weights,
|
||||
acc_T *biases,
|
||||
inp_T *outputs);
|
||||
|
||||
#else //NO_SIM
|
||||
|
||||
#include "vta.h"
|
||||
#include "vta_typedefs.h"
|
||||
|
||||
#endif //NO_SIM
|
||||
|
||||
/*!
|
||||
* \brief Returns opcode string.
|
||||
* \param opcode Opcode parameter (defined in vta_defines.h).
|
||||
* \param use_imm Boolean that indicates if the operation uses an immediate value.
|
||||
* \return The opcode string.
|
||||
*/
|
||||
const char* getOpcodeString(int opcode, bool use_imm);
|
||||
|
||||
/*!
|
||||
* \brief Performs buffer data packing and tiling.
|
||||
* \param dst Pointer to the packed, and tiled destination 1D array (flattened).
|
||||
* \param src Pointer to the unpacked source 2D array.
|
||||
* \param y_size Number of rows.
|
||||
* \param x_size Number of columns.
|
||||
* \param y_block Inner tiling along row dimension.
|
||||
* \param x_block Inner tiling along column dimension.
|
||||
*/
|
||||
template <typename T, int T_WIDTH>
|
||||
void packBuffer(T *dst, T **src, int y_size, int x_size, int y_block, int x_block);
|
||||
|
||||
/*!
|
||||
* \brief Performs buffer data unpacking.
|
||||
* \param dst Pointer to the unpacked destination 2D array.
|
||||
* \param src Pointer to the packed, and tiled source 1D array (flattened).
|
||||
* \param y_size Number of rows.
|
||||
* \param x_size Number of columns.
|
||||
* \param y_block Inner tiling along row dimension.
|
||||
* \param x_block Inner tiling along column dimension.
|
||||
*/
|
||||
template <typename T, int T_WIDTH>
|
||||
void unpackBuffer(T **dst, T *src, int y_size, int x_size, int y_block, int x_block);
|
||||
|
||||
/*!
|
||||
* \brief Allocates and initializes a 2D array in the heap.
|
||||
* \param rows Number of rows.
|
||||
* \param cols Number of columns.
|
||||
* \return Pointer to the 2D array.
|
||||
*/
|
||||
template <typename T, int T_WIDTH>
|
||||
T ** allocInit2dArray(int rows, int cols);
|
||||
|
||||
/*!
|
||||
* \brief Allocates a 2D array in the heap.
|
||||
* \param rows Number of rows.
|
||||
* \param cols Number of columns.
|
||||
* \return Pointer to the 2D array.
|
||||
*/
|
||||
template <typename T>
|
||||
T ** alloc2dArray(int rows, int cols);
|
||||
|
||||
/*!
|
||||
* \brief Frees a 2D array.
|
||||
* \param array Pointer to the 2D array to be freed.
|
||||
* \param rows Number of rows.
|
||||
* \param cols Number of columns.
|
||||
*/
|
||||
template <typename T>
|
||||
void free2dArray(T **array, int rows, int cols);
|
||||
|
||||
/*!
|
||||
* \brief Allocates a 3D array in the heap.
|
||||
* \param rows Number of rows (dim 0).
|
||||
* \param cols Number of columns (dim 1).
|
||||
* \param depth Depth of the array (dim 2).
|
||||
* \return Pointer to the 3D array.
|
||||
*/
|
||||
template <typename T>
|
||||
T *** alloc3dArray(int rows, int cols, int depth);
|
||||
|
||||
/*!
|
||||
* \brief Frees a 3D array.
|
||||
* \param array Pointer to the 3D array.
|
||||
* \param rows Number of rows (dim 0).
|
||||
* \param cols Number of columns (dim 1).
|
||||
* \param depth Depth of the array (dim 2).
|
||||
*/
|
||||
template <typename T>
|
||||
void free3dArray(T *** array, int rows, int cols, int depth);
|
||||
|
||||
/*!
|
||||
* \brief Performs memory allocation in a physically contiguous region of memory.
|
||||
* \param num_bytes Size of the buffer in bytes.
|
||||
* \return Pointer to the allocated buffer.
|
||||
*/
|
||||
void * allocBuffer(size_t num_bytes);
|
||||
|
||||
/*!
|
||||
* \brief Frees buffer allocated in a physically contiguous region of memory.
|
||||
* \param buffer Pointer to the buffer to free.
|
||||
*/
|
||||
void freeBuffer(void * buffer);
|
||||
|
||||
/*!
|
||||
* \brief Returns a VTA reset instruction on a 2D patch of the register file.
|
||||
* \param type On-chip memory target.
|
||||
* \param sram_offset Offset in SRAM.
|
||||
* \param y_size Number of rows to reset (y axis).
|
||||
* \param x_size Number of elements per row to reset (x axis).
|
||||
* \param x_stride Stride along the x axis.
|
||||
* \param pop_prev_dep Pop dependence from previous stage.
|
||||
* \param pop_next_dep Pop dependence from next stage.
|
||||
* \param push_prev_dep Push dependence to previous stage.
|
||||
* \param push_next_dep Push dependence to next stage.
|
||||
* \return A VTAGenericInsn for a reset op.
|
||||
*/
|
||||
VTAGenericInsn reset2DInsn(int type, int sram_offset, int y_size, int x_size, int x_stride,
|
||||
int pop_prev_dep, int pop_next_dep, int push_prev_dep, int push_next_dep);
|
||||
|
||||
/*!
|
||||
* \brief Returns a VTA 2D load or store instruction.
|
||||
* \param opcode Type of operation.
|
||||
* \param type On-chip memory target.
|
||||
* \param sram_offset Offset in SRAM.
|
||||
* \param dram_offset Offset in DRAM.
|
||||
* \param y_size Number of rows to load/store (y axis).
|
||||
* \param x_size Number of elements per row to load/store (x axis).
|
||||
* \param x_stride Stride along the x axis.
|
||||
* \param y_pad Padding along the y axis.
|
||||
* \param x_pad Padding along the x axis.
|
||||
* \param pop_prev_dep Pop dependence from previous stage.
|
||||
* \param pop_next_dep Pop dependence from next stage.
|
||||
* \param push_prev_dep Push dependence to previous stage.
|
||||
* \param push_next_dep Push dependence to next stage.
|
||||
* \return A VTAGenericInsn for a 2D load or store op.
|
||||
*/
|
||||
VTAGenericInsn get2DLoadStoreInsn(int opcode, int type, int sram_offset, int dram_offset,
|
||||
int y_size, int x_size, int x_stride, int y_pad, int x_pad, int pop_prev_dep, int pop_next_dep,
|
||||
int push_prev_dep, int push_next_dep);
|
||||
|
||||
/*!
|
||||
* \brief Returns a VTA 1D load or store instruction.
|
||||
* \param opcode Type of operation.
|
||||
* \param type On-chip memory target.
|
||||
* \param sram_offset Offset in SRAM.
|
||||
* \param dram_offset Offset in DRAM.
|
||||
* \param size Number of elements to load/store.
|
||||
* \param pop_prev_dep Pop dependence from previous stage.
|
||||
* \param pop_next_dep Pop dependence from next stage.
|
||||
* \param push_prev_dep Push dependence to previous stage.
|
||||
* \param push_next_dep Push dependence to next stage.
|
||||
* \return A VTAGenericInsn for a 1D load or store op.
|
||||
*/
|
||||
VTAGenericInsn get1DLoadStoreInsn(int opcode, int type, int sram_offset, int dram_offset, int size,
|
||||
int pop_prev_dep, int pop_next_dep, int push_prev_dep, int push_next_dep);
|
||||
|
||||
/*!
|
||||
* \brief Returns a VTA matrix multiplication instruction of size (a, b) x (b, c).
|
||||
* \param uop_offset Offset of the micro-op in SRAM.
|
||||
* \param batch Batch size (a).
|
||||
* \param in_feat Input features (b).
|
||||
* \param out_feat Output features (c).
|
||||
* \param uop_compression Apply micro-op compression.
|
||||
* \param pop_prev_dep Pop dependence from previous stage.
|
||||
* \param pop_next_dep Pop dependence from next stage.
|
||||
* \param push_prev_dep Push dependence to previous stage.
|
||||
* \param push_next_dep Push dependence to next stage.
|
||||
* \return A VTAGenericInsn for a GEMM op.
|
||||
*/
|
||||
VTAGenericInsn getGEMMInsn(int uop_offset, int batch, int in_feat, int out_feat,
|
||||
bool uop_compression, int pop_prev_dep, int pop_next_dep, int push_prev_dep,
|
||||
int push_next_dep);
|
||||
|
||||
/*!
|
||||
* \brief Returns a VTA ALU instruction for map type operation.
|
||||
* \param opcode Opcode of the ALU instruction.
|
||||
* \param use_imm Use immediate.
|
||||
* \param imm Immediate value (int16).
|
||||
* \param vector_size Vector size of the ALU operation.
|
||||
* \param uop_compression Apply micro-op compression.
|
||||
* \param pop_prev_dep Pop dependence from previous stage.
|
||||
* \param pop_next_dep Pop dependence from next stage.
|
||||
* \param push_prev_dep Push dependence to previous stage.
|
||||
* \param push_next_dep Push dependence to next stage.
|
||||
* \return A VTAGenericInsn for an ALU op.
|
||||
*/
|
||||
VTAGenericInsn getALUInsn(int opcode, bool use_imm, int imm, int vector_size, bool uop_compression,
|
||||
int pop_prev_dep, int pop_next_dep, int push_prev_dep, int push_next_dep);
|
||||
|
||||
/*!
|
||||
* \brief Returns a VTA finish instruction.
|
||||
* \param pop_prev Pop dependence from previous stage.
|
||||
* \param pop_next Pop dependence from next stage.
|
||||
* \return A VTAGenericInsn for a finish op.
|
||||
*/
|
||||
VTAGenericInsn getFinishInsn(bool pop_prev, bool pop_next);
|
||||
|
||||
/*!
|
||||
* \brief Returns an allocated buffer of VTA micro-ops to implement a copy operation.
|
||||
* \param y_size Number of rows to load/store (y axis).
|
||||
* \param x_size Number of elements per row to load/store (x axis).
|
||||
* \param uop_compression Apply micro-op compression.
|
||||
* \return A VTAUop pointer to an allocated micro-op buffer.
|
||||
*/
|
||||
VTAUop * getCopyUops(int y_size, int x_size, int uop_compression);
|
||||
|
||||
/*!
|
||||
* \brief Returns an allocated buffer of VTA micro-ops to implement a matrix multiplication
|
||||
* of size (a, b) x (b, c).
|
||||
* \param batch Batch size (a).
|
||||
* \param in_feat Input features (b).
|
||||
* \param out_feat Output features (c).
|
||||
* \param uop_compression Apply micro-op compression.
|
||||
* \param multi_threaded Generate micro-ops for two virtual execution threads.
|
||||
* \return A VTAUop pointer to an allocated micro-op buffer.
|
||||
*/
|
||||
VTAUop * getGEMMUops(int batch, int in_feat, int out_feat, bool uop_compression,
|
||||
bool multi_threaded);
|
||||
|
||||
/*!
|
||||
* \brief Returns an allocated buffer of VTA micro-ops to implement a vector-vector map operation.
|
||||
* \param vector_size Vector size.
|
||||
* \param uop_compression Apply micro-op compression.
|
||||
* \return A VTAUop pointer to an allocated micro-op buffer.
|
||||
*/
|
||||
VTAUop * getMapALUUops(int vector_size, bool uop_compression);
|
||||
|
||||
/*!
|
||||
* \brief Print out parameters of the VTA design (for debugging purposes).
|
||||
*/
|
||||
void printParameters();
|
||||
|
||||
/*!
|
||||
* \brief Print out instruction information (for debugging purposes).
|
||||
* \param num_insn Number of instructions.
|
||||
* \param insns Pointer to the instruction buffer.
|
||||
*/
|
||||
void printInstruction(int num_insn, VTAGenericInsn *insns);
|
||||
|
||||
/*!
|
||||
* \brief Print out micro-op information (for debugging purposes).
|
||||
* \param num_uop Number of micro-ops.
|
||||
* \param uops Pointer to the micro-op buffer.
|
||||
*/
|
||||
void printMicroOp(int num_uop, VTAUop *uops);
|
||||
|
||||
/*!
|
||||
* \brief VTA ALU unit test.
|
||||
* \param opcode The ALU opcode.
|
||||
* \param use_imm Use immediate.
|
||||
* \param batch Batch size.
|
||||
* \param vector_size Vector length of the ALU operation.
|
||||
* \param uop_compression Apply micro-op compression.
|
||||
* \return Number of errors from the test run.
|
||||
*/
|
||||
int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_compression);
|
||||
|
||||
/*!
|
||||
* \brief VTA blocked GEMM unit test.
|
||||
* \param batch Batch size.
|
||||
* \param channels Channel width.
|
||||
* \param block Blocking size.
|
||||
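* \param uop_compression Apply micro-op compression.
|
||||
* \param virtual_threads Number of virtual threads used by the test (presumably the number of virtual execution threads; confirm against the implementation).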
|
||||
* \return Number of errors from the test run.
|
||||
*/
|
||||
int blocked_gemm_test(int batch, int channels, int block, bool uop_compression,
|
||||
int virtual_threads);
|
||||
|
||||
#endif // VTA_TESTLIB_H_
|
|
@ -0,0 +1,77 @@
|
|||
/*!
|
||||
* Copyright (c) 2018 by Contributors
|
||||
* \file vta_pynq_driver.c
|
||||
* \brief VTA driver for Pynq board.
|
||||
*/
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
#include "vta_pynq_driver.h"
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
void *MapRegister(uint32_t addr, size_t length) {
|
||||
|
||||
// Align the base address with the pages
|
||||
uint32_t virt_base = addr & ~(getpagesize() - 1);
|
||||
// Calculate base address offset w.r.t the base address
|
||||
uint32_t virt_offset = addr - virt_base;
|
||||
// Open file and mmap
|
||||
uint32_t mmap_file = open(DEV_MEM_PATH, O_RDWR|O_SYNC);
|
||||
|
||||
return mmap(NULL, (length+virt_offset), PROT_READ|PROT_WRITE, MAP_SHARED, mmap_file, virt_base);
|
||||
}
|
||||
|
||||
/*!
 * \brief Releases a register mapping previously created with mmap().
 * \param vta The memory mapped region.
 * \param length The size of the memory mapped region in bytes.
 */
void UnmapRegister(void *vta, size_t length) {
  // Keep the call outside assert() so it still runs when NDEBUG is defined
  const int rc = munmap(vta, length);
  assert(rc == 0);
}
|
||||
|
||||
/*!
 * \brief Stores a 32-bit value at a byte offset from a mapped register base.
 * \param base_addr The base of the memory mapped configuration registers.
 * \param offset The byte offset of the register to write to.
 * \param val The value to be written to the memory mapped register.
 */
void WriteMappedReg(void* base_addr, uint32_t offset, uint32_t val) {
  // volatile access so the store is always performed
  volatile uint32_t *reg = (volatile uint32_t *) ((char *) base_addr + offset);
  *reg = val;
}
|
||||
|
||||
/*!
 * \brief Loads a 32-bit value from a byte offset of a mapped register base.
 * \param base_addr The base of the memory mapped configuration registers.
 * \param offset The byte offset of the register to read from.
 * \return The value read from the memory mapped register.
 */
uint32_t ReadMappedReg(void* base_addr, uint32_t offset) {
  // volatile access so the load is always performed
  volatile uint32_t *reg = (volatile uint32_t *) ((char *) base_addr + offset);
  return *reg;
}
|
||||
|
||||
void ProgramVTA(const char* bitstream) {
|
||||
|
||||
int elem;
|
||||
FILE *src, *dst, *partial;
|
||||
|
||||
partial = fopen(BS_IS_PARTIAL, "w");
|
||||
if (partial == NULL) {
|
||||
printf("Cannot open partial config file %s\n", BS_IS_PARTIAL);
|
||||
fclose(partial);
|
||||
exit(1);
|
||||
}
|
||||
fputc('0', partial);
|
||||
fclose(partial);
|
||||
|
||||
src = fopen(bitstream, "rb");
|
||||
if (src == NULL) {
|
||||
printf("Cannot open bitstream %s\n", bitstream);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
dst = fopen(BS_XDEVCFG, "wb");
|
||||
if (dst == NULL) {
|
||||
printf("Cannot open device file %s\n", BS_XDEVCFG);
|
||||
fclose(dst);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
elem = fgetc(src);
|
||||
while (elem != EOF) {
|
||||
fputc(elem, dst);
|
||||
elem = fgetc(src);
|
||||
}
|
||||
|
||||
fclose(src);
|
||||
fclose(dst);
|
||||
|
||||
}
|
|
@ -0,0 +1,789 @@
|
|||
/*!
|
||||
* Copyright (c) 2018 by Contributors
|
||||
* \file vta.cpp
|
||||
* \brief VTA HLS design.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "vta.h"
|
||||
|
||||
/*!
 * \brief Fetch stage: reads instructions from DRAM and dispatches each one to a stage queue.
 *
 * Routing, as implemented below:
 *  - OPCODE_STORE instructions go to store_queue;
 *  - OPCODE_LOAD instructions whose memory type is MEM_ID_INP or MEM_ID_WGT go to load_queue;
 *  - every other instruction goes to gemm_queue.
 *
 * \param insn_count Number of instructions to read from insns.
 * \param insns Instruction buffer (m_axi port).
 * \param load_queue Output instruction stream for the load stage.
 * \param gemm_queue Output instruction stream for the compute stage.
 * \param store_queue Output instruction stream for the store stage.
 */
void fetch (
|
||||
uint32_t insn_count,
|
||||
volatile insn_T *insns,
|
||||
hls::stream<insn_T> &load_queue,
|
||||
hls::stream<insn_T> &gemm_queue,
|
||||
hls::stream<insn_T> &store_queue) {
|
||||
#pragma HLS INTERFACE s_axilite port=insn_count bundle=CONTROL_BUS
|
||||
#pragma HLS INTERFACE m_axi port=insns offset=slave bundle=ins_port
|
||||
#pragma HLS INTERFACE axis port=load_queue
|
||||
#pragma HLS INTERFACE axis port=gemm_queue
|
||||
#pragma HLS INTERFACE axis port=store_queue
|
||||
#pragma HLS INTERFACE s_axilite port=return bundle=CONTROL_BUS
|
||||
|
||||
// One instruction is read and dispatched per loop iteration
INSN_DECODE: for (int pc = 0; pc < insn_count; pc ++) {
|
||||
#pragma HLS PIPELINE II=1
|
||||
// Read instruction fields
|
||||
insn_T insn = insns[pc];
|
||||
// Do some partial decoding: only the opcode and memory type are needed for routing
|
||||
opcode_T opcode = insn.range(INSN_MEM_0_1, INSN_MEM_0_0);
|
||||
memop_id_T memory_type = insn.range(INSN_MEM_5_1, INSN_MEM_5_0);
|
||||
// Push to appropriate instruction queue
|
||||
if (opcode == OPCODE_STORE) {
|
||||
store_queue.write(insn);
|
||||
} else if (opcode == OPCODE_LOAD &&
|
||||
(memory_type == MEM_ID_INP || memory_type == MEM_ID_WGT)) {
|
||||
load_queue.write(insn);
|
||||
} else {
|
||||
// Everything else, including loads of other memory types, goes to the compute stage
gemm_queue.write(insn);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/*!
 * \brief Load stage: executes one load instruction into the input or weight buffer.
 *
 * Each call pops one instruction from load_queue and then:
 *  1. decodes the memory-instruction fields;
 *  2. reads one token from g2l_dep_queue if the pop_next dependence flag is set;
 *  3. copies y_size rows of x_size elements from DRAM into inp_mem (MEM_ID_INP)
 *     or wgt_mem (any other memory type), leaving room for the requested padding;
 *  4. writes zeros to the padded rows and columns;
 *  5. writes one token to l2g_dep_queue if the push_next dependence flag is set.
 *
 * \param inputs Input data in DRAM (m_axi port).
 * \param weights Weight data in DRAM (m_axi port).
 * \param load_queue Instruction stream for this stage.
 * \param g2l_dep_queue Dependence tokens read by this stage.
 * \param l2g_dep_queue Dependence tokens written by this stage.
 * \param inp_mem Input buffer (bram port).
 * \param wgt_mem Weight buffer (bram port).
 */
void load (
|
||||
volatile inp_vec_T *inputs,
|
||||
volatile wgt_vec_T *weights,
|
||||
hls::stream<insn_T> &load_queue,
|
||||
hls::stream<bool> &g2l_dep_queue,
|
||||
hls::stream<bool> &l2g_dep_queue,
|
||||
inp_vec_T inp_mem[INP_BUFF_DEPTH][BATCH],
|
||||
wgt_vec_T wgt_mem[WGT_BUFF_DEPTH][BLOCK_OUT]
|
||||
) {
|
||||
#pragma HLS INTERFACE m_axi port=weights offset=slave bundle=data_port
|
||||
#pragma HLS INTERFACE m_axi port=inputs offset=slave bundle=data_port
|
||||
#pragma HLS INTERFACE axis port=load_queue
|
||||
#pragma HLS INTERFACE axis port=g2l_dep_queue
|
||||
#pragma HLS INTERFACE axis port=l2g_dep_queue
|
||||
#pragma HLS INTERFACE bram port=wgt_mem
|
||||
#pragma HLS INTERFACE bram port=inp_mem
|
||||
#pragma HLS INTERFACE s_axilite port=return bundle=CONTROL_BUS
|
||||
// #pragma HLS ARRAY_PARTITION variable=inp_mem complete dim=2
|
||||
|
||||
// Pop load instruction
|
||||
insn_T insn = load_queue.read();
|
||||
|
||||
// Decode instruction
// (pop_prev_dependence and push_prev_dependence are decoded but not used in this function)
|
||||
bool pop_prev_dependence = insn[INSN_MEM_1];
|
||||
bool pop_next_dependence = insn[INSN_MEM_2];
|
||||
bool push_prev_dependence = insn[INSN_MEM_3];
|
||||
bool push_next_dependence = insn[INSN_MEM_4];
|
||||
memop_id_T memory_type = insn.range(INSN_MEM_5_1, INSN_MEM_5_0);
|
||||
memop_sram_T sram_base = insn.range(INSN_MEM_6_1, INSN_MEM_6_0);
|
||||
memop_dram_T dram_base = insn.range(INSN_MEM_7_1, INSN_MEM_7_0);
|
||||
memop_size_T y_size = insn.range(INSN_MEM_8_1, INSN_MEM_8_0);
|
||||
memop_size_T x_size = insn.range(INSN_MEM_9_1, INSN_MEM_9_0);
|
||||
memop_stride_T x_stride = insn.range(INSN_MEM_A_1, INSN_MEM_A_0);
|
||||
memop_pad_T y_pad_0 = insn.range(INSN_MEM_B_1, INSN_MEM_B_0);
|
||||
memop_pad_T y_pad_1 = insn.range(INSN_MEM_C_1, INSN_MEM_C_0);
|
||||
memop_pad_T x_pad_0 = insn.range(INSN_MEM_D_1, INSN_MEM_D_0);
|
||||
memop_pad_T x_pad_1 = insn.range(INSN_MEM_E_1, INSN_MEM_E_0);
|
||||
|
||||
// Pop dependence token if instructed
|
||||
if (pop_next_dependence) {
|
||||
g2l_dep_queue.read();
|
||||
}
|
||||
|
||||
// Initialize indices
|
||||
memop_sram_T sram_idx = sram_base;
|
||||
memop_dram_T dram_idx = dram_base;
|
||||
|
||||
// Pre-compute dimensions, and offsets
// (padded row count, padded row length, and the SRAM entries taken by the leading padded rows)
|
||||
memop_size_T y_size_total = y_pad_0 + y_size + y_pad_1;
|
||||
memop_size_T x_size_total = x_pad_0 + x_size + x_pad_1;
|
||||
memop_sram_T y_offset = x_size_total * y_pad_0;
|
||||
#pragma HLS RESOURCE variable=y_offset core=Mul_LUT
|
||||
|
||||
// Skip padding along y dimension
|
||||
sram_idx += y_offset;
|
||||
|
||||
// Perform data transfer from DRAM, one row of x_size elements per iteration
|
||||
for (int y = 0; y < y_size; y ++) {
|
||||
#pragma HLS PIPELINE rewind
|
||||
// Skip padding along x dimension
|
||||
sram_idx += x_pad_0;
|
||||
// Perform data transfer; any memory type other than MEM_ID_INP is treated as weights
|
||||
if (memory_type == MEM_ID_INP) {
|
||||
memcpy(
|
||||
&inp_mem[sram_idx][0],
|
||||
(const inp_vec_T*) &inputs[dram_idx * BATCH],
|
||||
x_size * INP_ELEM_BYTES
|
||||
);
|
||||
} else {
|
||||
memcpy(
|
||||
&wgt_mem[sram_idx][0],
|
||||
(const wgt_vec_T*) &weights[dram_idx * BLOCK_OUT],
|
||||
x_size * WGT_ELEM_BYTES
|
||||
);
|
||||
}
|
||||
// SRAM advances by the row length, DRAM by the row stride
sram_idx += x_size;
|
||||
dram_idx += x_stride;
|
||||
// Skip padding along x dimension
|
||||
sram_idx += x_pad_1;
|
||||
}
|
||||
|
||||
// Reset SRAM index
|
||||
sram_idx = sram_base;
|
||||
// Pad x/y edges with zeros
|
||||
for (int y = 0; y < y_size_total; y ++) {
|
||||
// Rows above or below the data region are zeroed over their full padded length
if (y < y_pad_0 || y >= y_pad_0 + y_size) {
|
||||
for (int x = 0; x < x_size_total; x ++) {
|
||||
#pragma HLS PIPELINE II=1 rewind
|
||||
if (memory_type == MEM_ID_INP) {
|
||||
for (int i = 0; i < BATCH; i ++) {
|
||||
inp_mem[sram_idx][i] = 0;
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < BLOCK_OUT; i ++) {
|
||||
wgt_mem[sram_idx][i] = 0;
|
||||
}
|
||||
}
|
||||
sram_idx ++;
|
||||
}
|
||||
} else {
|
||||
// Data rows: zero the x_pad_0 leading entries
for (int x = 0; x < x_pad_0; x ++) {
|
||||
#pragma HLS PIPELINE II=1 rewind
|
||||
if (memory_type == MEM_ID_INP) {
|
||||
for (int i = 0; i < BATCH; i ++) {
|
||||
inp_mem[sram_idx][i] = 0;
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < BLOCK_OUT; i ++) {
|
||||
wgt_mem[sram_idx][i] = 0;
|
||||
}
|
||||
}
|
||||
sram_idx ++;
|
||||
}
|
||||
// Step over the x_size entries filled by the transfer loop above
sram_idx += x_size;
|
||||
// Data rows: zero the x_pad_1 trailing entries
for (int x = 0; x < x_pad_1; x ++) {
|
||||
#pragma HLS PIPELINE II=1 rewind
|
||||
if (memory_type == MEM_ID_INP) {
|
||||
for (int i = 0; i < BATCH; i ++) {
|
||||
inp_mem[sram_idx][i] = 0;
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < BLOCK_OUT; i ++) {
|
||||
wgt_mem[sram_idx][i] = 0;
|
||||
}
|
||||
}
|
||||
sram_idx ++;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// Push dependence token if instructed
|
||||
if (push_next_dependence) {
|
||||
l2g_dep_queue.write(1);
|
||||
}
|
||||
}
|
||||
|
||||
void compute (
|
||||
volatile uint32_t &done,
|
||||
volatile uop_T *uops,
|
||||
volatile acc_vec_T *biases,
|
||||
hls::stream<insn_T> &gemm_queue,
|
||||
hls::stream<bool> &l2g_dep_queue,
|
||||
hls::stream<bool> &s2g_dep_queue,
|
||||
hls::stream<bool> &g2l_dep_queue,
|
||||
hls::stream<bool> &g2s_dep_queue,
|
||||
out_vec_T inp_mem[INP_BUFF_DEPTH][BATCH],
|
||||
wgt_vec_T wgt_mem[WGT_BUFF_DEPTH][BLOCK_OUT],
|
||||
out_vec_T out_mem[ACC_BUFF_DEPTH][BATCH]
|
||||
) {
|
||||
#pragma HLS INTERFACE s_axilite port=done bundle=CONTROL_BUS
|
||||
#pragma HLS INTERFACE m_axi port=uops offset=slave bundle=uop_port
|
||||
#pragma HLS INTERFACE m_axi port=biases offset=slave bundle=data_port
|
||||
#pragma HLS INTERFACE axis port=gemm_queue
|
||||
#pragma HLS INTERFACE axis port=l2g_dep_queue
|
||||
#pragma HLS INTERFACE axis port=s2g_dep_queue
|
||||
#pragma HLS INTERFACE axis port=g2l_dep_queue
|
||||
#pragma HLS INTERFACE axis port=g2s_dep_queue
|
||||
#pragma HLS INTERFACE bram port=inp_mem
|
||||
#pragma HLS INTERFACE bram port=wgt_mem
|
||||
#pragma HLS INTERFACE bram port=out_mem
|
||||
#pragma HLS INTERFACE s_axilite port=return bundle=CONTROL_BUS
|
||||
// #pragma HLS ARRAY_PARTITION variable=inp_mem complete dim=2
|
||||
// #pragma HLS ARRAY_PARTITION variable=out_mem complete dim=2
|
||||
// This is necessary connect the SRAM to the load module
|
||||
#pragma HLS RESOURCE variable=wgt_mem core=RAM_1P
|
||||
|
||||
// Micro-op storage
|
||||
static uop_T uop_mem[UOP_BUFF_DEPTH];
|
||||
|
||||
// Accumulator storage
|
||||
static acc_vec_T acc_mem[ACC_BUFF_DEPTH][BATCH];
|
||||
#pragma HLS ARRAY_PARTITION variable=acc_mem complete dim=2
|
||||
|
||||
// Pop GEMM instruction
|
||||
insn_T insn = gemm_queue.read();
|
||||
|
||||
// Decode
|
||||
opcode_T opcode = insn.range(INSN_MEM_0_1, INSN_MEM_0_0);
|
||||
bool pop_prev_dependence = insn[INSN_MEM_1];
|
||||
bool pop_next_dependence = insn[INSN_MEM_2];
|
||||
bool push_prev_dependence = insn[INSN_MEM_3];
|
||||
bool push_next_dependence = insn[INSN_MEM_4];
|
||||
|
||||
// Pop dependence token if instructed
|
||||
if (pop_prev_dependence) {
|
||||
l2g_dep_queue.read();
|
||||
}
|
||||
if (pop_next_dependence) {
|
||||
s2g_dep_queue.read();
|
||||
}
|
||||
|
||||
// Perform action based on opcode
|
||||
if (opcode == OPCODE_FINISH) {
|
||||
|
||||
// Set done flag if we reach a FINISH instruction
|
||||
done = 1;
|
||||
|
||||
} else if (opcode == OPCODE_LOAD || opcode == OPCODE_STORE) {
|
||||
|
||||
// Set done value
|
||||
done = 0;
|
||||
|
||||
// Decode instruction
|
||||
memop_id_T memory_type = insn.range(INSN_MEM_5_1, INSN_MEM_5_0);
|
||||
memop_sram_T sram_base = insn.range(INSN_MEM_6_1, INSN_MEM_6_0);
|
||||
memop_dram_T dram_base = insn.range(INSN_MEM_7_1, INSN_MEM_7_0);
|
||||
memop_size_T y_size = insn.range(INSN_MEM_8_1, INSN_MEM_8_0);
|
||||
memop_size_T x_size = insn.range(INSN_MEM_9_1, INSN_MEM_9_0);
|
||||
memop_stride_T x_stride = insn.range(INSN_MEM_A_1, INSN_MEM_A_0);
|
||||
memop_pad_T y_pad_0 = insn.range(INSN_MEM_B_1, INSN_MEM_B_0);
|
||||
memop_pad_T y_pad_1 = insn.range(INSN_MEM_C_1, INSN_MEM_C_0);
|
||||
memop_pad_T x_pad_0 = insn.range(INSN_MEM_D_1, INSN_MEM_D_0);
|
||||
memop_pad_T x_pad_1 = insn.range(INSN_MEM_E_1, INSN_MEM_E_0);
|
||||
|
||||
// Initialize indices
|
||||
memop_sram_T sram_idx = sram_base;
|
||||
memop_dram_T dram_idx = dram_base;
|
||||
|
||||
// Pre-compute dimensions, and offsets
|
||||
memop_size_T y_size_total = y_pad_0 + y_size + y_pad_1;
|
||||
memop_size_T x_size_total = x_pad_0 + x_size + x_pad_1;
|
||||
memop_sram_T y_offset = x_size_total * y_pad_0;
|
||||
#pragma HLS RESOURCE variable=y_offset core=Mul_LUT
|
||||
|
||||
if (memory_type == MEM_ID_UOP) {
|
||||
// Perform data transfer
|
||||
memcpy(
|
||||
&uop_mem[sram_base],
|
||||
(const uop_T*) &uops[dram_base],
|
||||
x_size * sizeof(uop_T)
|
||||
);
|
||||
} else {
|
||||
// Skip vertical padding
|
||||
sram_idx += y_offset;
|
||||
// Perform data transfer from DRAM
|
||||
for (int y = 0; y < y_size; y ++) {
|
||||
#pragma HLS PIPELINE rewind
|
||||
// Skip padding along x dimension
|
||||
sram_idx += x_pad_0;
|
||||
// Perform data transfer
|
||||
memcpy(
|
||||
&acc_mem[sram_idx][0],
|
||||
(const acc_vec_T*) &biases[dram_idx * BATCH],
|
||||
x_size*ACC_ELEM_BYTES
|
||||
);
|
||||
sram_idx += x_size;
|
||||
dram_idx += x_stride;
|
||||
// Skip padding along x dimension
|
||||
sram_idx += x_pad_1;
|
||||
}
|
||||
}
|
||||
|
||||
} else if (opcode == OPCODE_GEMM || opcode == OPCODE_ALU) {
|
||||
|
||||
// Set done value
|
||||
done = 0;
|
||||
|
||||
// Decode
|
||||
uop_idx_T uop_bgn = insn.range(INSN_GEM_5_1, INSN_GEM_5_0);
|
||||
uop_idx_T uop_end = insn.range(INSN_GEM_6_1, INSN_GEM_6_0);
|
||||
loop_T iter_out = insn.range(INSN_GEM_7_1, INSN_GEM_7_0);
|
||||
loop_T iter_in = insn.range(INSN_GEM_8_1, INSN_GEM_8_0);
|
||||
acc_idx_T dst_factor_out = insn.range(INSN_GEM_9_1, INSN_GEM_9_0);
|
||||
acc_idx_T dst_factor_in = insn.range(INSN_GEM_A_1, INSN_GEM_A_0);
|
||||
inp_idx_T src_factor_out = insn.range(INSN_GEM_B_1, INSN_GEM_B_0);
|
||||
inp_idx_T src_factor_in = insn.range(INSN_GEM_C_1, INSN_GEM_C_0);
|
||||
|
||||
// GEMM-specific fields
|
||||
wgt_idx_T wgt_factor_out = insn.range(INSN_GEM_D_1, INSN_GEM_D_0);
|
||||
wgt_idx_T wgt_factor_in = insn.range(INSN_GEM_E_1, INSN_GEM_E_0);
|
||||
|
||||
// ALU-specific field
|
||||
aluop_opcode_T alu_opcode = insn.range(INSN_ALU_D_1, INSN_ALU_D_0);
|
||||
bool use_imm = insn[INSN_ALU_E];
|
||||
aluop_imm_T imm = insn.range(INSN_ALU_F_1, INSN_ALU_F_0);
|
||||
|
||||
acc_idx_T dst_offset_out = 0;
|
||||
inp_idx_T src_offset_out = 0;
|
||||
wgt_idx_T wgt_offset_out = 0;
|
||||
|
||||
// Outer Loop
|
||||
EXE_OUT_LOOP: for (int it_out = 0; it_out < iter_out; it_out ++) {
|
||||
#pragma HLS DEPENDENCE variable=acc_mem inter false
|
||||
|
||||
acc_idx_T dst_offset_in = dst_offset_out;
|
||||
inp_idx_T src_offset_in = src_offset_out;
|
||||
wgt_idx_T wgt_offset_in = wgt_offset_out;
|
||||
|
||||
// Inner Loop
|
||||
EXE_IN_LOOP: for (int it_in = 0; it_in < iter_in; it_in ++) {
|
||||
|
||||
// Perform appropriate computation based on opcode
|
||||
if (opcode == OPCODE_GEMM) {
|
||||
|
||||
// Iterate over micro op
|
||||
READ_GEMM_UOP: for (int upc = uop_bgn; upc < uop_end; upc ++) {
|
||||
#pragma HLS PIPELINE II=1 rewind
|
||||
|
||||
// Read micro-op fields
|
||||
uop_T uop = uop_mem[upc];
|
||||
|
||||
// Decode indices
|
||||
bool reset_out = uop[UOP_GEM_0];
|
||||
acc_idx_T dst_idx =
|
||||
uop.range(UOP_GEM_1_1, UOP_GEM_1_0) + dst_offset_in;
|
||||
acc_idx_T src_idx =
|
||||
uop.range(UOP_GEM_2_1, UOP_GEM_2_0) + src_offset_in;
|
||||
wgt_idx_T wgt_idx =
|
||||
uop.range(UOP_GEM_3_1, UOP_GEM_3_0) + wgt_offset_in;
|
||||
|
||||
// Read weight matrix
|
||||
wgt_vec_T w_matrix[BLOCK_OUT];
|
||||
for (int i = 0; i < BLOCK_OUT; i ++) {
|
||||
w_matrix[i] = wgt_mem[wgt_idx][i];
|
||||
}
|
||||
// Read input matrix and accum matrix
|
||||
acc_vec_T o_matrix[BATCH];
|
||||
out_vec_T i_matrix[BATCH];
|
||||
for (int i = 0; i < BATCH; i ++) {
|
||||
o_matrix[i] = acc_mem[dst_idx][i];
|
||||
i_matrix[i] = inp_mem[src_idx][i];
|
||||
}
|
||||
// Result matrices
|
||||
acc_vec_T acc_mem_val[BATCH];
|
||||
out_vec_T st_buf_val[BATCH];
|
||||
|
||||
// Inner GEMM loop
|
||||
for (int i = 0; i < BATCH; i ++) {
|
||||
for (int b = 0; b < BLOCK_OUT; b ++) {
|
||||
// Initialize the accumulator values
|
||||
acc_T accum =
|
||||
o_matrix[i].range((b + 1) * ACC_WIDTH - 1, b * ACC_WIDTH);
|
||||
// Dot product sum
|
||||
sum_T tmp = 0;
|
||||
// Inner matrix multiplication loop (input channel/feature)
|
||||
for (int k=0; k<BLOCK_IN; k++) {
|
||||
wgt_T w_elem =
|
||||
w_matrix[b].range((k + 1) * WGT_WIDTH - 1, k * WGT_WIDTH);
|
||||
inp_T i_elem =
|
||||
i_matrix[i].range((k + 1) * INP_WIDTH - 1, k * INP_WIDTH);
|
||||
mul_T prod = i_elem * w_elem;
|
||||
tmp += (sum_T) prod;
|
||||
}
|
||||
// Update summation
|
||||
accum += (acc_T) tmp;
|
||||
// Update result vector
|
||||
acc_mem_val[i].range((b + 1) * ACC_WIDTH - 1, b * ACC_WIDTH) =
|
||||
reset_out ? (acc_T) 0 : accum;
|
||||
st_buf_val[i].range((b + 1) * INP_WIDTH - 1, b * INP_WIDTH) =
|
||||
(inp_T) accum.range(INP_WIDTH - 1, 0);
|
||||
}
|
||||
// Write to buffers
|
||||
acc_mem[dst_idx][i] = acc_mem_val[i];
|
||||
out_mem[dst_idx][i] = st_buf_val[i];
|
||||
}
|
||||
}
|
||||
|
||||
} else if (opcode == OPCODE_ALU) {
|
||||
|
||||
// Iterate over micro op
|
||||
READ_ALU_UOP: for (int upc = uop_bgn; upc < uop_end; upc ++) {
|
||||
|
||||
// Read micro-op fields
|
||||
uop_T uop = uop_mem[upc];
|
||||
|
||||
// Decode
|
||||
bool reset_out = uop[UOP_ALU_0];
|
||||
acc_idx_T dst_idx =
|
||||
uop.range(UOP_ALU_1_1, UOP_ALU_1_0) + dst_offset_in;
|
||||
acc_idx_T src_idx =
|
||||
uop.range(UOP_ALU_2_1, UOP_ALU_2_0) + src_offset_in;
|
||||
|
||||
// Read input matrix and accum matrix
|
||||
acc_vec_T dst_matrix[BATCH];
|
||||
acc_vec_T src_matrix[BATCH];
|
||||
for (int i = 0; i < BATCH; i ++) {
|
||||
#pragma HLS UNROLL complete
|
||||
dst_matrix[i] = acc_mem[dst_idx][i];
|
||||
src_matrix[i] = acc_mem[src_idx][i];
|
||||
}
|
||||
|
||||
// Result matrices
|
||||
acc_vec_T cmp_res[BATCH];
|
||||
acc_vec_T add_res[BATCH];
|
||||
acc_vec_T shr_res[BATCH];
|
||||
out_vec_T short_cmp_res[BATCH];
|
||||
out_vec_T short_add_res[BATCH];
|
||||
out_vec_T short_shr_res[BATCH];
|
||||
|
||||
// Perform ALU op over matrix elements
|
||||
for (int i = 0; i < BATCH; i ++) {
|
||||
#pragma HLS PIPELINE II=1 rewind
|
||||
// Results vector
|
||||
acc_vec_T res_vec = 0;
|
||||
for (int b = 0; b < BLOCK_OUT; b ++) {
|
||||
// Read in operands
|
||||
acc_T src_0 =
|
||||
dst_matrix[i].range((b + 1) * ACC_WIDTH - 1, b * ACC_WIDTH);
|
||||
acc_T src_1 =
|
||||
use_imm ?
|
||||
(acc_T) imm :
|
||||
src_matrix[i].range((b + 1) * ACC_WIDTH - 1, b * ACC_WIDTH);
|
||||
// Compute Min/Max
|
||||
acc_T mix_val =
|
||||
src_0 < src_1 ?
|
||||
(alu_opcode == ALU_OPCODE_MIN ? src_0 : src_1) :
|
||||
(alu_opcode == ALU_OPCODE_MIN ? src_1 : src_0);
|
||||
cmp_res[i].range((b + 1) * ACC_WIDTH - 1, b * ACC_WIDTH) =
|
||||
mix_val;
|
||||
short_cmp_res[i].range((b + 1) * INP_WIDTH - 1, b * INP_WIDTH) =
|
||||
(inp_T) mix_val.range(INP_WIDTH - 1, 0);
|
||||
// Compute Sum
|
||||
acc_T add_val =
|
||||
src_0.range(ACC_WIDTH - 1, 0) + src_1.range(ACC_WIDTH - 1, 0);
|
||||
add_res[i].range((b + 1) * ACC_WIDTH - 1, b * ACC_WIDTH) =
|
||||
add_val;
|
||||
short_add_res[i].range((b + 1) * INP_WIDTH - 1, b * INP_WIDTH) =
|
||||
(inp_T) add_val.range(INP_WIDTH - 1, 0);
|
||||
// Compute Shift
|
||||
acc_T shr_val =
|
||||
src_0 >> (aluop_sh_imm_T) src_1.range(LOG_ACC_WIDTH - 1, 0);
|
||||
shr_res[i].range((b + 1) * ACC_WIDTH - 1, b * ACC_WIDTH) =
|
||||
shr_val;
|
||||
short_shr_res[i].range((b + 1) * INP_WIDTH - 1, b * INP_WIDTH) =
|
||||
(inp_T) shr_val.range(INP_WIDTH-1, 0);
|
||||
}
|
||||
|
||||
// Store to accum memory/store buffer
|
||||
if (alu_opcode == ALU_OPCODE_MIN ||
|
||||
alu_opcode == ALU_OPCODE_MAX) {
|
||||
acc_mem[dst_idx][i] = cmp_res[i];
|
||||
out_mem[dst_idx][i] = short_cmp_res[i];
|
||||
} else if (alu_opcode==ALU_OPCODE_ADD) {
|
||||
acc_mem[dst_idx][i] = add_res[i];
|
||||
out_mem[dst_idx][i] = short_add_res[i];
|
||||
} else if (alu_opcode==ALU_OPCODE_SHR) {
|
||||
acc_mem[dst_idx][i] = shr_res[i];
|
||||
out_mem[dst_idx][i] = short_shr_res[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Update offsets
|
||||
dst_offset_in += dst_factor_in;
|
||||
src_offset_in += src_factor_in;
|
||||
wgt_offset_in += wgt_factor_in;
|
||||
}
|
||||
|
||||
// Update offsets
|
||||
dst_offset_out += dst_factor_out;
|
||||
src_offset_out += src_factor_out;
|
||||
wgt_offset_out += wgt_factor_out;
|
||||
}
|
||||
}
|
||||
|
||||
// Push dependence token if instructed
|
||||
if (push_prev_dependence) {
|
||||
g2l_dep_queue.write(1);
|
||||
}
|
||||
if (push_next_dependence) {
|
||||
g2s_dep_queue.write(1);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void store (
|
||||
volatile out_vec_T *outputs,
|
||||
hls::stream<insn_T> &store_queue,
|
||||
hls::stream<bool> &g2s_dep_queue,
|
||||
hls::stream<bool> &s2g_dep_queue,
|
||||
out_vec_T out_mem[ACC_BUFF_DEPTH][BATCH]
|
||||
) {
|
||||
#pragma HLS INTERFACE m_axi port=outputs offset=slave bundle=data_port
|
||||
#pragma HLS INTERFACE axis port=store_queue
|
||||
#pragma HLS INTERFACE axis port=g2s_dep_queue
|
||||
#pragma HLS INTERFACE axis port=s2g_dep_queue
|
||||
#pragma HLS INTERFACE bram port=out_mem
|
||||
#pragma HLS INTERFACE s_axilite port=return bundle=CONTROL_BUS
|
||||
// #pragma HLS ARRAY_PARTITION variable=out_mem complete dim=2
|
||||
|
||||
// Load buffer
|
||||
insn_T insn = store_queue.read();
|
||||
|
||||
// Decode
|
||||
bool pop_prev_dependence = insn[INSN_MEM_1];
|
||||
bool pop_next_dependence = insn[INSN_MEM_2];
|
||||
bool push_prev_dependence = insn[INSN_MEM_3];
|
||||
bool push_next_dependence = insn[INSN_MEM_4];
|
||||
memop_id_T memory_type = insn.range(INSN_MEM_5_1, INSN_MEM_5_0);
|
||||
memop_sram_T sram_base = insn.range(INSN_MEM_6_1, INSN_MEM_6_0);
|
||||
memop_dram_T dram_base = insn.range(INSN_MEM_7_1, INSN_MEM_7_0);
|
||||
memop_size_T y_size = insn.range(INSN_MEM_8_1, INSN_MEM_8_0);
|
||||
memop_size_T x_size = insn.range(INSN_MEM_9_1, INSN_MEM_9_0);
|
||||
memop_stride_T x_stride = insn.range(INSN_MEM_A_1, INSN_MEM_A_0);
|
||||
memop_pad_T y_pad_0 = insn.range(INSN_MEM_B_1, INSN_MEM_B_0);
|
||||
memop_pad_T y_pad_1 = insn.range(INSN_MEM_C_1, INSN_MEM_C_0);
|
||||
memop_pad_T x_pad_0 = insn.range(INSN_MEM_D_1, INSN_MEM_D_0);
|
||||
memop_pad_T x_pad_1 = insn.range(INSN_MEM_E_1, INSN_MEM_E_0);
|
||||
|
||||
// Pop dependence token if instructed
|
||||
if (pop_prev_dependence) {
|
||||
g2s_dep_queue.read();
|
||||
}
|
||||
|
||||
// Initialize indices
|
||||
memop_sram_T sram_idx = sram_base;
|
||||
memop_dram_T dram_idx = dram_base;
|
||||
|
||||
// Skip padding along y dimension
|
||||
memop_sram_T y_offset = (x_pad_0 + x_size + x_pad_1) * y_pad_0;
|
||||
sram_idx += y_offset;
|
||||
#pragma HLS RESOURCE variable=y_offset core=Mul_LUT
|
||||
|
||||
// Copy along y dimension
|
||||
for (int y = 0; y < y_size; y ++) {
|
||||
#pragma HLS PIPELINE rewind
|
||||
// Skip padding along x dimension
|
||||
sram_idx += x_pad_0;
|
||||
// Perform data transfer
|
||||
memcpy(
|
||||
(out_vec_T *) &outputs[dram_idx*BATCH],
|
||||
(const out_vec_T*) &out_mem[sram_idx][0],
|
||||
x_size * INP_ELEM_BYTES);
|
||||
sram_idx += x_size;
|
||||
dram_idx += x_stride;
|
||||
// Skip padding along x dimension
|
||||
sram_idx += x_pad_1;
|
||||
}
|
||||
|
||||
// Push dependence token if instructed
|
||||
if (push_prev_dependence) {
|
||||
s2g_dep_queue.write(1);
|
||||
}
|
||||
}
|
||||
|
||||
void vta (
|
||||
uint32_t insn_count,
|
||||
volatile insn_T *insns,
|
||||
volatile uop_T *uops,
|
||||
volatile inp_vec_T *inputs,
|
||||
volatile wgt_vec_T *weights,
|
||||
volatile acc_vec_T *biases,
|
||||
volatile out_vec_T *outputs) {
|
||||
#pragma HLS INTERFACE s_axilite port=insn_count bundle=CONTROL_BUS
|
||||
#pragma HLS INTERFACE m_axi port=insns offset=slave bundle=ins_port
|
||||
#pragma HLS INTERFACE m_axi port=uops offset=slave bundle=uop_port
|
||||
#pragma HLS INTERFACE m_axi port=inputs offset=slave bundle=data_port
|
||||
#pragma HLS INTERFACE m_axi port=weights offset=slave bundle=data_port
|
||||
#pragma HLS INTERFACE m_axi port=biases offset=slave bundle=data_port
|
||||
#pragma HLS INTERFACE m_axi port=outputs offset=slave bundle=data_port
|
||||
#pragma HLS INTERFACE s_axilite port=return bundle=CONTROL_BUS
|
||||
|
||||
// Instantiate temporary instruction queues (used for peeking)
|
||||
hls::stream<insn_T> tmp_load_queue;
|
||||
hls::stream<insn_T> tmp_gemm_queue;
|
||||
hls::stream<insn_T> tmp_store_queue;
|
||||
|
||||
// Instatiate physical instruction queues
|
||||
hls::stream<insn_T> load_queue;
|
||||
hls::stream<insn_T> gemm_queue;
|
||||
hls::stream<insn_T> store_queue;
|
||||
|
||||
// Dependence queues
|
||||
hls::stream<bool> l2g_dep_queue;
|
||||
hls::stream<bool> s2g_dep_queue;
|
||||
hls::stream<bool> g2l_dep_queue;
|
||||
hls::stream<bool> g2s_dep_queue;
|
||||
|
||||
// Instantiate memories
|
||||
inp_vec_T inp_mem[INP_BUFF_DEPTH][BATCH];
|
||||
wgt_vec_T wgt_mem[WGT_BUFF_DEPTH][BLOCK_OUT];
|
||||
out_vec_T out_mem[ACC_BUFF_DEPTH][BATCH];
|
||||
|
||||
// Push all instructions into the queues
|
||||
fetch(
|
||||
insn_count,
|
||||
insns,
|
||||
tmp_load_queue,
|
||||
tmp_gemm_queue,
|
||||
tmp_store_queue
|
||||
);
|
||||
|
||||
// Global done indicator
|
||||
uint32_t done = 0;
|
||||
|
||||
// Temporary instructions
|
||||
insn_T tmp_load;
|
||||
insn_T tmp_gemv;
|
||||
insn_T tmp_store;
|
||||
|
||||
// Peeking status
|
||||
bool tmp_load_popped = false;
|
||||
bool tmp_gemm_popped = false;
|
||||
bool tmp_store_popped = false;
|
||||
int exit_counter = 0;
|
||||
|
||||
// Main control loop
|
||||
while (true) {
|
||||
// First execute as many load instructions as possible
|
||||
while (!tmp_load_queue.empty() || tmp_load_popped == true) {
|
||||
// Pop the load instruction
|
||||
if (!tmp_load_popped) {
|
||||
tmp_load_queue.read(tmp_load);
|
||||
tmp_load_popped = true;
|
||||
}
|
||||
// Check dependences and invoke the load stage
|
||||
bool pop_next_dependence = tmp_load[INSN_MEM_2];
|
||||
if ((pop_next_dependence && !g2l_dep_queue.empty()) ||
|
||||
!pop_next_dependence) {
|
||||
// Push the instruction in the load queue
|
||||
load_queue.write(tmp_load);
|
||||
tmp_load_popped = false;
|
||||
load(
|
||||
inputs,
|
||||
weights,
|
||||
load_queue,
|
||||
g2l_dep_queue,
|
||||
l2g_dep_queue,
|
||||
inp_mem,
|
||||
wgt_mem
|
||||
);
|
||||
} else {
|
||||
// Execution of load stage pending on completion of other stages, so break here...
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Next execute as many gemm instructions as possible
|
||||
while (!tmp_gemm_queue.empty() || tmp_gemm_popped == true) {
|
||||
// Pop the gemm instruction
|
||||
if (!tmp_gemm_popped) {
|
||||
tmp_gemm_queue.read(tmp_gemv);
|
||||
tmp_gemm_popped = true;
|
||||
}
|
||||
// Check dependences and invoke the load stage
|
||||
bool pop_prev_dependence = tmp_gemv[INSN_MEM_1];
|
||||
bool pop_next_dependence = tmp_gemv[INSN_MEM_2];
|
||||
if (
|
||||
(pop_prev_dependence && !l2g_dep_queue.empty() &&
|
||||
pop_next_dependence && !s2g_dep_queue.empty()) ||
|
||||
(!pop_prev_dependence && pop_next_dependence &&
|
||||
!s2g_dep_queue.empty()) ||
|
||||
(pop_prev_dependence && !l2g_dep_queue.empty() &&
|
||||
!pop_next_dependence) ||
|
||||
(!pop_prev_dependence && !pop_next_dependence)
|
||||
) {
|
||||
// Push the instruction in the load queue
|
||||
gemm_queue.write(tmp_gemv);
|
||||
tmp_gemm_popped = false;
|
||||
compute(
|
||||
done,
|
||||
uops,
|
||||
biases,
|
||||
gemm_queue,
|
||||
l2g_dep_queue,
|
||||
s2g_dep_queue,
|
||||
g2l_dep_queue,
|
||||
g2s_dep_queue,
|
||||
inp_mem,
|
||||
wgt_mem,
|
||||
out_mem
|
||||
);
|
||||
} else {
|
||||
// Execution of load stage pending on completion of other stages,
|
||||
// so break here...
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Finally execute as many store instructions as possible
|
||||
while (!tmp_store_queue.empty() || tmp_store_popped == true) {
|
||||
// Pop the load instruction
|
||||
if (!tmp_store_popped) {
|
||||
tmp_store_queue.read(tmp_store);
|
||||
tmp_store_popped = true;
|
||||
}
|
||||
// Check dependences and invoke the load stage
|
||||
bool pop_prev_dependence = tmp_store[INSN_MEM_1];
|
||||
if ((pop_prev_dependence && !g2s_dep_queue.empty()) ||
|
||||
!pop_prev_dependence) {
|
||||
// Push the instruction in the load queue
|
||||
store_queue.write(tmp_store);
|
||||
tmp_store_popped = false;
|
||||
store(
|
||||
outputs,
|
||||
store_queue,
|
||||
g2s_dep_queue,
|
||||
s2g_dep_queue,
|
||||
out_mem
|
||||
);
|
||||
} else {
|
||||
// Execution of load stage pending on completion of other stages, so break here...
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Check if we get a signal that we are done
|
||||
if (done) {
|
||||
break;
|
||||
}
|
||||
exit_counter ++;
|
||||
if (exit_counter > 1000) {
|
||||
if (tmp_load_popped) {
|
||||
if (g2l_dep_queue.empty()) {
|
||||
printf("waiting on g2l\n");
|
||||
}
|
||||
}
|
||||
if (tmp_gemm_popped) {
|
||||
if (l2g_dep_queue.empty() && tmp_gemv[INSN_MEM_1]) {
|
||||
printf("waiting on l2g\n");
|
||||
}
|
||||
if (s2g_dep_queue.empty() && tmp_gemv[INSN_MEM_2]) {
|
||||
printf("waiting on s2g\n");
|
||||
}
|
||||
}
|
||||
if (tmp_store_popped) {
|
||||
if (g2s_dep_queue.empty()) {
|
||||
printf("waiting on g2s\n");
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Ensure that the tokens are empty
|
||||
bool tmp_tok;
|
||||
int l2g_count = 0;
|
||||
int s2g_count = 0;
|
||||
int g2l_count = 0;
|
||||
int g2s_count = 0;
|
||||
while(l2g_dep_queue.read_nb(tmp_tok)) {
|
||||
l2g_count ++;
|
||||
}
|
||||
while(s2g_dep_queue.read_nb(tmp_tok)) {
|
||||
s2g_count ++;
|
||||
}
|
||||
while(g2l_dep_queue.read_nb(tmp_tok)) {
|
||||
g2l_count ++;
|
||||
}
|
||||
while(g2s_dep_queue.read_nb(tmp_tok)) {
|
||||
g2s_count ++;
|
||||
}
|
||||
|
||||
assert(l2g_count == 0 && g2s_count == 0 && g2l_count == 0 && g2s_count == 0);
|
||||
}
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -0,0 +1,59 @@
|
|||
# Builds the `vta` driver test executable from the PYNQ driver, the test
# library and driver_test, with the VTA hardware parameters passed as -D flags.

# NOTE(review): GNU make predefines CC (default `cc`), so `?=` does not take
# effect unless CC is explicitly undefined; pass CC=g++ on the command line if
# g++ is required -- confirm against the intended toolchain.
CC ?= g++
CFLAGS = -Wall -O3 -std=c++11 -I/usr/include
LDFLAGS = -L/usr/lib -L/home/xilinx/pynq/drivers
LIBS = -l:libsds_lib.so -l:libdma.so
SRC_DIR = ../../src
INCLUDE_DIR = ../../include
DRIVER_DIR = $(SRC_DIR)/driver/pynq
TESTLIB_DIR = $(SRC_DIR)/test
# Search path used by make to locate the sources listed below
VPATH = $(DRIVER_DIR):$(TESTLIB_DIR)
SOURCES = vta_pynq_driver.c vta_test_lib.cc
OBJECTS = vta_pynq_driver.o vta_test_lib.o driver_test.o
EXECUTABLE = vta

# VTA Parameters
# Log of input width in bits
LOG_INP_WIDTH = 3
# Log of weight width in bits
LOG_WGT_WIDTH = 3
# Log of accum width in bits
LOG_ACC_WIDTH = 5
# Log of output width in bits
LOG_OUT_WIDTH = $(LOG_INP_WIDTH)
# Log of tensor batch size (A in (A,B)x(B,C) matrix multiplication)
LOG_BATCH = 0
# Log of tensor inner block size (B in (A,B)x(B,C) matrix multiplication)
LOG_IN_BLOCK = 4
# Log of tensor outer block size (C in (A,B)x(B,C) matrix multiplication)
LOG_OUT_BLOCK = 4
# Log of uop buffer size in Bytes
LOG_UOP_BUFF_SIZE = 15
# Log of inp buffer size in Bytes
LOG_INP_BUFF_SIZE = 15
# Log of wgt buffer size in Bytes
LOG_WGT_BUFF_SIZE = 15
# Log of acc buffer size in Bytes
LOG_ACC_BUFF_SIZE = 17
# Log of out buffer size in Bytes (acc buffer size scaled by out/acc width)
LOG_OUT_BUFF_SIZE = $(shell echo "$$(( $(LOG_ACC_BUFF_SIZE)+$(LOG_OUT_WIDTH)-$(LOG_ACC_WIDTH) ))" )

# Define flags
CFLAGS += -I $(INCLUDE_DIR) -DNO_SIM \
  -DDEBUG=0 -DLOG_WGT_WIDTH=$(LOG_WGT_WIDTH) -DLOG_INP_WIDTH=$(LOG_INP_WIDTH) \
  -DLOG_ACC_WIDTH=$(LOG_ACC_WIDTH) -DLOG_OUT_WIDTH=$(LOG_OUT_WIDTH) \
  -DLOG_BATCH=$(LOG_BATCH) -DLOG_BLOCK_IN=$(LOG_IN_BLOCK) -DLOG_BLOCK_OUT=$(LOG_OUT_BLOCK) \
  -DLOG_UOP_BUFF_SIZE=$(LOG_UOP_BUFF_SIZE) -DLOG_INP_BUFF_SIZE=$(LOG_INP_BUFF_SIZE) \
  -DLOG_WGT_BUFF_SIZE=$(LOG_WGT_BUFF_SIZE) -DLOG_ACC_BUFF_SIZE=$(LOG_ACC_BUFF_SIZE) \
  -DLOG_OUT_BUFF_SIZE=$(LOG_OUT_BUFF_SIZE)

# `all` and `clean` name actions, not files
.PHONY: all clean

# All Target
all: $(EXECUTABLE)

# Every .cc object also depends on the full source list
%.o: %.cc $(SOURCES)
	$(CC) -c -o $@ $< $(CFLAGS)

$(EXECUTABLE): $(OBJECTS)
	$(CC) $(LDFLAGS) $(OBJECTS) -o $@ $(LIBS)

clean:
	rm -rf *.o $(EXECUTABLE)
|
|
@ -0,0 +1,152 @@
|
|||
/*!
|
||||
* Copyright (c) 2018 by Contributors
|
||||
* \file driver_test.cpp
|
||||
* \brief Bare-metal test to test driver and VTA design.
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
#include "vta_test_lib.h"
|
||||
#include "vta_pynq_driver.h"
|
||||
|
||||
// VTA invocation (present the same abstraction as in the simulation tests)
|
||||
uint64_t vta (
|
||||
uint32_t insn_count,
|
||||
VTAGenericInsn *insns,
|
||||
VTAUop *uops,
|
||||
inp_T *inputs,
|
||||
wgt_T *weights,
|
||||
acc_T *biases,
|
||||
inp_T *outputs) {
|
||||
|
||||
// Performance counter variables
|
||||
uint64_t t_fpga;
|
||||
struct timespec start, stop;
|
||||
|
||||
// Derive bitstream file
|
||||
char bitstream[64];
|
||||
char str_batch_size[4];
|
||||
char str_block_out_size[4];
|
||||
char str_block_in_size[4];
|
||||
char str_block_bit_width[4];
|
||||
sprintf(str_batch_size, "%d", BATCH);
|
||||
sprintf(str_block_out_size, "%d", BLOCK_OUT);
|
||||
sprintf(str_block_in_size, "%d", BLOCK_IN);
|
||||
sprintf(str_block_bit_width, "%d", WGT_WIDTH);
|
||||
strcpy(bitstream, "vta.bit");
|
||||
|
||||
#if DEBUG==1
|
||||
printf("INFO - Programming FPGA: %s!\n", bitstream);
|
||||
#endif
|
||||
|
||||
// Program VTA
|
||||
ProgramVTA(bitstream);
|
||||
// Get VTA handles
|
||||
VTAHandle vta_fetch_handle = MapRegister(VTA_FETCH_ADDR, VTA_RANGE);
|
||||
VTAHandle vta_load_handle = MapRegister(VTA_LOAD_ADDR, VTA_RANGE);
|
||||
VTAHandle vta_compute_handle = MapRegister(VTA_COMPUTE_ADDR, VTA_RANGE);
|
||||
VTAHandle vta_store_handle = MapRegister(VTA_STORE_ADDR, VTA_RANGE);
|
||||
|
||||
// Physical address pointers
|
||||
uint32_t insn_phy = insns ? cma_get_phy_addr(insns) : 0;
|
||||
uint32_t uop_phy = uops ? cma_get_phy_addr(uops) : 0;
|
||||
uint32_t weight_phy = weights ? cma_get_phy_addr(weights) : 0;
|
||||
uint32_t input_phy = inputs ? cma_get_phy_addr(inputs) : 0;
|
||||
uint32_t bias_phy = biases ? cma_get_phy_addr(biases) : 0;
|
||||
uint32_t output_phy = outputs ? cma_get_phy_addr(outputs) : 0;
|
||||
|
||||
#if DEBUG==1
|
||||
printf("INFO - Starting FPGA!\n");
|
||||
#endif
|
||||
|
||||
clock_gettime(CLOCK_REALTIME, &start);
|
||||
|
||||
// FETCH @ 0x10 : Data signal of insn_count_V
|
||||
WriteMappedReg(vta_fetch_handle, 0x10, insn_count);
|
||||
// FETCH @ 0x18 : Data signal of insns_V
|
||||
if (insns) WriteMappedReg(vta_fetch_handle, 0x18, insn_phy);
|
||||
// LOAD @ 0x10 : Data signal of weight_V
|
||||
if (weights) WriteMappedReg(vta_load_handle, 0x10, weight_phy);
|
||||
// LOAD @ 0x18 : Data signal of inputs_V
|
||||
if (inputs) WriteMappedReg(vta_load_handle, 0x18, input_phy);
|
||||
// COMPUTE @ 0x20 : Data signal of uops_V
|
||||
if (uops) WriteMappedReg(vta_compute_handle, 0x20, uop_phy);
|
||||
// COMPUTE @ 0x28 : Data signal of biases_V
|
||||
if (biases) WriteMappedReg(vta_compute_handle, 0x28, bias_phy);
|
||||
// STORE @ 0x10 : Data signal of outputs_V
|
||||
if (outputs) WriteMappedReg(vta_store_handle, 0x10, output_phy);
|
||||
|
||||
// VTA start
|
||||
WriteMappedReg(vta_fetch_handle, 0x0, 0x1);
|
||||
WriteMappedReg(vta_load_handle, 0x0, 0x81);
|
||||
WriteMappedReg(vta_compute_handle, 0x0, 0x81);
|
||||
WriteMappedReg(vta_store_handle, 0x0, 0x81);
|
||||
|
||||
int flag = 0, t = 0;
|
||||
for (t = 0; t < 10000000; ++t) {
|
||||
flag = ReadMappedReg(vta_compute_handle, 0x18);
|
||||
if (flag & VTA_DONE) break;
|
||||
}
|
||||
|
||||
if (t==10000000) {
|
||||
printf("\tWARNING: VTA TIMEOUT!!!!\n");
|
||||
}
|
||||
#if DEBUG==1
|
||||
else {
|
||||
printf("INFO - FPGA Finished!\n");
|
||||
}
|
||||
#endif
|
||||
|
||||
clock_gettime(CLOCK_REALTIME, &stop);
|
||||
t_fpga = 1000000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec);
|
||||
|
||||
// Unmap VTA register
|
||||
UnmapRegister(vta_fetch_handle, VTA_RANGE);
|
||||
UnmapRegister(vta_load_handle, VTA_RANGE);
|
||||
UnmapRegister(vta_compute_handle, VTA_RANGE);
|
||||
UnmapRegister(vta_store_handle, VTA_RANGE);
|
||||
|
||||
return t_fpga;
|
||||
};
|
||||
|
||||
int main(void)
|
||||
{
|
||||
|
||||
#if DEBUG==1
|
||||
printParameters();
|
||||
#endif
|
||||
|
||||
int status = 0;
|
||||
|
||||
// Run ALU test (vector-scalar operators)
|
||||
status |= alu_test(ALU_OPCODE_MAX, true, 16, 128, true);
|
||||
status |= alu_test(ALU_OPCODE_MAX, true, 16, 128, false);
|
||||
status |= alu_test(ALU_OPCODE_ADD, true, 16, 128, true);
|
||||
status |= alu_test(ALU_OPCODE_ADD, true, 16, 128, false);
|
||||
status |= alu_test(ALU_OPCODE_SHR, true, 16, 128, true);
|
||||
status |= alu_test(ALU_OPCODE_SHR, true, 16, 128, false);
|
||||
|
||||
// Run ALU test (vector-vector operators)
|
||||
status |= alu_test(ALU_OPCODE_MAX, false, 16, 128, true);
|
||||
status |= alu_test(ALU_OPCODE_MAX, false, 16, 128, false);
|
||||
status |= alu_test(ALU_OPCODE_ADD, false, 16, 128, true);
|
||||
status |= alu_test(ALU_OPCODE_ADD, false, 16, 128, false);
|
||||
|
||||
// Run blocked GEMM test
|
||||
status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, true, 2);
|
||||
status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, false, 2);
|
||||
status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, true, 1);
|
||||
status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, false, 1);
|
||||
|
||||
if (status==0) {
|
||||
printf("\nINFO - Unit tests successful!\n");
|
||||
} else {
|
||||
printf("\nINTO - Unit tests failed!\n");
|
||||
}
|
||||
|
||||
return status;
|
||||
|
||||
}
|
Загрузка…
Ссылка в новой задаче