hardware compilation flow, and driver tests

2018-03-16 11:46:44 -07:00 · 2018-03-16 11:46:44 -07:00 · 470018503f
--- a/vta/README.md
+++ b/vta/README.md
@ -1,2 +1,12 @@
-# vta
 Open Hardware/Software Stack for Vertical Deep Learning System Optimization
+==============================================
+
+[![GitHub license](http://dmlc.github.io/img/apache2.svg)](./LICENSE)
+
+VTA is an open hardware/software co-design stack for deep learning systems.
+It provides a customizable hardware accelerator template for deep learning inference workloads,
+combined with a fully functional compiler stack built with TVM.
+
+License
+-------
+© Contributors, 2018. Licensed under an [Apache-2.0](https://github.com/tmoreau89/vta/blob/master/LICENSE) license.
--- a/vta/docs/.gitignore
+++ b/vta/docs/.gitignore
@ -0,0 +1 @@
+doxygen
--- a/vta/docs/Doxyfile
+++ b/vta/docs/Doxyfile
--- a/vta/hardware/vivado/.gitignore
+++ b/vta/hardware/vivado/.gitignore
@ -0,0 +1 @@
+build
--- a/vta/hardware/vivado/Makefile
+++ b/vta/hardware/vivado/Makefile
@ -0,0 +1,106 @@
+# Source, script and output locations, all anchored at the directory make runs in
+ROOTDIR     := $(CURDIR)
+BUILD_DIR   := $(ROOTDIR)/build
+SCRIPT_DIR  := $(ROOTDIR)/scripts
+SRC_DIR     := $(ROOTDIR)/../../src/hardware/hls
+SIM_DIR     := $(ROOTDIR)/sim
+TEST_DIR    := $(ROOTDIR)/../../src/test
+INCLUDE_DIR := $(ROOTDIR)/../../include
+
+# Xilinx tool commands (bare names, so they are looked up on PATH)
+VIVADO_HLS := vivado_hls
+VIVADO     := vivado
+HSI        := hsi
+
+# Build parameters (override on the command line, e.g. `make CLOCK_FREQ=142`).
+# Sizes and widths are given as log2 values; simple assignment (:=) is used
+# so that the $(shell ...) computation below runs once instead of on every
+# reference.
+#  Number of threads during compilation
+NUM_THREADS := 8
+#  Target Frequency
+CLOCK_FREQ := 100
+#  Log of input width in bits
+LOG_INP_WIDTH := 3
+#  Log of weight width in bits
+LOG_WGT_WIDTH := 3
+#  Log of accum width in bits
+LOG_ACC_WIDTH := 5
+#  Log of output width in bits
+LOG_OUT_WIDTH := $(LOG_INP_WIDTH)
+#  Log of tensor batch size (A in (A,B)x(B,C) matrix multiplication)
+LOG_BATCH := 0
+#  Log of tensor inner block size (B in (A,B)x(B,C) matrix multiplication)
+LOG_IN_BLOCK := 4
+#  Log of tensor outer block size (C in (A,B)x(B,C) matrix multiplication)
+LOG_OUT_BLOCK := 4
+#  Log of uop buffer size in Bytes
+LOG_UOP_BUFF_SIZE := 15
+#  Log of inp buffer size in Bytes
+LOG_INP_BUFF_SIZE := 15
+#  Log of wgt buffer size in Bytes
+LOG_WGT_BUFF_SIZE := 15
+#  Log of acc buffer size in Bytes
+LOG_ACC_BUFF_SIZE := 17
+#  Log of out buffer size in Bytes: the acc buffer size scaled by the
+#  out/acc element width ratio (LOG_ACC_BUFF_SIZE + LOG_OUT_WIDTH - LOG_ACC_WIDTH)
+LOG_OUT_BUFF_SIZE := $(shell echo "$$(( $(LOG_ACC_BUFF_SIZE)+$(LOG_OUT_WIDTH)-$(LOG_ACC_WIDTH) ))" )
+
+# Derived parameters: each value is 1 << LOG_<name>, evaluated once at parse
+# time (simple := assignment avoids re-running the shell on every reference).
+#  Input width in bits
+INP_WIDTH := $(shell echo "$$(( 1 << $(LOG_INP_WIDTH) ))" )
+#  Weight width in bits
+WGT_WIDTH := $(shell echo "$$(( 1 << $(LOG_WGT_WIDTH) ))" )
+#  Output width in bits
+OUT_WIDTH := $(shell echo "$$(( 1 << $(LOG_OUT_WIDTH) ))" )
+#  Tensor batch size
+BATCH := $(shell echo "$$(( 1 << $(LOG_BATCH) ))" )
+#  Tensor inner block size
+IN_BLOCK := $(shell echo "$$(( 1 << $(LOG_IN_BLOCK) ))" )
+#  Tensor outer block size
+OUT_BLOCK := $(shell echo "$$(( 1 << $(LOG_OUT_BLOCK) ))" )
+#  Uop buffer size in Bytes
+UOP_BUFF_SIZE := $(shell echo "$$(( 1 << $(LOG_UOP_BUFF_SIZE) ))" )
+#  Inp buffer size in Bytes
+INP_BUFF_SIZE := $(shell echo "$$(( 1 << $(LOG_INP_BUFF_SIZE) ))" )
+#  Wgt buffer size in Bytes
+WGT_BUFF_SIZE := $(shell echo "$$(( 1 << $(LOG_WGT_BUFF_SIZE) ))" )
+#  Acc buffer size in Bytes
+ACC_BUFF_SIZE := $(shell echo "$$(( 1 << $(LOG_ACC_BUFF_SIZE) ))" )
+#  Out buffer size in Bytes
+OUT_BUFF_SIZE := $(shell echo "$$(( 1 << $(LOG_OUT_BUFF_SIZE) ))" )
+
+# Derive clock target period in ns: 1000 / CLOCK_FREQ (MHz), rounded up
+# with integer arithmetic (e.g. 100 MHz -> 10, 142 MHz -> 8).
+TARGET_PER := $(shell echo "$$(( (1000 + $(CLOCK_FREQ) - 1) / $(CLOCK_FREQ) ))" )
+
+# Derive config name; it encodes the tensor shape, data widths and clock so
+# that builds with different parameters land in separate output directories.
+CONF := $(BATCH)x$(IN_BLOCK)x$(OUT_BLOCK)_$(INP_WIDTH)bx$(WGT_WIDTH)b_$(CLOCK_FREQ)MHz_$(TARGET_PER)ns
+IP_BUILD_PATH := $(BUILD_DIR)/hls/$(CONF)
+HW_BUILD_PATH := $(BUILD_DIR)/vivado/$(CONF)
+
+# None of the targets below names a file, so all of them are phony.
+# The default goal runs the whole chain: all -> driver -> bit -> ip.
+.PHONY: all ip bit driver clean
+
+all: driver
+
+# Run scripts/hls.tcl in Vivado HLS inside $(IP_BUILD_PATH).
+# The values after -tclargs are positional and must follow the argument
+# list documented at the top of hls.tcl; in particular the in block size
+# comes before the out block size.
+ip:
+	mkdir -p $(IP_BUILD_PATH)
+	cd $(IP_BUILD_PATH) && \
+		$(VIVADO_HLS) -f $(SCRIPT_DIR)/hls.tcl \
+			-tclargs $(SRC_DIR) $(SIM_DIR) $(TEST_DIR) $(INCLUDE_DIR) $(TARGET_PER) \
+			$(LOG_INP_WIDTH) $(LOG_WGT_WIDTH) $(LOG_ACC_WIDTH) $(LOG_OUT_WIDTH) \
+			$(LOG_BATCH) $(LOG_IN_BLOCK) $(LOG_OUT_BLOCK) \
+			$(LOG_UOP_BUFF_SIZE) $(LOG_INP_BUFF_SIZE) $(LOG_WGT_BUFF_SIZE) \
+			$(LOG_ACC_BUFF_SIZE) $(LOG_OUT_BUFF_SIZE)
+
+# Run scripts/vivado.tcl in Vivado inside $(HW_BUILD_PATH), assembling the
+# design from the IP cores produced by `ip`. vivado.tcl takes exactly the
+# 12 positional values listed after -tclargs.
+bit: ip
+	mkdir -p $(HW_BUILD_PATH) && cd $(HW_BUILD_PATH) && \
+		$(VIVADO) -mode tcl -source $(SCRIPT_DIR)/vivado.tcl -tclargs \
+			$(IP_BUILD_PATH) $(NUM_THREADS) $(CLOCK_FREQ) \
+			$(INP_WIDTH) $(WGT_WIDTH) $(OUT_WIDTH) \
+			$(BATCH) $(IN_BLOCK) $(OUT_BLOCK) \
+			$(INP_BUFF_SIZE) $(WGT_BUFF_SIZE) $(OUT_BUFF_SIZE)
+
+# Generate the board support package with scripts/hsi.tcl (written to
+# $(HW_BUILD_PATH)/bsp) and build it. The sub-build is invoked through
+# $(MAKE) so that -j/-n and the jobserver propagate to it.
+driver: bit
+	cd $(HW_BUILD_PATH) && $(HSI) -mode tcl -source $(SCRIPT_DIR)/hsi.tcl -nojournal -nolog
+	cd $(HW_BUILD_PATH)/bsp && $(MAKE)
+
+# Remove every generated HLS/Vivado artifact. Uses $(BUILD_DIR) rather than
+# a path relative to the shell's working directory.
+clean:
+	rm -rf $(BUILD_DIR)
--- a/vta/hardware/vivado/scripts/hls.tcl
+++ b/vta/hardware/vivado/scripts/hls.tcl
@ -0,0 +1,177 @@
+#
+#  Copyright (c) 2018 by Contributors
+#  file: hls.tcl
+#  brief: HLS generation script.
+#
+
+# Command line arguments:
+# Arg 1: path to design sources
+# Arg 2: path to sim sources
+# Arg 3: path to test sources
+# Arg 4: path to include sources
+# Arg 5: target clock period
+# Arg 6: input type width (log)
+# Arg 7: weight type width (log)
+# Arg 8: accum type width (log)
+# Arg 9: output type width (log)
+# Arg 10: batch size (log)
+# Arg 11: in block size (log)
+# Arg 12: out block size (log)
+# Arg 13: uop buffer size in B (log)
+# Arg 14: inp buffer size in B (log)
+# Arg 15: wgt buffer size in B (log)
+# Arg 16: acc buffer size in B (log)
+# Arg 17: out buffer size in B (log)
+
+# The 17 documented arguments are read from argv starting at index 2, hence
+# the expected length of 19 (presumably argv also carries "-f hls.tcl" --
+# TODO confirm). Any other argument count selects the built-in defaults.
+if { [llength $argv] == 19 } {
+	lassign [lrange $argv 2 18] \
+		src_dir sim_dir test_dir include_dir target_period \
+		inp_width wgt_width acc_width out_width \
+		batch block_in block_out \
+		uop_buff_size inp_buff_size wgt_buff_size acc_buff_size out_buff_size
+} else {
+	# Default source locations
+	set src_dir "../src/"
+	set sim_dir "../sim/"
+	set test_dir "../../src/test/"
+	set include_dir "../../include"
+	# Default clock period
+	set target_period 10
+	# Default type widths (log)
+	set inp_width 3
+	set wgt_width 3
+	set acc_width 5
+	set out_width 3
+	# Default tensor shape (log)
+	# NOTE(review): the Makefile default is LOG_BATCH = 0 while this
+	# fallback uses 1 -- confirm which one is intended.
+	set batch 1
+	set block_in 4
+	set block_out 4
+	# Default buffer sizes in B (log)
+	set uop_buff_size 15
+	set inp_buff_size 15
+	set wgt_buff_size 15
+	set acc_buff_size 17
+	set out_buff_size 15
+}
+
+# C flags handed to every add_files call below: the two include paths,
+# DEBUG disabled, and one -D define per hardware parameter. All parameter
+# values are forwarded as received, i.e. still in log form.
+set cflags "-I $include_dir -I $include_dir/hardware/hls \
+	-DDEBUG=0 -DLOG_WGT_WIDTH=$wgt_width -DLOG_INP_WIDTH=$inp_width \
+	-DLOG_ACC_WIDTH=$acc_width -DLOG_OUT_WIDTH=$out_width \
+	-DLOG_BATCH=$batch -DLOG_BLOCK_OUT=$block_out -DLOG_BLOCK_IN=$block_in \
+	-DLOG_UOP_BUFF_SIZE=$uop_buff_size -DLOG_INP_BUFF_SIZE=$inp_buff_size \
+	-DLOG_WGT_BUFF_SIZE=$wgt_buff_size -DLOG_ACC_BUFF_SIZE=$acc_buff_size \
+	-DLOG_OUT_BUFF_SIZE=$out_buff_size"
+
+# Sets the target part and clock for the HLS design, then issues the array
+# partition/reshape directives for the inp, wgt and out memories.
+# Memories are split into 1024-bit pieces because of a Vivado restriction
+# that doesn't allow for buses wider than 1024 bits.
+#   per                              target clock period
+#   inp_width, wgt_width, out_width  data type widths (log)
+#   batch, block_in, block_out       tensor dimensions (log)
+proc init_design {per inp_width wgt_width out_width batch block_in block_out} {
+
+	# Device and clock
+	set_part {xc7z020clg484-1}
+	create_clock -period $per -name default
+
+	# Input memory, shared by load and compute.
+	# Partition count is (INP_VECTOR_WIDTH*BATCH/1024).
+	set inp_parts [expr {(1 << ($inp_width + $block_in + $batch)) / 1024}]
+	if {$inp_parts != 0} {
+		# Reshape factor is (1024/INP_VECTOR_WIDTH)
+		set inp_pack [expr {1024 / (1 << ($inp_width + $block_in))}]
+		foreach stage {load compute} {
+			set_directive_array_partition -type block -factor $inp_parts -dim 2 $stage inp_mem
+		}
+		foreach stage {load compute} {
+			set_directive_array_reshape -type block -factor $inp_pack -dim 2 $stage inp_mem
+		}
+	} else {
+		# Narrower than 1024 bits: reshape dimension 2 completely
+		foreach stage {load compute} {
+			set_directive_array_reshape -type complete -dim 2 $stage inp_mem
+		}
+	}
+
+	# Weight memory, shared by load and compute.
+	# Partition count is (WGT_VECTOR_WIDTH*BLOCK_OUT/1024).
+	set wgt_parts [expr {(1 << ($wgt_width + $block_in + $block_out)) / 1024}]
+	if {$wgt_parts != 0} {
+		# Reshape factor is (1024/WGT_VECTOR_WIDTH)
+		set wgt_pack [expr {1024 / (1 << ($wgt_width + $block_in))}]
+		foreach stage {load compute} {
+			set_directive_array_partition -type block -factor $wgt_parts -dim 2 $stage wgt_mem
+		}
+		foreach stage {load compute} {
+			set_directive_array_reshape -type block -factor $wgt_pack -dim 2 $stage wgt_mem
+		}
+	} else {
+		foreach stage {load compute} {
+			set_directive_array_reshape -type complete -dim 2 $stage wgt_mem
+		}
+	}
+
+	# Output memory, shared by compute and store.
+	# Partition count is (OUT_VECTOR_WIDTH*BATCH/1024).
+	set out_parts [expr {(1 << ($out_width + $block_out + $batch)) / 1024}]
+	if {$out_parts != 0} {
+		# Reshape factor is (1024/OUT_VECTOR_WIDTH)
+		set out_pack [expr {1024 / (1 << ($out_width + $block_out))}]
+		foreach stage {compute store} {
+			set_directive_array_partition -type block -factor $out_parts -dim 2 $stage out_mem
+		}
+		foreach stage {compute store} {
+			set_directive_array_reshape -type block -factor $out_pack -dim 2 $stage out_mem
+		}
+	} else {
+		foreach stage {compute store} {
+			set_directive_array_reshape -type complete -dim 2 $stage out_mem
+		}
+	}
+}
+
+# HLS behavioral sim: builds the whole design (top function vta) together
+# with the two testbench sources and runs C simulation on it. This happens
+# before any of the individual stages below is synthesized.
+open_project vta_sim
+set_top vta
+add_files $src_dir/vta.cc -cflags $cflags
+add_files -tb $sim_dir/vta_test.cc -cflags $cflags
+add_files -tb $test_dir/vta_test_lib.cc -cflags $cflags
+open_solution "solution0"
+init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out
+csim_design -clean
+close_project
+
+# Generate the fetch, load, compute and store stages, in that order.
+# Each stage gets its own project (vta_<stage>) built from the same vta.cc
+# with the stage function as top, is synthesized, and is exported in
+# ip_catalog format.
+foreach stage {fetch load compute store} {
+	open_project vta_$stage
+	set_top $stage
+	add_files $src_dir/vta.cc -cflags $cflags
+	open_solution "solution0"
+	init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out
+	csynth_design
+	export_design -format ip_catalog
+	close_project
+}
+
+exit
+
--- a/vta/hardware/vivado/scripts/hsi.tcl
+++ b/vta/hardware/vivado/scripts/hsi.tcl
@ -0,0 +1,11 @@
+#
+#  Copyright (c) 2018 by Contributors
+#  file: hsi.tcl
+#  brief: Driver generation script for ARMv7 driver libraries.
+#
+
+# Open the hardware description; the path is relative to the directory the
+# Makefile `driver` target changes into before launching hsi.
+# NOTE(review): export/vta.hdf is presumably written by vivado.tcl -- confirm.
+open_hw_design export/vta.hdf
+# Create a software design named swdesign for processor ps7_cortexa9_0
+# using the standalone OS.
+create_sw_design swdesign -proc ps7_cortexa9_0 -os standalone
+# Write the board support package into ./bsp; the Makefile then runs make
+# in that directory.
+generate_bsp -dir bsp
+
+exit
--- a/vta/hardware/vivado/scripts/vivado.tcl
+++ b/vta/hardware/vivado/scripts/vivado.tcl
@ -0,0 +1,946 @@
+#
+#  Copyright (c) 2018 by Xilinx, Contributors
+#  file: vivado.tcl
+#  brief: Vivado compilation script. Partially automatically generated
+#   by Vivado.
+#
+
+# Check if script is running in correct Vivado version.
+# The test is a substring match: unless the running version string contains
+# 2017.1, an error message is sent and the script stops with return value 1.
+set scripts_vivado_version 2017.1
+set current_vivado_version [version -short]
+
+if { [string first $scripts_vivado_version $current_vivado_version] == -1 } {
+   puts ""
+   catch {common::send_msg_id "BD_TCL-109" "ERROR" "This script was generated using Vivado \
+    <$scripts_vivado_version> and is being run in <$current_vivado_version> of Vivado. \
+    Please run the script in Vivado <$scripts_vivado_version> then open the design in Vivado \
+    <$current_vivado_version>. Upgrade the design by running \"Tools => Report => Report IP \
+    Status...\", then run write_bd_tcl to create an updated script."}
+
+   return 1
+}
+
+# Parse argument list, derive the clock to utilize.
+# Exactly 12 positional arguments are required, in the order the Makefile
+# `bit` target passes them; otherwise a usage message is printed and the
+# script stops with return value 1.
+# clock_id values line up with the PCW_FPGA<n>_PERIPHERAL_FREQMHZ settings
+# used in create_root_design (0: 100, 1: 142.86, 2: 200, 3: 167); any other
+# frequency falls back to clock 0 (100MHz).
+set clock_id 0
+if { [llength $argv] == 12 } {
+  set ip_path [lindex $argv 0]
+  set num_threads [lindex $argv 1]
+  set clock_freq [lindex $argv 2]
+  set inp_width [lindex $argv 3]
+  set wgt_width [lindex $argv 4]
+  set out_width [lindex $argv 5]
+  set batch [lindex $argv 6]
+  # The Makefile passes the in block size before the out block size.
+  set in_block [lindex $argv 7]
+  set out_block [lindex $argv 8]
+  set inp_mem_size [lindex $argv 9]
+  set wgt_mem_size [lindex $argv 10]
+  set out_mem_size [lindex $argv 11]
+  switch -exact -- $clock_freq {
+    100 {
+      set clock_id 0
+      puts "Setting clock frequency to 100MHz"
+    }
+    142 {
+      set clock_id 1
+      puts "Setting clock frequency to 142MHz"
+    }
+    167 {
+      set clock_id 3
+      puts "Setting clock frequency to 167MHz"
+    }
+    200 {
+      set clock_id 2
+      puts "Setting clock frequency to 200MHz"
+    }
+    default {
+      set clock_id 0
+      puts "Unrecognized clock frequency, setting clock to 100MHz"
+    }
+  }
+} else {
+  puts "Arg list incomplete: <path to ip dir> <num threads> <clock freq> \
+    <inp width> <wgt_width> <out_width> <batch> <in_block> <out_block> \
+    <inp_mem_size> <wgt_mem_size> <out_mem_size>"
+  return 1
+}
+
+# For each memory: total width in bits, depth (size in bytes * 8 / width),
+# and the number of 1024-bit partitions. A memory narrower than 1024 bits
+# gets a single partition whose bus width equals the memory width.
+
+# Input memory
+set inp_mem_width [expr {$inp_width * $batch * $in_block}]
+set inp_mem_depth [expr {$inp_mem_size * 8 / $inp_mem_width}]
+set inp_bus_width 1024
+set inp_part [expr {$inp_mem_width / $inp_bus_width}]
+if {$inp_part == 0} {
+  set inp_bus_width $inp_mem_width
+  set inp_part 1
+}
+
+# Weight memory
+set wgt_mem_width [expr {$wgt_width * $out_block * $in_block}]
+set wgt_mem_depth [expr {$wgt_mem_size * 8 / $wgt_mem_width}]
+set wgt_bus_width 1024
+set wgt_part [expr {$wgt_mem_width / $wgt_bus_width}]
+if {$wgt_part == 0} {
+  set wgt_bus_width $wgt_mem_width
+  set wgt_part 1
+}
+
+# Output memory
+set out_mem_width [expr {$out_width * $batch * $out_block}]
+set out_mem_depth [expr {$out_mem_size * 8 / $out_mem_width}]
+set out_bus_width 1024
+set out_part [expr {$out_mem_width / $out_bus_width}]
+if {$out_part == 0} {
+  set out_bus_width $out_mem_width
+  set out_part 1
+}
+
+# Print the derived values, one per line: width, depth, bus width and
+# partition count for the inp, wgt and out memories, in that order.
+foreach mem {inp wgt out} {
+  foreach field {mem_width mem_depth bus_width part} {
+    puts [set ${mem}_${field}]
+  }
+}
+
+# Project name/location and the local IP repository directory
+set proj_name vta
+set proj_path "."
+set ip_lib "ip_lib"
+
+# Archives exported by the HLS flow, one per pipeline stage
+set fetch_ip "${ip_path}/vta_fetch/solution0/impl/ip/xilinx_com_hls_fetch_1_0.zip"
+set load_ip "${ip_path}/vta_load/solution0/impl/ip/xilinx_com_hls_load_1_0.zip"
+set compute_ip "${ip_path}/vta_compute/solution0/impl/ip/xilinx_com_hls_compute_1_0.zip"
+set store_ip "${ip_path}/vta_store/solution0/impl/ip/xilinx_com_hls_store_1_0.zip"
+
+# Create the project (overwriting an existing one) for the target part
+create_project -force $proj_name $proj_path -part xc7z020clg484-1
+
+# Point the project at the local repository, then add each generated IP to it
+file mkdir $ip_lib
+set_property ip_repo_paths $ip_lib [current_project]
+update_ip_catalog
+foreach ip_zip [list $fetch_ip $load_ip $compute_ip $store_ip] {
+  update_ip_catalog -add_ip $ip_zip -repo_path $ip_lib
+}
+
+# CHANGE DESIGN NAME HERE
+# (the block design is named after the project by default)
+set design_name $proj_name
+
+# Creating design if needed.
+# The branches below either reuse an empty open design, create a new one, or
+# record an error in errMsg/nRet; a non-zero nRet stops the script at the
+# end of this section.
+set errMsg ""
+set nRet 0
+
+set cur_design [current_bd_design -quiet]
+set list_cells [get_bd_cells -quiet]
+
+if { ${design_name} eq "" } {
+   # USE CASES:
+   #    1) Design_name not set
+
+   set errMsg "Please set the variable <design_name> to a non-empty value."
+   set nRet 1
+
+} elseif { ${cur_design} ne "" && ${list_cells} eq "" } {
+   # USE CASES:
+   #    2): Current design opened AND is empty AND names same.
+   #    3): Current design opened AND is empty AND names diff; design_name NOT in project.
+   #    4): Current design opened AND is empty AND names diff; design_name exists in project.
+
+   # An empty open design is reused, taking over its name.
+   if { $cur_design ne $design_name } {
+      common::send_msg_id "BD_TCL-001" "INFO" "Changing value of <design_name> from <$design_name> \
+        to <$cur_design> since current design is empty."
+      set design_name [get_property NAME $cur_design]
+   }
+   common::send_msg_id "BD_TCL-002" "INFO" "Constructing design in IPI design <$cur_design>..."
+
+} elseif { ${cur_design} ne "" && $list_cells ne "" && $cur_design eq $design_name } {
+   # USE CASES:
+   #    5) Current design opened AND has components AND same names.
+
+   set errMsg "Design <$design_name> already exists in your project, please set the variable \
+    <design_name> to another value."
+   set nRet 1
+} elseif { [get_files -quiet ${design_name}.bd] ne "" } {
+   # USE CASES:
+   #    6) Current opened design, has components, but diff names, design_name exists in project.
+   #    7) No opened design, design_name exists in project.
+
+   set errMsg "Design <$design_name> already exists in your project, please set the variable \
+    <design_name> to another value."
+   set nRet 2
+
+} else {
+   # USE CASES:
+   #    8) No opened design, design_name not in project.
+   #    9) Current opened design, has components, but diff names, design_name not in project.
+
+   common::send_msg_id "BD_TCL-003" "INFO" "Currently there is no design <$design_name> in \
+    project, so creating one..."
+
+   create_bd_design $design_name
+
+   common::send_msg_id "BD_TCL-004" "INFO" "Making design <$design_name> as current_bd_design."
+   current_bd_design $design_name
+
+}
+
+common::send_msg_id "BD_TCL-005" "INFO" "Currently the variable <design_name> is equal \
+  to \"$design_name\"."
+
+# Report any error recorded above and stop, returning its code.
+if { $nRet != 0 } {
+   catch {common::send_msg_id "BD_TCL-114" "ERROR" $errMsg}
+   return $nRet
+}
+
+##################################################################
+# DESIGN PROCs
+##################################################################
+
+
+
+# Procedure to create entire design; Provide argument to make
+# procedure reusable. If parentCell is "", will use root.
+proc create_root_design { parentCell clk inp_part wgt_part out_part inp_bus_width inp_mem_depth wgt_bus_width wgt_mem_depth out_bus_width out_mem_depth} {
+
+  variable script_folder
+
+  if { $parentCell eq "" } {
+     set parentCell [get_bd_cells /]
+  }
+
+  # Get object for parentCell
+  set parentObj [get_bd_cells $parentCell]
+  if { $parentObj == "" } {
+     catch {common::send_msg_id "BD_TCL-100" "ERROR" "Unable to find parent cell <$parentCell>!"}
+     return
+  }
+
+  # Make sure parentObj is hier blk
+  set parentType [get_property TYPE $parentObj]
+  if { $parentType ne "hier" } {
+     catch {common::send_msg_id "BD_TCL-101" "ERROR" "Parent <$parentObj> has TYPE = \
+      <$parentType>. Expected to be <hier>."}
+     return
+  }
+
+  # Save current instance; Restore later
+  set oldCurInst [current_bd_instance .]
+
+  # Set parent object as current
+  current_bd_instance $parentObj
+
+
+  # Create interface ports
+  set DDR [ create_bd_intf_port -mode Master -vlnv xilinx.com:interface:ddrx_rtl:1.0 DDR ]
+  set FIXED_IO [ create_bd_intf_port -mode Master \
+    -vlnv xilinx.com:display_processing_system7:fixedio_rtl:1.0 FIXED_IO ]
+
+  # Create ports
+
+  # Create instance: axi_interconnect_1, and set properties
+  set axi_interconnect_1 \
+    [ create_bd_cell -type ip -vlnv xilinx.com:ip:axi_interconnect:2.1 axi_interconnect_1 ]
+  set_property -dict [ list \
+    CONFIG.NUM_MI {5} \
+  ] $axi_interconnect_1
+
+  # Create instance: axi_smc, and set properties
+  set axi_smc [ create_bd_cell -type ip -vlnv xilinx.com:ip:smartconnect:1.0 axi_smc ]
+  set_property -dict [ list \
+    CONFIG.NUM_SI {5} \
+  ] $axi_smc
+
+  # Create instance: axi_timer_1, and set properties
+  set axi_timer_1 [ create_bd_cell -type ip -vlnv xilinx.com:ip:axi_timer:2.0 axi_timer_1 ]
+
+  # Create instance: compute_0, and set properties
+  set compute_0 [ create_bd_cell -type ip -vlnv xilinx.com:hls:compute:1.0 compute_0 ]
+  set_property -dict [ list \
+    CONFIG.C_M_AXI_DATA_PORT_CACHE_VALUE {"1111"} \
+    CONFIG.C_M_AXI_DATA_PORT_DATA_WIDTH {64} \
+    CONFIG.C_M_AXI_UOP_PORT_CACHE_VALUE {"1111"} \
+  ] $compute_0
+
+  # Create instance: fetch_0, and set properties
+  set fetch_0 [ create_bd_cell -type ip -vlnv xilinx.com:hls:fetch:1.0 fetch_0 ]
+  set_property -dict [ list \
+    CONFIG.C_M_AXI_INS_PORT_CACHE_VALUE {"1111"} \
+    CONFIG.C_M_AXI_INS_PORT_DATA_WIDTH {64} \
+  ] $fetch_0
+
+  # Create instance: g2l_queue, and set properties
+  set g2l_queue [ create_bd_cell -type ip -vlnv xilinx.com:ip:fifo_generator:13.1 g2l_queue ]
+  set_property -dict [ list \
+    CONFIG.Empty_Threshold_Assert_Value_axis {1022} \
+    CONFIG.Empty_Threshold_Assert_Value_rach {14} \
+    CONFIG.Empty_Threshold_Assert_Value_wach {14} \
+    CONFIG.Empty_Threshold_Assert_Value_wrch {14} \
+    CONFIG.FIFO_Implementation_rach {Common_Clock_Distributed_RAM} \
+    CONFIG.FIFO_Implementation_wach {Common_Clock_Distributed_RAM} \
+    CONFIG.FIFO_Implementation_wrch {Common_Clock_Distributed_RAM} \
+    CONFIG.Full_Flags_Reset_Value {1} \
+    CONFIG.Full_Threshold_Assert_Value_axis {1023} \
+    CONFIG.Full_Threshold_Assert_Value_rach {15} \
+    CONFIG.Full_Threshold_Assert_Value_wach {15} \
+    CONFIG.Full_Threshold_Assert_Value_wrch {15} \
+    CONFIG.INTERFACE_TYPE {AXI_STREAM} \
+    CONFIG.Input_Depth_axis {1024} \
+    CONFIG.Reset_Type {Asynchronous_Reset} \
+    CONFIG.TUSER_WIDTH {0} \
+  ] $g2l_queue
+
+  # Create instance: g2s_queue, and set properties
+  set g2s_queue [ create_bd_cell -type ip -vlnv xilinx.com:ip:fifo_generator:13.1 g2s_queue ]
+  set_property -dict [ list \
+    CONFIG.Empty_Threshold_Assert_Value_axis {1022} \
+    CONFIG.Empty_Threshold_Assert_Value_rach {14} \
+    CONFIG.Empty_Threshold_Assert_Value_wach {14} \
+    CONFIG.Empty_Threshold_Assert_Value_wrch {14} \
+    CONFIG.FIFO_Implementation_rach {Common_Clock_Distributed_RAM} \
+    CONFIG.FIFO_Implementation_wach {Common_Clock_Distributed_RAM} \
+    CONFIG.FIFO_Implementation_wrch {Common_Clock_Distributed_RAM} \
+    CONFIG.Full_Flags_Reset_Value {1} \
+    CONFIG.Full_Threshold_Assert_Value_axis {1023} \
+    CONFIG.Full_Threshold_Assert_Value_rach {15} \
+    CONFIG.Full_Threshold_Assert_Value_wach {15} \
+    CONFIG.Full_Threshold_Assert_Value_wrch {15} \
+    CONFIG.INTERFACE_TYPE {AXI_STREAM} \
+    CONFIG.Input_Depth_axis {1024} \
+    CONFIG.Reset_Type {Asynchronous_Reset} \
+    CONFIG.TUSER_WIDTH {0} \
+  ] $g2s_queue
+
+  # Create instance: gemm_queue, and set properties
+  set gemm_queue [ create_bd_cell -type ip -vlnv xilinx.com:ip:fifo_generator:13.1 gemm_queue ]
+  set_property -dict [ list \
+    CONFIG.Empty_Threshold_Assert_Value_axis {510} \
+    CONFIG.Empty_Threshold_Assert_Value_rach {14} \
+    CONFIG.Empty_Threshold_Assert_Value_wach {14} \
+    CONFIG.Empty_Threshold_Assert_Value_wrch {14} \
+    CONFIG.FIFO_Implementation_rach {Common_Clock_Distributed_RAM} \
+    CONFIG.FIFO_Implementation_wach {Common_Clock_Distributed_RAM} \
+    CONFIG.FIFO_Implementation_wrch {Common_Clock_Distributed_RAM} \
+    CONFIG.Full_Flags_Reset_Value {1} \
+    CONFIG.Full_Threshold_Assert_Value_axis {511} \
+    CONFIG.Full_Threshold_Assert_Value_rach {15} \
+    CONFIG.Full_Threshold_Assert_Value_wach {15} \
+    CONFIG.Full_Threshold_Assert_Value_wrch {15} \
+    CONFIG.INTERFACE_TYPE {AXI_STREAM} \
+    CONFIG.Input_Depth_axis {512} \
+    CONFIG.Reset_Type {Asynchronous_Reset} \
+    CONFIG.TDATA_NUM_BYTES {16} \
+    CONFIG.TKEEP_WIDTH {16} \
+    CONFIG.TSTRB_WIDTH {16} \
+    CONFIG.TUSER_WIDTH {0} \
+  ] $gemm_queue
+
+  # Create instance: l2g_queue, and set properties
+  set l2g_queue [ create_bd_cell -type ip -vlnv xilinx.com:ip:fifo_generator:13.1 l2g_queue ]
+  set_property -dict [ list \
+    CONFIG.Empty_Threshold_Assert_Value_axis {1022} \
+    CONFIG.Empty_Threshold_Assert_Value_rach {14} \
+    CONFIG.Empty_Threshold_Assert_Value_wach {14} \
+    CONFIG.Empty_Threshold_Assert_Value_wrch {14} \
+    CONFIG.FIFO_Implementation_rach {Common_Clock_Distributed_RAM} \
+    CONFIG.FIFO_Implementation_wach {Common_Clock_Distributed_RAM} \
+    CONFIG.FIFO_Implementation_wrch {Common_Clock_Distributed_RAM} \
+    CONFIG.Full_Flags_Reset_Value {1} \
+    CONFIG.Full_Threshold_Assert_Value_axis {1023} \
+    CONFIG.Full_Threshold_Assert_Value_rach {15} \
+    CONFIG.Full_Threshold_Assert_Value_wach {15} \
+    CONFIG.Full_Threshold_Assert_Value_wrch {15} \
+    CONFIG.INTERFACE_TYPE {AXI_STREAM} \
+    CONFIG.Input_Depth_axis {1024} \
+    CONFIG.Reset_Type {Asynchronous_Reset} \
+    CONFIG.TUSER_WIDTH {0} \
+  ] $l2g_queue
+
+  # Create instance: load_0, and set properties
+  set load_0 [ create_bd_cell -type ip -vlnv xilinx.com:hls:load:1.0 load_0 ]
+  set_property -dict [ list \
+    CONFIG.C_M_AXI_DATA_PORT_CACHE_VALUE {"1111"} \
+  ] $load_0
+
+  # Create instance: load_queue, and set properties
+  set load_queue [ create_bd_cell -type ip -vlnv xilinx.com:ip:fifo_generator:13.1 load_queue ]
+  set_property -dict [ list \
+    CONFIG.Empty_Threshold_Assert_Value_axis {510} \
+    CONFIG.Empty_Threshold_Assert_Value_rach {14} \
+    CONFIG.Empty_Threshold_Assert_Value_wach {14} \
+    CONFIG.Empty_Threshold_Assert_Value_wrch {14} \
+    CONFIG.FIFO_Implementation_rach {Common_Clock_Distributed_RAM} \
+    CONFIG.FIFO_Implementation_wach {Common_Clock_Distributed_RAM} \
+    CONFIG.FIFO_Implementation_wrch {Common_Clock_Distributed_RAM} \
+    CONFIG.Full_Flags_Reset_Value {1} \
+    CONFIG.Full_Threshold_Assert_Value_axis {511} \
+    CONFIG.Full_Threshold_Assert_Value_rach {15} \
+    CONFIG.Full_Threshold_Assert_Value_wach {15} \
+    CONFIG.Full_Threshold_Assert_Value_wrch {15} \
+    CONFIG.INTERFACE_TYPE {AXI_STREAM} \
+    CONFIG.Input_Depth_axis {512} \
+    CONFIG.Reset_Type {Asynchronous_Reset} \
+    CONFIG.TDATA_NUM_BYTES {16} \
+    CONFIG.TKEEP_WIDTH {16} \
+    CONFIG.TSTRB_WIDTH {16} \
+    CONFIG.TUSER_WIDTH {0} \
+  ] $load_queue
+
+  # Create instance: proc_sys_reset, and set properties
+  set proc_sys_reset \
+    [ create_bd_cell -type ip -vlnv xilinx.com:ip:proc_sys_reset:5.0 proc_sys_reset ]
+
+  # Create instance: processing_system7_1, and set properties
+  set processing_system7_1 \
+    [ create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 processing_system7_1 ]
+  set_property -dict [ list \
+    CONFIG.PCW_CAN0_PERIPHERAL_ENABLE {0} \
+    CONFIG.PCW_ENET0_PERIPHERAL_ENABLE {0} \
+    CONFIG.PCW_EN_CLK0_PORT {1} \
+    CONFIG.PCW_EN_CLK1_PORT {1} \
+    CONFIG.PCW_EN_CLK2_PORT {1} \
+    CONFIG.PCW_EN_CLK3_PORT {1} \
+    CONFIG.PCW_FPGA0_PERIPHERAL_FREQMHZ {100} \
+    CONFIG.PCW_FPGA1_PERIPHERAL_FREQMHZ {142.86} \
+    CONFIG.PCW_FPGA2_PERIPHERAL_FREQMHZ {200} \
+    CONFIG.PCW_FPGA3_PERIPHERAL_FREQMHZ {167} \
+    CONFIG.PCW_GPIO_MIO_GPIO_ENABLE {0} \
+    CONFIG.PCW_I2C0_PERIPHERAL_ENABLE {0} \
+    CONFIG.PCW_IMPORT_BOARD_PRESET {None} \
+    CONFIG.PCW_IRQ_F2P_INTR {1} \
+    CONFIG.PCW_QSPI_GRP_SINGLE_SS_ENABLE {0} \
+    CONFIG.PCW_QSPI_PERIPHERAL_ENABLE {0} \
+    CONFIG.PCW_SD0_PERIPHERAL_ENABLE {0} \
+    CONFIG.PCW_USB0_PERIPHERAL_ENABLE {0} \
+    CONFIG.PCW_USE_DEFAULT_ACP_USER_VAL {1} \
+    CONFIG.PCW_USE_FABRIC_INTERRUPT {1} \
+    CONFIG.PCW_USE_HIGH_OCM {1} \
+    CONFIG.PCW_USE_S_AXI_ACP {1} \
+    CONFIG.PCW_USE_S_AXI_HP0 {0} \
+    CONFIG.PCW_USE_S_AXI_HP1 {0} \
+    CONFIG.PCW_USE_S_AXI_HP2 {0} \
+    CONFIG.PCW_USE_S_AXI_HP3 {0} \
+    CONFIG.preset {ZC702} \
+  ] $processing_system7_1
+
+  # Create instance: s2g_queue, and set properties
+  set s2g_queue [ create_bd_cell -type ip -vlnv xilinx.com:ip:fifo_generator:13.1 s2g_queue ]
+  set_property -dict [ list \
+    CONFIG.Empty_Threshold_Assert_Value_axis {1022} \
+    CONFIG.Empty_Threshold_Assert_Value_rach {14} \
+    CONFIG.Empty_Threshold_Assert_Value_wach {14} \
+    CONFIG.Empty_Threshold_Assert_Value_wrch {14} \
+    CONFIG.FIFO_Implementation_rach {Common_Clock_Distributed_RAM} \
+    CONFIG.FIFO_Implementation_wach {Common_Clock_Distributed_RAM} \
+    CONFIG.FIFO_Implementation_wrch {Common_Clock_Distributed_RAM} \
+    CONFIG.Full_Flags_Reset_Value {1} \
+    CONFIG.Full_Threshold_Assert_Value_axis {1023} \
+    CONFIG.Full_Threshold_Assert_Value_rach {15} \
+    CONFIG.Full_Threshold_Assert_Value_wach {15} \
+    CONFIG.Full_Threshold_Assert_Value_wrch {15} \
+    CONFIG.INTERFACE_TYPE {AXI_STREAM} \
+    CONFIG.Input_Depth_axis {1024} \
+    CONFIG.Reset_Type {Asynchronous_Reset} \
+    CONFIG.TUSER_WIDTH {0} \
+  ] $s2g_queue
+
+  # Create instance: store_0, and set properties
+  set store_0 [ create_bd_cell -type ip -vlnv xilinx.com:hls:store:1.0 store_0 ]
+  set_property -dict [ list \
+CONFIG.C_M_AXI_DATA_PORT_CACHE_VALUE {"1111"} \
+  ] $store_0
+
+  # Create instance: store_queue, and set properties
+  set store_queue [ create_bd_cell -type ip -vlnv xilinx.com:ip:fifo_generator:13.1 store_queue ]
+  set_property -dict [ list \
+    CONFIG.Empty_Threshold_Assert_Value_axis {510} \
+    CONFIG.Empty_Threshold_Assert_Value_rach {14} \
+    CONFIG.Empty_Threshold_Assert_Value_wach {14} \
+    CONFIG.Empty_Threshold_Assert_Value_wrch {14} \
+    CONFIG.FIFO_Implementation_rach {Common_Clock_Distributed_RAM} \
+    CONFIG.FIFO_Implementation_wach {Common_Clock_Distributed_RAM} \
+    CONFIG.FIFO_Implementation_wrch {Common_Clock_Distributed_RAM} \
+    CONFIG.Full_Flags_Reset_Value {1} \
+    CONFIG.Full_Threshold_Assert_Value_axis {511} \
+    CONFIG.Full_Threshold_Assert_Value_rach {15} \
+    CONFIG.Full_Threshold_Assert_Value_wach {15} \
+    CONFIG.Full_Threshold_Assert_Value_wrch {15} \
+    CONFIG.INTERFACE_TYPE {AXI_STREAM} \
+    CONFIG.Input_Depth_axis {512} \
+    CONFIG.Reset_Type {Asynchronous_Reset} \
+    CONFIG.TDATA_NUM_BYTES {16} \
+    CONFIG.TKEEP_WIDTH {16} \
+    CONFIG.TSTRB_WIDTH {16} \
+    CONFIG.TUSER_WIDTH {0} \
+  ] $store_queue
+
+  # Create instance: xlconcat_1, and set properties
+  set xlconcat_1 [ create_bd_cell -type ip -vlnv xilinx.com:ip:xlconcat:2.1 xlconcat_1 ]
+  set_property -dict [ list \
+CONFIG.NUM_PORTS {5} \
+  ] $xlconcat_1
+
+  # Create and connect inp_mem partitions
+  if {${inp_part} > 1} {
+    for {set i 0} {$i < ${inp_part}} {incr i} {
+      # Create instance: inp_mem, and set properties
+      set inp_mem [ create_bd_cell -type ip -vlnv xilinx.com:ip:blk_mem_gen:8.3 inp_mem_${i} ]
+      set_property -dict [ list \
+        CONFIG.Byte_Size {8} \
+        CONFIG.Enable_32bit_Address {true} \
+        CONFIG.Enable_B {Use_ENB_Pin} \
+        CONFIG.Memory_Type {True_Dual_Port_RAM} \
+        CONFIG.Read_Width_A $inp_bus_width \
+        CONFIG.Read_Width_B $inp_bus_width \
+        CONFIG.Register_PortA_Output_of_Memory_Primitives {false} \
+        CONFIG.Register_PortB_Output_of_Memory_Primitives {false} \
+        CONFIG.Use_Byte_Write_Enable {true} \
+        CONFIG.Use_RSTA_Pin {true} \
+        CONFIG.Use_RSTB_Pin {true} \
+        CONFIG.Write_Depth_A $inp_mem_depth \
+        CONFIG.Write_Width_A $inp_bus_width \
+        CONFIG.Write_Width_B $inp_bus_width \
+        CONFIG.use_bram_block {BRAM_Controller} \
+      ] $inp_mem
+      # Create interface connections
+      connect_bd_intf_net -intf_net load_0_inp_mem_${i}_V_PORTA \
+        [get_bd_intf_pins $inp_mem/BRAM_PORTA] \
+        [get_bd_intf_pins load_0/inp_mem_${i}_V_PORTA]
+      connect_bd_intf_net -intf_net compute_0_inp_mem_${i}_V_PORTA \
+        [get_bd_intf_pins compute_0/inp_mem_${i}_V_PORTA] \
+        [get_bd_intf_pins $inp_mem/BRAM_PORTB]
+    }
+  } else {
+      # Create instance: inp_mem, and set properties
+      set inp_mem [ create_bd_cell -type ip -vlnv xilinx.com:ip:blk_mem_gen:8.3 inp_mem ]
+      set_property -dict [ list \
+        CONFIG.Byte_Size {8} \
+        CONFIG.Enable_32bit_Address {true} \
+        CONFIG.Enable_B {Use_ENB_Pin} \
+        CONFIG.Memory_Type {True_Dual_Port_RAM} \
+        CONFIG.Read_Width_A $inp_bus_width \
+        CONFIG.Read_Width_B $inp_bus_width \
+        CONFIG.Register_PortA_Output_of_Memory_Primitives {false} \
+        CONFIG.Register_PortB_Output_of_Memory_Primitives {false} \
+        CONFIG.Use_Byte_Write_Enable {true} \
+        CONFIG.Use_RSTA_Pin {true} \
+        CONFIG.Use_RSTB_Pin {true} \
+        CONFIG.Write_Depth_A $inp_mem_depth \
+        CONFIG.Write_Width_A $inp_bus_width \
+        CONFIG.Write_Width_B $inp_bus_width \
+        CONFIG.use_bram_block {BRAM_Controller} \
+      ] $inp_mem
+      # Create interface connections
+      connect_bd_intf_net -intf_net load_0_inp_mem_V_PORTA \
+        [get_bd_intf_pins $inp_mem/BRAM_PORTA] \
+        [get_bd_intf_pins load_0/inp_mem_V_PORTA]
+      connect_bd_intf_net -intf_net compute_0_inp_mem_V_PORTA \
+        [get_bd_intf_pins compute_0/inp_mem_V_PORTA] \
+        [get_bd_intf_pins $inp_mem/BRAM_PORTB]
+  }
+
+  # Create and connect wgt_mem partitions
+  if {${wgt_part} > 1} {
+    for {set i 0} {$i < ${wgt_part}} {incr i} {
+      # Create instance: wgt_mem, and set properties
+      set wgt_mem [ create_bd_cell -type ip -vlnv xilinx.com:ip:blk_mem_gen:8.3 wgt_mem_${i} ]
+      set_property -dict [ list \
+        CONFIG.Assume_Synchronous_Clk {true} \
+        CONFIG.Byte_Size {8} \
+        CONFIG.Enable_32bit_Address {true} \
+        CONFIG.Enable_B {Use_ENB_Pin} \
+        CONFIG.Memory_Type {True_Dual_Port_RAM} \
+        CONFIG.Read_Width_A $wgt_bus_width \
+        CONFIG.Read_Width_B $wgt_bus_width \
+        CONFIG.Register_PortA_Output_of_Memory_Primitives {false} \
+        CONFIG.Register_PortB_Output_of_Memory_Primitives {false} \
+        CONFIG.Use_Byte_Write_Enable {true} \
+        CONFIG.Use_RSTA_Pin {true} \
+        CONFIG.Use_RSTB_Pin {true} \
+        CONFIG.Write_Depth_A $wgt_mem_depth \
+        CONFIG.Write_Width_A $wgt_bus_width \
+        CONFIG.Write_Width_B $wgt_bus_width \
+      ] $wgt_mem
+      # Create interface connections
+      connect_bd_intf_net -intf_net load_0_wgt_mem_${i}_V_PORTA \
+        [get_bd_intf_pins load_0/wgt_mem_${i}_V_PORTA] \
+        [get_bd_intf_pins $wgt_mem/BRAM_PORTA]
+      connect_bd_intf_net -intf_net compute_0_wgt_mem_${i}_V_PORTA \
+        [get_bd_intf_pins compute_0/wgt_mem_${i}_V_PORTA] \
+        [get_bd_intf_pins $wgt_mem/BRAM_PORTB]
+    }
+  } else {
+      # Create instance: wgt_mem, and set properties
+      set wgt_mem [ create_bd_cell -type ip -vlnv xilinx.com:ip:blk_mem_gen:8.3 wgt_mem ]
+      set_property -dict [ list \
+        CONFIG.Assume_Synchronous_Clk {true} \
+        CONFIG.Byte_Size {8} \
+        CONFIG.Enable_32bit_Address {true} \
+        CONFIG.Enable_B {Use_ENB_Pin} \
+        CONFIG.Memory_Type {True_Dual_Port_RAM} \
+        CONFIG.Read_Width_A $wgt_bus_width \
+        CONFIG.Read_Width_B $wgt_bus_width \
+        CONFIG.Register_PortA_Output_of_Memory_Primitives {false} \
+        CONFIG.Register_PortB_Output_of_Memory_Primitives {false} \
+        CONFIG.Use_Byte_Write_Enable {true} \
+        CONFIG.Use_RSTA_Pin {true} \
+        CONFIG.Use_RSTB_Pin {true} \
+        CONFIG.Write_Depth_A $wgt_mem_depth \
+        CONFIG.Write_Width_A $wgt_bus_width \
+        CONFIG.Write_Width_B $wgt_bus_width \
+      ] $wgt_mem
+      # Create interface connections
+      connect_bd_intf_net -intf_net load_0_wgt_mem_V_PORTA \
+        [get_bd_intf_pins load_0/wgt_mem_V_PORTA] \
+        [get_bd_intf_pins $wgt_mem/BRAM_PORTA]
+      connect_bd_intf_net -intf_net compute_0_wgt_mem_V_PORTA \
+        [get_bd_intf_pins compute_0/wgt_mem_V_PORTA] \
+        [get_bd_intf_pins $wgt_mem/BRAM_PORTB]
+  }
+
+  # Create and connect out_mem partitions
+  if {${out_part} > 1} {
+    for {set i 0} {$i < ${out_part}} {incr i} {
+      # Create instance: out_mem, and set properties
+      set out_mem [ create_bd_cell -type ip -vlnv xilinx.com:ip:blk_mem_gen:8.3 out_mem_${i} ]
+      set_property -dict [ list \
+        CONFIG.Byte_Size {8} \
+        CONFIG.Enable_32bit_Address {true} \
+        CONFIG.Enable_B {Use_ENB_Pin} \
+        CONFIG.Memory_Type {True_Dual_Port_RAM} \
+        CONFIG.Read_Width_A $out_bus_width \
+        CONFIG.Read_Width_B $out_bus_width \
+        CONFIG.Register_PortA_Output_of_Memory_Primitives {false} \
+        CONFIG.Register_PortB_Output_of_Memory_Primitives {false} \
+        CONFIG.Use_Byte_Write_Enable {true} \
+        CONFIG.Use_RSTA_Pin {true} \
+        CONFIG.Use_RSTB_Pin {true} \
+        CONFIG.Write_Depth_A $out_mem_depth \
+        CONFIG.Write_Width_A $out_bus_width \
+        CONFIG.Write_Width_B $out_bus_width \
+        CONFIG.use_bram_block {BRAM_Controller} \
+      ] $out_mem
+      # Create interface connections
+      connect_bd_intf_net -intf_net compute_0_out_mem_${i}_V_PORTA \
+        [get_bd_intf_pins compute_0/out_mem_${i}_V_PORTA] \
+        [get_bd_intf_pins $out_mem/BRAM_PORTA]
+      connect_bd_intf_net -intf_net store_0_out_mem_${i}_V_PORTA \
+        [get_bd_intf_pins $out_mem/BRAM_PORTB] \
+        [get_bd_intf_pins store_0/out_mem_${i}_V_PORTA]
+    }
+  } else {
+      # Create instance: out_mem, and set properties
+      set out_mem [ create_bd_cell -type ip -vlnv xilinx.com:ip:blk_mem_gen:8.3 out_mem ]
+      set_property -dict [ list \
+        CONFIG.Byte_Size {8} \
+        CONFIG.Enable_32bit_Address {true} \
+        CONFIG.Enable_B {Use_ENB_Pin} \
+        CONFIG.Memory_Type {True_Dual_Port_RAM} \
+        CONFIG.Read_Width_A $out_bus_width \
+        CONFIG.Read_Width_B $out_bus_width \
+        CONFIG.Register_PortA_Output_of_Memory_Primitives {false} \
+        CONFIG.Register_PortB_Output_of_Memory_Primitives {false} \
+        CONFIG.Use_Byte_Write_Enable {true} \
+        CONFIG.Use_RSTA_Pin {true} \
+        CONFIG.Use_RSTB_Pin {true} \
+        CONFIG.Write_Depth_A $out_mem_depth \
+        CONFIG.Write_Width_A $out_bus_width \
+        CONFIG.Write_Width_B $out_bus_width \
+        CONFIG.use_bram_block {BRAM_Controller} \
+      ] $out_mem
+      # Create interface connections
+      connect_bd_intf_net -intf_net compute_0_out_mem_V_PORTA \
+        [get_bd_intf_pins compute_0/out_mem_V_PORTA] \
+        [get_bd_intf_pins $out_mem/BRAM_PORTA]
+      connect_bd_intf_net -intf_net store_0_out_mem_V_PORTA \
+        [get_bd_intf_pins $out_mem/BRAM_PORTB] \
+        [get_bd_intf_pins store_0/out_mem_V_PORTA]
+  }
+
+  # Create interface connections
+  connect_bd_intf_net -intf_net axi_interconnect_1_M01_AXI \
+    [get_bd_intf_pins axi_interconnect_1/M01_AXI] \
+    [get_bd_intf_pins fetch_0/s_axi_CONTROL_BUS]
+  connect_bd_intf_net -intf_net axi_interconnect_1_M02_AXI \
+    [get_bd_intf_pins axi_interconnect_1/M02_AXI] \
+    [get_bd_intf_pins load_0/s_axi_CONTROL_BUS]
+  connect_bd_intf_net -intf_net axi_interconnect_1_M03_AXI \
+    [get_bd_intf_pins axi_interconnect_1/M03_AXI] \
+    [get_bd_intf_pins compute_0/s_axi_CONTROL_BUS]
+  connect_bd_intf_net -intf_net axi_interconnect_1_M04_AXI \
+    [get_bd_intf_pins axi_interconnect_1/M04_AXI] \
+    [get_bd_intf_pins store_0/s_axi_CONTROL_BUS]
+  connect_bd_intf_net -intf_net axi_smc_M00_AXI \
+    [get_bd_intf_pins axi_smc/M00_AXI] \
+    [get_bd_intf_pins processing_system7_1/S_AXI_ACP]
+  connect_bd_intf_net -intf_net compute_0_g2l_dep_queue_V \
+    [get_bd_intf_pins compute_0/g2l_dep_queue_V] \
+    [get_bd_intf_pins g2l_queue/S_AXIS]
+  connect_bd_intf_net -intf_net compute_0_g2s_dep_queue_V \
+    [get_bd_intf_pins compute_0/g2s_dep_queue_V] \
+    [get_bd_intf_pins g2s_queue/S_AXIS]
+  connect_bd_intf_net -intf_net compute_0_m_axi_data_port \
+    [get_bd_intf_pins axi_smc/S02_AXI] \
+    [get_bd_intf_pins compute_0/m_axi_data_port]
+  connect_bd_intf_net -intf_net compute_0_m_axi_uop_port \
+    [get_bd_intf_pins axi_smc/S01_AXI] \
+    [get_bd_intf_pins compute_0/m_axi_uop_port]
+  connect_bd_intf_net -intf_net fetch_0_gemm_queue_V_V \
+    [get_bd_intf_pins fetch_0/gemm_queue_V_V] \
+    [get_bd_intf_pins gemm_queue/S_AXIS]
+  connect_bd_intf_net -intf_net fetch_0_l2g_dep_queue_V \
+    [get_bd_intf_pins l2g_queue/S_AXIS] \
+    [get_bd_intf_pins load_0/l2g_dep_queue_V]
+  connect_bd_intf_net -intf_net fetch_0_load_queue_V_V \
+    [get_bd_intf_pins fetch_0/load_queue_V_V] \
+    [get_bd_intf_pins load_queue/S_AXIS]
+  connect_bd_intf_net -intf_net fetch_0_m_axi_ins_port \
+    [get_bd_intf_pins axi_smc/S00_AXI] \
+    [get_bd_intf_pins fetch_0/m_axi_ins_port]
+  connect_bd_intf_net -intf_net fetch_0_store_queue_V_V \
+    [get_bd_intf_pins fetch_0/store_queue_V_V] \
+    [get_bd_intf_pins store_queue/S_AXIS]
+  connect_bd_intf_net -intf_net g2l_queue_M_AXIS \
+    [get_bd_intf_pins g2l_queue/M_AXIS] \
+    [get_bd_intf_pins load_0/g2l_dep_queue_V]
+  connect_bd_intf_net -intf_net g2s_queue_M_AXIS \
+    [get_bd_intf_pins g2s_queue/M_AXIS] \
+    [get_bd_intf_pins store_0/g2s_dep_queue_V]
+  connect_bd_intf_net -intf_net gemm_queue_M_AXIS \
+    [get_bd_intf_pins compute_0/gemm_queue_V_V] \
+    [get_bd_intf_pins gemm_queue/M_AXIS]
+  connect_bd_intf_net -intf_net l2g_queue_M_AXIS \
+    [get_bd_intf_pins compute_0/l2g_dep_queue_V] \
+    [get_bd_intf_pins l2g_queue/M_AXIS]
+  connect_bd_intf_net -intf_net load_0_m_axi_data_port \
+    [get_bd_intf_pins axi_smc/S03_AXI] \
+    [get_bd_intf_pins load_0/m_axi_data_port]
+  connect_bd_intf_net -intf_net load_queue_M_AXIS \
+    [get_bd_intf_pins load_0/load_queue_V_V] \
+    [get_bd_intf_pins load_queue/M_AXIS]
+  connect_bd_intf_net -intf_net processing_system7_1_axi_periph_m00_axi \
+    [get_bd_intf_pins axi_interconnect_1/M00_AXI] \
+    [get_bd_intf_pins axi_timer_1/S_AXI]
+  connect_bd_intf_net -intf_net processing_system7_1_ddr \
+    [get_bd_intf_ports DDR] \
+    [get_bd_intf_pins processing_system7_1/DDR]
+  connect_bd_intf_net -intf_net processing_system7_1_fixed_io \
+    [get_bd_intf_ports FIXED_IO] \
+    [get_bd_intf_pins processing_system7_1/FIXED_IO]
+  connect_bd_intf_net -intf_net processing_system7_1_m_axi_gp0 \
+    [get_bd_intf_pins axi_interconnect_1/S00_AXI] \
+    [get_bd_intf_pins processing_system7_1/M_AXI_GP0]
+  connect_bd_intf_net -intf_net s2g_queue_M_AXIS \
+    [get_bd_intf_pins compute_0/s2g_dep_queue_V] \
+    [get_bd_intf_pins s2g_queue/M_AXIS]
+  connect_bd_intf_net -intf_net store_0_m_axi_data_port \
+    [get_bd_intf_pins axi_smc/S04_AXI] \
+    [get_bd_intf_pins store_0/m_axi_data_port]
+  connect_bd_intf_net -intf_net store_0_s2g_dep_queue_V \
+    [get_bd_intf_pins s2g_queue/S_AXIS] \
+    [get_bd_intf_pins store_0/s2g_dep_queue_V]
+  connect_bd_intf_net -intf_net store_queue_M_AXIS \
+    [get_bd_intf_pins store_0/store_queue_V_V] \
+    [get_bd_intf_pins store_queue/M_AXIS]
+
+  # Create port connections
+  connect_bd_net -net axi_timer_1_interrupt \
+    [get_bd_pins axi_timer_1/interrupt] \
+    [get_bd_pins xlconcat_1/In0]
+  connect_bd_net -net compute_0_interrupt \
+    [get_bd_pins compute_0/interrupt] \
+    [get_bd_pins xlconcat_1/In3]
+  connect_bd_net -net fetch_0_interrupt \
+    [get_bd_pins fetch_0/interrupt] \
+    [get_bd_pins xlconcat_1/In1]
+  connect_bd_net -net load_0_interrupt \
+    [get_bd_pins load_0/interrupt] \
+    [get_bd_pins xlconcat_1/In2]
+  connect_bd_net -net proc_sys_reset_interconnect_aresetn \
+    [get_bd_pins axi_interconnect_1/ARESETN] \
+    [get_bd_pins proc_sys_reset/interconnect_aresetn]
+  connect_bd_net -net proc_sys_reset_peripheral_aresetn \
+    [get_bd_pins axi_interconnect_1/M00_ARESETN] \
+    [get_bd_pins axi_interconnect_1/M01_ARESETN] \
+    [get_bd_pins axi_interconnect_1/M02_ARESETN] \
+    [get_bd_pins axi_interconnect_1/M03_ARESETN] \
+    [get_bd_pins axi_interconnect_1/M04_ARESETN] \
+    [get_bd_pins axi_interconnect_1/S00_ARESETN] \
+    [get_bd_pins axi_smc/aresetn] \
+    [get_bd_pins axi_timer_1/s_axi_aresetn] \
+    [get_bd_pins compute_0/ap_rst_n] \
+    [get_bd_pins fetch_0/ap_rst_n] \
+    [get_bd_pins g2l_queue/s_aresetn] \
+    [get_bd_pins g2s_queue/s_aresetn] \
+    [get_bd_pins gemm_queue/s_aresetn] \
+    [get_bd_pins l2g_queue/s_aresetn] \
+    [get_bd_pins load_0/ap_rst_n] \
+    [get_bd_pins load_queue/s_aresetn] \
+    [get_bd_pins proc_sys_reset/peripheral_aresetn] \
+    [get_bd_pins s2g_queue/s_aresetn] \
+    [get_bd_pins store_0/ap_rst_n] \
+    [get_bd_pins store_queue/s_aresetn]
+  connect_bd_net -net processing_system7_1_FCLK_CLK \
+    [get_bd_pins axi_interconnect_1/ACLK] \
+    [get_bd_pins axi_interconnect_1/M00_ACLK] \
+    [get_bd_pins axi_interconnect_1/M01_ACLK] \
+    [get_bd_pins axi_interconnect_1/M02_ACLK] \
+    [get_bd_pins axi_interconnect_1/M03_ACLK] \
+    [get_bd_pins axi_interconnect_1/M04_ACLK] \
+    [get_bd_pins axi_interconnect_1/S00_ACLK] \
+    [get_bd_pins axi_smc/aclk] \
+    [get_bd_pins axi_timer_1/s_axi_aclk] \
+    [get_bd_pins compute_0/ap_clk] \
+    [get_bd_pins fetch_0/ap_clk] \
+    [get_bd_pins g2l_queue/s_aclk] \
+    [get_bd_pins g2s_queue/s_aclk] \
+    [get_bd_pins gemm_queue/s_aclk] \
+    [get_bd_pins l2g_queue/s_aclk] \
+    [get_bd_pins load_0/ap_clk] \
+    [get_bd_pins load_queue/s_aclk] \
+    [get_bd_pins proc_sys_reset/slowest_sync_clk] \
+    [get_bd_pins processing_system7_1/FCLK_CLK${clk}] \
+    [get_bd_pins processing_system7_1/M_AXI_GP0_ACLK] \
+    [get_bd_pins processing_system7_1/S_AXI_ACP_ACLK] \
+    [get_bd_pins s2g_queue/s_aclk] \
+    [get_bd_pins store_0/ap_clk] \
+    [get_bd_pins store_queue/s_aclk]
+  connect_bd_net -net processing_system7_1_fclk_reset0_n \
+    [get_bd_pins proc_sys_reset/ext_reset_in] \
+    [get_bd_pins processing_system7_1/FCLK_RESET0_N]
+  connect_bd_net -net store_0_interrupt \
+    [get_bd_pins store_0/interrupt] \
+    [get_bd_pins xlconcat_1/In4]
+  connect_bd_net -net xlconcat_1_dout \
+    [get_bd_pins processing_system7_1/IRQ_F2P] \
+    [get_bd_pins xlconcat_1/dout]
+
+  # Create address segments
+  create_bd_addr_seg -range 0x40000000 -offset 0x00000000 \
+    [get_bd_addr_spaces compute_0/Data_m_axi_uop_port] \
+    [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_DDR_LOWOCM] \
+    SEG_processing_system7_1_ACP_DDR_LOWOCM
+  create_bd_addr_seg -range 0x40000000 -offset 0x00000000 \
+    [get_bd_addr_spaces compute_0/Data_m_axi_data_port] \
+    [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_DDR_LOWOCM] \
+    SEG_processing_system7_1_ACP_DDR_LOWOCM
+  create_bd_addr_seg -range 0x00040000 -offset 0xFFFC0000 \
+    [get_bd_addr_spaces compute_0/Data_m_axi_uop_port] \
+    [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_HIGH_OCM] \
+    SEG_processing_system7_1_ACP_HIGH_OCM
+  create_bd_addr_seg -range 0x00040000 -offset 0xFFFC0000 \
+    [get_bd_addr_spaces compute_0/Data_m_axi_data_port] \
+    [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_HIGH_OCM] \
+    SEG_processing_system7_1_ACP_HIGH_OCM
+  create_bd_addr_seg -range 0x00400000 -offset 0xE0000000 \
+    [get_bd_addr_spaces compute_0/Data_m_axi_uop_port] \
+    [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_IOP] \
+    SEG_processing_system7_1_ACP_IOP
+  create_bd_addr_seg -range 0x00400000 -offset 0xE0000000 \
+    [get_bd_addr_spaces compute_0/Data_m_axi_data_port] \
+    [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_IOP] \
+    SEG_processing_system7_1_ACP_IOP
+  create_bd_addr_seg -range 0x40000000 -offset 0x40000000 \
+    [get_bd_addr_spaces compute_0/Data_m_axi_uop_port] \
+    [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_M_AXI_GP0] \
+    SEG_processing_system7_1_ACP_M_AXI_GP0
+  create_bd_addr_seg -range 0x40000000 -offset 0x40000000 \
+    [get_bd_addr_spaces compute_0/Data_m_axi_data_port] \
+    [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_M_AXI_GP0] \
+    SEG_processing_system7_1_ACP_M_AXI_GP0
+  create_bd_addr_seg -range 0x40000000 -offset 0x00000000 \
+    [get_bd_addr_spaces fetch_0/Data_m_axi_ins_port] \
+    [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_DDR_LOWOCM] \
+    SEG_processing_system7_1_ACP_DDR_LOWOCM
+  create_bd_addr_seg -range 0x00040000 -offset 0xFFFC0000 \
+    [get_bd_addr_spaces fetch_0/Data_m_axi_ins_port] \
+    [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_HIGH_OCM] \
+    SEG_processing_system7_1_ACP_HIGH_OCM
+  create_bd_addr_seg -range 0x00400000 -offset 0xE0000000 \
+    [get_bd_addr_spaces fetch_0/Data_m_axi_ins_port] \
+    [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_IOP] \
+    SEG_processing_system7_1_ACP_IOP
+  create_bd_addr_seg -range 0x40000000 -offset 0x40000000 \
+    [get_bd_addr_spaces fetch_0/Data_m_axi_ins_port] \
+    [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_M_AXI_GP0] \
+    SEG_processing_system7_1_ACP_M_AXI_GP0
+  create_bd_addr_seg -range 0x40000000 -offset 0x00000000 \
+    [get_bd_addr_spaces load_0/Data_m_axi_data_port] \
+    [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_DDR_LOWOCM] \
+    SEG_processing_system7_1_ACP_DDR_LOWOCM
+  create_bd_addr_seg -range 0x00040000 -offset 0xFFFC0000 \
+    [get_bd_addr_spaces load_0/Data_m_axi_data_port] \
+    [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_HIGH_OCM] \
+    SEG_processing_system7_1_ACP_HIGH_OCM
+  create_bd_addr_seg -range 0x00400000 -offset 0xE0000000 \
+    [get_bd_addr_spaces load_0/Data_m_axi_data_port] \
+    [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_IOP] \
+    SEG_processing_system7_1_ACP_IOP
+  create_bd_addr_seg -range 0x40000000 -offset 0x40000000 \
+    [get_bd_addr_spaces load_0/Data_m_axi_data_port] \
+    [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_M_AXI_GP0] \
+    SEG_processing_system7_1_ACP_M_AXI_GP0
+  create_bd_addr_seg -range 0x00010000 -offset 0x42800000 \
+    [get_bd_addr_spaces processing_system7_1/Data] \
+    [get_bd_addr_segs axi_timer_1/S_AXI/Reg] SEG_axi_timer_1_Reg
+  create_bd_addr_seg -range 0x00010000 -offset 0x43C10000 \
+    [get_bd_addr_spaces processing_system7_1/Data] \
+    [get_bd_addr_segs compute_0/s_axi_CONTROL_BUS/Reg] SEG_compute_0_Reg
+  create_bd_addr_seg -range 0x00010000 -offset 0x43C00000 \
+    [get_bd_addr_spaces processing_system7_1/Data] \
+    [get_bd_addr_segs fetch_0/s_axi_CONTROL_BUS/Reg] SEG_fetch_0_Reg
+  create_bd_addr_seg -range 0x00010000 -offset 0x43C20000 \
+    [get_bd_addr_spaces processing_system7_1/Data] \
+    [get_bd_addr_segs load_0/s_axi_CONTROL_BUS/Reg] SEG_load_0_Reg
+  create_bd_addr_seg -range 0x00010000 -offset 0x43C30000 \
+    [get_bd_addr_spaces processing_system7_1/Data] \
+    [get_bd_addr_segs store_0/s_axi_CONTROL_BUS/Reg] SEG_store_0_Reg
+  create_bd_addr_seg -range 0x40000000 -offset 0x00000000 \
+    [get_bd_addr_spaces store_0/Data_m_axi_data_port] \
+    [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_DDR_LOWOCM] \
+    SEG_processing_system7_1_ACP_DDR_LOWOCM
+  create_bd_addr_seg -range 0x00040000 -offset 0xFFFC0000 \
+    [get_bd_addr_spaces store_0/Data_m_axi_data_port] \
+    [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_HIGH_OCM] \
+    SEG_processing_system7_1_ACP_HIGH_OCM
+  create_bd_addr_seg -range 0x00400000 -offset 0xE0000000 \
+    [get_bd_addr_spaces store_0/Data_m_axi_data_port] \
+    [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_IOP] \
+    SEG_processing_system7_1_ACP_IOP
+  create_bd_addr_seg -range 0x40000000 -offset 0x40000000 \
+    [get_bd_addr_spaces store_0/Data_m_axi_data_port] \
+    [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_M_AXI_GP0] \
+    SEG_processing_system7_1_ACP_M_AXI_GP0
+
+
+  # Restore current instance
+  current_bd_instance $oldCurInst
+
+  save_bd_design
+}
+# End of create_root_design()
+
+
+##################################################################
+# MAIN FLOW
+##################################################################
+
+# Build the block design, forwarding the clock id, the memory partition
+# counts and the per-buffer bus widths / depths
+create_root_design "" $clock_id \
+  $inp_part $wgt_part $out_part \
+  $inp_bus_width $inp_mem_depth \
+  $wgt_bus_width $wgt_mem_depth \
+  $out_bus_width $out_mem_depth
+
+# Directories referenced more than once below
+set bd_dir $proj_path/$proj_name.srcs/sources_1/bd/$proj_name
+set impl_dir $proj_path/$proj_name.runs/impl_1
+set export_dir $proj_path/export
+
+# Generate the top-level HDL wrapper of the block design, add it to the
+# project, then refresh the compile order of both filesets
+make_wrapper -files [get_files $bd_dir/$proj_name.bd] -top
+add_files -norecurse $bd_dir/hdl/${proj_name}_wrapper.v
+foreach fileset {sources_1 sim_1} {
+  update_compile_order -fileset $fileset
+}
+
+# Run implementation up to bitstream generation with $num_threads jobs.
+# The performance oriented P&R strategy below is currently disabled:
+# create_run impl_1 -parent_run synth_1 -flow {Vivado Implementation 2017} \
+#   -strategy "Performance_ExplorePostRoutePhysOpt"
+launch_runs impl_1 -to_step write_bitstream -jobs $num_threads
+wait_on_run impl_1
+
+# Export the hardware description file (.sysdef -> vta.hdf) and the
+# bitstream (.bit -> vta.bit) to the export/ dir
+file mkdir $export_dir
+foreach {src_ext dst_ext} {sysdef hdf bit bit} {
+  file copy -force $impl_dir/${proj_name}_wrapper.$src_ext $export_dir/vta.$dst_ext
+}
+
+exit
--- a/vta/hardware/vivado/sim/vta_test.cc
+++ b/vta/hardware/vivado/sim/vta_test.cc
@ -0,0 +1,62 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file vta_test.cc
+ * \brief Simulation tests for the VTA design.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <iostream>
+
+#include "vta.h"
+#include "vta_test_lib.h"
+
+/*!
+ * \brief Entry point of the VTA simulation tests.
+ *   Checks the design parameters with assertions, then runs the ALU tests
+ *   and the blocked GEMM tests.
+ * \return Bitwise OR of the values returned by all the tests that were run.
+ */
+int main(void) {
+#if DEBUG==1
+  printParameters();
+#endif
+
+  // ---- Parameter sanity checks ----
+  // Buffer indexing
+  assert(LOG_ACC_BUFF_DEPTH>=LOG_INP_BUFF_DEPTH);
+  // Micro op bound
+  assert(UOP_GEM_3_1<UOP_WIDTH);
+  assert(UOP_ALU_3_1<UOP_WIDTH);
+  // Instruction alignment checks
+  assert(INSN_MEM_7_1<INSN_MEM_8_0);
+  assert(INSN_GEM_8_1<INSN_GEM_9_0);
+  // Instruction bounds
+  assert(INSN_MEM_E_1<INS_WIDTH);
+  assert(INSN_GEM_E_1<INS_WIDTH);
+  assert(INSN_ALU_F_1<INS_WIDTH);
+
+  // ---- ALU tests, vector-scalar operators ----
+  int scalar_alu = 0;
+  scalar_alu |= alu_test(ALU_OPCODE_MIN, true, 16, 128, true);
+  scalar_alu |= alu_test(ALU_OPCODE_MIN, true, 16, 128, false);
+  scalar_alu |= alu_test(ALU_OPCODE_MAX, true, 16, 128, true);
+  scalar_alu |= alu_test(ALU_OPCODE_MAX, true, 16, 128, false);
+  scalar_alu |= alu_test(ALU_OPCODE_ADD, true, 16, 128, true);
+  scalar_alu |= alu_test(ALU_OPCODE_ADD, true, 16, 128, false);
+  scalar_alu |= alu_test(ALU_OPCODE_SHR, true, 16, 128, true);
+  scalar_alu |= alu_test(ALU_OPCODE_SHR, true, 16, 128, false);
+
+  // ---- ALU tests, vector-vector operators ----
+  int vector_alu = 0;
+  vector_alu |= alu_test(ALU_OPCODE_MIN, false, 16, 128, true);
+  vector_alu |= alu_test(ALU_OPCODE_MIN, false, 16, 128, false);
+  vector_alu |= alu_test(ALU_OPCODE_MAX, false, 16, 128, true);
+  vector_alu |= alu_test(ALU_OPCODE_MAX, false, 16, 128, false);
+  vector_alu |= alu_test(ALU_OPCODE_ADD, false, 16, 128, true);
+  vector_alu |= alu_test(ALU_OPCODE_ADD, false, 16, 128, false);
+
+  // ---- Blocked GEMM tests ----
+  int gemm = 0;
+  gemm |= blocked_gemm_test(256, 256, BLOCK_OUT*4, true, 2);
+  gemm |= blocked_gemm_test(256, 256, BLOCK_OUT*4, false, 2);
+  gemm |= blocked_gemm_test(256, 256, BLOCK_OUT*4, true, 1);
+  gemm |= blocked_gemm_test(256, 256, BLOCK_OUT*4, false, 1);
+
+  // Any non-zero test result makes the overall result non-zero
+  return scalar_alu | vector_alu | gemm;
+}
--- a/vta/include/hardware/hls/vta.h
+++ b/vta/include/hardware/hls/vta.h
@ -0,0 +1,137 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file vta.h
+ * \brief Type definitions and prototype for VTA HLS design.
+ */
+#ifndef VTA_MAIN_H_
+#define VTA_MAIN_H_
+
+#include <assert.h>
+#include <ap_axi_sdata.h>
+#include <ap_int.h>
+#include <hls_stream.h>
+
+#include "vta_typedefs.h"
+#include "vta_params.h"
+
+/*!
+* \brief Fetch module.
+*   Reads in \a insn_count instructions via DMA and pushes them to the
+*   appropriate load, gemm or store queue.
+* \param insn_count Total instruction count. AXI-lite memory mapped register.
+* \param insns Instruction data base address in DRAM. AXI-4 master port.
+* \param load_queue Load instruction queue. AXI-stream FIFO.
+* \param gemm_queue GEMM instruction queue. AXI-stream FIFO.
+* \param store_queue Store instruction queue. AXI-stream FIFO.
+*/
+void fetch (
+  uint32_t insn_count,
+  volatile insn_T *insns,
+  hls::stream<insn_T> &load_queue,
+  hls::stream<insn_T> &gemm_queue,
+  hls::stream<insn_T> &store_queue);
+
+/*!
+* \brief Load module.
+*   Reads in load instructions from the load queue, and performs the
+*   appropriate DMA load operations from DRAM to the \a inp_mem and
+*   \a wgt_mem SRAM buffers. Updates dependence queues accordingly.
+* \param inputs Input data base address in DRAM. AXI-4 master port.
+* \param weights Weight data base address in DRAM. AXI-4 master port.
+* \param load_queue Load instruction queue. AXI-stream FIFO.
+* \param g2l_dep_queue Dependence queue from GEMM to load stage.
+*   AXI-stream FIFO.
+* \param l2g_dep_queue Dependence queue from load to GEMM stage.
+*   AXI-stream FIFO.
+* \param inp_mem Local input SRAM buffer, declared as
+*   INP_BUFF_DEPTH x BATCH elements of inp_vec_T.
+*   Write only single port BRAM.
+* \param wgt_mem Local weight SRAM buffer, declared as
+*   WGT_BUFF_DEPTH x BLOCK_OUT elements of wgt_vec_T.
+*   Write only single port BRAM.
+*/
+void load (
+  volatile inp_vec_T *inputs,
+  volatile wgt_vec_T *weights,
+  hls::stream<insn_T> &load_queue,
+  hls::stream<bool> &g2l_dep_queue,
+  hls::stream<bool> &l2g_dep_queue,
+  inp_vec_T inp_mem[INP_BUFF_DEPTH][BATCH],
+  wgt_vec_T wgt_mem[WGT_BUFF_DEPTH][BLOCK_OUT]
+  );
+
+/*!
+* \brief Compute module.
+*   Reads in GEMM instructions from the gemm queue, and performs appropriate
+*   GEMM/ALU instructions. Reads in data from the \a wgt_mem and \a inp_mem,
+*   and writes computation results into the \a out_mem. Updates dependence
+*   queues accordingly.
+* \param done Signal that indicates that VTA is done.  AXI-lite memory mapped
+*   register.
+* \param uops Micro-op data base address in DRAM. AXI-4 master port.
+* \param biases Bias data base address in DRAM. AXI-4 master port.
+* \param gemm_queue GEMM instruction queue. AXI-stream FIFO.
+* \param l2g_dep_queue Dependence queue from load to gemm stage.
+*   AXI-stream FIFO.
+* \param s2g_dep_queue Dependence queue from store to gemm stage.
+*   AXI-stream FIFO.
+* \param g2l_dep_queue Dependence queue from gemm to load stage.
+*   AXI-stream FIFO.
+* \param g2s_dep_queue Dependence queue from gemm to store stage.
+*   AXI-stream FIFO.
+* \param inp_mem Local input SRAM buffer. Read only single port BRAM.
+* \param wgt_mem Local weight SRAM buffer. Read only single port BRAM.
+* \param out_mem Local output SRAM buffer. Write only single port BRAM.
+* \note \a inp_mem is declared here with element type out_vec_T, whereas
+*   load() declares its \a inp_mem with inp_vec_T. The two typedefs have the
+*   same width only when INP_WIDTH*BLOCK_IN == OUT_WIDTH*BLOCK_OUT.
+*   TODO(review): confirm whether inp_vec_T was intended here.
+*/
+void compute (
+  volatile uint32_t &done,
+  volatile uop_T *uops,
+  volatile acc_vec_T *biases,
+  hls::stream<insn_T> &gemm_queue,
+  hls::stream<bool> &l2g_dep_queue,
+  hls::stream<bool> &s2g_dep_queue,
+  hls::stream<bool> &g2l_dep_queue,
+  hls::stream<bool> &g2s_dep_queue,
+  out_vec_T inp_mem[INP_BUFF_DEPTH][BATCH],
+  wgt_vec_T wgt_mem[WGT_BUFF_DEPTH][BLOCK_OUT],
+  out_vec_T out_mem[ACC_BUFF_DEPTH][BATCH]
+  );
+
+/*!
+* \brief Store module.
+*   Reads in store instructions from the store queue, and performs the
+*   appropriate stores from the output buffer in SRAM to DRAM. Updates
+*   dependence queues accordingly.
+* \param outputs Output data base address in DRAM. AXI-4 master port.
+* \param store_queue Store instruction queue. AXI-stream FIFO.
+* \param g2s_dep_queue Dependence queue from gemm to store stage.
+*   AXI-stream FIFO.
+* \param s2g_dep_queue Dependence queue from store to gemm stage.
+*   AXI-stream FIFO.
+* \param out_mem Local output SRAM buffer, declared as
+*   ACC_BUFF_DEPTH x BATCH elements of out_vec_T.
+*   Read only single port BRAM.
+*/
+void store (
+  volatile out_vec_T *outputs,
+  hls::stream<insn_T> &store_queue,
+  hls::stream<bool> &g2s_dep_queue,
+  hls::stream<bool> &s2g_dep_queue,
+  out_vec_T out_mem[ACC_BUFF_DEPTH][BATCH]
+  );
+
+/*!
+* \brief VTA wrapper for simulation purpose only.
+*   Orchestrates dataflow execution of the fetch, load, GEMM and store stages.
+*   Takes the DRAM-facing arguments of those stages; the instruction and
+*   dependence queues and the SRAM buffers are not part of this interface.
+* \param insn_count Total instruction count. AXI-lite memory mapped register.
+* \param insns Instruction data base address in DRAM. AXI-4 master port.
+* \param uops Micro-op data base address in DRAM. AXI-4 master port.
+* \param inputs Input data base address in DRAM. AXI-4 master port.
+* \param weights Weight data base address in DRAM. AXI-4 master port.
+* \param biases Bias data base address in DRAM. AXI-4 master port.
+* \param outputs Output data base address in DRAM. AXI-4 master port.
+*/
+void vta (
+  uint32_t insn_count,
+  volatile insn_T *insns,
+  volatile uop_T *uops,
+  volatile inp_vec_T *inputs,
+  volatile wgt_vec_T *weights,
+  volatile acc_vec_T *biases,
+  volatile out_vec_T *outputs);
+
+#endif  // VTA_MAIN_H_
--- a/vta/include/hardware/hls/vta_typedefs.h
+++ b/vta/include/hardware/hls/vta_typedefs.h
@ -0,0 +1,97 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file vta_typedefs.h
+ * \brief Type definitions for VTA HLS design.
+ */
+#ifndef VTA_TYPEDEFS_H_
+#define VTA_TYPEDEFS_H_
+
+#include <assert.h>
+#include <ap_axi_sdata.h>
+#include <ap_int.h>
+#include <hls_stream.h>
+
+#include "vta_params.h"
+
+/* \typedef uop_T Micro-op datatype (unsigned, UOP_WIDTH bits) */
+typedef ap_uint<UOP_WIDTH> uop_T;
+
+/* \typedef inp_T Input datatype (signed, INP_WIDTH bits) */
+typedef ap_int<INP_WIDTH> inp_T;
+
+/* \typedef wgt_T Weight datatype (signed, WGT_WIDTH bits) */
+typedef ap_int<WGT_WIDTH> wgt_T;
+
+/* \typedef out_T Output datatype (signed, OUT_WIDTH bits) */
+typedef ap_int<OUT_WIDTH> out_T;
+
+/* \typedef acc_T Accumulator datatype (signed, ACC_WIDTH bits) */
+typedef ap_int<ACC_WIDTH> acc_T;
+
+/* \typedef mul_T Multiplier output datatype
+ *   (signed, one bit wider than the weight and input widths combined) */
+typedef ap_int<WGT_WIDTH+INP_WIDTH+1> mul_T;
+
+/* \typedef sum_T GEMM accumulator datatype
+ *   (signed, LOG_BLOCK_IN bits wider than mul_T) */
+typedef ap_int<WGT_WIDTH+INP_WIDTH+LOG_BLOCK_IN+1> sum_T;
+
+/* \typedef inp_vec_T Input vector datatype (BLOCK_IN packed input elements) */
+typedef ap_uint<INP_WIDTH*BLOCK_IN> inp_vec_T;
+
+/* \typedef wgt_vec_T Weight vector datatype (BLOCK_IN packed weight elements) */
+typedef ap_uint<WGT_WIDTH*BLOCK_IN> wgt_vec_T;
+
+/* \typedef acc_vec_T Accumulator vector datatype
+ *   (BLOCK_OUT packed accumulator elements) */
+typedef ap_uint<ACC_WIDTH*BLOCK_OUT> acc_vec_T;
+
+/* \typedef out_vec_T Output vector datatype (BLOCK_OUT packed output elements) */
+typedef ap_uint<OUT_WIDTH*BLOCK_OUT> out_vec_T;
+
+/* \typedef uop_idx_T Micro-op SRAM index datatype
+ *   (one bit wider than LOG_UOP_BUFF_DEPTH) */
+typedef ap_uint<LOG_UOP_BUFF_DEPTH+1> uop_idx_T;
+
+/* \typedef inp_idx_T Input SRAM index datatype
+ *   (one bit wider than LOG_INP_BUFF_DEPTH) */
+typedef ap_uint<LOG_INP_BUFF_DEPTH+1> inp_idx_T;
+
+/* \typedef wgt_idx_T Weight SRAM index datatype
+ *   (one bit wider than LOG_WGT_BUFF_DEPTH) */
+typedef ap_uint<LOG_WGT_BUFF_DEPTH+1> wgt_idx_T;
+
+/* \typedef acc_idx_T Accumulator SRAM index datatype
+ *   (one bit wider than LOG_ACC_BUFF_DEPTH) */
+typedef ap_uint<LOG_ACC_BUFF_DEPTH+1> acc_idx_T;
+
+/* \typedef opcode_T Opcode datatype */
+typedef ap_uint<OPCODE_BIT_WIDTH> opcode_T;
+
+/* \typedef insn_T Instruction datatype (unsigned, INS_WIDTH bits) */
+typedef ap_uint<INS_WIDTH> insn_T;
+
+/* \typedef loop_T Loop bound datatype */
+typedef ap_uint<LOOP_ITER_WIDTH> loop_T;
+
+/* \typedef memop_id_T Memory operation ID datatype */
+typedef ap_uint<MEMOP_ID_BIT_WIDTH> memop_id_T;
+
+/* \typedef memop_sram_T Memory operation SRAM index datatype */
+typedef ap_uint<MEMOP_SRAM_ADDR_BIT_WIDTH> memop_sram_T;
+
+/* \typedef memop_dram_T Memory operation DRAM index datatype */
+typedef ap_uint<MEMOP_DRAM_ADDR_BIT_WIDTH> memop_dram_T;
+
+/* \typedef memop_size_T Memory operation range datatype */
+typedef ap_uint<MEMOP_SIZE_BIT_WIDTH> memop_size_T;
+
+/* \typedef memop_stride_T Memory operation stride datatype */
+typedef ap_uint<MEMOP_STRIDE_BIT_WIDTH> memop_stride_T;
+
+/* \typedef memop_pad_T Memory operation pad width datatype */
+typedef ap_uint<MEMOP_PAD_BIT_WIDTH> memop_pad_T;
+
+/* \typedef aluop_opcode_T ALU operation opcode datatype */
+typedef ap_uint<ALU_OPCODE_BIT_WIDTH> aluop_opcode_T;
+
+/* \typedef aluop_imm_T ALU operation immediate datatype (signed) */
+typedef ap_int<ALUOP_IMM_BIT_WIDTH> aluop_imm_T;
+
+/* \typedef aluop_sh_imm_T ALU operation shift immediate datatype
+ *   (unsigned, LOG_ACC_WIDTH bits) */
+typedef ap_uint<LOG_ACC_WIDTH> aluop_sh_imm_T;
+
+#endif // VTA_TYPEDEFS_H_
--- a/vta/include/vta_params.h
+++ b/vta/include/vta_params.h
@ -0,0 +1,559 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file vta_params.h
+ * \brief Preprocessor definitions for VTA HLS design and runtime.
+ */
+#ifndef VTA_DEFINES_H_
+#define VTA_DEFINES_H_
+
+#include <stdint.h>
+
+/*! log2 of instruction data type width */
+#define LOG_INS_WIDTH 7
+/*! Instruction data type width */
+#define INS_WIDTH (1<<LOG_INS_WIDTH)
+/*! log2 of micro op data type width */
+#define LOG_UOP_WIDTH 5
+/*! Micro Op data type width */
+#define UOP_WIDTH (1<<LOG_UOP_WIDTH)
+/*! Weight data type width */
+#define WGT_WIDTH (1<<LOG_WGT_WIDTH)
+/*! Input data type width */
+#define INP_WIDTH (1<<LOG_INP_WIDTH)
+/*! Output data type width */
+#define OUT_WIDTH (1<<LOG_OUT_WIDTH)
+/*! Accumulator data type width */
+#define ACC_WIDTH (1<<LOG_ACC_WIDTH)
+/*! log2 of ALU data type width */
+#define LOG_ALU_WIDTH (LOG_ACC_WIDTH-1)
+/*! ALU data type width */
+#define ALU_WIDTH (1<<LOG_ALU_WIDTH)
+
+/*! Batch size (corresponds to A in (A,B)x(B,C) mat mult)*/
+#define BATCH (1<<LOG_BATCH)
+/*! Blocking factor of inner most loop (corresponds to B in (A,B)x(B,C) mat mult) */
+#define BLOCK_IN (1<<LOG_BLOCK_IN)
+/*! Blocking factor of the outer loop (corresponds to C in (A,B)x(B,C) mat mult) */
+#define BLOCK_OUT (1<<LOG_BLOCK_OUT)
+
+/*! Weight vector width */
+#define WGT_VECTOR_WIDTH (WGT_WIDTH*BLOCK_IN)
+/*! Input vector width */
+#define INP_VECTOR_WIDTH (INP_WIDTH*BLOCK_IN)
+/*! Accumulator vector width */
+#define ACC_VECTOR_WIDTH (ACC_WIDTH*BLOCK_OUT)
+/*! Output vector width */
+#define OUT_VECTOR_WIDTH (OUT_WIDTH*BLOCK_OUT)
+
+/*! On-chip micro-op buffer size in B */
+#define UOP_BUFF_SIZE (1<<LOG_UOP_BUFF_SIZE)
+/*! On-chip weight buffer size in B */
+#define WGT_BUFF_SIZE (1<<LOG_WGT_BUFF_SIZE)
+/*! On-chip activation buffer size in B */
+#define INP_BUFF_SIZE (1<<LOG_INP_BUFF_SIZE)
+/*! On-chip accumulator buffer size in B */
+#define ACC_BUFF_SIZE (1<<LOG_ACC_BUFF_SIZE)
+
+/*! Size of instruction buffer element in B */
+#define INS_ELEM_BYTES (INS_WIDTH/8)
+/*! Size of uop buffer element in B*/
+#define UOP_ELEM_BYTES (UOP_WIDTH/8)
+/*! Size of activation buffer element in B*/
+#define INP_ELEM_BYTES (BATCH*BLOCK_IN*INP_WIDTH/8)
+/*! Size of weight buffer element in B*/
+#define WGT_ELEM_BYTES (BLOCK_OUT*BLOCK_IN*WGT_WIDTH/8)
+/*! Size of accumulator buffer element in B*/
+#define ACC_ELEM_BYTES (BATCH*BLOCK_OUT*ACC_WIDTH/8)
+
+/*! On-chip micro-op buffer depth */
+#define UOP_BUFF_DEPTH (UOP_BUFF_SIZE/UOP_ELEM_BYTES)
+/*! log2 of on-chip micro-op buffer depth */
+#define LOG_UOP_BUFF_DEPTH (LOG_UOP_BUFF_SIZE-LOG_UOP_WIDTH+3)
+/*! On-chip weight buffer depth */
+#define WGT_BUFF_DEPTH (WGT_BUFF_SIZE/WGT_ELEM_BYTES)
+/*! log2 of on-chip weight buffer depth */
+#define LOG_WGT_BUFF_DEPTH (LOG_WGT_BUFF_SIZE-LOG_BLOCK_OUT-LOG_BLOCK_IN-LOG_WGT_WIDTH+3)
+/*! On-chip activation buffer depth */
+#define INP_BUFF_DEPTH (INP_BUFF_SIZE/INP_ELEM_BYTES)
+/*! log2 of on-chip activation buffer depth */
+#define LOG_INP_BUFF_DEPTH (LOG_INP_BUFF_SIZE-LOG_BATCH-LOG_BLOCK_IN-LOG_INP_WIDTH+3)
+/*! On-chip accumulator buffer depth */
+#define ACC_BUFF_DEPTH (ACC_BUFF_SIZE/ACC_ELEM_BYTES)
+/*! log2 of on-chip accumulator buffer depth */
+#define LOG_ACC_BUFF_DEPTH (LOG_ACC_BUFF_SIZE-LOG_BATCH-LOG_BLOCK_OUT-LOG_ACC_WIDTH+3)
+
+/*! Instruction opcode field bitwidth */
+#define OPCODE_BIT_WIDTH 3
+/*! ALU opcode field bitwidth */
+#define ALU_OPCODE_BIT_WIDTH 3
+/*! ALU instruction reset mode bitwidth */
+#define ALU_RESET_BIT_WIDTH 2
+
+/*! Opcode: load encoding */
+#define OPCODE_LOAD 0
+/*! Opcode: store encoding */
+#define OPCODE_STORE 1
+/*! Opcode: GEMM encoding */
+#define OPCODE_GEMM 2
+/*! Opcode: finish encoding */
+#define OPCODE_FINISH 3
+/*! Opcode: ALU encoding */
+#define OPCODE_ALU 4
+
+/*! ALU opcode: unary min op */
+#define ALU_OPCODE_MIN 0
+/*! ALU opcode: unary max op */
+#define ALU_OPCODE_MAX 1
+/*! ALU opcode: binary add op */
+#define ALU_OPCODE_ADD 2
+/*! ALU opcode: binary sub op [NOT IMPLEMENTED] */
+#define ALU_OPCODE_SUB 3
+/*! ALU opcode: binary mul op  [NOT IMPLEMENTED] */
+#define ALU_OPCODE_MUL 4
+/*! ALU opcode: shift left by immediate op */
+#define ALU_OPCODE_SHL 5
+/*! ALU opcode: shift right by immediate op [NOT IMPLEMENTED] */
+#define ALU_OPCODE_SHR 6
+
+/*! ALU instruction reset mode: set to min */
+#define ALU_RESET_MIN 3
+/*! ALU instruction reset mode: set to zero */
+#define ALU_RESET_ZERO 0
+/*! ALU instruction reset mode: no reset */
+#define ALU_NO_RESET 2
+/*! ALU instruction reset mode: set to max */
+#define ALU_RESET_MAX 1
+
+/*! Memory type field bitwidth */
+#define MEMOP_ID_BIT_WIDTH 2
+/*! Load/Store Instruction: SRAM address width*/
+#define MEMOP_SRAM_ADDR_BIT_WIDTH 16
+/*! Load/Store Instruction: DRAM address width*/
+#define MEMOP_DRAM_ADDR_BIT_WIDTH 32
+/*! Load/Store Instruction: transfer size width*/
+#define MEMOP_SIZE_BIT_WIDTH 16
+/*! Load/Store Instruction: stride size width*/
+#define MEMOP_STRIDE_BIT_WIDTH 16
+/*! Load/Store Instruction: padding width*/
+#define MEMOP_PAD_BIT_WIDTH 4
+/*! Load/Store Instruction: padding value encoding width*/
+#define MEMOP_PAD_VAL_BIT_WIDTH 2
+/*! ALU Instruction: immediate bitwidth*/
+#define ALUOP_IMM_BIT_WIDTH 16
+/*! GEMM/ALU Instruction: loop max iter bits */
+#define LOOP_ITER_WIDTH 15
+
+/*! Mem ID constant: uop memory */
+#define MEM_ID_UOP 0
+/*! Mem ID constant: weight memory */
+#define MEM_ID_WGT 1
+/*! Mem ID constant: input memory */
+#define MEM_ID_INP 2
+/*! Mem ID constant: accumulator/bias memory */
+#define MEM_ID_ACC 3
+/*! Mem ID constant: output store buffer. NOTE(review): 4 does not fit in a MEMOP_ID_BIT_WIDTH (2) bit memory_type field -- confirm this value is never encoded there */
+#define MEM_ID_OUT 4
+
+// Instruction organization layout:
+//
+// LOAD/STORE
+// _____________________________|_type______________|
+// arg 0: opcode                | opcode_T          |
+// arg 1: pop_prev_dependence   | bool              |
+// arg 2: pop_next_dependence   | bool              |
+// arg 3: push_prev_dependence  | bool              |
+// arg 4: push_next_dependence  | bool              |
+// arg 5: memory_type           | memop_id_T        |
+// arg 6: pad_value             | memop_pad_val_T   | (not present in VTAMemInsn or the INSN_MEM_* offsets, whose numbering skips this row)
+// arg 7: sram_base             | memop_sram_T      |
+// arg 8: dram_base             | memop_dram_T      |
+// arg 9: y_size                | memop_size_T      |
+// arg a: x_size                | memop_size_T      |
+// arg b: x_stride              | memop_stride_T    |
+// arg c: y_pad_0               | memop_pad_T       |
+// arg d: y_pad_1               | memop_pad_T       |
+// arg e: x_pad_0               | memop_pad_T       |
+// arg f: x_pad_1               | memop_pad_T       |
+//
+// GEMM
+// _____________________________|_type______________|
+// arg 0: opcode                | opcode_T          |
+// arg 1: pop_prev_dependence   | bool              |
+// arg 2: pop_next_dependence   | bool              |
+// arg 3: push_prev_dependence  | bool              |
+// arg 4: push_next_dependence  | bool              |
+// arg 5: uop_bgn               | uop_idx_T         |
+// arg 6: uop_end               | uop_idx_T         |
+// arg 7: iteration count ax0   | loop_T            |
+// arg 8: iteration count ax1   | loop_T            |
+// arg 9: accum idx factor ax0  | acc_idx_T         |
+// arg a: accum idx factor ax1  | acc_idx_T         |
+// arg b: input idx factor ax0  | acc_idx_T         |
+// arg c: input idx factor ax1  | acc_idx_T         |
+// arg d: weight idx factor ax0 | wgt_idx_T         |
+// arg e: weight idx factor ax1 | wgt_idx_T         |
+//
+// ALU
+// _____________________________|_type______________|
+// arg 0: opcode                | opcode_T          |
+// arg 1: pop_prev_dependence   | bool              |
+// arg 2: pop_next_dependence   | bool              |
+// arg 3: push_prev_dependence  | bool              |
+// arg 4: push_next_dependence  | bool              |
+// arg 5: uop_bgn               | uop_idx_T         |
+// arg 6: uop_end               | uop_idx_T         |
+// arg 7: iteration count ax0   | loop_T            |
+// arg 8: iteration count ax1   | loop_T            |
+// arg 9: dst idx factor ax0    | acc_idx_T         |
+// arg a: dst idx factor ax1    | acc_idx_T         |
+// arg b: src idx factor ax0    | acc_idx_T         |
+// arg c: src idx factor ax1    | acc_idx_T         |
+// arg d: alu_opcode            | aluop_opcode_T    |
+// arg e: use_imm               | bool              |
+// arg f: imm                   | aluop_imm_T       |
+
+/*! Load/Store instruction start position of the opcode field */
+#define INSN_MEM_0_0 0
+/*! Load/Store instruction end position of the opcode field */
+#define INSN_MEM_0_1 (INSN_MEM_0_0+OPCODE_BIT_WIDTH-1)
+/*! Load/Store instruction position of the pop_prev_dep field */
+#define INSN_MEM_1   (INSN_MEM_0_1+1)
+/*! Load/Store instruction position of the pop_next_dep field */
+#define INSN_MEM_2   (INSN_MEM_1+1)
+/*! Load/Store instruction position of the push_prev_dependence field */
+#define INSN_MEM_3   (INSN_MEM_2+1)
+/*! Load/Store instruction position of the push_next_dependence field */
+#define INSN_MEM_4   (INSN_MEM_3+1)
+/*! Load/Store instruction start position of the memory_type field */
+#define INSN_MEM_5_0 (INSN_MEM_4+1)
+/*! Load/Store instruction end position of the memory_type field */
+#define INSN_MEM_5_1 (INSN_MEM_5_0+MEMOP_ID_BIT_WIDTH-1)
+/*! Load/Store instruction start position of the sram_base field */
+#define INSN_MEM_6_0 (INSN_MEM_5_1+1)
+/*! Load/Store instruction end position of the sram_base field */
+#define INSN_MEM_6_1 (INSN_MEM_6_0+MEMOP_SRAM_ADDR_BIT_WIDTH-1)
+/*! Load/Store instruction start position of the dram_base field */
+#define INSN_MEM_7_0 (INSN_MEM_6_1+1)
+/*! Load/Store instruction end position of the dram_base field */
+#define INSN_MEM_7_1 (INSN_MEM_7_0+MEMOP_DRAM_ADDR_BIT_WIDTH-1)
+/*! Load/Store instruction start position of the y_size field */
+#define INSN_MEM_8_0 64
+/*! Load/Store instruction end position of the y_size field */
+#define INSN_MEM_8_1 (INSN_MEM_8_0+MEMOP_SIZE_BIT_WIDTH-1)
+/*! Load/Store instruction start position of the x_size field */
+#define INSN_MEM_9_0 (INSN_MEM_8_1+1)
+/*! Load/Store instruction end position of the x_size field */
+#define INSN_MEM_9_1 (INSN_MEM_9_0+MEMOP_SIZE_BIT_WIDTH-1)
+/*! Load/Store instruction start position of the x_stride field */
+#define INSN_MEM_A_0 (INSN_MEM_9_1+1)
+/*! Load/Store instruction end position of the x_stride field */
+#define INSN_MEM_A_1 (INSN_MEM_A_0+MEMOP_STRIDE_BIT_WIDTH-1)
+/*! Load/Store instruction start position of the y_pad_0 field */
+#define INSN_MEM_B_0 (INSN_MEM_A_1+1)
+/*! Load/Store instruction end position of the y_pad_0 field */
+#define INSN_MEM_B_1 (INSN_MEM_B_0+MEMOP_PAD_BIT_WIDTH-1)
+/*! Load/Store instruction start position of the y_pad_1 field */
+#define INSN_MEM_C_0 (INSN_MEM_B_1+1)
+/*! Load/Store instruction end position of the y_pad_1 field */
+#define INSN_MEM_C_1 (INSN_MEM_C_0+MEMOP_PAD_BIT_WIDTH-1)
+/*! Load/Store instruction start position of the x_pad_0 field */
+#define INSN_MEM_D_0 (INSN_MEM_C_1+1)
+/*! Load/Store instruction end position of the x_pad_0 field */
+#define INSN_MEM_D_1 (INSN_MEM_D_0+MEMOP_PAD_BIT_WIDTH-1)
+/*! Load/Store instruction start position of the x_pad_1 field */
+#define INSN_MEM_E_0 (INSN_MEM_D_1+1)
+/*! Load/Store instruction end position of the x_pad_1 field */
+#define INSN_MEM_E_1 (INSN_MEM_E_0+MEMOP_PAD_BIT_WIDTH-1)
+
+/*! GEMM instruction start position of the opcode field */
+#define INSN_GEM_0_0 0
+/*! GEMM instruction end position of the opcode field */
+#define INSN_GEM_0_1 (INSN_GEM_0_0+OPCODE_BIT_WIDTH-1)
+/*! GEMM instruction position of the pop_prev_dep field */
+#define INSN_GEM_1   (INSN_GEM_0_1+1)
+/*! GEMM instruction position of the pop_next_dep field */
+#define INSN_GEM_2   (INSN_GEM_1+1)
+/*! GEMM instruction position of the push_prev_dependence field */
+#define INSN_GEM_3   (INSN_GEM_2+1)
+/*! GEMM instruction position of the push_next_dependence field */
+#define INSN_GEM_4   (INSN_GEM_3+1)
+/*! GEMM instruction start position of the uop_bgn field */
+#define INSN_GEM_5_0 (INSN_GEM_4+1)
+/*! GEMM instruction end position of the uop_bgn field */
+#define INSN_GEM_5_1 (INSN_GEM_5_0+LOG_UOP_BUFF_DEPTH-1)
+/*! GEMM instruction start position of the uop_end field */
+#define INSN_GEM_6_0 (INSN_GEM_5_1+1)
+/*! GEMM instruction end position of the uop_end field */
+#define INSN_GEM_6_1 (INSN_GEM_6_0+LOG_UOP_BUFF_DEPTH+1-1)
+/*! GEMM instruction start position of the iter_out field */
+#define INSN_GEM_7_0 (INSN_GEM_6_1+1)
+/*! GEMM instruction end position of the iter_out field */
+#define INSN_GEM_7_1 (INSN_GEM_7_0+LOOP_ITER_WIDTH-1)
+/*! GEMM instruction start position of the iter_in field */
+#define INSN_GEM_8_0 (INSN_GEM_7_1+1)
+/*! GEMM instruction end position of the iter_in field */
+#define INSN_GEM_8_1 (INSN_GEM_8_0+LOOP_ITER_WIDTH-1)
+/*! GEMM instruction start position of the dst_factor_out field */
+#define INSN_GEM_9_0 64
+/*! GEMM instruction end position of the dst_factor_out field */
+#define INSN_GEM_9_1 (INSN_GEM_9_0+LOG_ACC_BUFF_DEPTH-1)
+/*! GEMM instruction start position of the dst_factor_in field */
+#define INSN_GEM_A_0 (INSN_GEM_9_1+1)
+/*! GEMM instruction end position of the dst_factor_in field */
+#define INSN_GEM_A_1 (INSN_GEM_A_0+LOG_ACC_BUFF_DEPTH-1)
+/*! GEMM instruction start position of the src_factor_out field */
+#define INSN_GEM_B_0 (INSN_GEM_A_1+1)
+/*! GEMM instruction end position of the src_factor_out field */
+#define INSN_GEM_B_1 (INSN_GEM_B_0+LOG_ACC_BUFF_DEPTH-1)
+/*! GEMM instruction start position of the src_factor_in field */
+#define INSN_GEM_C_0 (INSN_GEM_B_1+1)
+/*! GEMM instruction end position of the src_factor_in field */
+#define INSN_GEM_C_1 (INSN_GEM_C_0+LOG_ACC_BUFF_DEPTH-1)
+
+/*! GEMM instruction start position of the wgt_factor_out field */
+#define INSN_GEM_D_0 (INSN_GEM_C_1+1)
+/*! GEMM instruction end position of the wgt_factor_out field */
+#define INSN_GEM_D_1 (INSN_GEM_D_0+LOG_WGT_BUFF_DEPTH-1)
+/*! GEMM instruction start position of the wgt_factor_in field */
+#define INSN_GEM_E_0 (INSN_GEM_D_1+1)
+/*! GEMM instruction end position of the wgt_factor_in field */
+#define INSN_GEM_E_1 (INSN_GEM_E_0+LOG_WGT_BUFF_DEPTH-1)
+
+/*! ALU instruction start position of the alu_opcode field */
+#define INSN_ALU_D_0 (INSN_GEM_C_1+1)
+/*! ALU instruction end position of the alu_opcode field */
+#define INSN_ALU_D_1 (INSN_ALU_D_0+ALU_OPCODE_BIT_WIDTH-1)
+/*! ALU instruction position of the use_imm field */
+#define INSN_ALU_E   (INSN_ALU_D_1+1)
+/*! ALU instruction start position of the immediate field */
+#define INSN_ALU_F_0 (INSN_ALU_E+1)
+/*! ALU instruction end position of the immediate field */
+#define INSN_ALU_F_1 (INSN_ALU_F_0+ALUOP_IMM_BIT_WIDTH-1)
+
+/*! GEMM Micro-op position of the reset_out field */
+#define UOP_GEM_0 0
+/*! GEMM Micro-op start position of the acc_idx field */
+#define UOP_GEM_1_0 (UOP_GEM_0+1)
+/*! GEMM Micro-op end position of the acc_idx field */
+#define UOP_GEM_1_1 (UOP_GEM_1_0+LOG_ACC_BUFF_DEPTH-1)
+/*! GEMM Micro-op start position of the inp_idx field */
+#define UOP_GEM_2_0 (UOP_GEM_1_1+1)
+/*! GEMM Micro-op end position of the inp_idx field */
+#define UOP_GEM_2_1 (UOP_GEM_2_0+LOG_ACC_BUFF_DEPTH-1)
+/*! GEMM Micro-op start position of the wgt_idx field */
+#define UOP_GEM_3_0 (UOP_GEM_2_1+1)
+/*! GEMM Micro-op end position of the wgt_idx field */
+#define UOP_GEM_3_1 (UOP_GEM_3_0+LOG_WGT_BUFF_DEPTH-1)
+
+/*! ALU Micro-op position of the reset_out field */
+#define UOP_ALU_0 0
+/*! ALU Micro-op start position of the dst_idx field */
+#define UOP_ALU_1_0 (UOP_ALU_0+1)
+/*! ALU Micro-op end position of the dst_idx field */
+#define UOP_ALU_1_1 (UOP_ALU_1_0+LOG_ACC_BUFF_DEPTH-1)
+/*! ALU Micro-op start position of the src_idx field */
+#define UOP_ALU_2_0 (UOP_ALU_1_1+1)
+/*! ALU Micro-op end position of the src_idx field */
+#define UOP_ALU_2_1 (UOP_ALU_2_0+LOG_ACC_BUFF_DEPTH-1)
+/*! ALU Micro-op start position of the wgt_idx field */
+#define UOP_ALU_3_0 (UOP_ALU_2_1+1)
+/*! ALU Micro-op end position of the wgt_idx field */
+#define UOP_ALU_3_1 (UOP_ALU_3_0+LOG_WGT_BUFF_DEPTH-1)
+
+/*! \brief VTA generic instruction: raw view of an instruction as two 64-bit words */
+typedef struct {
+  uint64_t word_0     : 64;
+  uint64_t word_1     : 64;
+} VTAGenericInsn;
+
+/*! \brief VTA load/store instruction
+*   A load/store instruction describes a 2D strided access pattern
+*   with padding, which is useful for applying spatial padding
+*   on the fly to a tensor that is the input of a 2D convolution.
+*   For instance, to load a 4x4 spatial tile from a 16x16
+*   matrix with padding of size 1 on all dimensions:
+*   y_size = 4, x_size = 4, x_stride = 16, y_pad_0 = 1, y_pad_1 = 1,
+*   x_pad_0 = 1, x_pad_1 = 1.
+*/
+typedef struct {
+  /*! \brief The instruction opcode */
+  uint64_t opcode         : OPCODE_BIT_WIDTH;
+  /*! \brief Unused in this instruction */
+  uint64_t pop_prev_dep   : 1;
+  /*! \brief Pop dependence token from GEMM stage */
+  uint64_t pop_next_dep   : 1;
+  /*! \brief Unused in this instruction */
+  uint64_t push_prev_dep  : 1;
+  /*! \brief Push dependence token to GEMM stage */
+  uint64_t push_next_dep  : 1;
+  /*! \brief Source/destination SRAM for store/load instruction (presumably one of the MEM_ID_* constants) */
+  uint64_t memory_type    : MEMOP_ID_BIT_WIDTH;
+  /*! \brief SRAM base address (pointer to memory elem type) */
+  uint64_t sram_base      : MEMOP_SRAM_ADDR_BIT_WIDTH;
+  /*! \brief DRAM base address (pointer to memory elem type) */
+  uint64_t dram_base      : MEMOP_DRAM_ADDR_BIT_WIDTH;
+  /*! \brief 2D access pattern: y-size */
+  uint64_t y_size         : MEMOP_SIZE_BIT_WIDTH;
+  /*! \brief 2D access pattern: x-size (in terms of memory elements) */
+  uint64_t x_size         : MEMOP_SIZE_BIT_WIDTH;
+  /*! \brief 2D access pattern: x-stride (in terms of memory elements) */
+  uint64_t x_stride       : MEMOP_STRIDE_BIT_WIDTH;
+  /*! \brief 2D access pattern: start padding along y dimension */
+  uint64_t y_pad_0        : MEMOP_PAD_BIT_WIDTH;
+  /*! \brief 2D access pattern: end padding along y dimension */
+  uint64_t y_pad_1        : MEMOP_PAD_BIT_WIDTH;
+  /*! \brief 2D access pattern: start padding along x dimension */
+  uint64_t x_pad_0        : MEMOP_PAD_BIT_WIDTH;
+  /*! \brief 2D access pattern: end padding along x dimension */
+  uint64_t x_pad_1        : MEMOP_PAD_BIT_WIDTH;
+} VTAMemInsn;
+
+/*! \brief VTA GEMM instruction
+*   GEMM instruction is implemented by executing a sequence of micro-operations
+*   that is read from the local micro-op memory, delimited by \a uop_bgn and
+*   \a uop_end. For improved storage-efficiency, the micro-operations can be
+*   executed in a 2-level nested loop as follows:
+*   \code{.cpp}
+*     for (i = 0; i < iter_out; i++) {
+*       for (j = 0; j < iter_in; j++) {
+*         for (k = uop_bgn; k < uop_end; k++) {
+*           // Read micro op
+*           uop_T uop = uop_mem[k];
+*           // Read in memory indices
+*           acc_idx_T acc_idx = uop.dst_idx;
+*           inp_idx_T inp_idx = uop.src_idx;
+*           wgt_idx_T wgt_idx = uop.wgt_idx;
+*           // Offset those indices by affine functions of the loop indices
+*           acc_idx += j * dst_factor_in + i * dst_factor_out;
+*           inp_idx += j * src_factor_in + i * src_factor_out;
+*           wgt_idx += j * wgt_factor_in + i * wgt_factor_out;
+*           // Perform GEMM operation
+*           acc_mem[acc_idx] += dot(inp_mem[inp_idx], wgt_mem[wgt_idx]);
+*         }
+*       }
+*     }
+*   \endcode
+*
+*/
+typedef struct {
+  /*! \brief The instruction opcode */
+  uint64_t opcode         : OPCODE_BIT_WIDTH;
+  /*! \brief Pop dependence token from load stage */
+  uint64_t pop_prev_dep   : 1;
+  /*! \brief Pop dependence token from store stage */
+  uint64_t pop_next_dep   : 1;
+  /*! \brief Push dependence token to load stage */
+  uint64_t push_prev_dep  : 1;
+  /*! \brief Push dependence token to store stage */
+  uint64_t push_next_dep  : 1;
+  /*! \brief Micro-op begin address (first micro-op executed) */
+  uint64_t uop_bgn        : LOG_UOP_BUFF_DEPTH;
+  /*! \brief Micro-op end address (exclusive bound in the loop above, hence one extra bit) */
+  uint64_t uop_end        : LOG_UOP_BUFF_DEPTH+1;
+  /*! \brief Iterations in the outer uop execution loop */
+  uint64_t iter_out       : LOOP_ITER_WIDTH;
+  /*! \brief Iterations in the inner uop execution loop */
+  uint64_t iter_in        : LOOP_ITER_WIDTH;
+  /*! \brief Outer loop accumulator memory index factor */
+  uint64_t dst_factor_out : LOG_ACC_BUFF_DEPTH;
+  /*! \brief Inner loop accumulator memory index factor */
+  uint64_t dst_factor_in  : LOG_ACC_BUFF_DEPTH;
+  /*! \brief Outer loop input memory index factor */
+  uint64_t src_factor_out : LOG_ACC_BUFF_DEPTH;
+  /*! \brief Inner loop input memory index factor */
+  uint64_t src_factor_in  : LOG_ACC_BUFF_DEPTH;
+  /*! \brief Outer loop weight memory index factor */
+  uint64_t wgt_factor_out : LOG_WGT_BUFF_DEPTH;
+  /*! \brief Inner loop weight memory index factor */
+  uint64_t wgt_factor_in  : LOG_WGT_BUFF_DEPTH;
+} VTAGemInsn;
+
+/*! \brief VTA ALU instruction
+*   ALU instruction is implemented by executing a sequence of micro-operations
+*   that is read from the local micro-op memory, delimited by \a uop_bgn and
+*   \a uop_end. For improved storage-efficiency, the micro-operations can be
+*   executed in a 2-level nested loop as follows:
+*   \code{.cpp}
+*     for (i = 0; i < iter_out; i++) {
+*       for (j = 0; j < iter_in; j++) {
+*         for (k = uop_bgn; k < uop_end; k++) {
+*           // Read micro op
+*           uop_T uop = uop_mem[k];
+*           // Read in memory indices
+*           acc_idx_T dst_idx = uop.dst_idx;
+*           acc_idx_T src_idx = uop.src_idx;
+*           // Offset those indices by affine functions of the loop indices
+*           dst_idx += j * dst_factor_in + i * dst_factor_out;
+*           src_idx += j * src_factor_in + i * src_factor_out;
+*           // Perform ALU operation
+*           if (use_imm) {
+*             acc_mem[dst_idx] = alu_op(alu_opcode, acc_mem[dst_idx], imm);
+*           } else {
+*             acc_mem[dst_idx] = alu_op(alu_opcode, acc_mem[dst_idx], acc_mem[src_idx]);
+*           }
+*         }
+*       }
+*     }
+*   \endcode
+*
+*/
+typedef struct {
+  /*! \brief The instruction opcode */
+  uint64_t opcode         : OPCODE_BIT_WIDTH;
+  /*! \brief Pop dependence token from load stage */
+  uint64_t pop_prev_dep   : 1;
+  /*! \brief Pop dependence token from store stage */
+  uint64_t pop_next_dep   : 1;
+  /*! \brief Push dependence token to load stage */
+  uint64_t push_prev_dep  : 1;
+  /*! \brief Push dependence token to store stage */
+  uint64_t push_next_dep  : 1;
+  /*! \brief Micro-op begin address (first micro-op executed) */
+  uint64_t uop_bgn        : LOG_UOP_BUFF_DEPTH;
+  /*! \brief Micro-op end address (exclusive bound in the loop above, hence one extra bit) */
+  uint64_t uop_end        : LOG_UOP_BUFF_DEPTH+1;
+  /*! \brief Iterations in the outer uop execution loop */
+  uint64_t iter_out       : LOOP_ITER_WIDTH;
+  /*! \brief Iterations in the inner uop execution loop */
+  uint64_t iter_in        : LOOP_ITER_WIDTH;
+  /*! \brief Outer loop accumulator memory destination index factor */
+  uint64_t dst_factor_out : LOG_ACC_BUFF_DEPTH;
+  /*! \brief Inner loop accumulator memory destination index factor */
+  uint64_t dst_factor_in  : LOG_ACC_BUFF_DEPTH;
+  /*! \brief Outer loop accumulator memory source index factor */
+  uint64_t src_factor_out : LOG_ACC_BUFF_DEPTH;
+  /*! \brief Inner loop accumulator memory source index factor */
+  uint64_t src_factor_in  : LOG_ACC_BUFF_DEPTH;
+  /*! \brief ALU opcode (one of the ALU_OPCODE_* constants) */
+  uint64_t alu_opcode     : ALU_OPCODE_BIT_WIDTH;
+  /*! \brief Set when the second operand is the immediate rather than acc_mem[src_idx] */
+  uint64_t use_imm        : 1;
+  /*! \brief Immediate value */
+  uint64_t imm            : ALUOP_IMM_BIT_WIDTH;
+} VTAAluInsn;
+
+/*! \brief VTA instruction converter: overlays the generic, load/store, GEMM and ALU views of one instruction */
+union VTAInsn {
+  /*! \brief VTA generic instruction */
+  VTAGenericInsn generic;
+  /*! \brief VTA load/store instruction */
+  VTAMemInsn mem;
+  /*! \brief VTA GEMM instruction */
+  VTAGemInsn gemm;
+  /*! \brief VTA ALU instruction */
+  VTAAluInsn alu;
+};
+
+/*! \brief VTA micro-op shared by GEMM and ALU instructions */
+typedef struct {
+  /*! \brief Initialize acc_mem at index dst_idx to 0 */
+  uint32_t reset_out  : 1;
+  /*! \brief Destination index (indexes accum buffer) */
+  uint32_t dst_idx    : LOG_ACC_BUFF_DEPTH;
+  /*! \brief Source index (indexes input buffer for GEMM or accum buffer for ALU) */
+  uint32_t src_idx    : LOG_ACC_BUFF_DEPTH;
+  /*! \brief Weight index (indexes weight buffer; not read by the ALU pseudo-code) */
+  uint32_t wgt_idx    : LOG_WGT_BUFF_DEPTH;
+} VTAUop;
+
+#endif // VTA_DEFINES_H_
--- a/vta/include/vta_pynq_driver.h
+++ b/vta/include/vta_pynq_driver.h
@ -0,0 +1,152 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file vta_pynq_driver.h
+ * \brief VTA driver for Pynq board.
+ */
+
+#ifndef VTA_PYNQ_DRIVER_H_
+#define VTA_PYNQ_DRIVER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include <assert.h>
+#include <fcntl.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <time.h>
+#include <unistd.h>
+
+#ifdef __arm__
+#include "libxlnk_cma.h"
+#else
+void* cma_alloc(size_t size, int cached);
+void cma_free(void* buf);
+uint32_t cma_get_phy_addr(void* buf);
+void xlnkFlushCache(void* buf, int size);
+void xlnkInvalidateCache(void* buf, int size);
+#endif
+
+/*! \brief VTA command handle */
+typedef void * VTAHandle;
+
+/*! \brief DMA command handle */
+typedef struct {
+  /*! \brief Register map to the AXI DMA control registers */
+  void *dma_register_map;
+  /*! \brief Transmit (mm2s) data descriptor */
+  void *mm2s_descriptor_register_map;
+  /*! \brief Receive (s2mm) data descriptor */
+  void *s2mm_descriptor_register_map;
+  /*! \brief Transmit (mm2s) data descriptor physical address */
+  uint32_t mm2s_descriptor_phy;
+  /*! \brief Receive (s2mm) data descriptor physical address */
+  uint32_t s2mm_descriptor_phy;
+  /*! \brief Descriptor size */
+  uint32_t descriptor_size;
+  /*! \brief Transaction count for tx (mm2s) channel */
+  uint32_t mm2s_count;
+  /*! \brief Transaction count for rx (s2mm) channel */
+  uint32_t s2mm_count;
+  /*! \brief Multi-channel mode enable */
+  int multichannel_en;
+} DMAHandle;
+
+/*! \brief partial bitstream status file path */
+#define BS_IS_PARTIAL "/sys/devices/soc0/amba/f8007000.devcfg/is_partial_bitstream"
+/*! \brief bitstream destination file path */
+#define BS_XDEVCFG "/dev/xdevcfg"
+
+/*! \brief Path to /dev/mem */
+#define DEV_MEM_PATH "/dev/mem"
+/*! \brief MMIO driver constant */
+#define MMIO_WORD_LENGTH 4
+/*! \brief MMIO driver constant */
+#define MMIO_WORD_MASK (~(MMIO_WORD_LENGTH - 1))
+
+/*! \brief VTA configuration register address range */
+#define VTA_RANGE 0x100
+/*! \brief VTA configuration register start value */
+#define VTA_START 0x1
+/*! \brief VTA configuration register auto-restart value */
+#define VTA_AUTORESTART 0x81
+/*! \brief VTA configuration register done value */
+#define VTA_DONE 0x1
+
+/*! \brief VTA fetch stage configuration register address
+*   from auto-generated XPAR_FETCH_0_S_AXI_CONTROL_BUS_BASEADDR define
+*   in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
+*/
+#define VTA_FETCH_ADDR    0x43C00000
+/*! \brief VTA compute stage configuration register address
+*   from auto-generated XPAR_COMPUTE_0_S_AXI_CONTROL_BUS_BASEADDR define
+*   in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
+*/
+#define VTA_COMPUTE_ADDR  0x43C10000
+/*! \brief VTA load stage configuration register address
+*   from auto-generated XPAR_LOAD_0_S_AXI_CONTROL_BUS_BASEADDR define
+*   in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
+*/
+#define VTA_LOAD_ADDR     0x43C20000
+/*! \brief VTA store stage configuration register address
+*   from auto-generated XPAR_STORE_0_S_AXI_CONTROL_BUS_BASEADDR define
+*   in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
+*/
+#define VTA_STORE_ADDR    0x43C30000
+
+/*! \brief Memory management constants with libxlnk_cma */
+#define CACHED 1
+/*! \brief Memory management constants with libxlnk_cma */
+#define NOT_CACHED 0
+
+/*! \brief log2 of SDS buffer size limit */
+#define LOG_MAX_XFER 22
+/*! \brief SDS buffer size limit */
+#define MAX_XFER (1<<LOG_MAX_XFER)
+
+/*!
+ * \brief Returns a memory map to FPGA configuration registers.
+ * \param addr The base physical address of the configuration registers.
+ * \param length The size of the memory mapped region in bytes.
+ * \return A pointer to the memory mapped region.
+ */
+void *MapRegister(unsigned addr, size_t length);
+
+/*!
+ * \brief Deletes the configuration register memory map.
+ * \param vta The memory mapped region.
+ * \param length The size of the memory mapped region in bytes.
+ */
+void UnmapRegister(void *vta, size_t length);
+
+/*!
+ * \brief Writes to a memory mapped configuration register.
+ * \param vta_base The handle to the memory mapped configuration registers.
+ * \param offset The offset of the register to write to.
+ * \param val The value to be written to the memory mapped register.
+ */
+void WriteMappedReg(VTAHandle vta_base, unsigned offset, unsigned val);
+
+/*!
+ * \brief Reads from the memory mapped configuration register.
+ * \param vta_base The handle to the memory mapped configuration registers.
+ * \param offset The offset of the register to read from.
+ * \return The value read from the memory mapped register.
+ */
+unsigned ReadMappedReg(VTAHandle vta_base, unsigned offset);
+
+/*!
+ * \brief Programming the bit stream on the FPGA.
+ * \param bitstream The path to the bit stream file.
+ */
+void ProgramVTA(const char* bitstream);
+
+#ifdef __cplusplus
+}
+#endif
+#endif  // VTA_PYNQ_DRIVER_H_
--- a/vta/include/vta_test_lib.h
+++ b/vta/include/vta_test_lib.h
@ -0,0 +1,300 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file vta_test_lib.h
+ * \brief Test library for the VTA design simulation and driver tests.
+ */
+
+#ifndef VTA_TESTLIB_H_
+#define VTA_TESTLIB_H_
+
+#include "vta_params.h"
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#ifdef NO_SIM
+
+#include "vta_pynq_driver.h"
+
+typedef uint64_t axi_T;
+typedef uint32_t uop_T;
+typedef int8_t wgt_T;
+typedef int8_t inp_T;
+typedef int32_t acc_T;
+
+uint64_t vta (
+  uint32_t insn_count,
+  VTAGenericInsn *insns,
+  VTAUop *uops,
+  inp_T *inputs,
+  wgt_T *weights,
+  acc_T *biases,
+  inp_T *outputs);
+
+#else //NO_SIM
+
+#include "vta.h"
+#include "vta_typedefs.h"
+
+#endif //NO_SIM
+
+/*!
+* \brief Returns opcode string.
+* \param opcode Opcode parameter (defined in vta_defines.h).
+* \param use_imm Boolean that indicates if the operation uses an immediate value.
+* \return The opcode string.
+*/
+const char* getOpcodeString(int opcode, bool use_imm);
+
+/*!
+* \brief Performs buffer data packing and tiling.
+* \param dst Pointer to the packed, and tiled destination 1D array (flattened).
+* \param src Pointer to the unpacked source 2D array.
+* \param y_size Number of rows.
+* \param x_size Number of columns.
+* \param y_block Inner tiling along row dimension.
+* \param x_block Inner tiling along column dimension.
+*/
+template <typename T, int T_WIDTH>
+void packBuffer(T *dst, T **src, int y_size, int x_size, int y_block, int x_block);
+
+/*!
+* \brief Performs buffer data unpacking.
+* \param dst Pointer to the unpacked destination 2D array.
+* \param src Pointer to the packed, and tiled source 1D array (flattened).
+* \param y_size Number of rows.
+* \param x_size Number of columns.
+* \param y_block Inner tiling along row dimension.
+* \param x_block Inner tiling along column dimension.
+*/
+template <typename T, int T_WIDTH>
+void unpackBuffer(T **dst, T *src, int y_size, int x_size, int y_block, int x_block);
+
+/*!
+* \brief Allocates and initializes a 2D array in the heap.
+* \param rows Number of rows.
+* \param cols Number of columns.
+* \return Pointer to the 2D array.
+*/
+template <typename T, int T_WIDTH>
+T ** allocInit2dArray(int rows, int cols);
+
+/*!
+* \brief Allocates a 2D array in the heap.
+* \param rows Number of rows.
+* \param cols Number of columns.
+* \return Pointer to the 2D array.
+*/
+template <typename T>
+T ** alloc2dArray(int rows, int cols);
+
+/*!
+* \brief Frees a 2D array.
+* \param array Pointer to the 2D array to be freed.
+* \param rows Number of rows.
+* \param cols Number of columns.
+*/
+template <typename T>
+void free2dArray(T **array, int rows, int cols);
+
+/*!
+* \brief Allocates a 3D array in the heap.
+* \param rows Number of rows (dim 0).
+* \param cols Number of columns (dim 1).
+* \param depth Depth of the array (dim 2).
+* \return Pointer to the 3D array.
+*/
+template <typename T>
+T *** alloc3dArray(int rows, int cols, int depth);
+
+/*!
+* \brief Frees a 3D array.
+* \param array Pointer to the 3D array.
+* \param rows Number of rows (dim 0).
+* \param cols Number of columns (dim 1).
+* \param depth Depth of the array (dim 2).
+*/
+template <typename T>
+void free3dArray(T *** array, int rows, int cols, int depth);
+
+/*!
+* \brief Performs memory allocation in a physically contiguous region of memory.
+* \param num_bytes Size of the buffer in bytes.
+* \return Pointer to the allocated buffer.
+*/
+void * allocBuffer(size_t num_bytes);
+
+/*!
+* \brief Frees buffer allocated in a physically contiguous region of memory.
+* \param buffer Pointer to the buffer to free.
+*/
+void freeBuffer(void * buffer);
+
+/*!
+* \brief Returns a VTA reset instruction on a 2D patch of the register file.
+* \param type On-chip memory target.
+* \param sram_offset Offset in SRAM.
+* \param y_size Number of rows to reset (y axis).
+* \param x_size Number of elements per row to reset (x axis).
+* \param x_stride Stride along the x axis.
+* \param pop_prev_dep Pop dependence from previous stage.
+* \param pop_next_dep Pop dependence from next stage.
+* \param push_prev_dep Push dependence to previous stage.
+* \param push_next_dep Push dependence to next stage.
+* \return A VTAGenericInsn for a reset op.
+*/
+VTAGenericInsn reset2DInsn(int type, int sram_offset, int y_size, int x_size, int x_stride,
+  int pop_prev_dep, int pop_next_dep, int push_prev_dep, int push_next_dep);
+
+/*!
+* \brief Returns a VTA 2D load or store instruction.
+* \param opcode Type of operation.
+* \param type On-chip memory target.
+* \param sram_offset Offset in SRAM.
+* \param dram_offset Offset in DRAM.
+* \param y_size Number of rows to load/store (y axis).
+* \param x_size Number of elements per row to load/store (x axis).
+* \param x_stride Stride along the x axis.
+* \param y_pad Padding along the y axis.
+* \param x_pad Padding along the x axis.
+* \param pop_prev_dep Pop dependence from previous stage.
+* \param pop_next_dep Pop dependence from next stage.
+* \param push_prev_dep Push dependence to previous stage.
+* \param push_next_dep Push dependence to next stage.
+* \return A VTAGenericInsn for a 2D load or store op.
+*/
+VTAGenericInsn get2DLoadStoreInsn(int opcode, int type, int sram_offset, int dram_offset,
+  int y_size, int x_size, int x_stride, int y_pad, int x_pad, int pop_prev_dep, int pop_next_dep,
+  int push_prev_dep, int push_next_dep);
+
+/*!
+* \brief Returns a VTA 1D load or store instruction.
+* \param opcode Type of operation.
+* \param type On-chip memory target.
+* \param sram_offset Offset in SRAM.
+* \param dram_offset Offset in DRAM.
+* \param size Number of elements to load/store.
+* \param pop_prev_dep Pop dependence from previous stage.
+* \param pop_next_dep Pop dependence from next stage.
+* \param push_prev_dep Push dependence to previous stage.
+* \param push_next_dep Push dependence to next stage.
+* \return A VTAGenericInsn for a 1D load or store op.
+*/
+VTAGenericInsn get1DLoadStoreInsn(int opcode, int type, int sram_offset, int dram_offset, int size,
+  int pop_prev_dep, int pop_next_dep, int push_prev_dep, int push_next_dep);
+
+/*!
+* \brief Returns a VTA matrix multiplication instruction of size (a, b) x (b, c).
+* \param uop_offset Offset of the micro-op in SRAM.
+* \param batch Batch size (a).
+* \param in_feat Input features (b).
+* \param out_feat Output features (c).
+* \param uop_compression Apply micro-op compression.
+* \param pop_prev_dep Pop dependence from previous stage.
+* \param pop_next_dep Pop dependence from next stage.
+* \param push_prev_dep Push dependence to previous stage.
+* \param push_next_dep Push dependence to next stage.
+* \return A VTAGenericInsn for a GEMM op.
+*/
+VTAGenericInsn getGEMMInsn(int uop_offset, int batch, int in_feat, int out_feat,
+  bool uop_compression, int pop_prev_dep, int pop_next_dep, int push_prev_dep,
+  int push_next_dep);
+
+/*!
+* \brief Returns a VTA ALU instruction for map type operation.
+* \param opcode Opcode of the ALU instruction.
+* \param use_imm Use immediate.
+* \param imm Immediate value (int16).
+* \param vector_size Vector size of the ALU operation.
+* \param uop_compression Apply micro-op compression.
+* \param pop_prev_dep Pop dependence from previous stage.
+* \param pop_next_dep Pop dependence from next stage.
+* \param push_prev_dep Push dependence to previous stage.
+* \param push_next_dep Push dependence to next stage.
+* \return A VTAGenericInsn for an ALU op.
+*/
+VTAGenericInsn getALUInsn(int opcode, bool use_imm, int imm, int vector_size, bool uop_compression,
+  int pop_prev_dep, int pop_next_dep, int push_prev_dep, int push_next_dep);
+
+/*!
+* \brief Returns a VTA finish instruction.
+* \param pop_prev Pop dependence from previous stage.
+* \param pop_next Pop dependence from next stage.
+* \return A VTAGenericInsn for a finish op.
+*/
+VTAGenericInsn getFinishInsn(bool pop_prev, bool pop_next);
+
+/*!
+* \brief Returns an allocated buffer of VTA micro-ops to implement a copy operation.
+* \param y_size Number of rows to load/store (y axis).
+* \param x_size Number of elements per row to load/store (x axis).
+* \param uop_compression Apply micro-op compression.
+* \return A VTAUop pointer to an allocated micro-op buffer.
+*/
+VTAUop * getCopyUops(int y_size, int x_size, int uop_compression);
+
+/*!
+* \brief Returns an allocated buffer of VTA micro-ops to implement a matrix multiplication
+*   of size (a, b) x (b, c).
+* \param batch Batch size (a).
+* \param in_feat Input features (b).
+* \param out_feat Output features (c).
+* \param uop_compression Apply micro-op compression.
+* \param multi_threaded Generate micro-ops for two virtual execution threads.
+* \return A VTAUop pointer to an allocated micro-op buffer.
+*/
+VTAUop * getGEMMUops(int batch, int in_feat, int out_feat, bool uop_compression,
+  bool multi_threaded);
+
+/*!
+* \brief Returns an allocated buffer of VTA micro-ops to implement a vector-vector map operation.
+* \param vector_size Vector size.
+* \param uop_compression Apply micro-op compression.
+* \return A VTAUop pointer to an allocated micro-op buffer.
+*/
+VTAUop * getMapALUUops(int vector_size, bool uop_compression);
+
+/*!
+* \brief Print out parameters of the VTA design (for debugging purposes).
+*/
+void printParameters();
+
+/*!
+* \brief Print out instruction information (for debugging purposes).
+* \param num_insn Number of instructions.
+* \param insns Pointer to the instruction buffer.
+*/
+void printInstruction(int num_insn, VTAGenericInsn *insns);
+
+/*!
+* \brief Print out micro-op information (for debugging purposes).
+* \param num_uop Number of micro-ops.
+* \param uops Pointer to the micro-op buffer.
+*/
+void printMicroOp(int num_uop, VTAUop *uops);
+
+/*!
+* \brief VTA ALU unit test.
+* \param opcode The ALU opcode.
+* \param use_imm Use immediate.
+* \param batch Batch size.
+* \param vector_size Vector length of the ALU operation.
+* \param uop_compression Apply micro-op compression.
+* \return Number of errors from the test run.
+*/
+int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_compression);
+
+/*!
+* \brief VTA blocked GEMM unit test.
+* \param batch Batch size.
+* \param channels Channel width.
+* \param block Blocking size.
+* \param uop_compression Apply micro-op compression.
+* \param virtual_threads Number of virtual threads.
+* \return Number of errors from the test run.
+*/
+int blocked_gemm_test(int batch, int channels, int block, bool uop_compression,
+  int virtual_threads);
+
+#endif  // VTA_TESTLIB_H_
--- a/vta/src/driver/pynq/vta_pynq_driver.c
+++ b/vta/src/driver/pynq/vta_pynq_driver.c
@ -0,0 +1,77 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file vta_pynq_driver.c
+ * \brief VTA driver for Pynq board.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include "vta_pynq_driver.h"
+#ifdef __cplusplus
+}
+#endif
+
+/*!
+ * \brief Returns a memory map to FPGA configuration registers.
+ *
+ * The mapping starts at the page boundary at or below \p addr and is extended by
+ * the distance between that boundary and \p addr, so the returned pointer refers
+ * to the start of the page that contains \p addr.
+ * \param addr The base physical address of the configuration registers.
+ * \param length The size of the memory mapped region in bytes.
+ * \return The pointer returned by mmap, or MAP_FAILED if DEV_MEM_PATH cannot be
+ *   opened or the mapping itself fails.
+ */
+void *MapRegister(uint32_t addr, size_t length) {
+
+  // Align the base address with the pages
+  uint32_t virt_base = addr & ~(getpagesize() - 1);
+  // Offset of the requested address from the page-aligned base
+  uint32_t virt_offset = addr - virt_base;
+  // Open the memory device; open() reports failure with -1, so keep the result signed
+  int mmap_file = open(DEV_MEM_PATH, O_RDWR|O_SYNC);
+  if (mmap_file < 0) {
+    // Same value callers previously received when mmap was handed a bad descriptor
+    return MAP_FAILED;
+  }
+
+  void *mapped = mmap(NULL, (length+virt_offset), PROT_READ|PROT_WRITE, MAP_SHARED,
+                      mmap_file, virt_base);
+  // The mapping remains valid after the descriptor is closed, so release it here
+  // instead of leaking one descriptor per call
+  close(mmap_file);
+
+  return mapped;
+}
+
+/*!
+ * \brief Deletes the configuration register memory map.
+ * \param vta The memory mapped region.
+ * \param length The size of the memory mapped region in bytes.
+ */
+void UnmapRegister(void *vta, size_t length) {
+  // Release the mapping; a non-zero result from munmap trips the assertion
+  const int rc = munmap(vta, length);
+  assert(rc == 0);
+}
+
+/*!
+ * \brief Writes a 32-bit value to a memory mapped configuration register.
+ * \param base_addr The handle to the memory mapped configuration registers.
+ * \param offset The byte offset of the register to write to.
+ * \param val The value to be written to the memory mapped register.
+ */
+void WriteMappedReg(void* base_addr, uint32_t offset, uint32_t val) {
+  // Byte-wise pointer arithmetic first, then a single volatile 32-bit store
+  volatile uint32_t *reg = (volatile uint32_t *) ((char *) base_addr + offset);
+  *reg = val;
+}
+
+/*!
+ * \brief Reads a 32-bit value from a memory mapped configuration register.
+ * \param base_addr The handle to the memory mapped configuration registers.
+ * \param offset The byte offset of the register to read from.
+ * \return The value read from the memory mapped register.
+ */
+uint32_t ReadMappedReg(void* base_addr, uint32_t offset) {
+  // Byte-wise pointer arithmetic first, then a single volatile 32-bit load
+  volatile uint32_t *reg = (volatile uint32_t *) ((char *) base_addr + offset);
+  return *reg;
+}
+
+/*!
+ * \brief Programs the bit stream onto the FPGA.
+ *
+ * Writes '0' to BS_IS_PARTIAL, then copies the file at \p bitstream byte by byte
+ * into BS_XDEVCFG. If any of the three files cannot be opened, a message is
+ * printed and the process exits with status 1.
+ * \param bitstream The path to the bit stream file.
+ */
+void ProgramVTA(const char* bitstream) {
+
+    int elem;
+    FILE *src, *dst, *partial;
+
+    partial = fopen(BS_IS_PARTIAL, "w");
+    if (partial == NULL) {
+        // Nothing to close: handing a null stream to fclose is undefined behaviour
+        printf("Cannot open partial config file %s\n", BS_IS_PARTIAL);
+        exit(1);
+    }
+    fputc('0', partial);
+    fclose(partial);
+
+    src = fopen(bitstream, "rb");
+    if (src == NULL) {
+        printf("Cannot open bitstream %s\n", bitstream);
+        exit(1);
+    }
+
+    dst = fopen(BS_XDEVCFG, "wb");
+    if (dst == NULL) {
+        printf("Cannot open device file %s\n", BS_XDEVCFG);
+        // dst is null and must not be closed; release the bitstream opened above
+        fclose(src);
+        exit(1);
+    }
+
+    // Copy the bitstream to the device node one byte at a time until end of file
+    elem = fgetc(src);
+    while (elem != EOF) {
+        fputc(elem, dst);
+        elem = fgetc(src);
+    }
+
+    fclose(src);
+    fclose(dst);
+
+}
--- a/vta/src/hardware/hls/vta.cc
+++ b/vta/src/hardware/hls/vta.cc
@ -0,0 +1,789 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file vta.cc
+ * \brief VTA HLS design.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "vta.h"
+/*!
+ * \brief Instruction fetch stage: reads insn_count instructions from insns and
+ *   routes each one to the load, gemm (compute) or store instruction queue.
+ * \param insn_count Number of instructions to fetch.
+ * \param insns Instruction buffer, read in order starting at index 0.
+ * \param load_queue Receives LOAD instructions that target MEM_ID_INP or MEM_ID_WGT.
+ * \param gemm_queue Receives every instruction not sent to the other two queues.
+ * \param store_queue Receives STORE instructions.
+ */
+void fetch (
+  uint32_t insn_count,
+  volatile insn_T *insns,
+  hls::stream<insn_T> &load_queue,
+  hls::stream<insn_T> &gemm_queue,
+  hls::stream<insn_T> &store_queue) {
+#pragma HLS INTERFACE s_axilite port=insn_count bundle=CONTROL_BUS
+#pragma HLS INTERFACE m_axi port=insns offset=slave bundle=ins_port
+#pragma HLS INTERFACE axis port=load_queue
+#pragma HLS INTERFACE axis port=gemm_queue
+#pragma HLS INTERFACE axis port=store_queue
+#pragma HLS INTERFACE s_axilite port=return bundle=CONTROL_BUS
+
+  INSN_DECODE: for (int pc = 0; pc < insn_count; pc ++) {
+#pragma HLS PIPELINE II=1
+    // Read instruction fields
+    insn_T insn = insns[pc];
+    // Do some partial decoding: only the opcode and memory type are needed for routing
+    opcode_T opcode = insn.range(INSN_MEM_0_1, INSN_MEM_0_0);
+    memop_id_T memory_type = insn.range(INSN_MEM_5_1, INSN_MEM_5_0);
+    // Push to appropriate instruction queue
+    if (opcode == OPCODE_STORE) {
+      store_queue.write(insn);
+    } else if (opcode == OPCODE_LOAD &&
+               (memory_type == MEM_ID_INP || memory_type == MEM_ID_WGT)) {
+      load_queue.write(insn);
+    } else {  // everything else, including LOADs of other memory types
+      gemm_queue.write(insn);
+    }
+  }
+
+}
+/*!
+ * \brief Load stage: pops one instruction from load_queue and copies a 2D patch
+ *   from DRAM into the input or weight buffer, then writes zeros to the padding
+ *   entries around that patch.
+ * \param inputs DRAM source used when the instruction's memory type is MEM_ID_INP.
+ * \param weights DRAM source used for any other memory type.
+ * \param load_queue Queue the instruction is read from (one read per call).
+ * \param g2l_dep_queue Dependence queue read once if the pop_next flag is set.
+ * \param l2g_dep_queue Dependence queue written once if the push_next flag is set.
+ * \param inp_mem Input buffer written for MEM_ID_INP instructions.
+ * \param wgt_mem Weight buffer written otherwise.
+ */
+void load (
+  volatile inp_vec_T *inputs,
+  volatile wgt_vec_T *weights,
+  hls::stream<insn_T> &load_queue,
+  hls::stream<bool> &g2l_dep_queue,
+  hls::stream<bool> &l2g_dep_queue,
+  inp_vec_T inp_mem[INP_BUFF_DEPTH][BATCH],
+  wgt_vec_T wgt_mem[WGT_BUFF_DEPTH][BLOCK_OUT]
+  ) {
+#pragma HLS INTERFACE m_axi port=weights offset=slave bundle=data_port
+#pragma HLS INTERFACE m_axi port=inputs offset=slave bundle=data_port
+#pragma HLS INTERFACE axis port=load_queue
+#pragma HLS INTERFACE axis port=g2l_dep_queue
+#pragma HLS INTERFACE axis port=l2g_dep_queue
+#pragma HLS INTERFACE bram port=wgt_mem
+#pragma HLS INTERFACE bram port=inp_mem
+#pragma HLS INTERFACE s_axilite port=return bundle=CONTROL_BUS
+// #pragma HLS ARRAY_PARTITION variable=inp_mem complete dim=2
+
+  // Pop load instruction
+  insn_T insn = load_queue.read();
+
+  // Decode instruction
+  bool pop_prev_dependence = insn[INSN_MEM_1];  // decoded but not used in this function
+  bool pop_next_dependence = insn[INSN_MEM_2];
+  bool push_prev_dependence = insn[INSN_MEM_3];  // decoded but not used in this function
+  bool push_next_dependence = insn[INSN_MEM_4];
+  memop_id_T memory_type = insn.range(INSN_MEM_5_1, INSN_MEM_5_0);
+  memop_sram_T sram_base = insn.range(INSN_MEM_6_1, INSN_MEM_6_0);
+  memop_dram_T dram_base = insn.range(INSN_MEM_7_1, INSN_MEM_7_0);
+  memop_size_T y_size = insn.range(INSN_MEM_8_1, INSN_MEM_8_0);
+  memop_size_T x_size = insn.range(INSN_MEM_9_1, INSN_MEM_9_0);
+  memop_stride_T x_stride = insn.range(INSN_MEM_A_1, INSN_MEM_A_0);
+  memop_pad_T y_pad_0 = insn.range(INSN_MEM_B_1, INSN_MEM_B_0);
+  memop_pad_T y_pad_1 = insn.range(INSN_MEM_C_1, INSN_MEM_C_0);
+  memop_pad_T x_pad_0 = insn.range(INSN_MEM_D_1, INSN_MEM_D_0);
+  memop_pad_T x_pad_1 = insn.range(INSN_MEM_E_1, INSN_MEM_E_0);
+
+  // Pop dependence token if instructed
+  if (pop_next_dependence) {
+    g2l_dep_queue.read();
+  }
+
+  // Initialize indices
+  memop_sram_T sram_idx = sram_base;
+  memop_dram_T dram_idx = dram_base;
+
+  // Pre-compute dimensions, and offsets (sizes include the padding on both sides)
+  memop_size_T y_size_total = y_pad_0 + y_size + y_pad_1;
+  memop_size_T x_size_total = x_pad_0 + x_size + x_pad_1;
+  memop_sram_T y_offset = x_size_total * y_pad_0;
+#pragma HLS RESOURCE variable=y_offset core=Mul_LUT
+
+  // Skip padding along y dimension
+  sram_idx += y_offset;
+
+  // Perform data transfer from DRAM, one row of x_size entries per iteration
+  for (int y = 0; y < y_size; y ++) {
+#pragma HLS PIPELINE rewind
+    // Skip padding along x dimension
+    sram_idx += x_pad_0;
+    // Perform data transfer
+    if (memory_type == MEM_ID_INP) {
+      memcpy(
+        &inp_mem[sram_idx][0],
+        (const inp_vec_T*) &inputs[dram_idx * BATCH],
+        x_size * INP_ELEM_BYTES
+      );
+    } else {
+      memcpy(
+        &wgt_mem[sram_idx][0],
+        (const wgt_vec_T*) &weights[dram_idx * BLOCK_OUT],
+        x_size * WGT_ELEM_BYTES
+      );
+    }
+    sram_idx += x_size;
+    dram_idx += x_stride;
+    // Skip padding along x dimension
+    sram_idx += x_pad_1;
+  }
+
+  // Reset SRAM index
+  sram_idx = sram_base;
+  // Pad x/y edges with zeros: second pass over the padded tile that writes only padding entries
+  for (int y = 0; y < y_size_total; y ++) {
+    if (y < y_pad_0 || y >= y_pad_0 + y_size) {  // row lies entirely in the y padding
+      for (int x = 0; x < x_size_total; x ++) {
+#pragma HLS PIPELINE II=1 rewind
+        if (memory_type == MEM_ID_INP) {
+          for (int i = 0; i < BATCH; i ++) {
+            inp_mem[sram_idx][i] = 0;
+          }
+        } else {
+          for (int i = 0; i < BLOCK_OUT; i ++) {
+            wgt_mem[sram_idx][i] = 0;
+          }
+        }
+        sram_idx ++;
+      }
+    } else {  // data row: zero the left padding, step over the data, zero the right padding
+      for (int x = 0; x < x_pad_0; x ++) {
+#pragma HLS PIPELINE II=1 rewind
+        if (memory_type == MEM_ID_INP) {
+          for (int i = 0; i < BATCH; i ++) {
+            inp_mem[sram_idx][i] = 0;
+          }
+        } else {
+          for (int i = 0; i < BLOCK_OUT; i ++) {
+            wgt_mem[sram_idx][i] = 0;
+          }
+        }
+        sram_idx ++;
+      }
+      sram_idx += x_size;  // leave the entries copied by the transfer loop untouched
+      for (int x = 0; x < x_pad_1; x ++) {
+#pragma HLS PIPELINE II=1 rewind
+        if (memory_type == MEM_ID_INP) {
+          for (int i = 0; i < BATCH; i ++) {
+            inp_mem[sram_idx][i] = 0;
+          }
+        } else {
+          for (int i = 0; i < BLOCK_OUT; i ++) {
+            wgt_mem[sram_idx][i] = 0;
+          }
+        }
+        sram_idx ++;
+      }
+
+    }
+  }
+
+  // Push dependence token if instructed
+  if (push_next_dependence) {
+    l2g_dep_queue.write(1);
+  }
+}
+/*!
+ * \brief Compute stage: pops one instruction from gemm_queue and executes it.
+ *
+ * FINISH sets done to 1. LOAD/STORE opcodes copy micro-ops into the local
+ * micro-op buffer (memory type MEM_ID_UOP) or biases into the local accumulator
+ * buffer (any other memory type). GEMM and ALU opcodes run the micro-ops in
+ * [uop_bgn, uop_end) inside a two-level loop nest, updating the accumulator
+ * buffer and writing the low INP_WIDTH bits of each result to out_mem.
+ * The micro-op and accumulator buffers are static locals of this function.
+ * \param done Written with 1 on FINISH and with 0 on LOAD/STORE/GEMM/ALU.
+ * \param uops DRAM micro-op buffer, source of MEM_ID_UOP loads.
+ * \param biases DRAM buffer, source of the other loads handled here.
+ * \param gemm_queue Queue the instruction is read from (one read per call).
+ * \param l2g_dep_queue Dependence queue read once if the pop_prev flag is set.
+ * \param s2g_dep_queue Dependence queue read once if the pop_next flag is set.
+ * \param g2l_dep_queue Dependence queue written once if the push_prev flag is set.
+ * \param g2s_dep_queue Dependence queue written once if the push_next flag is set.
+ * \param inp_mem Input buffer read by GEMM.
+ * \param wgt_mem Weight buffer read by GEMM.
+ * \param out_mem Output buffer written by GEMM and ALU.
+ */
+void compute (
+  volatile uint32_t &done,
+  volatile uop_T *uops,
+  volatile acc_vec_T *biases,
+  hls::stream<insn_T> &gemm_queue,
+  hls::stream<bool> &l2g_dep_queue,
+  hls::stream<bool> &s2g_dep_queue,
+  hls::stream<bool> &g2l_dep_queue,
+  hls::stream<bool> &g2s_dep_queue,
+  out_vec_T inp_mem[INP_BUFF_DEPTH][BATCH],
+  wgt_vec_T wgt_mem[WGT_BUFF_DEPTH][BLOCK_OUT],
+  out_vec_T out_mem[ACC_BUFF_DEPTH][BATCH]
+  ) {
+#pragma HLS INTERFACE s_axilite port=done bundle=CONTROL_BUS
+#pragma HLS INTERFACE m_axi port=uops offset=slave bundle=uop_port
+#pragma HLS INTERFACE m_axi port=biases offset=slave bundle=data_port
+#pragma HLS INTERFACE axis port=gemm_queue
+#pragma HLS INTERFACE axis port=l2g_dep_queue
+#pragma HLS INTERFACE axis port=s2g_dep_queue
+#pragma HLS INTERFACE axis port=g2l_dep_queue
+#pragma HLS INTERFACE axis port=g2s_dep_queue
+#pragma HLS INTERFACE bram port=inp_mem
+#pragma HLS INTERFACE bram port=wgt_mem
+#pragma HLS INTERFACE bram port=out_mem
+#pragma HLS INTERFACE s_axilite port=return bundle=CONTROL_BUS
+// #pragma HLS ARRAY_PARTITION variable=inp_mem complete dim=2
+// #pragma HLS ARRAY_PARTITION variable=out_mem complete dim=2
+// This is necessary to connect the SRAM to the load module
+#pragma HLS RESOURCE variable=wgt_mem core=RAM_1P
+
+  // Micro-op storage
+  static uop_T uop_mem[UOP_BUFF_DEPTH];
+
+  // Accumulator storage
+  static acc_vec_T acc_mem[ACC_BUFF_DEPTH][BATCH];
+#pragma HLS ARRAY_PARTITION variable=acc_mem complete dim=2
+
+  // Pop GEMM instruction
+  insn_T insn = gemm_queue.read();
+
+  // Decode
+  opcode_T opcode = insn.range(INSN_MEM_0_1, INSN_MEM_0_0);
+  bool pop_prev_dependence = insn[INSN_MEM_1];
+  bool pop_next_dependence = insn[INSN_MEM_2];
+  bool push_prev_dependence = insn[INSN_MEM_3];
+  bool push_next_dependence = insn[INSN_MEM_4];
+
+  // Pop dependence token if instructed
+  if (pop_prev_dependence) {
+    l2g_dep_queue.read();
+  }
+  if (pop_next_dependence) {
+    s2g_dep_queue.read();
+  }
+
+  // Perform action based on opcode
+  if (opcode == OPCODE_FINISH) {
+
+    // Set done flag if we reach a FINISH instruction
+    done = 1;
+
+  } else if (opcode == OPCODE_LOAD || opcode == OPCODE_STORE) {
+
+    // Set done value
+    done = 0;
+
+    // Decode instruction
+    memop_id_T memory_type = insn.range(INSN_MEM_5_1, INSN_MEM_5_0);
+    memop_sram_T sram_base = insn.range(INSN_MEM_6_1, INSN_MEM_6_0);
+    memop_dram_T dram_base = insn.range(INSN_MEM_7_1, INSN_MEM_7_0);
+    memop_size_T y_size = insn.range(INSN_MEM_8_1, INSN_MEM_8_0);
+    memop_size_T x_size = insn.range(INSN_MEM_9_1, INSN_MEM_9_0);
+    memop_stride_T x_stride = insn.range(INSN_MEM_A_1, INSN_MEM_A_0);
+    memop_pad_T y_pad_0 = insn.range(INSN_MEM_B_1, INSN_MEM_B_0);
+    memop_pad_T y_pad_1 = insn.range(INSN_MEM_C_1, INSN_MEM_C_0);
+    memop_pad_T x_pad_0 = insn.range(INSN_MEM_D_1, INSN_MEM_D_0);
+    memop_pad_T x_pad_1 = insn.range(INSN_MEM_E_1, INSN_MEM_E_0);
+
+    // Initialize indices
+    memop_sram_T sram_idx = sram_base;
+    memop_dram_T dram_idx = dram_base;
+
+    // Pre-compute dimensions, and offsets
+    memop_size_T y_size_total = y_pad_0 + y_size + y_pad_1;  // not referenced again in this function
+    memop_size_T x_size_total = x_pad_0 + x_size + x_pad_1;
+    memop_sram_T y_offset = x_size_total * y_pad_0;
+#pragma HLS RESOURCE variable=y_offset core=Mul_LUT
+
+    if (memory_type == MEM_ID_UOP) {
+      // Perform data transfer: a single flat copy of x_size micro-ops
+      memcpy(
+        &uop_mem[sram_base],
+        (const uop_T*) &uops[dram_base],
+        x_size * sizeof(uop_T)
+      );
+    } else {
+      // Skip vertical padding
+      sram_idx += y_offset;
+      // Perform data transfer from DRAM; padding entries are skipped here, not written
+      for (int y = 0; y < y_size; y ++) {
+#pragma HLS PIPELINE rewind
+        // Skip padding along x dimension
+        sram_idx += x_pad_0;
+        // Perform data transfer
+        memcpy(
+            &acc_mem[sram_idx][0],
+            (const acc_vec_T*) &biases[dram_idx * BATCH],
+            x_size*ACC_ELEM_BYTES
+        );
+        sram_idx += x_size;
+        dram_idx += x_stride;
+        // Skip padding along x dimension
+        sram_idx += x_pad_1;
+      }
+    }
+
+  } else if (opcode == OPCODE_GEMM || opcode == OPCODE_ALU) {
+
+    // Set done value
+    done = 0;
+
+    // Decode
+    uop_idx_T uop_bgn = insn.range(INSN_GEM_5_1, INSN_GEM_5_0);
+    uop_idx_T uop_end = insn.range(INSN_GEM_6_1, INSN_GEM_6_0);
+    loop_T iter_out  = insn.range(INSN_GEM_7_1, INSN_GEM_7_0);
+    loop_T iter_in  = insn.range(INSN_GEM_8_1, INSN_GEM_8_0);
+    acc_idx_T dst_factor_out = insn.range(INSN_GEM_9_1, INSN_GEM_9_0);
+    acc_idx_T dst_factor_in = insn.range(INSN_GEM_A_1, INSN_GEM_A_0);
+    inp_idx_T src_factor_out = insn.range(INSN_GEM_B_1, INSN_GEM_B_0);
+    inp_idx_T src_factor_in = insn.range(INSN_GEM_C_1, INSN_GEM_C_0);
+
+    // GEMM-specific fields
+    wgt_idx_T wgt_factor_out = insn.range(INSN_GEM_D_1, INSN_GEM_D_0);
+    wgt_idx_T wgt_factor_in = insn.range(INSN_GEM_E_1, INSN_GEM_E_0);
+
+    // ALU-specific field
+    aluop_opcode_T alu_opcode = insn.range(INSN_ALU_D_1, INSN_ALU_D_0);
+    bool use_imm = insn[INSN_ALU_E];
+    aluop_imm_T imm = insn.range(INSN_ALU_F_1, INSN_ALU_F_0);
+
+    // Offsets accumulated by the outer loop; the inner loop starts from these
+    acc_idx_T dst_offset_out = 0;
+    inp_idx_T src_offset_out = 0;
+    wgt_idx_T wgt_offset_out = 0;
+
+    // Outer Loop
+    EXE_OUT_LOOP: for (int it_out = 0; it_out < iter_out; it_out ++) {
+#pragma HLS DEPENDENCE variable=acc_mem inter false
+
+      acc_idx_T dst_offset_in = dst_offset_out;
+      inp_idx_T src_offset_in = src_offset_out;
+      wgt_idx_T wgt_offset_in = wgt_offset_out;
+
+      // Inner Loop
+      EXE_IN_LOOP: for (int it_in = 0; it_in < iter_in; it_in ++) {
+
+        // Perform appropriate computation based on opcode
+        if (opcode == OPCODE_GEMM) {
+
+          // Iterate over micro op
+          READ_GEMM_UOP: for (int upc = uop_bgn; upc < uop_end; upc ++) {
+#pragma HLS PIPELINE II=1 rewind
+
+            // Read micro-op fields
+            uop_T uop = uop_mem[upc];
+
+            /* Decode indices.
+             * NOTE(review): src_idx indexes inp_mem below but is declared acc_idx_T,
+             * while src_offset_in is inp_idx_T - confirm the two widths agree. */
+            bool reset_out = uop[UOP_GEM_0];
+            acc_idx_T dst_idx =
+              uop.range(UOP_GEM_1_1, UOP_GEM_1_0) + dst_offset_in;
+            acc_idx_T src_idx =
+              uop.range(UOP_GEM_2_1, UOP_GEM_2_0) + src_offset_in;
+            wgt_idx_T wgt_idx =
+              uop.range(UOP_GEM_3_1, UOP_GEM_3_0) + wgt_offset_in;
+
+            // Read weight matrix
+            wgt_vec_T w_matrix[BLOCK_OUT];
+            for (int i = 0; i < BLOCK_OUT; i ++) {
+              w_matrix[i] = wgt_mem[wgt_idx][i];
+            }
+            // Read input matrix and accum matrix
+            acc_vec_T o_matrix[BATCH];
+            out_vec_T i_matrix[BATCH];
+            for (int i = 0; i < BATCH; i ++) {
+              o_matrix[i] = acc_mem[dst_idx][i];
+              i_matrix[i] = inp_mem[src_idx][i];
+            }
+            // Result matrices
+            acc_vec_T acc_mem_val[BATCH];
+            out_vec_T st_buf_val[BATCH];
+
+            // Inner GEMM loop
+            for (int i = 0; i < BATCH; i ++) {
+              for (int b = 0; b < BLOCK_OUT; b ++) {
+                // Initialize the accumulator values
+                acc_T accum =
+                  o_matrix[i].range((b + 1) * ACC_WIDTH - 1, b * ACC_WIDTH);
+                // Dot product sum
+                sum_T tmp = 0;
+                // Inner matrix multiplication loop (input channel/feature)
+                for (int k=0; k<BLOCK_IN; k++) {
+                  wgt_T w_elem =
+                    w_matrix[b].range((k + 1) * WGT_WIDTH - 1, k * WGT_WIDTH);
+                  inp_T i_elem =
+                    i_matrix[i].range((k + 1) * INP_WIDTH - 1, k * INP_WIDTH);
+                  mul_T prod = i_elem * w_elem;
+                  tmp += (sum_T) prod;
+                }
+                // Update summation
+                accum += (acc_T) tmp;
+                // Update result vector: reset_out zeroes the accumulator entry only;
+                // the store buffer still receives the low bits of accum
+                acc_mem_val[i].range((b + 1) * ACC_WIDTH - 1, b * ACC_WIDTH) =
+                  reset_out ? (acc_T) 0 : accum;
+                st_buf_val[i].range((b + 1) * INP_WIDTH - 1, b * INP_WIDTH) =
+                  (inp_T) accum.range(INP_WIDTH - 1, 0);
+              }
+              // Write to buffers
+              acc_mem[dst_idx][i] = acc_mem_val[i];
+              out_mem[dst_idx][i] = st_buf_val[i];
+            }
+          }
+
+        } else if (opcode == OPCODE_ALU) {
+
+          // Iterate over micro op
+          READ_ALU_UOP: for (int upc = uop_bgn; upc < uop_end; upc ++) {
+
+            // Read micro-op fields
+            uop_T uop = uop_mem[upc];
+
+            // Decode (reset_out is read but not used by the ALU path)
+            bool reset_out = uop[UOP_ALU_0];
+            acc_idx_T dst_idx =
+              uop.range(UOP_ALU_1_1, UOP_ALU_1_0) + dst_offset_in;
+            acc_idx_T src_idx =
+              uop.range(UOP_ALU_2_1, UOP_ALU_2_0) + src_offset_in;
+
+            // Read both operands from the accumulator buffer
+            acc_vec_T dst_matrix[BATCH];
+            acc_vec_T src_matrix[BATCH];
+            for (int i = 0; i < BATCH; i ++) {
+#pragma HLS UNROLL complete
+              dst_matrix[i] = acc_mem[dst_idx][i];
+              src_matrix[i] = acc_mem[src_idx][i];
+            }
+
+            // Result matrices: every operation is computed, one is selected for write-back
+            acc_vec_T cmp_res[BATCH];
+            acc_vec_T add_res[BATCH];
+            acc_vec_T shr_res[BATCH];
+            out_vec_T short_cmp_res[BATCH];
+            out_vec_T short_add_res[BATCH];
+            out_vec_T short_shr_res[BATCH];
+
+            // Perform ALU op over matrix elements
+            for (int i = 0; i < BATCH; i ++) {
+#pragma HLS PIPELINE II=1 rewind
+              // Results vector
+              acc_vec_T res_vec = 0;
+              for (int b = 0; b < BLOCK_OUT; b ++) {
+                // Read in operands: the second operand is the immediate when use_imm is set
+                acc_T src_0 =
+                  dst_matrix[i].range((b + 1) * ACC_WIDTH - 1, b * ACC_WIDTH);
+                acc_T src_1 =
+                  use_imm ?
+                    (acc_T) imm :
+                    src_matrix[i].range((b + 1) * ACC_WIDTH - 1, b * ACC_WIDTH);
+                // Compute Min/Max: the smaller operand for ALU_OPCODE_MIN, the larger otherwise
+                acc_T mix_val =
+                  src_0 < src_1 ?
+                    (alu_opcode == ALU_OPCODE_MIN ? src_0 : src_1) :
+                    (alu_opcode == ALU_OPCODE_MIN ? src_1 : src_0);
+                cmp_res[i].range((b + 1) * ACC_WIDTH - 1, b * ACC_WIDTH) =
+                  mix_val;
+                short_cmp_res[i].range((b + 1) * INP_WIDTH - 1, b * INP_WIDTH) =
+                  (inp_T) mix_val.range(INP_WIDTH - 1, 0);
+                // Compute Sum
+                acc_T add_val =
+                  src_0.range(ACC_WIDTH - 1, 0) + src_1.range(ACC_WIDTH - 1, 0);
+                add_res[i].range((b + 1) * ACC_WIDTH - 1, b * ACC_WIDTH) =
+                  add_val;
+                short_add_res[i].range((b + 1) * INP_WIDTH - 1, b * INP_WIDTH) =
+                  (inp_T) add_val.range(INP_WIDTH - 1, 0);
+                // Compute Shift: the amount is the low LOG_ACC_WIDTH bits of the second operand
+                acc_T shr_val =
+                  src_0 >> (aluop_sh_imm_T) src_1.range(LOG_ACC_WIDTH - 1, 0);
+                shr_res[i].range((b + 1) * ACC_WIDTH - 1, b * ACC_WIDTH) =
+                  shr_val;
+                short_shr_res[i].range((b + 1) * INP_WIDTH - 1, b * INP_WIDTH) =
+                  (inp_T) shr_val.range(INP_WIDTH-1, 0);
+              }
+
+              // Store to accum memory/store buffer; any other ALU opcode writes nothing
+              if (alu_opcode == ALU_OPCODE_MIN ||
+                  alu_opcode == ALU_OPCODE_MAX) {
+                acc_mem[dst_idx][i] = cmp_res[i];
+                out_mem[dst_idx][i] = short_cmp_res[i];
+              } else if (alu_opcode==ALU_OPCODE_ADD) {
+                acc_mem[dst_idx][i] = add_res[i];
+                out_mem[dst_idx][i] = short_add_res[i];
+              } else if (alu_opcode==ALU_OPCODE_SHR) {
+                acc_mem[dst_idx][i] = shr_res[i];
+                out_mem[dst_idx][i] = short_shr_res[i];
+              }
+            }
+          }
+        }
+
+        // Update offsets
+        dst_offset_in += dst_factor_in;
+        src_offset_in += src_factor_in;
+        wgt_offset_in += wgt_factor_in;
+      }
+
+      // Update offsets
+      dst_offset_out += dst_factor_out;
+      src_offset_out += src_factor_out;
+      wgt_offset_out += wgt_factor_out;
+    }
+  }
+
+  // Push dependence token if instructed
+  if (push_prev_dependence) {
+    g2l_dep_queue.write(1);
+  }
+  if (push_next_dependence) {
+    g2s_dep_queue.write(1);
+  }
+
+}
+/*!
+ * \brief Store stage: pops one instruction from store_queue and copies a 2D
+ *   patch from the output buffer to DRAM, skipping the padding entries.
+ * \param outputs DRAM destination buffer.
+ * \param store_queue Queue the instruction is read from (one read per call).
+ * \param g2s_dep_queue Dependence queue read once if the pop_prev flag is set.
+ * \param s2g_dep_queue Dependence queue written once if the push_prev flag is set.
+ * \param out_mem Output buffer the data is read from.
+ */
+void store (
+  volatile out_vec_T *outputs,
+  hls::stream<insn_T> &store_queue,
+  hls::stream<bool> &g2s_dep_queue,
+  hls::stream<bool> &s2g_dep_queue,
+  out_vec_T out_mem[ACC_BUFF_DEPTH][BATCH]
+  ) {
+#pragma HLS INTERFACE m_axi port=outputs offset=slave bundle=data_port
+#pragma HLS INTERFACE axis port=store_queue
+#pragma HLS INTERFACE axis port=g2s_dep_queue
+#pragma HLS INTERFACE axis port=s2g_dep_queue
+#pragma HLS INTERFACE bram port=out_mem
+#pragma HLS INTERFACE s_axilite port=return bundle=CONTROL_BUS
+// #pragma HLS ARRAY_PARTITION variable=out_mem complete dim=2
+
+  // Pop store instruction
+  insn_T insn = store_queue.read();
+
+  // Decode
+  bool pop_prev_dependence = insn[INSN_MEM_1];
+  bool pop_next_dependence = insn[INSN_MEM_2];  // decoded but not used in this function
+  bool push_prev_dependence = insn[INSN_MEM_3];
+  bool push_next_dependence = insn[INSN_MEM_4];  // decoded but not used in this function
+  memop_id_T memory_type = insn.range(INSN_MEM_5_1, INSN_MEM_5_0);  // decoded but not used in this function
+  memop_sram_T sram_base = insn.range(INSN_MEM_6_1, INSN_MEM_6_0);
+  memop_dram_T dram_base = insn.range(INSN_MEM_7_1, INSN_MEM_7_0);
+  memop_size_T y_size = insn.range(INSN_MEM_8_1, INSN_MEM_8_0);
+  memop_size_T x_size = insn.range(INSN_MEM_9_1, INSN_MEM_9_0);
+  memop_stride_T x_stride = insn.range(INSN_MEM_A_1, INSN_MEM_A_0);
+  memop_pad_T y_pad_0 = insn.range(INSN_MEM_B_1, INSN_MEM_B_0);
+  memop_pad_T y_pad_1 = insn.range(INSN_MEM_C_1, INSN_MEM_C_0);
+  memop_pad_T x_pad_0 = insn.range(INSN_MEM_D_1, INSN_MEM_D_0);
+  memop_pad_T x_pad_1 = insn.range(INSN_MEM_E_1, INSN_MEM_E_0);
+
+  // Pop dependence token if instructed
+  if (pop_prev_dependence) {
+    g2s_dep_queue.read();
+  }
+
+  // Initialize indices
+  memop_sram_T sram_idx = sram_base;
+  memop_dram_T dram_idx = dram_base;
+
+  // Skip padding along y dimension (y_pad_0 full rows, each including its x padding)
+  memop_sram_T y_offset = (x_pad_0 + x_size + x_pad_1) * y_pad_0;
+  sram_idx += y_offset;
+#pragma HLS RESOURCE variable=y_offset core=Mul_LUT
+
+  // Copy along y dimension, one row of x_size entries per iteration
+  for (int y = 0; y < y_size; y ++) {
+#pragma HLS PIPELINE rewind
+    // Skip padding along x dimension
+    sram_idx += x_pad_0;
+    // Perform data transfer
+    memcpy(
+      (out_vec_T *) &outputs[dram_idx*BATCH],
+      (const out_vec_T*) &out_mem[sram_idx][0],
+      x_size * INP_ELEM_BYTES);
+    sram_idx += x_size;
+    dram_idx += x_stride;
+    // Skip padding along x dimension
+    sram_idx += x_pad_1;
+  }
+
+  // Push dependence token if instructed
+  if (push_prev_dependence) {
+    s2g_dep_queue.write(1);
+  }
+}
+
+// Top-level VTA routine. Calls fetch() to fill the temporary instruction
+// queues, then repeatedly runs the load, compute and store stages in turn.
+// A stage is only invoked when the dependence tokens its next instruction
+// wants to pop are already available; otherwise the instruction stays
+// "peeked" and the next stage gets a chance to run.
+//
+//   insn_count  number of instructions, forwarded to fetch()
+//   insns       instruction array, forwarded to fetch()
+//   uops        micro-op array, forwarded to compute()
+//   inputs      input array, forwarded to load()
+//   weights     weight array, forwarded to load()
+//   biases      bias array, forwarded to compute()
+//   outputs     output array, forwarded to store()
+//
+// The loop ends when `done` becomes non-zero (presumably set by compute(),
+// which receives it as its first argument) or after more than 1000 passes,
+// in which case the blocking dependence queues are printed. Finally all
+// dependence queues are drained and asserted to have been empty.
+void vta (
+  uint32_t insn_count,
+  volatile insn_T *insns,
+  volatile uop_T *uops,
+  volatile inp_vec_T *inputs,
+  volatile wgt_vec_T *weights,
+  volatile acc_vec_T *biases,
+  volatile out_vec_T *outputs) {
+#pragma HLS INTERFACE s_axilite port=insn_count bundle=CONTROL_BUS
+#pragma HLS INTERFACE m_axi port=insns offset=slave bundle=ins_port
+#pragma HLS INTERFACE m_axi port=uops offset=slave bundle=uop_port
+#pragma HLS INTERFACE m_axi port=inputs offset=slave bundle=data_port
+#pragma HLS INTERFACE m_axi port=weights offset=slave bundle=data_port
+#pragma HLS INTERFACE m_axi port=biases offset=slave bundle=data_port
+#pragma HLS INTERFACE m_axi port=outputs offset=slave bundle=data_port
+#pragma HLS INTERFACE s_axilite port=return bundle=CONTROL_BUS
+
+  // Instantiate temporary instruction queues (used for peeking)
+  hls::stream<insn_T> tmp_load_queue;
+  hls::stream<insn_T> tmp_gemm_queue;
+  hls::stream<insn_T> tmp_store_queue;
+
+  // Instantiate physical instruction queues
+  hls::stream<insn_T> load_queue;
+  hls::stream<insn_T> gemm_queue;
+  hls::stream<insn_T> store_queue;
+
+  // Dependence queues
+  hls::stream<bool> l2g_dep_queue;
+  hls::stream<bool> s2g_dep_queue;
+  hls::stream<bool> g2l_dep_queue;
+  hls::stream<bool> g2s_dep_queue;
+
+  // Instantiate memories
+  inp_vec_T inp_mem[INP_BUFF_DEPTH][BATCH];
+  wgt_vec_T wgt_mem[WGT_BUFF_DEPTH][BLOCK_OUT];
+  out_vec_T out_mem[ACC_BUFF_DEPTH][BATCH];
+
+  // Push all instructions into the queues
+  fetch(
+    insn_count,
+    insns,
+    tmp_load_queue,
+    tmp_gemm_queue,
+    tmp_store_queue
+  );
+
+  // Global done indicator
+  uint32_t done = 0;
+
+  // Temporary (peeked) instructions, one per stage
+  insn_T tmp_load;
+  insn_T tmp_gemv;
+  insn_T tmp_store;
+
+  // Peeking status: true while the matching tmp_* instruction has been read
+  // from its temporary queue but not yet handed to its stage
+  bool tmp_load_popped = false;
+  bool tmp_gemm_popped = false;
+  bool tmp_store_popped = false;
+  // Number of completed passes through the main loop (deadlock guard)
+  int exit_counter = 0;
+
+  // Main control loop
+  while (true) {
+    // First execute as many load instructions as possible
+    while (!tmp_load_queue.empty() || tmp_load_popped == true) {
+      // Pop the load instruction
+      if (!tmp_load_popped) {
+        tmp_load_queue.read(tmp_load);
+        tmp_load_popped = true;
+      }
+      // Check dependences and invoke the load stage: runnable when no token
+      // has to be popped, or when the g2l token is already there
+      bool pop_next_dependence = tmp_load[INSN_MEM_2];
+      if (!pop_next_dependence || !g2l_dep_queue.empty()) {
+        // Push the instruction in the load queue
+        load_queue.write(tmp_load);
+        tmp_load_popped = false;
+        load(
+          inputs,
+          weights,
+          load_queue,
+          g2l_dep_queue,
+          l2g_dep_queue,
+          inp_mem,
+          wgt_mem
+        );
+      } else {
+        // Execution of load stage pending on completion of other stages,
+        // so break here...
+        break;
+      }
+    }
+    // Next execute as many gemm instructions as possible
+    while (!tmp_gemm_queue.empty() || tmp_gemm_popped == true) {
+      // Pop the gemm instruction
+      if (!tmp_gemm_popped) {
+        tmp_gemm_queue.read(tmp_gemv);
+        tmp_gemm_popped = true;
+      }
+      // Check dependences and invoke the compute stage: every token the
+      // instruction wants to pop must already be available
+      bool pop_prev_dependence = tmp_gemv[INSN_MEM_1];
+      bool pop_next_dependence = tmp_gemv[INSN_MEM_2];
+      bool prev_ready = !pop_prev_dependence || !l2g_dep_queue.empty();
+      bool next_ready = !pop_next_dependence || !s2g_dep_queue.empty();
+      if (prev_ready && next_ready) {
+        // Push the instruction in the gemm queue
+        gemm_queue.write(tmp_gemv);
+        tmp_gemm_popped = false;
+        compute(
+          done,
+          uops,
+          biases,
+          gemm_queue,
+          l2g_dep_queue,
+          s2g_dep_queue,
+          g2l_dep_queue,
+          g2s_dep_queue,
+          inp_mem,
+          wgt_mem,
+          out_mem
+        );
+      } else {
+        // Execution of compute stage pending on completion of other stages,
+        // so break here...
+        break;
+      }
+    }
+    // Finally execute as many store instructions as possible
+    while (!tmp_store_queue.empty() || tmp_store_popped == true) {
+      // Pop the store instruction
+      if (!tmp_store_popped) {
+        tmp_store_queue.read(tmp_store);
+        tmp_store_popped = true;
+      }
+      // Check dependences and invoke the store stage: runnable when no token
+      // has to be popped, or when the g2s token is already there
+      bool pop_prev_dependence = tmp_store[INSN_MEM_1];
+      if (!pop_prev_dependence || !g2s_dep_queue.empty()) {
+        // Push the instruction in the store queue
+        store_queue.write(tmp_store);
+        tmp_store_popped = false;
+        store(
+          outputs,
+          store_queue,
+          g2s_dep_queue,
+          s2g_dep_queue,
+          out_mem
+        );
+      } else {
+        // Execution of store stage pending on completion of other stages,
+        // so break here...
+        break;
+      }
+    }
+    // Check if we get a signal that we are done
+    if (done) {
+      break;
+    }
+    // No done signal yet: give up after too many passes and report which
+    // dependence queue each stalled stage is waiting on
+    exit_counter ++;
+    if (exit_counter > 1000) {
+      if (tmp_load_popped) {
+        if (g2l_dep_queue.empty()) {
+          printf("waiting on g2l\n");
+        }
+      }
+      if (tmp_gemm_popped) {
+        if (l2g_dep_queue.empty() && tmp_gemv[INSN_MEM_1]) {
+          printf("waiting on l2g\n");
+        }
+        if (s2g_dep_queue.empty() && tmp_gemv[INSN_MEM_2]) {
+          printf("waiting on s2g\n");
+        }
+      }
+      if (tmp_store_popped) {
+        if (g2s_dep_queue.empty()) {
+          printf("waiting on g2s\n");
+        }
+      }
+      break;
+    }
+  }
+
+  // Ensure that the tokens are empty: drain each dependence queue and count
+  // how many tokens were left behind
+  bool tmp_tok;
+  int l2g_count = 0;
+  int s2g_count = 0;
+  int g2l_count = 0;
+  int g2s_count = 0;
+  while(l2g_dep_queue.read_nb(tmp_tok)) {
+    l2g_count ++;
+  }
+  while(s2g_dep_queue.read_nb(tmp_tok)) {
+    s2g_count ++;
+  }
+  while(g2l_dep_queue.read_nb(tmp_tok)) {
+    g2l_count ++;
+  }
+  while(g2s_dep_queue.read_nb(tmp_tok)) {
+    g2s_count ++;
+  }
+
+  // All four queues must have been empty (s2g was previously left unchecked
+  // because g2s_count was tested twice)
+  assert(l2g_count == 0 && s2g_count == 0 && g2l_count == 0 && g2s_count == 0);
+}
--- a/vta/src/test/vta_test_lib.cc
+++ b/vta/src/test/vta_test_lib.cc
--- a/vta/tests/driver/Makefile
+++ b/vta/tests/driver/Makefile
@ -0,0 +1,59 @@
+# Compiler selection. GNU make predefines CC as "cc", so a plain
+# "CC ?= g++" never takes effect; only replace the built-in default and keep
+# any value supplied through the environment or the command line.
+ifeq ($(origin CC),default)
+CC = g++
+endif
+CFLAGS = -Wall -O3 -std=c++11 -I/usr/include
+# Emit a .d file next to every object so header changes trigger rebuilds
+DEPFLAGS = -MMD -MP
+LDFLAGS = -L/usr/lib -L/home/xilinx/pynq/drivers
+LIBS = -l:libsds_lib.so -l:libdma.so
+SRC_DIR = ../../src
+INCLUDE_DIR = ../../include
+DRIVER_DIR = $(SRC_DIR)/driver/pynq
+TESTLIB_DIR = $(SRC_DIR)/test
+# Sources outside this directory are located through VPATH
+VPATH = $(DRIVER_DIR):$(TESTLIB_DIR)
+SOURCES = vta_pynq_driver.c vta_test_lib.cc
+OBJECTS = vta_pynq_driver.o vta_test_lib.o driver_test.o
+EXECUTABLE = vta
+
+# VTA Parameters
+#  Log of input width in bits
+LOG_INP_WIDTH = 3
+#  Log of weight width in bits
+LOG_WGT_WIDTH = 3
+#  Log of accum width in bits
+LOG_ACC_WIDTH = 5
+#  Log of output width in bits
+LOG_OUT_WIDTH = $(LOG_INP_WIDTH)
+#  Log of tensor batch size (A in (A,B)x(B,C) matrix multiplication)
+LOG_BATCH = 0
+#  Log of tensor inner block size (B in (A,B)x(B,C) matrix multiplication)
+LOG_IN_BLOCK = 4
+#  Log of tensor outer block size (C in (A,B)x(B,C) matrix multiplication)
+LOG_OUT_BLOCK = 4
+#  Log of uop buffer size in Bytes
+LOG_UOP_BUFF_SIZE = 15
+#  Log of inp buffer size in Bytes
+LOG_INP_BUFF_SIZE = 15
+#  Log of wgt buffer size in Bytes
+LOG_WGT_BUFF_SIZE = 15
+#  Log of acc buffer size in Bytes
+LOG_ACC_BUFF_SIZE = 17
+#  Log of out buffer size in Bytes, derived from the acc buffer size and the
+#  out/acc width ratio. Simply expanded (:=) so the shell runs only once.
+LOG_OUT_BUFF_SIZE := $(shell echo "$$(( $(LOG_ACC_BUFF_SIZE)+$(LOG_OUT_WIDTH)-$(LOG_ACC_WIDTH) ))" )
+
+# Define flags
+CFLAGS += -I $(INCLUDE_DIR) -DNO_SIM \
+	-DDEBUG=0 -DLOG_WGT_WIDTH=$(LOG_WGT_WIDTH) -DLOG_INP_WIDTH=$(LOG_INP_WIDTH) \
+	-DLOG_ACC_WIDTH=$(LOG_ACC_WIDTH) -DLOG_OUT_WIDTH=$(LOG_OUT_WIDTH) \
+	-DLOG_BATCH=$(LOG_BATCH) -DLOG_BLOCK_IN=$(LOG_IN_BLOCK) -DLOG_BLOCK_OUT=$(LOG_OUT_BLOCK) \
+	-DLOG_UOP_BUFF_SIZE=$(LOG_UOP_BUFF_SIZE) -DLOG_INP_BUFF_SIZE=$(LOG_INP_BUFF_SIZE) \
+	-DLOG_WGT_BUFF_SIZE=$(LOG_WGT_BUFF_SIZE) -DLOG_ACC_BUFF_SIZE=$(LOG_ACC_BUFF_SIZE) \
+	-DLOG_OUT_BUFF_SIZE=$(LOG_OUT_BUFF_SIZE)
+
+# all and clean are commands, not files
+.PHONY: all clean
+
+# All Target
+all: $(EXECUTABLE)
+
+# Each object depends only on its own source; header dependencies come from
+# the generated .d files included at the bottom.
+%.o: %.cc
+	$(CC) -c -o $@ $< $(CFLAGS) $(DEPFLAGS)
+
+# The driver source has a .c extension but is built with the same compiler
+# and flags as the rest of the test.
+%.o: %.c
+	$(CC) -c -o $@ $< $(CFLAGS) $(DEPFLAGS)
+
+$(EXECUTABLE): $(OBJECTS)
+	$(CC) $(LDFLAGS) $(OBJECTS) -o $@ $(LIBS)
+
+clean:
+	$(RM) *.o *.d $(EXECUTABLE)
+
+# Pull in compiler-generated dependencies (absent on the first build)
+-include $(OBJECTS:.o=.d)
--- a/vta/tests/driver/driver_test.cc
+++ b/vta/tests/driver/driver_test.cc
@ -0,0 +1,152 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file driver_test.cc
+ * \brief Bare-metal test to test driver and VTA design.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include "vta_test_lib.h"
+#include "vta_pynq_driver.h"
+
+// VTA invocation (presents the same abstraction as in the simulation tests).
+//
+// Programs the FPGA with "vta.bit", writes the instruction count and the
+// physical address of every non-NULL buffer into the fetch/load/compute/store
+// register ranges, starts the four modules and polls the compute module until
+// VTA_DONE is observed or the poll budget runs out (a warning is printed on
+// timeout).
+//
+// Returns the elapsed time in nanoseconds between the first register write
+// and the end of polling.
+uint64_t vta (
+    uint32_t insn_count,
+    VTAGenericInsn *insns,
+    VTAUop *uops,
+    inp_T *inputs,
+    wgt_T *weights,
+    acc_T *biases,
+    inp_T *outputs) {
+
+    // Performance counter variables
+    uint64_t t_fpga;
+    struct timespec start, stop;
+
+    // Upper bound on the number of status reads before declaring a timeout
+    const int max_polls = 10000000;
+
+    // Bitstream file to program. The name is fixed; the unused per-parameter
+    // name fragments (sprintf into 4-byte buffers) were dropped because they
+    // were dead code that could overflow for parameter values of 1000 or more.
+    char bitstream[] = "vta.bit";
+
+#if DEBUG==1
+    printf("INFO - Programming FPGA: %s!\n", bitstream);
+#endif
+
+    // Program VTA
+    ProgramVTA(bitstream);
+    // Get VTA handles
+    VTAHandle vta_fetch_handle = MapRegister(VTA_FETCH_ADDR, VTA_RANGE);
+    VTAHandle vta_load_handle = MapRegister(VTA_LOAD_ADDR, VTA_RANGE);
+    VTAHandle vta_compute_handle = MapRegister(VTA_COMPUTE_ADDR, VTA_RANGE);
+    VTAHandle vta_store_handle = MapRegister(VTA_STORE_ADDR, VTA_RANGE);
+
+    // Physical address pointers (0 for buffers that were not supplied)
+    uint32_t insn_phy = insns ? cma_get_phy_addr(insns) : 0;
+    uint32_t uop_phy = uops ? cma_get_phy_addr(uops) : 0;
+    uint32_t weight_phy = weights ? cma_get_phy_addr(weights) : 0;
+    uint32_t input_phy = inputs ? cma_get_phy_addr(inputs) : 0;
+    uint32_t bias_phy = biases ? cma_get_phy_addr(biases) : 0;
+    uint32_t output_phy = outputs ? cma_get_phy_addr(outputs) : 0;
+
+#if DEBUG==1
+    printf("INFO - Starting FPGA!\n");
+#endif
+
+    // Use the monotonic clock: the wall clock may be adjusted while we wait,
+    // which would corrupt the measured interval.
+    clock_gettime(CLOCK_MONOTONIC, &start);
+
+    // FETCH @ 0x10 : Data signal of insn_count_V
+    WriteMappedReg(vta_fetch_handle, 0x10, insn_count);
+    // FETCH @ 0x18 : Data signal of insns_V
+    if (insns) WriteMappedReg(vta_fetch_handle, 0x18, insn_phy);
+    // LOAD @ 0x10 : Data signal of weight_V
+    if (weights) WriteMappedReg(vta_load_handle, 0x10, weight_phy);
+    // LOAD @ 0x18 : Data signal of inputs_V
+    if (inputs) WriteMappedReg(vta_load_handle, 0x18, input_phy);
+    // COMPUTE @ 0x20 : Data signal of uops_V
+    if (uops) WriteMappedReg(vta_compute_handle, 0x20, uop_phy);
+    // COMPUTE @ 0x28 : Data signal of biases_V
+    if (biases) WriteMappedReg(vta_compute_handle, 0x28, bias_phy);
+    // STORE @ 0x10 : Data signal of outputs_V
+    if (outputs) WriteMappedReg(vta_store_handle, 0x10, output_phy);
+
+    // VTA start
+    WriteMappedReg(vta_fetch_handle, 0x0, 0x1);
+    WriteMappedReg(vta_load_handle, 0x0, 0x81);
+    WriteMappedReg(vta_compute_handle, 0x0, 0x81);
+    WriteMappedReg(vta_store_handle, 0x0, 0x81);
+
+    // Poll the compute module's register at 0x18 until VTA_DONE is set
+    int flag = 0, t = 0;
+    for (t = 0; t < max_polls; ++t) {
+      flag = ReadMappedReg(vta_compute_handle, 0x18);
+      if (flag & VTA_DONE) break;
+    }
+
+    // Stop the timer before printing so console I/O is not counted
+    clock_gettime(CLOCK_MONOTONIC, &stop);
+
+    if (t==max_polls) {
+        printf("\tWARNING: VTA TIMEOUT!!!!\n");
+    }
+#if DEBUG==1
+    else {
+        printf("INFO - FPGA Finished!\n");
+    }
+#endif
+
+    // Signed arithmetic: the nanosecond difference may be negative when the
+    // seconds field rolled over between the two samples.
+    int64_t elapsed_ns =
+        (int64_t) (stop.tv_sec - start.tv_sec) * 1000000000LL +
+        (int64_t) (stop.tv_nsec - start.tv_nsec);
+    t_fpga = (uint64_t) elapsed_ns;
+
+    // Unmap VTA register
+    UnmapRegister(vta_fetch_handle, VTA_RANGE);
+    UnmapRegister(vta_load_handle, VTA_RANGE);
+    UnmapRegister(vta_compute_handle, VTA_RANGE);
+    UnmapRegister(vta_store_handle, VTA_RANGE);
+
+    return t_fpga;
+}
+
+// Test driver entry point: runs the ALU tests (vector-scalar and
+// vector-vector) followed by the blocked GEMM tests, OR-ing their return
+// codes together. Returns 0 only if every test returned 0.
+int main(void)
+{
+
+#if DEBUG==1
+    printParameters();
+#endif
+
+    // Accumulated result; any non-zero test result makes it non-zero
+    int status = 0;
+
+    // Run ALU test (vector-scalar operators)
+    status |= alu_test(ALU_OPCODE_MAX, true, 16, 128, true);
+    status |= alu_test(ALU_OPCODE_MAX, true, 16, 128, false);
+    status |= alu_test(ALU_OPCODE_ADD, true, 16, 128, true);
+    status |= alu_test(ALU_OPCODE_ADD, true, 16, 128, false);
+    status |= alu_test(ALU_OPCODE_SHR, true, 16, 128, true);
+    status |= alu_test(ALU_OPCODE_SHR, true, 16, 128, false);
+
+    // Run ALU test (vector-vector operators)
+    status |= alu_test(ALU_OPCODE_MAX, false, 16, 128, true);
+    status |= alu_test(ALU_OPCODE_MAX, false, 16, 128, false);
+    status |= alu_test(ALU_OPCODE_ADD, false, 16, 128, true);
+    status |= alu_test(ALU_OPCODE_ADD, false, 16, 128, false);
+
+    // Run blocked GEMM test
+    status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, true, 2);
+    status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, false, 2);
+    status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, true, 1);
+    status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, false, 1);
+
+    // Report the overall outcome ("INTO" in the failure message was a typo)
+    if (status==0) {
+        printf("\nINFO - Unit tests successful!\n");
+    } else {
+        printf("\nINFO - Unit tests failed!\n");
+    }
+
+    return status;
+
+}