From 28a10b690429434da0c827e7995f689985c0aa78 Mon Sep 17 00:00:00 2001
From: Thierry Moreau <moreau@cs.washington.edu>
Date: Sun, 18 Mar 2018 00:21:54 -0700
Subject: [PATCH] [REFACTOR] Code base refactoring (#5)

---
 vta/Makefile                                  |   12 +-
 vta/apps/pynq_rpc/start_rpc_server.sh         |    4 +
 vta/hardware/vivado/Makefile                  |   71 +-
 vta/hardware/vivado/scripts/hls.tcl           |    4 +-
 vta/hardware/vivado/sim/vta_test.cc           |    4 +-
 .../hls => hardware/vivado/src}/vta.cc        |    2 +-
 .../hls => hardware/vivado/src}/vta.h         |   84 +-
 vta/include/hardware/hls/vta_typedefs.h       |   97 --
 vta/include/vta/driver.h                      |  100 ++
 vta/include/{vta_params.h => vta/hw_spec.h}   |   14 +-
 vta/include/vta/runtime.h                     |  274 ++++
 vta/include/vta_pynq_driver.h                 |  152 --
 vta/make/config.mk                            |   69 +
 .../vta_pynq_driver.c => pynq/pynq_driver.cc} |   40 +-
 vta/src/pynq/pynq_driver.h                    |   83 +
 vta/src/runtime.cc                            | 1410 +++++++++++++++++
 vta/src/tvm/vta_device_api.cc                 |  106 ++
 vta/tests/driver/Makefile                     |   59 -
 .../hardware/common/test_lib.cc}              |   60 +-
 .../hardware/common/test_lib.h}               |   13 +-
 vta/tests/hardware/pynq/Makefile              |   37 +
 .../pynq/metal_test.cc}                       |   53 +-
 22 files changed, 2302 insertions(+), 446 deletions(-)
 create mode 100755 vta/apps/pynq_rpc/start_rpc_server.sh
 rename vta/{src/hardware/hls => hardware/vivado/src}/vta.cc (99%)
 rename vta/{include/hardware/hls => hardware/vivado/src}/vta.h (65%)
 delete mode 100644 vta/include/hardware/hls/vta_typedefs.h
 create mode 100644 vta/include/vta/driver.h
 rename vta/include/{vta_params.h => vta/hw_spec.h} (99%)
 create mode 100644 vta/include/vta/runtime.h
 delete mode 100644 vta/include/vta_pynq_driver.h
 rename vta/src/{driver/pynq/vta_pynq_driver.c => pynq/pynq_driver.cc} (67%)
 create mode 100644 vta/src/pynq/pynq_driver.h
 create mode 100644 vta/src/runtime.cc
 create mode 100644 vta/src/tvm/vta_device_api.cc
 delete mode 100644 vta/tests/driver/Makefile
 rename vta/{src/test/vta_test_lib.cc => tests/hardware/common/test_lib.cc} (97%)
 rename vta/{include/vta_test_lib.h => tests/hardware/common/test_lib.h} (97%)
 create mode 100644 vta/tests/hardware/pynq/Makefile
 rename vta/tests/{driver/driver_test.cc => hardware/pynq/metal_test.cc} (71%)

diff --git a/vta/Makefile b/vta/Makefile
index 20414cde..6007ed2e 100644
--- a/vta/Makefile
+++ b/vta/Makefile
@@ -54,9 +54,13 @@ endif
 
 all: lib/libvta.$(SHARED_LIBRARY_SUFFIX)
 
-SRC = $(wildcard src/*.cc src/*.cc)
-ALL_OBJ = $(patsubst %.cc, build/%.o, $(SRC))
-ALL_DEP = $(ALL_OBJ)
+VTA_LIB_SRC := $(wildcard src/*.cc src/tvm/*.cc)
+ifeq ($(TARGET), PYNQ_TARGET)  # Pynq builds also compile src/pynq and link sds_lib/libdma
+  VTA_LIB_SRC += $(wildcard src/pynq/*.cc)
+  LDFLAGS += -L/usr/lib -lsds_lib
+  LDFLAGS += -L/opt/python3.6/lib/python3.6/site-packages/pynq/drivers/ -l:libdma.so
+endif  # TARGET == PYNQ_TARGET
+VTA_LIB_OBJ := $(patsubst %.cc, build/%.o, $(VTA_LIB_SRC))
 
 test: $(TEST)
 
@@ -65,7 +69,7 @@ build/src/%.o: src/%.cc
 	$(CXX) $(CFLAGS) -MM -MT build/src/$*.o $< >build/src/$*.d
 	$(CXX) -c $(CFLAGS) -c $< -o $@
 
-lib/libvta.$(SHARED_LIBRARY_SUFFIX): $(ALL_DEP)
+lib/libvta.$(SHARED_LIBRARY_SUFFIX): $(VTA_LIB_OBJ)
 	@mkdir -p $(@D)
 	$(CXX) $(CFLAGS) -shared -o $@ $(filter %.o, $^) $(LDFLAGS)
 
diff --git a/vta/apps/pynq_rpc/start_rpc_server.sh b/vta/apps/pynq_rpc/start_rpc_server.sh
new file mode 100755
index 00000000..d5a1202a
--- /dev/null
+++ b/vta/apps/pynq_rpc/start_rpc_server.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+export PYTHONPATH=${PYTHONPATH}:/home/xilinx/tvm/python
+export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/opt/python3.6/lib/python3.6/site-packages/pynq/drivers/
+python -m  tvm.exec.rpc_server --load-library /home/xilinx/vta/lib/libvta.so
diff --git a/vta/hardware/vivado/Makefile b/vta/hardware/vivado/Makefile
index b52fb628..dfcb0631 100644
--- a/vta/hardware/vivado/Makefile
+++ b/vta/hardware/vivado/Makefile
@@ -2,9 +2,9 @@
 ROOTDIR = $(CURDIR)
 BUILD_DIR = $(ROOTDIR)/build
 SCRIPT_DIR = $(ROOTDIR)/scripts
-SRC_DIR = $(ROOTDIR)/../../src/hardware/hls
+SRC_DIR = $(ROOTDIR)/src
 SIM_DIR = $(ROOTDIR)/sim
-TEST_DIR = $(ROOTDIR)/../../src/test
+TEST_DIR = $(ROOTDIR)/../../tests/hardware/common
 INCLUDE_DIR = $(ROOTDIR)/../../include
 
 # Executables
@@ -12,59 +12,28 @@ VIVADO_HLS = vivado_hls
 VIVADO = vivado
 HSI = hsi
 
-# Build parameters:
+# Include the top-level config: use ../../config.mk if present, else ../../make/config.mk
+ifndef config
+ifneq ("$(wildcard ../../config.mk)", "")
+  config = ../../config.mk
+else
+  config = ../../make/config.mk
+endif
+endif
+include $(config)
+
+#---------------------
+# Compilation parameters
+#--------------------
+
 #  Number of threads during compilation
 NUM_THREADS = 8
+
 #  Target Frequency
 CLOCK_FREQ = 100
-#  Log of input width in bits
-LOG_INP_WIDTH = 3
-#  Log of weight width in bits
-LOG_WGT_WIDTH = 3
-#  Log of accum width in bits
-LOG_ACC_WIDTH = 5
-#  Log of output width in bits
-LOG_OUT_WIDTH = $(LOG_INP_WIDTH)
-#  Log of tensor batch size (A in (A,B)x(B,C) matrix multiplication)
-LOG_BATCH = 0
-#  Log of tensor inner block size (B in (A,B)x(B,C) matrix multiplication)
-LOG_IN_BLOCK = 4
-#  Log of tensor outer block size (C in (A,B)x(B,C) matrix multiplication)
-LOG_OUT_BLOCK = 4
-#  Log of uop buffer size in Bytes
-LOG_UOP_BUFF_SIZE = 15
-#  Log of inp buffer size in Bytes
-LOG_INP_BUFF_SIZE = 15
-#  Log of wgt buffer size in Bytes
-LOG_WGT_BUFF_SIZE = 15
-#  Log of acc buffer size in Bytes
-LOG_ACC_BUFF_SIZE = 17
-#  Log of out buffer size in Bytes
-LOG_OUT_BUFF_SIZE = $(shell echo "$$(( $(LOG_ACC_BUFF_SIZE)+$(LOG_OUT_WIDTH)-$(LOG_ACC_WIDTH) ))" )
 
-# Derived parameter
-#  Input width in bits
-INP_WIDTH = $(shell echo "$$(( 1 << $(LOG_INP_WIDTH) ))" )
-#  Weight width in bits
-WGT_WIDTH = $(shell echo "$$(( 1 << $(LOG_WGT_WIDTH) ))" )
-#  Output width in bits
-OUT_WIDTH = $(shell echo "$$(( 1 << $(LOG_OUT_WIDTH) ))" )
-#  Tensor batch size
-BATCH = $(shell echo "$$(( 1 << $(LOG_BATCH) ))" )
-#  Tensor outer block size
-IN_BLOCK = $(shell echo "$$(( 1 << $(LOG_IN_BLOCK) ))" )
-#  Tensor inner block size
-OUT_BLOCK = $(shell echo "$$(( 1 << $(LOG_OUT_BLOCK) ))" )
-#  Uop buffer size in Bytes
-UOP_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_UOP_BUFF_SIZE) ))" )
-#  Inp buffer size in Bytes
-INP_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_INP_BUFF_SIZE) ))" )
-#  Wgt buffer size in Bytes
-WGT_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_WGT_BUFF_SIZE) ))" )
-#  Acc buffer size in Bytes
-ACC_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_ACC_BUFF_SIZE) ))" )
-#  Out buffer size in Bytes
-OUT_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_OUT_BUFF_SIZE) ))" )
+#  Timing closure compensation (0 for none, 3 for highest)
+TIMING_CLOSURE_COMP = 0
 
 # Derive clock target period
 TARGET_PER = $(shell echo "$$(( (1000 + $(CLOCK_FREQ) - 1) / $(CLOCK_FREQ) - 0))" )
@@ -85,7 +54,7 @@ ip:
 		$(VIVADO_HLS) -f $(SCRIPT_DIR)/hls.tcl \
 			-tclargs $(SRC_DIR) $(SIM_DIR) $(TEST_DIR) $(INCLUDE_DIR) $(TARGET_PER) \
 			$(LOG_INP_WIDTH) $(LOG_WGT_WIDTH) $(LOG_ACC_WIDTH) $(LOG_OUT_WIDTH) \
-			$(LOG_BATCH) $(LOG_OUT_BLOCK) $(LOG_IN_BLOCK) \
+			$(LOG_BATCH) $(LOG_BLOCK_OUT) $(LOG_BLOCK_IN) \
 			$(LOG_UOP_BUFF_SIZE) $(LOG_INP_BUFF_SIZE) $(LOG_WGT_BUFF_SIZE) \
 			$(LOG_ACC_BUFF_SIZE) $(LOG_OUT_BUFF_SIZE)
 
diff --git a/vta/hardware/vivado/scripts/hls.tcl b/vta/hardware/vivado/scripts/hls.tcl
index 55b3c01d..220c8f3b 100644
--- a/vta/hardware/vivado/scripts/hls.tcl
+++ b/vta/hardware/vivado/scripts/hls.tcl
@@ -62,7 +62,7 @@ if { [llength $argv] eq 19 } {
 }
 
 # C define flags to pass to compiler
-set cflags "-I $include_dir -I $include_dir/hardware/hls \
+set cflags "-I $include_dir -I $src_dir -I $test_dir \
 	-DDEBUG=0 -DLOG_WGT_WIDTH=$wgt_width -DLOG_INP_WIDTH=$inp_width \
 	-DLOG_ACC_WIDTH=$acc_width -DLOG_OUT_WIDTH=$out_width \
 	-DLOG_BATCH=$batch -DLOG_BLOCK_OUT=$block_out -DLOG_BLOCK_IN=$block_in \
@@ -127,7 +127,7 @@ open_project vta_sim
 set_top vta
 add_files $src_dir/vta.cc -cflags $cflags
 add_files -tb $sim_dir/vta_test.cc -cflags $cflags
-add_files -tb $test_dir/vta_test_lib.cc -cflags $cflags
+add_files -tb $test_dir/test_lib.cc -cflags $cflags
 open_solution "solution0"
 init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out
 csim_design -clean
diff --git a/vta/hardware/vivado/sim/vta_test.cc b/vta/hardware/vivado/sim/vta_test.cc
index 858926aa..2031186f 100644
--- a/vta/hardware/vivado/sim/vta_test.cc
+++ b/vta/hardware/vivado/sim/vta_test.cc
@@ -8,8 +8,8 @@
 #include <stdlib.h>
 #include <iostream>
 
-#include "vta.h"
-#include "vta_test_lib.h"
+#include "../src/vta.h"
+#include "../../../tests/hardware/common/test_lib.h"
 
 int main(void)
 {
diff --git a/vta/src/hardware/hls/vta.cc b/vta/hardware/vivado/src/vta.cc
similarity index 99%
rename from vta/src/hardware/hls/vta.cc
rename to vta/hardware/vivado/src/vta.cc
index 206fb5fe..7a0f9ded 100644
--- a/vta/src/hardware/hls/vta.cc
+++ b/vta/hardware/vivado/src/vta.cc
@@ -8,7 +8,7 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "vta.h"
+#include "./vta.h"
 
 void fetch (
   uint32_t insn_count,
diff --git a/vta/include/hardware/hls/vta.h b/vta/hardware/vivado/src/vta.h
similarity index 65%
rename from vta/include/hardware/hls/vta.h
rename to vta/hardware/vivado/src/vta.h
index b959d8a2..5dd4d953 100644
--- a/vta/include/hardware/hls/vta.h
+++ b/vta/hardware/vivado/src/vta.h
@@ -11,8 +11,88 @@
 #include <ap_int.h>
 #include <hls_stream.h>
 
-#include "vta_typedefs.h"
-#include "vta_params.h"
+#include <vta/hw_spec.h>
+
+/* \typedef uop_T Micro-op datatype*/
+typedef ap_uint<UOP_WIDTH> uop_T;
+
+/* \typedef inp_T Input datatype*/
+typedef ap_int<INP_WIDTH> inp_T;
+
+/* \typedef wgt_T Weight datatype*/
+typedef ap_int<WGT_WIDTH> wgt_T;
+
+/* \typedef out_T Output datatype*/
+typedef ap_int<OUT_WIDTH> out_T;
+
+/* \typedef acc_T Accumulator datatype*/
+typedef ap_int<ACC_WIDTH> acc_T;
+
+/* \typedef mul_T Multiplier output datatype*/
+typedef ap_int<WGT_WIDTH+INP_WIDTH+1> mul_T;
+
+/* \typedef sum_T GEMM accumulator datatype*/
+typedef ap_int<WGT_WIDTH+INP_WIDTH+LOG_BLOCK_IN+1> sum_T;
+
+/* \typedef inp_vec_T Input vector datatype*/
+typedef ap_uint<INP_WIDTH*BLOCK_IN> inp_vec_T;
+
+/* \typedef wgt_vec_T Weight vector datatype*/
+typedef ap_uint<WGT_WIDTH*BLOCK_IN> wgt_vec_T;
+
+/* \typedef acc_vec_T Accumulator vector datatype*/
+typedef ap_uint<ACC_WIDTH*BLOCK_OUT> acc_vec_T;
+
+/* \typedef out_vec_T Output vector datatype*/
+typedef ap_uint<OUT_WIDTH*BLOCK_OUT> out_vec_T;
+
+/* \typedef uop_idx_T Micro-op SRAM index datatype*/
+typedef ap_uint<LOG_UOP_BUFF_DEPTH+1> uop_idx_T;
+
+/* \typedef inp_idx_T Input SRAM index datatype*/
+typedef ap_uint<LOG_INP_BUFF_DEPTH+1> inp_idx_T;
+
+/* \typedef wgt_idx_T Weight SRAM index datatype*/
+typedef ap_uint<LOG_WGT_BUFF_DEPTH+1> wgt_idx_T;
+
+/* \typedef acc_idx_T Accumulator SRAM index datatype*/
+typedef ap_uint<LOG_ACC_BUFF_DEPTH+1> acc_idx_T;
+
+/* \typedef opcode_T Opcode datatype*/
+typedef ap_uint<OPCODE_BIT_WIDTH> opcode_T;
+
+/* \typedef insn_T Instruction datatype*/
+typedef ap_uint<INS_WIDTH> insn_T;
+
+/* \typedef loop_T Loop bound datatype*/
+typedef ap_uint<LOOP_ITER_WIDTH> loop_T;
+
+/* \typedef memop_id_T Memory operation ID datatype*/
+typedef ap_uint<MEMOP_ID_BIT_WIDTH> memop_id_T;
+
+/* \typedef memop_sram_T Memory operation SRAM index datatype*/
+typedef ap_uint<MEMOP_SRAM_ADDR_BIT_WIDTH> memop_sram_T;
+
+/* \typedef memop_dram_T Memory operation DRAM index datatype*/
+typedef ap_uint<MEMOP_DRAM_ADDR_BIT_WIDTH> memop_dram_T;
+
+/* \typedef memop_size_T Memory operation range datatype*/
+typedef ap_uint<MEMOP_SIZE_BIT_WIDTH> memop_size_T;
+
+/* \typedef memop_stride_T Memory operation stride datatype*/
+typedef ap_uint<MEMOP_STRIDE_BIT_WIDTH> memop_stride_T;
+
+/* \typedef memop_pad_T Memory operation pad width datatype*/
+typedef ap_uint<MEMOP_PAD_BIT_WIDTH> memop_pad_T;
+
+/* \typedef aluop_opcode_T ALU operation opcode datatype*/
+typedef ap_uint<ALU_OPCODE_BIT_WIDTH> aluop_opcode_T;
+
+/* \typedef aluop_imm_T ALU operation immediate datatype*/
+typedef ap_int<ALUOP_IMM_BIT_WIDTH> aluop_imm_T;
+
+/* \typedef aluop_sh_imm_T ALU operation shift immediate datatype*/
+typedef ap_uint<LOG_ACC_WIDTH> aluop_sh_imm_T;
 
 /*!
 * \brief Fetch module.
diff --git a/vta/include/hardware/hls/vta_typedefs.h b/vta/include/hardware/hls/vta_typedefs.h
deleted file mode 100644
index b2e90e23..00000000
--- a/vta/include/hardware/hls/vta_typedefs.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/*!
- *  Copyright (c) 2018 by Contributors
- * \file vta_typedefs.h
- * \brief Type definitions for VTA HLS design.
- */
-#ifndef VTA_TYPEDEFS_H_
-#define VTA_TYPEDEFS_H_
-
-#include <assert.h>
-#include <ap_axi_sdata.h>
-#include <ap_int.h>
-#include <hls_stream.h>
-
-#include "vta_params.h"
-
-/* \typedef uop_T Micro-op datatype*/
-typedef ap_uint<UOP_WIDTH> uop_T;
-
-/* \typedef inp_T Input datatype*/
-typedef ap_int<INP_WIDTH> inp_T;
-
-/* \typedef wgt_T Weight datatype*/
-typedef ap_int<WGT_WIDTH> wgt_T;
-
-/* \typedef out_T Output datatype*/
-typedef ap_int<OUT_WIDTH> out_T;
-
-/* \typedef acc_T Accumulator datatype*/
-typedef ap_int<ACC_WIDTH> acc_T;
-
-/* \typedef mul_T Multiplier output datatype*/
-typedef ap_int<WGT_WIDTH+INP_WIDTH+1> mul_T;
-
-/* \typedef sum_T GEMM accumulator datatype*/
-typedef ap_int<WGT_WIDTH+INP_WIDTH+LOG_BLOCK_IN+1> sum_T;
-
-/* \typedef inp_vec_T Input vector datatype*/
-typedef ap_uint<INP_WIDTH*BLOCK_IN> inp_vec_T;
-
-/* \typedef wgt_vec_T Weight vector datatype*/
-typedef ap_uint<WGT_WIDTH*BLOCK_IN> wgt_vec_T;
-
-/* \typedef acc_vec_T Accumulator vector datatype*/
-typedef ap_uint<ACC_WIDTH*BLOCK_OUT> acc_vec_T;
-
-/* \typedef out_vec_T Output vector datatype*/
-typedef ap_uint<OUT_WIDTH*BLOCK_OUT> out_vec_T;
-
-/* \typedef uop_idx_T Micro-op SRAM index datatype*/
-typedef ap_uint<LOG_UOP_BUFF_DEPTH+1> uop_idx_T;
-
-/* \typedef inp_idx_T Input SRAM index datatype*/
-typedef ap_uint<LOG_INP_BUFF_DEPTH+1> inp_idx_T;
-
-/* \typedef wgt_idx_T Weight SRAM index datatype*/
-typedef ap_uint<LOG_WGT_BUFF_DEPTH+1> wgt_idx_T;
-
-/* \typedef acc_idx_T Accumulator SRAM index datatype*/
-typedef ap_uint<LOG_ACC_BUFF_DEPTH+1> acc_idx_T;
-
-/* \typedef opcode_T Opcode datatype*/
-typedef ap_uint<OPCODE_BIT_WIDTH> opcode_T;
-
-/* \typedef insn_T Instruction datatype*/
-typedef ap_uint<INS_WIDTH> insn_T;
-
-/* \typedef loop_T Loop bound datatype*/
-typedef ap_uint<LOOP_ITER_WIDTH> loop_T;
-
-/* \typedef memop_id_T Memory operation ID datatype*/
-typedef ap_uint<MEMOP_ID_BIT_WIDTH> memop_id_T;
-
-/* \typedef memop_sram_T Memory operation SRAM index datatype*/
-typedef ap_uint<MEMOP_SRAM_ADDR_BIT_WIDTH> memop_sram_T;
-
-/* \typedef memop_dram_T Memory operation DRAM index datatype*/
-typedef ap_uint<MEMOP_DRAM_ADDR_BIT_WIDTH> memop_dram_T;
-
-/* \typedef memop_size_T Memory operation range datatype*/
-typedef ap_uint<MEMOP_SIZE_BIT_WIDTH> memop_size_T;
-
-/* \typedef memop_stride_T Memory operation stride datatype*/
-typedef ap_uint<MEMOP_STRIDE_BIT_WIDTH> memop_stride_T;
-
-/* \typedef memop_pad_T Memory operation pad width datatype*/
-typedef ap_uint<MEMOP_PAD_BIT_WIDTH> memop_pad_T;
-
-/* \typedef aluop_opcode_T ALU operation opcode datatype*/
-typedef ap_uint<ALU_OPCODE_BIT_WIDTH> aluop_opcode_T;
-
-/* \typedef aluop_opcode_T ALU operation immediate datatype*/
-typedef ap_int<ALUOP_IMM_BIT_WIDTH> aluop_imm_T;
-
-/* \typedef aluop_opcode_T ALU operation shift immediate datatype*/
-typedef ap_uint<LOG_ACC_WIDTH> aluop_sh_imm_T;
-
-#endif // VTA_TYPEDEFS_H_
diff --git a/vta/include/vta/driver.h b/vta/include/vta/driver.h
new file mode 100644
index 00000000..2b5e0ea9
--- /dev/null
+++ b/vta/include/vta/driver.h
@@ -0,0 +1,100 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file driver.h
+ * \brief General driver interface.
+ */
+
+#ifndef VTA_DRIVER_H_
+#define VTA_DRIVER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdlib.h>
+#include <stdint.h>
+
+/*! \brief Memory management constants with libxlnk_cma */
+#define CACHED 1
+/*! \brief Memory management constants with libxlnk_cma */
+#define NOT_CACHED 0
+
+/*! \brief VTA command handle */
+typedef void * VTAHandle;
+
+/*!
+ * \brief Allocates physically contiguous region in memory (limited by MAX_XFER).
+ * \param size Size of the region in Bytes.
+ * \param cached Region can be set to not cached (write-back) if set to 0.
+ * \return A pointer to the allocated region.
+ */
+void* VTAMemAlloc(size_t size, int cached);
+
+/*!
+ * \brief Frees a physically contiguous region in memory.
+ * \param buf Buffer to free.
+ */
+void VTAMemFree(void* buf);
+
+/*!
+ * \brief Returns a physical address to the region of memory allocated with VTAMemAlloc.
+ * \param buf Pointer to memory region allocated with VTAMemAlloc.
+ * \return The physical address of the memory region.
+ */
+uint32_t VTAGetMemPhysAddr(void* buf);
+
+/*!
+ * \brief Flushes the region of memory out of the CPU cache to DRAM.
+ * \param buf Pointer to memory region allocated with VTAMemAlloc to be flushed.
+ * \param size Size of the region to flush in Bytes.
+ */
+void VTAFlushCache(void* buf, int size);
+
+/*!
+ * \brief Invalidates the region of memory that is cached.
+ * \param buf Pointer to memory region allocated with VTAMemAlloc to be invalidated.
+ * \param size Size of the region to invalidate in Bytes.
+ */
+void VTAInvalidateCache(void* buf, int size);
+
+/*!
+ * \brief Returns a memory map to FPGA configuration registers.
+ * \param addr The base physical address of the configuration registers.
+ * \param length The size of the memory mapped region in bytes.
+ * \return A pointer to the memory mapped region.
+ */
+void *VTAMapRegister(unsigned addr, size_t length);
+
+/*!
+ * \brief Deletes the configuration register memory map.
+ * \param vta The memory mapped region.
+ * \param length The size of the memory mapped region in bytes.
+ */
+void VTAUnmapRegister(void *vta, size_t length);
+
+/*!
+ * \brief Writes to a memory mapped configuration register.
+ * \param vta_base The handle to the memory mapped configuration registers.
+ * \param offset The offset of the register to write to.
+ * \param val The value to be written to the memory mapped register.
+ */
+void VTAWriteMappedReg(VTAHandle vta_base, unsigned offset, unsigned val);
+
+/*!
+ * \brief Reads from the memory mapped configuration register.
+ * \param vta_base The handle to the memory mapped configuration registers.
+ * \param offset The offset of the register to read from.
+ * \return The value read from the memory mapped register.
+ */
+unsigned VTAReadMappedReg(VTAHandle vta_base, unsigned offset);
+
+/*!
+ * \brief Programming the bit stream on the FPGA.
+ * \param bitstream The path to the bit stream file.
+ */
+void VTAProgram(const char* bitstream);
+
+#ifdef __cplusplus
+}
+#endif
+#endif // VTA_DRIVER_H_
diff --git a/vta/include/vta_params.h b/vta/include/vta/hw_spec.h
similarity index 99%
rename from vta/include/vta_params.h
rename to vta/include/vta/hw_spec.h
index 748e77d9..b18e94e6 100644
--- a/vta/include/vta_params.h
+++ b/vta/include/vta/hw_spec.h
@@ -3,8 +3,13 @@
  * \file vta_defines.h
  * \brief Preprocessor definitions for VTA HLS design and runtime.
  */
-#ifndef VTA_DEFINES_H_
-#define VTA_DEFINES_H_
+
+#ifndef VTA_HW_SPEC_H_
+#define VTA_HW_SPEC_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 #include <stdint.h>
 
@@ -556,4 +561,7 @@ typedef struct {
   uint32_t wgt_idx    : LOG_WGT_BUFF_DEPTH;
 } VTAUop;
 
-#endif // VTA_DEFINES_H_
+#ifdef __cplusplus
+}
+#endif
+#endif // VTA_HW_SPEC_H_
diff --git a/vta/include/vta/runtime.h b/vta/include/vta/runtime.h
new file mode 100644
index 00000000..e1aae32f
--- /dev/null
+++ b/vta/include/vta/runtime.h
@@ -0,0 +1,274 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file runtime.h
+ * \brief VTA runtime library.
+ */
+
+#ifndef VTA_RUNTIME_H_
+#define VTA_RUNTIME_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "./driver.h"
+
+#define VTA_MEMCPY_H2D 1
+#define VTA_MEMCPY_D2H 2
+#define VTA_MEMCPY_D2D 3
+
+#define VTA_DEBUG_DUMP_INSN (1 << 1)
+#define VTA_DEBUG_DUMP_UOP (1 << 2)
+#define VTA_DEBUG_SKIP_READ_BARRIER (1 << 3)
+#define VTA_DEBUG_SKIP_WRITE_BARRIER (1 << 4)
+#define VTA_DEBUG_FORCE_SERIAL (1 << 5)
+
+/*! \brief VTA command handle */
+typedef void * VTACommandHandle;
+
+/*! \brief Shutdown hook of VTA to cleanup resources */
+void VTARuntimeShutdown();
+
+/*!
+ * \brief Get thread local command handle.
+ * \return A thread local command handle.
+ */
+VTACommandHandle VTATLSCommandHandle();
+
+/*!
+ * \brief Allocate data buffer.
+ * \param cmd The VTA command handle.
+ * \param size Buffer size.
+ * \return A pointer to the allocated buffer.
+ */
+void* VTABufferAlloc(VTACommandHandle cmd, size_t size);
+
+/*!
+ * \brief Free data buffer.
+ * \param cmd The VTA command handle.
+ * \param buffer The data buffer to be freed.
+ */
+void VTABufferFree(VTACommandHandle cmd, void* buffer);
+
+/*!
+ * \brief Get the buffer access pointer on CPU.
+ * \param cmd The VTA command handle.
+ * \param buffer The data buffer.
+ * \return The pointer that can be accessed by the CPU.
+ */
+void* VTABufferCPUPtr(VTACommandHandle cmd, void* buffer);
+
+/*!
+ * \brief Copy data buffer from one location to another.
+ * \param cmd The VTA command handle.
+ * \param from The source buffer base address.
+ * \param from_offset The offset of the source buffer.
+ * \param to The target buffer base address.
+ * \param to_offset The offset of the target buffer.
+ * \param size Size of copy.
+ * \param kind_mask The memory copy kind.
+ */
+void VTABufferCopy(VTACommandHandle cmd,
+                   const void* from,
+                   size_t from_offset,
+                   void* to,
+                   size_t to_offset,
+                   size_t size,
+                   int kind_mask);
+
+/*!
+ * \brief Set debug mode on the command handle.
+ * \param cmd The VTA command handle.
+ * \param debug_flag The debug flag.
+ */
+void VTASetDebugMode(VTACommandHandle cmd, int debug_flag);
+
+/*!
+ * \brief Perform a write barrier to make a memory region visible to the CPU.
+ * \param cmd The VTA command handle.
+ * \param buffer The head buffer pointer.
+ * \param elem_bits The size in bits of each element.
+ * \param start The start of the region (in elements).
+ * \param extent The end of the region (in elements).
+ */
+void VTAWriteBarrier(VTACommandHandle cmd,
+                     void* buffer, uint32_t elem_bits,
+                     uint32_t start, uint32_t extent);
+
+/*!
+ * \brief Perform a read barrier to a memory region visible to VTA.
+ * \param cmd The VTA command handle.
+ * \param buffer The head buffer pointer.
+ * \param elem_bits The size in bits of each element.
+ * \param start The start of the region (in elements).
+ * \param extent The end of the region (in elements).
+ */
+void VTAReadBarrier(VTACommandHandle cmd,
+                    void* buffer, uint32_t elem_bits,
+                    uint32_t start, uint32_t extent);
+
+/*!
+ * \brief Perform a 2D data load from DRAM.
+ *  Sizes are measured in units of vector elements.
+ * \param cmd The VTA command handle.
+ * \param src_dram_addr Source DRAM address.
+ * \param src_elem_offset The source DRAM offset in number of unit elements.
+ * \param x_size The lowest dimension (x axis) size in number of unit elements.
+ * \param y_size The number of rows (y axis).
+ * \param x_stride The x axis stride.
+ * \param x_pad_before The start padding on x axis.
+ * \param y_pad_before The start padding on y axis.
+ * \param x_pad_after The end padding on x axis.
+ * \param y_pad_after The end padding of y axis.
+ * \param dst_sram_index Destination SRAM index.
+ * \param dst_memory_type Destination memory type.
+ */
+void VTALoadBuffer2D(VTACommandHandle cmd,
+                     void* src_dram_addr,
+                     uint32_t src_elem_offset,
+                     uint32_t x_size,
+                     uint32_t y_size,
+                     uint32_t x_stride,
+                     uint32_t x_pad_before,
+                     uint32_t y_pad_before,
+                     uint32_t x_pad_after,
+                     uint32_t y_pad_after,
+                     uint32_t dst_sram_index,
+                     uint32_t dst_memory_type);
+
+/*!
+ * \brief Perform a 2D data store into DRAM
+ *  Sizes are measured in units of vector elements.
+ * \param cmd The VTA command handle.
+ * \param src_sram_index Source SRAM index.
+ * \param src_memory_type Source memory type.
+ * \param dst_dram_addr Destination DRAM address.
+ * \param x_size The lowest dimension (x axis) size in number of unit elements.
+ * \param y_size The number of rows.
+ * \param x_stride The x axis stride.
+ */
+void VTAStoreBuffer2D(VTACommandHandle cmd,
+                      uint32_t src_sram_index,
+                      uint32_t src_memory_type,
+                      void* dst_dram_addr,
+                      uint32_t dst_elem_offset,
+                      uint32_t x_size,
+                      uint32_t y_size,
+                      uint32_t x_stride);
+
+/*!
+ * \brief Push uop into kernel buffer.
+ * In GEMM mode, do a blocked GEMM with 2d access pattern.
+ * In ALU mode, do a vectorized ALU operation with 2d access pattern.
+ *
+ *  \code
+ *
+ *   DType accum[INP_BUFF_DEPTH][l][n];
+ *   DType weight[WGT_BUFF_DEPTH][n][m];
+ *   DType input[INP_BUFF_DEPTH][l][m];
+ *   if reset_out == 1
+ *    accum[dst_index] = 0
+ *   elif mode == 0
+ *    accum[dst_index] += GEMM(input[src_index], weight[wgt_index]);
+ *   else
+ *    if (use_imm)
+ *      accum[dst_index] = opcode(accum[dst_index], imm_val);
+ *    else
+ *      accum[dst_index] = opcode(accum[dst_index], accum[src_index]);
+ *
+ *  \endcode
+ *
+ * \param mode Selects GEMM mode if set to 0, ALU mode if set to 1.
+ * \param reset_out Resets the accum to 0.
+ * \param dst_index The accum memory index.
+ * \param src_index The input memory (gemm) / accum memory (alu) index.
+ * \param wgt_index The weight memory index.
+ * \param opcode The ALU opcode.
+ * \param use_imm Use immediate in ALU mode if set to true.
+ * \param imm_val Immediate value in ALU mode.
+ */
+void VTAUopPush(uint32_t mode,
+                uint32_t reset_out,
+                uint32_t dst_index,
+                uint32_t src_index,
+                uint32_t wgt_index,
+                uint32_t opcode,
+                uint32_t use_imm,
+                uint32_t imm_val);
+
+/*!
+ * \brief Mark start of a micro op loop.
+ * \param extent The extent of the loop.
+ * \param dst_factor The accum factor.
+ * \param src_factor The input factor.
+ * \param wgt_factor The weight factor.
+ */
+void VTAUopLoopBegin(uint32_t extent,
+                     uint32_t dst_factor,
+                     uint32_t src_factor,
+                     uint32_t wgt_factor);
+
+/*!
+ * \brief Mark end of a micro op loop.
+ */
+void VTAUopLoopEnd();
+
+/*!
+ * \brief Push GEMM uop kernel into the command handle.
+ * \param uop_handle The uop cache handle.
+ * \param finit The initialization function to initialize uop.
+ * \param signature The closure arguments of the finit.
+ * \param nbytes Number of bytes in the closure arguments.
+ * \return 0 if success.
+ */
+int VTAPushGEMMOp(void** uop_handle,
+                  int (*finit)(void*),
+                  void* signature,
+                  int nbytes);
+
+/*!
+ * \brief Push ALU uop kernel into the command handle.
+ * \param uop_handle The uop cache handle.
+ * \param finit The initialization function to initialize uop.
+ * \param signature The closure arguments of the finit.
+ * \param nbytes Number of bytes in the closure arguments.
+ * \return 0 if success.
+ */
+int VTAPushALUOp(void** uop_handle,
+                 int (*finit)(void*),
+                 void* signature,
+                 int nbytes);
+
+/*!
+ * \brief Push dependence token.
+ * \param cmd The VTA command handle.
+ * \param from_qid The source queue.
+ * \param to_qid The destination queue.
+ * \return 0 if success.
+ */
+int VTADepPush(VTACommandHandle cmd, int from_qid, int to_qid);
+
+/*!
+ * \brief Pop dependence signal.
+ * \param cmd The VTA command handle.
+ * \param from_qid The source queue.
+ * \param to_qid The destination queue.
+ * \return 0 if success.
+ */
+int VTADepPop(VTACommandHandle cmd, int from_qid, int to_qid);
+
+/*!
+ * \brief Synchronize the command handle.
+ *  Commit all the instructions to VTA and wait until
+ *  the accelerator finishes its job.
+ *  Perform all of the out-of-order DRAM stores.
+ * \param cmd The VTA command handle.
+ * \param wait_cycles The limit of poll cycles.
+ *
+ */
+void VTASynchronize(VTACommandHandle cmd, uint32_t wait_cycles);
+
+#ifdef __cplusplus
+}
+#endif
+#endif  // VTA_RUNTIME_H_
diff --git a/vta/include/vta_pynq_driver.h b/vta/include/vta_pynq_driver.h
deleted file mode 100644
index 9a078468..00000000
--- a/vta/include/vta_pynq_driver.h
+++ /dev/null
@@ -1,152 +0,0 @@
-/*!
- *  Copyright (c) 2018 by Contributors
- * \file vta_pynq_driver.h
- * \brief VTA driver for Pynq board.
- */
-
-#ifndef VTA_PYNQ_DRIVER_H_
-#define VTA_PYNQ_DRIVER_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-#include <assert.h>
-#include <fcntl.h>
-#include <stddef.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <time.h>
-#include <unistd.h>
-
-#ifdef __arm__
-#include "libxlnk_cma.h"
-#else
-void* cma_alloc(size_t size, int cached);
-void cma_free(void* buf);
-uint32_t cma_get_phy_addr(void* buf);
-void xlnkFlushCache(void* buf, int size);
-void xlnkInvalidateCache(void* buf, int size);
-#endif
-
-/*! \brief VTA command handle */
-typedef void * VTAHandle;
-
-/*! \brief DMA command handle */
-typedef struct {
-  /*! \brief Register map to the AXI DMA control registers*/
-  void *dma_register_map;
-  /*! \brief Transmit data descriptor*/
-  void *mm2s_descriptor_register_map;
-  /*! \brief Receive data descriptor*/
-  void *s2mm_descriptor_register_map;
-  /*! \brief Transmit data descriptor physical address*/
-  uint32_t mm2s_descriptor_phy;
-  /*! \brief Receive data descriptor physical address*/
-  uint32_t s2mm_descriptor_phy;
-  /*! \brief Descriptor size */
-  uint32_t descriptor_size;
-  /*! \brief Transaction count for tx channel */
-  uint32_t mm2s_count;
-  /*! \brief Transaction count for rx channel */
-  uint32_t s2mm_count;
-  /*! \brief Multi-channel mode enable */
-  int multichannel_en;
-} DMAHandle;
-
-/*! \brief partial bitstream status file path */
-#define BS_IS_PARTIAL "/sys/devices/soc0/amba/f8007000.devcfg/is_partial_bitstream"
-/*! \brief bitstream destination file path */
-#define BS_XDEVCFG "/dev/xdevcfg"
-
-/*! \brief Path to /dev/mem */
-#define DEV_MEM_PATH "/dev/mem"
-/*! \brief MMIO driver constant */
-#define MMIO_WORD_LENGTH 4
-/*! \brief MMIO driver constant */
-#define MMIO_WORD_MASK (~(MMIO_WORD_LENGTH - 1))
-
-/*! \brief VTA configuration register address range */
-#define VTA_RANGE 0x100
-/*! \brief VTA configuration register start value */
-#define VTA_START 0x1
-/*! \brief VTA configuration register auto-restart value */
-#define VTA_AUTORESTART 0x81
-/*! \brief VTA configuration register done value */
-#define VTA_DONE 0x1
-
-/*! \brief VTA fetch stage configuration register address
-*   from auto-generated XPAR_FETCH_0_S_AXI_CONTROL_BUS_BASEADDR define
-*   in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
-*/
-#define VTA_FETCH_ADDR    0x43C00000
-/*! \brief VTA compute stage configuration register address
-*   from auto-generated XPAR_COMPUTE_0_S_AXI_CONTROL_BUS_BASEADDR define
-*   in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
-*/
-#define VTA_COMPUTE_ADDR  0x43C10000
-/*! \brief VTA compute stage configuration register address
-*   from auto-generated XPAR_LOAD_0_S_AXI_CONTROL_BUS_BASEADDR define
-*   in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
-*/
-#define VTA_LOAD_ADDR     0x43C20000
-/*! \brief VTA store stage configuration register address
-*   from auto-generated XPAR_STORE_0_S_AXI_CONTROL_BUS_BASEADDR define
-*   in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
-*/
-#define VTA_STORE_ADDR    0x43C30000
-
-/*! \brief Memory management constants with libxlnk_cma */
-#define CACHED 1
-/*! \brief Memory management constants with libxlnk_cma */
-#define NOT_CACHED 0
-
-/*! \brief log2 of SDS buffer size limit */
-#define LOG_MAX_XFER 22
-/*! \brief SDS buffer size limit */
-#define MAX_XFER (1<<LOG_MAX_XFER)
-
-/*!
- * \brief Returns a memory map to FPGA configuration registers.
- * \param addr The base physical address of the configuration registers.
- * \param length The size of the memory mapped region in bytes.
- * \return A pointer to the memory mapped region.
- */
-void *MapRegister(unsigned addr, size_t length);
-
-/*!
- * \brief Deletes the configuration register memory map.
- * \param vta The memory mapped region.
- * \param length The size of the memory mapped region in bytes.
- */
-void UnmapRegister(void *vta, size_t length);
-
-/*!
- * \brief Writes to a memory mapped configuration register.
- * \param vta_base The handle to the memory mapped configuration registers.
- * \param offset The offset of the register to write to.
- * \param val The value to be written to the memory mapped register.
- */
-void WriteMappedReg(VTAHandle vta_base, unsigned offset, unsigned val);
-
-/*!
- * \brief Reads from the memory mapped configuration register.
- * \param vta_base The handle to the memory mapped configuration registers.
- * \param offset The offset of the register to read from.
- * \return The value read from the memory mapped register.
- */
-unsigned ReadMappedReg(VTAHandle vta_base, unsigned offset);
-
-/*!
- * \brief Programming the bit stream on the FPGA.
- * \param bitstream The path to the bit stream file.
- */
-void ProgramVTA(const char* bitstream);
-
-#ifdef __cplusplus
-}
-#endif
-#endif  // VTA_PYNQ_DRIVER_H_
\ No newline at end of file
diff --git a/vta/make/config.mk b/vta/make/config.mk
index 76d817fb..8142571a 100644
--- a/vta/make/config.mk
+++ b/vta/make/config.mk
@@ -25,3 +25,72 @@ ADD_LDFLAGS=
 
 # the additional compile flags you want to add
 ADD_CFLAGS=
+
+# the hardware target
+TARGET=PYNQ_TARGET
+
+#---------------------
+# VTA hardware parameters
+#--------------------
+
+#  Log of input/activation width in bits (default 3 -> 8 bits)
+LOG_INP_WIDTH = 3
+#  Log of kernel weight width in bits (default 3 -> 8 bits)
+LOG_WGT_WIDTH = 3
+#  Log of accum width in bits (default 5 -> 32 bits)
+LOG_ACC_WIDTH = 5
+#  Log of tensor batch size (A in (A,B)x(B,C) matrix multiplication)
+LOG_BATCH = 0
+#  Log of tensor inner block size (B in (A,B)x(B,C) matrix multiplication)
+LOG_BLOCK_IN = 4
+#  Log of tensor outer block size (C in (A,B)x(B,C) matrix multiplication)
+LOG_BLOCK_OUT = 4
+#  Log of uop buffer size in Bytes
+LOG_UOP_BUFF_SIZE = 15
+#  Log of inp buffer size in Bytes
+LOG_INP_BUFF_SIZE = 15
+#  Log of wgt buffer size in Bytes
+LOG_WGT_BUFF_SIZE = 15
+#  Log of acc buffer size in Bytes
+LOG_ACC_BUFF_SIZE = 17
+
+#---------------------
+# Derived VTA hardware parameters
+#--------------------
+
+#  Input width in bits
+INP_WIDTH = $(shell echo "$$(( 1 << $(LOG_INP_WIDTH) ))" )
+#  Weight width in bits
+WGT_WIDTH = $(shell echo "$$(( 1 << $(LOG_WGT_WIDTH) ))" )
+#  Log of output width in bits
+LOG_OUT_WIDTH = $(LOG_INP_WIDTH)
+#  Output width in bits
+OUT_WIDTH = $(shell echo "$$(( 1 << $(LOG_OUT_WIDTH) ))" )
+#  Tensor batch size
+BATCH = $(shell echo "$$(( 1 << $(LOG_BATCH) ))" )
+#  Tensor outer block size
+IN_BLOCK = $(shell echo "$$(( 1 << $(LOG_BLOCK_IN) ))" )
+#  Tensor inner block size
+OUT_BLOCK = $(shell echo "$$(( 1 << $(LOG_BLOCK_OUT) ))" )
+#  Uop buffer size in Bytes
+UOP_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_UOP_BUFF_SIZE) ))" )
+#  Inp buffer size in Bytes
+INP_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_INP_BUFF_SIZE) ))" )
+#  Wgt buffer size in Bytes
+WGT_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_WGT_BUFF_SIZE) ))" )
+#  Acc buffer size in Bytes
+ACC_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_ACC_BUFF_SIZE) ))" )
+#  Log of out buffer size in Bytes
+LOG_OUT_BUFF_SIZE = $(shell echo "$$(( $(LOG_ACC_BUFF_SIZE)+$(LOG_OUT_WIDTH)-$(LOG_ACC_WIDTH) ))" )
+#  Out buffer size in Bytes
+OUT_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_OUT_BUFF_SIZE) ))" )
+
+# Update ADD_CFLAGS
+ADD_CFLAGS += \
+	-D$(TARGET) \
+	-DLOG_WGT_WIDTH=$(LOG_WGT_WIDTH) -DLOG_INP_WIDTH=$(LOG_INP_WIDTH) \
+	-DLOG_ACC_WIDTH=$(LOG_ACC_WIDTH) -DLOG_OUT_WIDTH=$(LOG_OUT_WIDTH) \
+	-DLOG_BATCH=$(LOG_BATCH) -DLOG_BLOCK_IN=$(LOG_BLOCK_IN) -DLOG_BLOCK_OUT=$(LOG_BLOCK_OUT) \
+	-DLOG_UOP_BUFF_SIZE=$(LOG_UOP_BUFF_SIZE) -DLOG_INP_BUFF_SIZE=$(LOG_INP_BUFF_SIZE) \
+	-DLOG_WGT_BUFF_SIZE=$(LOG_WGT_BUFF_SIZE) -DLOG_ACC_BUFF_SIZE=$(LOG_ACC_BUFF_SIZE) \
+	-DLOG_OUT_BUFF_SIZE=$(LOG_OUT_BUFF_SIZE)
\ No newline at end of file
diff --git a/vta/src/driver/pynq/vta_pynq_driver.c b/vta/src/pynq/pynq_driver.cc
similarity index 67%
rename from vta/src/driver/pynq/vta_pynq_driver.c
rename to vta/src/pynq/pynq_driver.cc
index ca9464cf..b4f78db0 100644
--- a/vta/src/driver/pynq/vta_pynq_driver.c
+++ b/vta/src/pynq/pynq_driver.cc
@@ -4,15 +4,31 @@
  * \brief VTA driver for Pynq board.
  */
 
-#ifdef __cplusplus
-extern "C" {
-#endif
-#include "vta_pynq_driver.h"
-#ifdef __cplusplus
-}
-#endif
+#include <vta/driver.h>
+#include "./pynq_driver.h"
 
-void *MapRegister(uint32_t addr, size_t length) {
+
+void* VTAMemAlloc(size_t size, int cached) {
+  return cma_alloc(size, cached);
+}
+
+void VTAMemFree(void* buf) {
+  cma_free(buf);
+}
+
+uint32_t VTAGetMemPhysAddr(void* buf) {
+  return cma_get_phy_addr(buf);
+}
+
+void VTAFlushCache(void* buf, int size) {
+  xlnkFlushCache(buf, size);
+}
+
+void VTAInvalidateCache(void* buf, int size) {
+  xlnkInvalidateCache(buf, size);
+}
+
+void *VTAMapRegister(uint32_t addr, size_t length) {
 
   // Align the base address with the pages
   uint32_t virt_base = addr & ~(getpagesize() - 1);
@@ -24,21 +40,21 @@ void *MapRegister(uint32_t addr, size_t length) {
   return mmap(NULL, (length+virt_offset), PROT_READ|PROT_WRITE, MAP_SHARED, mmap_file, virt_base);
 }
 
-void UnmapRegister(void *vta, size_t length) {
+void VTAUnmapRegister(void *vta, size_t length) {
   // Unmap memory
   int status = munmap(vta, length);
   assert(status==0);
 }
 
-void WriteMappedReg(void* base_addr, uint32_t offset, uint32_t val) {
+void VTAWriteMappedReg(void* base_addr, uint32_t offset, uint32_t val) {
   *((volatile uint32_t *) (((char *) base_addr) + offset)) = val;
 }
 
-uint32_t ReadMappedReg(void* base_addr, uint32_t offset) {
+uint32_t VTAReadMappedReg(void* base_addr, uint32_t offset) {
   return *((volatile uint32_t *) (((char *) base_addr) + offset));
 }
 
-void ProgramVTA(const char* bitstream) {
+void VTAProgram(const char* bitstream) {
 
     int elem;
     FILE *src, *dst, *partial;
diff --git a/vta/src/pynq/pynq_driver.h b/vta/src/pynq/pynq_driver.h
new file mode 100644
index 00000000..9e948282
--- /dev/null
+++ b/vta/src/pynq/pynq_driver.h
@@ -0,0 +1,83 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file pynq_driver.h
+ * \brief VTA driver for Pynq board.
+ */
+
+#ifndef VTA_PYNQ_DRIVER_H_
+#define VTA_PYNQ_DRIVER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include <assert.h>
+#include <fcntl.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <time.h>
+#include <unistd.h>
+
+#ifdef __arm__
+#include <libxlnk_cma.h>
+#else
+void* cma_alloc(size_t size, int cached);
+void cma_free(void* buf);
+uint32_t cma_get_phy_addr(void* buf);
+void xlnkFlushCache(void* buf, int size);
+void xlnkInvalidateCache(void* buf, int size);
+#endif
+
+/*! \brief partial bitstream status file path */
+#define BS_IS_PARTIAL "/sys/devices/soc0/amba/f8007000.devcfg/is_partial_bitstream"
+/*! \brief bitstream destination file path */
+#define BS_XDEVCFG "/dev/xdevcfg"
+
+/*! \brief Path to /dev/mem */
+#define DEV_MEM_PATH "/dev/mem"
+/*! \brief MMIO driver constant */
+#define MMIO_WORD_LENGTH 4
+/*! \brief MMIO driver constant */
+#define MMIO_WORD_MASK (~(MMIO_WORD_LENGTH - 1))
+
+/*! \brief VTA configuration register address range */
+#define VTA_RANGE 0x100
+/*! \brief VTA configuration register start value */
+#define VTA_START 0x1
+/*! \brief VTA configuration register auto-restart value */
+#define VTA_AUTORESTART 0x81
+/*! \brief VTA configuration register done value */
+#define VTA_DONE 0x1
+
+/*! \brief VTA fetch stage configuration register address
+*   from auto-generated XPAR_FETCH_0_S_AXI_CONTROL_BUS_BASEADDR define
+*   in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
+*/
+#define VTA_FETCH_ADDR    0x43C00000
+/*! \brief VTA compute stage configuration register address
+*   from auto-generated XPAR_COMPUTE_0_S_AXI_CONTROL_BUS_BASEADDR define
+*   in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
+*/
+#define VTA_COMPUTE_ADDR  0x43C10000
+/*! \brief VTA load stage configuration register address
+*   from auto-generated XPAR_LOAD_0_S_AXI_CONTROL_BUS_BASEADDR define
+*   in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
+*/
+#define VTA_LOAD_ADDR     0x43C20000
+/*! \brief VTA store stage configuration register address
+*   from auto-generated XPAR_STORE_0_S_AXI_CONTROL_BUS_BASEADDR define
+*   in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
+*/
+#define VTA_STORE_ADDR    0x43C30000
+
+/*! \brief Buffer size limit */
+#define MAX_XFER (1<<22)
+
+#ifdef __cplusplus
+}
+#endif
+#endif  // VTA_PYNQ_DRIVER_H_
\ No newline at end of file
diff --git a/vta/src/runtime.cc b/vta/src/runtime.cc
new file mode 100644
index 00000000..570816e5
--- /dev/null
+++ b/vta/src/runtime.cc
@@ -0,0 +1,1410 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file runtime.cc
+ * \brief VTA runtime for PYNQ in C++11
+ */
+
+#include <cassert>
+#include <cstring>
+#include <vector>
+#include <thread>
+#include <memory>
+#include <atomic>
+#include <vta/driver.h>
+#include <vta/hw_spec.h>
+#include <vta/runtime.h>
+
+#ifdef PYNQ_TARGET
+#include "./pynq/pynq_driver.h"
+#endif //PYNQ_TARGET
+
+namespace vta {
+
+/*! \brief Enable coherent access between VTA and CPU. */
+static const bool kBufferCoherent = true;
+
+/*!
+ * \brief Data buffer represents data on CMA.
+ */
+struct DataBuffer {
+  /*! \return Virtual address of the data. */
+  void* virt_addr() const {
+    return data_;
+  }
+  /*! \return Physical address of the data. */
+  uint32_t phy_addr() const {
+    return phy_addr_;
+  }
+  /*!
+   * \brief Invalidate the cache of given location in data buffer.
+   * \param offset The offset to the data.
+   * \param size The size of the data.
+   */
+  void InvalidateCache(size_t offset, size_t size) {
+    if (!kBufferCoherent) {
+      VTAInvalidateCache(reinterpret_cast<void*>(phy_addr_ + offset), size);
+    }
+  }
+  /*!
+   * \brief Flush the cache of given location in data buffer.
+   * \param offset The offset to the data.
+   * \param size The size of the data.
+   */
+  void FlushCache(size_t offset, size_t size) {
+    if (!kBufferCoherent) {
+      VTAFlushCache(reinterpret_cast<void*>(phy_addr_ + offset), size);
+    }
+  }
+  /*!
+   * \brief Allocate a buffer of a given size.
+   * \param size The size of the buffer.
+   */
+  static DataBuffer* Alloc(size_t size) {
+    void* data = VTAMemAlloc(size, 1);
+    assert(data != nullptr);
+    DataBuffer* buffer = new DataBuffer();
+    buffer->data_ = data;
+    buffer->phy_addr_ = VTAGetMemPhysAddr(data);
+    return buffer;
+  }
+  /*!
+   * \brief Free the data buffer.
+   * \param buffer The buffer to be freed.
+   */
+  static void Free(DataBuffer* buffer) {
+    VTAMemFree(buffer->data_);
+    delete buffer;
+  }
+  /*!
+   * \brief Create data buffer header from buffer ptr.
+   * \param buffer The buffer pointer.
+   * \return The corresponding data buffer header.
+   */
+  static DataBuffer* FromHandle(const void* buffer) {
+    return const_cast<DataBuffer*>(
+        reinterpret_cast<const DataBuffer*>(buffer));
+  }
+
+ private:
+  /*! \brief The internal data. */
+  void* data_;
+  /*! \brief The physical address of the buffer, excluding header. */
+  uint32_t phy_addr_;
+};
+
+/*!
+ * \brief Micro op kernel.
+ *  Contains functions to construct the kernel with prefix Push.
+ */
+class UopKernel {
+ public:
+  /*! \brief Loop information. */
+  struct LoopEntry {
+    uint32_t extent;
+    uint32_t dst_factor;
+    uint32_t src_factor;
+    uint32_t wgt_factor;
+  };
+  /*!
+   * \brief Construct UopKernel with signature.
+   * \param signature The pointer to signature.
+   * \param nbytes Number of bytes.
+   */
+  UopKernel(const char* signature, int nbytes)
+      : signature_(signature, signature + nbytes) {
+  }
+  /*!
+   * \brief Verify if the signature is correct.
+   * \param signature Signature ptr.
+   * \param nbytes Number of bytes.
+   */
+  bool MatchSignature(void* signature, int nbytes) const {
+    if (static_cast<size_t>(nbytes) != signature_.size()) return false;
+    return memcmp(signature, signature_.data(), nbytes) == 0;
+  }
+  /*! \return Whether the kernel is cached in SRAM. */
+  bool cached() const {
+    return sram_begin_ != sram_end_;
+  }
+  /*! \return The length of the micro op sequence. */
+  size_t size() const {
+    return seq_.size();
+  }
+  /*! \return The micro-op data. */
+  const VTAUop* data() const {
+    return seq_.data();
+  }
+  /*! \return The loop structure. */
+  const std::vector<LoopEntry>& loop() const {
+    return loop_;
+  }
+  /*!
+   * \brief Declare loop start.
+   * \param extent The loop extent.
+   * \param dst_factor Loop factor of accum index.
+   * \param src_factor Loop factor of input index
+   * \param wgt_factor Loop factor of weight index.
+   */
+  void PushLoopBegin(uint32_t extent,
+                     uint32_t dst_factor,
+                     uint32_t src_factor,
+                     uint32_t wgt_factor) {
+    LoopEntry le;
+    le.extent = extent;
+    le.dst_factor = dst_factor;
+    le.src_factor = src_factor;
+    le.wgt_factor = wgt_factor;
+    assert(seq_.size() == 0);
+    assert(loop_.size() < 2);
+    loop_.push_back(le);
+    ++loop_ptr_;
+  }
+  /*!
+   * \brief Declare loop end.
+   */
+  void PushLoopEnd() {
+    --loop_ptr_;
+  }
+  /*!
+   * \brief Push micro op into kernel.
+   * \param mode Set to GEMM mode if set to 0, ALU mode is set to 1.
+   * \param reset_out Resets the accum to 0.
+   * \param dst_index The accum memory index.
+   * \param src_index The input memory (gemm) / accum memory (alu) index.
+   * \param wgt_index The weight memory index.
+   * \param opcode The ALU opcode.
+   * \param use_imm Use immediate in ALU mode if non-zero.
+   * \param imm_val Immediate value in ALU mode (kept truncated to 16 bits).
+   */
+  void Push(uint32_t mode,
+            uint32_t reset_out,
+            uint32_t dst_index,
+            uint32_t src_index,
+            uint32_t wgt_index,
+            uint32_t opcode,
+            uint32_t use_imm,
+            uint32_t imm_val) {
+    // Reject a write to an accum index used by either of the last two uops
+    VerifyDep(dst_index);
+    VTAUop op;
+    op.reset_out = reset_out;
+    op.dst_idx = dst_index;
+    op.src_idx = src_index;
+    op.wgt_idx = wgt_index;
+    seq_.push_back(op);
+    // Ensure that mode is consistent if set
+    if (mode_==0xFFFFFFFF) {
+      mode_ = mode;
+    } else {
+      assert(mode_==mode);
+    }
+    // Check kernel op and imm/imm_val in ALU mode
+    if (mode==1) {
+      if (opcode_==0xFFFFFFFF) {
+        opcode_=opcode;
+        use_imm_ = (use_imm != 0);
+        imm_val_ = static_cast<uint16_t>(imm_val);
+      } else {
+        assert(opcode_==opcode);
+        assert(use_imm_ == (use_imm != 0));  // compare as bool, like the stored flag
+        assert(imm_val_ == static_cast<uint16_t>(imm_val));  // compare the stored 16 bits
+      }
+    }
+  }
+  /*! \brief Dump kernel micro ops to stdout. */
+  void Dump() {
+    uint32_t size = seq_.size();
+    printf("There are %u uops\n", size);
+    for (uint32_t i = 0; i < size; ++i) {
+      printf("[%04u]\t acc=%u, inp=%u, wgt=%u, reset_out=%u\n",
+             i,
+             seq_[i].dst_idx,
+             seq_[i].src_idx,
+             seq_[i].wgt_idx,
+             seq_[i].reset_out);
+
+    }
+    printf("\n");
+  }
+
+ public:
+  // The kernel's mode, opcode, immediate setting and value
+  uint32_t mode_{0xFFFFFFFF}; // UOP type: 0xFFFFFFFF - unset, 0 - GEMM, 1 - ALU
+  uint32_t opcode_{0xFFFFFFFF};
+  bool use_imm_{false};
+  uint16_t imm_val_{0};
+ private:
+  // Verify that we don't write to the same acc_mem index two cycles in a row
+  void VerifyDep(uint32_t dst_index) {
+    size_t step = std::min(static_cast<size_t>(2U), seq_.size());
+    for (size_t i = seq_.size() - step; i < seq_.size(); ++i) {
+      assert(seq_[i].dst_idx != dst_index);
+    }
+  }
+  // The uop buffer
+  template<int, bool, bool>
+  friend class UopQueue;
+  friend class CommandQueue;
+  // SRAM location if begin != end.
+  uint32_t sram_begin_{0};
+  uint32_t sram_end_{0};
+  // The signature used for verification
+  std::vector<char> signature_;
+  // Internal sequence
+  std::vector<VTAUop> seq_;
+  // The loop nest structure specific to ALU instructions
+  std::vector<LoopEntry> loop_;
+  // The loop pointer
+  size_t loop_ptr_{0};
+};
+
+/*!
+ * \brief Base class of all queues to send and recv serial data.
+ *  This class is not a template: element size, capacity, coherence and
+ *  caching policy are passed at run time to InitSpace() as
+ *  (elem_bytes, max_bytes, coherent, always_cache).
+ *  The DRAM buffer obtained in InitSpace() is released in the destructor.
+ */
+class BaseQueue {
+ public:
+  ~BaseQueue() {
+    if (dram_buffer_ != nullptr) {
+      VTAMemFree(dram_buffer_);
+    }
+  }
+  /*! \return Content of DRAM buffer. */
+  char* dram_buffer() const {
+    return dram_buffer_;
+  }
+  /*! \return Physical address of DRAM. */
+  uint32_t dram_phy_addr() const {
+    return dram_phy_addr_;
+  }
+  /*! \return Whether there is pending information. */
+  bool pending() const {
+    return sram_begin_ != sram_end_;
+  }
+  /*! \brief Allocate max_bytes of DRAM (cached if coherent or always_cache) and record its physical address. */
+  void InitSpace(uint32_t elem_bytes, uint32_t max_bytes, bool coherent, bool always_cache) {
+    coherent_ = coherent;
+    always_cache_ = always_cache;
+    elem_bytes_ = elem_bytes;
+    dram_buffer_ = static_cast<char*>(VTAMemAlloc(
+        max_bytes, coherent || always_cache_));
+    assert(dram_buffer_ != nullptr);
+    dram_phy_addr_ = VTAGetMemPhysAddr(dram_buffer_);
+  }
+  /*!
+   * \brief Reset the pointer of the buffer.
+   *  Set SRAM pointer to be the current end.
+   */
+  void Reset() {
+    dram_begin_ = dram_end_ = 0;
+    sram_begin_ = sram_end_;
+  }
+  void AutoReadBarrier() {
+    ReadBarrier(elem_bytes_ * 8, 0, dram_end_);
+  }
+  /*! \brief Flush the cache (non-coherent cached buffers only) so that data written by CPU is visible to VTA. */
+  void ReadBarrier(uint32_t elem_bits, uint32_t dram_begin, uint32_t dram_extent) {
+    if (!coherent_ && always_cache_ && dram_extent != 0) {
+      dram_begin = dram_begin * elem_bits / 8;
+      dram_extent = dram_extent * elem_bits / 8;
+      VTAFlushCache(reinterpret_cast<void*>(dram_phy_addr_ + dram_begin),
+                     dram_extent);
+    }
+  }
+  /*! \brief Invalidate the cache (non-coherent cached buffers only) so that data written by VTA is visible to CPU. */
+  void WriteBarrier(uint32_t elem_bits, uint32_t dram_begin, uint32_t dram_extent) {
+    if (!coherent_ && always_cache_ && dram_extent != 0) {
+      dram_begin = dram_begin * elem_bits / 8;
+      dram_extent = dram_extent * elem_bits / 8;
+      VTAInvalidateCache(reinterpret_cast<void*>(dram_phy_addr_ + dram_begin),
+                          dram_extent);
+    }
+  }
+
+ protected:
+  // Cache coherence access
+  bool coherent_{false};
+  // Make the buffer cacheable
+  bool always_cache_{false};
+  // Element bytes
+  uint32_t elem_bytes_{0};
+  // Begin location of current SRAM read in FIFO mode
+  uint32_t sram_begin_{0};
+  // End location of current SRAM write in FIFO mode
+  uint32_t sram_end_{0};
+  // The current pending offset in DRAM in FIFO mode
+  uint32_t dram_begin_{0};
+  // The current pending offset in DRAM in FIFO mode
+  uint32_t dram_end_{0};
+  // The buffer in DRAM
+  char* dram_buffer_{nullptr};
+  // Physical address of the buffer (0 until InitSpace is called)
+  uint32_t dram_phy_addr_{0};
+};
+
+/*!
+ * \brief Micro op buffer that manages the micro op cache.
+ */
+template<int kMaxBytes, bool kCoherent, bool kAlwaysCache>
+class UopQueue : public BaseQueue {
+ public:
+  void InitSpace() {
+    BaseQueue::InitSpace(kElemBytes, kMaxBytes, kCoherent, kAlwaysCache);
+  }
+  // Push data to the queue
+  template<typename FAutoSync>
+  void Push(UopKernel* kernel, FAutoSync fautosync) {
+    if (kernel->cached()) return;
+    size_t num_op = kernel->size();
+    if (dram_end_ + num_op > kMaxElems) {
+      fautosync();
+      assert(dram_end_ <= kMaxElems);
+    }
+    assert(num_op <= kMaxNumUop);
+    uint32_t uop_begin = 0;
+    if (sram_end_ + num_op > kMaxElems) {
+      // Need to evict
+      cache_ptr_ = 0;
+      sram_end_ = num_op;
+    } else {
+      uop_begin = sram_end_;
+      sram_end_ += num_op;
+    }
+    // Simple eviction policy
+    uint32_t evict_begin = cache_ptr_;
+    for (;cache_ptr_ < cache_.size(); ++cache_ptr_) {
+      if (cache_[cache_ptr_]->sram_begin_ >= sram_end_) break;
+      cache_[cache_ptr_]->sram_begin_ = 0;
+      cache_[cache_ptr_]->sram_end_ = 0;
+    }
+    memcpy(dram_buffer_ + dram_end_ * kElemBytes,
+           kernel->data(),
+           num_op * kElemBytes);
+    dram_end_ += num_op;
+    kernel->sram_begin_ = uop_begin;
+    kernel->sram_end_ = sram_end_;
+    assert(uop_begin != sram_end_);
+    cache_.insert(cache_.begin() + cache_ptr_, kernel);
+    cache_.erase(cache_.begin() + evict_begin, cache_.begin() + cache_ptr_);
+    cache_ptr_ = evict_begin + 1;
+  }
+  // Flush pending micro-ops as a uop (MEM_ID_UOP) load instruction
+  void FlushUopLoad(VTAMemInsn* insn) {
+    if (sram_begin_ != sram_end_) {
+      assert((dram_end_ - dram_begin_) == (sram_end_ - sram_begin_));
+      insn->memory_type = MEM_ID_UOP;
+      insn->sram_base = sram_begin_;
+      insn->dram_base = dram_phy_addr_ / kElemBytes + dram_begin_;
+      insn->y_size = 1;
+      insn->x_size = (dram_end_ - dram_begin_);
+      insn->x_stride = (dram_end_ - dram_begin_);
+      insn->y_pad_0 = 0;
+      insn->y_pad_1 = 0;
+      insn->x_pad_0 = 0;
+      insn->x_pad_1 = 0;
+      // Reset indices
+      sram_begin_ = sram_end_;
+      dram_begin_ = dram_end_;
+    }
+  }
+
+ private:
+  // Cache pointer
+  uint32_t cache_ptr_{0};
+  // Cached ring, sorted by sram_begin
+  std::vector<UopKernel*> cache_;
+  // Constants
+  static constexpr int kElemBytes = sizeof(VTAUop);
+  static constexpr int kMaxNumUop = UOP_BUFF_DEPTH;
+  static constexpr int kMaxElems = kMaxBytes / kElemBytes;
+};
+
+// Internal kernel structure
+class UopKernelMap {
+ public:
+  // Simple hash map
+  UopKernel** Get(void* signature,
+                  int nbytes) {
+    uint32_t key = 0;
+    assert(nbytes == 0 || nbytes == sizeof(int));
+    if (nbytes == sizeof(int)) {
+      memcpy(&key, signature, sizeof(int));
+      key = key + 1;
+    }
+    assert(key < 100);
+    if (kmap_.size() <= key) {
+      kmap_.resize(key + 1, nullptr);
+    }
+    return &(kmap_[key]);
+  }
+
+ private:
+  std::vector<UopKernel*> kmap_;
+};
+
+enum PipelineStage : int {
+  kNoneStage = 0,
+  kLoadStage = 1,
+  kComputeStage = 2,
+  kStoreStage = 3
+};
+
+// Instruction Queue
+template<int kMaxBytes, bool kCoherent, bool kAlwaysCache>
+class InsnQueue : public BaseQueue {
+ public:
+  /*! \brief Initialize the space. */
+  void InitSpace() {
+    BaseQueue::InitSpace(kElemBytes, kMaxBytes, kCoherent, kAlwaysCache);
+    // Initialize the stage
+    std::fill(pending_pop_prev_, pending_pop_prev_ + 4, 0);
+    std::fill(pending_pop_next_, pending_pop_next_ + 4, 0);
+  }
+  /*! \return The data pointer. */
+  VTAGenericInsn* data() {
+    return reinterpret_cast<VTAGenericInsn*>(dram_buffer_);
+  }
+  /*! \return Number of instructions. */
+  uint32_t count() {
+    return dram_end_;
+  }
+  // Insert dependency pop: mark a pending pop from stage `from` on stage `to`
+  void DepPop(int from, int to) {
+    // NOTE: This instruction executes on queue[to]
+    if (from < to) {
+      if (pending_pop_prev_[to]) {
+        this->CommitPendingPop(to);
+      }
+      pending_pop_prev_[to] = 1;
+    } else {
+      if (pending_pop_next_[to]) {
+        this->CommitPendingPop(to);
+      }
+      pending_pop_next_[to] = 1;
+    }
+    // Impossible conditions: no direct load->store or store->load dependency
+    assert(from != kLoadStage || to != kStoreStage);
+    assert(from != kStoreStage || to != kLoadStage);
+  }
+  // Insert dependency push of load
+  void DepPush(int from, int to) {
+    // NOTE: this instruction executes on queue[from]
+    this->CommitPendingPop(from);
+    if (dram_end_ != 0) {
+      VTAMemInsn* mptr =
+          reinterpret_cast<VTAMemInsn*>(dram_buffer_) + dram_end_ - 1;
+      if (GetPipelineStage(mptr) == from) {
+        if (from < to && !mptr->push_next_dep) {
+          // push(LD->C) or push(C->ST)
+          mptr->push_next_dep = true; return;
+        } else if (from > to && !mptr->push_prev_dep) {
+          // push(C->LD) or push(ST->C)
+          mptr->push_prev_dep = true; return;
+        }
+      }
+    }
+    if (from < to) {
+      // Push next dep
+      PushNoop(from, false, true, false, false);
+    } else {
+      // Push prev dep
+      PushNoop(from, true, false, false, false);
+    }
+  }
+  // Create a new instruction for a GEMM stage
+  VTAGemInsn* CreateGemInsn() {
+    return reinterpret_cast<VTAGemInsn*>(
+        Create(kComputeStage));
+  }
+  // Create a new instruction for a ALU stage
+  VTAAluInsn* CreateAluInsn() {
+    return reinterpret_cast<VTAAluInsn*>(
+        Create(kComputeStage));
+  }
+  // Create a new instruction for a memory stage
+  VTAMemInsn* CreateMemInsn(int memory_type) {
+    return reinterpret_cast<VTAMemInsn*>(
+        Create(GetMemPipelineStage(memory_type)));
+  }
+  // create a new instruction for a store stage
+  VTAMemInsn* CreateStoreInsn() {
+    return reinterpret_cast<VTAMemInsn*>(
+        Create(kStoreStage));
+  }
+  // Rewrite instruction stream to force serial execution
+  void RewriteForceSerial() {
+    int insn_count = count();
+    VTAMemInsn* mem_ptr = reinterpret_cast<VTAMemInsn*>(data());
+    for (int i = 1; i < insn_count; ++i) {
+      PipelineStage prev = GetPipelineStage(mem_ptr + i - 1);
+      PipelineStage now = GetPipelineStage(mem_ptr + i);
+      if (prev==kLoadStage && now==kComputeStage) {
+        mem_ptr[i - 1].push_prev_dep = false;
+        mem_ptr[i - 1].push_next_dep = true;
+        mem_ptr[i].pop_prev_dep = true;
+        mem_ptr[i].pop_next_dep = false;
+      } else if (prev==kComputeStage && now==kLoadStage) {
+        mem_ptr[i - 1].push_prev_dep = true;
+        mem_ptr[i - 1].push_next_dep = false;
+        mem_ptr[i].pop_prev_dep = false;
+        mem_ptr[i].pop_next_dep = true;
+      } else if (prev==kStoreStage && now==kComputeStage) {
+        mem_ptr[i - 1].push_prev_dep = true;
+        mem_ptr[i - 1].push_next_dep = false;
+        mem_ptr[i].pop_prev_dep = false;
+        mem_ptr[i].pop_next_dep = true;
+      } else if (prev==kComputeStage && now==kStoreStage) {
+        mem_ptr[i - 1].push_prev_dep = false;
+        mem_ptr[i - 1].push_next_dep = true;
+        mem_ptr[i].pop_prev_dep = true;
+        mem_ptr[i].pop_next_dep = false;
+      } else {
+        mem_ptr[i - 1].push_prev_dep = false;
+        mem_ptr[i - 1].push_next_dep = false;
+        mem_ptr[i].pop_prev_dep = false;
+        mem_ptr[i].pop_next_dep = false;
+      }
+    }
+  }
+
+  // Helper function: Get Opcode string
+  const char* getOpcodeString(int opcode, bool use_imm) {
+      // The string name
+      if (opcode==ALU_OPCODE_MIN) {
+          if (use_imm) {
+              return "min imm";
+          } else {
+              return "min";
+          }
+      } else if (opcode==ALU_OPCODE_MAX) {
+          if (use_imm) {
+              return "max imm";
+          } else {
+              return "max";
+          }
+      } else if (opcode==ALU_OPCODE_ADD) {
+          if (use_imm) {
+              return "add imm";
+          } else {
+              return "add";
+          }
+      } else if (opcode==ALU_OPCODE_SUB) {
+          if (use_imm) {
+              return "sub imm";
+          } else {
+              return "sub";
+          }
+      } else if (opcode==ALU_OPCODE_MUL) {
+          if (use_imm) {
+              return "mul imm";
+          } else {
+              return "mul";
+          }
+      } else if (opcode==ALU_OPCODE_SHL) {
+          return "shl";
+      } else if (opcode==ALU_OPCODE_SHR) {
+          return "shr";
+      }
+
+      return "unknown op";
+  }
+
+  // Dump instructions in the queue
+  void DumpInsn() {
+    // Keep tabs on dependence queues
+    int l2g_queue = 0;
+    int g2l_queue = 0;
+    int s2g_queue = 0;
+    int g2s_queue = 0;
+    // Converter
+    union VTAInsn c;
+    // Iterate over all instructions
+    int insn_count = count();
+    const VTAGenericInsn* insn = data();
+    printf("There are %u instructions\n", insn_count);
+    for (int i = 0; i < insn_count; ++i) {
+      // Fetch instruction and decode opcode
+      c.generic = insn[i];
+      printf("INSTRUCTION %u: ", i);
+      if (c.mem.opcode == OPCODE_LOAD || c.mem.opcode == OPCODE_STORE) {
+        if (c.mem.x_size == 0) {
+          if (c.mem.opcode == OPCODE_STORE) {
+            printf("NOP-STORE-STAGE\n");
+          }
+          else if (GetMemPipelineStage(c.mem.memory_type) == kComputeStage) {
+            printf("NOP-COMPUTE-STAGE\n");
+          } else {
+            printf("NOP-MEMORY-STAGE\n");
+          }
+          printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n",
+                 static_cast<int>(c.mem.pop_prev_dep),
+                 static_cast<int>(c.mem.pop_next_dep),
+                 static_cast<int>(c.mem.push_prev_dep),
+                 static_cast<int>(c.mem.push_next_dep));
+          // Count status in queues
+          if (c.mem.opcode == OPCODE_LOAD || c.mem.opcode == OPCODE_STORE) {
+            if (c.mem.opcode == OPCODE_STORE) {
+                assert(c.mem.pop_next_dep == false);
+                assert(c.mem.push_next_dep == false);
+                if (c.mem.pop_prev_dep) g2s_queue--;
+                if (c.mem.push_prev_dep) s2g_queue++;
+            } else if (c.mem.opcode == OPCODE_LOAD &&
+                       (c.mem.memory_type == MEM_ID_INP ||
+                        c.mem.memory_type == MEM_ID_WGT) ) {
+                assert(c.mem.pop_prev_dep == false);
+                assert(c.mem.push_prev_dep == false);
+                if (c.mem.pop_next_dep) g2l_queue--;
+                if (c.mem.push_next_dep) l2g_queue++;
+            } else {
+                if (c.mem.pop_prev_dep) l2g_queue--;
+                if (c.mem.push_prev_dep) g2l_queue++;
+                if (c.mem.pop_next_dep) s2g_queue--;
+                if (c.mem.push_next_dep) g2s_queue++;
+            }
+          } else if (c.mem.opcode == OPCODE_GEMM) {
+            // Print instruction field information
+            if (c.gemm.pop_prev_dep) l2g_queue--;
+            if (c.gemm.push_prev_dep) g2l_queue++;
+            if (c.gemm.pop_next_dep) s2g_queue--;
+            if (c.gemm.push_next_dep) g2s_queue++;
+          }
+          printf("\tl2g_queue = %d, g2l_queue = %d\n", l2g_queue, g2l_queue);
+          printf("\ts2g_queue = %d, g2s_queue = %d\n", s2g_queue, g2s_queue);
+          continue;
+        }
+        // Print instruction field information
+        if (c.mem.opcode==OPCODE_LOAD) {
+            printf("LOAD ");
+            if (c.mem.memory_type == MEM_ID_UOP) printf("UOP\n");
+            if (c.mem.memory_type == MEM_ID_WGT) printf("WGT\n");
+            if (c.mem.memory_type == MEM_ID_INP) printf("INP\n");
+            if (c.mem.memory_type == MEM_ID_ACC) printf("ACC\n");
+        }
+        if (c.mem.opcode==OPCODE_STORE) {
+            printf("STORE\n");
+        }
+        printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n",
+               static_cast<int>(c.mem.pop_prev_dep),
+               static_cast<int>(c.mem.pop_next_dep),
+               static_cast<int>(c.mem.push_prev_dep),
+               static_cast<int>(c.mem.push_next_dep));
+        printf("\tDRAM: 0x%08x, SRAM:0x%04x\n",
+               static_cast<int>(c.mem.dram_base),
+               static_cast<int>(c.mem.sram_base));
+        printf("\ty: size=%d, pad=[%d, %d]\n",
+               static_cast<int>(c.mem.y_size),
+               static_cast<int>(c.mem.y_pad_0),
+               static_cast<int>(c.mem.y_pad_1));
+        printf("\tx: size=%d, stride=%d, pad=[%d, %d]\n",
+               static_cast<int>(c.mem.x_size),
+               static_cast<int>(c.mem.x_stride),
+               static_cast<int>(c.mem.x_pad_0),
+               static_cast<int>(c.mem.x_pad_1));
+      } else if (c.mem.opcode==OPCODE_GEMM) {
+        // Print instruction field information
+        printf("GEMM\n");
+
+        printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n",
+               static_cast<int>(c.mem.pop_prev_dep),
+               static_cast<int>(c.mem.pop_next_dep),
+               static_cast<int>(c.mem.push_prev_dep),
+               static_cast<int>(c.mem.push_next_dep));
+        printf("\trange (%d, %d)\n",
+               static_cast<int>(c.gemm.uop_bgn),
+               static_cast<int>(c.gemm.uop_end));
+        printf("\touter loop - iter: %d, wgt: %d, inp: %d, acc: %d\n",
+               static_cast<int>(c.gemm.iter_out),
+               static_cast<int>(c.gemm.wgt_factor_out),
+               static_cast<int>(c.gemm.src_factor_out),
+               static_cast<int>(c.gemm.dst_factor_out));
+        printf("\tinner loop - iter: %d, wgt: %d, inp: %d, acc: %d\n",
+               static_cast<int>(c.gemm.iter_in),
+               static_cast<int>(c.gemm.wgt_factor_in),
+               static_cast<int>(c.gemm.src_factor_in),
+               static_cast<int>(c.gemm.dst_factor_in));
+      } else if (c.mem.opcode == OPCODE_ALU) {
+        // Print instruction field information
+        printf("ALU - %s\n", getOpcodeString(c.alu.alu_opcode, c.alu.use_imm));
+        printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n",
+               static_cast<int>(c.mem.pop_prev_dep),
+               static_cast<int>(c.mem.pop_next_dep),
+               static_cast<int>(c.mem.push_prev_dep),
+               static_cast<int>(c.mem.push_next_dep));
+        printf("\trange (%d, %d)\n",
+               static_cast<int>(c.alu.uop_bgn),
+               static_cast<int>(c.alu.uop_end));
+        printf("\touter loop - iter: %d, dst: %d, src: %d\n",
+               static_cast<int>(c.alu.iter_out),
+               static_cast<int>(c.alu.dst_factor_out),
+               static_cast<int>(c.alu.src_factor_out));
+        printf("\tinner loop - iter: %d, dst: %d, src: %d\n",
+               static_cast<int>(c.alu.iter_in),
+               static_cast<int>(c.alu.dst_factor_in),
+               static_cast<int>(c.alu.src_factor_in));
+      } else if (c.mem.opcode == OPCODE_FINISH) {
+        printf("FINISH\n");
+      }
+
+      // Count status in queues
+      if (c.mem.opcode == OPCODE_LOAD || c.mem.opcode == OPCODE_STORE) {
+        if (c.mem.opcode == OPCODE_STORE) {
+            assert(c.mem.pop_next_dep == false);
+            assert(c.mem.push_next_dep == false);
+            if (c.mem.pop_prev_dep) g2s_queue--;
+            if (c.mem.push_prev_dep) s2g_queue++;
+        } else if (c.mem.opcode == OPCODE_LOAD &&
+                   (c.mem.memory_type == MEM_ID_INP ||
+                    c.mem.memory_type == MEM_ID_WGT) ) {
+            assert(c.mem.pop_prev_dep == false);
+            assert(c.mem.push_prev_dep == false);
+            if (c.mem.pop_next_dep) g2l_queue--;
+            if (c.mem.push_next_dep) l2g_queue++;
+        } else {
+            if (c.mem.pop_prev_dep) l2g_queue--;
+            if (c.mem.push_prev_dep) g2l_queue++;
+            if (c.mem.pop_next_dep) s2g_queue--;
+            if (c.mem.push_next_dep) g2s_queue++;
+        }
+      } else if (c.mem.opcode == OPCODE_GEMM ||
+                 c.mem.opcode == OPCODE_ALU) {
+        // Update the dependence queue counters for a compute instruction
+        if (c.gemm.pop_prev_dep) l2g_queue--;
+        if (c.gemm.push_prev_dep) g2l_queue++;
+        if (c.gemm.pop_next_dep) s2g_queue--;
+        if (c.gemm.push_next_dep) g2s_queue++;
+      }
+      printf("\tl2g_queue = %d, g2l_queue = %d\n", l2g_queue, g2l_queue);
+      printf("\ts2g_queue = %d, g2s_queue = %d\n", s2g_queue, g2s_queue);
+    }
+  }
+
+  // Commit all pending pop of corresponding stage
+  void CommitPendingPop(int stage) {
+    // A pop executes on the target stage, so the pending pops are attached
+    // to a no-op issued on `stage` itself.
+    assert(stage > 0 && stage < 4);
+    const bool pop_prev = pending_pop_prev_[stage];
+    const bool pop_next = pending_pop_next_[stage];
+    // Nothing recorded for this stage: no instruction is emitted.
+    if (!pop_prev && !pop_next) return;
+    PushNoop(stage, false, false, pop_prev, pop_next);
+    // The pops are committed now; clear the bookkeeping.
+    pending_pop_prev_[stage] = 0;
+    pending_pop_next_[stage] = 0;
+  }
+
+  void CommitPending() {
+    // Walk the stages from load to store and commit each one's pops.
+    int stage = kLoadStage;
+    while (stage <= kStoreStage) CommitPendingPop(stage++);
+  }
+
+  bool PendingPop() {
+    // Any stage (load..store) with an uncommitted pop makes this true.
+    for (int stage = kLoadStage; stage <= kStoreStage; ++stage) {
+      if (pending_pop_prev_[stage] || pending_pop_next_[stage]) return true;
+    }
+    return false;
+  }
+
+ protected:
+  /*! \brief Reserve the next instruction slot in the buffer and return it. */
+  VTAGenericInsn* NextInsn() {
+    VTAGenericInsn* const slot = data() + dram_end_;  // slot at current end
+    dram_end_ += 1;
+    assert(dram_end_ < kMaxElems);
+    return slot;
+  }
+  // Create a new instruction for a given stage
+  VTAGenericInsn* Create(PipelineStage stage) {
+    VTAGenericInsn* fresh = NextInsn();
+    VTAMemInsn* deps = reinterpret_cast<VTAMemInsn*>(fresh);
+    // Starts with no pushes and absorbs the pops pending for its stage.
+    deps->push_prev_dep = false;
+    deps->push_next_dep = false;
+    deps->pop_prev_dep = pending_pop_prev_[stage];
+    deps->pop_next_dep = pending_pop_next_[stage];
+    pending_pop_prev_[stage] = pending_pop_next_[stage] = 0;
+    return fresh;
+  }
+  // Get stage of the memory
+  static PipelineStage GetMemPipelineStage(int memory_type) {
+    // ACC and UOP map to the compute stage; any other memory type to load.
+    return (memory_type == MEM_ID_ACC || memory_type == MEM_ID_UOP)
+        ? kComputeStage : kLoadStage;
+  }
+  // Get stage of the computation
+  static PipelineStage GetPipelineStage(VTAMemInsn* insn) {
+    switch (insn->opcode) {
+      case OPCODE_GEMM:
+      case OPCODE_ALU:
+        return kComputeStage;
+      case OPCODE_STORE:
+        // FIXME: memory_type is a 2-bit field, so MEM_ID_OUT shows up as 0;
+        //        stores therefore skip the memory_type check for now.
+        return kStoreStage;
+      case OPCODE_LOAD:
+        if (insn->x_size == 0) return kNoneStage;
+        return GetMemPipelineStage(insn->memory_type);
+      default: break;
+    }
+    assert(false);
+    return kNoneStage;
+  }
+  // Push no-op
+  void PushNoop(int stage,
+                bool push_prev_dep, bool push_next_dep,
+                bool pop_prev_dep, bool pop_next_dep) {
+    VTAMemInsn* noop = reinterpret_cast<VTAMemInsn*>(NextInsn());
+    // A no-op is a zero-sized memory instruction: a store when issued on
+    // the store stage, a load otherwise.
+    noop->opcode = (stage == kStoreStage) ? OPCODE_STORE : OPCODE_LOAD;
+    // Only the load stage tags it as INP; the other stages use UOP.
+    noop->memory_type = (stage == kLoadStage) ? MEM_ID_INP : MEM_ID_UOP;
+    // The dependence flags are taken from the caller unchanged.
+    noop->pop_prev_dep = pop_prev_dep;
+    noop->pop_next_dep = pop_next_dep;
+    noop->push_prev_dep = push_prev_dep;
+    noop->push_next_dep = push_next_dep;
+    // Zero every address, size, stride and padding field.
+    noop->sram_base = noop->dram_base = 0;
+    noop->y_size = noop->x_size = noop->x_stride = 0;
+    noop->y_pad_0 = noop->y_pad_1 = 0;
+    noop->x_pad_0 = noop->x_pad_1 = 0;
+  }
+
+ private:
+  // Pending pops of each instruction queue; qid=0 is not used
+  int pending_pop_prev_[4];
+  int pending_pop_next_[4];
+  static constexpr int kElemBytes = sizeof(VTAGenericInsn);
+  static constexpr int kMaxElems = kMaxBytes / kElemBytes;
+};
+
+/*!
+ * \brief The command queue object that handles the request.
+ */
+class CommandQueue {
+ public:
+  CommandQueue() {
+    InitSpace();  // set up both queues and map the stage registers
+  }
+  void InitSpace() {
+    uop_queue_.InitSpace();
+    insn_queue_.InitSpace();
+    // Map the register range of each VTA stage: fetch, load, compute, store
+    vta_fetch_handle_ = VTAMapRegister(VTA_FETCH_ADDR, VTA_RANGE);
+    vta_load_handle_ = VTAMapRegister(VTA_LOAD_ADDR, VTA_RANGE);
+    vta_compute_handle_ = VTAMapRegister(VTA_COMPUTE_ADDR, VTA_RANGE);
+    vta_store_handle_ = VTAMapRegister(VTA_STORE_ADDR, VTA_RANGE);
+    printf("Initialize VTACommandHandle...\n");
+  }
+
+  ~CommandQueue() {
+    // Unmap the four stage register ranges mapped by InitSpace()
+    VTAUnmapRegister(vta_fetch_handle_, VTA_RANGE);
+    VTAUnmapRegister(vta_load_handle_, VTA_RANGE);
+    VTAUnmapRegister(vta_compute_handle_, VTA_RANGE);
+    VTAUnmapRegister(vta_store_handle_, VTA_RANGE);
+    printf("Close VTACommandhandle...\n");
+  }
+
+  uint32_t GetElemBytes(uint32_t memory_id) {
+    switch (memory_id) {  // bytes per element of the selected memory
+      case MEM_ID_UOP: return UOP_ELEM_BYTES;
+      case MEM_ID_INP: return INP_ELEM_BYTES;
+      case MEM_ID_WGT: return WGT_ELEM_BYTES;
+      case MEM_ID_ACC: return ACC_ELEM_BYTES;
+      case MEM_ID_OUT: return INP_ELEM_BYTES;  // OUT is sized like INP
+    }
+    // Unknown id: report it (%u, since the id is unsigned) and fail hard.
+    printf("Memory id not recognized: %u\n", static_cast<unsigned>(memory_id));
+    assert(false);
+    return 0;
+  }
+
+  void LoadBuffer2D(void* src_dram_addr, uint32_t src_elem_offset,
+                    uint32_t x_size, uint32_t y_size, uint32_t x_stride,
+                    uint32_t x_pad_before, uint32_t y_pad_before,
+                    uint32_t x_pad_after, uint32_t y_pad_after,
+                    uint32_t dst_sram_index, uint32_t dst_memory_type) {
+    // Queue a 2D DRAM->SRAM load; the DRAM base is counted in elements of
+    // the destination memory type, not in bytes.
+    VTAMemInsn* load = insn_queue_.CreateMemInsn(dst_memory_type);
+    load->opcode = OPCODE_LOAD;
+    load->memory_type = dst_memory_type;
+    DataBuffer* dram = DataBuffer::FromHandle(src_dram_addr);
+    load->dram_base =
+        dram->phy_addr() / GetElemBytes(dst_memory_type) + src_elem_offset;
+    load->sram_base = dst_sram_index;
+    // Tile geometry.
+    load->x_size = x_size;
+    load->y_size = y_size;
+    load->x_stride = x_stride;
+    // Padding before/after the tile along each axis.
+    load->x_pad_0 = x_pad_before;
+    load->x_pad_1 = x_pad_after;
+    load->y_pad_0 = y_pad_before;
+    load->y_pad_1 = y_pad_after;
+    // Synchronize early if the instruction queue reached MAX_XFER.
+    this->CheckInsnOverFlow();
+  }
+
+  void StoreBuffer2D(uint32_t src_sram_index, uint32_t src_memory_type,
+                     void* dst_dram_addr, uint32_t dst_elem_offset,
+                     uint32_t x_size, uint32_t y_size, uint32_t x_stride) {
+    // Queue a 2D SRAM->DRAM store; every padding field is written as zero.
+    VTAMemInsn* store = insn_queue_.CreateStoreInsn();
+    store->opcode = OPCODE_STORE;
+    store->memory_type = src_memory_type;
+    store->sram_base = src_sram_index;
+    // The DRAM base is counted in elements of the source memory type.
+    DataBuffer* dram = DataBuffer::FromHandle(dst_dram_addr);
+    store->dram_base =
+        dram->phy_addr() / GetElemBytes(src_memory_type) + dst_elem_offset;
+    store->x_size = x_size;
+    store->y_size = y_size;
+    store->x_stride = x_stride;
+    store->x_pad_0 = 0;
+    store->x_pad_1 = 0;
+    store->y_pad_0 = 0;
+    store->y_pad_1 = 0;
+    // Synchronize early if the instruction queue reached MAX_XFER.
+    this->CheckInsnOverFlow();
+  }
+
+  void DepPush(int from_qid, int to_qid) {
+    this->insn_queue_.DepPush(from_qid, to_qid);  // handled by the insn queue
+  }
+
+  void DepPop(int from_qid, int to_qid) {
+    this->insn_queue_.DepPop(from_qid, to_qid);  // handled by the insn queue
+  }
+
+  void ReadBarrier(void* buffer, uint32_t elem_bits, uint32_t start, uint32_t extent) {
+    if (debug_flag_ & VTA_DEBUG_SKIP_READ_BARRIER) return;
+    // Round the element width up to whole bytes; start/extent are in elements.
+    const uint32_t bytes_per_elem = (elem_bits + 7) / 8;
+    DataBuffer* target = DataBuffer::FromHandle(buffer);
+    target->FlushCache(bytes_per_elem * start, bytes_per_elem * extent);
+  }
+
+  void WriteBarrier(void* buffer, uint32_t elem_bits, uint32_t start, uint32_t extent) {
+    if (debug_flag_ & VTA_DEBUG_SKIP_WRITE_BARRIER) return;
+    // Round the element width up to whole bytes; start/extent are in elements.
+    const uint32_t bytes_per_elem = (elem_bits + 7) / 8;
+    DataBuffer* target = DataBuffer::FromHandle(buffer);
+    target->InvalidateCache(bytes_per_elem * start, bytes_per_elem * extent);
+  }
+
+  void Synchronize(uint32_t wait_cycles) {
+    // Insert dependences to force serialization
+    if (debug_flag_ & VTA_DEBUG_FORCE_SERIAL) {
+      insn_queue_.RewriteForceSerial();
+    }
+    // This will issue finish after last store finishes
+    insn_queue_.DepPush(kStoreStage, kComputeStage);
+    insn_queue_.DepPush(kLoadStage, kComputeStage);
+    insn_queue_.DepPop(kStoreStage, kComputeStage);
+    insn_queue_.DepPop(kLoadStage, kComputeStage);
+    insn_queue_.CommitPendingPop(kComputeStage);
+    // NOTE: FINISH cannot contain pop
+    VTAGemInsn* insn = insn_queue_.CreateGemInsn();
+    insn->opcode = OPCODE_FINISH;
+    assert(!insn_queue_.PendingPop());
+    // Check if there is no instruction to execute at all
+    if (insn_queue_.count() == 0) return;
+    // Synchronization for the queues
+    uop_queue_.AutoReadBarrier();
+    insn_queue_.AutoReadBarrier();
+    // Dump instructions if debug enabled
+    if (debug_flag_ & VTA_DEBUG_DUMP_INSN) {
+      insn_queue_.DumpInsn();
+    }
+    // Make sure that the last instruction is a finish instruction
+    assert(reinterpret_cast<VTAMemInsn*>(
+        insn_queue_.data())[insn_queue_.count()-1].opcode == OPCODE_FINISH);
+
+#ifdef PYNQ_TARGET
+    // Make sure that we don't exceed contiguous physical memory limits
+    assert(insn_queue_.count() < MAX_XFER);
+
+    // NOTE: Register address map is derived from the auto-generated
+    // driver files available under hardware/build/vivado/<design>/export/driver
+    // FETCH @ 0x10 : Data signal of insn_count_V
+    VTAWriteMappedReg(vta_fetch_handle_, 0x10, insn_queue_.count());
+    // FETCH @ 0x18 : Data signal of insns_V
+    VTAWriteMappedReg(vta_fetch_handle_, 0x18, insn_queue_.dram_phy_addr());
+    // LOAD @ 0x10 : Data signal of inputs_V
+    VTAWriteMappedReg(vta_load_handle_, 0x10, 0);
+    // LOAD @ 0x18 : Data signal of weight_V
+    VTAWriteMappedReg(vta_load_handle_, 0x18, 0);
+    // COMPUTE @ 0x20 : Data signal of uops_V
+    VTAWriteMappedReg(vta_compute_handle_, 0x20, 0);
+    // COMPUTE @ 0x28 : Data signal of biases_V
+    VTAWriteMappedReg(vta_compute_handle_, 0x28, 0);
+    // STORE @ 0x10 : Data signal of outputs_V
+    VTAWriteMappedReg(vta_store_handle_, 0x10, 0);
+
+    // VTA start
+    VTAWriteMappedReg(vta_fetch_handle_, 0x0, VTA_START);
+    VTAWriteMappedReg(vta_load_handle_, 0x0, VTA_AUTORESTART);
+    VTAWriteMappedReg(vta_compute_handle_, 0x0, VTA_AUTORESTART);
+    VTAWriteMappedReg(vta_store_handle_, 0x0, VTA_AUTORESTART);
+
+    // Poll COMPUTE @ 0x18 until it reads VTA_DONE, at most wait_cycles times
+    unsigned t, flag = 0;
+    for (t = 0; t < wait_cycles; ++t) {
+      flag = VTAReadMappedReg(vta_compute_handle_, 0x18);
+      if (flag == VTA_DONE) break;
+      std::this_thread::yield();
+    }
+    // Report error if timeout
+    assert(t < wait_cycles);
+#endif //PYNQ_TARGET
+
+    // Reset buffers
+    uop_queue_.Reset();
+    insn_queue_.Reset();
+  }
+
+  // Get record kernel
+  UopKernel* record_kernel() const {
+    assert(record_kernel_);  // only valid while a kernel is being recorded
+    return this->record_kernel_;
+  }
+
+  // Set debug flag
+  void SetDebugFlag(int debug_flag) {
+    this->debug_flag_ = debug_flag;  // bit mask of VTA_DEBUG_* flags
+  }
+
+  // Push a GEMM kernel; finit records its micro-ops on first use per signature.
+  void PushGEMMOp(void** uop_handle,
+                  int (*finit)(void*),
+                  void* signature,
+                  int nbytes) {
+    UopKernelMap** uptr = reinterpret_cast<UopKernelMap**>(uop_handle);
+    if (uptr[0] == nullptr) uptr[0] = new UopKernelMap();
+    UopKernel** kptr = uptr[0]->Get(signature, nbytes);
+    if (kptr[0] == nullptr) {
+      record_kernel_ = new UopKernel(static_cast<char*>(signature), nbytes);
+      // Call finit outside assert(): NDEBUG builds would otherwise skip it.
+      const int status = (*finit)(signature);
+      assert(status == 0);
+      (void)status;  // unused when asserts are compiled out
+      kptr[0] = record_kernel_;
+      if (debug_flag_ & VTA_DEBUG_DUMP_UOP) record_kernel_->Dump();
+      record_kernel_ = nullptr;
+    }
+    this->PushGEMMOp(static_cast<UopKernel*>(kptr[0]));
+    this->CheckInsnOverFlow();
+  }
+
+  // Push an ALU kernel; finit records its micro-ops on first use per signature.
+  void PushALUUop(void** uop_handle,
+                  int (*finit)(void*),
+                  void* signature,
+                  int nbytes) {
+    UopKernelMap** uptr = reinterpret_cast<UopKernelMap**>(uop_handle);
+    if (uptr[0] == nullptr) uptr[0] = new UopKernelMap();
+    UopKernel** kptr = uptr[0]->Get(signature, nbytes);
+    if (kptr[0] == nullptr) {
+      record_kernel_ = new UopKernel(static_cast<char*>(signature), nbytes);
+      // Call finit outside assert(): NDEBUG builds would otherwise skip it.
+      const int status = (*finit)(signature);
+      assert(status == 0);
+      (void)status;  // unused when asserts are compiled out
+      kptr[0] = record_kernel_;
+      if (debug_flag_ & VTA_DEBUG_DUMP_UOP) record_kernel_->Dump();
+      record_kernel_ = nullptr;
+    }
+    this->PushALUUop(static_cast<UopKernel*>(kptr[0]));
+    this->CheckInsnOverFlow();
+  }
+
+  static std::shared_ptr<CommandQueue>& ThreadLocal() {
+    static std::shared_ptr<CommandQueue> inst = std::make_shared<CommandQueue>();
+    if (inst == nullptr) inst = std::make_shared<CommandQueue>();  // after Shutdown()
+    return inst;
+  }
+
+  static void Shutdown() {
+    ThreadLocal() = nullptr;  // drop the shared instance held by ThreadLocal()
+  }
+
+ private:
+  // Push GEMM uop to the command buffer
+  void PushGEMMOp(UopKernel* kernel) {
+    // Queue the kernel's micro-ops; the callback gives the uop queue a way
+    // to trigger AutoSync().
+    uop_queue_.Push(kernel, [this]() { this->AutoSync(); });
+
+    // Micro-ops that are still pending get a LOAD into UOP memory first.
+    if (uop_queue_.pending()) {
+      VTAMemInsn* uop_load = insn_queue_.CreateMemInsn(MEM_ID_UOP);
+      uop_load->opcode = OPCODE_LOAD;
+      uop_queue_.FlushUopLoad(uop_load);
+    }
+
+    // The GEMM instruction covers the kernel's micro-op range in SRAM.
+    VTAGemInsn* gemm = insn_queue_.CreateGemInsn();
+    gemm->opcode = OPCODE_GEMM;
+    gemm->uop_bgn = kernel->sram_begin_;
+    gemm->uop_end = kernel->sram_end_;
+
+    // Loop level 0 drives the outer loop and level 1 the inner loop; a
+    // missing level runs once with all factors set to zero.
+    const std::vector<UopKernel::LoopEntry>& levels = kernel->loop();
+    const UopKernel::LoopEntry* outer =
+        levels.size() > 0 ? &levels[0] : nullptr;
+    const UopKernel::LoopEntry* inner =
+        levels.size() > 1 ? &levels[1] : nullptr;
+
+    gemm->iter_out = outer ? outer->extent : 1;
+    gemm->wgt_factor_out = outer ? outer->wgt_factor : 0;
+    gemm->src_factor_out = outer ? outer->src_factor : 0;
+    gemm->dst_factor_out = outer ? outer->dst_factor : 0;
+
+    gemm->iter_in = inner ? inner->extent : 1;
+    gemm->wgt_factor_in = inner ? inner->wgt_factor : 0;
+    gemm->src_factor_in = inner ? inner->src_factor : 0;
+    gemm->dst_factor_in = inner ? inner->dst_factor : 0;
+  }
+
+  // Push ALU uop to the command buffer
+  void PushALUUop(UopKernel* kernel) {
+    // Queue the kernel's micro-ops; the callback gives the uop queue a way
+    // to trigger AutoSync().
+    uop_queue_.Push(kernel, [this]() { this->AutoSync(); });
+
+    // Micro-ops that are still pending get a LOAD into UOP memory first.
+    if (uop_queue_.pending()) {
+      VTAMemInsn* uop_load = insn_queue_.CreateMemInsn(MEM_ID_UOP);
+      uop_load->opcode = OPCODE_LOAD;
+      uop_queue_.FlushUopLoad(uop_load);
+    }
+
+    // The ALU instruction carries the kernel's micro-op range together
+    // with the operation, immediate flag and immediate value it recorded.
+    VTAAluInsn* alu = insn_queue_.CreateAluInsn();
+    alu->opcode = OPCODE_ALU;
+    alu->uop_bgn = kernel->sram_begin_;
+    alu->uop_end = kernel->sram_end_;
+    alu->alu_opcode = kernel->opcode_;
+    alu->use_imm = kernel->use_imm_;
+    alu->imm = kernel->imm_val_;
+
+    // With two or more loop levels, level 0 is the outer loop and level 1
+    // the inner one. A single level is placed on the inner loop, and a
+    // missing loop runs once with zero factors.
+    const std::vector<UopKernel::LoopEntry>& levels = kernel->loop();
+    const bool nested = levels.size() > 1;
+    const UopKernel::LoopEntry* outer = nested ? &levels[0] : nullptr;
+    const UopKernel::LoopEntry* inner = nullptr;
+    if (!levels.empty()) inner = nested ? &levels[1] : &levels[0];
+
+    alu->iter_out = outer ? outer->extent : 1;
+    alu->dst_factor_out = outer ? outer->dst_factor : 0;
+    alu->src_factor_out = outer ? outer->src_factor : 0;
+
+    alu->iter_in = inner ? inner->extent : 1;
+    alu->dst_factor_in = inner ? inner->dst_factor : 0;
+    alu->src_factor_in = inner ? inner->src_factor : 0;
+  }
+
+  void CheckInsnOverFlow() {
+    // A single API call commits at most one pending store, one pending
+    // load and one uop.
+    const bool has_room = insn_queue_.count() < MAX_XFER;
+    if (has_room) return;
+    this->AutoSync();
+  }
+  // Auto sync when instruction overflow
+  void AutoSync() {
+    this->Synchronize(1u << 31);  // unsigned: 1 << 31 overflows a 32-bit int
+  }
+  // VTA handles (register maps)
+  VTAHandle vta_fetch_handle_{nullptr};
+  VTAHandle vta_load_handle_{nullptr};
+  VTAHandle vta_compute_handle_{nullptr};
+  VTAHandle vta_store_handle_{nullptr};
+  // Internal debug flag
+  int debug_flag_{0};
+  // The kernel currently being recorded
+  UopKernel* record_kernel_{nullptr};
+  // Micro op queue
+  UopQueue<MAX_XFER, true, true> uop_queue_;
+  // instruction queue
+  InsnQueue<MAX_XFER, true, true> insn_queue_;
+};
+
+}  // namespace vta
+
+
+VTACommandHandle VTATLSCommandHandle() {
+  return vta::CommandQueue::ThreadLocal().get();  // non-owning raw pointer
+}
+
+void VTARuntimeShutdown() {
+  vta::CommandQueue::Shutdown();  // resets the shared CommandQueue pointer
+}
+
+void* VTABufferAlloc(VTACommandHandle cmd, size_t size) {
+  return vta::DataBuffer::Alloc(size);  // cmd is not used
+}
+
+void VTABufferFree(VTACommandHandle cmd, void* buffer) {
+  vta::DataBuffer::Free(vta::DataBuffer::FromHandle(buffer));  // cmd is not used
+}
+
+void* VTABufferCPUPtr(VTACommandHandle cmd, void* buffer) {
+  return vta::DataBuffer::FromHandle(buffer)->virt_addr();  // cmd is not used
+}
+
+void VTABufferCopy(VTACommandHandle cmd,
+                   const void* from,
+                   size_t from_offset,
+                   void* to,
+                   size_t to_offset,
+                   size_t size,
+                   int kind_mask) {
+  // kind_mask bit 1 (value 2) marks `from` as a device buffer handle,
+  // bit 0 (value 1) marks `to` as one; unmarked pointers are used as-is.
+  const bool from_is_handle = (kind_mask & 2) != 0;
+  const bool to_is_handle = (kind_mask & 1) != 0;
+  vta::DataBuffer* src_buf = nullptr;
+  vta::DataBuffer* dst_buf = nullptr;
+  if (from_is_handle) {
+    src_buf = vta::DataBuffer::FromHandle(from);
+    from = src_buf->virt_addr();
+  }
+  if (to_is_handle) {
+    dst_buf = vta::DataBuffer::FromHandle(to);
+    to = dst_buf->virt_addr();
+  }
+  // Invalidate the source range before the CPU reads it.
+  if (src_buf != nullptr) src_buf->InvalidateCache(from_offset, size);
+  char* dst_bytes = static_cast<char*>(to) + to_offset;
+  const char* src_bytes = static_cast<const char*>(from) + from_offset;
+  memcpy(dst_bytes, src_bytes, size);
+  // Flush the destination range after the CPU wrote it.
+  if (dst_buf != nullptr) dst_buf->FlushCache(to_offset, size);
+}
+
+void VTASetDebugMode(VTACommandHandle cmd, int debug_flag) {
+  auto* queue = static_cast<vta::CommandQueue*>(cmd);
+  queue->SetDebugFlag(debug_flag);
+}
+
+void VTAWriteBarrier(VTACommandHandle cmd,
+                     void* buffer, uint32_t elem_bits,
+                     uint32_t start, uint32_t extent) {
+  auto* queue = static_cast<vta::CommandQueue*>(cmd);
+  queue->WriteBarrier(buffer, elem_bits, start, extent);
+}
+
+void VTAReadBarrier(VTACommandHandle cmd,
+                    void* buffer, uint32_t elem_bits,
+                    uint32_t start, uint32_t extent) {
+  auto* queue = static_cast<vta::CommandQueue*>(cmd);
+  queue->ReadBarrier(buffer, elem_bits, start, extent);
+}
+
+void VTALoadBuffer2D(VTACommandHandle cmd,
+                     void* src_dram_addr,
+                     uint32_t src_elem_offset,
+                     uint32_t x_size,
+                     uint32_t y_size,
+                     uint32_t x_stride,
+                     uint32_t x_pad_before,
+                     uint32_t y_pad_before,
+                     uint32_t x_pad_after,
+                     uint32_t y_pad_after,
+                     uint32_t dst_sram_index,
+                     uint32_t dst_memory_type) {
+  auto* queue = static_cast<vta::CommandQueue*>(cmd);
+  queue->LoadBuffer2D(src_dram_addr, src_elem_offset,
+                      x_size, y_size, x_stride,
+                      x_pad_before, y_pad_before,
+                      x_pad_after, y_pad_after,
+                      dst_sram_index, dst_memory_type);
+}
+
+void VTAStoreBuffer2D(VTACommandHandle cmd,
+                      uint32_t src_sram_index,
+                      uint32_t src_memory_type,
+                      void* dst_dram_addr,
+                      uint32_t dst_elem_offset,
+                      uint32_t x_size,
+                      uint32_t y_size,
+                      uint32_t x_stride) {
+  auto* queue = static_cast<vta::CommandQueue*>(cmd);
+  queue->StoreBuffer2D(src_sram_index, src_memory_type,
+                       dst_dram_addr, dst_elem_offset,
+                       x_size, y_size, x_stride);
+}
+
+void VTAUopPush(uint32_t mode,
+                uint32_t reset_out,
+                uint32_t dst_index,
+                uint32_t src_index,
+                uint32_t wgt_index,
+                uint32_t opcode,
+                uint32_t use_imm,
+                uint32_t imm_val) {
+  auto* kernel = vta::CommandQueue::ThreadLocal()->record_kernel();
+  kernel->Push(mode, reset_out, dst_index, src_index,
+               wgt_index, opcode, use_imm, imm_val);
+}
+
+void VTAUopLoopBegin(uint32_t extent,
+                     uint32_t dst_factor,
+                     uint32_t src_factor,
+                     uint32_t wgt_factor) {
+  auto* kernel = vta::CommandQueue::ThreadLocal()->record_kernel();
+  kernel->PushLoopBegin(extent, dst_factor, src_factor, wgt_factor);
+}
+
+void VTAUopLoopEnd() {
+  auto* kernel = vta::CommandQueue::ThreadLocal()->record_kernel();
+  kernel->PushLoopEnd();
+}
+
+int VTAPushGEMMOp(void** uop_handle,
+                  int (*finit)(void*),
+                  void* signature,
+                  int nbytes) {
+  auto& queue = vta::CommandQueue::ThreadLocal();
+  queue->PushGEMMOp(uop_handle, finit, signature, nbytes);
+  return 0;
+}
+
+int VTAPushALUOp(void** uop_handle,
+                 int (*finit)(void*),
+                 void* signature,
+                 int nbytes) {
+  auto& queue = vta::CommandQueue::ThreadLocal();
+  queue->PushALUUop(uop_handle, finit, signature, nbytes);
+  return 0;
+}
+
+int VTADepPush(VTACommandHandle cmd, int from_qid, int to_qid) {
+  auto* queue = static_cast<vta::CommandQueue*>(cmd);
+  queue->DepPush(from_qid, to_qid);
+  return 0;
+}
+
+int VTADepPop(VTACommandHandle cmd, int from_qid, int to_qid) {
+  auto* queue = static_cast<vta::CommandQueue*>(cmd);
+  queue->DepPop(from_qid, to_qid);
+  return 0;
+}
+
+void VTASynchronize(VTACommandHandle cmd, uint32_t wait_cycles) {
+  auto* queue = static_cast<vta::CommandQueue*>(cmd);
+  queue->Synchronize(wait_cycles);
+}
diff --git a/vta/src/tvm/vta_device_api.cc b/vta/src/tvm/vta_device_api.cc
new file mode 100644
index 00000000..b686b65f
--- /dev/null
+++ b/vta/src/tvm/vta_device_api.cc
@@ -0,0 +1,106 @@
+// simply include the driver for now.
+#include <tvm/runtime/registry.h>
+#include <dmlc/thread_local.h>
+#include <vta/runtime.h>
+#include "../../tvm/src/runtime/workspace_pool.h"
+
+namespace tvm {
+namespace runtime {
+
+std::string VTARPCGetPath(const std::string& name) {
+  static const PackedFunc* workpath =
+      runtime::Registry::Get("tvm.contrib.rpc.server.workpath");  // looked up once
+  CHECK(workpath != nullptr) << "require tvm.contrib.rpc.server.workpath";
+  return (*workpath)(name);
+}
+
+// Packed function: program the VTA with the bitstream named by args[0]
+TVM_REGISTER_GLOBAL("tvm.contrib.vta.init")
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+    std::string path = VTARPCGetPath(args[0]);
+    VTAProgram(path.c_str());
+    LOG(INFO) << "VTA initialization end with bistream " << path;
+  });
+
+TVM_REGISTER_GLOBAL("tvm.contrib.rpc.server.shutdown")
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+    VTARuntimeShutdown();  // runs when tvm.contrib.rpc.server.shutdown is invoked
+  });
+
+class VTADeviceAPI final : public DeviceAPI {
+ public:
+  // Selecting a device is a no-op.
+  void SetDevice(TVMContext ctx) final {}
+
+  // Only kExist is answered (with 1); other kinds leave rv untouched.
+  void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final {
+    if (kind != kExist) return;
+    *rv = 1;
+  }
+
+  // Buffers come from the VTA runtime; alignment and type_hint are unused.
+  void* AllocDataSpace(TVMContext ctx,
+                       size_t size, size_t alignment,
+                       TVMType type_hint) final {
+    return VTABufferAlloc(VTATLSCommandHandle(), size);
+  }
+
+  void FreeDataSpace(TVMContext ctx, void* ptr) final {
+    VTABufferFree(VTATLSCommandHandle(), ptr);
+  }
+
+  // kind_mask passed to VTABufferCopy: value 2 is set when the source is
+  // not on the CPU, value 1 when the destination is not on the CPU.
+  void CopyDataFromTo(const void* from,
+                      size_t from_offset,
+                      void* to,
+                      size_t to_offset,
+                      size_t size,
+                      TVMContext ctx_from,
+                      TVMContext ctx_to,
+                      TVMStreamHandle stream) final {
+    const int from_device = (ctx_from.device_type != kDLCPU) ? 2 : 0;
+    const int to_device = (ctx_to.device_type != kDLCPU) ? 1 : 0;
+    VTABufferCopy(VTATLSCommandHandle(),
+                  from, from_offset,
+                  to, to_offset,
+                  size, from_device | to_device);
+  }
+
+  // StreamSync is a no-op.
+  void StreamSync(TVMContext ctx, TVMStreamHandle stream) final {}
+
+  // Workspace requests are served by a thread-local VTAWorkspacePool.
+  void* AllocWorkspace(TVMContext ctx, size_t size, TVMType type_hint) final;
+  void FreeWorkspace(TVMContext ctx, void* data) final;
+
+  // Shared instance used by the registry entry and the workspace pool.
+  static const std::shared_ptr<VTADeviceAPI>& Global() {
+    static std::shared_ptr<VTADeviceAPI> inst =
+        std::make_shared<VTADeviceAPI>();
+    return inst;
+  }
+};
+
+struct VTAWorkspacePool : public WorkspacePool {
+  // Pool bound to the kExtDev device type and the shared VTADeviceAPI.
+  VTAWorkspacePool()
+      : WorkspacePool(static_cast<DLDeviceType>(kExtDev), VTADeviceAPI::Global()) {}
+};
+
+void* VTADeviceAPI::AllocWorkspace(TVMContext ctx, size_t size, TVMType type_hint) {
+  auto pool = dmlc::ThreadLocalStore<VTAWorkspacePool>::Get();  // per-thread pool
+  return pool->AllocWorkspace(ctx, size);
+}
+
+void VTADeviceAPI::FreeWorkspace(TVMContext ctx, void* data) {  // uses the per-thread pool
+  dmlc::ThreadLocalStore<VTAWorkspacePool>::Get()->FreeWorkspace(ctx, data);
+}
+
+TVM_REGISTER_GLOBAL("device_api.ext_dev")
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+    // Publish the shared instance through its DeviceAPI base pointer.
+    *rv = static_cast<void*>(static_cast<DeviceAPI*>(VTADeviceAPI::Global().get()));
+  });
+}  // namespace runtime
+}  // namespace tvm
diff --git a/vta/tests/driver/Makefile b/vta/tests/driver/Makefile
deleted file mode 100644
index dad8dbed..00000000
--- a/vta/tests/driver/Makefile
+++ /dev/null
@@ -1,59 +0,0 @@
-CC ?= g++
-CFLAGS = -Wall -O3 -std=c++11 -I/usr/include
-LDFLAGS = -L/usr/lib -L/home/xilinx/pynq/drivers
-LIBS = -l:libsds_lib.so -l:libdma.so
-SRC_DIR = ../../src
-INCLUDE_DIR = ../../include
-DRIVER_DIR = $(SRC_DIR)/driver/pynq
-TESTLIB_DIR = $(SRC_DIR)/test
-VPATH = $(DRIVER_DIR):$(TESTLIB_DIR)
-SOURCES = vta_pynq_driver.c vta_test_lib.cc
-OBJECTS = vta_pynq_driver.o vta_test_lib.o driver_test.o
-EXECUTABLE = vta
-
-# VTA Parameters
-#  Log of input width in bits
-LOG_INP_WIDTH = 3
-#  Log of weight width in bits
-LOG_WGT_WIDTH = 3
-#  Log of accum width in bits
-LOG_ACC_WIDTH = 5
-#  Log of output width in bits
-LOG_OUT_WIDTH = $(LOG_INP_WIDTH)
-#  Log of tensor batch size (A in (A,B)x(B,C) matrix multiplication)
-LOG_BATCH = 0
-#  Log of tensor inner block size (B in (A,B)x(B,C) matrix multiplication)
-LOG_IN_BLOCK = 4
-#  Log of tensor outer block size (C in (A,B)x(B,C) matrix multiplication)
-LOG_OUT_BLOCK = 4
-#  Log of uop buffer size in Bytes
-LOG_UOP_BUFF_SIZE = 15
-#  Log of inp buffer size in Bytes
-LOG_INP_BUFF_SIZE = 15
-#  Log of wgt buffer size in Bytes
-LOG_WGT_BUFF_SIZE = 15
-#  Log of acc buffer size in Bytes
-LOG_ACC_BUFF_SIZE = 17
-#  Log of out buffer size in Bytes
-LOG_OUT_BUFF_SIZE = $(shell echo "$$(( $(LOG_ACC_BUFF_SIZE)+$(LOG_OUT_WIDTH)-$(LOG_ACC_WIDTH) ))" )
-
-# Define flags
-CFLAGS += -I $(INCLUDE_DIR) -DNO_SIM \
-	-DDEBUG=0 -DLOG_WGT_WIDTH=$(LOG_WGT_WIDTH) -DLOG_INP_WIDTH=$(LOG_INP_WIDTH) \
-	-DLOG_ACC_WIDTH=$(LOG_ACC_WIDTH) -DLOG_OUT_WIDTH=$(LOG_OUT_WIDTH) \
-	-DLOG_BATCH=$(LOG_BATCH) -DLOG_BLOCK_IN=$(LOG_IN_BLOCK) -DLOG_BLOCK_OUT=$(LOG_OUT_BLOCK) \
-	-DLOG_UOP_BUFF_SIZE=$(LOG_UOP_BUFF_SIZE) -DLOG_INP_BUFF_SIZE=$(LOG_INP_BUFF_SIZE) \
-	-DLOG_WGT_BUFF_SIZE=$(LOG_WGT_BUFF_SIZE) -DLOG_ACC_BUFF_SIZE=$(LOG_ACC_BUFF_SIZE) \
-	-DLOG_OUT_BUFF_SIZE=$(LOG_OUT_BUFF_SIZE)
-
-# All Target
-all: $(EXECUTABLE)
-
-%.o: %.cc $(SOURCES)
-	$(CC) -c -o $@ $< $(CFLAGS)
-
-$(EXECUTABLE): $(OBJECTS)
-	$(CC) $(LDFLAGS) $(OBJECTS) -o $@ $(LIBS)
-
-clean:
-	rm -rf *.o $(EXECUTABLE)
diff --git a/vta/src/test/vta_test_lib.cc b/vta/tests/hardware/common/test_lib.cc
similarity index 97%
rename from vta/src/test/vta_test_lib.cc
rename to vta/tests/hardware/common/test_lib.cc
index df8fce0d..d203b2aa 100644
--- a/vta/src/test/vta_test_lib.cc
+++ b/vta/tests/hardware/common/test_lib.cc
@@ -4,7 +4,7 @@
  * \brief Test library for the VTA design simulation and driver tests.
  */
 
-#include "vta_test_lib.h"
+#include "./test_lib.h"
 
 const char* getOpcodeString(int opcode, bool use_imm) {
   // Returns string name
@@ -153,7 +153,7 @@ void free3dArray(T *** array, int rows, int cols, int depth) {
 
 void * allocBuffer(size_t num_bytes) {
 #ifdef NO_SIM
-  return cma_alloc(num_bytes, CACHED);
+  return VTAMemAlloc(num_bytes, CACHED);
 #else
   return malloc(num_bytes);
 #endif
@@ -161,7 +161,7 @@ void * allocBuffer(size_t num_bytes) {
 
 void freeBuffer(void * buffer) {
 #ifdef NO_SIM
-  return cma_free(buffer);
+  return VTAMemFree(buffer);
 #else
   return free(buffer);
 #endif
@@ -353,7 +353,7 @@ VTAUop * getCopyUops(int y_size, int x_size, int uop_compression) {
 
   // Allocate buffer
 #ifdef NO_SIM
-  VTAUop *uop_buf = (VTAUop *) cma_alloc(sizeof(VTAUop) * uop_size, CACHED);
+  VTAUop *uop_buf = (VTAUop *) VTAMemAlloc(sizeof(VTAUop) * uop_size, CACHED);
 #else
   VTAUop *uop_buf = (VTAUop *) malloc(sizeof(VTAUop) * uop_size);
 #endif
@@ -388,7 +388,7 @@ VTAUop * getGEMMUops(int batch, int in_feat, int out_feat, bool uop_compression,
 
   // Allocate buffer
 #ifdef NO_SIM
-  VTAUop *uop_buf = (VTAUop *) cma_alloc(sizeof(VTAUop) * uop_size, CACHED);
+  VTAUop *uop_buf = (VTAUop *) VTAMemAlloc(sizeof(VTAUop) * uop_size, CACHED);
 #else
   VTAUop *uop_buf = (VTAUop *) malloc(sizeof(VTAUop) * uop_size);
 #endif
@@ -449,7 +449,7 @@ VTAUop * getMapALUUops(int vector_size, bool uop_compression) {
 
   // Allocate buffer
 #ifdef NO_SIM
-  VTAUop *uop_buf = (VTAUop *) cma_alloc(sizeof(VTAUop) * uop_size, CACHED);
+  VTAUop *uop_buf = (VTAUop *) VTAMemAlloc(sizeof(VTAUop) * uop_size, CACHED);
 #else
   VTAUop *uop_buf = (VTAUop *) malloc(sizeof(VTAUop) * uop_size);
 #endif
@@ -762,7 +762,7 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
   }
 
   // Compute reference output
-  inp_T **outputs_ref = alloc2dArray<inp_T>(batch, vector_size);
+  out_T **outputs_ref = alloc2dArray<out_T>(batch, vector_size);
   for (int i = 0; i < batch; i ++) {
     for (int j = 0; j < vector_size; j ++) {
       acc_T tmp = 0;
@@ -802,7 +802,7 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
         tmp = inputs[i][j] >> immediate[i / BATCH];
       }
       // Set
-      outputs_ref[i][j] = (inp_T) tmp;
+      outputs_ref[i][j] = (out_T) tmp;
     }
   }
 
@@ -811,7 +811,7 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
   packBuffer<acc_T, ACC_WIDTH>(bias_buf, inputs, batch, vector_size * input_sets, BATCH, BLOCK_OUT);
 
   // Prepare output buffer
-  inp_T *output_buf = (inp_T *) allocBuffer(INP_ELEM_BYTES * batch * tx_size * input_sets);
+  out_T *output_buf = (out_T *) allocBuffer(INP_ELEM_BYTES * batch * tx_size * input_sets);
 
 #ifdef NO_SIM
   // Invoke the VTA
@@ -833,8 +833,8 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
 #endif
 
   // Unpack output buffer
-  inp_T **outputs = alloc2dArray<inp_T>(batch, vector_size);
-  unpackBuffer<inp_T, INP_WIDTH>(outputs, output_buf, batch, vector_size, BATCH, BLOCK_OUT);
+  out_T **outputs = alloc2dArray<out_T>(batch, vector_size);
+  unpackBuffer<out_T, OUT_WIDTH>(outputs, output_buf, batch, vector_size, BATCH, BLOCK_OUT);
 
   // Correctness checks
   int err = 0;
@@ -853,8 +853,8 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
   // Free all allocated arrays
   free(immediate);
   free2dArray<acc_T>(inputs, batch, vector_size * input_sets);
-  free2dArray<inp_T>(outputs_ref, batch, vector_size);
-  free2dArray<inp_T>(outputs, batch, vector_size);
+  free2dArray<out_T>(outputs_ref, batch, vector_size);
+  free2dArray<out_T>(outputs, batch, vector_size);
   freeBuffer(insn_buf);
   freeBuffer(uop_buf);
   freeBuffer(bias_buf);
@@ -891,17 +891,17 @@ virtual_threads=%d\n",
   int ins_size = batch / block * out_feat / block * (2 + in_feat / block * 3) + 2;
   int uop_size = uop_compression ? block / BATCH * virtual_threads :
     block / BATCH * block / BLOCK_IN * block / BLOCK_OUT * virtual_threads;
-  int wgt_size = in_feat / BLOCK_IN * out_feat / BLOCK_OUT;
   int inp_size = batch / BATCH * in_feat / BLOCK_IN;
+  int wgt_size = in_feat / BLOCK_IN * out_feat / BLOCK_OUT;
   int out_size = batch / BATCH * out_feat / BLOCK_OUT;
   // Blocked buffer sizes (in terms of elements)
-  int wgt_block_size = block / BLOCK_IN * block / BLOCK_OUT;
   int inp_block_size = block / BATCH * block / BLOCK_IN;
+  int wgt_block_size = block / BLOCK_IN * block / BLOCK_OUT;
   int out_block_size = block / BATCH * block / BLOCK_OUT;
   // Make sure we don't exceed buffer bounds
   assert(uop_size <= UOP_BUFF_DEPTH);
-  assert(wgt_block_size <= WGT_BUFF_DEPTH);
   assert(inp_block_size <= INP_BUFF_DEPTH);
+  assert(wgt_block_size <= WGT_BUFF_DEPTH);
   assert(out_block_size <= ACC_BUFF_DEPTH);
 
   // Initialize instruction buffer
@@ -1017,15 +1017,15 @@ virtual_threads=%d\n",
   printMicroOp(uop_size, uop_buf);
 #endif
 
-  // Initialize weights
-  wgt_T **weights = allocInit2dArray<wgt_T, WGT_WIDTH>(out_feat, in_feat);
   // Initialize inputs
   inp_T **inputs = allocInit2dArray<inp_T, INP_WIDTH>(batch, in_feat);
+  // Initialize weights
+  wgt_T **weights = allocInit2dArray<wgt_T, WGT_WIDTH>(out_feat, in_feat);
   // Initialize biases
   acc_T **biases = allocInit2dArray<acc_T, ACC_WIDTH>(batch, out_feat);
 
   // Reference GEMM implementation
-  inp_T **outputs_ref = alloc2dArray<inp_T>(batch, out_feat);
+  out_T **outputs_ref = alloc2dArray<out_T>(batch, out_feat);
   for (int i = 0; i < batch; i ++) {
     for (int j = 0; j < out_feat; j ++) {
       acc_T sum = biases[i][j];
@@ -1033,21 +1033,21 @@ virtual_threads=%d\n",
         sum += (acc_T) (inputs[i][k] * weights[j][k]);
       }
       // Set
-      outputs_ref[i][j] = (inp_T) sum;
+      outputs_ref[i][j] = (out_T) sum;
     }
   }
 
-  // Prepare the weight buffer
-  wgt_T *weight_buf = (wgt_T *) allocBuffer(WGT_ELEM_BYTES * wgt_size);
-  packBuffer<wgt_T, WGT_WIDTH>(weight_buf, weights, out_feat, in_feat, BLOCK_OUT, BLOCK_IN);
   // Prepare the input buffer
   inp_T *input_buf = (inp_T *) allocBuffer(INP_ELEM_BYTES * inp_size);
   packBuffer<inp_T, INP_WIDTH>(input_buf, inputs, batch, in_feat, BATCH, BLOCK_IN);
+  // Prepare the weight buffer
+  wgt_T *weight_buf = (wgt_T *) allocBuffer(WGT_ELEM_BYTES * wgt_size);
+  packBuffer<wgt_T, WGT_WIDTH>(weight_buf, weights, out_feat, in_feat, BLOCK_OUT, BLOCK_IN);
   // Prepare the bias buffer
   acc_T *bias_buf = (acc_T *) allocBuffer(ACC_ELEM_BYTES * out_size);
   packBuffer<acc_T, ACC_WIDTH>(bias_buf, biases, batch, out_feat, BATCH, BLOCK_OUT);
   // Prepare the output buffer
-  inp_T *output_buf = (inp_T *) allocBuffer(INP_ELEM_BYTES * out_size);
+  out_T *output_buf = (out_T *) allocBuffer(INP_ELEM_BYTES * out_size);
 
 #ifdef NO_SIM
   // Invoke the VTA
@@ -1069,8 +1069,8 @@ virtual_threads=%d\n",
 #endif
 
   // Unpack output data
-  inp_T **outputs = alloc2dArray<inp_T>(batch, out_feat);
-  unpackBuffer<inp_T, INP_WIDTH>(outputs, output_buf, batch, out_feat, BATCH, BLOCK_OUT);
+  out_T **outputs = alloc2dArray<out_T>(batch, out_feat);
+  unpackBuffer<out_T, OUT_WIDTH>(outputs, output_buf, batch, out_feat, BATCH, BLOCK_OUT);
 
   // Correctness checks
   int err = 0;
@@ -1087,15 +1087,15 @@ virtual_threads=%d\n",
   }
 
   // Free all allocated arrays
-  free2dArray<wgt_T>(weights, out_feat, in_feat);
   free2dArray<inp_T>(inputs, batch, in_feat);
+  free2dArray<wgt_T>(weights, out_feat, in_feat);
   free2dArray<acc_T>(biases, batch, out_feat);
-  free2dArray<inp_T>(outputs_ref, batch, out_feat);
-  free2dArray<inp_T>(outputs, batch, out_feat);
+  free2dArray<out_T>(outputs_ref, batch, out_feat);
+  free2dArray<out_T>(outputs, batch, out_feat);
   freeBuffer((void *) insn_buf);
   freeBuffer((void *) uop_buf);
-  freeBuffer((void *) weight_buf);
   freeBuffer((void *) input_buf);
+  freeBuffer((void *) weight_buf);
   freeBuffer((void *) bias_buf);
   freeBuffer((void *) output_buf);
 
diff --git a/vta/include/vta_test_lib.h b/vta/tests/hardware/common/test_lib.h
similarity index 97%
rename from vta/include/vta_test_lib.h
rename to vta/tests/hardware/common/test_lib.h
index b4eb1684..fad2e4da 100644
--- a/vta/include/vta_test_lib.h
+++ b/vta/tests/hardware/common/test_lib.h
@@ -7,21 +7,25 @@
 #ifndef VTA_TESTLIB_H_
 #define VTA_TESTLIB_H_
 
-#include "vta_params.h"
-
 #include <assert.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <vta/hw_spec.h>
 
 #ifdef NO_SIM
 
-#include "vta_pynq_driver.h"
+#include <vta/driver.h>
+
+#ifdef PYNQ_TARGET
+#include "../../../src/pynq/pynq_driver.h"
+#endif //PYNQ_TARGET
 
 typedef uint64_t axi_T;
 typedef uint32_t uop_T;
 typedef int8_t wgt_T;
 typedef int8_t inp_T;
+typedef int8_t out_T;
 typedef int32_t acc_T;
 
 uint64_t vta (
@@ -35,8 +39,7 @@ uint64_t vta (
 
 #else //NO_SIM
 
-#include "vta.h"
-#include "vta_typedefs.h"
+#include "../../../hardware/vivado/src/vta.h"
 
 #endif //NO_SIM
 
diff --git a/vta/tests/hardware/pynq/Makefile b/vta/tests/hardware/pynq/Makefile
new file mode 100644
index 00000000..7e70366f
--- /dev/null
+++ b/vta/tests/hardware/pynq/Makefile
@@ -0,0 +1,52 @@
+# Builds the VTA hardware test executable from metal_test, the Pynq driver and the shared test library.
+
+# make predefines CC (as "cc"), so "CC ?= g++" never takes effect and the
+# C++ objects would be linked by the C driver. Replace only the built-in
+# default; a CC from the environment or the command line is still honored.
+ifeq ($(origin CC),default)
+CC = g++
+endif
+CFLAGS = -Wall -O3 -std=c++11 -I/usr/include
+LDFLAGS = -L/usr/lib -L/home/xilinx/pynq/drivers
+LIBS = -l:libsds_lib.so -l:libdma.so
+INCLUDE_DIR = ../../../include
+DRIVER_DIR = ../../../src/pynq
+TESTLIB_DIR = ../common
+# pynq_driver.cc and test_lib.cc live outside this directory; VPATH lets the
+# pattern rule below locate them.
+VPATH = $(DRIVER_DIR):$(TESTLIB_DIR)
+SOURCES = pynq_driver.cc test_lib.cc
+OBJECTS = pynq_driver.o test_lib.o metal_test.o
+EXECUTABLE = vta
+
+# Include top-level config file: prefer a user-provided ../../../config.mk,
+# otherwise fall back to the default in ../../../make/config.mk.
+ifndef config
+ifneq ("$(wildcard ../../../config.mk)", "")
+  config = ../../../config.mk
+else
+  config = ../../../make/config.mk
+endif
+endif
+include $(config)
+
+# Define flags
+CFLAGS += -I $(INCLUDE_DIR) -DNO_SIM -DDEBUG=0
+CFLAGS += $(ADD_CFLAGS)
+
+# These targets are commands, not files.
+.PHONY: all clean
+
+# All Target
+all: $(EXECUTABLE)
+
+# Each object depends only on its own source. -MMD -MP writes a .d file next
+# to the object listing the headers it includes, so header edits trigger a
+# rebuild of exactly the objects that use them.
+%.o: %.cc
+	$(CC) -c -o $@ $< $(CFLAGS) -MMD -MP
+
+$(EXECUTABLE): $(OBJECTS)
+	$(CC) $(LDFLAGS) $(OBJECTS) -o $@ $(LIBS)
+
+clean:
+	rm -rf *.o *.d $(EXECUTABLE)
+
+# Pull in the generated header dependencies (absent on the first build).
+-include $(OBJECTS:.o=.d)
diff --git a/vta/tests/driver/driver_test.cc b/vta/tests/hardware/pynq/metal_test.cc
similarity index 71%
rename from vta/tests/driver/driver_test.cc
rename to vta/tests/hardware/pynq/metal_test.cc
index 6cdc32a9..b5147399 100644
--- a/vta/tests/driver/driver_test.cc
+++ b/vta/tests/hardware/pynq/metal_test.cc
@@ -9,8 +9,9 @@
 #include <stdlib.h>
 #include <string.h>
 #include <time.h>
-#include "vta_test_lib.h"
-#include "vta_pynq_driver.h"
+#include <vta/driver.h>
+#include "../../../src/pynq/pynq_driver.h"
+#include "../common/test_lib.h"
 
 // VTA invocation (present the same abstraction as in the simulation tests)
 uint64_t vta (
@@ -43,18 +44,18 @@ uint64_t vta (
 #endif
 
     // Program VTA
-    ProgramVTA(bitstream);
+    VTAProgram(bitstream);
     // Get VTA handles
-    VTAHandle vta_fetch_handle = MapRegister(VTA_FETCH_ADDR, VTA_RANGE);
-    VTAHandle vta_load_handle = MapRegister(VTA_LOAD_ADDR, VTA_RANGE);
-    VTAHandle vta_compute_handle = MapRegister(VTA_COMPUTE_ADDR, VTA_RANGE);
-    VTAHandle vta_store_handle = MapRegister(VTA_STORE_ADDR, VTA_RANGE);
+    VTAHandle vta_fetch_handle = VTAMapRegister(VTA_FETCH_ADDR, VTA_RANGE);
+    VTAHandle vta_load_handle = VTAMapRegister(VTA_LOAD_ADDR, VTA_RANGE);
+    VTAHandle vta_compute_handle = VTAMapRegister(VTA_COMPUTE_ADDR, VTA_RANGE);
+    VTAHandle vta_store_handle = VTAMapRegister(VTA_STORE_ADDR, VTA_RANGE);
 
     // Physical address pointers
     uint32_t insn_phy = insns ? cma_get_phy_addr(insns) : 0;
     uint32_t uop_phy = uops ? cma_get_phy_addr(uops) : 0;
-    uint32_t weight_phy = weights ? cma_get_phy_addr(weights) : 0;
     uint32_t input_phy = inputs ? cma_get_phy_addr(inputs) : 0;
+    uint32_t weight_phy = weights ? cma_get_phy_addr(weights) : 0;
     uint32_t bias_phy = biases ? cma_get_phy_addr(biases) : 0;
     uint32_t output_phy = outputs ? cma_get_phy_addr(outputs) : 0;
 
@@ -65,29 +66,29 @@ uint64_t vta (
     clock_gettime(CLOCK_REALTIME, &start);
 
     // FETCH @ 0x10 : Data signal of insn_count_V
-    WriteMappedReg(vta_fetch_handle, 0x10, insn_count);
+    VTAWriteMappedReg(vta_fetch_handle, 0x10, insn_count);
     // FETCH @ 0x18 : Data signal of insns_V
-    if (insns) WriteMappedReg(vta_fetch_handle, 0x18, insn_phy);
-    // LOAD @ 0x10 : Data signal of weight_V
-    if (weights) WriteMappedReg(vta_load_handle, 0x10, weight_phy);
-    // LOAD @ 0x18 : Data signal of inputs_V
-    if (inputs) WriteMappedReg(vta_load_handle, 0x18, input_phy);
+    if (insns) VTAWriteMappedReg(vta_fetch_handle, 0x18, insn_phy);
+    // LOAD @ 0x10 : Data signal of inputs_V
+    if (inputs) VTAWriteMappedReg(vta_load_handle, 0x10, input_phy);
+    // LOAD @ 0x18 : Data signal of weight_V
+    if (weights) VTAWriteMappedReg(vta_load_handle, 0x18, weight_phy);
     // COMPUTE @ 0x20 : Data signal of uops_V
-    if (uops) WriteMappedReg(vta_compute_handle, 0x20, uop_phy);
+    if (uops) VTAWriteMappedReg(vta_compute_handle, 0x20, uop_phy);
     // COMPUTE @ 0x28 : Data signal of biases_V
-    if (biases) WriteMappedReg(vta_compute_handle, 0x28, bias_phy);
+    if (biases) VTAWriteMappedReg(vta_compute_handle, 0x28, bias_phy);
     // STORE @ 0x10 : Data signal of outputs_V
-    if (outputs) WriteMappedReg(vta_store_handle, 0x10, output_phy);
+    if (outputs) VTAWriteMappedReg(vta_store_handle, 0x10, output_phy);
 
     // VTA start
-    WriteMappedReg(vta_fetch_handle, 0x0, 0x1);
-    WriteMappedReg(vta_load_handle, 0x0, 0x81);
-    WriteMappedReg(vta_compute_handle, 0x0, 0x81);
-    WriteMappedReg(vta_store_handle, 0x0, 0x81);
+    VTAWriteMappedReg(vta_fetch_handle, 0x0, 0x1);
+    VTAWriteMappedReg(vta_load_handle, 0x0, 0x81);
+    VTAWriteMappedReg(vta_compute_handle, 0x0, 0x81);
+    VTAWriteMappedReg(vta_store_handle, 0x0, 0x81);
 
     int flag = 0, t = 0;
     for (t = 0; t < 10000000; ++t) {
-      flag = ReadMappedReg(vta_compute_handle, 0x18);
+      flag = VTAReadMappedReg(vta_compute_handle, 0x18);
       if (flag & VTA_DONE) break;
     }
 
@@ -104,10 +105,10 @@ uint64_t vta (
     t_fpga = 1000000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec);
 
     // Unmap VTA register
-    UnmapRegister(vta_fetch_handle, VTA_RANGE);
-    UnmapRegister(vta_load_handle, VTA_RANGE);
-    UnmapRegister(vta_compute_handle, VTA_RANGE);
-    UnmapRegister(vta_store_handle, VTA_RANGE);
+    VTAUnmapRegister(vta_fetch_handle, VTA_RANGE);
+    VTAUnmapRegister(vta_load_handle, VTA_RANGE);
+    VTAUnmapRegister(vta_compute_handle, VTA_RANGE);
+    VTAUnmapRegister(vta_store_handle, VTA_RANGE);
 
     return t_fpga;
 };