[DOC, TVM] ResNet tutorial, updated TVM (#51)
Parent: 5739acaba6
Commit: 3ae9e15559

@@ -20,7 +20,7 @@ verbose = False
# only run fpga component, mark non-conv ops as nop
debug_fpga_only = False

# Obtain model and hardware files (they're too large to check-in)
# Obtain model files (they're too large to check-in)
# Download them into _data dir
data_dir = "_data/"
url = "https://homes.cs.washington.edu/~moreau/media/vta/"
@@ -115,11 +115,6 @@ sym = vta.graph.clean_conv_fuse(sym)
if target.device_name == "vta":
    sym = vta.graph.pack(sym, shape_dict, bfactor, cfactor)

graph_attr.set_shape_inputs(sym, shape_dict)
sym = sym.apply("InferShape")
graph_attr.set_dtype_inputs(sym, dtype_dict)
sym = sym.apply("InferType")

with nnvm.compiler.build_config(opt_level=3):
    if target.device_name != "vta":
        graph, lib, params = nnvm.compiler.build(

@@ -38,7 +38,7 @@ def test_vta_conv2d():
res_conv = vta.top.packed_conv2d(
    data, kernel, padding=(wl.hpad, wl.wpad), strides=(wl.hstride, wl.wstride))
res = topi.right_shift(res_conv, 8)
res = topi.add(res, bias)
res = topi.broadcast_add(res, bias)
res = my_clip(res, 0, 127)
res = topi.cast(res, "int8")
@@ -0,0 +1,427 @@
"""
2D Convolution Optimization
===========================
**Author**: `Thierry Moreau <https://homes.cs.washington.edu/~moreau/>`_

This tutorial provides an overview of how to use TVM to map a 2D convolution
workload efficiently on the VTA design.
We recommend covering the :ref:`mat-mult-opt` tutorial first.

2D convolution is dominant in most computer vision deep neural networks.
In this tutorial, we will demonstrate TVM schedule optimizations to map
2D convolution operators in NCHW layout onto VTA.
We also introduce the notion of latency hiding, which allows us to
maximize VTA's compute and memory resource utilization.
"""

######################################################################
# RPC Setup
# ---------
# We start by programming the Pynq's FPGA and building its RPC runtime.
from __future__ import absolute_import, print_function

import os
import tvm
import vta
import numpy as np

from tvm.contrib import rpc, util
from vta.testing import simulator

# Load VTA parameters from the config.json file
env = vta.get_env()

# We read the Pynq RPC host IP address and port number from the OS environment
host = os.environ.get("VTA_PYNQ_RPC_HOST", "192.168.2.99")
port = int(os.environ.get("VTA_PYNQ_RPC_PORT", "9091"))

# We configure both the bitstream and the runtime system on the Pynq
# to match the VTA configuration specified by the config.json file.
if env.TARGET == "pynq":

    # Make sure that TVM was compiled with RPC=1
    assert tvm.module.enabled("rpc")
    remote = rpc.connect(host, port)

    # Reconfigure the JIT runtime
    vta.reconfig_runtime(remote)

    # Program the FPGA with a pre-compiled VTA bitstream.
    # You can program the FPGA with your own custom bitstream
    # by passing the path to the bitstream file instead of None.
    vta.program_fpga(remote, bitstream=None)

# In simulation mode, host the RPC server locally.
elif env.TARGET == "sim":
    remote = rpc.LocalSession()
######################################################################
# Computation Declaration
# -----------------------
# As a first step, we need to describe our 2D convolution computation
# in NCHW format.
#
# We define the 2D convolution shape by the batch size,
# spatial dimensions, input channels, output channels, kernel dimensions,
# padding dimensions, and stride dimensions.
#
# We pick the shape of the 9th convolutional layer of the ResNet-18
# architecture as our convolution workload parameters.
#
# We've added extra operators to the 2D convolution that apply
# shifting and clipping to the output in order to mimic a fixed-point
# convolution followed by a rectified linear activation.
# We describe the TVM dataflow graph of the 2D convolution layer below:
#
# .. image:: https://raw.githubusercontent.com/uwsaml/web-data/master/vta/tutorial/conv2d_dataflow.png
#      :align: center
#
# This computation is intentionally too large to fit onto VTA's on-chip
# buffers all at once. Therefore, in the scheduling phase, we'll
# rely on computation blocking strategies to break the computation down into
# manageable chunks.
#
# .. note::
#
#    *Spatial padding*
#
#    Note that we'll need to import the TOPI library to apply spatial padding
#    on the input feature map tensor.
#    Spatial padding facilitates blocking in the context of 2D convolutions
#    because the same (x, y) spatial location of the input
#    feature map of any given layer is read more than once if the convolution
#    kernel window size is greater than one.
#    On CPUs and GPUs, one way to increase efficiency of memory accesses
#    when parallelizing work is spatial packing, which requires data re-layout.
#    VTA's load DMA engine can insert padding automatically so that the original
#    input feature map does not have to be re-packed in memory.
#
# We show the effect of VTA's on-the-fly spatial padding when data is being
# loaded from DRAM into VTA's SRAM, following a 2D strided and padded memory
# read.
#
# .. image:: https://raw.githubusercontent.com/uwsaml/web-data/master/vta/tutorial/padding.png
#      :align: center
#      :width: 480px
import topi

# 2D convolution layer dimensions taken from ResNet-18 architecture
# (9th convolutional layer)
batch_size = 1
height = 14
width = 14
in_channels = 256
out_channels = 256
kernel_h = 3
kernel_w = 3
pad_h = 1
pad_w = 1
stride_h = 1
stride_w = 1
assert batch_size % env.BATCH == 0
assert in_channels % env.BLOCK_IN == 0
assert out_channels % env.BLOCK_OUT == 0

# Input feature map: (N, IC, H, W, n, ic)
data_shape = (batch_size // env.BATCH,
              in_channels // env.BLOCK_IN,
              height,
              width,
              env.BATCH,
              env.BLOCK_IN)
# Kernel: (OC, IC, H, W, oc, ic)
kernel_shape = (out_channels // env.BLOCK_OUT,
                in_channels // env.BLOCK_IN,
                kernel_h,
                kernel_w,
                env.BLOCK_OUT,
                env.BLOCK_IN)
# Derive output feature map dimensions
fout_height = (height + 2 * pad_h - kernel_h) // stride_h + 1
fout_width = (width + 2 * pad_w - kernel_w) // stride_w + 1
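# Worked check for this workload: fout_height = (14 + 2 * 1 - 3) // 1 + 1 = 14,
# and likewise fout_width = 14, so the output preserves the spatial size.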
# Output feature map: (N, OC, H, W, n, oc)
output_shape = (batch_size // env.BATCH,
                out_channels // env.BLOCK_OUT,
                fout_height,
                fout_width,
                env.BATCH,
                env.BLOCK_OUT)

# Convolution reduction axes
dy = tvm.reduce_axis((0, kernel_h), name='dy')
dx = tvm.reduce_axis((0, kernel_w), name='dx')
ic = tvm.reduce_axis((0, in_channels // env.BLOCK_IN), name='ic')
ic_tns = tvm.reduce_axis((0, env.BLOCK_IN), name='ic_tns')

# Input placeholder tensors
data = tvm.placeholder(data_shape,
                       name="data",
                       dtype=env.inp_dtype)
kernel = tvm.placeholder(kernel_shape,
                         name="kernel",
                         dtype=env.wgt_dtype)

# Copy buffers:
#   Apply spatial padding to input feature map
data_buf = topi.nn.pad(data,
                       [0, 0, pad_h, pad_w, 0, 0],
                       name="data_buf")
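# (Assuming the default VTA config with BATCH=1 and BLOCK_IN=16, data_buf has
# shape (1, 16, 16, 16, 1, 16): the two spatial axes grow from 14 to 16 since
# topi.nn.pad pads by pad_h/pad_w on both sides of each padded axis.)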
kernel_buf = tvm.compute(kernel_shape, lambda *i: kernel(*i), "kernel_buf")

# Declare 2D convolution
res_conv = tvm.compute(
    output_shape,
    lambda bo, co, i, j, bi, ci: tvm.sum(
        data_buf[bo, ic, i*stride_h+dy, j*stride_w+dx, bi, ic_tns].astype(env.acc_dtype) *
        kernel_buf[co, ic, dy, dx, ci, ic_tns].astype(env.acc_dtype),
        axis=[ic, dy, dx, ic_tns]),
    name="res_conv")

# Add shift stage for fixed-point normalization
res_shr = tvm.compute(output_shape,
                      lambda *i: res_conv(*i) >> 8,
                      name="res_shr")

# Apply clipping between (0, input max value)
inp_max = (1 << (env.INP_WIDTH - 1)) - 1
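# (With the default INP_WIDTH of 8, inp_max is 127, so the two stages below
# clamp the shifted result to [0, 127]: a ReLU followed by saturation into the
# signed 8-bit input range.)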
res_max = tvm.compute(output_shape,
                      lambda *i: tvm.max(res_shr(*i), 0),
                      "res_max")
res_min = tvm.compute(output_shape,
                      lambda *i: tvm.min(res_max(*i), inp_max),
                      "res_min")

# Result Tensor
res = tvm.compute(output_shape,
                  lambda *i: res_min(*i).astype(env.inp_dtype),
                  name="res")


######################################################################
# Scheduling the Computation
# --------------------------
# We'll look at a set of schedule transformations necessary to map the
# 2D convolution onto VTA in an efficient fashion.
# Those include:
#
# - Computation blocking
# - Virtual threading to increase compute utilization
# - Lowering to VTA hardware intrinsics

# Create TVM schedule
s = tvm.create_schedule(res.op)
# Let's look at the default TVM schedule
print(tvm.lower(s, [data, kernel, res], simple_mode=True))
######################################################################
# Blocking the Computation
# ~~~~~~~~~~~~~~~~~~~~~~~~
# The 2D convolution is by default too large for activations or kernel weights
# to fit on VTA's on-chip buffers all at once.
# We apply blocking along the input channel, output channel, and height
# spatial dimensions.
# We don't apply blocking along the width spatial dimension since it's
# the innermost dimension in the NCHW layout (and consequently, to increase
# locality, it's best not to block along the innermost dimension).

# Let's define tiling sizes
b_block = 1 // env.BATCH
oc_block = 128 // env.BLOCK_OUT
ic_block = 16 // env.BLOCK_IN
h_block = 7
w_block = 14

# Tile the output tensor along the spatial and output channel dimensions
# (since by default we are doing single batch inference, the split along
# the batch dimension has no effect)
b, oc, y, x, b_tns, oc_tns = s[res].op.axis
b_out, b_inn = s[res].split(b, factor=b_block)
oc_out, oc_inn = s[res].split(oc, factor=oc_block)
y_out, y_inn = s[res].split(y, factor=h_block)
x_out, x_inn = s[res].split(x, factor=w_block)
s[res].reorder(b_out, oc_out, y_out, x_out, b_inn, oc_inn, y_inn, x_inn, b_tns, oc_tns)
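# (For reference, assuming the default VTA config of BATCH=1, BLOCK_IN=16,
# BLOCK_OUT=16: oc_block=8 and ic_block=1, so the 16 output channel blocks
# split into an outer extent of 2 and an inner extent of 8, the height axis of
# 14 splits into 2 x 7, and the width axis of 14 stays as a single block.)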
# Move intermediate computation into each output compute tile
s[res_conv].compute_at(s[res], x_out)
s[res_shr].compute_at(s[res], x_out)
s[res_max].compute_at(s[res], x_out)
s[res_min].compute_at(s[res], x_out)

# Apply additional loop split along reduction axis (input channel)
b_inn, oc_inn, y_inn, x_inn, b_tns, oc_tns = s[res_conv].op.axis
ic_out, ic_inn = s[res_conv].split(ic, factor=ic_block)

# Reorder axes.
# 1) Group the VTA tensor axes in the innermost position: b_tns, oc_tns, ic_tns
#    to allow TVM to tensorize.
# 2) We move the ic_out axis all the way out of the convolution loop to block
#    along the reduction axis.
# 3) Now we re-order the block axes: b_inn, oc_inn, y_inn, x_inn, ic_inn, dy, dx.
#    The VTA runtime/hardware requires us to write to a different output feature map
#    location for every VTA tensor operation.
#    This restriction requires us to order one of oc_inn, y_inn or x_inn right
#    before b_tns, since they all affect output feature map indexing.
#    Therefore, we choose to bring x_inn inside as shown below.
s[res_conv].reorder(ic_out, b_inn, oc_inn, y_inn, ic_inn, dy, dx, x_inn, b_tns, oc_tns, ic_tns)
######################################################################
# Virtual Threading
# ~~~~~~~~~~~~~~~~~
# Virtual threading is a mechanism that increases task-level pipeline
# parallelism in the VTA hardware design.
# Put another way, it increases compute resource utilization by hiding
# memory access latency.
#
# In the implementation below, virtual threading distributes work across two
# threads split along the output channel axis.
# We show how work is split when computing the 2D convolution in the figure
# below.
#
# .. image:: https://raw.githubusercontent.com/uwsaml/web-data/master/vta/tutorial/virtual_threading.png
#      :align: center
#      :width: 480px

# VTA only supports 2 virtual threads
v_threads = 2

# Perform virtual thread split along output channel outer axis
_, tx = s[res].split(oc_out, factor=v_threads)
s[res].reorder(tx, b_out)
s[res].bind(tx, tvm.thread_axis("cthread"))
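# (Under the default config, oc_out has extent 2, so each of the two virtual
# threads ends up computing one half of the 256 output channels.)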
# Let's look at the current TVM schedule after blocking and virtual threading
print(tvm.lower(s, [data, kernel, res], simple_mode=True))

######################################################################
# Lowering Copies to DMA Transfers
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Next we set the buffer scopes to the corresponding on-chip VTA SRAM buffers.
# We move the load loops into the 2D convolution computation loop to stage
# memory loads such that they fit in the on-chip SRAM buffers.
# Finally we annotate the load/store loop outer axes with the DMA copy pragma
# to perform bulk memory transfers on VTA.

# Set scope of SRAM buffers
s[data_buf].set_scope(env.inp_scope)
s[kernel_buf].set_scope(env.wgt_scope)
s[res_conv].set_scope(env.acc_scope)
s[res_shr].set_scope(env.acc_scope)
s[res_min].set_scope(env.acc_scope)
s[res_max].set_scope(env.acc_scope)

# Block data and kernel cache reads
s[data_buf].compute_at(s[res_conv], ic_out)
s[kernel_buf].compute_at(s[res_conv], ic_out)

# Use DMA copy pragma on DRAM->SRAM operations
s[data_buf].pragma(s[data_buf].op.axis[0], env.dma_copy)
s[kernel_buf].pragma(s[kernel_buf].op.axis[0], env.dma_copy)

# Use DMA copy pragma on SRAM->DRAM operation in each result block
# (this implies that these copies should be performed along b_inn,
# or result axis 4)
s[res].pragma(s[res].op.axis[4], env.dma_copy)

######################################################################
# Lowering Computation to VTA Compute Intrinsics
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# The last phase is to lower the computation loops down to VTA hardware
# intrinsics by mapping the 2D convolution to tensor intrinsics,
# and mapping the shift and clipping computations to the vector ALU.

# Apply tensorization over the batch tensor tile axis
s[res_conv].tensorize(b_tns, env.gemm)
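# (env.gemm is VTA's GEMM tensor intrinsic; tensorizing at b_tns maps the
# innermost (b_tns, oc_tns, ic_tns) loop nest onto hardware GEMM operations.)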
# Add an ALU pragma over the shift and clipping operations
s[res_shr].pragma(s[res_shr].op.axis[0], env.alu)
s[res_min].pragma(s[res_min].op.axis[0], env.alu)
s[res_max].pragma(s[res_max].op.axis[0], env.alu)

# Let's look at the final lowered TVM schedule after lowering memory
# loads/stores down to DMA copy intrinsics, and the computation down to
# VTA compute intrinsics.
print(vta.lower(s, [data, kernel, res], simple_mode=True))

######################################################################
# TVM Compilation and Verification
# --------------------------------
# After specifying the schedule, we can compile it into a TVM function.
# We save the module so we can send it over RPC.
# We run the function and verify it against a numpy implementation to
# ensure correctness.

# This library facilitates 2D convolution testing
from topi.testing import conv2d_nchw_python

# Compile the TVM module
my_conv = vta.build(s, [data, kernel, res], "ext_dev", env.target_host, name="my_conv")
temp = util.tempdir()
my_conv.save(temp.relpath("conv2d.o"))
remote.upload(temp.relpath("conv2d.o"))
f = remote.load_module("conv2d.o")

# Get the remote device context
ctx = remote.ext_dev(0)

# Initialize the data and kernel arrays randomly in the int range
# of [-128, 128) in NCHW layout
data_np = np.random.randint(
    -128, 128,
    size=(batch_size, in_channels, height, width)).astype(data.dtype)
kernel_np = np.random.randint(
    -128, 128,
    size=(out_channels, in_channels, kernel_h, kernel_w)).astype(kernel.dtype)

# Apply packing to the data and kernel arrays from a 4D NCHW layout
# to a 6D NCHWnc packed layout
data_packed = data_np.reshape(batch_size // env.BATCH,
                              env.BATCH,
                              in_channels // env.BLOCK_IN,
                              env.BLOCK_IN,
                              height,
                              width).transpose((0, 2, 4, 5, 1, 3))

kernel_packed = kernel_np.reshape(out_channels // env.BLOCK_OUT,
                                  env.BLOCK_OUT,
                                  in_channels // env.BLOCK_IN,
                                  env.BLOCK_IN,
                                  kernel_h,
                                  kernel_w).transpose((0, 2, 4, 5, 1, 3))
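# Optional sanity check (not part of the original tutorial): the packed arrays
# should match the placeholder shapes declared earlier.
assert data_packed.shape == tuple(data_shape)
assert kernel_packed.shape == tuple(kernel_shape)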
# Format the input/output arrays with tvm.nd.array to the DLPack standard
data_nd = tvm.nd.array(data_packed, ctx)
kernel_nd = tvm.nd.array(kernel_packed, ctx)
res_nd = tvm.nd.array(np.zeros(output_shape).astype(res.dtype), ctx)

# Invoke the module to perform the computation
f(data_nd, kernel_nd, res_nd)

# Verify against numpy implementation
res_ref = conv2d_nchw_python(data_np.astype(env.acc_dtype),
                             kernel_np.astype(env.acc_dtype),
                             (stride_h, stride_w),
                             (pad_h, pad_w)).astype(env.acc_dtype)
res_ref = res_ref >> env.INP_WIDTH
res_ref = np.clip(res_ref, 0, inp_max)
res_ref = res_ref.astype(res.dtype)
res_ref = res_ref.reshape((batch_size // env.BATCH,
                           env.BATCH,
                           out_channels // env.BLOCK_OUT,
                           env.BLOCK_OUT,
                           fout_height,
                           fout_width)).transpose((0, 2, 4, 5, 1, 3))
np.testing.assert_allclose(res_ref, res_nd.asnumpy())
print("Successful 2D convolution test!")
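# As an optional extension (a sketch, not part of the diff above), one could
# also time the compiled kernel with TVM's time_evaluator to estimate
# throughput; num_ops counts each multiply-add as two operations.
num_ops = 2 * batch_size * fout_height * fout_width * kernel_h * kernel_w * in_channels * out_channels
evaluator = f.time_evaluator("my_conv", ctx, number=10)
cost = evaluator(data_nd, kernel_nd, res_nd)
print("Conv2D time: %g ms, %g GOPS" % (cost.mean * 1e3, num_ops / cost.mean / 1e9))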
######################################################################
# Summary
# -------
# This tutorial demonstrates how TVM scheduling primitives can be used to
# lower 2D convolution onto hardware accelerator intrinsics, making
# use of hardware-specific optimizations, such as latency hiding with
# virtual threading.
#