[AUTOTVM] TOPI integration for ARM CPU (#1487)
Parent: b625b992a4
Commit: 32076df815
@@ -188,3 +188,6 @@ build*
 # Jetbrain
 .idea
+
+# tmp file
+.nfs*
@@ -0,0 +1,70 @@
# Performance Benchmark

## Results

See the results on the wiki page: https://github.com/dmlc/tvm/wiki/Benchmark

## How to Reproduce

### ARM CPU

We use the RPC infrastructure in TVM to make device management easy, so you need to use it to reproduce the benchmark results.

1. Start an RPC tracker on the host machine.

```bash
python3 -m tvm.exec.rpc_tracker
```

2. Register devices to the tracker.

* For a Linux device
  * Build the tvm runtime on your device [Help](https://docs.tvm.ai/tutorials/nnvm/deploy_model_on_rasp.html#build-tvm-runtime-on-device)
  * Register your device to the tracker by

  ```bash
  python3 -m tvm.exec.rpc_server --tracker=[HOST_IP]:9190 --key=[DEVICE_KEY]
  ```

  Replace `[HOST_IP]` with the IP address of the host machine and `[DEVICE_KEY]` with the name of the device.

  E.g., here is an example command for RK3399:
  `python3 -m tvm.exec.rpc_server --tracker=10.77.1.123:9190 --key=rk3399`, where 10.77.1.123 is the IP address of the tracker.

* For an Android device
  * Build and install the tvm RPC apk on your device [Help](https://github.com/dmlc/tvm/tree/master/apps/android_rpc).
    Make sure you can pass the Android RPC test; once you can, you already know how to register.

3. Verify the device registration.

We can query all registered devices by

```bash
python3 -m tvm.exec.query_rpc_tracker
```

You should be able to find your devices in `Queue Status`. Make sure the registration is correct before going ahead.

For our test environment, one sample output is

```bash
Queue Status
------------------------------
key          free    pending
------------------------------
mate10pro    1       0
p20pro       2       0
pixel2       2       0
rk3399       2       0
rasp3b       8       0
```

4. Run the benchmark.

We did auto-tuning for Huawei P20/Mate10 Pro, Google Pixel2, Raspberry Pi3 and Firefly-RK3399,
and released the pre-tuned parameters in [this repo](https://github.com/uwsaml/tvm-distro).
During compilation, TVM will download these operator parameters automatically.

```bash
python3 arm_cpu_imagenet_bench.py --device rasp3b --rpc-key rasp3b
python3 arm_cpu_imagenet_bench.py --device rk3399 --rpc-key rk3399
python3 arm_cpu_imagenet_bench.py --device pixel2 --rpc-key pixel2
python3 arm_cpu_imagenet_bench.py --device p20pro --rpc-key p20pro
python3 arm_cpu_imagenet_bench.py --device mate10pro --rpc-key mate10pro
```

If your device has the same SoC as one of the devices above, you can reuse these parameters
(e.g. use `llvm -device=arm_cpu -model=rk3399 -target=aarch64-linux-gnu` as the target).
Otherwise, you need to tune for your own device; please follow this
[tutorial](https://docs.tvm.ai/tutorials/autotvm/tune_nnvm_arm.html).
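
As an editorial aside (not part of the original README), the registration above can also be exercised from Python, mirroring what `arm_cpu_imagenet_bench.py` does internally; the tracker address and device key below are examples:

```python
# Editorial sketch: request a registered device from the RPC tracker.
# The tracker address (10.77.1.123:9190) and device key ('rk3399') are examples.
import tvm

tracker = tvm.rpc.connect_tracker('10.77.1.123', 9190)
remote = tracker.request('rk3399')   # blocks until a matching device is free
ctx = remote.cpu(0)                  # remote CPU context for uploading/running modules
print(ctx)
```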
@@ -0,0 +1,96 @@
"""Benchmark script for performance on ARM CPU.
See README.md for the usage and results of this script.
"""

import argparse
import time

import numpy as np

import nnvm.testing
import nnvm.compiler
import tvm
from tvm import autotvm
from tvm.contrib.util import tempdir
import tvm.contrib.graph_runtime as runtime


def get_network(name, batch_size):
    """Get the symbol definition and random weight of a network"""
    input_shape = (batch_size, 3, 224, 224)
    output_shape = (batch_size, 1000)

    if name == 'resnet-18':
        net, params = nnvm.testing.resnet.get_workload(num_layers=18,
            batch_size=batch_size, image_shape=(3, 224, 224))
    elif name == 'mobilenet':
        net, params = nnvm.testing.mobilenet.get_workload(batch_size=batch_size)
    elif name == 'squeezenet v1.1':
        net, params = nnvm.testing.squeezenet.get_workload(batch_size=batch_size,
                                                           version='1.1')
    elif name == 'vgg-16':
        net, params = nnvm.testing.vgg.get_workload(batch_size=batch_size, num_layers=16)
    else:
        raise RuntimeError("Unsupported network: " + name)

    return net, params, input_shape, output_shape


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--network", type=str, choices=['resnet-18', 'mobilenet', 'squeezenet v1.1', 'vgg-16'])
    parser.add_argument("--device", type=str, required=True, choices=['rk3399', 'mate10', 'mate10pro', 'p20', 'p20pro',
                                                                      'pixel2', 'rasp3b', 'pynq'])
    parser.add_argument("--host", type=str, default='localhost')
    parser.add_argument("--port", type=int, default=9190)
    parser.add_argument("--rpc-key", type=str, required=True)
    parser.add_argument("--number", type=int, default=6)
    args = parser.parse_args()

    dtype = 'float32'

    if args.network is None:
        networks = ['squeezenet v1.1', 'mobilenet', 'resnet-18', 'vgg-16']
    else:
        networks = [args.network]

    target = tvm.target.arm_cpu(model=args.device)

    # connect to remote device
    tracker = tvm.rpc.connect_tracker(args.host, args.port)
    remote = tracker.request(args.rpc_key)

    print("--------------------------------------------------")
    print("%-20s %-20s" % ("Network Name", "Mean Inference Time (std dev)"))
    print("--------------------------------------------------")
    for network in networks:
        net, params, input_shape, output_shape = get_network(network, batch_size=1)

        with nnvm.compiler.build_config(opt_level=2, add_pass=['AlterOpLayout']):
            graph, lib, params = nnvm.compiler.build(
                net, target=target, shape={'data': input_shape}, params=params, dtype=dtype)

        tmp = tempdir()
        if 'android' in str(target):
            from tvm.contrib import ndk
            filename = "%s.so" % network
            lib.export_library(tmp.relpath(filename), ndk.create_shared)
        else:
            filename = "%s.tar" % network
            lib.export_library(tmp.relpath(filename))

        # upload library and params
        ctx = remote.context(str(target), 0)
        remote.upload(tmp.relpath(filename))
        rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}

        rlib = remote.load_module(filename)
        module = runtime.create(graph, rlib, ctx)
        data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
        module.set_input('data', data_tvm)
        module.set_input(**rparams)

        # evaluate
        ftimer = module.module.time_evaluator("run", ctx, number=args.number, repeat=3)
        prof_res = np.array(ftimer().results) * 1000  # convert to milliseconds
        print("%-20s %-19s (%s)" % (network, "%.2f ms" % np.mean(prof_res),
                                    "%.2f ms" % np.std(prof_res)))
@@ -1,76 +0,0 @@
""" Benchmark script for performance on Raspberry Pi. For example, run the file with:
`python rasp_imagenet_bench.py --model='mobilenet' --host='rasp0' --port=9090`. For
more details about how to set up the inference environment on Raspberry Pi, please
refer to the NNVM tutorial: Deploy the Pretrained Model on Raspberry Pi """
import time
import argparse
import numpy as np
import tvm
import nnvm.compiler
import nnvm.testing
from tvm.contrib import util, rpc
from tvm.contrib import graph_runtime as runtime


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, required=True, choices=['resnet', 'mobilenet'],
                        help="The model type.")
    parser.add_argument('--host', type=str, required=True, help="The host address of your Raspberry Pi.")
    parser.add_argument('--port', type=int, required=True, help="The port number of your Raspberry Pi.")
    parser.add_argument('--opt-level', type=int, default=1, help="Level of optimization.")
    parser.add_argument('--num-iter', type=int, default=50, help="Number of iterations during benchmark.")
    args = parser.parse_args()

    opt_level = args.opt_level

    num_iter = args.num_iter
    batch_size = 1
    num_classes = 1000
    image_shape = (3, 224, 224)

    data_shape = (batch_size,) + image_shape
    out_shape = (batch_size, num_classes)
    if args.model == 'resnet':
        net, params = nnvm.testing.resnet.get_workload(
            batch_size=1, image_shape=image_shape)
    elif args.model == 'mobilenet':
        net, params = nnvm.testing.mobilenet.get_workload(
            batch_size=1, image_shape=image_shape)
    else:
        raise ValueError('no benchmark prepared for {}.'.format(args.model))

    with nnvm.compiler.build_config(opt_level=opt_level):
        graph, lib, params = nnvm.compiler.build(
            net, tvm.target.rasp(), shape={"data": data_shape}, params=params)

    tmp = util.tempdir()
    lib_fname = tmp.relpath('net.o')
    lib.save(lib_fname)

    remote = rpc.connect(args.host, args.port)
    remote.upload(lib_fname)

    ctx = remote.cpu(0)
    rlib = remote.load_module('net.o')
    rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}

    module = runtime.create(graph, rlib, ctx)
    module.set_input('data', tvm.nd.array(np.random.uniform(size=(data_shape)).astype("float32")))
    module.set_input(**rparams)
    module.run()
    out = module.get_output(0, tvm.nd.empty(out_shape, ctx=ctx))
    out.asnumpy()

    print('benchmark args: {}'.format(args))
    ftimer = module.module.time_evaluator("run", ctx, num_iter)
    for i in range(3):
        prof_res = ftimer()
        print(prof_res)
        # sleep to avoid CPU overheating
        time.sleep(45)


if __name__ == '__main__':
    main()
@@ -44,6 +44,9 @@ tvm.autotvm.tuner
 .. automodule:: tvm.autotvm.tuner.callback
     :members:

+.. automodule:: tvm.autotvm.tuner.graph_tuning
+    :members:
+
 tvm.autotvm.task
 ~~~~~~~~~~~~~~~~
 .. automodule:: tvm.autotvm.task
@@ -55,6 +58,15 @@ tvm.autotvm.task
 .. automodule:: tvm.autotvm.task.space
     :members:

+.. automodule:: tvm.autotvm.task.dispatcher
+    :members:
+
+.. automodule:: tvm.autotvm.task.topi_integration
+    :members:
+
+.. automodule:: tvm.autotvm.task.nnvm_integration
+    :members:
+
 tvm.autotvm.record
 ~~~~~~~~~~~~~~~~~~
 .. automodule:: tvm.autotvm.record
@@ -60,6 +60,8 @@ The configuration of tvm can be modified by `config.cmake`.
 - Edit ``build/config.cmake`` to customize the compilation options

 - On macOS, for some versions of XCode, you need to add ``-lc++abi`` in the LDFLAGS or you'll get link errors.
+- Change ``set(USE_CUDA OFF)`` to ``set(USE_CUDA ON)`` to enable the CUDA backend. The same applies to other backends and libraries
+  (OpenCL, ROCM, METAL, VULKAN, ...).

 - TVM optionally depends on LLVM. LLVM is required for CPU codegen.
@@ -84,7 +86,7 @@ The configuration of tvm can be modified by `config.cmake`.
    cmake ..
    make -j4

-If everything goes well, we can go to :ref:`python-package-installation`_
+If everything goes well, we can go to :ref:`python-package-installation`

 Building on Windows
 ~~~~~~~~~~~~~~~~~~~
@@ -172,6 +172,77 @@ struct Conv2DParam : public dmlc::Parameter<Conv2DParam> {
   static const constexpr int kBias = 2;
 };

+struct WinogradWeightTransformParam : public dmlc::Parameter<WinogradWeightTransformParam> {
+  int tile_size;
+
+  DMLC_DECLARE_PARAMETER(WinogradWeightTransformParam) {
+    DMLC_DECLARE_FIELD(tile_size)
+      .describe("Tile size of winograd. E.g. 2 for F(2x2, 3x3) and 4 for F(4x4, 3x3)");
+  }
+
+  static const constexpr int kWeight = 0;
+};
+
+struct WinogradConv2DParam : public dmlc::Parameter<WinogradConv2DParam> {
+  int channels;
+  TShape kernel_size;
+  TShape strides;
+  TShape padding;
+  TShape dilation;
+  int groups;
+  std::string layout;
+  std::string kernel_layout;
+  std::string out_layout;
+  int out_dtype;
+  bool use_bias;
+  int tile_size;
+
+  DMLC_DECLARE_PARAMETER(WinogradConv2DParam) {
+    DMLC_DECLARE_FIELD(channels)
+      .describe("The dimensionality of the output space"
+                "i.e. the number of output channels in the convolution.");
+    DMLC_DECLARE_FIELD(kernel_size)
+      .describe("Specifies the dimensions of the convolution window.");
+    DMLC_DECLARE_FIELD(strides).set_default(TShape({1, 1}))
+      .describe("Specifies the strides of the convolution.");
+    DMLC_DECLARE_FIELD(padding).set_default(TShape({0, 0}))
+      .describe("If padding is non-zero, then the input is implicitly zero-padded"
+                "on both sides for padding number of points");
+    DMLC_DECLARE_FIELD(dilation).set_default(TShape({1, 1}))
+      .describe("Specifies the dilation rate to use for dilated convolution.");
+    DMLC_DECLARE_FIELD(groups).set_default(1)
+      .describe("Controls the connections between inputs and outputs."
+                "At groups=1, all inputs are convolved to all outputs."
+                "At groups=2, the operation becomes equivalent to having two convolution"
+                "layers side by side, each seeing half the input channels, and producing"
+                "half the output channels, and both subsequently concatenated.");
+    DMLC_DECLARE_FIELD(layout).set_default("NCHW")
+      .describe("Dimension ordering of input data. Can be 'NCHW', 'NHWC', etc."
+                "'N', 'C', 'H', 'W' stands for batch, channel, height, and width"
+                "dimensions respectively. Convolution is applied on the 'H' and"
+                "'W' dimensions.");
+    DMLC_DECLARE_FIELD(out_layout).set_default("__undef__")
+      .describe("Dimension ordering of output. Can be 'NCHW', 'NHWC', etc."
+                "'N', 'C', 'H', 'W' stands for batch, channel, height, and width"
+                "dimensions respectively. Default to be same as input layout.");
+    DMLC_DECLARE_FIELD(kernel_layout).set_default("OIHW")
+      .describe("Dimension ordering of weight. Can be 'OIHW', 'OIHW16o16i', etc."
+                "'O', 'I', 'H', 'W' stands for num_filter, input_channel, height, and width"
+                "dimensions respectively.");
+    DMLC_DECLARE_DTYPE_FIELD(out_dtype)
+      .add_enum("same", -1)
+      .set_default(-1)
+      .describe("Output data type, set to explicit type under mixed precision setting");
+    DMLC_DECLARE_FIELD(use_bias).set_default(true)
+      .describe("Whether the layer uses a bias vector.");
+    DMLC_DECLARE_FIELD(tile_size)
+      .describe("Tile size of winograd. E.g. 2 for F(2x2, 3x3) and 4 for F(4x4, 3x3)");
+  }
+  // constants
+  static const constexpr int kData = 0;
+  static const constexpr int kWeight = 1;
+  static const constexpr int kBias = 2;
+};
+
 struct Conv2DTransposeParam : public dmlc::Parameter<Conv2DTransposeParam> {
   int channels;
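
As a reading aid (not part of the commit), here is the shape relation that `tile_size` controls, in plain Python; it matches the `FInferShape` rule registered for `_contrib_conv2d_winograd_weight_transform` later in this diff:

```python
# Reading aid (not part of this commit): shape arithmetic for Winograd F(m x m, r x r).
def winograd_transformed_weight_shape(weight_shape, tile_size):
    """For F(m x m, r x r), the transform size is alpha = m + r - 1.

    Mirrors the FInferShape lambda for _contrib_conv2d_winograd_weight_transform
    that appears later in this diff.
    """
    out_c, in_c, kh, kw = weight_shape
    alpha_h = tile_size + kh - 1
    alpha_w = tile_size + kw - 1
    return (alpha_h, alpha_w, out_c, in_c)

# e.g. tile_size=2 with a 3x3 kernel (F(2x2, 3x3)) gives a 4x4 transform:
assert winograd_transformed_weight_shape((64, 64, 3, 3), 2) == (4, 4, 64, 64)
```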
@@ -6,6 +6,7 @@ import logging
 import tvm

 from tvm.contrib import graph_runtime
+from tvm import autotvm
 from . import graph_attr, graph_util
 from .. import graph as _graph
 from .. import symbol as sym
@ -238,67 +239,74 @@ def build(graph, target=None, shape=None, dtype="float32",
|
||||||
raise ValueError("Target is not set in env or passed as argument.")
|
raise ValueError("Target is not set in env or passed as argument.")
|
||||||
target = tvm.target.create(target)
|
target = tvm.target.create(target)
|
||||||
|
|
||||||
shape = shape if shape else {}
|
# if not inside an autotvm config dispatch context, load pre-tuned parameters from TopHub
|
||||||
if not isinstance(shape, dict):
|
if autotvm.task.DispatchContext.current is None:
|
||||||
raise TypeError("require shape to be dict")
|
tophub_context = autotvm.tophub.context(target)
|
||||||
for value in shape.values():
|
|
||||||
if not all(isinstance(x, int) for x in value):
|
|
||||||
raise TypeError("shape value must be int iterator")
|
|
||||||
|
|
||||||
cfg = BuildConfig.current
|
|
||||||
graph = graph if isinstance(graph, _graph.Graph) else _graph.create(graph)
|
|
||||||
shape, dtype = _update_shape_dtype(shape, dtype, params)
|
|
||||||
|
|
||||||
# correct layout if necessary
|
|
||||||
layout = layout if layout else {}
|
|
||||||
graph = graph_attr.set_layout_inputs(graph, layout)
|
|
||||||
graph = graph.apply("CorrectLayout")
|
|
||||||
index = graph.index
|
|
||||||
layouts = graph.json_attr("layout")
|
|
||||||
layout = {x : layouts[index.entry_id(x)] for x in index.input_names}
|
|
||||||
|
|
||||||
# Initial pass do shape type inference
|
|
||||||
ishape, _ = graph_util.infer_shape(graph, **shape)
|
|
||||||
shape.update(zip(graph.index.input_names, ishape))
|
|
||||||
if not isinstance(dtype, str):
|
|
||||||
idtype, _ = graph_util.infer_dtype(graph, **dtype)
|
|
||||||
dtype.update(zip(graph.index.input_names, idtype))
|
|
||||||
# Initialize all variables specified in _all_var_init
|
|
||||||
init_var = {}
|
|
||||||
if _all_var_init:
|
|
||||||
init_var = initialize_variables(shape, dtype)
|
|
||||||
# Apply optimization
|
|
||||||
with target:
|
|
||||||
graph = optimize(graph, shape, dtype, layout)
|
|
||||||
|
|
||||||
# Clear extra params without nodes.
|
|
||||||
_remove_noref_params(params, graph)
|
|
||||||
|
|
||||||
# Precompute prune
|
|
||||||
if params and cfg.pass_enabled("PrecomputePrune"):
|
|
||||||
graph, params = precompute_prune(graph, params)
|
|
||||||
shape, dtype = _update_shape_dtype(shape, dtype, params)
|
|
||||||
# Operator Fusion and generation
|
|
||||||
graph = graph_attr.set_shape_inputs(graph, shape)
|
|
||||||
graph = graph.apply("InferShape")
|
|
||||||
graph = graph_attr.set_dtype_inputs(graph, dtype)
|
|
||||||
graph._set_json_attr("target", str(target), "str")
|
|
||||||
if target_host is not None:
|
|
||||||
graph._set_json_attr("target_host", str(target_host), "str")
|
|
||||||
if cfg.pass_enabled("OpFusion"):
|
|
||||||
graph._set_json_attr("opt_level", 1, "int")
|
|
||||||
else:
|
else:
|
||||||
graph._set_json_attr("opt_level", 0, "int")
|
tophub_context = autotvm.util.EmptyContext()
|
||||||
graph = graph.apply("InferShape").apply("InferType")
|
|
||||||
with target:
|
with tophub_context:
|
||||||
graph = graph.apply("GraphFusePartition").apply("GraphFuseCompile")
|
shape = shape if shape else {}
|
||||||
libmod = graph_attr._move_out_module(graph, "module")
|
if not isinstance(shape, dict):
|
||||||
# Write variable initial values into params
|
raise TypeError("require shape to be dict")
|
||||||
if init_var:
|
for value in shape.values():
|
||||||
if params is None:
|
if not all(isinstance(x, int) for x in value):
|
||||||
params = {}
|
raise TypeError("shape value must be int iterator")
|
||||||
params.update(init_var)
|
|
||||||
return graph, libmod, params
|
cfg = BuildConfig.current
|
||||||
|
graph = graph if isinstance(graph, _graph.Graph) else _graph.create(graph)
|
||||||
|
shape, dtype = _update_shape_dtype(shape, dtype, params)
|
||||||
|
|
||||||
|
# correct layout if necessary
|
||||||
|
layout = layout if layout else {}
|
||||||
|
graph = graph_attr.set_layout_inputs(graph, layout)
|
||||||
|
graph = graph.apply("CorrectLayout")
|
||||||
|
index = graph.index
|
||||||
|
layouts = graph.json_attr("layout")
|
||||||
|
layout = {x: layouts[index.entry_id(x)] for x in index.input_names}
|
||||||
|
|
||||||
|
# Initial pass do shape type inference
|
||||||
|
ishape, _ = graph_util.infer_shape(graph, **shape)
|
||||||
|
shape.update(zip(graph.index.input_names, ishape))
|
||||||
|
if not isinstance(dtype, str):
|
||||||
|
idtype, _ = graph_util.infer_dtype(graph, **dtype)
|
||||||
|
dtype.update(zip(graph.index.input_names, idtype))
|
||||||
|
# Initialize all variables specified in _all_var_init
|
||||||
|
init_var = {}
|
||||||
|
if _all_var_init:
|
||||||
|
init_var = initialize_variables(shape, dtype)
|
||||||
|
# Apply optimization
|
||||||
|
with target:
|
||||||
|
graph = optimize(graph, shape, dtype, layout)
|
||||||
|
|
||||||
|
# Clear extra params without nodes.
|
||||||
|
_remove_noref_params(params, graph)
|
||||||
|
|
||||||
|
# Precompute prune
|
||||||
|
if params and cfg.pass_enabled("PrecomputePrune"):
|
||||||
|
graph, params = precompute_prune(graph, params)
|
||||||
|
shape, dtype = _update_shape_dtype(shape, dtype, params)
|
||||||
|
# Operator Fusion and generation
|
||||||
|
graph = graph_attr.set_shape_inputs(graph, shape)
|
||||||
|
graph = graph.apply("InferShape")
|
||||||
|
graph = graph_attr.set_dtype_inputs(graph, dtype)
|
||||||
|
graph._set_json_attr("target", str(target), "str")
|
||||||
|
if target_host is not None:
|
||||||
|
graph._set_json_attr("target_host", str(target_host), "str")
|
||||||
|
if cfg.pass_enabled("OpFusion"):
|
||||||
|
graph._set_json_attr("opt_level", 1, "int")
|
||||||
|
else:
|
||||||
|
graph._set_json_attr("opt_level", 0, "int")
|
||||||
|
graph = graph.apply("InferShape").apply("InferType")
|
||||||
|
with target:
|
||||||
|
graph = graph.apply("GraphFusePartition").apply("GraphFuseCompile")
|
||||||
|
libmod = graph_attr._move_out_module(graph, "module")
|
||||||
|
# Write variable initial values into params
|
||||||
|
if init_var:
|
||||||
|
if params is None:
|
||||||
|
params = {}
|
||||||
|
params.update(init_var)
|
||||||
|
return graph, libmod, params
|
||||||
|
|
||||||
def _remove_noref_params(params, graph):
|
def _remove_noref_params(params, graph):
|
||||||
""" Helper to clear non referenced params
|
""" Helper to clear non referenced params
|
||||||
|
|
|
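
For context, an editorial sketch (not part of the diff) of what the TopHub wiring above means for users: with no dispatch context active, `build()` consults TopHub's pre-tuned parameters; inside `apply_history_best`, the user's own log wins because `autotvm.task.DispatchContext.current` is set. The log file name below is hypothetical:

```python
# Editorial sketch (not part of the diff): user-supplied tuning logs take
# precedence over TopHub because they set DispatchContext.current.
import nnvm.compiler
import nnvm.testing
import tvm
from tvm import autotvm

net, params = nnvm.testing.mobilenet.get_workload(batch_size=1)
target = tvm.target.create('llvm -device=arm_cpu -model=rk3399')  # example target
input_shape = (1, 3, 224, 224)

# 'my_tuning.log' is a hypothetical log produced by an autotvm tuner.
with autotvm.apply_history_best('my_tuning.log'):
    # DispatchContext.current is now set, so TopHub is not consulted.
    graph, lib, params = nnvm.compiler.build(
        net, target=target, shape={'data': input_shape}, params=params)
```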
@@ -89,7 +89,7 @@ def compute_conv2d(attrs, inputs, _):
     layout = attrs["layout"]
     kernel_layout = attrs["kernel_layout"]
     out_dtype = attrs["out_dtype"]
-    out_dtype = None if out_dtype == "same" else out_dtype
+    out_dtype = inputs[0].dtype if out_dtype == "same" else out_dtype
     assert layout == "NCHW" or layout == "NHWC"
     (dilation_h, dilation_w) = dilation
     if dilation_h < 1 or dilation_w < 1:
@@ -196,6 +196,53 @@ def schedule_contrib_conv2d_NCHWc(attrs, outs, target):

 reg.register_pattern("_contrib_conv2d_NCHWc", OpPattern.OUT_ELEMWISE_FUSABLE)

+
+@reg.register_compute("_contrib_conv2d_winograd_weight_transform")
+def compute_contrib_conv2d_winograd_weight_transform(attrs, inputs, _):
+    return topi.nn.conv2d_winograd_weight_transform(inputs[0], attrs.get_int('tile_size'))
+
+@reg.register_schedule("_contrib_conv2d_winograd_weight_transform")
+def schedule_contrib_conv2d_winograd_weight_transform(attrs, outs, target):
+    with tvm.target.create(target):
+        return topi.generic.schedule_conv2d_winograd_weight_transform(outs)
+
+reg.register_pattern("_contrib_conv2d_winograd_weight_transform", OpPattern.OUT_ELEMWISE_FUSABLE)
+
+
+@reg.register_compute("_contrib_conv2d_winograd_without_weight_transform")
+def compute_contrib_conv2d_winograd_without_weight_transform(attrs, inputs, _):
+    """Compute definition of conv2d winograd without weight transform"""
+    padding = attrs.get_int_tuple("padding")
+    strides = attrs.get_int_tuple("strides")
+    dilation = attrs.get_int_tuple("dilation")
+    groups = attrs.get_int("groups")
+    layout = attrs.get_string("layout")
+    out_dtype = attrs.get_string("out_dtype")
+    tile_size = attrs.get_int("tile_size")
+    out_dtype = inputs[0].dtype if out_dtype == "same" else out_dtype
+    assert dilation == (1, 1), "Does not support dilation for now"
+    assert groups == 1, "Does not support arbitrary group number"
+
+    # pylint: disable=assignment-from-no-return
+    out = topi.nn.conv2d_winograd_without_weight_transform(
+        inputs[0], inputs[1], strides, padding, layout, out_dtype,
+        tile_size)
+
+    if attrs.get_bool("use_bias"):
+        bias = inputs[2]
+        bias = topi.expand_dims(bias, axis=1, num_newaxis=2)
+        out = topi.add(out, bias)
+    return out
+
+@reg.register_schedule("_contrib_conv2d_winograd_without_weight_transform")
+def schedule_contrib_conv2d_winograd_without_weight_transform(attrs, outs, target):
+    with tvm.target.create(target):
+        return topi.generic.schedule_conv2d_winograd_without_weight_transform(outs)
+
+reg.register_pattern("_contrib_conv2d_winograd_without_weight_transform",
+                     OpPattern.OUT_ELEMWISE_FUSABLE)
+
+
 # conv2d_transpose
 @reg.register_compute("conv2d_transpose")
 def compute_conv2d_transpose(attrs, inputs, _):
@ -130,11 +130,110 @@ inline bool Conv2DInferShape(const nnvm::NodeAttrs& attrs,
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline bool WinogradConv2DInferShape(const nnvm::NodeAttrs& attrs,
|
||||||
|
std::vector<TShape>* in_shape,
|
||||||
|
std::vector<TShape>* out_shape) {
|
||||||
|
static const Layout kNCHW("NCHW");
|
||||||
|
static const Layout kOIHW("OIHW");
|
||||||
|
|
||||||
|
const WinogradConv2DParam& param = nnvm::get<WinogradConv2DParam>(attrs.parsed);
|
||||||
|
|
||||||
|
const Layout in_layout(param.layout);
|
||||||
|
const Layout kernel_layout(param.kernel_layout);
|
||||||
|
CHECK(in_layout.convertible(kNCHW))
|
||||||
|
<< "Conv only support input layouts that are convertible from NCHW."
|
||||||
|
<< " But got " << in_layout;
|
||||||
|
CHECK(kernel_layout.convertible(kOIHW))
|
||||||
|
<< "Conv only support kernel layouts that are convertible from OIHW."
|
||||||
|
<< " But got "<< kernel_layout;
|
||||||
|
|
||||||
|
Layout out_layout(param.out_layout);
|
||||||
|
if (!out_layout.defined()) out_layout = in_layout;
|
||||||
|
CHECK(out_layout.convertible(kNCHW))
|
||||||
|
<< "Conv only support output layouts that are convertible from NCHW."
|
||||||
|
<< " But got " << out_layout;
|
||||||
|
|
||||||
|
if (param.use_bias) {
|
||||||
|
CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]";
|
||||||
|
} else {
|
||||||
|
CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]";
|
||||||
|
}
|
||||||
|
CHECK_EQ(out_shape->size(), 1U);
|
||||||
|
|
||||||
|
TShape dshape = in_shape->at(0);
|
||||||
|
if (dshape.ndim() == 0) return false;
|
||||||
|
dshape = ConvertLayout(dshape, in_layout, kNCHW);
|
||||||
|
|
||||||
|
CHECK_EQ(dshape.ndim(), 4U) << "Input data should be 4D";
|
||||||
|
CHECK_EQ(param.kernel_size.ndim(), 2U);
|
||||||
|
CHECK_EQ(param.strides.ndim(), 2U)
|
||||||
|
<< "incorrect stride size: " << param.strides;
|
||||||
|
CHECK_EQ(param.dilation.ndim(), 2U)
|
||||||
|
<< "incorrect dilate size: " << param.dilation;
|
||||||
|
CHECK_EQ(dshape[1] % param.groups, 0U)
|
||||||
|
<< "input channels must divide group size";
|
||||||
|
CHECK_EQ(param.channels % param.groups, 0U)
|
||||||
|
<< "output channels must divide group size";
|
||||||
|
|
||||||
|
// NOTE: Do not check weight shape here!
|
||||||
|
// Different backend requires different layout to compute
|
||||||
|
// the batch gemm stage in winograd efficiently, but we want to
|
||||||
|
// make this NNVM symbol work for all backends.
|
||||||
|
// So we accept all weight shapes, and assume the TOPI developers
|
||||||
|
// can handle this correctly in alter_op_layout.
|
||||||
|
|
||||||
|
if (param.use_bias) {
|
||||||
|
static const Layout default_bias_layout("C");
|
||||||
|
TShape bias_shape({param.channels});
|
||||||
|
auto oc_block = out_layout.subsizeof('C');
|
||||||
|
if (oc_block > 0) {
|
||||||
|
size_t split_axis = (out_layout.indexof('C') < out_layout.indexof('c')) ? 1 : 0;
|
||||||
|
bias_shape = ConvertLayout(bias_shape, default_bias_layout,
|
||||||
|
default_bias_layout.split('C', split_axis, oc_block));
|
||||||
|
}
|
||||||
|
NNVM_ASSIGN_INPUT_SHAPE(attrs, *in_shape, WinogradConv2DParam::kBias, bias_shape);
|
||||||
|
}
|
||||||
|
// dilation
|
||||||
|
dim_t dilated_ksize_y = 1 + (param.kernel_size[0] - 1) * param.dilation[0];
|
||||||
|
dim_t dilated_ksize_x = 1 + (param.kernel_size[1] - 1) * param.dilation[1];
|
||||||
|
TShape oshape({dshape[0], param.channels, 0, 0});
|
||||||
|
if (dshape[2] != 0) {
|
||||||
|
oshape[2] = (dshape[2] + param.padding[0] * 2 - dilated_ksize_y) / param.strides[0] + 1;
|
||||||
|
}
|
||||||
|
if (dshape[3] != 0) {
|
||||||
|
oshape[3] = (dshape[3] + param.padding[1] * 2 - dilated_ksize_x) / param.strides[1] + 1;
|
||||||
|
}
|
||||||
|
NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_shape, 0, ConvertLayout(oshape, kNCHW, out_layout));
|
||||||
|
// Perform incomplete shape inference. Fill in the missing values in data shape.
|
||||||
|
// 1) We can always fill in the batch_size.
|
||||||
|
// 2) We can back-calculate the input height/width if the corresponding stride is 1.
|
||||||
|
oshape = ConvertLayout((*out_shape)[0], out_layout, kNCHW);
|
||||||
|
dshape[0] = oshape[0];
|
||||||
|
if (oshape[2] && param.strides[0] == 1) {
|
||||||
|
dshape[2] = oshape[2] + dilated_ksize_y - 1 - 2 * param.padding[0];
|
||||||
|
}
|
||||||
|
if (oshape[3] && param.strides[1] == 1) {
|
||||||
|
dshape[3] = oshape[3] + dilated_ksize_x - 1 - 2 * param.padding[1];
|
||||||
|
}
|
||||||
|
NNVM_ASSIGN_INPUT_SHAPE(attrs, *in_shape, WinogradConv2DParam::kData,
|
||||||
|
ConvertLayout(dshape, kNCHW, in_layout));
|
||||||
|
// Check whether the kernel sizes are valid
|
||||||
|
if (dshape[2] != 0) {
|
||||||
|
CHECK_LE(dilated_ksize_y, dshape[2] + 2 * param.padding[0])
|
||||||
|
<< "kernel size exceed input";
|
||||||
|
}
|
||||||
|
if (dshape[3] != 0) {
|
||||||
|
CHECK_LE(dilated_ksize_x, dshape[3] + 2 * param.padding[1])
|
||||||
|
<< "kernel size exceed input";
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename PARAM>
|
||||||
inline bool Conv2DInferType(const nnvm::NodeAttrs& attrs,
|
inline bool Conv2DInferType(const nnvm::NodeAttrs& attrs,
|
||||||
std::vector<int>* in_type,
|
std::vector<int>* in_type,
|
||||||
std::vector<int>* out_type) {
|
std::vector<int>* out_type) {
|
||||||
const Conv2DParam& param = nnvm::get<Conv2DParam>(attrs.parsed);
|
const PARAM& param = nnvm::get<PARAM>(attrs.parsed);
|
||||||
if (param.use_bias) {
|
if (param.use_bias) {
|
||||||
CHECK_EQ(in_type->size(), 3U) << "Input:[data, weight, bias]";
|
CHECK_EQ(in_type->size(), 3U) << "Input:[data, weight, bias]";
|
||||||
} else {
|
} else {
|
||||||
|
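
As a reading aid (not part of the diff), here is the spatial-size rule implemented by `WinogradConv2DInferShape` above, in plain Python:

```python
# Reading aid (not part of the diff): conv2d output-size rule used above.
def conv2d_out_dim(in_dim, kernel, pad, stride, dilation=1):
    """oshape = (dshape + 2*pad - dilated_kernel) // stride + 1"""
    dilated_kernel = 1 + (kernel - 1) * dilation
    return (in_dim + 2 * pad - dilated_kernel) // stride + 1

# e.g. a 224x224 input with a 3x3 kernel, padding 1, stride 1 keeps its size:
assert conv2d_out_dim(224, 3, 1, 1) == 224
```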
@ -154,11 +253,12 @@ inline bool Conv2DInferType(const nnvm::NodeAttrs& attrs,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
template<typename PARAM>
|
||||||
inline bool Conv2DCorrectLayout(const NodeAttrs& attrs,
|
inline bool Conv2DCorrectLayout(const NodeAttrs& attrs,
|
||||||
std::vector<Layout> *ilayouts,
|
std::vector<Layout> *ilayouts,
|
||||||
const std::vector<Layout> *last_ilayouts,
|
const std::vector<Layout> *last_ilayouts,
|
||||||
std::vector<Layout> *olayouts) {
|
std::vector<Layout> *olayouts) {
|
||||||
const Conv2DParam& param = nnvm::get<Conv2DParam>(attrs.parsed);
|
const PARAM& param = nnvm::get<PARAM>(attrs.parsed);
|
||||||
|
|
||||||
const Layout in_layout(param.layout);
|
const Layout in_layout(param.layout);
|
||||||
Layout out_layout(param.out_layout);
|
Layout out_layout(param.out_layout);
|
||||||
|
@@ -213,8 +313,8 @@ a bias vector is created and added to the outputs.
 .set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<Conv2DParam>)
 .set_attr<FListInputNames>("FListInputNames", UseBiasListInputNames<Conv2DParam>)
 .set_attr<FInferShape>("FInferShape", Conv2DInferShape)
-.set_attr<FInferType>("FInferType", Conv2DInferType)
+.set_attr<FInferType>("FInferType", Conv2DInferType<Conv2DParam>)
-.set_attr<FCorrectLayout>("FCorrectLayout", Conv2DCorrectLayout)
+.set_attr<FCorrectLayout>("FCorrectLayout", Conv2DCorrectLayout<Conv2DParam>)
 .set_num_outputs(1)
 .set_num_inputs(UseBiasNumInputs<Conv2DParam>)
 .set_support_level(2)
@@ -238,12 +338,81 @@ NNVM_REGISTER_OP(_contrib_conv2d_NCHWc)
 .set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<Conv2DParam>)
 .set_attr<FListInputNames>("FListInputNames", UseBiasListInputNames<Conv2DParam>)
 .set_attr<FInferShape>("FInferShape", Conv2DInferShape)
-.set_attr<FInferType>("FInferType", Conv2DInferType)
+.set_attr<FInferType>("FInferType", Conv2DInferType<Conv2DParam>)
-.set_attr<FCorrectLayout>("FCorrectLayout", Conv2DCorrectLayout)
+.set_attr<FCorrectLayout>("FCorrectLayout", Conv2DCorrectLayout<Conv2DParam>)
 .set_num_outputs(1)
 .set_num_inputs(UseBiasNumInputs<Conv2DParam>)
 .set_support_level(2);

+NNVM_REGISTER_OP(_contrib_conv2d_winograd_weight_transform)
+.describe(R"code(Weight transformation of winograd fast convolution algorithm.
+Separate this into another nnvm symbol in order to enable Precompute Pass to compute the
+weight transformation in advance.
+
+- **weight**: (channels, in_channels, kernel_size[0], kernel_size[1])
+)code" NNVM_ADD_FILELINE)
+.add_argument("weight", "4D Tensor", "Weight tensor.")
+.add_arguments(WinogradWeightTransformParam::__FIELDS__())
+.set_attr_parser(ParamParser<WinogradWeightTransformParam>)
+.set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<WinogradWeightTransformParam>)
+.set_attr<FInferShape>("FInferShape", [](const nnvm::NodeAttrs& attrs,
+                                         std::vector<TShape> *in_shape,
+                                         std::vector<TShape> *out_shape) {
+  const auto& param = nnvm::get<WinogradWeightTransformParam>(attrs.parsed);
+  const TShape &wshape = (*in_shape)[0];
+
+  CHECK_EQ(wshape.ndim(), 4) << "Weight should be a 4 dimensional tensor";
+
+  TShape oshape({param.tile_size + wshape[2] - 1,
+                 param.tile_size + wshape[3] - 1,
+                 wshape[0],
+                 wshape[1]});
+  NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_shape, 0, oshape);
+  return true;
+})
+.set_attr<FCorrectLayout>("FCorrectLayout", [](const NodeAttrs& attrs,
+                                               std::vector<Layout> *ilayouts,
+                                               const std::vector<Layout> *last_ilayouts,
+                                               std::vector<Layout> *olayouts) {
+  Layout layout("OIHW");
+  NNVM_ASSIGN_LAYOUT(*ilayouts, 0, layout);
+  NNVM_ASSIGN_LAYOUT(*olayouts, 0, layout);
+  return true;
+})
+.set_attr<FInferType>("FInferType", ElemwiseType<1, 1>)
+.set_num_outputs(1)
+.set_num_inputs(1)
+.set_support_level(5);
+
+DMLC_REGISTER_PARAMETER(WinogradWeightTransformParam);
+
+NNVM_REGISTER_OP(_contrib_conv2d_winograd_without_weight_transform)
+.describe(R"code(Compute conv2d with winograd algorithm.
+
+- **data**: Input is 4D array of shape (batch_size, in_channels, height, width)
+- **weight**: Any shape
+            We do not check shape for this input tensor.
+
+- **bias**: (channels,)
+- **out**: Output is 4D array of shape (batch_size, channels, out_height, out_width)
+)code" NNVM_ADD_FILELINE)
+.add_argument("data", "4D Tensor", "Input data.")
+.add_argument("weight", "Tensor", "Transformed weight tensor.")
+.add_argument("bias", "1D Tensor", "Bias parameter.")
+.add_arguments(WinogradConv2DParam::__FIELDS__())
+.set_attr_parser(ParamParser<WinogradConv2DParam>)
+.set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<WinogradConv2DParam>)
+.set_attr<FListInputNames>("FListInputNames", UseBiasListInputNames<WinogradConv2DParam>)
+.set_attr<FInferShape>("FInferShape", WinogradConv2DInferShape)
+.set_attr<FInferType>("FInferType", Conv2DInferType<WinogradConv2DParam>)
+.set_attr<FCorrectLayout>("FCorrectLayout", Conv2DCorrectLayout<WinogradConv2DParam>)
+.set_num_outputs(1)
+.set_num_inputs(UseBiasNumInputs<WinogradConv2DParam>)
+.set_support_level(5);
+
+DMLC_REGISTER_PARAMETER(WinogradConv2DParam);
+
 NNVM_REGISTER_OP(_conv2d_grad)
 .describe(R"code(2D convolution grad.
@@ -18,9 +18,12 @@ from . import record
 from . import task
 from . import tuner
 from . import util
+from . import env
+from . import tophub

 # some shortcuts
-from .measure import measure_option, MeasureInput, MeasureResult, MeasureErrorNo
+from .measure import measure_option, MeasureInput, MeasureResult, MeasureErrorNo, use_rpc
 from .tuner import callback
-from .task import template, get_config, create, ConfigSpace, ConfigEntity
-from .record import ApplyHistoryBest as apply_history_best
+from .task import template, get_config, create, ConfigSpace, ConfigEntity, \
+    ApplyHistoryBest as apply_history_best
+from .env import GLOBAL_SCOPE
@@ -8,5 +8,6 @@ class AutotvmGlobalScope(object):
         AutotvmGlobalScope.current = self

         self.cuda_target_arch = None
+        self.in_tuning = False

 GLOBAL_SCOPE = AutotvmGlobalScope()
@@ -1,8 +1,7 @@
 """Distributed executor infrastructure to scale up the tuning"""

-from .measure import MeasureInput, MeasureResult, MeasureErrorNo
-from .measure import create_measure_batch, measure_option
-from .measure_methods import request_remote
+from .measure import MeasureInput, MeasureResult, MeasureErrorNo, measure_option
+from .measure_methods import request_remote, create_measure_batch, use_rpc

 from .local_executor import LocalExecutor
 from .executor import Future, Executor
@@ -8,7 +8,10 @@ try:
 except ImportError:
     from Queue import Empty

-import psutil
+try:
+    import psutil
+except ImportError:
+    psutil = None

 from . import executor
@@ -106,22 +109,28 @@ class LocalFutureNoFork(executor.Future):


 class LocalExecutor(executor.Executor):
-    """Local executor that runs workers on the same machine with multiprocessing."""
-    def __init__(self, timeout=None):
+    """Local executor that runs workers on the same machine with multiprocessing.
+
+    Parameters
+    ----------
+    timeout: float, optional
+        Timeout of a job. If a job times out, a TimeoutError will be returned (not raised).
+    do_fork: bool, optional
+        Whether to fork a new process for each job. Some runtime systems do not support
+        fork after initialization (e.g. cuda runtime, cudnn); set this to False if you
+        have used such a runtime before submitting jobs.
+    """
+    def __init__(self, timeout=None, do_fork=True):
         self.timeout = timeout or executor.Executor.DEFAULT_TIMEOUT
+        self.do_fork = do_fork
+
+        if self.do_fork:
+            if not psutil:
+                raise RuntimeError("Python package psutil is missing. "
+                                   "please try `pip install psutil`")

     def submit(self, func, *args, **kwargs):
-        """
-        Note
-        ----------
-        By default, the executor will fork a new process for a new job
-        But some runtime does not support fork (e.g. cuda runtime, cudnn).
-        In this circumstance, you should set 'fork_new_process' to False in kwargs
-        """
-        fork_new_process = kwargs.pop('fork_new_process', True)
-
-        if not fork_new_process:
+        if not self.do_fork:
             return LocalFutureNoFork(func(*args, **kwargs))

         queue = Queue(1)
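
An editorial usage sketch (not part of the diff), based only on the constructor and `submit()` shown above; the worker function is a made-up example:

```python
# Editorial sketch (not part of the diff): minimal LocalExecutor usage.
from tvm.autotvm.measure import LocalExecutor

def square(x):
    return x * x

ex = LocalExecutor(timeout=10)       # forked worker per job (requires psutil)
fut = ex.submit(square, 7)           # runs square(7) in a child process
print(fut.get())                     # 49, or a TimeoutError instance on timeout

ex_debug = LocalExecutor(do_fork=False)  # run inline, useful for debugging
print(ex_debug.submit(square, 8).get())  # 64
```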
@@ -1,18 +1,7 @@
 # pylint: disable=pointless-string-statement,consider-using-enumerate,invalid-name
 """User facing API for specifying how to measure the generated code"""
-import time
 from collections import namedtuple
-
-import numpy as np
-
-from ... import build, nd, target as _target
-from ...rpc.tracker import Tracker
-from ...rpc.server import Server
-
-from ..util import get_const_tuple
-from .local_executor import LocalExecutor

 class MeasureInput(namedtuple("MeasureInput", ["target", "task", "config"])):
     """
     Stores all the necessary inputs for a measurement.
@ -44,6 +33,7 @@ class MeasureResult(namedtuple("MeasureResult", ["costs", "error_no", "all_cost"
|
||||||
The absolute time stamp when we finish measurement.
|
The absolute time stamp when we finish measurement.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
class MeasureErrorNo(object):
|
class MeasureErrorNo(object):
|
||||||
"""Error type for MeasureResult"""
|
"""Error type for MeasureResult"""
|
||||||
NO_ERROR = 0 # no error
|
NO_ERROR = 0 # no error
|
||||||
|
@@ -55,38 +45,25 @@ class MeasureErrorNo(object):
     FLEET_ERROR = 6  # error of measure infrastructure


-def measure_option(mode,
+def measure_option(measure_func,
                    number=1,
                    repeat=1,
                    timeout=60,
                    parallel_num=1,
-                   pack_size=1,
+                   do_fork=True,
+                   build_func='default',
                    check_correctness=False,
-                   build_option=None,
-                   replay_db=None,
-                   save_to_replay_db=True,
-                   rpc_device_key=None,
-                   rpc_priority=1,
-                   rpc_timeout=60,
-                   rpc_tracker_addr=None,
-                   use_ndk=False,
-                   custom_measure_batch=None):
+                   replay_db=None):
     """Configure how to do measurement

     Parameters
     ----------
-    mode: str
-        'local': use the local device for measurement. In this mode,
-        the tuner starts a tracker and a RPC server silently for the user.
-
-        'rpc': request devices for measurement from rpc tracker. In this mode,
-        you should start a rpc tracker in a separate processing.
-
-        'custom': use custom measure function
-
-        'local-nofork': use local device for measure but does not use multiprocessing.
-        This mode is suitable for debug, but does not support timeout and parallel.
+    measure_func: str or callable
+        'local': use the local device for measurement. The tuner will start a tracker
+        and an RPC server silently for the user.
+
+        callable: a callable function for measurement.
+        See the return value of measure/measure_methods.py::use_rpc for an example.
     number : int, optional
         Number of times to do the measurement for average
     repeat : int, optional
@@ -101,235 +78,50 @@ def measure_option(mode,
         The number of measurement task that can run in parallel.
         Set this according to the number of cpu cores (for compilation) and
         the number of devices you have (for measuring generate code).
-    pack_size : int, optional
-        Number of configs to measure in one RPC call.
-        Usually this can be set to 1. If your device has high cost to establish a rpc connection,
-        set this higher.
+    do_fork: bool, optional
+        Whether to use multiprocessing (based on fork) for running measure jobs in parallel.
+        Set this to False if you want to debug (see the traceback) or if using fork is not suitable.
+        NOTE: If this is False, parallel and timeout do not work.
+    build_func: str or callable, optional
+        'default': call the default builder. This works for normal targets (llvm, cuda).
+
+        'ndk': use Android NDK to create a shared library. Use this for android targets.
+
+        callable: customized build function for other backends (e.g. VTA).
+        See measure/measure_methods.py::default_build_func for an example.
     check_correctness: bool
-        Whether check correctness after measurement.
+        Whether to check correctness after measurement. This will use llvm cpu as reference.
-    build_option: Dict, optional
-        Build options for tvm.build_config
     replay_db : Database, optional
-        The database that we retrieve saved MeasureResults from
-    save_to_replay_db: bool, optional
-        Whether save measure result to database. This is useless when replay_db is None
-
-    rpc_priority: int, optional
-        Priority of this task, used by scheduler in tracker
-    rpc_device_key: str, optional
-        The device key of registered devices in tracker
-    rpc_timeout: int, optional
-        Timeout of rpc session
-    rpc_tracker_addr: Tuple(str, int), optional
-        The address of rpc tracker in Tuple(host, port) format.
-        If is set, will use this address.
-        If is not set, will use environment variable "TVM_TRACKER_HOST" and "TVM_TRACKER_PORT"
-
-    use_ndk: bool, option
-        Whether export requires ndk
-    custom_measure_batch: callable, optional
-        custom measure function
+        The database that we retrieve saved MeasureResult from.

     Returns
     -------
     options: dict
         A dict to store all options
+
+    Note
+    ----
+    To support customized measure, you can pass a callable `measure_func` or
+    `build_func` in. The `measure_func` will call `build_func` to build a binary library
+    and handle the logic of measurement.
+
+    Signature:
+    * measure_func (see the return value of measure/measure_methods.py::use_rpc for an example)
+    def measure_func(input_pack, build_func, build_kwargs, number, repeat, ref_input, ref_output):
+        return measure_results
+
+    * build_func (see measure/measure_methods.py::default_build_func for an example)
+    def build_func(inp, tmp_dir, **kwargs):
+        return func, args, filename
     """
     return {
-        'mode': mode,
+        'measure_func': measure_func,
         'number': number,
         'repeat': repeat,
         'timeout': timeout,
         'parallel_num': parallel_num,
-        'pack_size': pack_size,
+        'do_fork': do_fork,
+        'build_func': build_func,
         'check_correctness': check_correctness,
-        'build_option': build_option,
         'replay_db': replay_db,
-        'save_to_replay_db': save_to_replay_db,
-
-        'rpc_device_key': rpc_device_key,
-        'rpc_priority': rpc_priority,
-        'rpc_timeout': rpc_timeout,
-        'rpc_tracker_addr': rpc_tracker_addr,
-
-        'use_ndk': use_ndk,
-        'custom_measure_batch': custom_measure_batch
     }
-
-
-def create_measure_batch(task, options):
-    """Get a standard measure_batch function.
-
-    Parameters
-    ----------
-    task: tvm.autotvm.task.Task
-        The tuning task
-    options: dict
-        The option for measuring generated code.
-        You should use the return value of :any:`autotvm.measure_option` for this argument
-
-    Returns
-    -------
-    measure_batch: callable
-        a callback function to measure a batch of configs
-    """
-    from . import measure_methods
-    from ..database import filter_inputs
-
-    mode = options['mode']
-    number, repeat = options['number'], options['repeat']
-    timeout, parallel_num = options['timeout'], options['parallel_num']
-    pack_size = options['pack_size']
-    check_correctness = options['check_correctness']
-    build_option = options['build_option']
-    replay_db = options['replay_db']
-    save_to_replay_db = options['save_to_replay_db']
-    rpc_device_key = options['rpc_device_key']
-    rpc_priority, rpc_timeout = options['rpc_priority'], options['rpc_timeout']
-    use_ndk = options['use_ndk']
-    custom_measure_batch = options['custom_measure_batch']
-
-    kwargs = {}
-    executor = LocalExecutor(timeout=timeout)
-
-    if mode == 'local':
-        # start temporary rpc tracker and rpc server for the user
-        tracker = Tracker('localhost', port=9000, port_end=10000,
-                          silent=True)
-        rpc_device_key = '$local$device$%d' % tracker.port
-        server = Server('localhost', port=9000, port_end=10000,
-                        key=rpc_device_key,
-                        use_popen=True, silent=True,
-                        tracker_addr=(tracker.host, tracker.port))
-
-        fmeasure = measure_methods.measure_rpc
-        kwargs['rpc_device_key'] = rpc_device_key
-        kwargs['rpc_tracker_addr'] = (tracker.host, tracker.port)
-        kwargs['rpc_timeout'] = timeout
-    elif mode == 'rpc':
-        fmeasure = measure_methods.measure_rpc
-        kwargs['rpc_device_key'] = rpc_device_key
-        kwargs['rpc_priority'] = rpc_priority
-        kwargs['rpc_timeout'] = rpc_timeout
-        kwargs['use_ndk'] = use_ndk
-        assert rpc_device_key, "In rpc mode, a rpc_device_key must be provided"
-    elif mode == "custom":
-        assert callable(custom_measure_batch), "In custom mode, custom_measure_func " \
-                "must be a callable object"
-    elif mode == 'local-nofork':
-        fmeasure = measure_methods.measure_local
-        kwargs['fork_new_process'] = False
-    else:
-        raise RuntimeError("Invalid mode: " + mode)
-
-    if 'cuda' in task.target.keys and 'rpc_device_key' in kwargs:  # query cuda device info
-        add_cuda_device_info(kwargs['rpc_device_key'], kwargs.get('rpc_tracker_addr'), kwargs)
-    if 'opencl' in task.target.keys and 'rpc_device_key' in kwargs:
-        add_opencl_device_info(kwargs['rpc_device_key'], kwargs.get('rpc_tracker_addr'), kwargs)
-
-    if check_correctness:
-        # use llvm to generate a reference input/output
-        # this option works for tuning topi, but might not work for you custom op
-        with _target.create("llvm"):
-            s, arg_bufs = task.instantiate(task.config_space.get(0))
-        ref_input = [np.random.uniform(size=get_const_tuple(x.shape)).astype(x.dtype)
-                     for x in arg_bufs]
-        func = build(s, arg_bufs, "llvm")
-        tvm_buf = [nd.array(x) for x in ref_input]
-        func(*tvm_buf)
-        ref_output = [x.asnumpy() for x in tvm_buf]
-        kwargs['ref_input'], kwargs['ref_output'] = ref_input, ref_output
-
-    def measure_batch(measure_inputs):
-        """measure the time cost for a batch of configs in real machines"""
-        if replay_db is not None:
-            partial_results, measure_inputs =\
-                filter_inputs(replay_db, measure_inputs, retry=False)
-
-        # pack configs
-        input_packs = []
-        for i in range(0, len(measure_inputs), pack_size):
-            input_packs.append(measure_inputs[i:i + pack_size])
-
-        # send to measure
-        futures = []
-        for input_pack in input_packs:
-            future = executor.submit(
-                fmeasure, input_pack,
-                number=number,
-                repeat=repeat,
-                build_option=build_option,
-                **kwargs
-            )
-            futures.append(future)
-
-        # transform results
-        results = []
-        for future in futures:
-            result = future.get()
-            if isinstance(result, Exception):
-                if mode == 'local-nofork':
-                    # debug usage, raise exception
-                    raise result
-                tstamp = time.time()
-                results.extend([MeasureResult((result,), MeasureErrorNo.FLEET_ERROR,
-                                              timeout, tstamp)] * pack_size)
-            else:
-                results.extend(result)
-
-        if replay_db is not None:
-            if save_to_replay_db:  # save result to database
-                for measure_input, result in zip(measure_inputs, results):
-                    replay_db.save(measure_input, result)
-
-            result_idx = 0
-            for i in range(len(partial_results)):
-                if partial_results[i] is None:
-                    partial_results[i] = results[result_idx]
-                    result_idx += 1
-            return partial_results
-        return results
-
-    if mode == 'custom':
-        measure_batch = custom_measure_batch
-
-    measure_batch.parallel_num = parallel_num
-    if mode == 'local':
-        measure_batch.aux_objects = {"server": server, "tracker": tracker}
-    return measure_batch
-
-
-def add_cuda_device_info(device_key, rpc_tracker_addr, kwargs):
-    """Query cuda device info. This is used to set the flags for nvcc compiler
-    and check the validity of a generated code."""
-    from .measure_methods import request_remote
-
-    remote = request_remote(device_key, rpc_tracker_addr)
-    ctx = remote.context('cuda', 0)
-    max_dims = ctx.max_thread_dimensions
-    kwargs['check_gpu'] = {
-        'max_shared_memory_per_block': ctx.max_shared_memory_per_block,
-        'max_threads_per_block': ctx.max_threads_per_block,
-        'max_thread_x': max_dims[0],
-        'max_thread_y': max_dims[1],
-        'max_thread_z': max_dims[2],
-    }
-
-    kwargs["cuda_arch"] = "sm_" + "".join(ctx.compute_version.split('.'))
-
-
-def add_opencl_device_info(device_key, rpc_tracker_addr, kwargs):
-    """Query opencl device info. This is used to check the validity of a generated code."""
-    from .measure_methods import request_remote
-
-    remote = request_remote(device_key, rpc_tracker_addr)
-    ctx = remote.context('opencl', 0)
-    max_dims = ctx.max_thread_dimensions
-    kwargs['check_gpu'] = {
'max_shared_memory_per_block': ctx.max_shared_memory_per_block,
|
|
||||||
'max_threads_per_block': ctx.max_threads_per_block,
|
|
||||||
'max_thread_x': max_dims[0],
|
|
||||||
'max_thread_y': max_dims[1],
|
|
||||||
'max_thread_z': max_dims[2],
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -12,20 +12,24 @@ from random import getrandbits

import numpy as np

-from ...contrib import ndk, nvcc, util
-from ... import rpc, ir_pass, build, build_config, nd, context, TVMError, register_func
+from ... import rpc, ir_pass, build, build_config, nd, context, TVMError, register_func, \
+    target as _target
+from ...contrib import nvcc, util, ndk
+
from ..util import get_const_tuple
from ..env import AutotvmGlobalScope
-from .measure import MeasureResult, MeasureErrorNo
from ..task.space import InstantiationError

+from .measure import MeasureResult, MeasureErrorNo
+from .local_executor import LocalExecutor
+

class HashMismatchError(ValueError):
    """Raised when the code hash of a submitted config doesn't match that on the
    measure side"""
    pass


def request_remote(device_key, tracker_addr=None, priority=1, timeout=60):
    """request a remote session

@ -34,7 +38,9 @@ def request_remote(device_key, tracker_addr=None, priority=1, timeout=60):
    device_key: string
        device key of registered device in tracker
    tracker_addr: Tuple(string, int), optional
-        The address of rpc tracker in (host, port) format
+        The address of rpc tracker in (host, port) format.
+        If None, will use environment variables "TVM_TRACKER_HOST"
+        and "TVM_TRACKER_PORT"
    priority: int, optional
        priority of this request, larger is more prior
    timeout: float, optional

@ -46,8 +52,8 @@ def request_remote(device_key, tracker_addr=None, priority=1, timeout=60):
    """
    # connect to the tracker
    if tracker_addr:
-        host = tracker_addr[0]
-        port = tracker_addr[1]
+        host = tracker_addr[0] or os.environ['TVM_TRACKER_HOST']
+        port = tracker_addr[1] or int(os.environ['TVM_TRACKER_PORT'])
    else:
        host = os.environ['TVM_TRACKER_HOST']
        port = int(os.environ['TVM_TRACKER_PORT'])
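For orientation, here is a minimal usage sketch of `request_remote` (the device key and tracker address are illustrative, and the import path is assumed from this file's location in the tree):

```python
# Minimal sketch: open an RPC session to a device registered under a tracker.
from tvm.autotvm.measure.measure_methods import request_remote  # path assumed

remote = request_remote('rk3399', tracker_addr=('10.77.1.123', 9190),
                        priority=1, timeout=60)
ctx = remote.context('opencl', 0)   # any device type available on the board
print(ctx.max_thread_dimensions)    # device info used by the GPU checks below
```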
@ -58,30 +64,234 @@ def request_remote(device_key, tracker_addr=None, priority=1, timeout=60):
    return remote


-def _measure_generic(fbuild, input_pack, ref_input, ref_output):
-    """Generic measurement function
-
-    Parameters
-    ----------
-    fbuild : function takes MeasureInput returns tuple of (time_func, ctx)
-        The build function used to build each input.
-    input_pack : list of MeasureInput
-        The inputs we need to evaluate
-    ref_input: Array of np.ndarray
-        Reference input for checking correctness
-    ref_output: Array of np.ndarray
-        Reference output for checking correctness
-
-    Returns
-    -------
-    res_pack : array of MeasureResult
-        The list of execution result of measurement.
+def create_measure_batch(task, option):
+    """Get a standard measure_batch function.
+
+    Parameters
+    ----------
+    task: tvm.autotvm.task.Task
+        The tuning task
+    option: dict
+        The option for measuring generated code.
+        You should use the return value of function :any:`measure_option` for this argument.
+
+    Returns
+    -------
+    measure_batch: callable
+        a callback function to measure a batch of configs
+    """
+    from ..database import filter_inputs
+
+    measure_func = option['measure_func']
+    number, repeat = option['number'], option['repeat']
+    timeout, parallel_num, do_fork = option['timeout'], option['parallel_num'], option['do_fork']
+    build_func = option['build_func']
+    check_correctness = option['check_correctness']
+    replay_db = option['replay_db']
+
+    executor = LocalExecutor(timeout=timeout, do_fork=do_fork)
+
+    # convert convenient string to function object
+    attach_objects = None
+    if measure_func == 'local':
+        # start temporary rpc tracker and rpc server for the user
+        tracker = rpc.Tracker('localhost', port=9000, port_end=10000, silent=True)
+        device_key = '$local$device$%d' % tracker.port
+        server = rpc.Server('localhost', port=9000, port_end=10000,
+                            key=device_key,
+                            use_popen=True, silent=True,
+                            tracker_addr=(tracker.host, tracker.port))
+
+        measure_func = use_rpc(device_key, tracker.host, tracker.port)
+        attach_objects = (server, tracker)
+
+    build_kwargs = {}
+    if build_func == 'default':
+        build_func = default_build_func
+    if build_func == 'ndk':
+        build_func = default_build_func
+        build_kwargs['use_ndk'] = True
+
+    # add device info of cuda and opencl target
+    if ('cuda' in task.target.keys or 'opencl' in task.target.keys) \
+            and hasattr(measure_func, 'rpc_info'):
+        rpc_info = measure_func.rpc_info
+        add_gpu_target_info(task.target, rpc_info["key"], (rpc_info["host"], rpc_info["port"]),
+                            build_kwargs)
+
+    if check_correctness:
+        # use llvm cpu to generate a reference input/output
+        # this option works for tuning topi, but might not work for your custom op
+        with _target.create("llvm"):
+            s, arg_bufs = task.instantiate(task.config_space.get(0))
+        ref_input = [np.random.uniform(size=get_const_tuple(x.shape)).astype(x.dtype)
+                     for x in arg_bufs]
+        func = build(s, arg_bufs, "llvm")
+        tvm_buf = [nd.array(x) for x in ref_input]
+        func(*tvm_buf)
+        ref_output = [x.asnumpy() for x in tvm_buf]
+    else:
+        ref_input = ref_output = None
+
+    def measure_batch(measure_inputs):
+        """measure the time cost for a batch of configs in real machines"""
+        if replay_db is not None:
+            partial_results, measure_inputs = \
+                filter_inputs(replay_db, measure_inputs, retry=False)
+
+        # launch measure jobs in parallel
+        pack_size = getattr(measure_func, "pack_size", 1)  # measure `pack_size` inputs in one job
+        futures = []
+        for i in range(0, len(measure_inputs), pack_size):
+            input_pack = measure_inputs[i:i + pack_size]
+            ret = executor.submit(
+                measure_func,
+                input_pack,
+                build_func,
+                build_kwargs,
+                number,
+                repeat,
+                ref_input,
+                ref_output)
+            futures.append(ret)
+
+        # transform results
+        results = []
+        for future in futures:
+            result = future.get()
+            if isinstance(result, Exception):
+                tstamp = time.time()
+                results.extend([MeasureResult((result,), MeasureErrorNo.FLEET_ERROR,
+                                              timeout, tstamp)] * pack_size)
+            else:
+                results.extend(result)
+
+        if replay_db is not None:
+            result_idx = 0
+            for i in range(len(partial_results)):
+                if partial_results[i] is None:
+                    partial_results[i] = results[result_idx]
+                    result_idx += 1
+            return partial_results
+        return results
+
+    measure_batch.parallel_num = parallel_num
+    # attach server and tracker objects to avoid them being garbage-collected
+    measure_batch.attach_objects = attach_objects
+    return measure_batch
+
+
+def use_rpc(key,
+            host=None,
+            port=None,
+            priority=1,
+            session_timeout=60,
+            pack_size=1):
+    """
+    Create a standard measure_func which uses RPC Tracker for measurement.
+    This measure_func will request a device from the RPC Tracker and
+    upload the built binary library to that device for measurement.
+
+    Parameters
+    ----------
+    key: str
+        The registered key of the device in tracker. The tuner will request devices for
+        measurement by this key.
+    host: str, optional
+        The hostname of RPC Tracker. If not set, will use environment variable "TVM_TRACKER_HOST"
+    port: int, optional
+        The port of RPC Tracker. If not set, will use environment variable "TVM_TRACKER_PORT"
+    priority: int, optional
+        Priority of this task, used by scheduler in tracker
+    session_timeout: int, optional
+        Timeout of rpc session
+    pack_size: int, optional
+        The number of configs measured in one RPC session.
+        Usually this can be set to 1. If your device has high overhead to establish an
+        rpc connection, set this higher.
+    """
+    def fmeasure(input_pack, build_func, build_kwargs, number, repeat, ref_input, ref_output):
+        """Do measurement for a list of inputs inside the same RPC session.
+
+        Parameters
+        ----------
+        input_pack: List of MeasureInput
+            The inputs of measurement
+        build_func: callable
+            Function for building the code. see :any:`default_build_func` for example
+        build_kwargs: dict
+            Extra arguments for build_func
+        number : int, optional
+            Number of times to do the measurement for average
+        repeat : int, optional
+            Number of times to repeat the measurement.
+            In total, the generated code will be run (1 + number x repeat) times,
+            where the first one is warm up. The returned result contains `repeat` costs,
+            each of which is the average of `number` test runs.
+        ref_input: List of numpy array
+            Reference input for correctness check
+        ref_output: List of numpy array
+            Reference output for correctness check
+
+        Returns
+        -------
+        results: List of MeasureResult
+            The results for input_pack
+        """
+        remote = request_remote(key, (host, port), priority, session_timeout)
+
+        res = _measure_common(input_pack, build_func, build_kwargs, number, repeat,
+                              ref_input, ref_output,
+                              remote)
+        return res
+
+    fmeasure.pack_size = pack_size
+    fmeasure.rpc_info = {"key": key, "host": host, "port": port}
+    return fmeasure
+
+
+def _measure_common(input_pack, build_func, build_kwargs, number, repeat,
+                    ref_input=None, ref_output=None, remote=None):
+    """Measure the time cost for a pack of inputs.
+
+    (Note: A pack is a list of inputs which will be measured inside the same RPC session)
+
+    Parameters
+    ----------
+    input_pack : list of MeasureInput
+        The inputs we need to evaluate
+    build_func : function takes MeasureInput returns tuple of (time_func, ctx, args)
+        The build function used to build each input.
+    build_kwargs: Dict
+        The extra keyword arguments to build_func
+    number : int, optional
+        Number of times to do the measurement for average
+    repeat : int, optional
+        Number of times to repeat the measurement.
+        In total, the generated code will be run (1 + number x repeat) times,
+        where the first one is warm up. The returned result contains `repeat` costs,
+        each of which is the average of `number` test runs.
+    ref_input: Array of np.ndarray, optional
+        Reference input for checking correctness
+    ref_output: Array of np.ndarray, optional
+        Reference output for checking correctness
+    remote: RPCSession, optional
+        The remote RPC session
+
+    Returns
+    -------
+    res_pack : Array of MeasureResult
+        The list of results of measurement.
    """
    res_pack = []
+    tmp_dir = util.tempdir() if remote else None
+
    for inp in input_pack:
        tic = time.time()
+
+        # build function
        try:
-            time_f, ctx, arg_bufs = fbuild(inp)
+            func, arg_bufs, filename = build_func(inp, tmp_dir, **build_kwargs)
        except TVMError as exc:
            tstamp = time.time()
            msg = str(exc)
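Putting the pieces together, a hedged sketch of driving a measurement batch by hand (the `measure_option` helper is assumed to simply collect the keys that `create_measure_batch` reads above; `task` and `measure_inputs` stand for an autotvm task and its measure inputs):

```python
from tvm import autotvm

# Option dict with exactly the keys create_measure_batch reads above.
option = autotvm.measure_option(
    measure_func=use_rpc('rk3399', host='10.77.1.123', port=9190, pack_size=1),
    number=4, repeat=3, timeout=10, parallel_num=4, do_fork=True,
    build_func='default', check_correctness=False, replay_db=None)

measure_batch = create_measure_batch(task, option)   # task: tvm.autotvm.task.Task
results = measure_batch(measure_inputs)              # MeasureInput list -> MeasureResult list
```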
@ -92,9 +302,7 @@ def _measure_generic(fbuild, input_pack, ref_input, ref_output):
                    msg = msg.split('\n')[-2].split(": ")[1]
                except Exception:  # pylint: disable=broad-except
                    pass
-                res_pack.append(MeasureResult((InstantiationError(msg),),
-                                              MeasureErrorNo.INSTANTIATION_ERROR,
-                                              tstamp - tic, tstamp))
+                raise InstantiationError(msg)
            else:
                res_pack.append(MeasureResult((RuntimeError(msg),),
                                              MeasureErrorNo.COMPILE_HOST,
@ -107,14 +315,26 @@ def _measure_generic(fbuild, input_pack, ref_input, ref_output):
                                              tstamp - tic, tstamp))
            continue

+        # upload built module
+        if remote:
+            remote.upload(tmp_dir.relpath(filename))
+            func = remote.load_module(filename)
+            ctx = remote.context(str(inp.target), 0)
+            time_f = func.time_evaluator(
+                func.entry_name, ctx, number=number, repeat=repeat)
+        else:
+            ctx = context(str(inp.target), 0)
+            time_f = func.time_evaluator(
+                func.entry_name, ctx, number=number, repeat=repeat)
+
        # measure time
        errno = MeasureErrorNo.NO_ERROR
        try:
            if ref_input:
-                args = [nd.array(x, ctx) for x in ref_input]
+                args = [nd.array(x, ctx=ctx) for x in ref_input]
            else:
-                args = [nd.empty(get_const_tuple(x.shape), dtype=x.dtype,
-                                 ctx=ctx) for x in arg_bufs]
+                args = [nd.empty(get_const_tuple(x.shape), dtype=x.dtype, ctx=ctx)
+                        for x in arg_bufs]
            costs = time_f(*args).results
            if len(costs) > 2:  # remove largest and smallest value to reduce variance
                costs = list(costs)
@ -135,10 +355,35 @@ def _measure_generic(fbuild, input_pack, ref_input, ref_output):
        res_pack.append(MeasureResult(costs, errno, tstamp - tic, tstamp))
    return res_pack


-def _build_func(inp, build_option, kwargs):
-    """Build function module. Exception will be raised when error occurs"""
+def default_build_func(inp, tmp_dir=None, **kwargs):
+    """Build function module. Exception will be raised when any error occurs
+
+    Parameters
+    ----------
+    inp: MeasureInput
+        The input of this measurement
+    tmp_dir: tvm.contrib.util.TempDirectory, optional
+        The temporary directory for exporting built binary library.
+        If not None (in RPC mode), the library in this directory will be uploaded to
+        remote devices.
+    kwargs: Dict, optional
+        Other extra arguments
+
+    Returns
+    -------
+    func: Function
+        TVM built function. Typically this is the return value of tvm.build.
+    args: Array of Buffer or Tensor
+        The argument list for the function. Typically this is the second argument of tvm.build.
+    filename: str
+        The filename of the output build library
+    """
+    # build function
    with inp.target:
        s, args = inp.task.instantiate(inp.config)

+    # check invalidity of template and code hash consistency
    if not inp.config.valid():
        raise InstantiationError(inp.config.errors)
    code_hash = getattr(s, 'code_hash', None)
@ -146,140 +391,49 @@ def _build_func(inp, build_option, kwargs):
        raise HashMismatchError('got {0:s}, expected {1:s}'
                                .format(str(inp.config.code_hash), str(code_hash)))

-    opts = build_option or {}
-    if "check_gpu" in kwargs:
-        values = kwargs['check_gpu']
-        # Add gpu verify pass to filter out invalid configs in advance.
-        # This can accelerate the tuning process
-        check_keys = ['max_shared_memory_per_block', 'max_threads_per_block',
-                      'max_thread_x', 'max_thread_y', 'max_thread_z']
-        opts["add_lower_pass"] = [
-            (2, gpu_verify_pass(**{key: values[key] for key in check_keys}))]
+    opts = {}
+    if "check_gpu" in kwargs:  # Add verify pass to filter out invalid configs in advance.
+        opts["add_lower_pass"] = [(2, gpu_verify_pass(**kwargs['check_gpu']))]
    if 'cuda_arch' in kwargs:
        set_cuda_target_arch(kwargs['cuda_arch'])

    with build_config(**opts):
        func = build(s, args, target_host=inp.task.target_host)

-    return func, args
-
-
-def measure_rpc(input_pack,
-                rpc_device_key,
-                number,
-                repeat=1,
-                build_option=None,
-                rpc_tracker_addr=None,
-                rpc_priority=1,
-                rpc_timeout=60,
-                **kwargs):
-    """Measure the time cost on a device by rpc
-
-    Parameters
-    ----------
-    input_pack : list of MeasureInput
-        The inputs we need to evaluate
-    rpc_device_key: str
-        The device key of registered devices in tracker
-    number : int
-        Number of times to get the running measurement
-    repeat : int, optional
-        How many times we want to repeat the measurement.
-    build_option: Dict
-        build options for tvm.build_config
-    rpc_tracker_addr: Tuple(string, int), optional
-        The address of rpc tracker in (host, port) format
-        If is none, will use environment variable
-    rpc_priority: int, optional
-        priority of this task, used by scheduler in tracker
-    rpc_timeout: int, optional
-        timeout of the rpc session
-    kwargs: dict, optional
-        Additional key word arguments
-
-    Returns
-    -------
-    res_pack : Array of MeasureResult
-        The list of execution results of measurement.
-    """
-    def _fbuild(inp):
-        """ Local build function."""
-        func, args = _build_func(inp, build_option, kwargs)
-
-        tmp_dir = util.tempdir()
-        if not kwargs.get('use_ndk', False):
-            file_name = "tmp_func_%0x.tar" % getrandbits(64)
-            path = tmp_dir.relpath(file_name)
-            func.export_library(path)
-        else:
-            file_name = "tmp_func_%0x.so" % getrandbits(64)
-            path = tmp_dir.relpath(file_name)
-            func.export_library(path, ndk.create_shared)
-        remote = request_remote(rpc_device_key, rpc_tracker_addr, rpc_priority, rpc_timeout)
-        remote.upload(path)
-        func = remote.load_module(file_name)
-        ctx = remote.context(str(inp.target), 0)
-        time_f = func.time_evaluator(
-            func.entry_name, ctx, number=number, repeat=repeat)
-        return time_f, ctx, args
-
-    ret = _measure_generic(_fbuild, input_pack,
-                           kwargs.get("ref_input", None), kwargs.get("ref_output", None))
-    return ret
-
-
-def measure_local(input_pack,
-                  number,
-                  repeat=1,
-                  build_option=None,
-                  **kwargs):
-    """Measure the time cost on a local machine.
-
-    Parameters
-    ----------
-    input_pack : list of MeasureInput
-        The inputs we need to evaluate
-    number : int
-        Number of times to get the running measurement
-    repeat : int, optional
-        How many times we want to repeat the measurement.
-    build_option: dict, optional
-        Build options for tvm.build_config
-    kwargs: dict, optional
-        Additional key word arguments
-
-    Returns
-    -------
-    res_pack : Array of MeasureResult
-        The list of execution results of measurement.
-    """
-    def _fbuild(inp):
-        """ Local build function """
-        func, args = _build_func(inp, build_option, kwargs)
-        ctx = context(str(inp.target), 0)
-        time_f = func.time_evaluator(
-            func.entry_name, ctx, number=number, repeat=repeat)
-        return time_f, ctx, args
-
-    ret = _measure_generic(_fbuild, input_pack,
-                           kwargs.get("ref_input", None), kwargs.get("ref_output", None))
-    return ret
-
-
-def gpu_verify_pass(**kwargs):
-    """Verify the validity of a gpu kernel
-    This pass will check shared memory size and number of threads per block.
-    """
-    def verify_pass(stmt):
-        valid = ir_pass.VerifyGPUCode(stmt, kwargs)
-        if not valid:
-            raise InstantiationError("Skipped because of invalid gpu kernel")
-        return stmt
-    return verify_pass
+    # export library to temp directory
+    if tmp_dir:
+        if kwargs.get('use_ndk', False):  # for Android NDK
+            filename = "tmp_func_%0x.so" % getrandbits(64)
+            func.export_library(tmp_dir.relpath(filename), ndk.create_shared)
+        else:
+            filename = "tmp_func_%0x.tar" % getrandbits(64)
+            func.export_library(tmp_dir.relpath(filename))
+    else:
+        filename = None
+
+    return func, args, filename
+
+
+def add_gpu_target_info(target, device_key, rpc_tracker_addr, kwargs):
+    """Add device info for gpu target.
+    The info will be used to check the validity of generated code."""
+    remote = request_remote(device_key, rpc_tracker_addr)
+    ctx = remote.context(str(target), 0)
+    max_dims = ctx.max_thread_dimensions
+    kwargs['check_gpu'] = {
+        'max_shared_memory_per_block': ctx.max_shared_memory_per_block,
+        'max_threads_per_block': ctx.max_threads_per_block,
+        'max_thread_x': max_dims[0],
+        'max_thread_y': max_dims[1],
+        'max_thread_z': max_dims[2],
+    }
+
+    if 'cuda' in target.keys:
+        kwargs["cuda_arch"] = "sm_" + "".join(ctx.compute_version.split('.'))
+
+
+def set_cuda_target_arch(arch):
+    """set target architecture of nvcc compiler"""
+    AutotvmGlobalScope.current.cuda_target_arch = arch


@register_func
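Because `build_func` is pluggable, a custom builder only needs to honor the `(func, args, filename)` contract documented for `default_build_func` above. A minimal sketch under that assumption (error handling and the GPU verify pass are omitted; `my_build_func` is a hypothetical name):

```python
from random import getrandbits

def my_build_func(inp, tmp_dir=None, **kwargs):
    """Hypothetical build_func: instantiate the template, build, and export."""
    with inp.target:
        s, args = inp.task.instantiate(inp.config)
    func = build(s, args, target_host=inp.task.target_host)

    filename = None
    if tmp_dir:  # RPC mode: export so _measure_common can upload the library
        filename = "tmp_func_%0x.tar" % getrandbits(64)
        func.export_library(tmp_dir.relpath(filename))
    return func, args, filename
```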
@ -288,6 +442,14 @@ def tvm_callback_cuda_compile(code):
    ptx = nvcc.compile_cuda(code, target="ptx", arch=AutotvmGlobalScope.current.cuda_target_arch)
    return ptx

-def set_cuda_target_arch(arch):
-    """set target architecture of nvcc compiler"""
-    AutotvmGlobalScope.current.cuda_target_arch = arch
+
+def gpu_verify_pass(**kwargs):
+    """Verify the validity of a gpu kernel.
+    This pass will check memory usage and number of threads per block.
+    """
+    def verify_pass(stmt):
+        valid = ir_pass.VerifyGPUCode(stmt, kwargs)
+        if not valid:
+            raise InstantiationError("Skipped because of invalid gpu kernel")
+        return stmt
+    return verify_pass
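The verify pass is wired in through `build_config`, mirroring what `default_build_func` does when `check_gpu` info is present. A sketch with illustrative limits (real values come from `add_gpu_target_info`; `s`, `args`, and `task` stand for an instantiated template):

```python
# Illustrative device limits; on a real run these are queried over RPC.
check_gpu = {
    'max_shared_memory_per_block': 49152,
    'max_threads_per_block': 1024,
    'max_thread_x': 1024,
    'max_thread_y': 1024,
    'max_thread_z': 64,
}

# Reject oversized kernels at lowering time instead of wasting a measurement.
with build_config(add_lower_pass=[(2, gpu_verify_pass(**check_gpu))]):
    func = build(s, args, target_host=task.target_host)
```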
@ -9,15 +9,12 @@ import multiprocessing
import pickle
import json
import time
-import os
from collections import OrderedDict

-import numpy as np
-
from .. import build, lower, target as _target

from . import task
-from .task import DispatchContext, ConfigEntity
+from .task import ConfigEntity, ApplyHistoryBest
from .measure import MeasureInput, MeasureResult

AUTOTVM_LOG_VERSION = 0.1

@ -120,8 +117,8 @@ def decode(row, protocol='json'):
    tgt = _target.create(str(tgt))

    def clean_json_to_python(x):
-        """1. convert all list in x to tuple (hashable)
-        2. convert unicode to str for python2
+        """1. Convert all list in x to tuple (hashable)
+        2. Convert unicode to str for python2
        """
        if isinstance(x, list):
            return tuple([clean_json_to_python(a) for a in x])

@ -151,6 +148,7 @@ def decode(row, protocol='json'):
    else:
        raise RuntimeError("Invalid log protocol: " + protocol)

+
def load_from_file(filename):
    """Generator: load records from file.
    This is a generator that yields the records.
@ -168,105 +166,6 @@ def load_from_file(filename):
            yield decode(row)


-class ApplyHistoryBest(DispatchContext):
-    """
-    Apply the history best config
-
-    Parameters
-    ----------
-    records : str or iterator of (MeasureInput, MeasureResult)
-        Collection of tuning records.
-        If is str, then it should be the filename of a records log file.
-        Each row of this file is an encoded record pair.
-        Otherwise, it is an iterator.
-    default: ConfigEntity, optional
-        The default config to return when no history records
-    """
-    def __init__(self, records, default=None):
-        super(ApplyHistoryBest, self).__init__()
-
-        self.best_by_targetkey = {}
-        self.best_by_model = {}
-        self._default = default
-
-        self.load(records)
-
-    def load(self, records):
-        """Load records to this dispatch context
-
-        Parameters
-        ----------
-        records : str or iterator of (MeasureInput, MeasureResult)
-            Collection of tuning records.
-            If is str, then it should be the filename of a records log file.
-            Each row of this file is an encoded record pair.
-            Otherwise, it is an iterator.
-        """
-        if isinstance(records, str):
-            records = load_from_file(records)
-        if not records:
-            return
-
-        best_by_targetkey = self.best_by_targetkey
-        best_by_model = self.best_by_model
-
-        counter = 0
-        for inp, res in records:
-            counter += 1
-            if res.error_no != 0:
-                continue
-
-            # use target keys in tvm target system as key to build best map
-            for k in inp.target.keys:
-                key = (k, inp.task.workload)
-                if key not in best_by_targetkey:
-                    best_by_targetkey[key] = (inp, res)
-                else:
-                    _, other_res = best_by_targetkey[key]
-                    if np.mean(other_res.costs) > np.mean(res.costs):
-                        best_by_targetkey[key] = (inp, res)
-
-            # use model as key to build best map
-            for opt in inp.target.options:
-                if opt.startswith("-model"):
-                    model = opt[7:]
-                    key = (model, inp.task.workload)
-                    if key not in best_by_model:
-                        best_by_model[key] = (inp, res)
-                    else:
-                        _, other_res = best_by_model[key]
-                        if np.mean(other_res.costs) > np.mean(res.costs):
-                            best_by_model[key] = (inp, res)
-                    break
-
-        logging.info("Finish loading %d records", counter)
-
-    def query(self, target, workload):
-        if target is None:
-            raise RuntimeError("Need a target context to find the history best. "
-                               "Hint: If your target is llvm, use `with tvm.target.create('llvm'):`"
-                               " above the dispatcher call. So does other target. ")
-
-        # first try matching by model
-        for opt in target.options:
-            if opt.startswith("-model"):
-                model = opt[7:]
-                key = (model, workload)
-                if key in self.best_by_model:
-                    return self.best_by_model[key][0].config
-
-        # then try matching by target key
-        for k in target.keys:
-            key = (k, workload)
-            if key in self.best_by_targetkey:
-                return self.best_by_targetkey[key][0].config
-
-        if self._default:
-            return self._default
-        raise RuntimeError(
-            "Cannot find config for target=%s, workload=%s" % (target, workload))
-
-
def split_workload(in_file, clean=True):
    """Split a log file into separate files, each of which contains only a single workload
    This function can also delete duplicated records in log file
@ -326,7 +225,7 @@ def pick_best(in_file, out_file):
    ----------
    in_file: str
        The filename of input
-    out_file:
+    out_file: str or file
        The filename of output
    """
    best_context = ApplyHistoryBest(load_from_file(in_file))

@ -338,31 +237,13 @@ def pick_best(in_file, out_file):
    for v in best_context.best_by_targetkey.values():
        best_set.add(measure_str_key(v[0]))

-    logging.info("Extract %d best records from the log file", len(best_set))
-
-    fout = open(out_file, 'w')
+    logging.info("Extract %d best records from the %s", len(best_set), in_file)
+    fout = open(out_file, 'w') if isinstance(out_file, str) else out_file
+
    for inp, res in load_from_file(in_file):
        if measure_str_key(inp) in best_set:
            fout.write(encode(inp, res) + "\n")
+            best_set.remove(measure_str_key(inp))
-
-
-def load_op_param(rootpath=os.path.join(os.path.expanduser('~'), ".tvm", "op_params")):
-    """Load pre-tuned parameters of operators.
-    This function will load all "*.log" file under root path and select best configs.
-
-    Parameters
-    ----------
-    rootpath: str
-        The root path of stored parameters
-    """
-    best_context = ApplyHistoryBest([])
-    for dirpath, _, filenames in os.walk(rootpath):
-        for filename in filenames:
-            if os.path.splitext(filename)[1] == '.log':
-                best_context.load(os.path.join(dirpath, filename))
-
-    assert not DispatchContext.current, "Cannot load pre-tuned parameters inside a dispatch context"
-    DispatchContext.current = best_context

"""
Usage:
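A short usage sketch for the record utilities above (file names illustrative; the import path is assumed to be `tvm.autotvm.record`):

```python
from tvm.autotvm.record import pick_best, load_from_file  # path assumed

# Keep only the best record per (target key, workload) from a raw tuning log.
pick_best("rk3399_tuning.log", "rk3399_best.log")

for inp, res in load_from_file("rk3399_best.log"):
    print(inp.task, res.costs)
```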
@ -9,4 +9,7 @@ of typical tasks of interest.
from .task import Task, create, register, template, get_config, args_to_workload
from .space import ConfigSpace, ConfigEntity
from .code_hash import attach_code_hash, attach_code_hash_to_arg
-from .dispatcher import DispatchContext, ApplyConfig, dispatcher
+from .dispatcher import DispatchContext, ApplyConfig, ApplyHistoryBest, dispatcher
+
+from .topi_integration import register_topi_compute, register_topi_schedule
+from .nnvm_integration import extract_from_graph
@ -12,7 +12,10 @@ of the DispatchContext base class.
"""
from __future__ import absolute_import as _abs

+import logging
+
from decorator import decorate
+import numpy as np

from tvm import target as _target

@ -52,25 +55,6 @@ class DispatchContext(object):
        DispatchContext.current = self._old_ctx


-class ApplyConfig(DispatchContext):
-    """Apply a specific config entity during query.
-
-    Parameters
-    ----------
-    config : ConfigSpace or ConfigEntity
-        The specific configuration we care about.
-    """
-    def __init__(self, config):
-        super(ApplyConfig, self).__init__()
-        self._config = config
-        self.workload = None
-
-    def query(self, target, workload):
-        """Override query"""
-        self.workload = workload
-        return self._config
-
-
def dispatcher(fworkload):
    """Wrap a workload dispatcher function.

@ -137,3 +121,124 @@ def dispatcher(fworkload):
    fdecorate = decorate(fworkload, dispatch_func)
    fdecorate.register = register
    return fdecorate
+
+
+class ApplyConfig(DispatchContext):
+    """Apply a specific config entity during query.
+
+    Parameters
+    ----------
+    config : ConfigSpace or ConfigEntity
+        The specific configuration we care about.
+    """
+    def __init__(self, config):
+        super(ApplyConfig, self).__init__()
+        self._config = config
+        self.workload = None
+
+    def query(self, target, workload):
+        """Override query"""
+        self.workload = workload
+        return self._config
+
+
+class ApplyHistoryBest(DispatchContext):
+    """
+    Apply the history best config
+
+    Parameters
+    ----------
+    records : str or iterator of (MeasureInput, MeasureResult)
+        Collection of tuning records.
+        If is str, then it should be the filename of a records log file.
+        Each row of this file is an encoded record pair.
+        Otherwise, it is an iterator.
+    default: ConfigEntity, optional
+        The default config to return when no history records
+    """
+    def __init__(self, records, default=None):
+        super(ApplyHistoryBest, self).__init__()
+
+        self.best_by_targetkey = {}
+        self.best_by_model = {}
+        self._default = default
+
+        if records:
+            self.load(records)
+
+    def load(self, records):
+        """Load records to this dispatch context
+
+        Parameters
+        ----------
+        records : str or iterator of (MeasureInput, MeasureResult)
+            Collection of tuning records.
+            If is str, then it should be the filename of a records log file.
+            Each row of this file is an encoded record pair.
+            Otherwise, it is an iterator.
+        """
+        from ..record import load_from_file
+
+        if isinstance(records, str):
+            records = load_from_file(records)
+        if not records:
+            return
+
+        best_by_targetkey = self.best_by_targetkey
+        best_by_model = self.best_by_model
+
+        counter = 0
+        for inp, res in records:
+            counter += 1
+            if res.error_no != 0:
+                continue
+
+            # use target keys in tvm target system as key to build best map
+            for k in inp.target.keys:
+                key = (k, inp.task.workload)
+                if key not in best_by_targetkey:
+                    best_by_targetkey[key] = (inp, res)
+                else:
+                    _, other_res = best_by_targetkey[key]
+                    if np.mean(other_res.costs) > np.mean(res.costs):
+                        best_by_targetkey[key] = (inp, res)
+
+            # use model as key to build best map
+            for opt in inp.target.options:
+                if opt.startswith("-model"):
+                    model = opt[7:]
+                    key = (model, inp.task.workload)
+                    if key not in best_by_model:
+                        best_by_model[key] = (inp, res)
+                    else:
+                        _, other_res = best_by_model[key]
+                        if np.mean(other_res.costs) > np.mean(res.costs):
+                            best_by_model[key] = (inp, res)
+                    break
+
+        logging.debug("Finish loading %d records", counter)
+
+    def query(self, target, workload):
+        if target is None:
+            raise RuntimeError("Need a target context to find the history best. "
+                               "Hint: If your target is llvm, use `with tvm.target.create('llvm'):`"
+                               " above the dispatcher call. So does other target. ")
+
+        # first try matching by model
+        for opt in target.options:
+            if opt.startswith("-model"):
+                model = opt[7:]
+                key = (model, workload)
+                if key in self.best_by_model:
+                    return self.best_by_model[key][0].config
+
+        # then try matching by target key
+        for k in target.keys:
+            key = (k, workload)
+            if key in self.best_by_targetkey:
+                return self.best_by_targetkey[key][0].config
+
+        if self._default:
+            return self._default
+        raise RuntimeError(
+            "Cannot find config for target=%s, workload=%s" % (target, workload))
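Since `ApplyHistoryBest` is a `DispatchContext`, it can be installed around compilation so template queries resolve to tuned configs. A hedged sketch (log file, model string, and the nnvm graph `net`/`dshape` are illustrative):

```python
import nnvm.compiler
import tvm
from tvm.autotvm.task.dispatcher import ApplyHistoryBest  # path per this diff

with ApplyHistoryBest("rk3399_best.log"):
    target = tvm.target.create("llvm -device=arm_cpu -model=rk3399")
    # net and dshape are assumed defined; templated ops compiled here
    # query the history-best config by (model/target key, workload).
    graph, lib, params = nnvm.compiler.build(net, target=target,
                                             shape={"data": dshape})
```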
@ -0,0 +1,177 @@
+# pylint: disable=unused-variable,invalid-name
+"""
+Decorator and utilities for the integration with TOPI and NNVM
+"""
+import warnings
+
+from ... import tensor, placeholder, target as _target
+
+from ..util import get_const_tuple
+from .task import create, register
+
+
+def serialize_args(args):
+    """serialize arguments of a topi function to a hashable tuple.
+
+    Parameters
+    ----------
+    args: list of hashable or Tensor
+    """
+    ret = []
+    for t in args:
+        if isinstance(t, tensor.Tensor):
+            ret.append(('TENSOR', get_const_tuple(t.shape), t.dtype))
+        else:
+            ret.append(t)
+    return tuple(ret)
+
+
+def deserialize_args(args):
+    """The inverse function of :code:`serialize_args`.
+
+    Parameters
+    ----------
+    args: list of hashable or Tensor
+    """
+    ret = []
+    for t in args:
+        if isinstance(t, tuple) and t[0] == 'TENSOR':
+            ret.append(placeholder(shape=t[1], dtype=t[2]))
+        else:
+            ret.append(t)
+    return ret
+
+
+# Task extractor for nnvm graph
+class TaskExtractEnv:
+    """Global environment for extracting tuning tasks from nnvm graph"""
+    current = None
+
+    def __init__(self):
+        import topi
+        import nnvm
+
+        self.symbol2topi = {
+            nnvm.sym.conv2d: [topi.nn.conv2d, topi.nn.depthwise_conv2d_nchw]
+        }
+
+        self.topi_to_task = {
+            topi.nn.conv2d: "topi_nn_conv2d",
+            topi.nn.depthwise_conv2d_nchw: "topi_nn_depthwise_conv2d_nchw",
+        }
+
+        self._register_dummy()
+        self._register_topi_task()
+        self.task_collection = []
+
+    def _register_dummy(self):
+        """Register dummy function to track the topi function call"""
+        for func in self.topi_to_task:
+            def _local_scope(local_func):
+                """build a scope to hold the function"""
+                @local_func.register("dummy", )
+                def _dummy_func(*args, **kwargs):
+                    assert not kwargs, "Do not support extracting tuning tasks when " \
+                                       "kwargs is used in TOPI function call. " \
+                                       "Please modify it to use only positional args."
+
+                    if (self.topi_to_task[local_func], serialize_args(args)) \
+                            not in self.task_collection:
+                        self.task_collection.append((self.topi_to_task[local_func],
+                                                     serialize_args(args)))
+                    with _target.create("opencl"):
+                        return local_func(*args)
+
+            _local_scope(func)
+
+    def _register_topi_task(self):
+        """register tuning wrapper for topi function"""
+        import topi
+
+        # Tuning wrapper for topi functions
+        @register("topi_nn_conv2d")
+        def _topi_nn_conv2d(*args, **kwargs):
+            assert not kwargs, "Do not support kwargs in template function call"
+            args = deserialize_args(args)
+            A, W = args[:2]
+            layout = args[-2]
+            assert layout == 'NCHW', "only support NCHW currently"
+            C = topi.nn.conv2d(*args, **kwargs)
+            s = topi.generic.schedule_conv2d_nchw([C])
+            return s, [A, W, C]
+
+        @register("topi_nn_depthwise_conv2d_nchw")
+        def _topi_nn_depthwise_conv2d_nchw(*args, **kwargs):
+            assert not kwargs, "Do not support kwargs in template function call"
+            args = deserialize_args(args)
+            A, W = args[:2]
+            C = topi.nn.depthwise_conv2d_nchw(*args, **kwargs)
+            s = topi.generic.schedule_depthwise_conv2d_nchw([C])
+            return s, [A, W, C]
+
+    def reset(self):
+        """Reset task collections"""
+        self.task_collection = []
+
+    def get_tasks(self):
+        """Get collected tasks"""
+        return self.task_collection
+
+    @staticmethod
+    def get():
+        """Get the single instance of TaskExtractEnv"""
+        if not TaskExtractEnv.current:
+            TaskExtractEnv.current = TaskExtractEnv()
+        return TaskExtractEnv.current
+
+
+def extract_from_graph(graph, shape, dtype, target, symbols, target_host=None):
+    """Extract tuning tasks from a nnvm graph.
+
+    This function collects tuning tasks by building the graph
+    with a "dummy" target and tracing all the calls to topi.
+
+    Parameters
+    ----------
+    graph : Graph
+        The graph to tune
+    shape : dict of str to tuple, optional
+        The input shape to the graph
+    dtype : str or dict of str to str
+        The input types to the graph
+    target: tvm.target.Target
+        The compilation target
+    symbols : Array of nnvm.symbol
+        Array of nnvm symbols
+    target_host: tvm.target.Target
+        The host compilation target
+
+    Returns
+    -------
+    task: Array of autotvm.task.Task
+        collected tasks
+    """
+    import nnvm.compiler
+
+    env = TaskExtractEnv.get()
+
+    topi_funcs = []
+    for sym_name in symbols:
+        if sym_name in env.symbol2topi:
+            topi_funcs.extend(env.symbol2topi[sym_name])
+        else:
+            warnings.warn("Symbol %s is not tunable, ignored" % sym_name)
+
+    # run compiler to collect all TOPI calls during compilation
+    env.reset()
+    dummy_target = _target.create("opencl -device=dummy")
+    nnvm.compiler.build(graph, target=dummy_target, shape=shape, dtype=dtype)
+
+    tasks = []
+    for task_name, args in env.get_tasks():
+        tasks.append(create(task_name, args,
+                            target=target, target_host=target_host,
+                            template_key='direct'))
+
+    return tasks
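A sketch of task extraction on a real network (the resnet workload helper and shapes are illustrative):

```python
import nnvm
import nnvm.testing
import tvm
from tvm import autotvm

net, params = nnvm.testing.resnet.get_workload(num_layers=18, batch_size=1)
tasks = autotvm.task.extract_from_graph(
    net, shape={'data': (1, 3, 224, 224)}, dtype='float32',
    target=tvm.target.create('llvm -device=arm_cpu'),
    symbols=(nnvm.sym.conv2d,))
for tsk in tasks:
    print(tsk)
```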
@ -21,6 +21,11 @@ from tvm.autotvm.util import get_const_int

Axis = namedtuple('Axis', ['space', 'index'])

+try:
+    _long = long
+except NameError:
+    _long = int
+

class InstantiationError(ValueError):
    """Actively detected error in instantiating a template with a config,

@ -103,7 +108,7 @@ class VirtualAxis(TransformSpace):
        VirtualAxis.name_ct += 1

        self.name = name
-        if isinstance(var, int):
+        if isinstance(var, (int, _long)):
            self.length = var
        elif isinstance(var, schedule.IterVar):
            self.name = var.var.name

@ -114,7 +119,7 @@ class VirtualAxis(TransformSpace):
        elif isinstance(var, VirtualAxis):
            self.length = var.length
        else:
-            raise RuntimeError("Invalid type of axis")
+            raise RuntimeError("Invalid type of axis: " + str(type(var)))

    @staticmethod
    def get_num_output(var, name=None):

@ -362,7 +362,7 @@ def compute_flop(sch):
            exp = body[0]

            ret += num_element * _count_flop(exp)
-            ret += traverse([sch[t].op for t in op.input_tensors])
+            ret += traverse([t.op for t in op.input_tensors])

        elif isinstance(op, tensor.PlaceholderOp):
            pass

@ -382,5 +382,4 @@ def compute_flop(sch):
        raise RuntimeError("Cannot find float number operation in this operator. "
                           "Please use `cfg.add_flop` to manually set "
                           "FLOP for this operator")
-
    return ret
@ -0,0 +1,193 @@
+# pylint: disable=unused-variable,invalid-name
+"""
+Decorators for registering tunable templates to TOPI.
+
+These decorators can make your simple implementation be able to use different configurations
+for different workloads.
+Here we directly use all arguments to the TOPI call as "workload", so make sure all the arguments
+(except tvm.Tensor) in your calls are hashable. For tvm.Tensor, we will serialize it to a hashable
+tuple.
+
+See tvm/topi/python/topi/arm_cpu/depthwise_conv2d.py for example usage.
+"""
+
+from ... import _api_internal, tensor
+
+from ..util import get_func_name
+from .task import args_to_workload, dispatcher
+
+
+# A table that records all registered dispatchers for all targets
+_REGISTED_DISPATHCER = {
+}
+
+
+def register_topi_compute(topi_compute, target_keys, template_keys, func=None):
+    """Register a tunable template for a topi compute function.
+
+    After the registration, this topi compute will become a configuration dispatcher. It uses
+    all its arguments as workload and dispatches configurations according to the input workload.
+
+    It also stores this "workload" to its final ComputeOp, which can be used to reconstruct
+    "workload" in the following topi_schedule call.
+
+    Parameters
+    ----------
+    topi_compute: GenericFunc
+        The topi compute function that will be overloaded
+    target_keys: str or list of str
+        The compilation target. The same as the argument of GenericFunc.register.
+    template_keys: str or list of str
+        The template key.
+        We might have several strategies for a single operator (e.g. direct, im2col, winograd).
+        The template key is used to identify the algorithm strategy.
+        Every operator must have a "direct" template, which is used by default.
+    func: None or callable
+        If it is None, return a decorator.
+        If is callable, decorate this function.
+
+    Returns
+    -------
+    decorator: callable
+        A decorator
+
+    Examples
+    --------
+    See tvm/topi/python/topi/arm_cpu/depthwise_conv2d.py for example usage.
+    """
+    fname = get_func_name(topi_compute)
+
+    def _decorator(f):
+        targets = [target_keys] if isinstance(target_keys, str) else target_keys
+        for target_key in targets:
+            if target_key not in _REGISTED_DISPATHCER:
+                _REGISTED_DISPATHCER[target_key] = {}
+            if topi_compute not in _REGISTED_DISPATHCER:
+                @topi_compute.register(target_key)
+                @dispatcher
+                def config_dispatcher(*args, **kwargs):
+                    """override topi call as a config dispatcher"""
+                    assert not kwargs, "Do not support kwargs in template function call"
+                    return (fname, ) + args_to_workload(args)
+                _REGISTED_DISPATHCER[target_key][topi_compute] = config_dispatcher
+
+            config_dispatcher = _REGISTED_DISPATHCER[target_key][topi_compute]
+
+            @config_dispatcher.register(template_keys)
+            def template_call(cfg, *args, **kwargs):
+                """call the topi func and attach workload to compute node"""
+                assert not kwargs, "Do not support kwargs in template function call"
+
+                if f == topi_compute.fdefault:
+                    node = f(*args, **kwargs)
+                else:
+                    node = f(cfg, *args, **kwargs)
+
+                # attach workload to return op
+                op = node.op
+                attrs = {}
+                for k, v in node.op.attrs.items():
+                    attrs[k] = v
+                attrs['workload'] = (fname, ) + args_to_workload(args)
+                if isinstance(op, tensor.ComputeOp):
+                    op = _api_internal._ComputeOp(
+                        op.name, op.tag, attrs, op.axis, op.body)
+                elif isinstance(op, tensor.ExternOp):
+                    op = _api_internal._ExternOp(
+                        op.name, op.tag, attrs,
+                        op.inputs, op.input_placeholders,
+                        op.output_placeholders, op.body)
+                else:
+                    raise RuntimeError("Unsupported op type: " + str(type(op)))
+
+                if isinstance(node, tensor.Tensor):
+                    return op.output(0)
+                return [op.output(i) for i in range(len(node))]
+
+        return f
+
+    if func:
+        _decorator(func)
+
+    return _decorator
+
+
+def register_topi_schedule(topi_schedule, target_keys, template_keys, func=None):
+    """Register a tunable template for a topi schedule function.
+
+    After the registration, this topi schedule will become a configuration dispatcher. It
+    dispatches configurations according to the input workload.
+
+    Note that this function will try to find "workload" from all the ComputeOp in the input.
+    You can attach "workload" to your compute op by using :any:`register_topi_compute`.
+
+    Parameters
+    ----------
+    topi_schedule: GenericFunc
+        The topi schedule function that will be overloaded
+    target_keys: str or list of str
+        The compilation target
+    template_keys: str or list of str
+        The template key.
+        We might have several strategies for a single operator (e.g. direct, im2col, winograd).
+        The template key is used to identify the algorithm strategy.
+        Every operator must have a "direct" template, which is used by default.
+    func: None or callable
+        If it is None, return a decorator.
+        If is callable, decorate this function.
+
+    Returns
+    -------
+    decorator: callable
+        A decorator
+
+    Examples
+    --------
+    See tvm/topi/python/topi/arm_cpu/depthwise_conv2d.py for example usage.
+    """
+    def _decorator(f):
+        targets = [target_keys] if isinstance(target_keys, str) else target_keys
+        for target_key in targets:
+            if target_key not in _REGISTED_DISPATHCER:
+                _REGISTED_DISPATHCER[target_key] = {}
+            if topi_schedule not in _REGISTED_DISPATHCER[target_key]:
+                @topi_schedule.register(target_key)
+                @dispatcher
+                def config_dispatcher(outs):
+                    """override topi call as a workload dispatcher"""
+                    def traverse(tensors):
+                        """traverse all ops to find attached workload"""
+                        for t in tensors:
+                            op = t.op
+                            if 'workload' in op.attrs:
+                                return op.attrs['workload']
+                            wkl = traverse(op.input_tensors)
+                            if wkl:
+                                return wkl
+                        return None
+
+                    outs = [outs] if isinstance(outs, tensor.Tensor) else outs
+                    workload = traverse(outs)
+
+                    if workload is None:
+                        raise RuntimeError("Cannot find workload in attribute of this schedule")
+
+                    return args_to_workload(workload)
+
+                _REGISTED_DISPATHCER[target_key][topi_schedule] = config_dispatcher
|
||||||
|
|
||||||
|
config_dispatcher = _REGISTED_DISPATHCER[target_key][topi_schedule]
|
||||||
|
|
||||||
|
@config_dispatcher.register(template_keys)
|
||||||
|
def template_call(cfg, outs):
|
||||||
|
"""call the schedule func"""
|
||||||
|
if f == topi_schedule.fdefault:
|
||||||
|
return f(outs)
|
||||||
|
return f(cfg, outs)
|
||||||
|
|
||||||
|
return f
|
||||||
|
|
||||||
|
if func:
|
||||||
|
_decorator(func)
|
||||||
|
|
||||||
|
return _decorator
|
|
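To make the pairing concrete, here is a hedged sketch of how the two decorators are meant to be used together. The operator (`topi.nn.depthwise_conv2d_nchw`), the generic schedule function, and the `autotvm.task` export path are assumptions based on the docstrings above; the real reference is tvm/topi/python/topi/arm_cpu/depthwise_conv2d.py.

```python
import tvm
from tvm import autotvm
import topi

# Register a tunable 'direct' template for the compute. The decorated
# function receives cfg first; the remaining arguments form the workload.
@autotvm.task.register_topi_compute(topi.nn.depthwise_conv2d_nchw, 'arm_cpu', 'direct')
def _decl_depthwise_conv2d(cfg, data, kernel, strides, padding, out_dtype):
    # declare tuning knobs on cfg, then build the output with tvm.compute;
    # the body is elided in this sketch
    ...

# Register the matching schedule. The 'workload' attribute attached by the
# compute above is how the dispatcher routes the right cfg here.
@autotvm.task.register_topi_schedule(topi.generic.schedule_depthwise_conv2d_nchw,
                                     'arm_cpu', 'direct')
def _schedule_depthwise_conv2d(cfg, outs):
    s = tvm.create_schedule([x.op for x in outs])
    # apply cfg-driven scheduling to s; elided
    return s
```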
@ -0,0 +1,123 @@
"""
TopHub: Tensor Operator Hub
To get the best performance, we typically need auto-tuning for specific devices.
TVM releases pre-tuned parameters in TopHub for some common networks and hardware targets.
TVM will download these parameters for you when you create the target for the first time.
"""

import logging
import os
import json

from .task import ApplyHistoryBest
from .. import target as _target
from ..contrib.util import tempdir
from ..contrib.download import download

AUTOTVM_TOPHUB_ROOT_PATH = os.path.join(os.path.expanduser('~'), ".tvm", "tophub")


def _alias(name):
    """convert alias for some packages"""
    table = {
        'vtacpu': 'vta',
    }
    return table.get(name, name)


def context(target, extra_files=None):
    """Return the dispatch context with pre-tuned parameters.
    The corresponding downloaded *.log files under the tophub root path will be loaded.
    Users can also add their own files in argument `extra_files`.

    Parameters
    ----------
    target: Target
        The compilation target
    extra_files: list of str, optional
        Extra log files to load
    """
    rootpath = AUTOTVM_TOPHUB_ROOT_PATH
    best_context = ApplyHistoryBest([])

    if isinstance(target, str):
        target = _target.create(target)

    big_target = str(target).split()[0]
    if os.path.isfile(os.path.join(rootpath, big_target + ".log")):
        best_context.load(os.path.join(rootpath, big_target + ".log"))

    for opt in target.options:
        if opt.startswith("-device"):
            model = _alias(opt[8:])
            if os.path.isfile(os.path.join(rootpath, model) + ".log"):
                best_context.load(os.path.join(rootpath, model) + ".log")

    if extra_files:
        for filename in extra_files:
            best_context.load(filename)

    return best_context


def download_package(backend):
    """Download pre-tuned parameters of operators for a backend

    Parameters
    ----------
    backend: str
        The name of the package
    """
    rootpath = AUTOTVM_TOPHUB_ROOT_PATH

    if not os.path.isdir(rootpath):
        # make directory
        splits = os.path.split(rootpath)
        for j in range(1, len(splits)+1):
            path = os.path.join(*splits[:j])
            if not os.path.isdir(path):
                os.mkdir(path)

    backend = _alias(backend)
    logging.info("Download pre-tuned parameters for %s", backend)
    download("https://raw.githubusercontent.com/uwsaml/tvm-distro/master/tophub/%s.log" % backend,
             os.path.join(rootpath, backend + ".log"), True, verbose=0)


def check_package(backend):
    """Check whether pre-tuned parameters exist for a certain target.
    If not, download them.

    Parameters
    ----------
    backend: str
        The name of the package
    """
    backend = _alias(backend)

    if os.path.isfile(os.path.join(AUTOTVM_TOPHUB_ROOT_PATH, backend + ".log")):
        return
    download_package(backend)


def list_packages():
    """List all available pre-tuned op parameters for targets

    Returns
    -------
    ret: List
        All available packages
    """
    path = tempdir()
    filename = path.relpath("info.json")
    logging.info("Download meta info for pre-tuned parameters")
    download("https://raw.githubusercontent.com/uwsaml/tvm-distro/master/tophub/info.json",
             filename, True, verbose=0)

    with open(filename, "r") as fin:
        text = "".join(fin.readlines())
    info = json.loads(text)
    keys = list(info.keys())
    keys.sort()

    return [(k, info[k]) for k in keys]
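A short usage sketch for the dispatch context above; the target string and the extra log file name are illustrative:

```python
from tvm import autotvm, target

tgt = target.create('llvm -device=arm_cpu -model=rk3399')
# load any downloaded tophub logs for this target, plus a local file
with autotvm.tophub.context(tgt, extra_files=['my_tuning.log']):
    # compilation performed here sees the pre-tuned configurations
    ...
```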
@ -1,10 +1,13 @@
# pylint: disable=consider-using-enumerate,invalid-name
"""Namespace of callback utilities of AutoTVM"""
+import sys
+import time
+
import numpy as np

from .. import record


def log_to_file(file_out, protocol='json'):
    """Log the tuning records into file.
    The rows of the log are stored in the format of autotvm.record.encode.

@ -21,7 +24,6 @@ def log_to_file(file_out, protocol='json'):
    callback : callable
        Callback function to do the logging.
    """
-
    def _callback(_, inputs, results):
        """Callback implementation"""
        if isinstance(file_out, str):

@ -34,55 +36,21 @@ def log_to_file(file_out, protocol='json'):
    return _callback


-def save_tuner_state(prefix, save_every_sample=100):
-    """Save the state of tuner
+def log_to_database(db):
+    """Save the tuning records to a database object.

    Parameters
    ----------
-    prefix : srt
-        prefix of the filename to store state
-    save_every_sample: int
-        save the state every x samples
-
-    Returns
-    -------
-    callback : function
-        Callback function to do the auto saving.
+    db: Database
+        The database
    """
-    def _callback(tuner, inputs, results):
-        for _, __ in zip(inputs, results):
-            try:
-                ct = len(tuner.visited)
-            except AttributeError:
-                ct = 0
-            if ct % save_every_sample == 0:
-                tuner.save_state(prefix + "_%d.state" % ct)
-
-    return _callback
-
-
-def log_to_redis(host="localhost", port=6379, dbn=11):
-    """Record the tuning record to a redis DB.
-
-    Parameters
-    ----------
-    host: str, optional
-        Host address of redis db
-    port: int, optional
-        Port of redis db
-    dbn: int, optional
-        which redis db to use, default 11
-    """
-    # import here so only depend on redis when necessary
-    import redis
-    red = redis.StrictRedis(host=host, port=port, db=dbn)
-
    def _callback(_, inputs, results):
        """Callback implementation"""
        for inp, result in zip(inputs, results):
-            red.set(inp, result)
+            db.save(inp, result)
    return _callback


class Monitor(object):
    """A monitor to collect statistics during tuning"""
    def __init__(self):

@ -110,3 +78,47 @@ class Monitor(object):
    def trial_timestamps(self):
        """get wall clock time stamps of all trials"""
        return np.array(self.timestamps)
+
+
+def progress_bar(total, prefix=''):
+    """Display progress bar for tuning
+
+    Parameters
+    ----------
+    total: int
+        The total number of trials
+    prefix: str
+        The prefix of the output message
+    """
+    class _Context:
+        """Context to store local variables"""
+        def __init__(self):
+            self.best_flops = 0
+            self.cur_flops = 0
+            self.ct = 0
+            self.total = total
+
+        def __del__(self):
+            sys.stdout.write(' Done.\n')
+
+    ctx = _Context()
+    tic = time.time()
+
+    def _callback(tuner, inputs, results):
+        ctx.ct += len(inputs)
+
+        flops = 0
+        for inp, res in zip(inputs, results):
+            if res.error_no == 0:
+                flops = inp.task.flop / np.mean(res.costs)
+
+        ctx.cur_flops = flops
+        ctx.best_flops = tuner.best_flops
+
+        sys.stdout.write('\r%s Current/Best: %7.2f/%7.2f GFLOPS | Progress: (%d/%d) '
+                         '| %.2f s' %
+                         (prefix, ctx.cur_flops/1e9, ctx.best_flops/1e9, ctx.ct, ctx.total,
+                          time.time() - tic))
+        sys.stdout.flush()
+
+    return _callback
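The callbacks above are designed to be stacked in a single tune() call; a sketch, with the tuner and measurement options created elsewhere:

```python
n_trial = 1000
tuner.tune(n_trial=n_trial,
           measure_option=measure_option,
           callbacks=[
               autotvm.callback.progress_bar(n_trial, prefix='conv2d'),
               autotvm.callback.log_to_file('conv2d.log'),
           ])
```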
@ -117,3 +117,6 @@ class GATuner(Tuner):

    def has_next(self):
        return len(self.visited) - (len(self.genes) - self.trial_pt) < len(self.space)
+
+    def load_history(self, data_set):
+        pass

@ -25,6 +25,9 @@ class GridSearchTuner(Tuner):
    def has_next(self):
        return self.counter < len(self.task.config_space)

+    def load_history(self, data_set):
+        pass
+
    def __getstate__(self):
        return {"counter": self.counter}

@ -56,6 +59,9 @@ class RandomTuner(Tuner):
    def has_next(self):
        return len(self.visited) < len(self.task.config_space)

+    def load_history(self, data_set):
+        pass
+
    def __getstate__(self):
        return {"visited": self.counter}

@ -242,7 +242,7 @@ class ModelBasedTuner(Tuner):
            self.ys.append(flops)
        else:
            self.xs.append(index)
-            self.ys.append(0)
+            self.ys.append(0.0)

        # if we have enough new training samples
        if len(self.xs) >= self.plan_size * (self.train_ct + 1) \
@ -26,11 +26,11 @@ class SimulatedAnnealingOptimizer(ModelOptimizer):
        If it is an Array, then perform linear cooling from temp[0] to temp[1]
    early_stop: int, optional
        Stop iteration if the optimal set does not change in `early_stop` rounds
-    verbose: int, optional
-        Print log every `verbose` iterations
+    log_interval: int, optional
+        Print log every `log_interval` iterations
    """
    def __init__(self, task, n_iter=500, temp=(1, 0), persistent=True, parallel_size=128,
-                 early_stop=50, verbose=50):
+                 early_stop=50, log_interval=50):
        super(SimulatedAnnealingOptimizer, self).__init__()

        self.task = task

@ -41,12 +41,13 @@ class SimulatedAnnealingOptimizer(ModelOptimizer):
        self.persistent = persistent
        self.parallel_size = min(parallel_size, len(self.task.config_space))
        self.early_stop = early_stop or 1e9
-        self.verbose = verbose
+        self.log_interval = log_interval
        self.points = None

    def find_maximums(self, model, num, exclusive):
        tic = time.time()
-        temp, n_iter, early_stop, verbose = self.temp, self.n_iter, self.early_stop, self.verbose
+        temp, n_iter, early_stop, log_interval = \
+            self.temp, self.n_iter, self.early_stop, self.log_interval

        if self.persistent and self.points is not None:
            points = self.points

@ -100,19 +101,18 @@ class SimulatedAnnealingOptimizer(ModelOptimizer):
            k += 1
            t -= cool

-            if verbose >= 1 and k % verbose == 0:
+            if log_interval and k % log_interval == 0:
                t_str = "%.2f" % t
-                logging.info("SA iter: %d\tlast_update: %d\tmax-0: %.2f\tmax-1: %.2f\ttemp: %s\t"
-                             "elapsed: %.2f",
-                             k, k_last_modify, heap_items[0][0],
-                             np.max([v for v, _ in heap_items]), t_str,
-                             time.time() - tic)
+                logging.debug("SA iter: %d\tlast_update: %d\tmax-0: %.2f\tmax-1: %.2f\ttemp: %s\t"
+                              "elapsed: %.2f",
+                              k, k_last_modify, heap_items[0][0],
+                              np.max([v for v, _ in heap_items]), t_str,
+                              time.time() - tic)

        heap_items.sort(key=lambda item: -item[0])
-        if verbose:
-            logging.info("SA iter: %d\tlast_update: %d\tmax-0: %.2f\tmax-1: %.2f\telapsed: %.2f",
-                         k, k_last_modify, heap_items[-1][0], heap_items[0][0], time.time() - tic)
-            logging.info("SA Maximums: %s", heap_items)
+        logging.debug("SA iter: %d\tlast_update: %d\tmax-0: %.2f\tmax-1: %.2f\telapsed: %.2f",
+                      k, k_last_modify, heap_items[-1][0], heap_items[0][0], time.time() - tic)
+        logging.debug("SA Maximums: %s", heap_items)

        if self.persistent:
            self.points = points
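Constructing the optimizer with the renamed argument, as a sketch; `task` and `cost_model` are assumed to exist:

```python
opt = SimulatedAnnealingOptimizer(task, n_iter=500, temp=(1, 0),
                                  early_stop=50, log_interval=100)
# exclusive is the set of already-visited config indexes
best_indexes = opt.find_maximums(cost_model, num=32, exclusive=set())
```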
@ -7,6 +7,7 @@ import numpy as np
from ..measure import MeasureInput
from ..measure import create_measure_batch
+from ..env import GLOBAL_SCOPE

class Tuner(object):
    """Base class for tuners

@ -64,7 +65,7 @@ class Tuner(object):
        """
        pass

-    def tune(self, n_trial, measure_option, early_stop=None, verbose=1, callbacks=()):
+    def tune(self, n_trial, measure_option, early_stopping=None, callbacks=()):
        """Begin tuning

        Parameters

@ -74,11 +75,8 @@ class Tuner(object):
        measure_option: dict
            The options for how to measure generated code.
            You should use the return value of autotvm.measure_option for this argument.
-        early_stop: int
+        early_stopping: int
            Early stop the tuning when not finding better configs in this number of trials
-        verbose: int
-            0: silent mode, no output
-            1: print every measurement result
        callbacks: List of callable
            A list of callback functions. The signature of a callback function is
            (Tuner, List of MeasureInput, List of MeasureResult)

@ -87,8 +85,9 @@ class Tuner(object):
        """
        measure_batch = create_measure_batch(self.task, measure_option)
        parallel_num = getattr(measure_batch, 'parallel_num', 1)
-        early_stop = early_stop or 1e9
+        early_stopping = early_stopping or 1e9

+        GLOBAL_SCOPE.in_tuning = True
        i = 0
        while i < n_trial:
            if not self.has_next():

@ -99,23 +98,22 @@ class Tuner(object):
            inputs = [MeasureInput(self.task.target, self.task, config) for config in configs]
            results = measure_batch(inputs)

-            # print info
-            if verbose >= 1:
-                for k, (inp, res) in enumerate(zip(inputs, results)):
-                    config = inp.config
-                    if res.error_no == 0:
-                        flops = inp.task.flop / np.mean(res.costs)
-                    else:
-                        flops = 0
-                    if flops > self.best_flops:
-                        self.best_flops = flops
-                        self.best_config = config
-                        self.best_measure_pair = (inp, res)
-                        self.best_iter = i + k
-
-                    logging.info("No: %d\tGFLOPS: %.2f/%.2f\tresult: %s\t%s",
-                                 i + k + 1, flops / 1e9, self.best_flops / 1e9,
-                                 res, config)
+            # keep best config
+            for k, (inp, res) in enumerate(zip(inputs, results)):
+                config = inp.config
+                if res.error_no == 0:
+                    flops = inp.task.flop / np.mean(res.costs)
+                else:
+                    flops = 0
+                if flops > self.best_flops:
+                    self.best_flops = flops
+                    self.best_config = config
+                    self.best_measure_pair = (inp, res)
+                    self.best_iter = i + k
+
+                logging.debug("No: %d\tGFLOPS: %.2f/%.2f\tresult: %s\t%s",
+                              i + k + 1, flops / 1e9, self.best_flops / 1e9,
+                              res, config)

            i += len(results)

@ -124,10 +122,12 @@ class Tuner(object):
            for callback in callbacks:
                callback(self, inputs, results)

-            if i > self.best_iter + early_stop:
-                logging.info("Early stopped. Best iter: %d.", self.best_iter)
+            if i > self.best_iter + early_stopping:
+                logging.debug("Early stopped. Best iter: %d.", self.best_iter)
                break

+        GLOBAL_SCOPE.in_tuning = False
        del measure_batch

    def reset(self):
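Since the per-measurement lines above were demoted from logging.info to logging.debug, callers who want the old console output have to opt in explicitly:

```python
import logging

# re-enable the per-trial "No: ...\tGFLOPS: ..." messages
logging.basicConfig(level=logging.DEBUG)
```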
@ -42,10 +42,10 @@ class XGBoostCostModel(CostModel):
        The cost model predicts relative rank score.
    num_threads: int, optional
        The number of threads.
-    verbose: int, optional
-        If it is not None, the cost model will print a training log every `verbose` iterations.
+    log_interval: int, optional
+        If it is not None, the cost model will print a training log every `log_interval` iterations.
    """
-    def __init__(self, task, feature_type, loss_type, num_threads=None, verbose=20):
+    def __init__(self, task, feature_type, loss_type, num_threads=None, log_interval=25):
        super(XGBoostCostModel, self).__init__()

        if xgb is None:

@ -60,7 +60,7 @@ class XGBoostCostModel(CostModel):
        self.fea_type = feature_type
        self.loss_type = loss_type
        self.num_threads = num_threads
-        self.verbose = verbose
+        self.log_interval = log_interval

        if loss_type == 'reg':
            self.xgb_params = {

@ -139,7 +139,8 @@ class XGBoostCostModel(CostModel):

        x_train = self._get_feature(xs)
        y_train = np.array(ys)
-        y_train = y_train / np.max(y_train)
+        y_max = np.max(y_train)
+        y_train = y_train / max(y_max, 1e-8)

        valid_index = y_train > 1e-6
        index = np.random.permutation(len(x_train))

@ -160,19 +161,20 @@ class XGBoostCostModel(CostModel):
                               fevals=[
                                   xgb_average_recalln_curve_score(plan_size),
                               ],
-                              verbose_eval=self.verbose)])
+                              verbose_eval=self.log_interval)])

-        logging.info("train: %.2f\tobs: %d\terror: %d\tn_cache: %d",
-                     time.time() - tic, len(xs),
-                     len(xs) - np.sum(valid_index),
-                     self.feature_cache.size(self.fea_type))
+        logging.debug("XGB train: %.2f\tobs: %d\terror: %d\tn_cache: %d",
+                      time.time() - tic, len(xs),
+                      len(xs) - np.sum(valid_index),
+                      self.feature_cache.size(self.fea_type))

    def fit_log(self, records, plan_size):
        tic = time.time()
        self._reset_pool()

        args = list(records)
-        logging.info("Load %d entries from history log file", len(args))
+        logging.debug("XGB load %d entries from history log file", len(args))

        if self.fea_type == 'itervar':
            feature_extract_func = _extract_itervar_feature_log
        elif self.fea_type == 'knob':

@ -187,7 +189,8 @@ class XGBoostCostModel(CostModel):

        x_train = xs
        y_train = ys
-        y_train /= np.max(y_train)
+        y_max = np.max(y_train)
+        y_train = y_train / max(y_max, 1e-8)

        index = np.random.permutation(len(x_train))
        dtrain = xgb.DMatrix(x_train[index], y_train[index])

@ -203,9 +206,9 @@ class XGBoostCostModel(CostModel):
                               fevals=[
                                   xgb_average_recalln_curve_score(plan_size),
                               ],
-                              verbose_eval=self.verbose)])
+                              verbose_eval=self.log_interval)])

-        logging.info("train: %.2f\tobs: %d", time.time() - tic, len(xs))
+        logging.debug("XGB train: %.2f\tobs: %d", time.time() - tic, len(xs))

    def predict(self, xs, output_margin=False):
        feas = self._get_feature(xs)

@ -232,7 +235,7 @@ class XGBoostCostModel(CostModel):

    def clone_new(self):
        return XGBoostCostModel(self.task, self.fea_type, self.loss_type,
-                                self.num_threads, self.verbose)
+                                self.num_threads, self.log_interval)

    def _get_feature(self, indexes):
        """get features for indexes, run extraction if we do not have a cache for them"""

@ -282,7 +285,7 @@ def _extract_itervar_feature_log(arg):
    if res.error_no == 0:
        y = inp.task.flop / np.mean(res.costs)
    else:
-        y = 0
+        y = 0.0
    return x, y

def _extract_knob_feature_index(index):

@ -301,7 +304,7 @@ def _extract_knob_feature_log(arg):
        inp.task.instantiate(config)
        y = inp.task.flop / np.mean(res.costs)
    else:
-        y = 0
+        y = 0.0
    return x, y

def _extract_curve_feature_index(index):

@ -325,12 +328,11 @@ def _extract_curve_feature_log(arg):
    if res.error_no == 0:
        y = inp.task.flop / np.mean(res.costs)
    else:
-        y = 0
+        y = 0.0
    return x, y


def custom_callback(stopping_rounds, metric, fevals, evals=(), log_file=None,
-                    save_file="xgb_checkpoint", save_every=None,
                    maximize=False, verbose_eval=True):
    """callback function for xgboost to support multiple custom evaluation functions"""
    from xgboost.core import EarlyStopException

@ -400,18 +402,12 @@ def custom_callback(stopping_rounds, metric, fevals, evals=(), log_file=None,
                continue
            infos.append("%s: %.6f" % (item[0], item[1]))

-        if not isinstance(verbose_eval, bool) and i % verbose_eval == 0:
-            logging.info("\t".join(infos))
+        if not isinstance(verbose_eval, bool) and verbose_eval and i % verbose_eval == 0:
+            logging.debug("\t".join(infos))
        if log_file:
            with open(log_file, "a") as fout:
                fout.write("\t".join(infos) + '\n')

-        ##### save model #####
-        if save_every and i % save_every == 0:
-            filename = save_file + ".%05d.bst" % i
-            logging.info("save model to %s ...", filename)
-            bst.save_model(filename)
-
        ##### choose score and do early stopping #####
        score = None
        for item in eval_res:

@ -439,7 +435,7 @@ def custom_callback(stopping_rounds, metric, fevals, evals=(), log_file=None,
        elif env.iteration - best_iteration >= stopping_rounds:
            best_msg = state['best_msg']
            if verbose_eval and env.rank == 0:
-                logging.info("Stopping. Best iteration: %s ", best_msg)
+                logging.debug("XGB stopped. Best iteration: %s ", best_msg)
            raise EarlyStopException(best_iteration)

    return callback

@ -40,16 +40,21 @@ class XGBTuner(ModelBasedTuner):
        If it is not None, the tuner will first select
        top-(plan_size * diversity_filter_ratio) candidates according to the cost model
        and then pick batch_size of them according to the diversity metric.
+    log_interval: int, optional
+        The verbose level.
+        If it is 0, output nothing.
+        Otherwise, output debug information every `log_interval` iterations.
    """
    def __init__(self, task, plan_size=32,
                 feature_type='itervar', loss_type='rank', num_threads=None,
-                 optimizer='sa', diversity_filter_ratio=None):
+                 optimizer='sa', diversity_filter_ratio=None, log_interval=50):
        cost_model = XGBoostCostModel(task,
                                      feature_type=feature_type,
                                      loss_type=loss_type,
-                                      num_threads=num_threads)
+                                      num_threads=num_threads,
+                                      log_interval=log_interval // 2)
        if optimizer == 'sa':
-            optimizer = SimulatedAnnealingOptimizer(task)
+            optimizer = SimulatedAnnealingOptimizer(task, log_interval=log_interval)
        else:
            assert isinstance(optimizer, ModelOptimizer), "Optimizer must be " \
                                                          "a supported name string" \
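A sketch of the new knob from the caller's side; note how one log_interval fans out to the cost model (halved) and the SA optimizer:

```python
from tvm.autotvm.tuner import XGBTuner

# `task` is an existing autotvm task; log every 50 optimizer iterations
tuner = XGBTuner(task, plan_size=32, feature_type='itervar',
                 loss_type='rank', log_interval=50)
```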
@ -8,6 +8,16 @@ import numpy as np

from .. import expr, ir_pass


+class EmptyContext(object):
+    """An empty context"""
+    def __enter__(self):
+        pass
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        pass
+
+
def get_rank(values):
    """get rank of items
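A small sketch of why the no-op context is useful; the surrounding names are illustrative:

```python
# pick a real dispatch context when history exists, else do nothing
ctx = history_context if have_history else EmptyContext()
with ctx:
    run_one_tuning_batch()
```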
@ -6,7 +6,7 @@ import os
import sys
import time

-def download(url, path, overwrite=False, size_compare=False):
+def download(url, path, overwrite=False, size_compare=False, verbose=1):
    """Downloads the file from the internet.
    Set the input options correctly to overwrite or do the size comparison

@ -23,9 +23,10 @@ def download(url, path, overwrite=False, size_compare=False):

    size_compare : bool, optional
        Whether to do size compare to check downloaded file.
-    """
-
-    import requests
+    verbose: int, optional
+        Verbose level
+    """
    if sys.version_info >= (3,):
        import urllib.request as urllib2
    else:

@ -33,6 +34,7 @@ def download(url, path, overwrite=False, size_compare=False):

    if os.path.isfile(path) and not overwrite:
        if size_compare:
+            import requests
            file_size = os.path.getsize(path)
            res_head = requests.head(url)
            res_get = requests.get(url, stream=True)

@ -45,7 +47,9 @@ def download(url, path, overwrite=False, size_compare=False):
            return
        print('File {} exists, skip.'.format(path))
        return
-    print('Downloading from url {} to {}'.format(url, path))
+
+    if verbose >= 1:
+        print('Downloading from url {} to {}'.format(url, path))

    # Stateful start time
    start_time = time.time()
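The new flag from the caller's side (the URL and paths are placeholders):

```python
from tvm.contrib.download import download

# verbose=0 suppresses the "Downloading from url ..." line
download("https://example.com/model.params", "/tmp/model.params",
         overwrite=False, size_compare=True, verbose=0)
```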
@ -142,3 +142,35 @@ def which(exec_name):
        if os.path.isfile(full_path) and os.access(full_path, os.X_OK):
            return full_path
    return None

+def get_lower_ir(s):
+    """Get lower ir code of a schedule.
+    This is useful for debugging, since you don't have to find all inputs/outputs
+    for a schedule in a fused subgraph.
+
+    Parameters
+    ----------
+    s: Schedule
+
+    Returns
+    -------
+    ir: str
+        The lower ir
+    """
+    from .. import tensor
+    from ..build_module import lower
+
+    outputs = s.outputs
+
+    inputs = []
+    def find_all(op):
+        if isinstance(op, tensor.PlaceholderOp):
+            inputs.append(op.output(0))
+        else:
+            for x in op.input_tensors:
+                find_all(x.op)
+
+    for out in outputs:
+        find_all(out)
+
+    return lower(s, inputs, simple_mode=True)
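A usage sketch for the helper above, using the era's schedule API; the import location is assumed from the surrounding hunk (the same file that defines which()):

```python
import tvm
from tvm.contrib.util import get_lower_ir  # assumed module path

A = tvm.placeholder((1024,), name='A')
B = tvm.compute((1024,), lambda i: A[i] + 1, name='B')
s = tvm.create_schedule(B.op)
print(get_lower_ir(s))  # lowered IR without hand-listing inputs
```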
@ -0,0 +1,44 @@
# pylint: disable=invalid-name
"""Pick best log entries from a large file and store them to a small file"""

import argparse
import os
import logging
import warnings

from .. import autotvm

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--act", type=str, choices=['pick-best'],
                        help="The action")
    parser.add_argument("--i", type=str, help="The input file or directory")
    parser.add_argument("--o", type=str, help="The output file")

    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO)

    if args.act == 'pick-best':
        if os.path.isfile(args.i):
            args.o = args.o or args.i + ".best.log"
            autotvm.record.pick_best(args.i, args.o)
        elif os.path.isdir(args.i):
            args.o = args.o or "best.log"
            tmp_filename = args.o + ".tmp"

            with open(tmp_filename, 'w') as tmp_fout:
                for filename in os.listdir(args.i):
                    if filename.endswith(".log"):
                        try:
                            # join with the directory so paths resolve
                            # regardless of the working directory
                            autotvm.record.pick_best(os.path.join(args.i, filename), tmp_fout)
                        except Exception:  # pylint: disable=broad-except
                            warnings.warn("Ignore invalid file %s" % filename)

            logging.info("Run final filter...")
            autotvm.record.pick_best(tmp_filename, args.o)
            os.remove(tmp_filename)
            logging.info("Output to %s ...", args.o)
        else:
            raise ValueError("Invalid input file: " + args.i)
    else:
        raise ValueError("Invalid action " + args.act)
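The same operation is available as a library call, which the tool above wraps; a sketch:

```python
from tvm import autotvm

# condense a large tuning log into only the best entry per workload
autotvm.record.pick_best("tune_all.log", "tune_all.best.log")
```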
@ -40,20 +40,21 @@ if __name__ == "__main__":
                        help='The port of the RPC')
    parser.add_argument('--port-end', type=int, default=9199,
                        help='The end search port of the RPC')
+    parser.add_argument('--tracker', type=str,
+                        help="The address of RPC tracker in host:port format. "
+                             "e.g. (10.77.1.234:9190)")
    parser.add_argument('--key', type=str, default="",
-                        help="RPC key used to identify the connection type.")
+                        help="The key used to identify the device type in tracker.")
-    parser.add_argument('--load-library', type=str, default="",
+    parser.add_argument('--silent', action='store_true',
+                        help="Whether run in silent mode.")
+    parser.add_argument('--load-library', type=str,
                        help="Additional library to load")
-    parser.add_argument('--tracker', type=str, default="",
-                        help="Report to RPC tracker")
    parser.add_argument('--no-fork', dest='fork', action='store_false',
                        help="Use spawn mode to avoid fork. This option \
                        is able to avoid potential fork problems with Metal, OpenCL \
                        and ROCM compilers.")
    parser.add_argument('--custom-addr', type=str,
                        help="Custom IP Address to Report to RPC Tracker")
-    parser.add_argument('--silent', action='store_true',
-                        help="Whether run in silent mode.")

    parser.set_defaults(fork=True)
    args = parser.parse_args()
@ -6,13 +6,12 @@ import logging
import argparse
import multiprocessing
import sys
-from ..rpc.tracker import Tracker
+from .. import rpc


def main(args):
    """Main function"""
-    tracker = Tracker(args.host, port=args.port, port_end=args.port_end,
-                      silent=args.silent)
+    tracker = rpc.Tracker(args.host, port=args.port, port_end=args.port_end,
+                          silent=args.silent)
    tracker.proc.join()
@ -0,0 +1,36 @@
# pylint: disable=invalid-name
"""Download pre-tuned parameters of ops"""

import argparse
import logging

from ..autotvm.tophub import list_packages, download_package

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--download", type=str, nargs='+',
                        help="Target to download. Use 'all' to download for all targets")
    parser.add_argument("-l", "--list", action='store_true', help="List available packages")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)

    if args.list:
        info = list_packages()
        print("\n%-20s %-20s" % ("Target", "Size"))
        print("-" * 41)
        for target, pkg_info in info:
            print("%-20s %-20s" % (target, "%.2f MB" % (pkg_info['size']/1000000)))

    if args.download:
        info = list_packages()
        all_targets = [x[0] for x in info]
        if 'all' in args.download:
            targets = all_targets
        else:
            targets = args.download

        for t in targets:
            if t not in all_targets:
                print("Warning : cannot find tuned parameters of " + t + ". (ignored)")
                continue
            download_package(t)
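The two flags map onto plain library calls, sketched here:

```python
from tvm.autotvm.tophub import list_packages, download_package

for tgt, meta in list_packages():
    print(tgt, "%.2f MB" % (meta['size'] / 1e6))
download_package('arm_cpu')   # fetch one package explicitly
```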
@ -10,4 +10,6 @@ upload and run remote RPC server, get the result back to verify correctness.
"""

from .server import Server
+from .tracker import Tracker
+from .proxy import Proxy
from .client import RPCSession, LocalSession, TrackerSession, connect, connect_tracker
@ -225,18 +225,24 @@ class TrackerSession(object):
            res += item["key"] + "\n"
        res += "----------------------------\n"
        res += "\n"
-        res += "Queue Status\n"
-        res += "----------------------------\n"
-        res += "key\tfree\tpending\n"
-        res += "----------------------------\n"
+
+        # compute max length of device key
        queue_info = data['queue_info']
        keys = list(queue_info.keys())
        if keys:
            keys.sort()
            max_key_len = max([len(k) for k in keys])
-            for k in keys:
-                res += ("%%-%d" % max_key_len + "s\t%d\t%g\n") % \
-                       (k, queue_info[k]["free"], queue_info[k]["pending"])
+        else:
+            max_key_len = 0
+
+        res += "Queue Status\n"
+        res += "----------------------------\n"
+        res += ("%%-%ds" % max_key_len + "\tfree\tpending\n") % 'key'
+        res += "----------------------------\n"
+        for k in keys:
+            res += ("%%-%ds" % max_key_len + "\t%d\t%g\n") % \
+                   (k, queue_info[k]["free"], queue_info[k]["pending"])
+
        res += "----------------------------\n"
        return res
@ -460,6 +460,10 @@ class Proxy(object):
    timeout_server : float, optional
        Timeout of server until it sees a matching connection.

+    tracker_addr: Tuple (str, int), optional
+        The address of RPC Tracker in tuple (host, port) format.
+        If it is not None, the server will register itself to the tracker.
+
    index_page : str, optional
        Path to an index page that can be used to display at proxy index.
@ -20,6 +20,7 @@ import multiprocessing
import subprocess
import time
import sys
+import signal

from .._ffi.function import register_func
from .._ffi.base import py_str

@ -257,7 +258,7 @@ def _popen(cmd):


class Server(object):
-    """Start RPC server on a seperate process.
+    """Start RPC server on a separate process.

    This is a simple python implementation based on multi-processing.
    It is also possible to implement a similar C based server with

@ -284,14 +285,21 @@ class Server(object):
        This is recommended to switch on if we want to do local RPC demonstration
        for GPU devices to avoid fork safety issues.

-    silent: bool, optional
-        Whether run this server in silent mode.
+    tracker_addr: Tuple (str, int), optional
+        The address of RPC Tracker in tuple (host, port) format.
+        If it is not None, the server will register itself to the tracker.

    key : str, optional
-        The key used to identify the server in Proxy connection.
+        The key used to identify the device type in tracker.

    load_library : str, optional
        List of additional libraries to be loaded during execution.

+    custom_addr: str, optional
+        Custom IP Address to Report to RPC Tracker
+
+    silent: bool, optional
+        Whether run this server in silent mode.
    """
    def __init__(self,
                 host,

@ -299,11 +307,11 @@ class Server(object):
                 port_end=9199,
                 is_proxy=False,
                 use_popen=False,
-                 silent=False,
                 tracker_addr=None,
                 key="",
                 load_library=None,
-                 custom_addr=None):
+                 custom_addr=None,
+                 silent=False):
        try:
            if base._ServerLoop is None:
                raise RuntimeError("Please compile with USE_RPC=1")

@ -313,6 +321,7 @@ class Server(object):
        self.port = port
        self.libs = []
        self.custom_addr = custom_addr
+        self.use_popen = use_popen

        self.logger = logging.getLogger("RPCServer")
        if silent:

@ -334,10 +343,7 @@ class Server(object):
            if silent:
                cmd += ["--silent"]

-            self.proc = multiprocessing.Process(
-                target=subprocess.check_call, args=(cmd,))
-            self.proc.deamon = True
-            self.proc.start()
+            self.proc = subprocess.Popen(cmd, preexec_fn=os.setsid)
            time.sleep(0.5)
        elif not is_proxy:
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

@ -371,9 +377,14 @@ class Server(object):

    def terminate(self):
        """Terminate the server process"""
-        if self.proc:
-            self.proc.terminate()
-            self.proc = None
+        if self.use_popen:
+            if self.proc:
+                os.killpg(self.proc.pid, signal.SIGTERM)
+                self.proc = None
+        else:
+            if self.proc:
+                self.proc.terminate()
+                self.proc = None

    def __del__(self):
        self.terminate()
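Starting a server that registers with a tracker, under the new argument order; the addresses and key are placeholders:

```python
from tvm import rpc

server = rpc.Server('0.0.0.0', port=9090,
                    tracker_addr=('10.77.1.123', 9190),
                    key='rk3399')
# ... later, shut it down
server.terminate()
```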
@ -40,6 +40,8 @@ We can also use other specific function in this module to create specific target
"""
from __future__ import absolute_import

+import warnings
+
from ._ffi.base import _LIB_NAME
from ._ffi.node import NodeBase, register_node
from . import _api_internal

@ -51,7 +53,6 @@ except ImportError as err_msg:
    if _LIB_NAME != "libtvm_runtime.so":
        raise err_msg

-
def _merge_opts(opts, new_opts):
    """Helper function to merge options"""
    if isinstance(new_opts, str):

@ -72,7 +73,7 @@ class Target(NodeBase):
    Do not use the class constructor; you can create a target using the following functions

    - :any:`tvm.target.create` create target from string
-    - :any:`tvm.target.rasp` create raspberry pi target
+    - :any:`tvm.target.arm_cpu` create arm_cpu target
    - :any:`tvm.target.cuda` create CUDA target
    - :any:`tvm.target.rocm` create ROCM target
    - :any:`tvm.target.mali` create Mali target

@ -374,22 +375,6 @@ def rocm(options=None):
    return _api_internal._TargetCreate("rocm", *options)


-def rasp(options=None):
-    """Returns a rasp target.
-
-    Parameters
-    ----------
-    options : str or list of str
-        Additional options
-    """
-    opts = ["-device=rasp",
-            "-mtriple=armv7l-none-linux-gnueabihf",
-            "-mcpu=cortex-a53",
-            "-mattr=+neon"]
-    opts = _merge_opts(opts, options)
-    return _api_internal._TargetCreate("llvm", *opts)
-
-
def mali(options=None):
    """Returns an ARM Mali GPU target.

@ -428,6 +413,52 @@ def opengl(options=None):
    return _api_internal._TargetCreate("opengl", *options)


+def arm_cpu(model='unknown', options=None):
+    """Returns an ARM CPU target.
+    This function will also download pre-tuned op parameters when there are none.
+
+    Parameters
+    ----------
+    model: str
+        SoC name or phone name of the arm board.
+    options : str or list of str
+        Additional options
+    """
+    from . import autotvm
+
+    trans_table = {
+        "pixel2":    ["-model=snapdragon835", "-target=arm64-linux-android"],
+        "mate10":    ["-model=kirin970", "-target=arm64-linux-android"],
+        "mate10pro": ["-model=kirin970", "-target=arm64-linux-android"],
+        "p20":       ["-model=kirin970", "-target=arm64-linux-android"],
+        "p20pro":    ["-model=kirin970", "-target=arm64-linux-android"],
+        "rasp3b":    ["-model=bcm2837", "-target=armv7l-linux-gnueabihf"],
+        "rk3399":    ["-model=rk3399", "-target=aarch64-linux-gnu"],
+        "pynq":      ["-model=pynq", "-target=armv7a-linux-eabi"],
+    }
+    pre_defined_opt = trans_table.get(model, ["-model=%s" % model])
+
+    # download pre-tuned parameters for arm_cpu if there are not any.
+    autotvm.tophub.check_package('arm_cpu')
+
+    opts = ["-device=arm_cpu"] + pre_defined_opt
+    opts = _merge_opts(opts, options)
+    return _api_internal._TargetCreate("llvm", *opts)
+
+
+def rasp(options=None):
+    """Return a Raspberry Pi 3b target.
+
+    Parameters
+    ----------
+    options : str or list of str
+        Additional options
+    """
+    warnings.warn('tvm.target.rasp() is going to be deprecated. '
+                  'Please use tvm.target.arm_cpu("rasp3b")')
+    return arm_cpu('rasp3b', options)
+
+
def create(target_str):
    """Get a target given target string.
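The new helper in use; 'rk3399' is one of the keys in trans_table above:

```python
import tvm

tgt = tvm.target.arm_cpu('rk3399')
# roughly equivalent to creating the target from the string
# 'llvm -device=arm_cpu -model=rk3399 -target=aarch64-linux-gnu',
# except that the helper also triggers the tophub package check
```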
@ -261,15 +261,6 @@ Target metal(const std::vector<std::string>& options) {
  return CreateTarget("metal", options);
}

-Target rasp(const std::vector<std::string>& options) {
-  return CreateTarget("llvm", MergeOptions(options, {
-    "-device=rasp",
-    "-mtriple=armv7l-none-linux-gnueabihf",
-    "-mcpu=cortex-a53",
-    "-mattr=+neon"
-  }));
-}
-
Target mali(const std::vector<std::string>& options) {
  return CreateTarget("opencl", MergeOptions(options, {
    "-device=mali"

@ -731,11 +722,6 @@ TVM_REGISTER_API("_GetCurrentTarget")
TVM_REGISTER_API("_EnterTargetScope")
.set_body([](TVMArgs args, TVMRetValue* ret) {
  Target target = args[0];
-  auto current = Target::current_target();
-  if (current.defined() && target->str() != current->str()) {
-    LOG(WARNING) << "Overriding target " << current->str()
-                 << " with new target scope " << target->str();
-  }
  Target::EnterTargetScope(target);
  });
@ -13,7 +13,6 @@ Module OpenCLModuleCreate(
    std::string fmt,
    std::unordered_map<std::string, FunctionInfo> fmap,
    std::string source) {
-  LOG(WARNING) << "OpenCL runtime not enabled, return a source module...";
  return codegen::DeviceSourceModuleCreate(data, fmt, fmap, "opencl");
}
@@ -108,19 +108,19 @@ def test_task_tuner_without_measurement():
     """test task and tuner without measurement"""
     task, target = get_sample_task()
 
-    def measure_batch(inputs):
+    def custom_measure(input_pack, build_func, build_args, number, repeat,
+                       ref_input, ref_output):
         from tvm.autotvm import MeasureResult
 
         results = []
-        for inp in inputs:
+        for inp in input_pack:
             tic = time.time()
             # do nothing
             time.sleep(0.001)
             results.append(MeasureResult([time.time() - tic], 0,
                                          time.time() - tic, time.time()))
         return results
-    measure_option = autotvm.measure_option(mode='custom',
-                                            custom_measure_batch=measure_batch)
+    measure_option = autotvm.measure_option(custom_measure)
 
     logging.info("%s", task.config_space)

@@ -128,6 +128,7 @@ def test_task_tuner_without_measurement():
     for tuner_class in [autotvm.tuner.RandomTuner, autotvm.tuner.GridSearchTuner]:
         tuner = tuner_class(task)
         tuner.tune(n_trial=10, measure_option=measure_option)
+        assert tuner.best_flops > 1
 
 def test_tuning_with_measure():
     def check(target, target_host):

@@ -140,7 +141,7 @@ def test_tuning_with_measure():
         task, target = get_sample_task(target, target_host)
         logging.info("%s", task.config_space)
 
-        measure_option = autotvm.measure_option(mode='local',
+        measure_option = autotvm.measure_option('local',
                                                 timeout=4,
                                                 number=2)

@@ -152,7 +153,8 @@ def test_tuning_with_measure():
 
 if __name__ == "__main__":
     # only print log when invoked from main
-    logging.basicConfig(level=logging.INFO)
+    logging.basicConfig(level=logging.DEBUG)
 
     test_task_tuner_without_measurement()
     test_tuning_with_measure()
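The two call styles exercised by the updated tests, as a sketch that assumes only what the tests above show (the first positional argument of `autotvm.measure_option` is now either a mode string such as `'local'` or a callable with the `custom_measure` signature):

```python
from tvm import autotvm

# built-in local mode
measure_option = autotvm.measure_option('local', timeout=4, number=2)

# custom mode: pass a callable instead of a mode string
def my_measure(input_pack, build_func, build_args, number, repeat,
               ref_input, ref_output):
    # build and time every config in input_pack, return a list of MeasureResult
    ...

measure_option = autotvm.measure_option(my_measure)
```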
@@ -47,7 +47,7 @@ def test_db_filter():
 
     batch_size = 2
 
-    measure_option = autotvm.measure_option(mode='local-nofork', timeout=2)
+    measure_option = autotvm.measure_option('local', do_fork=False, timeout=2)
     measure_batch = autotvm.measure.create_measure_batch(task, measure_option)
 
     ct = 0

@@ -72,7 +72,7 @@ def test_db_filter():
     db.flush()
 
     # First setting, memoize one input at a time, check that each is saved and replayed
-    measure_option = autotvm.measure_option(mode='local-nofork', timeout=2, replay_db=db)
+    measure_option = autotvm.measure_option('local', do_fork=False, timeout=2, replay_db=db)
     measure_batch = autotvm.measure.create_measure_batch(task, measure_option)
 
     for i in range(len(all_inputs)+1):

@@ -160,9 +160,10 @@ def test_db_save_replay():
     if not ctx.exist:
         logging.warning("Skip this test because there is no supported device for test")
 
-    measure_option = autotvm.measure_option(mode='local-nofork',
+    measure_option = autotvm.measure_option('local',
+                                            do_fork=False,
                                             timeout=2,
-                                            replay_db=_db, save_to_replay_db=True)
+                                            replay_db=_db)
     measure_batch = autotvm.measure.create_measure_batch(task, measure_option)
 
     batch_size = 2

@@ -182,6 +183,8 @@ def test_db_save_replay():
         results = measure_batch(inputs)
         all_results += results
         ct += 1
+    callback = autotvm.callback.log_to_database(_db)
+    callback(None, all_inputs, all_results)
 
     assert len(_db.db.keys()) == batch_size * TRIAL_LIMIT, \
         "%d vs %d" % (len(_db.db.keys()), batch_size * TRIAL_LIMIT)

@@ -207,7 +210,7 @@ def test_check_hashmismatch():
     if not ctx.exist:
         logging.warning("Skip this test because there is no supported device for test")
 
-    measure_option = autotvm.measure_option(mode='local-nofork')
+    measure_option = autotvm.measure_option('local', do_fork=False)
     measure_batch = autotvm.measure.create_measure_batch(task, measure_option)
 
     inputs = list()
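A condensed sketch of the save/replay flow these tests now follow (names such as `task`, `_db`, `all_inputs`, `all_results` are the tests' own fixtures, not new API):

```python
from tvm import autotvm

# replay measurements from a record database instead of re-running on hardware
measure_option = autotvm.measure_option('local', do_fork=False,
                                        timeout=2, replay_db=_db)
measure_batch = autotvm.measure.create_measure_batch(task, measure_option)

# saving results is now an explicit callback, replacing save_to_replay_db=True
callback = autotvm.callback.log_to_database(_db)
callback(None, all_inputs, all_results)
```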
@@ -84,7 +84,7 @@ def test_feature_shape():
     targets = [
         tvm.target.cuda(),
         tvm.target.mali(),
-        tvm.target.rasp(),
+        tvm.target.arm_cpu(),
     ]
 
     for target in targets:

@@ -28,7 +28,7 @@ def test_target_dispatch():
     with tvm.target.create("cuda"):
         assert mygeneric(1) == 3
 
-    with tvm.target.rasp():
+    with tvm.target.arm_cpu():
         assert mygeneric(1) == 11
 
     with tvm.target.create("metal"):
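For reference, a self-contained sketch of the generic-function dispatch this test exercises, with an override registered under the new `arm_cpu` target key (the `+10` mirrors the `mygeneric(1) == 11` assertion above; the function names are illustrative):

```python
import tvm

@tvm.target.generic_func
def mygeneric(data):
    # default implementation
    return data + 1

@mygeneric.register(["arm_cpu"])
def arm_cpu_impl(data):
    return data + 10

with tvm.target.arm_cpu():
    assert mygeneric(1) == 11  # the arm_cpu override wins inside the scope
```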
@@ -2,6 +2,9 @@
 
 export PYTHONPATH=python:nnvm/python:vta/python:topi/python
 
+rm -rf python/tvm/*.pyc python/tvm/*/*.pyc python/tvm/*/*/*.pyc python/tvm/*/*/*/*.pyc
+rm -rf ~/.tvm
+
 echo "Running unittest..."
 python -m nose -v vta/tests/python/unittest || exit -1
 python3 -m nose -v vta/tests/python/unittest || exit -1
@@ -24,7 +24,7 @@ from .broadcast import *
 from . import nn
 from . import x86
 from . import cuda
-from . import rasp
+from . import arm_cpu
 from . import mali
 from . import intel_graphics
 from . import opengl
@@ -0,0 +1,5 @@
"""Schedule for ARM CPU"""

from . import conv2d
from . import depthwise_conv2d
from . import bitserial_conv2d
@@ -43,7 +43,7 @@ _QUANTIZED_SCHEDULES_NCHW = [
     SpatialPackNCHW(1, 1, 8, 1, 16),
 ]
 
-@_get_schedule.register("rasp")
+@_get_schedule.register("arm_cpu")
 def _get_schedule_bitserial_conv2d(wkl, layout):
     if wkl not in _WORKLOADS:
         raise ValueError("no schedule for such workload: {}".format(wkl))

@@ -55,7 +55,7 @@ def _get_schedule_bitserial_conv2d(wkl, layout):
     return sch
 
 
-@bitserial_conv2d.register("rasp")
+@bitserial_conv2d.register("arm_cpu")
 def _declaration_bitserial_conv2d(data, kernel, stride, padding, activation_bits, weight_bits,
                                   layout='NCHW', pack_dtype=None, out_dtype=None, dorefa=False):
     if out_dtype is None:

@@ -323,7 +323,7 @@ def _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec,
     s = s.normalize()
     return s
 
-@generic.schedule_bitserial_conv2d_nhwc.register(["rasp"])
+@generic.schedule_bitserial_conv2d_nhwc.register(["arm_cpu"])
 def schedule_bitserial_conv2d_nhwc(outs):
     """Raspberry Pi schedule for bitserial conv2d"""
     s = tvm.create_schedule([x.op for x in outs])
@@ -0,0 +1,515 @@
# pylint: disable=invalid-name,unused-variable,no-else-return
"""Conv2D schedule for ARM CPU"""
from __future__ import absolute_import as _abs

import numpy as np

import tvm
from tvm import autotvm

from ..generic import schedule_conv2d_nchw, schedule_conv2d_winograd_without_weight_transform
from ..util import traverse_inline, get_const_tuple, const_matrix
from ..nn import pad, conv2d, conv2d_alter_layout, conv2d_winograd_without_weight_transform
from ..nn.util import get_const_int, get_pad_tuple

def _conv_arg_to_workload(data, kernel, strides, padding, layout, out_dtype):
    """convert argument to workload"""
    if len(kernel.shape) == 4:
        raw_kernel = kernel
    else:  # the input kernel is transformed by alter_op_layout
        shape = get_const_tuple(kernel.shape)
        raw_kernel = tvm.placeholder((shape[0] * shape[4], shape[1], shape[2], shape[3]),
                                     dtype=kernel.dtype)
    return ('conv2d', ) + autotvm.task.args_to_workload(
        [data, raw_kernel, strides, padding, layout, out_dtype])

@conv2d.register('arm_cpu')
@autotvm.task.dispatcher
def conv2d_arm_cpu(data, kernel, strides, padding, layout, out_dtype):
    """TOPI compute callback. Mark this function as a dispatcher, so
    this template can assign config according to workload"""
    return _conv_arg_to_workload(data, kernel, strides, padding, layout, out_dtype)

@conv2d_arm_cpu.register(['direct'])
def decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype):
    """spatial packing template"""
    return _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, num_tile=2)

@autotvm.task.register_topi_schedule(schedule_conv2d_nchw, 'arm_cpu', ['direct', 'winograd'])
def schedule_conv2d_nchw_arm_cpu(cfg, outs):
    """TOPI schedule callback"""
    s = tvm.create_schedule([x.op for x in outs])

    def _callback(op):
        # schedule conv2d
        if 'spatial_conv_output' in op.tag:
            output = op.output(0)
            conv = op.input_tensors[0]

            data_vec = conv.op.input_tensors[0]
            data_pad = data_vec.op.input_tensors[0]
            s[data_pad].compute_inline()

            kernel_vec = conv.op.input_tensors[1]
            if kernel_vec.op.name == 'kernel_vec':
                kernel = kernel_vec.op.input_tensors[0]
            else:
                kernel = kernel_vec
            if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
                s[kernel].compute_inline()

            _schedule_spatial_pack(cfg, s, data_vec, kernel_vec, conv, output, outs[0])

        if 'winograd_conv_output' in op.tag:
            output = op.output(0)
            _schedule_winograd(cfg, s, output, outs[0])

    traverse_inline(s, outs[0].op, _callback)
    return s


def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, num_tile):
    assert layout == "NCHW", "Only support NCHW"
    out_dtype = out_dtype or data.dtype

    _, CI, IH, IW = get_const_tuple(data.shape)
    if len(kernel.shape) == 4:
        pre_packed = False
        CO, _, KH, KW = get_const_tuple(kernel.shape)
    else:  # kernel tensor is pre packed
        pre_packed = True
        CO, _, KH, KW, VC = get_const_tuple(kernel.shape)
        CO = CO * VC

    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, (KH, KW))
    HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)

    N = 1
    OH = (IH + pad_top + pad_down - KH) // HSTR + 1
    OW = (IW + pad_left + pad_right - KW) // WSTR + 1
    data_pad = pad(data, [0, 0, pad_top, pad_left], [0, 0, pad_down, pad_right])

    # ==================== define configuration space ====================
    n, co, oh, ow = cfg.axis(N), cfg.axis(CO), cfg.axis(OH), cfg.axis(OW)
    ci, kh, kw = cfg.reduce_axis(CI), cfg.reduce_axis(KH), cfg.reduce_axis(KW)

    if num_tile == 2:     # for arm cpu
        co, vc = cfg.define_split('tile_co', co, num_outputs=2)
        oh, vh = cfg.define_split('tile_oh', oh, num_outputs=2)
        ow, vw = cfg.define_split('tile_ow', ow, num_outputs=2)
    elif num_tile == 3:   # for mali gpu
        co, _, vc = cfg.define_split('tile_co', co, num_outputs=3)
        oh, _, vh = cfg.define_split('tile_oh', oh, num_outputs=3)
        ow, _, vw = cfg.define_split('tile_ow', ow, num_outputs=3)
    else:
        raise RuntimeError("Invalid num_tile")

    cfg.define_reorder("reorder_0",
                       [n, co, oh, ow, ci, kh, kw, vh, vw, vc],
                       policy='candidate', candidate=[
                           [n, co, oh, ow, ci, kh, kw, vh, vw, vc],
                           [n, co, oh, ow, ci, kh, kw, vc, vh, vw]])

    cfg.define_annotate("ann_reduce", [kh, kw], policy='try_unroll')
    cfg.define_annotate("ann_spatial", [vh, vw, vc], policy='try_unroll_vec')
    # ====================================================================

    VC = cfg["tile_co"].size[-1]
    VH = cfg["tile_oh"].size[-1]
    VW = cfg["tile_ow"].size[-1]

    dvshape = (N, OH // VH, OW // VW, CI, VH*HSTR + KH-1, VW*WSTR + KW-1)
    kvshape = (CO // VC, CI, KH, KW, VC)
    ovshape = (N, CO // VC, OH // VH, OW // VW, VH, VW, VC)
    oshape = (N, CO, OH, OW)

    data_vec = tvm.compute(dvshape, lambda n, h, w, ci, vh, vw:
                           data_pad[n][ci][h*VH*HSTR+vh][w*VW*WSTR+vw],
                           name='data_vec')

    if pre_packed:
        kernel_vec = kernel
    else:
        kernel_vec = tvm.compute(kvshape, lambda co, ci, kh, kw, vc:
                                 kernel[co*VC+vc][ci][kh][kw],
                                 name='kernel_vec')

    ci = tvm.reduce_axis((0, CI), name='ci')
    kh = tvm.reduce_axis((0, KH), name='kh')
    kw = tvm.reduce_axis((0, KW), name='kw')

    conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \
        tvm.sum(data_vec[n, h, w, ci, vh*HSTR+kh, vw*WSTR+kw].astype(out_dtype) *
                kernel_vec[co, ci, kh, kw, vc].astype(out_dtype),
                axis=[ci, kh, kw]), name='conv')

    output = tvm.compute(oshape, lambda n, co, h, w:
                         conv[n][co//VC][h//VH][w//VW][h%VH][w%VW][co%VC],
                         name='output_unpack', tag='spatial_conv_output',
                         attrs={'workload': _conv_arg_to_workload(data, kernel, strides, padding,
                                                                  layout, out_dtype)})
    return output

def _schedule_spatial_pack(cfg, s, data_vec, kernel_vec,
                           conv, output, last):
    """schedule implementation"""
    n, co, oh, ow, vh, vw, vc = s[conv].op.axis
    ci, kh, kw = s[conv].op.reduce_axis

    # schedule conv
    cfg["reorder_0"].apply(s, conv, [n, co, oh, ow, ci, kh, kw, vh, vw, vc])
    cfg["ann_reduce"].apply(s, conv, [kh, kw],
                            axis_lens=[get_const_int(kh.dom.extent),
                                       get_const_int(kw.dom.extent)],
                            max_unroll=16,
                            cfg=cfg)
    cfg["ann_spatial"].apply(s, conv, [vh, vw, vc],
                             axis_lens=[cfg['tile_oh'].size[-1],
                                        cfg['tile_ow'].size[-1],
                                        cfg['tile_co'].size[-1]],
                             max_unroll=16,
                             cfg=cfg)

    # schedule fusion
    n, co, h, w = s[last].op.axis
    co, vc = cfg['tile_co'].apply(s, last, co)
    oh, vh = cfg['tile_oh'].apply(s, last, h)
    ow, vw = cfg['tile_ow'].apply(s, last, w)
    s[last].reorder(n, co, oh, ow, vh, vw, vc)
    if last != output:
        s[output].compute_inline()
        cfg["ann_spatial"].apply(s, last, [vh, vw, vc],
                                 axis_lens=[cfg['tile_oh'].size[-1],
                                            cfg['tile_ow'].size[-1],
                                            cfg['tile_co'].size[-1]],
                                 max_unroll=16,
                                 cfg=cfg)
    s[conv].compute_at(s[last], ow)

    # mark parallel
    s[last].parallel(co)

    _, h, _, _, _, _ = s[data_vec].op.axis
    s[data_vec].parallel(h)

    if kernel_vec.op.name == 'kernel_vec':
        co, _, _, _, _ = s[kernel_vec].op.axis
        if autotvm.GLOBAL_SCOPE.in_tuning:
            # kernel packing will be pre-computed during compilation, so we skip
            # this part to make tuning records correct
            s[kernel_vec].pragma(co, 'debug_skip_region')
        else:
            s[kernel_vec].parallel(co)

    return s


@conv2d_arm_cpu.register('winograd')
def decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype):
    tile_size = 4
    return _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype, tile_size)

def _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype, tile_size):
    N, CI, IH, IW = get_const_tuple(data.shape)
    if len(kernel.shape) == 4:
        pre_computed = False
        CO, _, KH, KW = get_const_tuple(kernel.shape)
    else:
        pre_computed = True
        H_CAT, W_CAT, CO, CI, VC = get_const_tuple(kernel.shape)
        CO *= VC
        KH, KW = H_CAT - tile_size + 1, W_CAT - tile_size + 1
    HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
    HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)

    assert layout == 'NCHW'
    assert KH == 3 and KW == 3 and HPAD == 1 and WPAD == 1 and HSTR == 1 and WSTR == 1
    data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")

    if tile_size == 4:
        G_data = np.array([
            [1 / 4.0, 0, 0],
            [-1 / 6.0, -1 / 6.0, -1 / 6.0],
            [-1 / 6.0, 1 / 6.0, -1 / 6.0],
            [1 / 24.0, 1 / 12.0, 1 / 6.0],
            [1 / 24.0, -1 / 12.0, 1 / 6.0],
            [0, 0, 1]], dtype=np.float32)

        B_data = np.array([
            [4, 0, 0, 0, 0, 0],
            [0, -4, 4, -2, 2, 4],
            [-5, -4, -4, -1, -1, 0],
            [0, 1, -1, 2, -2, -5],
            [1, 1, 1, 1, 1, 0],
            [0, 0, 0, 0, 0, 1]], out_dtype)

        A_data = np.array([
            [1, 0, 0, 0],
            [1, 1, 1, 1],
            [1, -1, 1, -1],
            [1, 2, 4, 8],
            [1, -2, 4, -8],
            [0, 0, 0, 1]], out_dtype)
    elif tile_size == 2:
        G_data = np.array([
            [1, 0, 0],
            [1.0/2, 1.0/2, 1.0/2],
            [1.0/2, -1.0/2, 1.0/2],
            [0, 0, 1]], np.float32)

        B_data = np.array([
            [1, 0, 0, 0],
            [0, 1, -1, 1],
            [-1, 1, 1, 0],
            [0, 0, 0, -1]], out_dtype)

        A_data = np.array([
            [1, 0],
            [1, 1],
            [1, -1],
            [0, -1]], out_dtype)
    else:
        raise ValueError("Unsupported tile size for winograd: " + str(tile_size))

    m = A_data.shape[1]
    r = 3
    alpha = m + r - 1
    K = CO
    C = CI

    H = (IH + 2 * HPAD - 3) // HSTR + 1
    W = (IW + 2 * WPAD - 3) // WSTR + 1
    nH, nW = (H + m-1) // m, (W + m-1) // m
    P = N * nH * nW

    cfg.define_split('tile_p', cfg.axis(P), num_outputs=2, filter=lambda x: x.size[-1] <= 16)
    cfg.define_split('tile_k', cfg.axis(K), num_outputs=2, filter=lambda x: x.size[-1] <= 16)
    VP = cfg['tile_p'].size[-1]
    VK = cfg['tile_k'].size[-1]

    # pack input tile
    input_tile = tvm.compute((C, P // VP, alpha, alpha, VP),
                             lambda c, b, eps, nu, bb:
                             data_pad[(b*VP+bb) // (nH*nW)][c][(b*VP+bb) // nW % nH * m + eps]
                             [(b*VP+bb) % nW * m + nu],
                             name='d')

    # transform kernel
    if pre_computed:
        U = kernel
    else:
        G = const_matrix(G_data, 'G')
        r_kh = tvm.reduce_axis((0, KH), 'r_kh')
        r_kw = tvm.reduce_axis((0, KW), 'r_kw')
        U = tvm.compute((alpha, alpha, K // VK, C, VK), lambda eps, nu, k, c, kk:
                        tvm.sum(kernel[k * VK + kk][c][r_kh][r_kw].astype(out_dtype) *
                                G[eps][r_kh] * G[nu][r_kw], axis=[r_kh, r_kw]), name='U')

    # transform image
    B = const_matrix(B_data, 'B')
    r_eps = tvm.reduce_axis((0, alpha), 'r_eps')
    r_nu = tvm.reduce_axis((0, alpha), 'r_nu')
    V = tvm.compute((alpha, alpha, P // VP, C, VP), lambda eps, nu, b, c, bb:
                    tvm.sum(input_tile[c][b][r_eps][r_nu][bb].astype(out_dtype) *
                            B[r_eps][eps] * B[r_nu][nu], axis=[r_eps, r_nu]), name='V')

    # batch gemm
    c = tvm.reduce_axis((0, C), name='c')
    M = tvm.compute((alpha, alpha, K, P), lambda eps, nu, k, b:
                    tvm.sum(U[eps][nu][k // VK][c][k % VK] *
                            V[eps][nu][b // VP][c][b % VP], axis=c), name='M')

    # inverse transform
    A = const_matrix(A_data, 'A')
    r_eps = tvm.reduce_axis((0, alpha), 'r_eps')
    r_nu = tvm.reduce_axis((0, alpha), 'r_nu')
    Y = tvm.compute((K, P, m, m), lambda k, b, vh, vw:
                    tvm.sum(M[r_eps][r_nu][k][b] * A[r_eps][vh] * A[r_nu][vw],
                            axis=[r_eps, r_nu]), name='Y')

    # unpack output
    output = tvm.compute((N, K, H, W), lambda n, k, h, w:
                         Y[k][n * nH * nW + (h//m) * nW + w//m][h % m][w % m],
                         name='output', tag='winograd_conv_output',
                         attrs={'workload': _winograd_conv_arg_to_workload(
                             data, kernel, strides, padding, layout, out_dtype, tile_size)})

    # we have to manually assign effective GFLOP for winograd
    cfg.add_flop(2 * N * K * H * W * KH * KW * C)
    return output

def _schedule_winograd(cfg, s, output, last):
    Y = output.op.input_tensors[0]
    M, A = Y.op.input_tensors
    U, V = M.op.input_tensors
    d, B = V.op.input_tensors
    data_pad = d.op.input_tensors[0]

    # padding
    s[data_pad].compute_inline()

    # pack input tiles
    s[d].compute_inline()

    # transform kernel
    if isinstance(U.op, tvm.tensor.ComputeOp):
        kernel, G = U.op.input_tensors
        s[G].compute_inline()
        eps, nu, k, c, kk, = s[U].op.axis
        r_kh, r_kw = s[U].op.reduce_axis
        s[U].reorder(k, c, eps, nu, r_kh, r_kw, kk)
        s[U].unroll(eps)
        s[U].unroll(nu)
        s[U].unroll(r_kh)
        s[U].unroll(r_kw)
        s[U].vectorize(kk)
        if autotvm.GLOBAL_SCOPE.in_tuning:
            # kernel transformation will be pre-computed during compilation, so we skip
            # this part to make tuning records correct
            s[U].pragma(k, 'debug_skip_region')
        else:
            s[U].parallel(k)

    # transform image
    DD = s.cache_read(d, 'global', [V])
    s[B].compute_inline()
    eps, nu, b, c, bb = s[V].op.axis
    r_eps, r_nu = s[V].op.reduce_axis
    s[V].reorder(b, c, eps, nu, r_eps, r_nu, bb)
    s[V].unroll(eps)
    s[V].unroll(nu)
    s[V].unroll(r_eps)
    s[V].unroll(r_nu)
    s[DD].compute_at(s[V], c)
    s[V].vectorize(bb)
    s[V].parallel(b)

    # batch gemm
    eps, nu, k, b = s[M].op.axis
    c = s[M].op.reduce_axis[0]
    cfg.define_split('tile_c', c, num_outputs=2, filter=lambda x: x.size[-1] <= 16)
    co, ci = cfg['tile_c'].apply(s, M, c)
    xo, xi = cfg['tile_p'].apply(s, M, b)
    s[M].reorder(eps, nu, xo, co, k, ci, xi)
    cfg.define_annotate('ann_reduce', [ci], policy='try_unroll')
    cfg.define_annotate('ann_spatial', [k, xi], policy='try_unroll_vec')
    cfg['ann_reduce'].apply(s, M, [ci],
                            axis_lens=[cfg['tile_c'].size[-1]],
                            max_unroll=16,
                            cfg=cfg)
    cfg['ann_spatial'].apply(s, M, [k, xi])

    # inverse transform
    s[A].compute_inline()
    k, b, vh, vw = s[Y].op.axis
    r_eps, r_nu = s[Y].op.reduce_axis
    s[Y].unroll(vh)
    s[Y].unroll(vw)
    s[Y].unroll(r_eps)
    s[Y].unroll(r_nu)

    # output
    n, co, h, w = s[last].op.axis
    co, coi = cfg['tile_k'].apply(s, last, co)
    s[M].compute_at(s[last], co)
    s[last].parallel(co)

    MM = s.cache_read(M, 'global', [Y])
    m = get_const_int(V.shape[0]) + 1 - 3
    ho, wo, hi, wi = s[last].tile(h, w, m, m)
    s[Y].compute_at(s[last], wo)
    s[MM].compute_at(s[last], wo)

    if output != last:
        s[output].compute_inline()


def _winograd_conv_arg_to_workload(data, kernel, strides, padding, layout, out_dtype, tile_size):
    """convert argument to workload"""
    K = 3
    shape = get_const_tuple(kernel.shape)
    alpha = tile_size + K - 1
    if len(kernel.shape) == 4:
        assert shape[2:] == (K, K)
        CO, CI = shape[:2]
    else:
        assert shape[:2] == (alpha, alpha)
        CO, CI, VCO = shape[2:]
        CO *= VCO

    raw_kernel = tvm.placeholder((CO, CI, K, K), dtype=kernel.dtype)
    return ('conv2d', ) + autotvm.task.args_to_workload(
        [data, raw_kernel, strides, padding, layout, out_dtype])


@conv2d_winograd_without_weight_transform.register(['arm_cpu'])
@autotvm.task.dispatcher
def winograd_ww_config_dispatcher_(data, kernel, strides, padding, layout, out_dtype, tile_size):
    return _winograd_conv_arg_to_workload(data, kernel, strides, padding, layout, out_dtype,
                                          tile_size)


@winograd_ww_config_dispatcher_.register(['winograd'])
def decl_winograd_ww(cfg, data, kernel, strides, padding, layout, out_dtype, tile_size):
    return _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype,
                          tile_size)


@autotvm.task.register_topi_schedule(schedule_conv2d_winograd_without_weight_transform,
                                     'arm_cpu', ['winograd'])
def schedule_conv2d_winograd_without_weight_transform_(cfg, outs):
    """TOPI schedule callback"""
    s = tvm.create_schedule([x.op for x in outs])

    def _callback(op):
        if 'winograd_conv_output' in op.tag:
            output = op.output(0)
            _schedule_winograd(cfg, s, output, outs[0])

    traverse_inline(s, outs[0].op, _callback)
    return s


@conv2d_alter_layout.register(["arm_cpu", "mali"])
def _alter_conv2d_layout(attrs, inputs, tinfos):
    """Alter op layout for pre-computing kernel transformation"""
    import nnvm.symbol as sym
    copy_inputs = [s for s in inputs]

    new_attrs = {k: attrs[k] for k in attrs.keys()}

    assert attrs.get_int_tuple("dilation") == (1, 1), "Does not support dilation " \
                                                      "when alter_op_layout is enabled"
    strides = attrs.get_int_tuple("strides")
    padding = attrs.get_int_tuple("padding")
    groups = attrs.get_int('groups')
    layout = attrs["layout"]
    out_dtype = attrs["out_dtype"]
    out_dtype = tinfos[0].dtype if out_dtype == "same" else out_dtype

    if groups == 1:
        # query config of this workload
        workload = _conv_arg_to_workload(tinfos[0], tinfos[1], strides, padding,
                                         layout, out_dtype)
        cfg = autotvm.task.DispatchContext.current.query(tvm.target.current_target(), workload)

        if cfg.template_key == 'direct':  # packing weight tensor
            new_attrs['kernel_layout'] = 'OIHW%do' % (cfg['tile_co'].size[-1])
            return sym.conv2d(*copy_inputs, **new_attrs)
        else:  # pre-compute weight transformation in winograd
            tile_size = 4

            weight = sym.contrib.conv2d_winograd_weight_transform(copy_inputs[1],
                                                                  tile_size=tile_size)
            CO, CI, KH, KW = get_const_tuple(tinfos[1].shape)
            VC = cfg['tile_k'].size[-1]
            weight = sym.reshape(weight,
                                 shape=(KH + tile_size - 1, KW + tile_size - 1, CO // VC, VC, CI))
            weight = sym.transpose(weight, axes=[0, 1, 2, 4, 3])

            copy_inputs[1] = weight
            new_attrs['tile_size'] = tile_size
            return sym.contrib.conv2d_winograd_without_weight_transform(*copy_inputs, **new_attrs)

    # do nothing for depthwise convolution
    return None
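Not part of the diff: a small numpy check of the identity the winograd template implements, Y = Aᵀ[(G g Gᵀ) ⊙ (Bᵀ d B)]A, using the F(2x2, 3x3) constant matrices from `_decl_winograd` above. The reference is a direct 'valid' cross-correlation, which is what `conv2d` computes here:

```python
import numpy as np

# F(2x2, 3x3) constants, copied from _decl_winograd
G = np.array([[1, 0, 0],
              [0.5, 0.5, 0.5],
              [0.5, -0.5, 0.5],
              [0, 0, 1]])
B = np.array([[1, 0, 0, 0],
              [0, 1, -1, 1],
              [-1, 1, 1, 0],
              [0, 0, 0, -1]])
A = np.array([[1, 0],
              [1, 1],
              [1, -1],
              [0, -1]])

rng = np.random.RandomState(0)
d = rng.randn(4, 4)  # one input tile, alpha = m + r - 1 = 4
g = rng.randn(3, 3)  # one 3x3 kernel

U = G.dot(g).dot(G.T)      # kernel transform (tensor U in the template)
V = B.T.dot(d).dot(B)      # image transform  (tensor V in the template)
Y = A.T.dot(U * V).dot(A)  # inverse transform (tensor Y in the template)

# direct 2x2 'valid' cross-correlation as reference
ref = np.array([[np.sum(d[i:i+3, j:j+3] * g) for j in range(2)]
                for i in range(2)])
assert np.allclose(Y, ref)
```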
@@ -0,0 +1,94 @@
# pylint: disable=invalid-name,unused-variable
"""Depthwise convolution schedule for ARM CPU"""

import tvm
from tvm import autotvm

from ..generic import schedule_depthwise_conv2d_nchw
from ..nn import depthwise_conv2d_nchw
from ..util import traverse_inline

# register original implementation of depthwise_conv2d_nchw since we don't need to change this part
autotvm.task.register_topi_compute(depthwise_conv2d_nchw, 'arm_cpu', 'direct',
                                   depthwise_conv2d_nchw.fdefault)

# register customized schedule for arm cpu.
@autotvm.task.register_topi_schedule(schedule_depthwise_conv2d_nchw, 'arm_cpu', 'direct')
def schedule_depthwise_conv2d_nchw_(cfg, outs):
    """Schedule depthwise conv2d"""
    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
    s = tvm.create_schedule([x.op for x in outs])

    def _schedule(cfg, s, data, data_pad, kernel, output):
        A, B, C = data, kernel, output
        s[data_pad].compute_inline()

        # define tile
        n, c, h, w = s[output].op.axis
        cfg.define_split('tile_c', c, num_outputs=2)
        cfg.define_split('tile_h', h, num_outputs=2)
        cfg.define_split('tile_w', w, num_outputs=2)

        # pack data to vector form [n, c, h, w] -> [n, C, h, w, VC]
        A0 = s.cache_read(data_pad, "global", C)
        _, c, h, w = s[A0].op.axis
        c, vc = cfg['tile_c'].apply(s, A0, c)
        s[A0].reorder(c, h, w, vc)
        A1 = s.cache_write(A0, 'global')
        s[A0].compute_inline()

        # pack kernel to vector form [co, ci, kh, kw] -> [CO, ci, kh, kw, VC]
        B0 = s.cache_read(B, "global", C)
        c, m, h, w = s[B0].op.axis
        c, vc, = cfg['tile_c'].apply(s, B0, c)
        s[B0].reorder(c, m, h, w, vc)
        B1 = s.cache_write(B0, 'global')
        s[B0].compute_inline()

        _, c, h, w = s[C].op.axis
        c, vc, = cfg['tile_c'].apply(s, C, c)
        s[C].reorder(c, h, w, vc)

        # depthwise conv
        C0 = s.cache_write(C, 'global')
        _, c, h, w, vc = s[C0].op.axis
        dh, dw = s[C0].op.reduce_axis
        oh, ih = cfg['tile_h'].apply(s, C0, h)
        ow, iw = cfg['tile_w'].apply(s, C0, w)
        s[C0].reorder(c, oh, ow, dh, dw, ih, iw, vc)
        s[A1].compute_at(s[C0], oh)

        # try unroll and vectorization
        cfg.define_annotate('ann', [ih, iw, vc], policy='try_unroll_vec')
        cfg['ann'].apply(s, C0, [ih, iw, vc],
                         axis_lens=[cfg['tile_h'].size[-1],
                                    cfg['tile_w'].size[-1],
                                    cfg['tile_c'].size[-1]],
                         max_unroll=16,
                         cfg=cfg)

        # mark parallel
        n, c, h, w = s[C].op.axis
        s[C].parallel(c)

        n, c, h, w, vc = s[C0].op.axis
        s[C0].parallel(c)

        c, m, h, w, vc = s[B1].op.axis
        s[B1].parallel(c)

        return s

    def _callback(op):
        if op.tag == 'depthwise_conv2d_nchw':
            output = op.output(0)
            kernel = op.input_tensors[1]
            data = op.input_tensors[0]
            data_pad = None
            if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag:
                data_pad = data
                data = data_pad.op.input_tensors[0]
            _schedule(cfg, s, data, data_pad, kernel, output)

    traverse_inline(s, outs[0].op, _callback)
    return s
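An illustration (plain numpy, not in the diff) of the vector packing the `cache_read`/`cache_write` stages above produce: splitting the channel axis by `VC` turns `[n, c, h, w]` into `[n, C, h, w, VC]`, so the innermost dimension is contiguous and can be vectorized:

```python
import numpy as np

VC = 4
x = np.arange(1 * 8 * 2 * 2).reshape(1, 8, 2, 2)  # NCHW, C = 8
packed = x.reshape(1, 8 // VC, VC, 2, 2).transpose(0, 1, 3, 4, 2)
print(packed.shape)  # (1, 2, 2, 2, 4) -> [n, C_outer, h, w, VC]
```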
@@ -91,6 +91,54 @@ def schedule_conv2d_NCHWc(num_filter, kernel_size, strides,
     return _default_schedule(outs, False)
 
 
+@tvm.target.generic_func
+def schedule_conv2d_winograd_weight_transform(outs):
+    """Schedule for weight transformation of winograd
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+        The computation graph description of this operator
+        in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    # Typically this is computed in nnvm PreCompute pass
+    # so we make a schedule here for cpu llvm
+    s = tvm.create_schedule([x.op for x in outs])
+    output = outs[0]
+    _, G = s[output].op.input_tensors
+    s[G].compute_inline()
+    eps, nu, co, ci = s[output].op.axis
+    r_kh, r_kw = s[output].op.reduce_axis
+    s[output].reorder(co, ci, r_kh, r_kw, eps, nu)
+    for axis in [r_kh, r_kw, eps, nu]:
+        s[output].unroll(axis)
+    s[output].parallel(co)
+    return s
+
+
+@tvm.target.generic_func
+def schedule_conv2d_winograd_without_weight_transform(outs):
+    """Schedule for winograd without weight transformation
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+        The computation graph description of this operator
+        in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return _default_schedule(outs, False)
+
+
 @tvm.target.generic_func
 def schedule_conv2d_transpose_nchw(outs):
     """Schedule for conv2d_transpose_nchw
@@ -3,78 +3,18 @@
 """Conv2D operators"""
 from __future__ import absolute_import as _abs
 from collections import namedtuple
+import numpy as np
 import tvm
 
 from .pad import pad
 from .util import get_pad_tuple
-from ..util import simplify
+from ..util import simplify, const_matrix, get_const_tuple
 
 # workload description of conv2d
 Workload = namedtuple('Workload',
                       ['in_dtype', 'out_dtype', 'height', 'width', 'in_filter', 'out_filter',
                        'hkernel', 'wkernel', 'hpad', 'wpad', 'hstride', 'wstride'])
 
-# schedule description of spatial
-SpatialPack = namedtuple('SpatialPack',
-                         ['vh', 'vw', 'vc', 'ba', 'bc', 'unroll'])
-
-# schedule description of im2col
-Im2ColPack = namedtuple('Im2ColPack',
-                        ['vp', 'vq', 'ba', 'bc', 'unroll'])
-
-_WORKLOADS = [
-    # workloads of resnet18 on imagenet
-    Workload('float32', 'float32', 224, 224, 3, 64, 7, 7, 3, 3, 2, 2),
-    Workload('float32', 'float32', 56, 56, 64, 64, 3, 3, 1, 1, 1, 1),
-    Workload('float32', 'float32', 56, 56, 64, 64, 1, 1, 0, 0, 1, 1),
-    Workload('float32', 'float32', 56, 56, 64, 128, 3, 3, 1, 1, 2, 2),
-    Workload('float32', 'float32', 56, 56, 64, 128, 1, 1, 0, 0, 2, 2),
-    Workload('float32', 'float32', 28, 28, 128, 128, 3, 3, 1, 1, 1, 1),
-    Workload('float32', 'float32', 28, 28, 128, 256, 3, 3, 1, 1, 2, 2),
-    Workload('float32', 'float32', 28, 28, 128, 256, 1, 1, 0, 0, 2, 2),
-    Workload('float32', 'float32', 14, 14, 256, 256, 3, 3, 1, 1, 1, 1),
-    Workload('float32', 'float32', 14, 14, 256, 512, 3, 3, 1, 1, 2, 2),
-    Workload('float32', 'float32', 14, 14, 256, 512, 1, 1, 0, 0, 2, 2),
-    Workload('float32', 'float32', 7, 7, 512, 512, 3, 3, 1, 1, 1, 1),
-    # workloads of mobile net on imagenet
-    Workload('float32', 'float32', 224, 224, 3, 32, 3, 3, 1, 1, 2, 2),
-    Workload('float32', 'float32', 112, 112, 32, 64, 1, 1, 0, 0, 1, 1),
-    Workload('float32', 'float32', 56, 56, 64, 128, 1, 1, 0, 0, 1, 1),
-    Workload('float32', 'float32', 56, 56, 128, 128, 1, 1, 0, 0, 1, 1),
-    Workload('float32', 'float32', 28, 28, 128, 256, 1, 1, 0, 0, 1, 1),
-    Workload('float32', 'float32', 28, 28, 256, 256, 1, 1, 0, 0, 1, 1),
-    Workload('float32', 'float32', 14, 14, 256, 512, 1, 1, 0, 0, 1, 1),
-    Workload('float32', 'float32', 14, 14, 512, 512, 1, 1, 0, 0, 1, 1),
-    Workload('float32', 'float32', 7, 7, 512, 1024, 1, 1, 0, 0, 1, 1),
-    Workload('float32', 'float32', 7, 7, 1024, 1024, 1, 1, 0, 0, 1, 1),
-    # workloads of resnet18 on imagenet (int16->int32 version)
-    Workload('int16', 'int32', 224, 224, 3, 64, 7, 7, 3, 3, 2, 2),
-    Workload('int16', 'int32', 56, 56, 64, 64, 3, 3, 1, 1, 1, 1),
-    Workload('int16', 'int32', 56, 56, 64, 64, 1, 1, 0, 0, 1, 1),
-    Workload('int16', 'int32', 56, 56, 64, 128, 3, 3, 1, 1, 2, 2),
-    Workload('int16', 'int32', 56, 56, 64, 128, 1, 1, 0, 0, 2, 2),
-    Workload('int16', 'int32', 28, 28, 128, 128, 3, 3, 1, 1, 1, 1),
-    Workload('int16', 'int32', 28, 28, 128, 256, 3, 3, 1, 1, 2, 2),
-    Workload('int16', 'int32', 28, 28, 128, 256, 1, 1, 0, 0, 2, 2),
-    Workload('int16', 'int32', 14, 14, 256, 256, 3, 3, 1, 1, 1, 1),
-    Workload('int16', 'int32', 14, 14, 256, 512, 3, 3, 1, 1, 2, 2),
-    Workload('int16', 'int32', 14, 14, 256, 512, 1, 1, 0, 0, 2, 2),
-    Workload('int16', 'int32', 7, 7, 512, 512, 3, 3, 1, 1, 1, 1),
-    # workloads of mobile net on imagenet (int16->int32 version)
-    Workload('int16', 'int32', 224, 224, 3, 32, 3, 3, 1, 1, 2, 2),
-    Workload('int16', 'int32', 112, 112, 32, 64, 1, 1, 0, 0, 1, 1),
-    Workload('int16', 'int32', 56, 56, 64, 128, 1, 1, 0, 0, 1, 1),
-    Workload('int16', 'int32', 56, 56, 128, 128, 1, 1, 0, 0, 1, 1),
-    Workload('int16', 'int32', 28, 28, 128, 256, 1, 1, 0, 0, 1, 1),
-    Workload('int16', 'int32', 28, 28, 256, 256, 1, 1, 0, 0, 1, 1),
-    Workload('int16', 'int32', 14, 14, 256, 512, 1, 1, 0, 0, 1, 1),
-    Workload('int16', 'int32', 14, 14, 512, 512, 1, 1, 0, 0, 1, 1),
-    Workload('int16', 'int32', 7, 7, 512, 1024, 1, 1, 0, 0, 1, 1),
-    Workload('int16', 'int32', 7, 7, 1024, 1024, 1, 1, 0, 0, 1, 1),
-]
-
-# platform specific schedule
-_CONV_SCHEDULE = {}
-
 @tvm.target.generic_func
 def conv2d(input, filter, strides, padding, layout='NCHW', out_dtype=None):
     """Conv2D operator.
@@ -178,137 +118,6 @@ def _get_schedule_NCHWc(wkl, layout, out_layout):
     return wkl
 
 
-def _spatial_pack(data, kernel, stride, padding, out_dtype=None):
-    """ Compute convolution with pack on spatial axes. """
-    if out_dtype is None:
-        out_dtype = data.dtype
-    assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1"
-    wkl = _get_workload(data, kernel, stride, padding, out_dtype)
-    sch = _get_schedule(wkl)
-
-    H, W = wkl.height, wkl.width
-    CI, CO = wkl.in_filter, wkl.out_filter
-    KH, KW = wkl.hkernel, wkl.wkernel
-    HPAD, WPAD = wkl.hpad, wkl.wpad
-    HSTR, WSTR = wkl.hstride, wkl.wstride
-    HCAT, WCAT = KH-1, KW-1
-
-    VH = sch.vh
-    VW = sch.vw
-    VC = sch.vc
-    UNROLL = sch.unroll
-
-    TH = H + 2*HPAD
-    TW = W + 2*WPAD
-    OH = (H + 2*HPAD - KH) // HSTR + 1
-    OW = (W + 2*WPAD - KW) // WSTR + 1
-
-    dshape = (1, CI, H, W)
-    dpshape = (1, CI, TH, TW)
-    dvshape = (1, TH//(VH*HSTR), TW//(VW*WSTR), CI, VH*HSTR+HCAT, VW*WSTR+WCAT)
-
-    kshape = (CO, CI, KH, KW)
-    kvshape = (CO/VC, CI, KH, KW, VC)
-
-    ovshape = (1, CO // VC, OH // VH, OW // VW, VH, VW, VC)
-    oshape = (1, CO, OH, OW)
-
-    DOPAD = (HPAD != 0 and WPAD != 0)
-    if DOPAD:
-        data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
-    else:
-        data_pad = data
-
-    data_vec = tvm.compute(dvshape, lambda n, h, w, ci, vh, vw: \
-        data_pad[n][ci][h*VH*HSTR+vh][w*VW*WSTR+vw], name='data_vec')
-
-    kernel_vec = tvm.compute(kvshape, lambda co, ci, dh, dw, vc: \
-        kernel[co*VC+vc][ci][dh][dw], name='kernel_vec')
-
-    ci = tvm.reduce_axis((0, CI), name='ci')
-    dh = tvm.reduce_axis((0, KH), name='dh')
-    dw = tvm.reduce_axis((0, KW), name='dw')
-
-    conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \
-        tvm.sum(data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw].astype(out_dtype) *
-                kernel_vec[co, ci, dh, dw, vc].astype(out_dtype),
-                axis=[ci, dh, dw]), name='conv')
-
-    output = tvm.compute(oshape, lambda n, co, h, w:
-                         conv[n][co//VC][h/VH][w//VW][h%VH][w%VW][co%VC],
-                         name='output_unpack', tag='spatial_conv_output')
-
-    return output
-
-
-def _im2col_pack(data, kernel, stride, padding, out_dtype=None):
-    """ Compute convolution with im2col pack layout. """
-    if out_dtype is None:
-        out_dtype = data.dtype
-    assert data.shape[0].value == 1, "im2col pack convolution only support batch size=1"
-    wkl = _get_workload(data, kernel, stride, padding, out_dtype)
-    sch = _get_schedule(wkl)
-
-    N = 1
-    H, W = wkl.height, wkl.width
-    CI = wkl.in_filter
-    CO = wkl.out_filter
-    KH, KW = wkl.hkernel, wkl.wkernel
-    HPAD, WPAD = wkl.hpad, wkl.hpad
-    HSTR, WSTR = wkl.hstride, wkl.wstride
-
-    OH = (H + 2*HPAD - KH) // HSTR + 1
-    OW = (W + 2*WPAD - KW) // WSTR + 1
-
-    P = sch.vp
-    Q = sch.vq
-    UNROLL = sch.unroll
-
-    dshape = (N, CI, H, W)
-    dpshape = (N, CI, H+2*HPAD, W+2*WPAD)
-    dcshape = (N, OH, OW, CI, KH, KW)
-    dvshape = (N, OH * OW // P, CI, KH, KW, P)
-
-    kshape = (CO, CI, KH, KW)
-    kvshape = (CO // Q, CI, KH, KW, Q)
-
-    ovshape = (N, CO // Q, OH * OW // P, P, Q)
-    oshape = (N, CO, OH, OW)
-
-    ############### declaration
-
-    DO_PAD = (wkl.hpad != 0 and wkl.wpad != 0)
-    if DO_PAD:
-        data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
-    else:
-        data_pad = data
-
-    data_col = tvm.compute(dcshape, lambda n, oh, ow, ci, hk, wk: \
-        data_pad[n][ci][oh*HSTR+hk][ow*WSTR+wk], name='data_col')
-
-    data_vec = tvm.compute(dvshape, lambda n, im, ci, hk, wk, vim: \
-        data_col[n][(im*P+vim)//OW][(im*P+vim)%OW][ci][hk][wk], name='data_vec')
-
-    kernel_vec = tvm.compute(kvshape, lambda co, ci, dh, dw, vc: \
-        kernel[co*Q+vc][ci][dh][dw], name='kernel_vec')
-
-    ci = tvm.reduce_axis((0, CI), name='ci')
-    hk = tvm.reduce_axis((0, KH), name='hk')
-    wk = tvm.reduce_axis((0, KW), name='wk')
-
-    conv = tvm.compute(ovshape, lambda n, co, im, vim, vco: \
-        tvm.sum(data_vec[n][im][ci][hk][wk][vim].astype(out_dtype) *
-                kernel_vec[co][ci][hk][wk][vco].astype(out_dtype),
-                axis=[ci, hk, wk]), name='conv')
-
-    output = tvm.compute(oshape, lambda n, co, h, w: \
-        conv[n][co//Q][(h*OW+w)//P][(h*OW+w)%P][co%Q],
-        name='output_vec', tag='im2col_conv_output')
-
-    return output
-
-
 def conv2d_nchw(Input, Filter, stride, padding, out_dtype=None):
     """Convolution operator in NCHW layout.
@@ -435,7 +244,7 @@ def conv2d_nhwc(Input, Filter, stride, padding, out_dtype='float32'):
     Returns
     -------
     output : tvm.Tensor
         4-D with shape [batch, out_height, out_width, out_channel]
     """
     assert isinstance(stride, int) or len(stride) == 2
     batch, in_height, in_width, in_channel = Input.shape
@@ -465,6 +274,7 @@ def conv2d_nhwc(Input, Filter, stride, padding, out_dtype='float32'):
             name="Conv2dOutput", tag="conv2d_nhwc")
     return Output
 
+
 @tvm.target.generic_func
 def conv2d_NCHWc(data, kernel, num_filter, kernel_size, stride,
                  padding, layout, out_layout, out_dtype='float32'):
@@ -510,8 +320,80 @@ def conv2d_NCHWc(data, kernel, num_filter, kernel_size, stride,
     # default declaration
     raise ValueError("missing register for topi.nn.conv2d_NCHWc")
 
-# map from schedule type to declaration function
-_SCH_TO_DECL_FUNC = {
-    SpatialPack: _spatial_pack,
-    Im2ColPack: _im2col_pack,
-}
+
+def conv2d_winograd_weight_transform(kernel, tile_size):
+    """Weight transformation for winograd
+
+    Parameters
+    ----------
+    kernel: Tensor
+        The raw kernel tensor with layout "NCHW". Only 3x3 kernel is supported for now
+    tile_size: int
+        Tile size of winograd transform. e.g. 2 for F(2x2, 3x3) and 4 for F(4x4, 3x3)
+
+    Returns
+    -------
+    output : tvm.Tensor
+        4-D with shape [alpha, alpha, CO, CI]
+    """
+    K = 3
+
+    shape = get_const_tuple(kernel.shape)
+    assert shape[2:] == (K, K), "Only support 3x3 kernel"
+
+    r = tile_size + K - 1
+    shape = (r, r) + shape[:2]
+
+    if tile_size == 2:
+        G_data = np.array([
+            [1, 0, 0],
+            [1.0/2, 1.0/2, 1.0/2],
+            [1.0/2, -1.0/2, 1.0/2],
+            [0, 0, 1],
+        ], dtype=kernel.dtype)
+    elif tile_size == 4:
+        G_data = np.array([
+            [1 / 4.0, 0, 0],
+            [-1 / 6.0, -1 / 6.0, -1 / 6.0],
+            [-1 / 6.0, 1 / 6.0, -1 / 6.0],
+            [1 / 24.0, 1 / 12.0, 1 / 6.0],
+            [1 / 24.0, -1 / 12.0, 1 / 6.0],
+            [0, 0, 1]
+        ], dtype=kernel.dtype)
+    else:
+        raise ValueError("Unsupported tile size: " + str(tile_size))
+
+    G = const_matrix(G_data, 'G')
+    r_kh = tvm.reduce_axis((0, K), name='r_kh')
+    r_kw = tvm.reduce_axis((0, K), name='r_kw')
+    return tvm.compute(shape, lambda eps, nu, co, ci:
+                       tvm.sum(kernel[co][ci][r_kh][r_kw] *
+                               G[eps][r_kh] * G[nu][r_kw],
+                               axis=[r_kh, r_kw]), name='transform_weight')
+
+
+@tvm.target.generic_func
+def conv2d_winograd_without_weight_transform(input, filter, strides, padding,
+                                             layout, out_dtype, tile_size):
+    """Compute convolution in winograd algorithm. The filter is supposed to be transformed
+    in advance.
+
+    Parameters
+    ----------
+    input : tvm.Tensor
+        4-D with shape [batch, in_height, in_width, in_channel]
+    filter : tvm.Tensor
+        4-D with shape [filter_height, filter_width, in_channel, num_filter]
+    strides : int or a list/tuple of two ints
+        Stride size, or [stride_height, stride_width]
+    padding : int or str
+        Padding size, or ['VALID', 'SAME']
+    tile_size: int
+        Tile size of winograd transform. e.g. 2 for F(2x2, 3x3) and 4 for F(4x4, 3x3)
+
+    Returns
+    -------
+    output : tvm.Tensor
+        4-D with shape [batch, out_height, out_width, out_channel]
+    """
+    raise ValueError("missing register for topi.nn.conv2d_winograd_without_weight_transform")
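A brief usage sketch (assumed, not part of the diff) for the new weight transformation: for `tile_size=4` a 3x3 kernel becomes an alpha x alpha transformed tensor with `alpha = tile_size + 3 - 1 = 6`:

```python
import tvm
import topi

kernel = tvm.placeholder((64, 64, 3, 3), name='kernel')  # [CO, CI, KH, KW]
U = topi.nn.conv2d_winograd_weight_transform(kernel, tile_size=4)
print(U.shape)  # [6, 6, 64, 64] = [alpha, alpha, CO, CI]
```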
@@ -1,7 +0,0 @@
# pylint: disable=redefined-builtin, wildcard-import
"""Raspberry pi specific declaration and schedules."""
from __future__ import absolute_import as _abs

from .conv2d import schedule_conv2d_nchw
from .depthwise_conv2d import schedule_depthwise_conv2d_nchw
from .bitserial_conv2d import schedule_bitserial_conv2d_nhwc
@@ -1,358 +0,0 @@
# pylint: disable=invalid-name,unused-variable,invalid-name
"""Conv2D schedule on raspberry pi"""
from __future__ import absolute_import as _abs
import tvm
from .. import tag
from ..nn.conv2d import conv2d as _conv2d, _get_schedule
from ..nn.conv2d import SpatialPack, Im2ColPack
from ..nn.conv2d import _WORKLOADS, _SCH_TO_DECL_FUNC
from ..nn.conv2d import _get_workload
from ..nn.util import infer_pad, infer_stride
from .. import generic

_SCHEDULES = [
    # float32 imagenet
    SpatialPack(1, 8, 4, 1, 4, True),
    SpatialPack(1, 7, 4, 2, 4, True),
    SpatialPack(1, 4, 8, 4, 1, True),
    SpatialPack(1, 4, 4, 1, 16, False),
    SpatialPack(1, 4, 8, 4, 8, False),
    SpatialPack(1, 7, 4, 3, 8, True),
    SpatialPack(1, 2, 8, 1, 8, True),
    SpatialPack(2, 1, 16, 1, 4, True),
    SpatialPack(1, 7, 4, 1, 1, True),
    Im2ColPack(7, 4, 1, 16, True),
    Im2ColPack(7, 4, 1, 8, False),
    Im2ColPack(7, 4, 1, 16, False),

    # float32 mobilenet
    SpatialPack(2, 2, 4, 28, 1, True),
    SpatialPack(1, 4, 8, 14, 1, False),
    SpatialPack(1, 2, 16, 8, 1, True),
    SpatialPack(1, 4, 8, 8, 8, True),
    SpatialPack(2, 2, 8, 1, 1, False),
    SpatialPack(1, 4, 8, 4, 8, False),
    SpatialPack(2, 2, 8, 1, 4, False),
    SpatialPack(2, 2, 8, 1, 8, False),
    Im2ColPack(7, 4, 1, 16, False),
    Im2ColPack(7, 4, 1, 4, True),

    # int8 imagenet
    SpatialPack(2, 2, 4, 19, 8, False),
    SpatialPack(2, 2, 8, 1, 4, True),
    SpatialPack(2, 2, 8, 7, 4, False),
    SpatialPack(2, 4, 4, 7, 16, False),
    SpatialPack(1, 7, 4, 14, 4, True),
    SpatialPack(2, 2, 8, 5, 1, False),
    SpatialPack(1, 2, 16, 3, 8, True),
    SpatialPack(1, 7, 4, 1, 16, True),
    SpatialPack(2, 2, 8, 2, 16, True),
    SpatialPack(1, 1, 8, 4, 4, True),
    SpatialPack(1, 1, 4, 1, 8, False),
    SpatialPack(1, 1, 8, 1, 16, True),

    # int8 mobilenet
    SpatialPack(2, 2, 8, 8, 1, True),
    SpatialPack(1, 7, 4, 16, 4, True),
    SpatialPack(1, 4, 8, 1, 1, True),
    SpatialPack(1, 4, 8, 1, 1, True),
    SpatialPack(1, 4, 8, 4, 8, True),
    SpatialPack(1, 4, 8, 7, 1, True),
    SpatialPack(1, 2, 8, 2, 32, True),
    SpatialPack(1, 2, 16, 2, 16, True),
    SpatialPack(1, 1, 32, 1, 16, False),
    SpatialPack(1, 1, 16, 1, 32, True),
]

@_get_schedule.register("rasp")
def _get_schedule_conv2d(wkl):
    if wkl not in _WORKLOADS:
        raise ValueError("no schedule for such workload: {}".format(wkl))
    idx = _WORKLOADS.index(wkl)
    sch = _SCHEDULES[idx]
    return sch


@_conv2d.register("rasp")
def _declaration_conv2d(data, kernel, stride, padding, layout, out_dtype):
    if out_dtype is None:
        out_dtype = data.dtype
    assert layout == 'NCHW', "only support NCHW convolution on rasp"
    assert data.shape[0].value == 1, "only support batch size=1 convolution on rasp"
    wkl = _get_workload(data, kernel, stride, padding, out_dtype)
    sch = _get_schedule(wkl)
    return _SCH_TO_DECL_FUNC[type(sch)](data, kernel, stride, padding, out_dtype)


def _schedule_spatial_conv2d(s, data, data_pad, data_vec,
                             kernel, kernel_vec,
                             conv_out, output, last):
    # no stride and padding info here
    padding = infer_pad(data, data_pad)
    if data_pad is None:
        stride = infer_stride(data, kernel, output)
    else:
        stride = infer_stride(data_pad, kernel, output)
    wkl = _get_workload(data, kernel, stride, padding, output.dtype)
    sch = _get_schedule(wkl)

    H, W = wkl.height, wkl.width
    CI, CO = wkl.in_filter, wkl.out_filter
    HK, WK = wkl.hkernel, wkl.wkernel
    HPAD, WPAD = wkl.hpad, wkl.wpad
    HSTR, WSTR = wkl.hstride, wkl.wstride

    HCAT, WCAT = HK-1, WK-1
    DOPAD = (HPAD != 0 and WPAD != 0)

    VH = sch.vh
    VW = sch.vw
    VC = sch.vc
    UNROLL = sch.unroll

    A, B, C = data, kernel, last
    A0, A1 = data_pad, data_vec
    B0 = kernel_vec
    C0, C1 = conv_out, output

    CC = s.cache_write(C0, "global")

    _, co, oh, ow, vh, vw, vc = s[C0].op.axis
    if UNROLL:
        s[C0].unroll(vw)
    s[C0].vectorize(vc)

    s[CC].compute_at(s[C0], ow)
    _, co, oh, ow, vh, vw, vc = s[CC].op.axis
    ci, dh, dw = s[CC].op.reduce_axis
    s[CC].reorder(ci, dh, vh, dw, vw, vc)

    if UNROLL:
        s[CC].unroll(vw)
    s[CC].vectorize(vc)

    ##### Schedule A
    if DOPAD:
        s[A0].compute_inline()

    _, h, _, _, _, _ = s[A1].op.axis
    if sch.ba == 1:
        oaxis = h
        paxis = h
    else:
        oh, ih = s[A1].split(h, sch.ba)
        oaxis = oh
        paxis = ih

    s[A1].parallel(paxis)
    s[A1].pragma(oaxis, "parallel_launch_point")
    s[A1].pragma(paxis, "parallel_stride_pattern")
    s[A1].pragma(oaxis, "parallel_barrier_when_finish")


    ##### Schedule B
    co, _, _, _, _ = s[B0].op.axis
    if sch.bc == 1:
        oaxis = co
        paxis = co
    else:
        oco, ico = s[B0].split(co, sch.bc)
        oaxis = oco
        paxis = ico

    s[B0].parallel(paxis)
    s[B0].pragma(oaxis, "parallel_launch_point")
    s[B0].pragma(paxis, "parallel_stride_pattern")
    s[B0].pragma(oaxis, "parallel_barrier_when_finish")


    ##### Schedule C
    n, co, h, w = s[C].op.axis
    co, vc = s[C].split(co, VC)
    oh, ow, vh, vw = s[C].tile(h, w, VH, VW)
    s[C].reorder(n, co, oh, ow, vh, vw, vc)
    if C != C1:
        s[C1].compute_inline()
    s[C0].compute_at(s[C], ow)

    if sch.bc == 1:
        oaxis = co
        paxis = co
    else:
        oco, ico = s[C].split(co, sch.bc)
        oaxis = oco
        paxis = ico

    s[C].parallel(paxis)
    s[C].pragma(oaxis, "parallel_launch_point")
    s[C].pragma(paxis, "parallel_stride_pattern")
    s[C].pragma(oaxis, "parallel_barrier_when_finish")

    return s

def _schedule_im2col_conv2d(s, data, data_pad, data_col, data_vec,
                            kernel, kernel_vec,
                            conv_out, output, last):
|
|
||||||
# no stride and padding info here
|
|
||||||
padding = infer_pad(data, data_pad)
|
|
||||||
if data_pad is None:
|
|
||||||
stride = infer_stride(data, kernel, output)
|
|
||||||
else:
|
|
||||||
stride = infer_stride(data_pad, kernel, output)
|
|
||||||
wkl = _get_workload(data, kernel, stride, padding, output.dtype)
|
|
||||||
sch = _get_schedule(wkl)
|
|
||||||
|
|
||||||
H, W = wkl.height, wkl.width
|
|
||||||
CI = wkl.in_filter
|
|
||||||
CO = wkl.out_filter
|
|
||||||
HK, WK = wkl.hkernel, wkl.wkernel
|
|
||||||
HPAD, WPAD = wkl.hpad, wkl.wpad
|
|
||||||
HSTR, WSTR = wkl.hstride, wkl.wstride
|
|
||||||
|
|
||||||
HCAT, WCAT = HK-1, WK-1
|
|
||||||
DOPAD = (HPAD != 0 and WPAD != 0)
|
|
||||||
|
|
||||||
P = sch.vp
|
|
||||||
Q = sch.vq
|
|
||||||
UNROLL = sch.unroll
|
|
||||||
|
|
||||||
A, B, C = data, kernel, last
|
|
||||||
A0, A1, A2 = data_pad, data_col, data_vec
|
|
||||||
B0 = kernel_vec
|
|
||||||
C0, C1 = conv_out, output
|
|
||||||
|
|
||||||
CC = s.cache_write(C0, "global")
|
|
||||||
AA = s.cache_read(A2, "global", [CC])
|
|
||||||
BB = s.cache_read(B0, "global", [CC])
|
|
||||||
|
|
||||||
|
|
||||||
##### Schedule CC
|
|
||||||
_, co, im, vim, vco = s[C0].op.axis
|
|
||||||
s[C0].unroll(vim)
|
|
||||||
s[C0].vectorize(vco)
|
|
||||||
|
|
||||||
s[CC].compute_at(s[C0], im)
|
|
||||||
_, co, im, vim, vco = s[CC].op.axis
|
|
||||||
ci, hk, wk = s[CC].op.reduce_axis
|
|
||||||
s[CC].reorder(ci, hk, wk, vim, vco)
|
|
||||||
s[CC].unroll(vim)
|
|
||||||
s[CC].vectorize(vco)
|
|
||||||
# s[CC].unroll(ccr)
|
|
||||||
|
|
||||||
### Schedule C
|
|
||||||
_, co, h, w = s[C].op.axis
|
|
||||||
im = s[C].fuse(h, w)
|
|
||||||
im, vim = s[C].split(im, P)
|
|
||||||
co, vco = s[C].split(co, Q)
|
|
||||||
s[C].reorder(co, im, vim, vco)
|
|
||||||
|
|
||||||
if sch.bc == 1:
|
|
||||||
oaxis = co
|
|
||||||
paxis = co
|
|
||||||
else:
|
|
||||||
oco, ico = s[C].split(co, sch.bc)
|
|
||||||
oaxis = oco
|
|
||||||
paxis = ico
|
|
||||||
|
|
||||||
s[C].parallel(paxis)
|
|
||||||
s[C].pragma(oaxis, "parallel_launch_point")
|
|
||||||
s[C].pragma(paxis, "parallel_stride_pattern")
|
|
||||||
s[C].pragma(oaxis, "parallel_barrier_when_finish")
|
|
||||||
if C1 != C:
|
|
||||||
s[C1].compute_inline()
|
|
||||||
|
|
||||||
s[C0].compute_at(s[C], paxis)
|
|
||||||
|
|
||||||
##### Schedule A
|
|
||||||
if DOPAD:
|
|
||||||
s[A0].compute_inline()
|
|
||||||
s[A1].compute_inline()
|
|
||||||
s[AA].compute_at(s[CC], wk)
|
|
||||||
s[AA].unroll(AA.op.axis[4])
|
|
||||||
|
|
||||||
_, im, _, _, _, _ = s[A2].op.axis
|
|
||||||
if sch.ba == 1:
|
|
||||||
oaxis = im
|
|
||||||
paxis = im
|
|
||||||
else:
|
|
||||||
oim, iim = s[A2].split(im, sch.ba)
|
|
||||||
oaxis = oim
|
|
||||||
paxis = iim
|
|
||||||
|
|
||||||
s[A2].parallel(paxis)
|
|
||||||
s[A2].pragma(oaxis, "parallel_launch_point")
|
|
||||||
s[A2].pragma(paxis, "parallel_stride_pattern")
|
|
||||||
s[A2].pragma(oaxis, "parallel_barrier_when_finish")
|
|
||||||
|
|
||||||
|
|
||||||
##### Schedule B
|
|
||||||
s[BB].compute_at(s[CC], wk)
|
|
||||||
s[BB].vectorize(BB.op.axis[4])
|
|
||||||
|
|
||||||
co, _, _, _, _ = s[B0].op.axis
|
|
||||||
if sch.bc == 1:
|
|
||||||
oaxis = co
|
|
||||||
paxis = co
|
|
||||||
else:
|
|
||||||
oco, ico = s[B0].split(co, sch.bc)
|
|
||||||
oaxis = oco
|
|
||||||
paxis = ico
|
|
||||||
|
|
||||||
s[B0].parallel(paxis)
|
|
||||||
s[B0].pragma(oaxis, "parallel_launch_point")
|
|
||||||
s[B0].pragma(paxis, "parallel_stride_pattern")
|
|
||||||
s[B0].pragma(oaxis, "parallel_barrier_when_finish")
|
|
||||||
|
|
||||||
return s
|
|
||||||
|
|
||||||
@generic.schedule_conv2d_nchw.register(["rasp"])
|
|
||||||
def schedule_conv2d_nchw(outs):
|
|
||||||
"""Create schedule for tensors"""
|
|
||||||
s = tvm.create_schedule([x.op for x in outs])
|
|
||||||
|
|
||||||
def traverse(op):
|
|
||||||
"""Traverse operators from computation graph"""
|
|
||||||
# inline all one-to-one-mapping operators except the last stage (output)
|
|
||||||
if tag.is_broadcast(op.tag):
|
|
||||||
if op not in s.outputs:
|
|
||||||
s[op].compute_inline()
|
|
||||||
for tensor in op.input_tensors:
|
|
||||||
if tensor.op.input_tensors:
|
|
||||||
traverse(tensor.op)
|
|
||||||
|
|
||||||
if 'spatial_conv_output' in op.tag:
|
|
||||||
output = op.output(0)
|
|
||||||
conv_out = op.input_tensors[0]
|
|
||||||
kernel_vec = conv_out.op.input_tensors[1]
|
|
||||||
kernel = kernel_vec.op.input_tensors[0]
|
|
||||||
if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
|
|
||||||
s[kernel].compute_inline()
|
|
||||||
data_vec = conv_out.op.input_tensors[0]
|
|
||||||
data = data_vec.op.input_tensors[0]
|
|
||||||
data_pad = None
|
|
||||||
if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag:
|
|
||||||
data_pad = data
|
|
||||||
data = data_pad.op.input_tensors[0]
|
|
||||||
|
|
||||||
_schedule_spatial_conv2d(s, data, data_pad, data_vec,
|
|
||||||
kernel, kernel_vec,
|
|
||||||
conv_out, output, outs[0])
|
|
||||||
|
|
||||||
if 'im2col_conv_output' in op.tag:
|
|
||||||
output = op.output(0)
|
|
||||||
conv_out = op.input_tensors[0]
|
|
||||||
kernel_vec = conv_out.op.input_tensors[1]
|
|
||||||
kernel = kernel_vec.op.input_tensors[0]
|
|
||||||
data_vec = conv_out.op.input_tensors[0]
|
|
||||||
data_col = data_vec.op.input_tensors[0]
|
|
||||||
data = data_col.op.input_tensors[0]
|
|
||||||
data_pad = None
|
|
||||||
if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag:
|
|
||||||
data_pad = data
|
|
||||||
data = data_pad.op.input_tensors[0]
|
|
||||||
_schedule_im2col_conv2d(s, data, data_pad, data_col, data_vec,
|
|
||||||
kernel, kernel_vec,
|
|
||||||
conv_out, output, outs[0])
|
|
||||||
|
|
||||||
traverse(outs[0].op)
|
|
||||||
return s
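The three registrations above are what make this file pluggable: workload lookup, compute declaration, and scheduling all dispatch on the active target. A minimal sketch of how they were exercised (the shapes are illustrative and must match an entry in `_WORKLOADS`; `tvm.target.rasp()` is the old spelling that this commit replaces with `tvm.target.arm_cpu(...)` elsewhere):

```python
import tvm
import topi

# batch-1 NCHW convolution, e.g. the last conv layer in resnet-18
A = tvm.placeholder((1, 512, 7, 7), name='A')
W = tvm.placeholder((512, 512, 3, 3), name='W')

with tvm.target.rasp():
    # both calls dispatch to the "rasp" registrations in this file
    B = topi.nn.conv2d(A, W, (1, 1), (1, 1), 'NCHW', 'float32')
    s = topi.generic.schedule_conv2d_nchw([B])
    func = tvm.build(s, [A, W, B], target=tvm.target.current_target())
```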

@@ -1,207 +0,0 @@
# pylint: disable=invalid-name, unused-variable, unused-argument
"""Schedule for depthwise_conv2d with auto fusion"""
from __future__ import absolute_import as _abs
from collections import namedtuple
import tvm

from .. import tag
from ..nn.util import infer_pad, infer_stride, get_pad_tuple
from .. import generic

_Workload = namedtuple('Workload',
                       ['in_dtype', 'out_dtype', 'height', 'width', 'channel', 'multiplier',
                        'hkernel', 'wkernel', 'hpad', 'wpad', 'hstride', 'wstride'])

_Schedule = namedtuple('Schedule', ['vh', 'vw', 'vc', 'bc', 'unroll'])

# workloads of depthwise conv mobile net on imagenet
_WORKLOADS = [
    _Workload('float32', 'float32', 112, 112, 32, 1, 3, 3, 1, 1, 1, 1),
    _Workload('float32', 'float32', 112, 112, 64, 1, 3, 3, 1, 1, 2, 2),
    _Workload('float32', 'float32', 56, 56, 128, 1, 3, 3, 1, 1, 1, 1),
    _Workload('float32', 'float32', 56, 56, 128, 1, 3, 3, 1, 1, 2, 2),
    _Workload('float32', 'float32', 28, 28, 256, 1, 3, 3, 1, 1, 1, 1),
    _Workload('float32', 'float32', 28, 28, 256, 1, 3, 3, 1, 1, 2, 2),
    _Workload('float32', 'float32', 14, 14, 512, 1, 3, 3, 1, 1, 1, 1),
    _Workload('float32', 'float32', 14, 14, 512, 1, 3, 3, 1, 1, 2, 2),
    _Workload('float32', 'float32', 7, 7, 1024, 1, 3, 3, 1, 1, 1, 1),
    _Workload('int16', 'int32', 112, 112, 32, 1, 3, 3, 1, 1, 1, 1),
    _Workload('int16', 'int32', 112, 112, 64, 1, 3, 3, 1, 1, 2, 2),
    _Workload('int16', 'int32', 56, 56, 128, 1, 3, 3, 1, 1, 1, 1),
    _Workload('int16', 'int32', 56, 56, 128, 1, 3, 3, 1, 1, 2, 2),
    _Workload('int16', 'int32', 28, 28, 256, 1, 3, 3, 1, 1, 1, 1),
    _Workload('int16', 'int32', 28, 28, 256, 1, 3, 3, 1, 1, 2, 2),
    _Workload('int16', 'int32', 14, 14, 512, 1, 3, 3, 1, 1, 1, 1),
    _Workload('int16', 'int32', 14, 14, 512, 1, 3, 3, 1, 1, 2, 2),
    _Workload('int16', 'int32', 7, 7, 1024, 1, 3, 3, 1, 1, 1, 1),
]

_SCHEDULES = [
    _Schedule(2, 1, 4, 1, True),
    _Schedule(2, 4, 4, 2, True),
    _Schedule(2, 1, 4, 2, False),
    _Schedule(2, 4, 4, 1, True),
    _Schedule(4, 1, 4, 8, True),
    _Schedule(1, 1, 4, 2, True),
    _Schedule(1, 1, 8, 8, True),
    _Schedule(1, 1, 4, 1, False),
    _Schedule(1, 1, 4, 4, False),
    _Schedule(2, 4, 4, 2, False),
    _Schedule(2, 7, 4, 1, True),
    _Schedule(2, 4, 4, 4, False),
    _Schedule(2, 2, 4, 4, False),
    _Schedule(2, 2, 8, 4, False),
    _Schedule(2, 2, 4, 4, True),
    _Schedule(2, 2, 8, 4, False),
    _Schedule(1, 2, 8, 4, True),
    _Schedule(1, 1, 4, 8, True),
]


def _get_workload(data, kernel, stride, padding, out_dtype):
    _, C, IH, IW = [x.value for x in data.shape]
    _, MT, KH, KW = [x.value for x in kernel.shape]
    HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)
    if isinstance(stride, (tuple, list)):
        HSTR, WSTR = stride
    else:
        HSTR, WSTR = stride, stride
    return _Workload(data.dtype, out_dtype, IH, IW, C, MT, KH, KW, HPAD, WPAD, HSTR, WSTR)
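As a worked example (shapes assumed for illustration), the first MobileNet depthwise layer maps onto `_WORKLOADS[0]`, which in turn selects `_SCHEDULES[0]`:

```python
import tvm

# 112x112 input, 32 channels, multiplier 1, 3x3 kernel, pad 1, stride 1
data = tvm.placeholder((1, 32, 112, 112), name='data')
kernel = tvm.placeholder((32, 1, 3, 3), name='kernel')

wkl = _get_workload(data, kernel, stride=1, padding=1, out_dtype='float32')
# wkl equals _WORKLOADS[0], so the lookup picks _SCHEDULES[0]
sch = _SCHEDULES[_WORKLOADS.index(wkl)]  # _Schedule(2, 1, 4, 1, True)
```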

def _schedule(s, data, data_pad, kernel, output, last):
    padding = infer_pad(data, data_pad)
    if data_pad is None:
        stride = infer_stride(data, kernel, output)
    else:
        stride = infer_stride(data_pad, kernel, output)
    wkl = _get_workload(data, kernel, stride, padding, output.dtype)

    if wkl not in _WORKLOADS:
        return s

    # use specified schedule
    sch = _SCHEDULES[_WORKLOADS.index(wkl)]

    H, W = wkl.height, wkl.width
    CN = wkl.channel
    MT = wkl.multiplier

    HK, WK = wkl.hkernel, wkl.wkernel
    HPAD, WPAD = wkl.hpad, wkl.wpad
    HSTR, WSTR = wkl.hstride, wkl.wstride

    VH, VW = sch.vh, sch.vw
    BC = sch.bc
    VC = sch.vc

    TH = H + 2*HPAD
    TW = W + 2*WPAD
    OH = (H + 2*HPAD - HK) // HSTR + 1
    OW = (W + 2*WPAD - WK) // WSTR + 1

    A, B, C = data, kernel, output
    A0 = data_pad

    A1 = s.cache_read(A0, "global", [C])
    _, c, h, w = s[A1].op.axis
    c, vc = s[A1].split(c, VC)
    s[A1].reorder(c, h, w, vc)

    A2 = s.cache_write(A1, 'global')
    s[A0].compute_inline()
    s[A1].compute_inline()

    B0 = s.cache_read(B, "global", [C])
    c, m, h, w = s[B0].op.axis
    c, vc = s[B0].split(c, VC)
    s[B0].reorder(c, m, h, w, vc)

    B1 = s.cache_write(B0, 'global')
    s[B0].compute_inline()

    _, c, h, w = s[C].op.axis
    c, vc = s[C].split(c, VC)
    s[C].reorder(c, h, w, vc)

    C0 = s.cache_write(C, 'global')
    _, c, h, w, vc = s[C0].op.axis
    dh, dw = s[C0].op.reduce_axis
    oh, ow, ih, iw = s[C0].tile(h, w, VH, VW)
    s[C0].reorder(c, oh, ow, dh, dw, ih, iw, vc)
    if sch.unroll:
        s[C0].unroll(iw)
    s[C0].vectorize(vc)

    # s[C0].compute_at(s[C0], ow)
    launch, c, _, _ = s[C].op.axis
    s[C].pragma(launch, "parallel_launch_point")

    s[C].parallel(c)
    s[C].pragma(c, "parallel_stride_pattern")
    s[C].pragma(c, "parallel_barrier_when_finish")

    s[C0].compute_at(s[C], launch)
    _, c, h, w, vc = s[C0].op.axis
    s[C0].parallel(c)
    s[C0].pragma(c, "parallel_stride_pattern")
    s[C0].pragma(c, "parallel_barrier_when_finish")

    s[A2].compute_at(s[C0], oh)
    # parallel(s[A2], s[A2].op.axis[1], BC)

    # s[B0].compute_at(s[C0], ow)
    s[B1].compute_at(s[C], launch)
    c, m, h, w, vc = s[B1].op.axis
    s[B1].parallel(c)
    s[B1].pragma(c, "parallel_stride_pattern")
    s[B1].pragma(c, "parallel_barrier_when_finish")

    return s


@generic.schedule_depthwise_conv2d_nchw.register(["cpu", "rasp"])
def schedule_depthwise_conv2d_nchw(outs):
    """Schedule for depthwise_conv2d nchw forward.

    Parameters
    ----------
    outs: Array of Tensor
        The computation graph description of depthwise_conv2d
        in the format of an array of tensors.

    Returns
    -------
    s: Schedule
        The computation schedule for depthwise_conv2d nchw.
    """
    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
    s = tvm.create_schedule([x.op for x in outs])

    def traverse(op):
        """Internal traverse function"""
        # inline all one-to-one-mapping operators except the last stage (output)
        if tag.is_broadcast(op.tag):
            if op not in s.outputs:
                s[op].compute_inline()
            for tensor in op.input_tensors:
                if tensor.op.input_tensors:
                    traverse(tensor.op)
        # schedule depthwise_conv2d
        if op.tag == 'depthwise_conv2d_nchw':
            output = op.output(0)
            kernel = op.input_tensors[1]
            if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
                s[kernel].compute_inline()
            data = op.input_tensors[0]
            data_pad = None
            if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag:
                data_pad = data
                data = data_pad.op.input_tensors[0]
            _schedule(s, data, data_pad, kernel, output, outs[0])

    traverse(outs[0].op)
    return s
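This registration is exercised through the generic dispatcher, same as the conv2d case. A minimal sketch (illustrative shapes; the keyword spelling of `topi.nn.depthwise_conv2d_nchw` is assumed from the test suite of this era):

```python
import tvm
import topi

Input = tvm.placeholder((1, 32, 112, 112), name='Input')
Filter = tvm.placeholder((32, 1, 3, 3), name='Filter')

with tvm.target.rasp():
    # dispatches to the schedule_depthwise_conv2d_nchw registered above
    Output = topi.nn.depthwise_conv2d_nchw(Input, Filter, stride=1, padding=1)
    s = topi.generic.schedule_depthwise_conv2d_nchw([Output])
```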

@@ -1,7 +1,30 @@
+# pylint: disable=invalid-name
"""Common topi utilities"""
from __future__ import absolute_import as _abs
import tvm
+
+from . import tag
+
+
+def traverse_inline(s, op, callback):
+    """Traverse computation graph and do auto inline
+
+    Parameters
+    ----------
+    s: schedule
+        The schedule
+    op: Operation
+        The final output operator.
+    callback: callable
+        The callback function on each op
+    """
+    if tag.is_injective(op.tag):
+        if op not in s.outputs:
+            s[op].compute_inline()
+        for tensor in op.input_tensors:
+            if tensor.op.input_tensors:
+                traverse_inline(s, tensor.op, callback)
+    callback(op)


def prod(x):
    """Get the product of every item in the tuple.

@@ -151,3 +174,33 @@ def unravel_index(idx, shape):
        idx = idx // shape[i]
    indices = indices[::-1]
    return indices
+
+
+def const_matrix(matrix, name="const_matrix"):
+    """Convert a const numpy 2-dimensional matrix to tvm tensor
+
+    Parameters
+    ----------
+    matrix: numpy.ndarray
+        Const input array
+    name: str, optional
+        The name of output op
+
+    Returns
+    -------
+    tensor: Tensor
+        The created tensor
+    """
+    row, col = matrix.shape
+    dtype = str(matrix.dtype)
+
+    def select_array(i, j):
+        now = tvm.const(0.0, dtype)
+        for ii in range(row):
+            for jj in range(col):
+                now = tvm.select(tvm.all(i % row == ii, j % col == jj),
+                                 tvm.const(matrix[ii][jj], dtype),
+                                 now)
+        return now
+
+    return tvm.compute(matrix.shape, select_array, name=name)
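For orientation, a short sketch (shapes and names are illustrative) of how the two new helpers combine: `const_matrix` embeds a small constant array in the compute graph, and `traverse_inline` inlines every injective stage above the output before handing each remaining op to a callback:

```python
import numpy as np
import tvm
import topi

# a constant 2x2 matrix baked into the compute graph
M = const_matrix(np.eye(2, dtype='float32'), name='M')

A = tvm.placeholder((2, 2), name='A')
B = topi.nn.relu(A)           # elementwise stage, will be auto-inlined
C = topi.broadcast_add(B, M)  # output stage

s = tvm.create_schedule(C.op)
# after this call, B is inlined into C and only the output is left to schedule
traverse_inline(s, C.op, lambda op: None)
```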

@@ -22,7 +22,7 @@ def verify_bitserial_conv2d_nhwc(batch, in_size, in_channel, num_filter, kernel,
    input_type = 'uint32'
    out_dtype = 'int32'

-    with tvm.target.rasp():
+    with tvm.target.arm_cpu('rasp3b'):
        A = tvm.placeholder((batch, in_height, in_width, in_channel), dtype=input_type, name='A')
        W = tvm.placeholder((kernel, kernel, in_channel, num_filter), dtype=input_type, name='W')
        B = topi.nn.bitserial_conv2d(A, W, stride, padding, activation_bits, weight_bits, out_dtype=out_dtype,
@@ -2,6 +2,7 @@
import os
import numpy as np
import tvm
+from tvm import autotvm
import topi
import topi.testing
from tvm.contrib.pickle_memoize import memoize

@@ -11,10 +12,10 @@ from topi.util import get_const_tuple
def verify_conv2d(batch, in_size, in_channel, num_filter, kernel, stride, padding):
    in_height = in_width = in_size

-    with tvm.target.rasp():
+    with tvm.target.arm_cpu():
        A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A')
        W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W')
-        B = topi.nn.conv2d(A, W, stride, padding)
+        B = topi.nn.conv2d(A, W, (stride, stride), (padding, padding), 'NCHW', 'float32')
        s = topi.generic.schedule_conv2d_nchw([B])

        a_shape = get_const_tuple(A.shape)

@@ -39,7 +40,8 @@ def verify_conv2d(batch, in_size, in_channel, num_filter, kernel, stride, padding):
    np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)


def test_conv2d():
-    verify_conv2d(1, 56, 64, 64, 3, 1, 1)
+    with autotvm.tophub.context(tvm.target.arm_cpu('rasp3b')):
+        verify_conv2d(1, 56, 64, 64, 3, 1, 1)


if __name__ == "__main__":
    test_conv2d()
@@ -8,6 +8,27 @@ NVIDIA GPU. By running auto-tuner on this template, we can outperform the
vendor provided library CuDNN in many cases.
"""

+######################################################################
+# Install dependencies
+# ----------------------------------------
+# To use the autotvm package in tvm, we need to install some extra dependencies.
+# (change "3" to "2" if you use python2):
+#
+# .. code-block:: bash
+#
+#   pip3 install --user psutil xgboost
+#
+# To make tvm run faster during tuning, it is recommended to use cython
+# as FFI of tvm. In the root directory of tvm, execute
+# (change "3" to "2" if you use python2):
+#
+# .. code-block:: bash
+#
+#   pip3 install --user cython
+#   sudo make cython3
+#
+# Now return to python code. Import packages.

import logging
import sys
import numpy as np

@@ -133,7 +154,7 @@ def conv2d_no_batching(N, H, W, CI, CO, KH, KW, stride, padding):
# for this template

# logging config (for printing tuning log to screen)
-logging.basicConfig(level=logging.INFO, stream=sys.stdout)
+logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)

# the last layer in resnet
N, H, W, CO, CI, KH, KW, strides, padding = 1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1)

@@ -144,12 +165,12 @@ print(task.config_space)

# use local gpu, measure 5 times for every config to reduce variance
# run 8 parallel threads for compilation
-measure_option = autotvm.measure_option(mode='local',
-                                        number=10,
+measure_option = autotvm.measure_option('local',
+                                        number=5,
                                         parallel_num=8,
                                         timeout=20)

-# begin tuning, log records to file `conv2d.tsv`
+# begin tuning, log records to file `conv2d.log`
tuner = autotvm.tuner.XGBTuner(task)
tuner.tune(n_trial=20,
           measure_option=measure_option,

@@ -186,6 +207,6 @@ np.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)

# Evaluate running time. Here we choose a large repeat number (200) to reduce the noise
# and the overhead of kernel launch. You can also use nvprof to validate the result.
evaluator = func.time_evaluator(func.entry_name, ctx, number=200)
print('Time cost of this operator: %f' % evaluator(a_tvm, w_tvm, c_tvm).mean)
@@ -0,0 +1,364 @@
"""
Auto-tuning a convolutional network for ARM CPU
====================================================
**Author**: `Lianmin Zheng <https://github.com/merrymercy>`_

Auto-tuning for a specific ARM device is critical for getting the best
performance. This is a tutorial about how to tune a whole convolutional
network.

The operator implementation for ARM CPU in TVM is written in template form.
It has many tunable knobs (tile factor, vectorization, unrolling, etc).
We will do tuning for all convolution and depthwise convolution operators
in the neural network. After the tuning, we get a log file which stores
the best knob values for all required operators. When the tvm compiler compiles
these operators, it will query this log file to get the best knob values.

We also released pre-tuned parameters for some arm devices. You can go to
`ARM CPU Benchmark <https://github.com/dmlc/tvm/wiki/Benchmark#arm-cpu>`_
to see the results.
"""

######################################################################
# Install dependencies
# ----------------------------------------
# To use the autotvm package in tvm, we need to install some extra dependencies.
# (change "3" to "2" if you use python2):
#
# .. code-block:: bash
#
#   pip3 install --user psutil xgboost
#
# To make tvm run faster during tuning, it is recommended to use cython
# as FFI of tvm. In the root directory of tvm, execute
# (change "3" to "2" if you use python2):
#
# .. code-block:: bash
#
#   pip3 install --user cython
#   sudo make cython3
#
# Now return to python code. Import packages.

import os

import numpy as np

import nnvm.testing
import nnvm.compiler
import tvm
from tvm import autotvm
from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
from tvm.contrib.util import tempdir
import tvm.contrib.graph_runtime as runtime

#################################################################
# Define network
# --------------
# First we need to define the network in nnvm symbol API.
# We can load some pre-defined networks from :code:`nnvm.testing`.
# We can also load models from MXNet, ONNX and TensorFlow (see NNVM
# tutorials :ref:`tutorial-nnvm` for more details).

def get_network(name, batch_size):
    """Get the symbol definition and random weight of a network"""
    shape = {"data": (batch_size, 3, 224, 224)}
    output_shape = (batch_size, 1000)

    if name == 'resnet-18':
        net, params = nnvm.testing.resnet.get_workload(num_layers=18, batch_size=batch_size)
    elif name == 'mobilenet':
        net, params = nnvm.testing.mobilenet.get_workload(batch_size=batch_size)
    elif name == 'squeezenet v1.1':
        net, params = nnvm.testing.squeezenet.get_workload(batch_size=batch_size, version='1.1')
    elif name == 'vgg-16':
        net, params = nnvm.testing.vgg.get_workload(num_layers=16, batch_size=batch_size)
    elif name == 'custom':
        # an example for custom network
        from nnvm.testing import utils
        net = nnvm.sym.Variable('data')
        net = nnvm.sym.conv2d(net, channels=4, kernel_size=(3, 3), padding=(1, 1))
        net = nnvm.sym.flatten(net)
        net = nnvm.sym.dense(net, units=1000)
        net, params = utils.create_workload(net, batch_size, (3, 224, 224))
    elif name == 'mxnet':
        # an example for mxnet model
        from mxnet.gluon.model_zoo.vision import get_model
        block = get_model('resnet18_v1', pretrained=True)
        net, params = nnvm.frontend.from_mxnet(block)
        net = nnvm.sym.softmax(net)
    else:
        raise ValueError("Unsupported network: " + name)

    return net, params, shape, output_shape

#################################################################
# Start RPC Tracker
# -----------------
# TVM uses an RPC session to communicate with ARM boards.
# During tuning, the tuner will send the generated code to the board and
# measure the speed of the code on the board.
#
# To scale up the tuning, TVM uses an RPC Tracker to manage distributed devices.
# The RPC Tracker is a centralized master node. We can register all devices to
# the tracker. For example, if we have 10 phones, we can register all of them
# to the tracker, then we can run 10 measurements in parallel, which accelerates
# the tuning process.
#
# To start an RPC tracker, run this command on the host machine. The tracker is
# required during the whole tuning process, so we need to open a new terminal for
# this command:
#
# .. code-block:: bash
#
#   python -m tvm.exec.rpc_tracker --host=0.0.0.0 --port=9190
#
# The expected output is
#
# .. code-block:: bash
#
#   INFO:RPCTracker:bind to 0.0.0.0:9190

#################################################################
# Register devices to RPC Tracker
# -----------------------------------
# Now we can register our devices to the tracker. The first step is to
# build the tvm runtime for the ARM devices.
#
# * For Linux:
#   Follow this section :ref:`build-tvm-runtime-on-device` to build
#   the tvm runtime on the device. Then register the device to the tracker by
#
#   .. code-block:: bash
#
#     python -m tvm.exec.rpc_server --tracker=[HOST_IP]:9190 --key=rk3399
#
#   (replace :code:`[HOST_IP]` with the IP address of your host machine)
#
# * For Android:
#   Follow this `readme page <https://github.com/dmlc/tvm/tree/master/apps/android_rpc>`_ to
#   install the tvm rpc apk on the android device. Make sure you can pass the android rpc test.
#
# After registering devices, we can confirm them by querying the rpc_tracker
#
# .. code-block:: bash
#
#   python -m tvm.exec.query_rpc_tracker --host=0.0.0.0 --port=9190
#
# For example, if we have 2 Huawei mate10 pro, 11 Raspberry Pi 3B and 2 rk3399,
# the output can be
#
# .. code-block:: bash
#
#    Queue Status
#    ----------------------------
#    key          free    pending
#    ----------------------------
#    mate10pro    2       0
#    rk3399       2       0
#    rpi3b        11      0
#    ----------------------------

###########################################
# Set Tuning Options
# ------------------
# Before tuning, we should do some configuration. Here we use an RK3399 board
# in our environment as an example. In your setting, you should modify the target
# and device_key accordingly.

# Replace "aarch64-linux-gnu" with the correct target of your board.
# This target is used for cross compilation. You can query it by :code:`gcc -v` on your device.
target = tvm.target.create('llvm -device=arm_cpu -target=aarch64-linux-gnu')

# Also replace this with the device key in your tracker
device_key = 'rk3399'

# tuning option
network = 'resnet-18'
log_file = "%s.%s.log" % (device_key, network)
dtype = 'float32'

tuning_option = {
    'log_filename': log_file,

    'tuner': 'xgb',
    'n_trial': 1000,
    'early_stopping': 200,

    'measure_option': autotvm.measure_option(
        autotvm.use_rpc(device_key, host='localhost', port=9190),
        number=4,
        parallel_num=1,
        timeout=10),

    'use_transfer_learning': True,
}

####################################################################
#
# .. note:: How to set tuning options
#
#   In general, the default values provided here work well. They are the same
#   values that we used to generate the pre-tuned parameters.
#   If you have multiple devices, you can set :code:`parallel_num` to
#   the number of devices you have (e.g. set it to 3 if you register 3 rk3399
#   boards to the tracker).
#   If you have a large time budget, you can set :code:`n_trial` and
#   :code:`early_stopping` larger, which makes the tuning run longer.
#   If your device is very slow or a single conv2d operator in your network has large FLOPs,
#   consider setting the timeout larger.
#
#   **For android phones**, add :code:`build_func='ndk'` to the argument list of
#   :code:`autotvm.measure_option` to use the Android NDK for creating the shared library.
#
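The android switch mentioned in the note above is a single extra argument; a minimal sketch (the tracker address and the `mate10pro` key are illustrative) of the corresponding measure option:

```python
from tvm import autotvm

# same as tuning_option['measure_option'], plus the NDK build function so
# measured kernels are packed as Android shared libraries
android_measure_option = autotvm.measure_option(
    autotvm.use_rpc('mate10pro', host='localhost', port=9190),
    number=4,
    parallel_num=1,
    timeout=10,
    build_func='ndk')
```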
###################################################################
# Begin Tuning
# ------------
# Now we can extract tuning tasks from the network and begin tuning.
# Here we provide a simple utility function to tune a list of tasks.
# This function is just an initial implementation which tunes them in sequential order.
# Later we will bring a more sophisticated tuner scheduler.

# You can skip the implementation of this function for this tutorial.
def tune_tasks(tasks,
               measure_option,
               tuner='xgb',
               n_trial=500,
               early_stopping=200,
               log_filename='tuning.log',
               use_transfer_learning=True,
               try_winograd=True):
    if try_winograd:
        for i in range(len(tasks)):
            try:  # try winograd template
                tsk = autotvm.task.create(tasks[i].name, tasks[i].args,
                                          tasks[i].target, tasks[i].target_host, 'winograd')
                tasks.append(tsk)
            except Exception:
                pass

    # create tmp log file
    tmp_log_file = log_filename + ".tmp"
    if os.path.exists(tmp_log_file):
        os.remove(tmp_log_file)

    for i, tsk in enumerate(tasks):
        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))

        # create tuner
        if tuner == 'xgb' or tuner == 'xgb-rank':
            tuner_obj = XGBTuner(tsk, loss_type='rank')
        elif tuner == 'ga':
            tuner_obj = GATuner(tsk, pop_size=50)
        elif tuner == 'random':
            tuner_obj = RandomTuner(tsk)
        elif tuner == 'gridsearch':
            tuner_obj = GridSearchTuner(tsk)
        else:
            raise ValueError("Invalid tuner: " + tuner)

        if use_transfer_learning:
            if os.path.isfile(tmp_log_file):
                tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file))

        # do tuning
        tuner_obj.tune(n_trial=min(n_trial, len(tsk.config_space)),
                       early_stopping=early_stopping,
                       measure_option=measure_option,
                       callbacks=[
                           autotvm.callback.progress_bar(n_trial, prefix=prefix),
                           autotvm.callback.log_to_file(tmp_log_file)])

    # pick best records to a cache file
    autotvm.record.pick_best(tmp_log_file, log_filename)
    os.remove(tmp_log_file)


########################################################################
# Finally we launch tuning jobs and evaluate the end-to-end performance.

def tune_and_evaluate():
    # extract workloads from nnvm graph
    net, params, shape, out_shape = get_network(network, batch_size=1)
    tasks = autotvm.task.extract_from_graph(net, shape=shape, dtype=dtype,
                                            symbols=(nnvm.sym.conv2d,),
                                            target=target)

    # run tuning tasks
    tune_tasks(tasks, **tuning_option)

    # compile kernels with history best records
    with autotvm.apply_history_best(log_file):
        print("Compile...")
        with nnvm.compiler.build_config(opt_level=2, add_pass=['AlterOpLayout']):
            graph, lib, params = nnvm.compiler.build(
                net, target=target,
                shape=shape, params=params, dtype=dtype)

        # export library
        tmp = tempdir()
        if tuning_option['measure_option']['build_func'] == 'ndk':  # for android
            from tvm.contrib import ndk
            filename = "net.so"
            lib.export_library(tmp.relpath(filename), ndk.create_shared)
        else:
            filename = "net.tar"
            lib.export_library(tmp.relpath(filename))

        # upload module to device
        print("Upload...")
        remote = autotvm.measure.request_remote(device_key, timeout=10000)
        remote.upload(tmp.relpath(filename))
        rlib = remote.load_module(filename)

        # upload parameters to device
        ctx = remote.context(str(target), 0)
        rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}
        data_tvm = tvm.nd.array((np.random.uniform(size=shape['data'])).astype(dtype))
        module = runtime.create(graph, rlib, ctx)
        module.set_input('data', data_tvm)
        module.set_input(**rparams)

        # evaluate
        print("Evaluate inference time cost...")
        ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=10)
        prof_res = np.array(ftimer().results) * 1000  # convert to milliseconds
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))

# We do not run the tuning in our webpage server since it takes too long.
# Uncomment the following line to run it by yourself.

# tune_and_evaluate()

######################################################################
# Sample Output
# -------------
# The tuning needs to train xgboost models and use them for prediction.
# So a high performance CPU is recommended.
# It takes about 1.5 hours on a 32T AMD Ryzen CPU.
# One sample output is
#
# .. code-block:: bash
#
#    [Task  1/16]  Current/Best:   13.15/  20.49 GFLOPS | Progress: (297/1000) | 348.51 s Done.
#    [Task  2/16]  Current/Best:   16.66/  22.64 GFLOPS | Progress: (475/1000) | 415.42 s Done.
#    [Task  3/16]  Current/Best:   10.33/  14.19 GFLOPS | Progress: (306/1000) | 239.61 s Done.
#    [Task  4/16]  Current/Best:   13.29/  20.88 GFLOPS | Progress: (242/1000) | 227.48 s Done.
#    [Task  5/16]  Current/Best:   13.28/  15.61 GFLOPS | Progress: (237/1000) | 191.56 s Done.
#    [Task  6/16]  Current/Best:   20.16/  23.86 GFLOPS | Progress: (315/1000) | 304.31 s Done.
#    [Task  7/16]  Current/Best:    9.22/  22.00 GFLOPS | Progress: (458/1000) | 433.26 s Done.
#    [Task  8/16]  Current/Best:   14.12/  17.80 GFLOPS | Progress: (270/1000) | 240.73 s Done.
#    [Task  9/16]  Current/Best:   14.59/  24.02 GFLOPS | Progress: (209/1000) | 213.61 s Done.
#    [Task 10/16]  Current/Best:    9.86/  21.74 GFLOPS | Progress: (367/1000) | 359.93 s Done.
#    [Task 11/16]  Current/Best:    5.01/  18.86 GFLOPS | Progress: (202/1000) | 191.18 s Done.
#    [Task 12/16]  Current/Best:    8.61/  25.23 GFLOPS | Progress: (220/1000) | 220.74 s Done.
#    [Task 13/16]  Current/Best:   10.87/  25.79 GFLOPS | Progress: (465/1000) | 902.14 s Done.
#    [Task 14/16]  Current/Best:   15.33/  29.38 GFLOPS | Progress: (239/1000) | 481.33 s Done.
#    [Task 15/16]  Current/Best:   12.09/  38.60 GFLOPS | Progress: (476/1000) | 928.35 s Done.
#    [Task 16/16]  Current/Best:   16.77/  47.08 GFLOPS | Progress: (255/1000) | 439.91 s Done.
#    Compile...
#    Upload...
#    Evaluate inference time cost...
#    Mean inference time (std dev): 156.51 ms (0.89 ms)
#
@@ -12,6 +12,27 @@ In this tutorial, you can learn how to perform these two steps in tvm.
The whole workflow is illustrated by a matrix multiplication example.
"""

+######################################################################
+# Install dependencies
+# ----------------------------------------
+# To use the autotvm package in tvm, we need to install some extra dependencies.
+# (change "3" to "2" if you use python2):
+#
+# .. code-block:: bash
+#
+#   pip3 install --user psutil xgboost
+#
+# To make tvm run faster during tuning, it is recommended to use cython
+# as FFI of tvm. In the root directory of tvm, execute
+# (change "3" to "2" if you use python2):
+#
+# .. code-block:: bash
+#
+#   pip3 install --user cython
+#   sudo make cython3
+#
+# Now return to python code. Import packages.

import logging
import sys

@@ -247,10 +268,10 @@ print(task.config_space)
# used to get the best config later.

# logging config (for printing tuning log to screen)
-logging.basicConfig(level=logging.INFO, stream=sys.stdout)
+logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)

# use local cpu, measure 5 times for every config to reduce variance
-measure_option = autotvm.measure_option(mode='local',
+measure_option = autotvm.measure_option('local',
                                        number=5)

# begin tuning, log records to file `matmul.log`
@ -3,26 +3,23 @@
|
||||||
|
|
||||||
Cross Compilation and RPC
|
Cross Compilation and RPC
|
||||||
=========================
|
=========================
|
||||||
**Author**: `Ziheng Jiang <https://github.com/ZihengJiang/>`_
|
**Author**: `Ziheng Jiang <https://github.com/ZihengJiang/>`_, `Lianmin Zheng <https://github.com/merrymercy/>`_
|
||||||
|
|
||||||
This tutorial introduces cross compilation and remote device
|
This tutorial introduces cross compilation and remote device
|
||||||
execution with RPC in TVM.
|
execution with RPC in TVM.
|
||||||
|
|
||||||
With cross compilation and RPC, you can **compile program on your
|
With cross compilation and RPC, you can **compile program on your
|
||||||
local machine then run it on remote device**. It is useful when the
|
local machine then run it on the remote device**. It is useful when
|
||||||
resource of remote device is limited, like Raspberry Pi and mobile
|
the resource of remote devices is limited, like Raspberry Pi and mobile
|
||||||
platforms, so you do not wish to put the compilation procedure on
|
platforms. In this tutorial, we will take Raspberry Pi for CPU example
|
||||||
the device in order to save time and space.
|
and Firefly-RK3399 for opencl example.
|
||||||
In this tutorial, I will take Raspberry Pi as our target platform
|
|
||||||
for example.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
######################################################################
|
######################################################################
|
||||||
# Build TVM Runtime on Device
|
# Build TVM Runtime on Device
|
||||||
# ---------------------------
|
# ---------------------------
|
||||||
#
|
#
|
||||||
# There're some prerequisites: similar as compiling TVM on your
|
# The first step is to build tvm runtime on the remote device.
|
||||||
# local machine, we need build runtime on remote device.
|
|
||||||
#
|
#
|
||||||
# .. note::
|
# .. note::
|
||||||
#
|
#
|
||||||
|
@ -30,86 +27,44 @@ for example.
|
||||||
# executed on the target device, e.g. Raspberry Pi. And we assume it
|
# executed on the target device, e.g. Raspberry Pi. And we assume it
|
||||||
# has Linux running.
|
# has Linux running.
|
||||||
#
|
#
|
||||||
# To get started, clone tvm repo from github. It is important to clone
|
# Since we do compilation on local machine, the remote device is only used
|
||||||
# the submodules along, with --recursive option (Assuming you are in
|
# for running the generated code. We only need to build tvm runtime on
|
||||||
# your home directory):
|
# the remote device.
|
||||||
#
|
#
|
||||||
# .. code-block:: bash
|
# .. code-block:: bash
|
||||||
#
|
#
|
||||||
# git clone --recursive https://github.com/dmlc/tvm
|
# git clone --recursive https://github.com/dmlc/tvm
|
||||||
|
# cd tvm
|
||||||
|
# make runtime -j2
|
||||||
#
|
#
|
||||||
# .. note::
|
# After building runtime successfully, we need to set environment variables
|
||||||
|
# in :code:`~/.bashrc` file. We can edit :code:`~/.bashrc`
|
||||||
|
# using :code:`vi ~/.bashrc` and add the line below (Assuming your TVM
|
||||||
|
# directory is in :code:`~/tvm`):
|
||||||
#
|
#
|
||||||
# Usually device has limited resources and we only need to build
|
# .. code-block:: bash
|
||||||
# runtime. The idea is we will use TVM compiler on the local server
|
|
||||||
# to compile and upload the compiled program to the device and run
|
|
||||||
# the device function remotely.
|
|
||||||
#
|
#
|
||||||
# .. code-block:: bash
|
# export PYTHONPATH=$PYTHONPATH:~/tvm/python
|
||||||
#
|
#
|
||||||
# cd tvm
|
# To update the environment variables, execute :code:`source ~/.bashrc`.
|
||||||
# cp make/config.mk .
|
|
||||||
# echo USE_RPC=1>> config.mk
|
|
||||||
#
|
|
||||||
# Also make sure that you have set :code:`USE_RPC=1` in your
|
|
||||||
# :code:`config.mk`. We don't need LLVM when building runtime, so
|
|
||||||
# :code:`LLVM_CONFIG = llvm-config` in :code:`config.mk` is commented
|
|
||||||
# out by default. After that, build runtime!
|
|
||||||
#
|
|
||||||
# .. code-block:: bash
|
|
||||||
#
|
|
||||||
# make runtime
|
|
||||||
#
|
|
||||||
# After building runtime successfully, we need to set environment varibles
|
|
||||||
# in :code:`~/.bashrc` file of yourself account or :code:`/etc/profile`
|
|
||||||
# of system enviroment variables. Assuming your TVM directory is in
|
|
||||||
# :code:`~/tvm` and set environment variables below your account.
|
|
||||||
#
|
|
||||||
# .. code-block:: bash
|
|
||||||
#
|
|
||||||
# vi ~/.bashrc
|
|
||||||
#
|
|
||||||
# We need to edit :code:`~/.bashrc` using :code:`vi ~/.bashrc` and add
|
|
||||||
# lines below (Assuming your TVM directory is in :code:`~/tvm`):
|
|
||||||
#
|
|
||||||
# .. code-block:: bash
|
|
||||||
#
|
|
||||||
# export TVM_HOME=~/tvm
|
|
||||||
# export PATH=$PATH:$TVM_HOME/lib
|
|
||||||
# export PYTHONPATH=$PYTHONPATH:$TVM_HOME/python
|
|
||||||
#
|
|
||||||
# To enable updated :code:`~/.bashrc`, execute :code:`source ~/.bashrc`.
|
|
||||||
|
|
||||||
######################################################################
|
######################################################################
|
||||||
# Set Up RPC Server on Device
|
# Set Up RPC Server on Device
|
||||||
# ---------------------------
|
# ---------------------------
|
||||||
# To set up a TVM RPC server on the Raspberry Pi (our remote device),
|
# To start an RPC server, run the following command on your remote device
|
||||||
# we have prepared a one-line script so you only need to run this
|
# (Which is Raspberry Pi in this example).
|
||||||
# command after following the installation guide to install TVM on
|
|
||||||
# your device:
|
|
||||||
#
|
#
|
||||||
# .. code-block:: bash
|
# .. code-block:: bash
|
||||||
#
|
#
|
||||||
# python -m tvm.exec.rpc_server --host 0.0.0.0 --port=9090
|
# python -m tvm.exec.rpc_server --host 0.0.0.0 --port=9090
|
||||||
#
|
#
|
||||||
# After executing the command above, if you see these lines below, it means
|
# If you see the line below, it means the RPC server started
|
||||||
# the RPC server started successfully on your device.
|
# successfully on your device.
|
||||||
#
|
#
|
||||||
# .. code-block:: bash
|
# .. code-block:: bash
|
||||||
#
|
#
|
||||||
# Loading runtime library /home/YOURNAME/code/tvm/lib/libtvm_runtime.so... exec only
|
|
||||||
# INFO:root:RPCServer: bind to 0.0.0.0:9090
|
# INFO:root:RPCServer: bind to 0.0.0.0:9090
|
||||||
#
|
#
|
||||||
# In the following code block, we simply start an RPC server on the
|
|
||||||
# same machine, for demonstration. This line can be omitted if we
|
|
||||||
# started an remote server.
|
|
||||||
#
|
|
||||||
from __future__ import absolute_import, print_function
|
|
||||||
|
|
||||||
import tvm
|
|
||||||
import numpy as np
|
|
||||||
from tvm import rpc
|
|
||||||
from tvm.contrib import util
|
|
||||||
|
|
||||||
######################################################################
|
######################################################################
|
||||||
# Declare and Cross Compile Kernel on Local Machine
|
# Declare and Cross Compile Kernel on Local Machine
|
||||||
|
@ -117,36 +72,50 @@ from tvm.contrib import util
|
||||||
#
|
#
|
||||||
# .. note::
|
# .. note::
|
||||||
#
|
#
|
||||||
# Now we back to the local machine, which has a full TVM installed.
|
# Now we back to the local machine, which has a full TVM installed
|
||||||
|
# (with LLVM).
|
||||||
#
|
#
|
||||||
# Here we will declare a simple kernel with TVM on the local machine:
|
# Here we will declare a simple kernel on the local machine:
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
import tvm
|
||||||
|
from tvm import rpc
|
||||||
|
from tvm.contrib import util
|
||||||
|
|
||||||
n = tvm.convert(1024)
|
n = tvm.convert(1024)
|
||||||
A = tvm.placeholder((n,), name='A')
|
A = tvm.placeholder((n,), name='A')
|
||||||
B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B')
|
B = tvm.compute((n,), lambda i: A[i] + 1.0, name='B')
|
||||||
s = tvm.create_schedule(B.op)
|
s = tvm.create_schedule(B.op)
|
||||||
|
|
||||||
######################################################################
|
######################################################################
|
||||||
# Then we cross compile the kernel:
|
# Then we cross compile the kernel.
|
||||||
#
|
# The target should be 'llvm -target=armv7l-linux-gnueabihf' for
|
||||||
|
# Raspberry Pi 3B, but we use 'llvm' here to make this tutorial runnable
|
||||||
|
# on our webpage building server. See the detailed note in the following block.
|
||||||
|
|
||||||
# the target here should be 'llvm -target=armv7l-none-linux-gnueabihf',
|
local_demo = True
|
||||||
# and we use 'llvm' here to make example run locally, see the detailed
|
|
||||||
# note in the following block
|
if local_demo:
|
||||||
f = tvm.build(s, [A, B], target='llvm', name='myadd')
|
target = 'llvm'
|
||||||
# save the lib at local temp folder
|
else:
|
||||||
|
target = 'llvm -target=armv7l-linux-gnueabihf'
|
||||||
|
|
||||||
|
func = tvm.build(s, [A, B], target=target, name='add_one')
|
||||||
|
# save the lib at a local temp folder
|
||||||
temp = util.tempdir()
|
temp = util.tempdir()
|
||||||
path = temp.relpath('mylib.o')
|
path = temp.relpath('lib.tar')
|
||||||
f.save(path)
|
func.export_library(path)
|
||||||
|
|
||||||
######################################################################
|
######################################################################
|
||||||
# .. note::
|
# .. note::
|
||||||
#
|
#
|
||||||
# the argument :code:`target` in :code:`build` should be replaced
|
# To run this tutorial with real remote device, change :code:`local_demo`
|
||||||
# :code:`'llvm'` with the target triple of your device, which might be
|
# to False and replace :code:`target` in :code:`build` with the true
|
||||||
# different for different device. For example, it is
|
# target triple of your device. The target triple which might be
|
||||||
# :code:`'llvm -target=armv7l-none-linux-gnueabihf'` for my Raspberry
|
# different for different devices. For example, it is
|
||||||
# Pi. Here we use :code:`'llvm'` directly to make the tutorial runable.
|
# :code:`'llvm -target=armv7l-linux-gnueabihf'` for Raspberry Pi 3B and
|
||||||
|
# :code:`'llvm -target=aarch64-linux-gnu'` for RK3399.
|
||||||
#
|
#
|
||||||
# Usually, you can query the target by execute :code:`gcc -v` on your
|
# Usually, you can query the target by execute :code:`gcc -v` on your
|
||||||
# device, and look for the line starting with :code:`Target:`
|
# device, and look for the line starting with :code:`Target:`
|
||||||
|
@ -155,8 +124,6 @@ f.save(path)
|
||||||
# Besides :code:`-target`, you can also set other compilation options
|
# Besides :code:`-target`, you can also set other compilation options
|
||||||
# like:
|
# like:
|
||||||
#
|
#
|
||||||
# * -mtriple=<target triple>
|
|
||||||
# Specify the target triple, same as '-target'.
|
|
||||||
# * -mcpu=<cpuname>
|
# * -mcpu=<cpuname>
|
||||||
# Specify a specific chip in the current architecture to generate code for. By default this is inferred from the target triple and autodetected to the current architecture.
|
# Specify a specific chip in the current architecture to generate code for. By default this is inferred from the target triple and autodetected to the current architecture.
|
||||||
# * -mattr=a1,+a2,-a3,...
|
# * -mattr=a1,+a2,-a3,...
|
||||||
|
@@ -168,13 +135,6 @@ f.save(path)
 #       llc -mtriple=<your device target triple> -mattr=help
 #
 #   These options are consistent with `llc <http://llvm.org/docs/CommandGuide/llc.html>`_.
-#   So for my board, to get the best performance, the complete compilation
-#   option would be:
-#
-#   .. code-block:: bash
-#
-#      llvm -mtriple=armv7l-none-linux-gnueabihf -mcpu=cortex-a53 -mattr=+neon
-#
 #   It is recommended to set the target triple and feature set to contain the
 #   specific features available, so we can take full advantage of the features
 #   of the board.
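To make the flag list concrete, here is a minimal sketch (not from the commit) of how the llc-style options compose into a single TVM target string; the Cortex-A53/NEON values echo the removed example above and should be adjusted for your own board:

    # hedged sketch, assuming the schedule `s` and tensors `A`, `B` from above
    target = ('llvm -target=armv7l-linux-gnueabihf '
              '-mcpu=cortex-a53 -mattr=+neon')
    func_tuned = tvm.build(s, [A, B], target=target, name='add_one')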
@@ -184,43 +144,40 @@ f.save(path)
 ######################################################################
 # Run CPU Kernel Remotely by RPC
 # ------------------------------
-# Here we will show you how to run the kernel on the remote device:
-#
-# .. note::
-#   In order to have this tutorial run locally to build the nice HTML, we
-#   start an RPC server on the local machine. You can ignore it if you already
-#   started the server on the target device. And then change the host IP properly.
-
-# Can be ignored if you already started the RPC server
-server = rpc.Server(host='0.0.0.0', port=9090, use_popen=True)
-host = '0.0.0.0'  # Change to your target device IP
-port = 9090
-# connect the remote device
-remote = rpc.connect(host, port)
+# We show how to run the generated cpu kernel on the remote device.
+# First we obtain an RPC session from the remote device.
+
+if local_demo:
+    remote = rpc.LocalSession()
+else:
+    # The following is my environment, change this to the IP address of your target device
+    host = '10.77.1.162'
+    port = 9090
+    remote = rpc.connect(host, port)

 ######################################################################
-# Here we upload the lib to the remote device, then invoke a device local
-# compiler for shared lib and load it into device memory. now `f` is a
-# remote module object.
-remote.upload(path)
-f = remote.load_module('mylib.o')
+# Upload the lib to the remote device, then invoke a device local
+# compiler to relink them. Now `func` is a remote module object.
+
+remote.upload(path)
+func = remote.load_module('lib.tar')

-# create array on the remote device
-ctx = remote.cpu(0)
+# create arrays on the remote device
+ctx = remote.cpu()
 a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
 b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
 # the function will run on the remote device
-f(a, b)
+func(a, b)
 np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)

 ######################################################################
 # When you want to evaluate the performance of the kernel on the remote
-# device, it is important to avoid the overhead of remote function calls.
-# :code:`time_evaluator` will return a remote function that runs the
-# function `number` times, measures the cost per run on the remote
-# device and returns the measured cost.
-#
-time_f = f.time_evaluator(f.entry_name, ctx, number=10)
+# device, it is important to avoid the overhead of the network.
+# :code:`time_evaluator` will return a remote function that runs the
+# function `number` times, measures the cost per run on the remote
+# device and returns the measured cost. Network overhead is excluded.
+
+time_f = func.time_evaluator(func.entry_name, ctx, number=10)
 cost = time_f(a, b).mean
 print('%g secs/op' % cost)
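A usage note on the evaluator (a sketch under the assumption that `func`, `ctx`, `a` and `b` are the objects created above): the returned callable times `number` device-side runs per invocation, so the reported mean amortizes launch overhead and excludes RPC latency.

    # hedged sketch: report the per-run cost in milliseconds
    evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
    print('%.3f ms/op' % (evaluator(a, b).mean * 1e3))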
@@ -228,69 +185,54 @@ print('%g secs/op' % cost)
 # Run OpenCL Kernel Remotely by RPC
 # ---------------------------------
 # As for remote OpenCL devices, the workflow is almost the same as above.
-# You can define the kernel, upload files, and run by RPC. The files
-# include host object, kernel source code and module meta file. We rely
-# on remote compiler to re-link them.
+# You can define the kernel, upload files, and run by RPC.
 #
 # .. note::
 #
 #    Raspberry Pi does not support OpenCL, the following code is tested on
 #    Firefly-RK3399. You may follow this `tutorial <https://gist.github.com/mli/585aed2cec0b5178b1a510f9f236afa2>`_
-#    to setup the RK3399 OS and OpenCL driver.
+#    to setup the OS and OpenCL driver for RK3399.
 #
-#    The target_host should be 'llvm -target=aarch64-linux-gnu'.
-#    But here we set 'llvm' to enable this tutorial to run locally.
-#
-#    Also we need to build the runtime with the flag `USE_OPENCL=1` to
-#    build the kernel (different from cpu, we need to bind axes for OpenCL)
-#
-# The following function shows how we can deploy CL
-def deploy_cl():
+#    Also we need to build the runtime with OpenCL enabled on the rk3399 board.
+#    In the tvm root directory, execute
+#
+# .. code-block:: bash
+#
+#    cp cmake/config.cmake .
+#    sed -i "s/USE_OPENCL OFF/USE_OPENCL ON/" config.cmake
+#    make runtime -j4
+#
+# The following function shows how we run an OpenCL kernel remotely

+def run_opencl():
+    # NOTE: This is the setting for my rk3399 board. You need to modify
+    # them according to your environment.
+    target_host = "llvm -target=aarch64-linux-gnu"
+    opencl_device_host = '10.77.1.145'
+    opencl_device_port = 9090
+
+    # create schedule for the above "add one" compute declaration
     s = tvm.create_schedule(B.op)
     xo, xi = s[B].split(B.op.axis[0], factor=32)
     s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
     s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
-    f = tvm.build(s, [A, B], "opencl", target_host="llvm", name="myadd")
-
-    # save files
-    path_o = temp.relpath("myadd.o")
-    path_cl = temp.relpath("myadd.cl")
-    path_json = temp.relpath("myadd.tvm_meta.json")
-    f.save(path_o)
-    f.imported_modules[0].save(path_cl)
-
-    # upload files
-    remote.upload(path_o)
-    remote.upload(path_cl)
-    remote.upload(path_json)
-
-    # load files on remote device
-    fhost = remote.load_module("myadd.o")
-    fdev = remote.load_module("myadd.cl")
-    fhost.import_module(fdev)
+    func = tvm.build(s, [A, B], "opencl", target_host=target_host)
+
+    remote = rpc.connect(opencl_device_host, opencl_device_port)
+
+    # export and upload
+    path = temp.relpath('lib_cl.tar')
+    func.export_library(path)
+    remote.upload(path)
+    func = remote.load_module('lib_cl.tar')

     # run
-    ctx = remote.cl(0)
+    ctx = remote.cl()
     a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
     b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
-    fhost(a, b)
+    func(a, b)
     np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
-
-#####################################################################
-# Instead of uploading files separately, there is a more convenient way.
-# You can export the library as a tar ball.
-# The following function shows how we can deploy by tar ball
-def deploy_cl_by_tar():
-    path_tar = temp.relpath("myadd.tar")
-    f.export_library(path_tar)
-    remote.upload(path_tar)
-    fhost = remote.load_module("myadd.tar")
-    fhost(a, b)
-    np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
-
-# terminate the server after experiment
-server.terminate()
+    print("OpenCL test passed!")
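The removed `deploy_cl_by_tar` helper is still a useful pattern with the new API names; a hedged sketch (the helper itself is hypothetical, while `func`, `remote` and `temp` are as defined above):

    def deploy_by_tar(func, remote, temp, name='lib_cl.tar'):
        # pack host and device modules into one tar ball, upload it,
        # and let the remote runtime relink and load it
        path = temp.relpath(name)
        func.export_library(path)
        remote.upload(path)
        return remote.load_module(name)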
 ######################################################################
 # Summary
@@ -1,2 +1,4 @@
+.. _tutorial-nnvm:
+
 Compile Deep Learning Models
 ----------------------------
@@ -6,14 +6,10 @@ Deploy the Pretrained Model on ARM Mali GPU
 **Author**: `Lianmin Zheng <https://lmzheng.net/>`_, `Ziheng Jiang <https://ziheng.org/>`_

 This is an example of using NNVM to compile a ResNet model and
 deploy it on Firefly-RK3399 with ARM Mali GPU. We will use the
 Mali-T860 MP4 GPU on this board to accelerate the inference.
-
-This tutorial is based on the tutorial for deploying on Raspberry Pi by `Ziheng Jiang <https://ziheng.org/>`_.
-Great thanks to the original author, I only made a few lines of modification.
-
-To begin with, we import nnvm (for compilation) and TVM (for deployment).
 """

 import tvm
 import nnvm.compiler
 import nnvm.testing
@@ -24,92 +20,65 @@ from tvm.contrib import util, graph_runtime as runtime
 ######################################################################
 # Build TVM Runtime on Device
 # ---------------------------
 #
-# There are some prerequisites: we need to build the tvm runtime and set up
-# an RPC server on the remote device.
-#
-# To get started, clone the tvm repo from github. It is important to clone
-# the submodules along, with the --recursive option (assuming you are in
-# your home directory):
-#
-# .. code-block:: bash
-#
-#   git clone --recursive https://github.com/dmlc/tvm
+# The first step is to build the tvm runtime on the remote device.
 #
 # .. note::
 #
-#   Usually the device has limited resources and we only need to build the
-#   runtime. The idea is we will use the TVM compiler on the local server
-#   to compile and upload the compiled program to the device and run
-#   the device function remotely.
-#
-# .. code-block:: bash
-#
-#   make runtime
-#
-# After building the runtime successfully, we need to set environment variables
-# in the :code:`~/.bashrc` file of your account or in :code:`/etc/profile`
-# for system-wide environment variables. Assuming your TVM directory is in
-# :code:`~/tvm`, set the environment variables below for your account.
-#
-# .. code-block:: bash
-#
-#   vi ~/.bashrc
-#
-# We need to edit :code:`~/.bashrc` using :code:`vi ~/.bashrc` and add the
-# lines below (assuming your TVM directory is in :code:`~/tvm`):
-#
-# .. code-block:: bash
-#
-#   export TVM_HOME=~/tvm
-#   export PATH=$PATH:$TVM_HOME/lib
-#   export PYTHONPATH=$PYTHONPATH:$TVM_HOME/python
-#
-# To enable the updated :code:`~/.bashrc`, execute :code:`source ~/.bashrc`.
+#   All instructions in both this section and the next section should be
+#   executed on the target device, e.g. RK3399, and we assume it
+#   is running Linux.
+#
+# Since we do compilation on the local machine, the remote device is only used
+# for running the generated code. We only need to build the tvm runtime on
+# the remote device. Make sure you have the OpenCL driver on your board.
+# You can refer to this `tutorial <https://gist.github.com/mli/585aed2cec0b5178b1a510f9f236afa2>`_
+# to setup the OS and OpenCL driver for RK3399.
+#
+# .. code-block:: bash
+#
+#   git clone --recursive https://github.com/dmlc/tvm
+#   cd tvm
+#   cp cmake/config.cmake .
+#   sed -i "s/USE_OPENCL OFF/USE_OPENCL ON/" config.cmake
+#   make runtime -j4
+#
+# After building the runtime successfully, we need to set environment variables
+# in the :code:`~/.bashrc` file. We can edit :code:`~/.bashrc`
+# using :code:`vi ~/.bashrc` and add the line below (assuming your TVM
+# directory is in :code:`~/tvm`):
+#
+# .. code-block:: bash
+#
+#   export PYTHONPATH=$PYTHONPATH:~/tvm/python
+#
+# To update the environment variables, execute :code:`source ~/.bashrc`.

 ######################################################################
 # Set Up RPC Server on Device
 # ---------------------------
-# To set up a TVM RPC server on your ARM device (our remote device),
-# we have prepared a one-line script, so you only need to run this
-# command after following the installation guide to install TVM on
-# your device:
+# To start an RPC server, run the following command on your remote device
+# (which is RK3399 in our example).
 #
 # .. code-block:: bash
 #
 #   python -m tvm.exec.rpc_server --host 0.0.0.0 --port=9090
 #
-# After executing the command above, if you see the lines below, the
-# RPC server started successfully on your device.
+# If you see the line below, it means the RPC server started
+# successfully on your device.
 #
 # .. code-block:: bash
 #
-#   Loading runtime library /home/YOURNAME/code/tvm/lib/libtvm_runtime.so... exec only
 #   INFO:root:RPCServer: bind to 0.0.0.0:9090
 #
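Before compiling anything, it can save time to confirm the session from the host; a hedged sketch (the address is hypothetical, substitute your board's IP):

    from tvm import rpc

    remote = rpc.connect('10.77.1.145', 9090)  # hypothetical address
    # a usable session exposes the remote device contexts
    print(remote.cl(0).exist)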
 ######################################################################
-# For demonstration, we simply start an RPC server on the same machine,
-# if :code:`use_mali` is False. If you have set up the remote
-# environment, please change the three lines below: change
-# :code:`use_mali` to True, and change the :code:`host` and :code:`port`
-# to your device's host address and port number.
-
-use_mali = False
-host = '10.42.0.96'
-port = 9090
-
-if not use_mali:
-    # run server locally
-    host = 'localhost'
-    port = 9095
-    server = rpc.Server(host=host, port=port, use_popen=True)
-
-######################################################################
-# Prepare the Pretrained Model
-# ----------------------------
-# Back to the host machine, firstly, we need to download a MXNet Gluon
-# ResNet model from the model zoo, which is pretrained on ImageNet. You
-# can find more details about this part at `Compile MXNet Models`
+# Prepare the Pre-trained Model
+# -----------------------------
+# Back to the host machine, which should have a full TVM installed (with LLVM).
+#
+# We will use a pre-trained model from the
+# `MXNet Gluon model zoo <https://mxnet.incubator.apache.org/api/python/gluon/model_zoo.html>`_.
+# You can find more details about this part at tutorial :ref:`tutorial-from-mxnet`.

 from mxnet.gluon.model_zoo.vision import get_model
 from mxnet.gluon.utils import download
@@ -135,7 +104,6 @@ def transform_image(image):
 x = transform_image(image)

-
 ######################################################################
 # synset is used to transform the label from the ImageNet class index to
 # a word humans can understand.
@@ -143,6 +111,7 @@ synset_url = ''.join(['https://gist.githubusercontent.com/zhreshold/',
                       '4d0b62f3d01426887599d4f7ede23ee5/raw/',
                       '596b27d23537e5a1b5751d2b0481ef172f58b539/',
                       'imagenet1000_clsid_to_human.txt'])
+
 synset_name = 'synset.txt'
 download(synset_url, synset_name)
 with open(synset_name) as f:
@@ -176,21 +145,29 @@ out_shape = (batch_size, num_classes)
 # triplet for host ARM device by setting the parameter :code:`target_host`.

 ######################################################################
-# If we run the example locally for demonstration, we can simply set
-# it as :code:`llvm`. To run it on the ARM device, you need to specify
-# its instruction set. Here is the option I use for my Firefly-RK3399.
-
-if use_mali:
-    target_host = "llvm -target=aarch64-linux-gnu -mattr=+neon"
-    target = tvm.target.mali()
-else:
-    target_host = "llvm"
-    target = tvm.target.cuda()
-
-# set target as `tvm.target.mali` instead of 'opencl' to enable
-# target-specified optimization
-graph, lib, params = nnvm.compiler.build(net, target=target,
-        shape={"data": data_shape}, params=params, target_host=target_host)
+# If we run the example on our x86 server for demonstration, we can simply
+# set it as :code:`llvm`. If running it on the RK3399, we need to
+# specify its instruction set. Set :code:`local_demo` to False if you
+# want to run this tutorial with a real device.
+
+local_demo = True
+
+if local_demo:
+    target_host = "llvm"
+    target = "llvm"
+else:
+    # Here is the setting for my rk3399 board. If you don't use rk3399,
+    # you can query your target triple by executing `gcc -v` on your board.
+    target_host = "llvm -target=aarch64-linux-gnu"
+
+    # set target as `tvm.target.mali` instead of 'opencl' to enable
+    # optimization for mali
+    target = tvm.target.mali()
+
+with nnvm.compiler.build_config(opt_level=2):
+    graph, lib, params = nnvm.compiler.build(net, target=target,
+            shape={"data": data_shape}, params=params, target_host=target_host)
@@ -207,14 +184,20 @@ lib.export_library(lib_fname)
 # With RPC, you can deploy the model remotely from your host machine
 # to the remote device.

-# connect the server
-remote = rpc.connect(host, port)
+# obtain an RPC session from the remote device
+if local_demo:
+    remote = rpc.LocalSession()
+else:
+    # The following is my environment, change this to the IP address of your target device
+    host = '10.77.1.145'
+    port = 9090
+    remote = rpc.connect(host, port)

 # upload the library to remote device and load it
 remote.upload(lib_fname)
 rlib = remote.load_module('net.tar')

-ctx = remote.cl(0) if use_mali else remote.gpu(0)
+ctx = remote.cpu(0) if local_demo else remote.cl(0)
 # upload the parameter
 rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}
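The hunk skipped just below creates the runtime module; for completeness, a hedged sketch of that elided step using the names defined above (`graph`, `rlib`, `ctx`, `rparams`, `x`, `out_shape`):

    # hedged sketch of the elided graph runtime steps
    module = runtime.create(graph, rlib, ctx)
    module.set_input(**rparams)
    module.set_input('data', tvm.nd.array(x.astype('float32')))
    module.run()
    out = module.get_output(0, tvm.nd.empty(out_shape, ctx=ctx))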
@@ -231,7 +214,3 @@ out = module.get_output(0, tvm.nd.empty(out_shape, ctx=ctx))
 # get top1 result
 top1 = np.argmax(out.asnumpy())
 print('TVM prediction top-1: {}'.format(synset[top1]))
-
-if not use_mali:
-    # terminate the local server
-    server.terminate()
@@ -7,9 +7,8 @@ Deploy the Pretrained Model on Raspberry Pi

 This is an example of using NNVM to compile a ResNet model and deploy
 it on raspberry pi.
-
-To begin with, we import nnvm (for compilation) and TVM (for deployment).
 """

 import tvm
 import nnvm.compiler
 import nnvm.testing
@@ -17,102 +16,73 @@ from tvm import rpc
 from tvm.contrib import util, graph_runtime as runtime

 ######################################################################
+# .. _build-tvm-runtime-on-device:
+#
 # Build TVM Runtime on Device
 # ---------------------------
 #
-# There are some prerequisites: we need to build the tvm runtime and set up
-# an RPC server on the remote device.
-#
-# To get started, clone the tvm repo from github. It is important to clone
-# the submodules along, with the --recursive option (assuming you are in
-# your home directory):
-#
-# .. code-block:: bash
-#
-#   git clone --recursive https://github.com/dmlc/tvm
+# The first step is to build the tvm runtime on the remote device.
 #
 # .. note::
 #
-#   Usually the device has limited resources and we only need to build the
-#   runtime. The idea is we will use the TVM compiler on the local server
-#   to compile and upload the compiled program to the device and run
-#   the device function remotely.
+#   All instructions in both this section and the next section should be
+#   executed on the target device, e.g. Raspberry Pi, and we assume it
+#   is running Linux.
 #
-# .. code-block:: bash
-#
-#   make runtime
-#
-# After building the runtime successfully, we need to set environment variables
-# in the :code:`~/.bashrc` file of your account or in :code:`/etc/profile`
-# for system-wide environment variables. Assuming your TVM directory is in
-# :code:`~/tvm`, set the environment variables below for your account.
+# Since we do compilation on the local machine, the remote device is only used
+# for running the generated code. We only need to build the tvm runtime on
+# the remote device.
 #
 # .. code-block:: bash
 #
-#   vi ~/.bashrc
-#
-# We need to edit :code:`~/.bashrc` using :code:`vi ~/.bashrc` and add the
-# lines below (assuming your TVM directory is in :code:`~/tvm`):
-#
-# .. code-block:: bash
-#
-#   export TVM_HOME=~/tvm
-#   export PATH=$PATH:$TVM_HOME/lib
-#   export PYTHONPATH=$PYTHONPATH:$TVM_HOME/python
+#   git clone --recursive https://github.com/dmlc/tvm
+#   cd tvm
+#   make runtime -j4
 #
-# To enable the updated :code:`~/.bashrc`, execute :code:`source ~/.bashrc`.
+# After building the runtime successfully, we need to set environment variables
+# in the :code:`~/.bashrc` file. We can edit :code:`~/.bashrc`
+# using :code:`vi ~/.bashrc` and add the line below (assuming your TVM
+# directory is in :code:`~/tvm`):
+#
+# .. code-block:: bash
+#
+#   export PYTHONPATH=$PYTHONPATH:~/tvm/python
+#
+# To update the environment variables, execute :code:`source ~/.bashrc`.

 ######################################################################
 # Set Up RPC Server on Device
 # ---------------------------
-# To set up a TVM RPC server on the Raspberry Pi (our remote device),
-# we have prepared a one-line script, so you only need to run this
-# command after following the installation guide to install TVM on
-# your device:
+# To start an RPC server, run the following command on your remote device
+# (which is Raspberry Pi in our example).
 #
 # .. code-block:: bash
 #
 #   python -m tvm.exec.rpc_server --host 0.0.0.0 --port=9090
 #
-# After executing the command above, if you see the lines below, the
-# RPC server started successfully on your device.
+# If you see the line below, it means the RPC server started
+# successfully on your device.
 #
 # .. code-block:: bash
 #
-#   Loading runtime library /home/YOURNAME/code/tvm/lib/libtvm_runtime.so... exec only
 #   INFO:root:RPCServer: bind to 0.0.0.0:9090
+#

 ######################################################################
-# For demonstration, we simply start an RPC server on the same machine,
-# if :code:`use_rasp` is False. If you have set up the remote
-# environment, please change the three lines below: change
-# :code:`use_rasp` to True, and change the :code:`host` and :code:`port`
-# to your device's host address and port number.
-
-use_rasp = False
-host = 'rasp0'
-port = 9090
-
-if not use_rasp:
-    # run server locally
-    host = 'localhost'
-    port = 9091
-    server = rpc.Server(host=host, port=port, use_popen=True)
-
-######################################################################
-# Prepare the Pretrained Model
-# ----------------------------
-# Back to the host machine, firstly, we need to download a MXNet Gluon
-# ResNet model from the model zoo, which is pretrained on ImageNet. You
-# can find more details about this part at `Compile MXNet Models`
+# Prepare the Pre-trained Model
+# -----------------------------
+# Back to the host machine, which should have a full TVM installed (with LLVM).
+#
+# We will use a pre-trained model from the
+# `MXNet Gluon model zoo <https://mxnet.incubator.apache.org/api/python/gluon/model_zoo.html>`_.
+# You can find more details about this part at tutorial :ref:`tutorial-from-mxnet`.

 from mxnet.gluon.model_zoo.vision import get_model
 from mxnet.gluon.utils import download
 from PIL import Image
 import numpy as np

-# only one line to get the model
+# one line to get the model
 block = get_model('resnet18_v1', pretrained=True)

 ######################################################################
@@ -131,7 +101,6 @@ def transform_image(image):
 x = transform_image(image)

-
 ######################################################################
 # synset is used to transform the label from the ImageNet class index to
 # a word humans can understand.
@@ -173,29 +142,32 @@ out_shape = (batch_size, num_classes)
 # will lead to very different performance.

 ######################################################################
-# If we run the example locally for demonstration, we can simply set
-# it as :code:`llvm`. To run it on the Raspberry Pi, you need to
-# specify its instruction set. Here is the option I use for my Raspberry
-# Pi, which has proved to be a good compilation configuration.
-
-if use_rasp:
-    target = tvm.target.rasp()
-else:
-    target = tvm.target.create('llvm')
-
-graph, lib, params = nnvm.compiler.build(
-    net, target, shape={"data": data_shape}, params=params)
+# If we run the example on our x86 server for demonstration, we can simply
+# set it as :code:`llvm`. If running it on the Raspberry Pi, we need to
+# specify its instruction set. Set :code:`local_demo` to False if you want
+# to run this tutorial with a real device.
+
+local_demo = True
+
+if local_demo:
+    target = tvm.target.create('llvm')
+else:
+    target = tvm.target.arm_cpu('rasp3b')
+    # The above line is a simple form of
+    # target = tvm.target.create('llvm -device=arm_cpu -target=armv7l-linux-gnueabihf')
+
+with nnvm.compiler.build_config(opt_level=2, add_pass=['AlterOpLayout']):
+    graph, lib, params = nnvm.compiler.build(
+        net, target, shape={"data": data_shape}, params=params)

 # After `nnvm.compiler.build`, you will get three return values: graph,
 # library and the new parameter, since we do some optimization that will
-# change the parameters but keep the result of the model the same.
+# change the parameters but keep the result of the model the same.

 # Save the library at a local temporary directory.
 tmp = util.tempdir()
-lib_fname = tmp.relpath('net.o')
-lib.save(lib_fname)
+lib_fname = tmp.relpath('net.tar')
+lib.export_library(lib_fname)
@@ -203,15 +175,21 @@ lib.save(lib_fname)
 # With RPC, you can deploy the model remotely from your host machine
 # to the remote device.

-# connect the server
-remote = rpc.connect(host, port)
+# obtain an RPC session from the remote device
+if local_demo:
+    remote = rpc.LocalSession()
+else:
+    # The following is my environment, change this to the IP address of your target device
+    host = '10.77.1.162'
+    port = 9090
+    remote = rpc.connect(host, port)

 # upload the library to remote device and load it
 remote.upload(lib_fname)
-rlib = remote.load_module('net.o')
+rlib = remote.load_module('net.tar')

+# upload the parameter (this may take a while)
 ctx = remote.cpu(0)
-# upload the parameter
 rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}

 # create the remote runtime module
@@ -227,7 +205,3 @@ out = module.get_output(0, tvm.nd.empty(out_shape, ctx=ctx))
 # get top1 result
 top1 = np.argmax(out.asnumpy())
 print('TVM prediction top-1: {}'.format(synset[top1]))
-
-if not use_rasp:
-    # terminate the local server
-    server.terminate()
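To judge whether the pre-tuned rasp3b schedules pay off, one can time a full inference on the device; a hedged sketch assuming the `module` and `ctx` created above (`module.module` is the underlying TVM module behind the graph runtime wrapper):

    # hedged sketch: mean wall time of one inference, measured on device
    ftimer = module.module.time_evaluator('run', ctx, number=3)
    print('%.2f ms per inference' % (ftimer().mean * 1e3))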
@@ -1,4 +1,6 @@
 """
+.. _tutorial-from-mxnet:
+
 Compile MXNet Models
 ====================
 **Author**: `Joshua Z. Zhang <https://zhreshold.github.io/>`_
@@ -1,89 +0,0 @@
-"""
-Compile GPU Inference
-=====================
-**Author**: `Yuwei Hu <https://huyuwei.github.io/>`_
-
-This is an example of using NNVM to compile MobileNet/ResNet model and deploy its inference on GPU.
-
-To begin with, we import nnvm (for compilation) and TVM (for deployment).
-"""
-import tvm
-import numpy as np
-from tvm.contrib import nvcc, graph_runtime
-import nnvm.compiler
-import nnvm.testing
-
-######################################################################
-# Register the NVCC Compiler Option
-# ---------------------------------
-# NNVM optimizes the graph and relies on TVM to generate fast GPU code.
-# To get the maximum performance, we need to enable nvcc's compiler hook.
-# This usually gives better performance than nvrtc mode.
-
-@tvm.register_func("tvm_callback_cuda_compile", override=True)
-def tvm_callback_cuda_compile(code):
-    ptx = nvcc.compile_cuda(code, target="ptx")
-    return ptx
-
-######################################################################
-# Prepare the Benchmark
-# ---------------------
-# We construct a standard imagenet inference benchmark.
-# NNVM needs two things to compile a deep learning model:
-#
-# - net: the graph representation of the computation
-# - params: a dictionary of str to parameters
-#
-# We use nnvm's testing utility to produce the model description and random parameters
-# so that the example does not depend on a specific front-end framework.
-#
-# .. note::
-#
-#   In a typical workflow, we can get this pair from :any:`nnvm.frontend`
-#
-target = "cuda"
-ctx = tvm.gpu(0)
-batch_size = 1
-num_classes = 1000
-image_shape = (3, 224, 224)
-data_shape = (batch_size,) + image_shape
-out_shape = (batch_size, num_classes)
-# To use ResNet to do inference, run the following instead
-#net, params = nnvm.testing.resnet.get_workload(
-#    batch_size=1, image_shape=image_shape)
-net, params = nnvm.testing.mobilenet.get_workload(
-    batch_size=1, image_shape=image_shape)
-
-######################################################################
-# Compile the Graph
-# -----------------
-# To compile the graph, we call the build function with the graph
-# configuration and parameters.
-# When parameters are provided, NNVM will pre-compute certain parts of the graph if possible (e.g. simplify batch normalization to scale shift),
-# and return the updated parameters.
-
-graph, lib, params = nnvm.compiler.build(
-    net, target, shape={"data": data_shape}, params=params)
-
-######################################################################
-# Run the Compiled Module
-# -----------------------
-#
-# To deploy the module, we call :any:`tvm.contrib.graph_runtime.create` passing in the graph, the lib, and context.
-# Thanks to TVM, we can deploy the compiled module to many platforms and languages.
-# The deployment module is designed to contain minimum dependencies.
-# This example runs on the same machine.
-#
-# Note that the code below no longer depends on NNVM, and only relies on TVM's runtime to run (deploy).
-data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
-module = graph_runtime.create(graph, lib, ctx)
-# set input
-module.set_input(**params)
-module.set_input("data", data)
-# run
-module.run()
-# get output
-out = module.get_output(0, tvm.nd.empty(out_shape))
-# convert to numpy
-out.asnumpy()
@@ -6,9 +6,8 @@ Quick Start Tutorial for Compiling Deep Learning Models
 **Author**: `Yao Wang <https://github.com/kevinthesun>`_

 This example shows how to build a neural network with NNVM python frontend and
-generate runtime library for Nvidia GPU and Raspberry Pi with TVM.
-To run this notebook, you need to install tvm and nnvm.
-Notice that you need to build tvm with cuda and llvm.
+generate runtime library for Nvidia GPU with TVM.
+Notice that you need to build TVM with cuda and llvm enabled.
 """

 ######################################################################
@@ -22,10 +21,13 @@ Notice that you need to build TVM with cuda and llvm enabled.
 #
 # In this tutorial, we'll choose cuda and llvm as target backends.
 # To begin with, let's import NNVM and TVM.
-import tvm
+
+import numpy as np
+
 import nnvm.compiler
 import nnvm.testing
+import tvm
+from tvm.contrib import graph_runtime

 ######################################################################
 # Define Neural Network in NNVM
@@ -33,7 +35,8 @@ import nnvm.testing
 # First, let's define a neural network with nnvm python frontend.
 # For simplicity, we'll use pre-defined resnet-18 network in NNVM.
 # Parameters are initialized with Xavier initializer.
-# NNVM also supports other model formats such as MXNet, CoreML and ONNX.
+# NNVM also supports other model formats such as MXNet, CoreML, ONNX and
+# Tensorflow.
 #
 # In this tutorial, we assume we will do inference on our device
 # and the batch size is set to be 1. Input images are RGB color
@@ -46,7 +49,8 @@ image_shape = (3, 224, 224)
 data_shape = (batch_size,) + image_shape
 out_shape = (batch_size, num_class)

-net, params = nnvm.testing.resnet.get_workload(batch_size=batch_size, image_shape=image_shape)
+net, params = nnvm.testing.resnet.get_workload(layers=18,
+    batch_size=batch_size, image_shape=image_shape)
 print(net.debug_str())
@@ -54,10 +58,8 @@ print(net.debug_str())
 # -----------
 # Next step is to compile the model using the NNVM/TVM pipeline.
 # Users can specify the optimization level of the compilation.
-# Currently this value can be 0 to 2, which corresponds to
-# "SimplifyInference", "OpFusion" and "PrecomputePrune" respectively.
-# In this example we set optimization level to be 0
-# and use Raspberry Pi as compile target.
+# Currently this value can be 0 to 3. The optimization passes include
+# operator fusion, pre-computation, layout transformation and so on.
 #
 # :any:`nnvm.compiler.build` returns three components: the execution graph in
 # json format, the TVM module library of compiled functions specifically
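A hedged illustration of the knob described above (not part of the commit): the level is applied through `build_config`, so comparing levels only means rebuilding under a different scope; level 0 keeps the graph close to the frontend definition, while level 3 enables fusion, pre-computation and layout transformation.

    # hedged sketch, assuming `net`, `params` and `data_shape` from above
    with nnvm.compiler.build_config(opt_level=0):
        graph0, lib0, params0 = nnvm.compiler.build(
            net, tvm.target.cuda(), shape={"data": data_shape}, params=params)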
@@ -68,24 +70,50 @@ print(net.debug_str())
 #
 # We'll first compile for Nvidia GPU. Behind the scene, `nnvm.compiler.build`
 # first does a number of graph-level optimizations, e.g. pruning, fusing, etc.,
-# then registers the operators (i.e. the nodes of the optmized graphs) to
+# then registers the operators (i.e. the nodes of the optimized graphs) to
 # TVM implementations to generate a `tvm.module`.
-# To generate the module library, TVM will first transfer the HLO IR into the lower
-# intrinsic IR of the specified target backend, which is CUDA in this example.
-# Then the machine code will be generated as the module library.
+# To generate the module library, TVM will first transfer the high-level IR
+# into the lower intrinsic IR of the specified target backend, which is CUDA
+# in this example. Then the machine code will be generated as the module library.

-opt_level = 0
+opt_level = 3
 target = tvm.target.cuda()
 with nnvm.compiler.build_config(opt_level=opt_level):
     graph, lib, params = nnvm.compiler.build(
         net, target, shape={"data": data_shape}, params=params)

-######################################################################
-# Save Compiled Module
-# ----------------------------
-# After compilation, we can save the graph, lib and params into separate files
-# and deploy them to Nvidia GPU.
+#####################################################################
+# Run the generated library
+# -------------------------
+# Now we can create a graph runtime and run the module on the Nvidia GPU.
+
+# create random input
+ctx = tvm.gpu()
+data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
+# create module
+module = graph_runtime.create(graph, lib, ctx)
+# set input and parameters
+module.set_input("data", data)
+module.set_input(**params)
+# run
+module.run()
+# get output
+out = module.get_output(0, tvm.nd.empty(out_shape))
+# convert to numpy
+out.asnumpy()
+
+# Print first 10 elements of output
+print(out.asnumpy().flatten()[0:10])
+
+######################################################################
+# Save and Load Compiled Module
+# -----------------------------
+# We can also save the graph, lib and parameters into files and load them
+# back in a development environment.
+
+####################################################
+
+# save the graph, lib and params into separate files
 from tvm.contrib import util

 temp = util.tempdir()
@@ -97,95 +125,17 @@ with open(temp.relpath("deploy_param.params"), "wb") as fo:
     fo.write(nnvm.compiler.save_param_dict(params))
 print(temp.listdir())

-######################################################################
-# Deploy locally to Nvidia GPU
-# ------------------------------
-# Now we can load the module back.
-
-import numpy as np
-from tvm.contrib import graph_runtime
-
-loaded_lib = tvm.module.load(path_lib)
+####################################################
+
+# load the module back
 loaded_json = open(temp.relpath("deploy_graph.json")).read()
+loaded_lib = tvm.module.load(path_lib)
 loaded_params = bytearray(open(temp.relpath("deploy_param.params"), "rb").read())
+input_data = tvm.nd.array(np.random.uniform(size=data_shape).astype("float32"))

 module = graph_runtime.create(loaded_json, loaded_lib, tvm.gpu(0))
 module.load_params(loaded_params)
-
-input_data = tvm.nd.array(np.random.uniform(size=data_shape).astype("float32"))
 module.run(data=input_data)
-
 out = module.get_output(0, out=tvm.nd.empty(out_shape))
-# Print first 10 elements of output
-print(out.asnumpy()[0][0:10])
-
-######################################################################
-# Compile and Deploy the Model to Raspberry Pi Remotely with RPC
-# --------------------------------------------------------------
-# Following the steps above, we can also compile the model for Raspberry Pi.
-# TVM provides the rpc module to help with remote deploying.
-#
-# For demonstration, we simply start an RPC server on the same machine,
-# if :code:`use_rasp` is False. If you have set up the remote
-# environment, please change the three lines below: change
-# :code:`use_rasp` to True, and change the host and port to your
-# device's host address and port number.
-
-# If we run the example locally for demonstration, we can simply set the
-# compilation target as `llvm`.
-# To run it on the Raspberry Pi, you need to specify its instruction set.
-# `llvm -target=armv7l-none-linux-gnueabihf -mcpu=cortex-a53 -mattr=+neon`
-# is the recommended compilation configuration, thanks to Ziheng's work.
-
-from tvm import rpc
-
-use_rasp = False
-host = 'rasp0'
-port = 9090
-
-if not use_rasp:
-    # run server locally
-    host = 'localhost'
-    port = 9099
-    server = rpc.Server(host=host, port=port, use_popen=True)
-
-# compile and save model library
-if use_rasp:
-    target = "llvm -target=armv7l-none-linux-gnueabihf -mcpu=cortex-a53 -mattr=+neon"
-else:
-    target = "llvm"
-# use `with tvm.target.rasp` for some target-specified optimization
-with tvm.target.rasp():
-    graph, lib, params = nnvm.compiler.build(
-        net, target, shape={"data": data_shape}, params=params)
-
-temp = util.tempdir()
-path_lib = temp.relpath("deploy_lib_rasp.o")
-lib.save(path_lib)
-
-# connect the server
-remote = rpc.connect(host, port)
-
-# upload the library to remote device and load it
-remote.upload(path_lib)
-rlib = remote.load_module('deploy_lib_rasp.o')
-
-ctx = remote.cpu(0)
-# upload the parameter
-rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}
-
-# create the remote runtime module
-module = graph_runtime.create(graph, rlib, ctx)
-# set parameter
-module.set_input(**rparams)
-# set input data
-input_data = np.random.uniform(size=data_shape)
-module.set_input('data', tvm.nd.array(input_data.astype('float32')))
-# run
-module.run()
-
-out = module.get_output(0, out=tvm.nd.empty(out_shape, ctx=ctx))
-# Print first 10 elements of output
-print(out.asnumpy()[0][0:10])
-
-if not use_rasp:
-    # terminate the local server
-    server.terminate()
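In a fresh deployment process only the TVM runtime is needed to serve the saved artifacts; nnvm is not imported at all. A hedged sketch with hypothetical paths standing in for the files written above:

    # hedged sketch; the paths are hypothetical placeholders
    import numpy as np
    import tvm
    from tvm.contrib import graph_runtime

    lib = tvm.module.load('deploy_lib.tar')            # hypothetical path
    graph_json = open('deploy_graph.json').read()      # hypothetical path
    params_bytes = bytearray(open('deploy_param.params', 'rb').read())

    m = graph_runtime.create(graph_json, lib, tvm.gpu(0))
    m.load_params(params_bytes)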
@@ -1,57 +1,21 @@
-# pylint: disable=invalid-name,unused-variable,invalid-name
-"""Conv2D schedule ported from RASP
-
-Used for CPU conv2d
-"""
-from __future__ import absolute_import as _abs
-
-from topi.nn.conv2d import conv2d, _get_schedule
-from topi.nn.conv2d import SpatialPack, Im2ColPack, Workload
-from topi.rasp import conv2d as _rasp_conv2d
-from topi import generic
-
-_WORKLOADS = [
-    Workload('float32', 'float32', 224, 224, 3, 64, 7, 7, 3, 3, 2, 2),
-    Workload('int8', 'int32', 224, 224, 3, 64, 7, 7, 3, 3, 2, 2),
-    Workload('int8', 'int32', 56, 56, 64, 64, 3, 3, 1, 1, 1, 1),
-    Workload('int8', 'int32', 56, 56, 64, 64, 1, 1, 0, 0, 1, 1),
-    Workload('int8', 'int32', 56, 56, 64, 128, 3, 3, 1, 1, 2, 2),
-    Workload('int8', 'int32', 56, 56, 64, 128, 1, 1, 0, 0, 2, 2),
-    Workload('int8', 'int32', 28, 28, 128, 128, 3, 3, 1, 1, 1, 1),
-    Workload('int8', 'int32', 28, 28, 128, 256, 3, 3, 1, 1, 2, 2),
-    Workload('int8', 'int32', 28, 28, 128, 256, 1, 1, 0, 0, 2, 2),
-    Workload('int8', 'int32', 14, 14, 256, 256, 3, 3, 1, 1, 1, 1),
-    Workload('int8', 'int32', 14, 14, 256, 512, 3, 3, 1, 1, 2, 2),
-    Workload('int8', 'int32', 14, 14, 256, 512, 1, 1, 0, 0, 2, 2),
-    Workload('int8', 'int32', 7, 7, 512, 512, 3, 3, 1, 1, 1, 1),
-]
-_SCHEDULES = [
-    # float32 imagenet
-    SpatialPack(1, 8, 4, 1, 4, True),
-    SpatialPack(1, 8, 4, 1, 4, True),
-    SpatialPack(1, 7, 4, 2, 4, True),
-    SpatialPack(1, 4, 8, 4, 1, True),
-    SpatialPack(1, 4, 4, 1, 16, False),
-    SpatialPack(1, 4, 8, 4, 8, False),
-    SpatialPack(1, 7, 4, 3, 8, True),
-    SpatialPack(1, 2, 8, 1, 8, True),
-    SpatialPack(2, 1, 16, 1, 4, True),
-    SpatialPack(1, 7, 4, 1, 1, True),
-    Im2ColPack(7, 4, 1, 16, True),
-    Im2ColPack(7, 4, 1, 8, False),
-    Im2ColPack(7, 4, 1, 16, False),
-]
-
-@_get_schedule.register(["vtacpu", "vta"])
-def _schedule_conv2d(wkl):
-    if wkl not in _WORKLOADS:
-        raise ValueError("no schedule for such workload: {}".format(wkl))
-    idx = _WORKLOADS.index(wkl)
-    sch = _SCHEDULES[idx]
-    return sch
-
-conv2d.register(["vtacpu", "vta"], _rasp_conv2d._declaration_conv2d)
-
-generic.schedule_conv2d_nchw.register(
-    ["vtacpu", "vta"],
-    _rasp_conv2d.schedule_conv2d_nchw)
+"""Reuse conv2d schedule from ARM CPU"""
+
+import tvm
+
+from topi.nn import conv2d, conv2d_alter_layout
+from topi import generic
+
+
+@conv2d.register(["vtacpu", "vta"])
+def compute(*args, **kwargs):
+    with tvm.target.arm_cpu("vtacpu"):
+        return conv2d(*args, **kwargs)
+
+
+@generic.schedule_conv2d_nchw.register(["vtacpu", "vta"])
+def schedule(*args, **kwargs):
+    with tvm.target.arm_cpu("vtacpu"):
+        return generic.schedule_conv2d_nchw(*args, **kwargs)
+
+
+@conv2d_alter_layout.register(["vtacpu", "vta"])
+def alter(*args, **kwargs):
+    with tvm.target.arm_cpu("vtacpu"):
+        return conv2d_alter_layout(*args, **kwargs)
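The three registrations above share one pattern: re-enter the generic TOPI entry point under an `arm_cpu` target context, so the ARM CPU implementation is reused verbatim for the VTA CPU target. A hedged sketch of the same pattern applied to a hypothetical extra operator (dense), not part of the commit:

    import tvm
    from topi import generic

    @generic.schedule_dense.register(["vtacpu", "vta"])
    def schedule_dense(*args, **kwargs):
        # hypothetical: the dispatch re-enters under arm_cpu, so the generic
        # call resolves to the ARM CPU schedule instead of recursing here
        with tvm.target.arm_cpu("vtacpu"):
            return generic.schedule_dense(*args, **kwargs)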
@@ -244,8 +244,11 @@ def is_packed_layout(layout):
     return False

 @reg.register_alter_op_layout("conv2d", level=15)
-def alter_conv2d_layout(*_):
-    return None
+def alter_conv2d_layout(attrs, inputs, out):
+    layout = attrs['layout']
+    if is_packed_layout(layout):
+        return None
+    return _nn.alter_conv2d_layout(attrs, inputs, out)


 @reg.register_compute("conv2d", level=15)
@@ -368,7 +371,6 @@ def schedule_packed_conv2d(outs):
     oshape = topi.util.get_const_tuple(output.shape)
     s = tvm.create_schedule(output.op)
 
-
     # setup pad
     if pad_data is not None:
         cdata = pad_data
@@ -394,7 +396,6 @@ def schedule_packed_conv2d(outs):
     h_factor = (plan.h_factor if plan.h_factor else oshape[2])
     w_factor = (plan.w_factor if plan.w_factor else oshape[3])
 
-
     x_bo, x_co, x_i, x_j, x_bi, x_ci = s[output].op.axis
     x_co0, x_co1 = s[output].split(x_co, factor=oc_factor)
     x_i0, x_i1 = s[output].split(x_i, factor=h_factor)
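The splits above tile the output-channel and spatial axes by the factors chosen in the plan. For readers unfamiliar with the primitive, here is a standalone sketch of `split` in the TVM schedule API of this era; shapes and names are illustrative:

```python
import tvm

A = tvm.placeholder((64, 64), name="A")
B = tvm.compute((64, 64), lambda i, j: A[i, j] * 2, name="B")
s = tvm.create_schedule(B.op)

# Split the first output axis by a fixed factor, exactly as
# schedule_packed_conv2d does with oc_factor and h_factor.
xo, xi = s[B].split(B.op.axis[0], factor=8)
```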
@@ -459,6 +460,7 @@ class Conv2DSchedule(object):
         self.oc_nthread = oc_nthread
         self.h_nthread = h_nthread
         self.debug_sync = debug_sync
+
     def __str__(self):
         return "{}.{}.{}.{}.{}.{}.{}".format(
             self.b_factor, self.oc_factor, self.ic_factor,
@@ -483,7 +485,6 @@ RESNET = {
     11: Workload(1, 7, 7, 512, 512, 3, 3, 1, 1, 1, 1),
 }
 
-_WL2PLAN = {}
 for idx in RESNET:
     scheds = find_schedules(RESNET[idx], vt_only=True, best_only=True)[0]
     _WL2PLAN[RESNET[idx]] = scheds
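With this table in place, a schedule plan is looked up by workload. A hedged usage sketch follows; key `11` is the last ResNet workload defined above, and since this hunk drops the local `_WL2PLAN = {}` initializer, the dict is presumably created earlier in the revised file:

```python
wkl = RESNET[11]      # Workload(1, 7, 7, 512, 512, 3, 3, 1, 1, 1, 1)
plan = _WL2PLAN[wkl]  # best Conv2DSchedule returned by find_schedules
print(plan)           # dot-separated factors, per Conv2DSchedule.__str__
```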
@@ -1,6 +1,7 @@
 """Testing if we can generate code in topi style"""
 
 import tvm
+from tvm import autotvm
 from tvm.contrib import util
 from tvm.contrib.pickle_memoize import memoize
 import topi
@@ -62,8 +63,7 @@ def test_cpu_conv2d():
 
     def verify(s, check_correctness):
         mod = tvm.build(s, [data, kernel, res],
-                        "llvm -device=vtacpu",
-                        env.target_host,
+                        target_host=env.target_host,
                         name="conv2d")
         temp = util.tempdir()
         mod.save(temp.relpath("conv2d.o"))
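Passing `target_host` by keyword lets the device target come from the enclosing `tvm.target.create(...)` context while the host side stays pinned explicitly. A hedged, self-contained sketch of the same call shape; the trivial compute and target strings here are illustrative:

```python
import tvm

n = tvm.var("n")
A = tvm.placeholder((n,), name="A")
B = tvm.compute((n,), lambda i: A[i] + 1, name="B")
s = tvm.create_schedule(B.op)

with tvm.target.create("llvm -device=vtacpu"):
    # device target is taken from the context; host target is explicit
    mod = tvm.build(s, [A, B], target_host="llvm", name="add_one")
```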
@@ -126,7 +126,11 @@ def test_cpu_conv2d():
             print(wl)
             with tvm.target.create("llvm -device=vtacpu"):
                 run_cpu_conv2d(env, remote, key, batch_size, wl)
-    vta.testing.run(_run)
+
+    # load pre-tuned operator parameters for ARM CPU
+    autotvm.tophub.check_package('vta')
+    with autotvm.tophub.context('llvm -device=vtacpu'):
+        vta.testing.run(_run)
 
 
 def test_vta_conv2d():
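This wrapper is what actually activates the pre-tuned ARM CPU configurations: `check_package` fetches the `vta` parameter package if it is not cached, and the `tophub.context` makes those configurations visible to AutoTVM during compilation. A minimal sketch of the same pattern outside the test harness; `build_and_run_conv2d` is a hypothetical helper standing in for any compilation step:

```python
from tvm import autotvm

autotvm.tophub.check_package('vta')  # download once, then served from cache
with autotvm.tophub.context('llvm -device=vtacpu'):
    # any conv2d compiled inside this context picks up the
    # pre-tuned schedule configurations from TopHub
    build_and_run_conv2d()  # hypothetical helper, for illustration only
```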
@@ -172,7 +176,6 @@ def test_vta_conv2d():
             a_np.astype(acc_dtype), w_np.astype(acc_dtype), stride, padding).astype(acc_dtype)
         return a_np, w_np, b_np
 
-
     def verify(s, check_correctness):
         mod = vta.build(s, [data, kernel, bias, res], "ext_dev",
                         env.target_host, name="conv2d")
@@ -8,7 +8,6 @@ onto the VTA accelerator design to perform ImageNet classification tasks.
 
 """
 
-
 ######################################################################
 # Import Libraries
 # ----------------
@@ -17,27 +16,22 @@ onto the VTA accelerator design to perform ImageNet classification tasks.
 from __future__ import absolute_import, print_function
 
 import os
-import sys
-import nnvm
-import nnvm.compiler
-import tvm
-import vta
-import vta.testing
-import numpy as np
-import json
-import requests
 import time
 
-from nnvm.compiler import graph_attr
-from tvm import rpc
-from tvm.contrib import graph_runtime, util
-from tvm.contrib.download import download
-from vta.testing import simulator
-
 from io import BytesIO
 
+import numpy as np
+import requests
 from matplotlib import pyplot as plt
 from PIL import Image
 
+import tvm
+from tvm import rpc, autotvm
+from tvm.contrib import graph_runtime, util
+from tvm.contrib.download import download
+import nnvm.compiler
+import vta
+import vta.testing
+
 # Load VTA parameters from the vta/config/vta_config.json file
 env = vta.get_env()
@@ -76,7 +70,6 @@ def classify(m, image):
 # Takes in a path to a graph file, params file, and device target
 # Returns the NNVM graph object, a compiled library object, and the params dict
 def generate_graph(graph_fn, params_fn, device="vta"):
-
     # Measure build start time
     build_start = time.time()
 
@@ -100,12 +93,6 @@ def generate_graph(graph_fn, params_fn, device="vta"):
     shape_dict.update({k: v.shape for k, v in params.items()})
     dtype_dict.update({k: str(v.dtype) for k, v in params.items()})
 
-    # Create NNVM graph
-    graph = nnvm.graph.create(sym)
-    graph_attr.set_shape_inputs(sym, shape_dict)
-    graph_attr.set_dtype_inputs(sym, dtype_dict)
-    graph = graph.apply("InferShape").apply("InferType")
-
     # Apply NNVM graph optimization passes
     sym = vta.graph.clean_cast(sym)
     sym = vta.graph.clean_conv_fuse(sym)
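The manual graph construction could be dropped because `nnvm.compiler.build` runs shape and type inference itself. A hedged sketch of the equivalent call; `sym`, `shape_dict`, `dtype_dict`, and `params` are the variables already in scope in this function, while `target` is an assumed target object or string:

```python
with nnvm.compiler.build_config(opt_level=3):
    # InferShape / InferType run internally, so no explicit
    # nnvm.graph.create + graph_attr setup is needed beforehand.
    graph, lib, params = nnvm.compiler.build(
        sym, target, shape_dict, dtype_dict,
        params=params, target_host=env.target_host)
```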
@@ -166,6 +153,9 @@ for file in [categ_fn, graph_fn, params_fn]:
 # Read in ImageNet Categories
 synset = eval(open(os.path.join(data_dir, categ_fn)).read())
 
+# Download pre-tuned op parameters of conv2d for ARM CPU used in VTA
+autotvm.tophub.check_package('vta')
+
 
 ######################################################################
 # Setup the Pynq Board's RPC Server
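As in the unit test above, downloading the package is only half the story; the parameters take effect when compilation happens inside a `tophub.context`. A hedged sketch using this tutorial's own `generate_graph` helper, which returns the graph, compiled library, and params dict:

```python
with autotvm.tophub.context('llvm -device=vtacpu'):
    # conv2d lowered during this build uses the pre-tuned ARM CPU configs
    graph, lib, params = generate_graph(graph_fn, params_fn, device="vtacpu")
```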
@@ -182,7 +172,6 @@ port = int(os.environ.get("VTA_PYNQ_RPC_PORT", "9091"))
 # We configure both the bitstream and the runtime system on the Pynq
 # to match the VTA configuration specified by the vta_config.json file.
 if env.TARGET == "pynq":
-
     # Make sure that TVM was compiled with RPC=1
     assert tvm.module.enabled("rpc")
     remote = rpc.connect(host, port)
@@ -209,8 +198,8 @@ elif env.TARGET == "sim":
 # ------------------------
 # Build the ResNet graph runtime, and configure the parameters.
 
-# Set ``device=cpu`` to run inference on the CPU,
-# or ``device=vtacpu`` to run inference on the FPGA.
+# Set ``device=vtacpu`` to run inference on the CPU
+# or ``device=vta`` to run inference on the FPGA.
 device = "vta"
 
 # Device context
@@ -225,7 +214,6 @@ m = graph_runtime.create(graph, lib, ctx)
 # Set the parameters
 m.set_input(**params)
 
-
 ######################################################################
 # Run ResNet-18 inference on a sample image
 # -----------------------------------------