[AUTOTVM] TOPI integration for ARM CPU (#1487)

Lianmin Zheng 2018-08-02 08:59:25 -07:00, committed by Tianqi Chen
Parent b625b992a4
Commit 32076df815
78 changed files with 3513 additions and 2315 deletions

.gitignore

@ -188,3 +188,6 @@ build*
# Jetbrain
.idea
# tmp file
.nfs*

apps/benchmark/README.md (new file)

@ -0,0 +1,70 @@
# Performance Benchmark
## Results
See the results on the wiki page: https://github.com/dmlc/tvm/wiki/Benchmark
## How to Reproduce
### ARM CPU
We use the RPC infrastructure in TVM to make device management easy, so you need to use it to reproduce the benchmark results.
1. Start an RPC Tracker on the host machine
```bash
python3 -m tvm.exec.rpc_tracker
```
2. Register devices to the tracker
* For Linux device
* Build tvm runtime on your device [Help](https://docs.tvm.ai/tutorials/nnvm/deploy_model_on_rasp.html#build-tvm-runtime-on-device)
* Register your device with the tracker by
```bash
python3 -m tvm.exec.rpc_server --tracker=[HOST_IP]:9190 --key=[DEVICE_KEY]
```
replace `[HOST_IP]` with the IP address of the host machine and `[DEVICE_KEY]` with a name for your device.
For example, here is the command for an RK3399 board:
`python3 -m tvm.exec.rpc_server --tracker=10.77.1.123:9190 --key=rk3399`, where 10.77.1.123 is the IP address of the tracker.
* For Android device
* Build and install the TVM RPC APK on your device [Help](https://github.com/dmlc/tvm/tree/master/apps/android_rpc).
Make sure you can pass the Android RPC test; after that, you already know how to register your device.
3. Verify the device registration
We can query all registered devices by
```bash
python3 -m tvm.exec.query_rpc_tracker
```
You should be able to find your devices in `Queue Status`. Make sure the registration is correct before going ahead.
For our test environment, a sample output is
```bash
Queue Status
------------------------------
key free pending
------------------------------
mate10pro 1 0
p20pro 2 0
pixel2 2 0
rk3399 2 0
rasp3b 8 0
```
4. Run benchmark
We did auto-tuning for Huawei P20/Mate10 Pro, Google Pixel2, Raspberry Pi3 and Firefly-RK3399,
and released the pre-tuned parameters in [this repo](https://github.com/uwsaml/tvm-distro).
During compilation, TVM will download these operator parameters automatically.
```bash
python3 arm_cpu_imagenet_bench.py --device rasp3b --rpc-key rasp3b
python3 arm_cpu_imagenet_bench.py --device rk3399 --rpc-key rk3399
python3 arm_cpu_imagenet_bench.py --device pixel2 --rpc-key pixel2
python3 arm_cpu_imagenet_bench.py --device p20pro --rpc-key p20pro
python3 arm_cpu_imagenet_bench.py --device mate10pro --rpc-key mate10pro
```
If your device has the same SoC as one of the devices above, you can reuse these parameters
(e.g. use `llvm -device=arm_cpu -model=rk3399 -target=aarch64-linux-gnu` as the target); a compilation sketch is given below.
Otherwise, you need to tune for your own device; please follow this
[tutorial](https://docs.tvm.ai/tutorials/autotvm/tune_nnvm_arm.html).
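
Below is a minimal sketch of this reuse flow, mirroring the compilation steps in `arm_cpu_imagenet_bench.py`; the explicit target string and the MobileNet workload are illustrative choices, not part of the benchmark script.
```python
import nnvm.compiler
import nnvm.testing
import tvm

# Reuse the rk3399 pre-tuned parameters on a board with the same SoC.
target = tvm.target.create("llvm -device=arm_cpu -model=rk3399 -target=aarch64-linux-gnu")

# Any NNVM workload works; MobileNet is just an example.
net, params = nnvm.testing.mobilenet.get_workload(batch_size=1)

# TVM downloads the matching pre-tuned operator parameters automatically during compilation.
with nnvm.compiler.build_config(opt_level=2, add_pass=['AlterOpLayout']):
    graph, lib, params = nnvm.compiler.build(
        net, target=target, shape={'data': (1, 3, 224, 224)}, params=params)
```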

apps/benchmark/arm_cpu_imagenet_bench.py (new file)

@ -0,0 +1,96 @@
"""Benchmark script for performance on ARM CPU.
see README.md for the usage and results of this script.
"""
import argparse
import time
import numpy as np
import nnvm.testing
import nnvm.compiler
import tvm
from tvm import autotvm
from tvm.contrib.util import tempdir
import tvm.contrib.graph_runtime as runtime
def get_network(name, batch_size):
"""Get the symbol definition and random weight of a network"""
input_shape = (batch_size, 3, 224, 224)
output_shape = (batch_size, 1000)
if name == 'resnet-18':
net, params = nnvm.testing.resnet.get_workload(num_layers=18,
batch_size=batch_size, image_shape=(3, 224, 224))
elif name == 'mobilenet':
net, params = nnvm.testing.mobilenet.get_workload(batch_size=batch_size)
elif name == 'squeezenet v1.1':
net, params = nnvm.testing.squeezenet.get_workload(batch_size=batch_size,
version='1.1')
elif name == 'vgg-16':
net, params = nnvm.testing.vgg.get_workload(batch_size=batch_size, num_layers=16)
else:
raise RuntimeError("Unsupported network: " + name)
return net, params, input_shape, output_shape
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--network", type=str, choices=['resnet-18', 'mobilenet', 'squeezenet v1.1', 'vgg-16'])
parser.add_argument("--device", type=str, required=True, choices=['rk3399', 'mate10', 'mate10pro', 'p20', 'p20pro',
'pixel2', 'rasp3b', 'pynq'])
parser.add_argument("--host", type=str, default='localhost')
parser.add_argument("--port", type=int, default=9190)
parser.add_argument("--rpc-key", type=str, required=True)
parser.add_argument("--number", type=int, default=6)
args = parser.parse_args()
dtype = 'float32'
if args.network is None:
networks = ['squeezenet v1.1', 'mobilenet', 'resnet-18', 'vgg-16']
else:
networks = [args.network]
target = tvm.target.arm_cpu(model=args.device)
# connect to remote device
tracker = tvm.rpc.connect_tracker(args.host, args.port)
remote = tracker.request(args.rpc_key)
print("--------------------------------------------------")
print("%-20s %-20s" % ("Network Name", "Mean Inference Time (std dev)"))
print("--------------------------------------------------")
for network in networks:
net, params, input_shape, output_shape = get_network(network, batch_size=1)
with nnvm.compiler.build_config(opt_level=2, add_pass=['AlterOpLayout']):
graph, lib, params = nnvm.compiler.build(
net, target=target, shape={'data': input_shape}, params=params, dtype=dtype)
tmp = tempdir()
if 'android' in str(target):
from tvm.contrib import ndk
filename = "%s.so" % network
lib.export_library(tmp.relpath(filename), ndk.create_shared)
else:
filename = "%s.tar" % network
lib.export_library(tmp.relpath(filename))
# upload library and params
ctx = remote.context(str(target), 0)
remote.upload(tmp.relpath(filename))
rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}
rlib = remote.load_module(filename)
module = runtime.create(graph, rlib, ctx)
data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
module.set_input('data', data_tvm)
module.set_input(**rparams)
# evaluate
ftimer = module.module.time_evaluator("run", ctx, number=args.number, repeat=3)
prof_res = np.array(ftimer().results) * 1000 # multiply 1000 for converting to millisecond
print("%-20s %-19s (%s)" % (network, "%.2f ms" % np.mean(prof_res), "%.2f ms" % np.std(prof_res)))

apps/benchmark/rasp_imagenet_bench.py (deleted file)

@ -1,76 +0,0 @@
""" Benchmark script for performance on Raspberry Pi. For example, run the file with:
`python rasp_imagenet_bench.py --model='modbilenet' --host='rasp0' --port=9090`. For
more details about how to set up the inference environment on Raspberry Pi, Please
refer to NNVM Tutorial: Deploy the Pretrained Model on Raspberry Pi """
import time
import argparse
import numpy as np
import tvm
import nnvm.compiler
import nnvm.testing
from tvm.contrib import util, rpc
from tvm.contrib import graph_runtime as runtime
def main():
parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, required=True, choices=['resnet', 'mobilenet'],
help="The model type.")
parser.add_argument('--host', type=str, required=True, help="The host address of your Raspberry Pi.")
parser.add_argument('--port', type=int, required=True, help="The port number of your Raspberry Pi.")
parser.add_argument('--opt-level', type=int, default=1, help="Level of optimization.")
parser.add_argument('--num-iter', type=int, default=50, help="Number of iteration during benchmark.")
args = parser.parse_args()
opt_level = args.opt_level
num_iter = args.num_iter
batch_size = 1
num_classes = 1000
image_shape = (3, 224, 224)
data_shape = (batch_size,) + image_shape
out_shape = (batch_size, num_classes)
if args.model == 'resnet':
net, params = nnvm.testing.resnet.get_workload(
batch_size=1, image_shape=image_shape)
elif args.model == 'mobilenet':
net, params = nnvm.testing.mobilenet.get_workload(
batch_size=1, image_shape=image_shape)
else:
raise ValueError('no benchmark prepared for {}.'.format(args.model))
with nnvm.compiler.build_config(opt_level=opt_level):
graph, lib, params = nnvm.compiler.build(
net, tvm.target.rasp(), shape={"data": data_shape}, params=params)
tmp = util.tempdir()
lib_fname = tmp.relpath('net.o')
lib.save(lib_fname)
remote = rpc.connect(args.host, args.port)
remote.upload(lib_fname)
ctx = remote.cpu(0)
rlib = remote.load_module('net.o')
rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}
module = runtime.create(graph, rlib, ctx)
module.set_input('data', tvm.nd.array(np.random.uniform(size=(data_shape)).astype("float32")))
module.set_input(**rparams)
module.run()
out = module.get_output(0, tvm.nd.empty(out_shape, ctx=ctx))
out.asnumpy()
print('benchmark args: {}'.format(args))
ftimer = module.module.time_evaluator("run", ctx, num_iter)
for i in range(3):
prof_res = ftimer()
print(prof_res)
# sleep for avoiding cpu overheat
time.sleep(45)
if __name__ == '__main__':
main()

docs/api/python/autotvm.rst

@ -44,6 +44,9 @@ tvm.autotvm.tuner
.. automodule:: tvm.autotvm.tuner.callback
:members:
.. automodule:: tvm.autotvm.tuner.graph_tuning
:members:
tvm.autotvm.task
~~~~~~~~~~~~~~~~
.. automodule:: tvm.autotvm.task
@ -55,6 +58,15 @@ tvm.autotvm.task
.. automodule:: tvm.autotvm.task.space
:members:
.. automodule:: tvm.autotvm.task.dispatcher
:members:
.. automodule:: tvm.autotvm.task.topi_integration
:members:
.. automodule:: tvm.autotvm.task.nnvm_integration
:members:
tvm.autotvm.record
~~~~~~~~~~~~~~~~~~
.. automodule:: tvm.autotvm.record


@ -60,6 +60,8 @@ The configuration of tvm can be modified by `config.cmake`.
- Edit ``build/config.cmake`` to customize the compilation options
- On macOS, for some versions of XCode, you need to add ``-lc++abi`` in the LDFLAGS or you'll get link errors.
- Change ``set(USE_CUDA OFF)`` to ``set(USE_CUDA ON)`` to enable the CUDA backend. The same applies to other backends and libraries
(OpenCL, ROCM, METAL, VULKAN, ...).
- TVM optionally depends on LLVM. LLVM is required for CPU codegen.
@ -84,7 +86,7 @@ The configuration of tvm can be modified by `config.cmake`.
cmake ..
make -j4
If everything goes well, we can go to :ref:`python-package-installation`_
If everything goes well, we can go to :ref:`python-package-installation`
Building on Windows
~~~~~~~~~~~~~~~~~~~

nnvm/include/nnvm/top/nn.h

@ -172,6 +172,77 @@ struct Conv2DParam : public dmlc::Parameter<Conv2DParam> {
static const constexpr int kBias = 2;
};
struct WinogradWeightTransformParam : public dmlc::Parameter<WinogradWeightTransformParam> {
int tile_size;
DMLC_DECLARE_PARAMETER(WinogradWeightTransformParam) {
DMLC_DECLARE_FIELD(tile_size)
.describe("Tile size of winograd. E.g. 2 for F(2x2, 3x3) and 4 for F(4x4, 3x3)");
}
static const constexpr int kWeight = 0;
};
struct WinogradConv2DParam : public dmlc::Parameter<WinogradConv2DParam> {
int channels;
TShape kernel_size;
TShape strides;
TShape padding;
TShape dilation;
int groups;
std::string layout;
std::string kernel_layout;
std::string out_layout;
int out_dtype;
bool use_bias;
int tile_size;
DMLC_DECLARE_PARAMETER(WinogradConv2DParam) {
DMLC_DECLARE_FIELD(channels)
.describe("The dimensionality of the output space"
"i.e. the number of output channels in the convolution.");
DMLC_DECLARE_FIELD(kernel_size)
.describe("Specifies the dimensions of the convolution window.");
DMLC_DECLARE_FIELD(strides).set_default(TShape({1, 1}))
.describe("Specifies the strides of the convolution.");
DMLC_DECLARE_FIELD(padding).set_default(TShape({0, 0}))
.describe("If padding is non-zero, then the input is implicitly zero-padded"
"on both sides for padding number of points");
DMLC_DECLARE_FIELD(dilation).set_default(TShape({1, 1}))
.describe("Specifies the dilation rate to use for dilated convolution.");
DMLC_DECLARE_FIELD(groups).set_default(1)
.describe("Controls the connections between inputs and outputs."
"At groups=1, all inputs are convolved to all outputs."
"At groups=2, the operation becomes equivalent to having two convolution"
"layers side by side, each seeing half the input channels, and producing"
"half the output channels, and both subsequently concatenated.");
DMLC_DECLARE_FIELD(layout).set_default("NCHW")
.describe("Dimension ordering of input data. Can be 'NCHW', 'NHWC', etc."
"'N', 'C', 'H', 'W' stands for batch, channel, height, and width"
"dimensions respectively. Convolution is applied on the 'H' and"
"'W' dimensions.");
DMLC_DECLARE_FIELD(out_layout).set_default("__undef__")
.describe("Dimension ordering of output. Can be 'NCHW', 'NHWC', etc."
"'N', 'C', 'H', 'W' stands for batch, channel, height, and width"
"dimensions respectively. Default to be same as input layout.");
DMLC_DECLARE_FIELD(kernel_layout).set_default("OIHW")
.describe("Dimension ordering of weight. Can be 'OIHW', 'OIHW16o16i', etc."
"'O', 'I', 'H', 'W' stands for num_filter, input_channel, height, and width"
"dimensions respectively.");
DMLC_DECLARE_DTYPE_FIELD(out_dtype)
.add_enum("same", -1)
.set_default(-1)
.describe("Output data type, set to explicit type under mixed precision setting");
DMLC_DECLARE_FIELD(use_bias).set_default(true)
.describe("Whether the layer uses a bias vector.");
DMLC_DECLARE_FIELD(tile_size)
.describe("Tile size of winograd. E.g. 2 for F(2x2, 3x3) and 4 for F(4x4, 3x3)");
}
// constants
static const constexpr int kData = 0;
static const constexpr int kWeight = 1;
static const constexpr int kBias = 2;
};
struct Conv2DTransposeParam : public dmlc::Parameter<Conv2DTransposeParam> {
int channels;

nnvm/python/nnvm/compiler/build_module.py

@ -6,6 +6,7 @@ import logging
import tvm
from tvm.contrib import graph_runtime
from tvm import autotvm
from . import graph_attr, graph_util
from .. import graph as _graph
from .. import symbol as sym
@ -238,67 +239,74 @@ def build(graph, target=None, shape=None, dtype="float32",
raise ValueError("Target is not set in env or passed as argument.")
target = tvm.target.create(target)
shape = shape if shape else {}
if not isinstance(shape, dict):
raise TypeError("require shape to be dict")
for value in shape.values():
if not all(isinstance(x, int) for x in value):
raise TypeError("shape value must be int iterator")
cfg = BuildConfig.current
graph = graph if isinstance(graph, _graph.Graph) else _graph.create(graph)
shape, dtype = _update_shape_dtype(shape, dtype, params)
# correct layout if necessary
layout = layout if layout else {}
graph = graph_attr.set_layout_inputs(graph, layout)
graph = graph.apply("CorrectLayout")
index = graph.index
layouts = graph.json_attr("layout")
layout = {x : layouts[index.entry_id(x)] for x in index.input_names}
# Initial pass do shape type inference
ishape, _ = graph_util.infer_shape(graph, **shape)
shape.update(zip(graph.index.input_names, ishape))
if not isinstance(dtype, str):
idtype, _ = graph_util.infer_dtype(graph, **dtype)
dtype.update(zip(graph.index.input_names, idtype))
# Initialize all variables specified in _all_var_init
init_var = {}
if _all_var_init:
init_var = initialize_variables(shape, dtype)
# Apply optimization
with target:
graph = optimize(graph, shape, dtype, layout)
# Clear extra params without nodes.
_remove_noref_params(params, graph)
# Precompute prune
if params and cfg.pass_enabled("PrecomputePrune"):
graph, params = precompute_prune(graph, params)
shape, dtype = _update_shape_dtype(shape, dtype, params)
# Operator Fusion and generation
graph = graph_attr.set_shape_inputs(graph, shape)
graph = graph.apply("InferShape")
graph = graph_attr.set_dtype_inputs(graph, dtype)
graph._set_json_attr("target", str(target), "str")
if target_host is not None:
graph._set_json_attr("target_host", str(target_host), "str")
if cfg.pass_enabled("OpFusion"):
graph._set_json_attr("opt_level", 1, "int")
# if not inside an autotvm config dispatch context, load pre-tuned parameters from TopHub
if autotvm.task.DispatchContext.current is None:
tophub_context = autotvm.tophub.context(target)
else:
graph._set_json_attr("opt_level", 0, "int")
graph = graph.apply("InferShape").apply("InferType")
with target:
graph = graph.apply("GraphFusePartition").apply("GraphFuseCompile")
libmod = graph_attr._move_out_module(graph, "module")
# Write variable initial values into params
if init_var:
if params is None:
params = {}
params.update(init_var)
return graph, libmod, params
tophub_context = autotvm.util.EmptyContext()
with tophub_context:
shape = shape if shape else {}
if not isinstance(shape, dict):
raise TypeError("require shape to be dict")
for value in shape.values():
if not all(isinstance(x, int) for x in value):
raise TypeError("shape value must be int iterator")
cfg = BuildConfig.current
graph = graph if isinstance(graph, _graph.Graph) else _graph.create(graph)
shape, dtype = _update_shape_dtype(shape, dtype, params)
# correct layout if necessary
layout = layout if layout else {}
graph = graph_attr.set_layout_inputs(graph, layout)
graph = graph.apply("CorrectLayout")
index = graph.index
layouts = graph.json_attr("layout")
layout = {x: layouts[index.entry_id(x)] for x in index.input_names}
# Initial pass do shape type inference
ishape, _ = graph_util.infer_shape(graph, **shape)
shape.update(zip(graph.index.input_names, ishape))
if not isinstance(dtype, str):
idtype, _ = graph_util.infer_dtype(graph, **dtype)
dtype.update(zip(graph.index.input_names, idtype))
# Initialize all variables specified in _all_var_init
init_var = {}
if _all_var_init:
init_var = initialize_variables(shape, dtype)
# Apply optimization
with target:
graph = optimize(graph, shape, dtype, layout)
# Clear extra params without nodes.
_remove_noref_params(params, graph)
# Precompute prune
if params and cfg.pass_enabled("PrecomputePrune"):
graph, params = precompute_prune(graph, params)
shape, dtype = _update_shape_dtype(shape, dtype, params)
# Operator Fusion and generation
graph = graph_attr.set_shape_inputs(graph, shape)
graph = graph.apply("InferShape")
graph = graph_attr.set_dtype_inputs(graph, dtype)
graph._set_json_attr("target", str(target), "str")
if target_host is not None:
graph._set_json_attr("target_host", str(target_host), "str")
if cfg.pass_enabled("OpFusion"):
graph._set_json_attr("opt_level", 1, "int")
else:
graph._set_json_attr("opt_level", 0, "int")
graph = graph.apply("InferShape").apply("InferType")
with target:
graph = graph.apply("GraphFusePartition").apply("GraphFuseCompile")
libmod = graph_attr._move_out_module(graph, "module")
# Write variable initial values into params
if init_var:
if params is None:
params = {}
params.update(init_var)
return graph, libmod, params
def _remove_noref_params(params, graph):
""" Helper to clear non referenced params

nnvm/python/nnvm/top/nn.py

@ -89,7 +89,7 @@ def compute_conv2d(attrs, inputs, _):
layout = attrs["layout"]
kernel_layout = attrs["kernel_layout"]
out_dtype = attrs["out_dtype"]
out_dtype = None if out_dtype == "same" else out_dtype
out_dtype = inputs[0].dtype if out_dtype == "same" else out_dtype
assert layout == "NCHW" or layout == "NHWC"
(dilation_h, dilation_w) = dilation
if dilation_h < 1 or dilation_w < 1:
@ -196,6 +196,53 @@ def schedule_contrib_conv2d_NCHWc(attrs, outs, target):
reg.register_pattern("_contrib_conv2d_NCHWc", OpPattern.OUT_ELEMWISE_FUSABLE)
@reg.register_compute("_contrib_conv2d_winograd_weight_transform")
def compute_contrib_conv2d_winograd_weight_transform(attrs, inputs, _):
return topi.nn.conv2d_winograd_weight_transform(inputs[0], attrs.get_int('tile_size'))
@reg.register_schedule("_contrib_conv2d_winograd_weight_transform")
def schedule_contrib_conv2d_winograd_weight_transform(attrs, outs, target):
with tvm.target.create(target):
return topi.generic.schedule_conv2d_winograd_weight_transform(outs)
reg.register_pattern("_contrib_conv2d_winograd_weight_transform", OpPattern.OUT_ELEMWISE_FUSABLE)
@reg.register_compute("_contrib_conv2d_winograd_without_weight_transform")
def compute_contrib_conv2d_winograd_without_weight_transform(attrs, inputs, _):
"""Compute definition of conv2d NCHWc"""
padding = attrs.get_int_tuple("padding")
strides = attrs.get_int_tuple("strides")
dilation = attrs.get_int_tuple("dilation")
groups = attrs.get_int("groups")
layout = attrs.get_string("layout")
out_dtype = attrs.get_string("out_dtype")
tile_size = attrs.get_int("tile_size")
out_dtype = inputs[0].dtype if out_dtype == "same" else out_dtype
assert dilation == (1, 1), "Do not support dilate now"
assert groups == 1, "Do not support arbitrary group number"
# pylint: disable=assignment-from-no-return
out = topi.nn.conv2d_winograd_without_weight_transform(
inputs[0], inputs[1], strides, padding, layout, out_dtype,
tile_size)
if attrs.get_bool("use_bias"):
bias = inputs[2]
bias = topi.expand_dims(bias, axis=1, num_newaxis=2)
out = topi.add(out, bias)
return out
@reg.register_schedule("_contrib_conv2d_winograd_without_weight_transform")
def schedule_contrib_conv2d_winograd_without_weight_transform(attrs, outs, target):
with tvm.target.create(target):
return topi.generic.schedule_conv2d_winograd_without_weight_transform(outs)
reg.register_pattern("_contrib_conv2d_winograd_without_weight_transform",
OpPattern.OUT_ELEMWISE_FUSABLE)
# conv2d_transpose
@reg.register_compute("conv2d_transpose")
def compute_conv2d_transpose(attrs, inputs, _):

nnvm/src/top/nn/convolution.cc

@ -130,11 +130,110 @@ inline bool Conv2DInferShape(const nnvm::NodeAttrs& attrs,
return true;
}
inline bool WinogradConv2DInferShape(const nnvm::NodeAttrs& attrs,
std::vector<TShape>* in_shape,
std::vector<TShape>* out_shape) {
static const Layout kNCHW("NCHW");
static const Layout kOIHW("OIHW");
const WinogradConv2DParam& param = nnvm::get<WinogradConv2DParam>(attrs.parsed);
const Layout in_layout(param.layout);
const Layout kernel_layout(param.kernel_layout);
CHECK(in_layout.convertible(kNCHW))
<< "Conv only support input layouts that are convertible from NCHW."
<< " But got " << in_layout;
CHECK(kernel_layout.convertible(kOIHW))
<< "Conv only support kernel layouts that are convertible from OIHW."
<< " But got "<< kernel_layout;
Layout out_layout(param.out_layout);
if (!out_layout.defined()) out_layout = in_layout;
CHECK(out_layout.convertible(kNCHW))
<< "Conv only support output layouts that are convertible from NCHW."
<< " But got " << out_layout;
if (param.use_bias) {
CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]";
} else {
CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]";
}
CHECK_EQ(out_shape->size(), 1U);
TShape dshape = in_shape->at(0);
if (dshape.ndim() == 0) return false;
dshape = ConvertLayout(dshape, in_layout, kNCHW);
CHECK_EQ(dshape.ndim(), 4U) << "Input data should be 4D";
CHECK_EQ(param.kernel_size.ndim(), 2U);
CHECK_EQ(param.strides.ndim(), 2U)
<< "incorrect stride size: " << param.strides;
CHECK_EQ(param.dilation.ndim(), 2U)
<< "incorrect dilate size: " << param.dilation;
CHECK_EQ(dshape[1] % param.groups, 0U)
<< "input channels must divide group size";
CHECK_EQ(param.channels % param.groups, 0U)
<< "output channels must divide group size";
// NOTE: Do not check weight shape here!
// Different backend requires different layout to compute
// the batch gemm stage in winograd efficiently, but we want to
// make this NNVM symbol work for all backends.
// So we accept all weight shapes, and assume the TOPI developers
// can handle this correctly in alter_op_layout.
if (param.use_bias) {
static const Layout default_bias_layout("C");
TShape bias_shape({param.channels});
auto oc_block = out_layout.subsizeof('C');
if (oc_block > 0) {
size_t split_axis = (out_layout.indexof('C') < out_layout.indexof('c')) ? 1 : 0;
bias_shape = ConvertLayout(bias_shape, default_bias_layout,
default_bias_layout.split('C', split_axis, oc_block));
}
NNVM_ASSIGN_INPUT_SHAPE(attrs, *in_shape, WinogradConv2DParam::kBias, bias_shape);
}
// dilation
dim_t dilated_ksize_y = 1 + (param.kernel_size[0] - 1) * param.dilation[0];
dim_t dilated_ksize_x = 1 + (param.kernel_size[1] - 1) * param.dilation[1];
TShape oshape({dshape[0], param.channels, 0, 0});
if (dshape[2] != 0) {
oshape[2] = (dshape[2] + param.padding[0] * 2 - dilated_ksize_y) / param.strides[0] + 1;
}
if (dshape[3] != 0) {
oshape[3] = (dshape[3] + param.padding[1] * 2 - dilated_ksize_x) / param.strides[1] + 1;
}
NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_shape, 0, ConvertLayout(oshape, kNCHW, out_layout));
// Perform incomplete shape inference. Fill in the missing values in data shape.
// 1) We can always fill in the batch_size.
// 2) We can back-calculate the input height/width if the corresponding stride is 1.
oshape = ConvertLayout((*out_shape)[0], out_layout, kNCHW);
dshape[0] = oshape[0];
if (oshape[2] && param.strides[0] == 1) {
dshape[2] = oshape[2] + dilated_ksize_y - 1 - 2 * param.padding[0];
}
if (oshape[3] && param.strides[1] == 1) {
dshape[3] = oshape[3] + dilated_ksize_x - 1 - 2 * param.padding[1];
}
NNVM_ASSIGN_INPUT_SHAPE(attrs, *in_shape, WinogradConv2DParam::kData,
ConvertLayout(dshape, kNCHW, in_layout));
// Check whether the kernel sizes are valid
if (dshape[2] != 0) {
CHECK_LE(dilated_ksize_y, dshape[2] + 2 * param.padding[0])
<< "kernel size exceed input";
}
if (dshape[3] != 0) {
CHECK_LE(dilated_ksize_x, dshape[3] + 2 * param.padding[1])
<< "kernel size exceed input";
}
return true;
}
template <typename PARAM>
inline bool Conv2DInferType(const nnvm::NodeAttrs& attrs,
std::vector<int>* in_type,
std::vector<int>* out_type) {
const Conv2DParam& param = nnvm::get<Conv2DParam>(attrs.parsed);
const PARAM& param = nnvm::get<PARAM>(attrs.parsed);
if (param.use_bias) {
CHECK_EQ(in_type->size(), 3U) << "Input:[data, weight, bias]";
} else {
@ -154,11 +253,12 @@ inline bool Conv2DInferType(const nnvm::NodeAttrs& attrs,
}
template<typename PARAM>
inline bool Conv2DCorrectLayout(const NodeAttrs& attrs,
std::vector<Layout> *ilayouts,
const std::vector<Layout> *last_ilayouts,
std::vector<Layout> *olayouts) {
const Conv2DParam& param = nnvm::get<Conv2DParam>(attrs.parsed);
const PARAM& param = nnvm::get<PARAM>(attrs.parsed);
const Layout in_layout(param.layout);
Layout out_layout(param.out_layout);
@ -213,8 +313,8 @@ a bias vector is created and added to the outputs.
.set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<Conv2DParam>)
.set_attr<FListInputNames>("FListInputNames", UseBiasListInputNames<Conv2DParam>)
.set_attr<FInferShape>("FInferShape", Conv2DInferShape)
.set_attr<FInferType>("FInferType", Conv2DInferType)
.set_attr<FCorrectLayout>("FCorrectLayout", Conv2DCorrectLayout)
.set_attr<FInferType>("FInferType", Conv2DInferType<Conv2DParam>)
.set_attr<FCorrectLayout>("FCorrectLayout", Conv2DCorrectLayout<Conv2DParam>)
.set_num_outputs(1)
.set_num_inputs(UseBiasNumInputs<Conv2DParam>)
.set_support_level(2)
@ -238,12 +338,81 @@ NNVM_REGISTER_OP(_contrib_conv2d_NCHWc)
.set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<Conv2DParam>)
.set_attr<FListInputNames>("FListInputNames", UseBiasListInputNames<Conv2DParam>)
.set_attr<FInferShape>("FInferShape", Conv2DInferShape)
.set_attr<FInferType>("FInferType", Conv2DInferType)
.set_attr<FCorrectLayout>("FCorrectLayout", Conv2DCorrectLayout)
.set_attr<FInferType>("FInferType", Conv2DInferType<Conv2DParam>)
.set_attr<FCorrectLayout>("FCorrectLayout", Conv2DCorrectLayout<Conv2DParam>)
.set_num_outputs(1)
.set_num_inputs(UseBiasNumInputs<Conv2DParam>)
.set_support_level(2);
NNVM_REGISTER_OP(_contrib_conv2d_winograd_weight_transform)
.describe(R"code(Weight transformation of winograd fast convolution algorithm.
Separate this into another nnvm symbol in order to enable Precompute Pass to compute the
weight transformation in advance.
- **weight**: (channels, in_channels, kernel_size[0], kernel_size[1])
)code" NNVM_ADD_FILELINE)
.add_argument("weight", "4D Tensor", "Weight tensor.")
.add_arguments(WinogradWeightTransformParam::__FIELDS__())
.set_attr_parser(ParamParser<WinogradWeightTransformParam>)
.set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<WinogradWeightTransformParam>)
.set_attr<FInferShape>("FInferShape", [](const nnvm::NodeAttrs& attrs,
std::vector<TShape> *in_shape,
std::vector<TShape> *out_shape) {
const auto& param = nnvm::get<WinogradWeightTransformParam>(attrs.parsed);
const TShape &wshape = (*in_shape)[0];
CHECK_EQ(wshape.ndim(), 4) << "Weight should be a 4 dimensional tensor";
TShape oshape({param.tile_size + wshape[2] - 1,
param.tile_size + wshape[3] - 1,
wshape[0],
wshape[1]});
NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_shape, 0, oshape);
return true;
})
.set_attr<FCorrectLayout>("FCorrectLayot", [](const NodeAttrs& attrs,
std::vector<Layout> *ilayouts,
const std::vector<Layout> *last_ilayouts,
std::vector<Layout> *olayouts) {
Layout layout("OIHW");
NNVM_ASSIGN_LAYOUT(*ilayouts, 0, layout);
NNVM_ASSIGN_LAYOUT(*olayouts, 0, layout);
return true;
})
.set_attr<FInferType>("FInferType", ElemwiseType<1, 1>)
.set_num_outputs(1)
.set_num_inputs(1)
.set_support_level(5);
DMLC_REGISTER_PARAMETER(WinogradWeightTransformParam);
NNVM_REGISTER_OP(_contrib_conv2d_winograd_without_weight_transform)
.describe(R"code(Compute conv2d with winograd algorithm.
- **data**: Input is 4D array of shape (batch_size, in_channels, height, width)
- **weight**: Any shape
We do not check shape for this input tensor.
- **bias**: (channels,)
- **out**: Output is 4D array of shape (batch_size, channels, out_height, out_width)
)code" NNVM_ADD_FILELINE)
.add_argument("data", "4D Tensor", "Input data.")
.add_argument("weight", "Tensor", "Transformed weight tensor.")
.add_argument("bias", "1D Tensor", "Bias parameter.")
.add_arguments(WinogradConv2DParam::__FIELDS__())
.set_attr_parser(ParamParser<WinogradConv2DParam>)
.set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<WinogradConv2DParam>)
.set_attr<FListInputNames>("FListInputNames", UseBiasListInputNames<WinogradConv2DParam>)
.set_attr<FInferShape>("FInferShape", WinogradConv2DInferShape)
.set_attr<FInferType>("FInferType", Conv2DInferType<WinogradConv2DParam>)
.set_attr<FCorrectLayout>("FCorrectLayout", Conv2DCorrectLayout<WinogradConv2DParam>)
.set_num_outputs(1)
.set_num_inputs(UseBiasNumInputs<WinogradConv2DParam>)
.set_support_level(5);
DMLC_REGISTER_PARAMETER(WinogradConv2DParam);
NNVM_REGISTER_OP(_conv2d_grad)
.describe(R"code(2D convolution grad.

python/tvm/autotvm/__init__.py

@ -18,9 +18,12 @@ from . import record
from . import task
from . import tuner
from . import util
from . import env
from . import tophub
# some shortcuts
from .measure import measure_option, MeasureInput, MeasureResult, MeasureErrorNo
from .measure import measure_option, MeasureInput, MeasureResult, MeasureErrorNo, use_rpc
from .tuner import callback
from .task import template, get_config, create, ConfigSpace, ConfigEntity
from .record import ApplyHistoryBest as apply_history_best
from .task import template, get_config, create, ConfigSpace, ConfigEntity, \
ApplyHistoryBest as apply_history_best
from .env import GLOBAL_SCOPE

python/tvm/autotvm/env.py

@ -8,5 +8,6 @@ class AutotvmGlobalScope(object):
AutotvmGlobalScope.current = self
self.cuda_target_arch = None
self.in_tuning = False
GLOBAL_SCOPE = AutotvmGlobalScope()

python/tvm/autotvm/measure/__init__.py

@ -1,8 +1,7 @@
"""Distributed executor infrastructure to scale up the tuning"""
from .measure import MeasureInput, MeasureResult, MeasureErrorNo
from .measure import create_measure_batch, measure_option
from .measure_methods import request_remote
from .measure import MeasureInput, MeasureResult, MeasureErrorNo, measure_option
from .measure_methods import request_remote, create_measure_batch, use_rpc
from .local_executor import LocalExecutor
from .executor import Future, Executor

python/tvm/autotvm/measure/local_executor.py

@ -8,7 +8,10 @@ try:
except ImportError:
from Queue import Empty
import psutil
try:
import psutil
except ImportError:
psutil = None
from . import executor
@ -106,22 +109,28 @@ class LocalFutureNoFork(executor.Future):
class LocalExecutor(executor.Executor):
"""Local executor that runs workers on the same machine with multiprocessing."""
def __init__(self, timeout=None):
"""Local executor that runs workers on the same machine with multiprocessing.
Parameters
----------
timeout: float, optional
timeout of a job. If time is out. A TimeoutError will be returned (not raised)
do_fork: bool, optional
For some runtime systems that do not support fork after initialization
(e.g. cuda runtime, cudnn). Set this to False if you have used these runtime
before submitting jobs.
"""
def __init__(self, timeout=None, do_fork=True):
self.timeout = timeout or executor.Executor.DEFAULT_TIMEOUT
self.do_fork = do_fork
if self.do_fork:
if not psutil:
raise RuntimeError("Python package psutil is missing. "
"please try `pip install psutil`")
def submit(self, func, *args, **kwargs):
"""
Note
----------
By default, the executor will fork a new process for a new job
But some runtime does not support fork (e.g. cuda runtime, cudnn).
In this circumstance, you should set 'fork_new_process' to False in kwargs
"""
fork_new_process = kwargs.pop('fork_new_process', True)
if not fork_new_process:
if not self.do_fork:
return LocalFutureNoFork(func(*args, **kwargs))
queue = Queue(1)

python/tvm/autotvm/measure/measure.py

@ -1,18 +1,7 @@
# pylint: disable=pointless-string-statement,consider-using-enumerate,invalid-name
"""User facing API for specifying how to measure the generated code"""
import time
from collections import namedtuple
import numpy as np
from ... import build, nd, target as _target
from ...rpc.tracker import Tracker
from ...rpc.server import Server
from ..util import get_const_tuple
from .local_executor import LocalExecutor
class MeasureInput(namedtuple("MeasureInput", ["target", "task", "config"])):
"""
Stores all the necessary inputs for a measurement.
@ -44,6 +33,7 @@ class MeasureResult(namedtuple("MeasureResult", ["costs", "error_no", "all_cost"
The absolute time stamp when we finish measurement.
"""
class MeasureErrorNo(object):
"""Error type for MeasureResult"""
NO_ERROR = 0 # no error
@ -55,38 +45,25 @@ class MeasureErrorNo(object):
FLEET_ERROR = 6 # error of measure infrastructure
def measure_option(mode,
def measure_option(measure_func,
number=1,
repeat=1,
timeout=60,
parallel_num=1,
pack_size=1,
do_fork=True,
build_func='default',
check_correctness=False,
build_option=None,
replay_db=None,
save_to_replay_db=True,
rpc_device_key=None,
rpc_priority=1,
rpc_timeout=60,
rpc_tracker_addr=None,
use_ndk=False,
custom_measure_batch=None):
replay_db=None):
"""Configure how to do measurement
Parameters
----------
mode: str
'local': use the local device for measurement. In this mode,
the tuner starts a tracker and a RPC server silently for the user.
'rpc': request devices for measurement from rpc tracker. In this mode,
you should start a rpc tracker in a separate processing.
'custom': use custom measure function
'local-nofork': use local device for measure but does not use multiprocessing.
This mode is suitable for debug, but does not support timeout and parallel.
measure_func: str or callable
'local': use the local device for measurement. The tuner will start a tracker
and a RPC server silently for the user.
callable: It is a callable function for measurement.
See the return value of measure/measure_methods.py::use_rpc for example.
number : int, optional
Number of times to do the measurement for average
repeat : int, optional
@ -101,235 +78,50 @@ def measure_option(mode,
The number of measurement tasks that can run in parallel.
Set this according to the number of cpu cores (for compilation) and
the number of devices you have (for measuring generated code).
pack_size : int, optional
Number of configs to measure in one RPC call.
Usually this can be set to 1. If your device has high cost to establish a rpc connection,
set this higher.
do_fork: bool, optional
Whether to use multiprocessing (based on fork) for running measure jobs in parallel.
Set this to False if you want to debug (see the traceback) or if using fork is not suitable.
NOTE: If this is False, parallel and timeout do not work.
build_func: str or callable, optional
'default': call default builder. This works for normal target (llvm, cuda)
'ndk': use Android NDK to create shared library. Use this for android target.
callable: customized build function for other backends (e.g. VTA).
See measure/measure_methods.py::default_build_func for example.
check_correctness: bool
Whether check correctness after measurement.
build_option: Dict, optional
Build options for tvm.build_config
Whether to check correctness after measurement. This will use llvm cpu as reference.
replay_db : Database, optional
The database that we retrieve saved MeasureResults from
save_to_replay_db: bool, optional
Whether save measure result to database. This is useless when replay_db is None
rpc_priority: int, optional
Priority of this task, used by scheduler in tracker
rpc_device_key: str, optional
The device key of registered devices in tracker
rpc_timeout: int, optional
Timeout of rpc session
rpc_tracker_addr: Tuple(str, int), optional
The address of rpc tracker in Tuple(host, port) format.
If is set, will use this address.
If is not set, will use environment variable "TVM_TRACKER_HOST" and "TVM_TRACKER_PORT"
use_ndk: bool, option
Whether export requires ndk
custom_measure_batch: callable, optional
custom measure function
The database that we retrieve saved MeasureResult from.
Returns
-------
options: dict
A dict to store all options
Note
----
To support customized measure, you can pass callable `measure_func` or
`build_func` in. The `measure_func` will call `build_func` to build binary library
and handle the logic of measurement.
Signature:
* measure_func (see the return value of measure/measure_methods.py::use_rpc for example)
def measure_func(input_pack, build_func, build_kwargs, number, repeat, ref_input, ref_output):
return measure_results
* build_func (see measure/measure_methods.py::default_build_func for example)
def build_func(inp, tmp_dir, **kwargs):
return func, args, filename
"""
return {
'mode': mode,
'measure_func': measure_func,
'number': number,
'repeat': repeat,
'timeout': timeout,
'parallel_num': parallel_num,
'pack_size': pack_size,
'do_fork': do_fork,
'build_func': build_func,
'check_correctness': check_correctness,
'build_option': build_option,
'replay_db': replay_db,
'save_to_replay_db': save_to_replay_db,
'rpc_device_key': rpc_device_key,
'rpc_priority': rpc_priority,
'rpc_timeout': rpc_timeout,
'rpc_tracker_addr': rpc_tracker_addr,
'use_ndk': use_ndk,
'custom_measure_batch': custom_measure_batch
}
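# Usage sketch (illustrative only, not part of this diff): build a measure option that
# requests boards registered under the key 'rk3399' from the RPC tracker.
#
#   from tvm import autotvm
#   option = autotvm.measure_option(
#       measure_func=autotvm.use_rpc('rk3399', host='10.77.1.123', port=9190),
#       number=4, repeat=3, timeout=20, parallel_num=2,
#       build_func='default')   # pass 'ndk' when cross-compiling for Android
#   # the returned dict is then passed to a tuner, e.g. tuner.tune(..., measure_option=option)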
def create_measure_batch(task, options):
"""Get a standard measure_batch function.
Parameters
----------
task: tvm.autotvm.task.Task
The tuning task
options: dict
The option for measuring generated code.
You should use the return value of :any:`autotvm.measure_option` for this argument
Returns
-------
measure_batch: callable
a callback function to measure a batch of configs
"""
from . import measure_methods
from ..database import filter_inputs
mode = options['mode']
number, repeat = options['number'], options['repeat']
timeout, parallel_num = options['timeout'], options['parallel_num']
pack_size = options['pack_size']
check_correctness = options['check_correctness']
build_option = options['build_option']
replay_db = options['replay_db']
save_to_replay_db = options['save_to_replay_db']
rpc_device_key = options['rpc_device_key']
rpc_priority, rpc_timeout = options['rpc_priority'], options['rpc_timeout']
use_ndk = options['use_ndk']
custom_measure_batch = options['custom_measure_batch']
kwargs = {}
executor = LocalExecutor(timeout=timeout)
if mode == 'local':
# start temporary rpc tracker and rpc server for the user
tracker = Tracker('localhost', port=9000, port_end=10000,
silent=True)
rpc_device_key = '$local$device$%d' % tracker.port
server = Server('localhost', port=9000, port_end=10000,
key=rpc_device_key,
use_popen=True, silent=True,
tracker_addr=(tracker.host, tracker.port))
fmeasure = measure_methods.measure_rpc
kwargs['rpc_device_key'] = rpc_device_key
kwargs['rpc_tracker_addr'] = (tracker.host, tracker.port)
kwargs['rpc_timeout'] = timeout
elif mode == 'rpc':
fmeasure = measure_methods.measure_rpc
kwargs['rpc_device_key'] = rpc_device_key
kwargs['rpc_priority'] = rpc_priority
kwargs['rpc_timeout'] = rpc_timeout
kwargs['use_ndk'] = use_ndk
assert rpc_device_key, "In rpc mode, a rpc_device_key must be provided"
elif mode == "custom":
assert callable(custom_measure_batch), "In custom mode, custom_measure_func " \
"must be a callable object"
elif mode == 'local-nofork':
fmeasure = measure_methods.measure_local
kwargs['fork_new_process'] = False
else:
raise RuntimeError("Invalid mode: " + mode)
if 'cuda' in task.target.keys and 'rpc_device_key' in kwargs: # query cuda device info
add_cuda_device_info(kwargs['rpc_device_key'], kwargs.get('rpc_tracker_addr'), kwargs)
if 'opencl' in task.target.keys and 'rpc_device_key' in kwargs:
add_opencl_device_info(kwargs['rpc_device_key'], kwargs.get('rpc_tracker_addr'), kwargs)
if check_correctness:
# use llvm to generate a reference input/output
# this option works for tuning topi, but might not work for you custom op
with _target.create("llvm"):
s, arg_bufs = task.instantiate(task.config_space.get(0))
ref_input = [np.random.uniform(size=get_const_tuple(x.shape)).astype(x.dtype)
for x in arg_bufs]
func = build(s, arg_bufs, "llvm")
tvm_buf = [nd.array(x) for x in ref_input]
func(*tvm_buf)
ref_output = [x.asnumpy() for x in tvm_buf]
kwargs['ref_input'], kwargs['ref_output'] = ref_input, ref_output
def measure_batch(measure_inputs):
"""measure the time cost for a batch of configs in real machines"""
if replay_db is not None:
partial_results, measure_inputs =\
filter_inputs(replay_db, measure_inputs, retry=False)
# pack configs
input_packs = []
for i in range(0, len(measure_inputs), pack_size):
input_packs.append(measure_inputs[i:i + pack_size])
# send to measure
futures = []
for input_pack in input_packs:
future = executor.submit(
fmeasure, input_pack,
number=number,
repeat=repeat,
build_option=build_option,
**kwargs
)
futures.append(future)
# transform results
results = []
for future in futures:
result = future.get()
if isinstance(result, Exception):
if mode == 'local-nofork':
# debug usage, raise exception
raise result
tstamp = time.time()
results.extend([MeasureResult((result,), MeasureErrorNo.FLEET_ERROR,
timeout, tstamp)] * pack_size)
else:
results.extend(result)
if replay_db is not None:
if save_to_replay_db: # save result to database
for measure_input, result in zip(measure_inputs, results):
replay_db.save(measure_input, result)
result_idx = 0
for i in range(len(partial_results)):
if partial_results[i] is None:
partial_results[i] = results[result_idx]
result_idx += 1
return partial_results
return results
if mode == 'custom':
measure_batch = custom_measure_batch
measure_batch.parallel_num = parallel_num
if mode == 'local':
measure_batch.aux_objects = {"server": server, "tracker": tracker}
return measure_batch
def add_cuda_device_info(device_key, rpc_tracker_addr, kwargs):
"""Query cuda device info. This is used to set the flags for nvcc compiler
and check the validity of a generated code."""
from .measure_methods import request_remote
remote = request_remote(device_key, rpc_tracker_addr)
ctx = remote.context('cuda', 0)
max_dims = ctx.max_thread_dimensions
kwargs['check_gpu'] = {
'max_shared_memory_per_block': ctx.max_shared_memory_per_block,
'max_threads_per_block': ctx.max_threads_per_block,
'max_thread_x': max_dims[0],
'max_thread_y': max_dims[1],
'max_thread_z': max_dims[2],
}
kwargs["cuda_arch"] = "sm_" + "".join(ctx.compute_version.split('.'))
def add_opencl_device_info(device_key, rpc_tracker_addr, kwargs):
"""Query opencl device info. This is used to check the validity of a generated code."""
from .measure_methods import request_remote
remote = request_remote(device_key, rpc_tracker_addr)
ctx = remote.context('opencl', 0)
max_dims = ctx.max_thread_dimensions
kwargs['check_gpu'] = {
'max_shared_memory_per_block': ctx.max_shared_memory_per_block,
'max_threads_per_block': ctx.max_threads_per_block,
'max_thread_x': max_dims[0],
'max_thread_y': max_dims[1],
'max_thread_z': max_dims[2],
}

python/tvm/autotvm/measure/measure_methods.py

@ -12,20 +12,24 @@ from random import getrandbits
import numpy as np
from ...contrib import ndk, nvcc, util
from ... import rpc, ir_pass, build, build_config, nd, context, TVMError, register_func
from ... import rpc, ir_pass, build, build_config, nd, context, TVMError, register_func, \
target as _target
from ...contrib import nvcc, util, ndk
from ..util import get_const_tuple
from ..env import AutotvmGlobalScope
from .measure import MeasureResult, MeasureErrorNo
from ..task.space import InstantiationError
from .measure import MeasureResult, MeasureErrorNo
from .local_executor import LocalExecutor
class HashMismatchError(ValueError):
"""Raised when the code hash of a submitted config doesn't match that on the
measure side """
pass
def request_remote(device_key, tracker_addr=None, priority=1, timeout=60):
"""request a remote session
@ -34,7 +38,9 @@ def request_remote(device_key, tracker_addr=None, priority=1, timeout=60):
device_key: string
device key of registered device in tracker
tracker_addr: Tuple(string, int), optional
The address of rpc tracker in (host, port) format
The address of rpc tracker in (host, port) format.
If it is None, the environment variables "TVM_TRACKER_HOST"
and "TVM_TRACKER_PORT" will be used.
priority: int, optional
priority of this request, larger is more prior
timeout: float, optional
@ -46,8 +52,8 @@ def request_remote(device_key, tracker_addr=None, priority=1, timeout=60):
"""
# connect to the tracker
if tracker_addr:
host = tracker_addr[0]
port = tracker_addr[1]
host = tracker_addr[0] or os.environ['TVM_TRACKER_HOST']
port = tracker_addr[1] or int(os.environ['TVM_TRACKER_PORT'])
else:
host = os.environ['TVM_TRACKER_HOST']
port = int(os.environ['TVM_TRACKER_PORT'])
@ -58,30 +64,234 @@ def request_remote(device_key, tracker_addr=None, priority=1, timeout=60):
return remote
def _measure_generic(fbuild, input_pack, ref_input, ref_output):
"""Generic measurement function
def create_measure_batch(task, option):
"""Get a standard measure_batch function.
Parameters
----------
fbuild : function takes MeasureInput returns tuple of (time_func, ctx)
The build function used to build each input.
input_pack : list of MeasureInput
The inputs we need to evaluate
ref_input: Array of np.ndarray
Reference input for checking correctness
ref_output: Array of np.ndarray
Reference output for checking correctness
task: tvm.autotvm.task.Task
The tuning task
option: dict
The option for measuring generated code.
You should use the return value of function :any:`measure_option` for this argument.
Returns
-------
res_pack : array of MeasureResult
The list of execution result of measurement.
measure_batch: callable
a callback function to measure a batch of configs
"""
from ..database import filter_inputs
measure_func = option['measure_func']
number, repeat = option['number'], option['repeat']
timeout, parallel_num, do_fork = option['timeout'], option['parallel_num'], option['do_fork']
build_func = option['build_func']
check_correctness = option['check_correctness']
replay_db = option['replay_db']
executor = LocalExecutor(timeout=timeout, do_fork=do_fork)
# convert convenient string to function object
attach_objects = None
if measure_func == 'local':
# start temporary rpc tracker and rpc server for the user
tracker = rpc.Tracker('localhost', port=9000, port_end=10000, silent=True)
device_key = '$local$device$%d' % tracker.port
server = rpc.Server('localhost', port=9000, port_end=10000,
key=device_key,
use_popen=True, silent=True,
tracker_addr=(tracker.host, tracker.port))
measure_func = use_rpc(device_key, tracker.host, tracker.port)
attach_objects = (server, tracker)
build_kwargs = {}
if build_func == 'default':
build_func = default_build_func
if build_func == 'ndk':
build_func = default_build_func
build_kwargs['use_ndk'] = True
# add device info of cuda and opencl target
if ('cuda' in task.target.keys or 'opencl' in task.target.keys) \
and hasattr(measure_func, 'rpc_info'):
rpc_info = measure_func.rpc_info
add_gpu_target_info(task.target, rpc_info["key"], (rpc_info["host"], rpc_info["port"]),
build_kwargs)
if check_correctness:
# use llvm cpu to generate a reference input/output
# this option works for tuning topi, but might not work for your custom op
with _target.create("llvm"):
s, arg_bufs = task.instantiate(task.config_space.get(0))
ref_input = [np.random.uniform(size=get_const_tuple(x.shape)).astype(x.dtype)
for x in arg_bufs]
func = build(s, arg_bufs, "llvm")
tvm_buf = [nd.array(x) for x in ref_input]
func(*tvm_buf)
ref_output = [x.asnumpy() for x in tvm_buf]
else:
ref_input = ref_output = None
def measure_batch(measure_inputs):
"""measure the time cost for a batch of configs in real machines"""
if replay_db is not None:
partial_results, measure_inputs = \
filter_inputs(replay_db, measure_inputs, retry=False)
# launch measure jobs in parallel
pack_size = getattr(measure_func, "pack_size", 1) # measure `pack_size` inputs in one job
futures = []
for i in range(0, len(measure_inputs), pack_size):
input_pack = measure_inputs[i:i + pack_size]
ret = executor.submit(
measure_func,
input_pack,
build_func,
build_kwargs,
number,
repeat,
ref_input,
ref_output)
futures.append(ret)
# transform results
results = []
for future in futures:
result = future.get()
if isinstance(result, Exception):
tstamp = time.time()
results.extend([MeasureResult((result,), MeasureErrorNo.FLEET_ERROR,
timeout, tstamp)] * pack_size)
else:
results.extend(result)
if replay_db is not None:
result_idx = 0
for i in range(len(partial_results)):
if partial_results[i] is None:
partial_results[i] = results[result_idx]
result_idx += 1
return partial_results
return results
measure_batch.parallel_num = parallel_num
# attach server and tracker objects to avoid them being garbage-collected
measure_batch.attach_objects = attach_objects
return measure_batch
def use_rpc(key,
host=None,
port=None,
priority=1,
session_timeout=60,
pack_size=1):
"""
Create a standard measure_func which uses RPC Tracker for measurement.
This measure_func will request a device from the RPC Tracker and
upload the built binary library to that device for measurement.
Parameters
----------
key: str
The registered key of the device in tracker. The tuner will request devices for
measurement by this key.
host: str, optional
The hostname of RPC Tracker. If not set, will use environment variable "TVM_TRACKER_HOST"
port: int, optional
The port of RPC Tracker. If not set, will use environment variable "TVM_TRACKER_PORT"
priority: int, optional
Priority of this task, used by scheduler in tracker
session_timeout: int, optional
Timeout of rpc session
pack_size: int, optional
The number of configs measured in one RPC session.
Usually this can be set to 1. If your device has high overhead for establishing an
rpc connection, set this higher.
"""
def fmeasure(input_pack, build_func, build_kwargs, number, repeat, ref_input, ref_output):
"""Do measurement for a list of inputs inside a same RPC session.
Parameters
----------
input_pack: List of MeasureInput
The inputs of measurement
build_func: callable
Function for building the code. see :any:`default_build_func` for example
build_kwargs: dict
Extra arguments for build_func
number : int, optional
Number of times to do the measurement for average
repeat : int, optional
Number of times to repeat the measurement.
In total, the generated code will be run (1 + number x repeat) times,
where the first one is warm up. The returned result contains `repeat` costs,
each of which is the average of `number` test run.
ref_input: List of numpy array
Reference input for correctness check
ref_output: List of numpy array
Reference output for correctness check
Returns
-------
results: List of MeasureResult
The results for input_pack
"""
remote = request_remote(key, (host, port), priority, session_timeout)
res = _measure_common(input_pack, build_func, build_kwargs, number, repeat,
ref_input, ref_output,
remote)
return res
fmeasure.pack_size = pack_size
fmeasure.rpc_info = {"key": key, "host": host, "port": port}
return fmeasure
def _measure_common(input_pack, build_func, build_kwargs, number, repeat,
ref_input=None, ref_output=None, remote=None):
"""Measure the time cost for a pack of inputs.
(Note: A pack is a list of inputs which will be measured inside a same RPC session)
Parameters
----------
input_pack : list of MeasureInput
The inputs we need to evaluate
build_func : function takes MeasureInput returns tuple of (time_func, ctx, args)
The build function used to build each input.
build_kwargs: Dict
The extra keyword arguments to build_func
number : int, optional
Number of times to do the measurement for average
repeat : int, optional
Number of times to repeat the measurement.
In total, the generated code will be run (1 + number x repeat) times,
where the first one is warm up. The returned result contains `repeat` costs,
each of which is the average of `number` test run.
ref_input: Array of np.ndarray, optional
Reference input for checking correctness
ref_output: Array of np.ndarray, optional
Reference output for checking correctness
remote: RPCSession, optional
The remote RPC session
Returns
-------
res_pack : Array of MeasureResult
The list of results of measurement.
"""
res_pack = []
tmp_dir = util.tempdir() if remote else None
for inp in input_pack:
tic = time.time()
# build function
try:
time_f, ctx, arg_bufs = fbuild(inp)
func, arg_bufs, filename = build_func(inp, tmp_dir, **build_kwargs)
except TVMError as exc:
tstamp = time.time()
msg = str(exc)
@ -92,9 +302,7 @@ def _measure_generic(fbuild, input_pack, ref_input, ref_output):
msg = msg.split('\n')[-2].split(": ")[1]
except Exception: # pylint: disable=broad-except
pass
res_pack.append(MeasureResult((InstantiationError(msg),),
MeasureErrorNo.INSTANTIATION_ERROR,
tstamp - tic, tstamp))
raise InstantiationError(msg)
else:
res_pack.append(MeasureResult((RuntimeError(msg),),
MeasureErrorNo.COMPILE_HOST,
@ -107,14 +315,26 @@ def _measure_generic(fbuild, input_pack, ref_input, ref_output):
tstamp - tic, tstamp))
continue
# upload built module
if remote:
remote.upload(tmp_dir.relpath(filename))
func = remote.load_module(filename)
ctx = remote.context(str(inp.target), 0)
time_f = func.time_evaluator(
func.entry_name, ctx, number=number, repeat=repeat)
else:
ctx = context(str(inp.target), 0)
time_f = func.time_evaluator(
func.entry_name, ctx, number=number, repeat=repeat)
# measure time
errno = MeasureErrorNo.NO_ERROR
try:
if ref_input:
args = [nd.array(x, ctx) for x in ref_input]
args = [nd.array(x, ctx=ctx) for x in ref_input]
else:
args = [nd.empty(get_const_tuple(x.shape), dtype=x.dtype,
ctx=ctx) for x in arg_bufs]
args = [nd.empty(get_const_tuple(x.shape), dtype=x.dtype, ctx=ctx)
for x in arg_bufs]
costs = time_f(*args).results
if len(costs) > 2: # remove largest and smallest value to reduce variance
costs = list(costs)
@ -135,10 +355,35 @@ def _measure_generic(fbuild, input_pack, ref_input, ref_output):
res_pack.append(MeasureResult(costs, errno, tstamp - tic, tstamp))
return res_pack
def _build_func(inp, build_option, kwargs):
"""Build function module. Exception will be raised when error occurs"""
def default_build_func(inp, tmp_dir=None, **kwargs):
"""Build function module. Exception will be raised when any error occurs
Parameters
----------
inp: MeasureInput
The input of this measurement
tmp_dir: tvm.contrib.util.TempDirectory, optional
The temporary directory for exporting built binary library.
If it is not None (in RPC mode), the library in this directory will be uploaded to
remote devices.
kwargs: Dict, optional
Other extra arguments
Returns
-------
func: Function
TVM built function. Typically this is the return value of tvm.build.
args: Array of Buffer or Tensor
The argument list for the function. Typically this is the second argument of tvm.build.
filename: str
The filename of the output build library
"""
# build function
with inp.target:
s, args = inp.task.instantiate(inp.config)
# check the validity of the template and code hash consistency
if not inp.config.valid():
raise InstantiationError(inp.config.errors)
code_hash = getattr(s, 'code_hash', None)
@ -146,140 +391,49 @@ def _build_func(inp, build_option, kwargs):
raise HashMismatchError('got {0:s}, expected {1:s}'
.format(str(inp.config.code_hash), str(code_hash)))
opts = build_option or {}
if "check_gpu" in kwargs:
values = kwargs['check_gpu']
# Add gpu verify pass to filter out invalid configs in advance.
# This can accelerate the tuning process
check_keys = ['max_shared_memory_per_block', 'max_threads_per_block',
'max_thread_x', 'max_thread_y', 'max_thread_z']
opts["add_lower_pass"] = [
(2, gpu_verify_pass(**{key: values[key] for key in check_keys}))]
opts = {}
if "check_gpu" in kwargs: # Add verify pass to filter out invalid configs in advance.
opts["add_lower_pass"] = [(2, gpu_verify_pass(**kwargs['check_gpu']))]
if 'cuda_arch' in kwargs:
set_cuda_target_arch(kwargs['cuda_arch'])
with build_config(**opts):
func = build(s, args, target_host=inp.task.target_host)
return func, args
def measure_rpc(input_pack,
rpc_device_key,
number,
repeat=1,
build_option=None,
rpc_tracker_addr=None,
rpc_priority=1,
rpc_timeout=60,
**kwargs):
"""Measure the time cost on a device by rpc
Parameters
----------
input_pack : list of MeasureInput
The inputs we need to evaluate
rpc_device_key: str
The device key of registered devices in tracker
number : int
Number of times to get the running measurement
repeat : int, optional
How many times we want to repeat the measurement.
build_option: Dict
build options for tvm.build_config
rpc_tracker_addr: Tuple(string, int), optional
The address of rpc tracker in (host, port) format
If is none, will use environment variable
rpc_priority: int, optional
priority of this task, used by scheduler in tracker
rpc_timeout: int, optional
timeout of the rpc session
kwargs: dict, optional
Additional key word arguments
Returns
-------
res_pack : Array of MeasureResult
The list of execution results of measurement.
"""
def _fbuild(inp):
""" Local build function."""
func, args = _build_func(inp, build_option, kwargs)
tmp_dir = util.tempdir()
if not kwargs.get('use_ndk', False):
file_name = "tmp_func_%0x.tar" % getrandbits(64)
path = tmp_dir.relpath(file_name)
func.export_library(path)
# export library to temp directory
if tmp_dir:
if kwargs.get('use_ndk', False): # for Android NDK
filename = "tmp_func_%0x.so" % getrandbits(64)
func.export_library(tmp_dir.relpath(filename), ndk.create_shared)
else:
file_name = "tmp_func_%0x.so" % getrandbits(64)
path = tmp_dir.relpath(file_name)
func.export_library(path, ndk.create_shared)
remote = request_remote(rpc_device_key, rpc_tracker_addr, rpc_priority, rpc_timeout)
remote.upload(path)
func = remote.load_module(file_name)
ctx = remote.context(str(inp.target), 0)
time_f = func.time_evaluator(
func.entry_name, ctx, number=number, repeat=repeat)
return time_f, ctx, args
filename = "tmp_func_%0x.tar" % getrandbits(64)
func.export_library(tmp_dir.relpath(filename))
else:
filename = None
ret = _measure_generic(_fbuild, input_pack,
kwargs.get("ref_input", None), kwargs.get("ref_output", None))
return ret
return func, args, filename
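# Illustrative sketch (not from the patch): any replacement build function is expected to
# follow the same (inp, tmp_dir, **kwargs) -> (func, args, filename) contract as
# default_build_func above. `my_android_build_func` is a hypothetical name; it only reuses
# build, ndk and getrandbits, which this module already imports. Validity checks are elided.
def my_android_build_func(inp, tmp_dir=None, **kwargs):
    """Hypothetical build function that always exports an NDK shared library."""
    with inp.target:
        s, args = inp.task.instantiate(inp.config)
    func = build(s, args, target_host=inp.task.target_host)
    if tmp_dir:
        filename = "tmp_func_%0x.so" % getrandbits(64)
        func.export_library(tmp_dir.relpath(filename), ndk.create_shared)
    else:
        filename = None
    return func, args, filename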
def measure_local(input_pack,
number,
repeat=1,
build_option=None,
**kwargs):
"""Measure the time cost on a local machine.
def add_gpu_target_info(target, device_key, rpc_tracker_addr, kwargs):
"""Add device info for gpu target.
The info will be used to check the validity of generated code."""
remote = request_remote(device_key, rpc_tracker_addr)
ctx = remote.context(str(target), 0)
max_dims = ctx.max_thread_dimensions
kwargs['check_gpu'] = {
'max_shared_memory_per_block': ctx.max_shared_memory_per_block,
'max_threads_per_block': ctx.max_threads_per_block,
'max_thread_x': max_dims[0],
'max_thread_y': max_dims[1],
'max_thread_z': max_dims[2],
}
Parameters
----------
input_pack : list of MeasureInput
The inputs we need to evaluate
number : int
Number of times to get the running measurement
repeat : int, optional
How many times we want to repeat the measurement.
build_option: dict, optional
Build options for tvm.build_config
kwargs: dict, optional
Additional key word arguments
if 'cuda' in target.keys:
kwargs["cuda_arch"] = "sm_" + "".join(ctx.compute_version.split('.'))
Returns
-------
res_pack : Array of MeasureResult
The list of execution results of measurement.
"""
def _fbuild(inp):
""" Local build function """
func, args = _build_func(inp, build_option, kwargs)
ctx = context(str(inp.target), 0)
time_f = func.time_evaluator(
func.entry_name, ctx, number=number, repeat=repeat)
return time_f, ctx, args
ret = _measure_generic(_fbuild, input_pack,
kwargs.get("ref_input", None), kwargs.get("ref_output", None))
return ret
def gpu_verify_pass(**kwargs):
"""Verify the validity of a gpu kernel
This pass will check shared memory size and number of threads per block.
"""
def verify_pass(stmt):
valid = ir_pass.VerifyGPUCode(stmt, kwargs)
if not valid:
raise InstantiationError("Skipped because of invalid gpu kernel")
return stmt
return verify_pass
def set_cuda_target_arch(arch):
"""set target architecture of nvcc compiler"""
AutotvmGlobalScope.current.cuda_target_arch = arch
@register_func
@ -288,6 +442,14 @@ def tvm_callback_cuda_compile(code):
ptx = nvcc.compile_cuda(code, target="ptx", arch=AutotvmGlobalScope.current.cuda_target_arch)
return ptx
def set_cuda_target_arch(arch):
"""set target architecture of nvcc compiler"""
AutotvmGlobalScope.current.cuda_target_arch = arch
def gpu_verify_pass(**kwargs):
"""Verify the validity of a gpu kernel.
This pass will check memory usage and number of threads per block.
"""
def verify_pass(stmt):
valid = ir_pass.VerifyGPUCode(stmt, kwargs)
if not valid:
raise InstantiationError("Skipped because of invalid gpu kernel")
return stmt
return verify_pass


@ -9,15 +9,12 @@ import multiprocessing
import pickle
import json
import time
import os
from collections import OrderedDict
import numpy as np
from .. import build, lower, target as _target
from . import task
from .task import DispatchContext, ConfigEntity
from .task import ConfigEntity, ApplyHistoryBest
from .measure import MeasureInput, MeasureResult
AUTOTVM_LOG_VERSION = 0.1
@ -120,8 +117,8 @@ def decode(row, protocol='json'):
tgt = _target.create(str(tgt))
def clean_json_to_python(x):
"""1. convert all list in x to tuple (hashable)
2. convert unicode to str for python2
"""1. Convert all list in x to tuple (hashable)
2. Convert unicode to str for python2
"""
if isinstance(x, list):
return tuple([clean_json_to_python(a) for a in x])
@ -151,6 +148,7 @@ def decode(row, protocol='json'):
else:
raise RuntimeError("Invalid log protocol: " + protocol)
def load_from_file(filename):
"""Generator: load records from file.
This is a generator that yields the records.
@ -168,105 +166,6 @@ def load_from_file(filename):
yield decode(row)
class ApplyHistoryBest(DispatchContext):
"""
Apply the history best config
Parameters
----------
records : str or iterator of (MeasureInput, MeasureResult)
Collection of tuning records.
If is str, then it should be the filename of a records log file.
Each row of this file is an encoded record pair.
Otherwise, it is an iterator.
default: ConfigEntity, optional
The default config to return when no history records
"""
def __init__(self, records, default=None):
super(ApplyHistoryBest, self).__init__()
self.best_by_targetkey = {}
self.best_by_model = {}
self._default = default
self.load(records)
def load(self, records):
"""Load records to this dispatch context
Parameters
----------
records : str or iterator of (MeasureInput, MeasureResult)
Collection of tuning records.
If is str, then it should be the filename of a records log file.
Each row of this file is an encoded record pair.
Otherwise, it is an iterator.
"""
if isinstance(records, str):
records = load_from_file(records)
if not records:
return
best_by_targetkey = self.best_by_targetkey
best_by_model = self.best_by_model
counter = 0
for inp, res in records:
counter += 1
if res.error_no != 0:
continue
# use target keys in tvm target system as key to build best map
for k in inp.target.keys:
key = (k, inp.task.workload)
if key not in best_by_targetkey:
best_by_targetkey[key] = (inp, res)
else:
_, other_res = best_by_targetkey[key]
if np.mean(other_res.costs) > np.mean(res.costs):
best_by_targetkey[key] = (inp, res)
# use model as key to build best map
for opt in inp.target.options:
if opt.startswith("-model"):
model = opt[7:]
key = (model, inp.task.workload)
if key not in best_by_model:
best_by_model[key] = (inp, res)
else:
_, other_res = best_by_model[key]
if np.mean(other_res.costs) > np.mean(res.costs):
best_by_model[key] = (inp, res)
break
logging.info("Finish loading %d records", counter)
def query(self, target, workload):
if target is None:
raise RuntimeError("Need a target context to find the history best. "
"Hint: If your target is llvm, use `with tvm.target.create('llvm'):`"
" above the dispatcher call. So does other target. ")
# first try matching by model
for opt in target.options:
if opt.startswith("-model"):
model = opt[7:]
key = (model, workload)
if key in self.best_by_model:
return self.best_by_model[key][0].config
# then try matching by target key
for k in target.keys:
key = (k, workload)
if key in self.best_by_targetkey:
return self.best_by_targetkey[key][0].config
if self._default:
return self._default
raise RuntimeError(
"Cannot find config for target=%s, workload=%s" % (target, workload))
def split_workload(in_file, clean=True):
"""Split a log file into separate files, each of which contains only a single workload
This function can also delete duplicated records in log file
@ -326,7 +225,7 @@ def pick_best(in_file, out_file):
----------
in_file: str
The filename of input
out_file:
out_file: str or file
The filename of output
"""
best_context = ApplyHistoryBest(load_from_file(in_file))
@ -338,31 +237,13 @@ def pick_best(in_file, out_file):
for v in best_context.best_by_targetkey.values():
best_set.add(measure_str_key(v[0]))
logging.info("Extract %d best records from the log file", len(best_set))
logging.info("Extract %d best records from the %s", len(best_set), in_file)
fout = open(out_file, 'w') if isinstance(out_file, str) else out_file
fout = open(out_file, 'w')
for inp, res in load_from_file(in_file):
if measure_str_key(inp) in best_set:
fout.write(encode(inp, res) + "\n")
def load_op_param(rootpath=os.path.join(os.path.expanduser('~'), ".tvm", "op_params")):
"""Load pre-tuned parameters of operators.
This function will load all "*.log" file under root path and select best configs.
Parameters
----------
rootpath: str
The root path of stored parameters
"""
best_context = ApplyHistoryBest([])
for dirpath, _, filenames in os.walk(rootpath):
for filename in filenames:
if os.path.splitext(filename)[1] == '.log':
best_context.load(os.path.join(dirpath, filename))
assert not DispatchContext.current, "Cannot load pre-tuned parameters inside a dispatch context"
DispatchContext.current = best_context
best_set.remove(measure_str_key(inp))
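# Illustrative sketch (not from the patch): typical use of pick_best to shrink a log file.
# The file names are hypothetical.
#   pick_best("conv2d_tuning_full.log", "conv2d_tuning_best.log")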
"""
Usage:


@ -9,4 +9,7 @@ of typical tasks of interest.
from .task import Task, create, register, template, get_config, args_to_workload
from .space import ConfigSpace, ConfigEntity
from .code_hash import attach_code_hash, attach_code_hash_to_arg
from .dispatcher import DispatchContext, ApplyConfig, dispatcher
from .dispatcher import DispatchContext, ApplyConfig, ApplyHistoryBest, dispatcher
from .topi_integration import register_topi_compute, register_topi_schedule
from .nnvm_integration import extract_from_graph


@ -12,7 +12,10 @@ of the DispatchContext base class.
"""
from __future__ import absolute_import as _abs
import logging
from decorator import decorate
import numpy as np
from tvm import target as _target
@ -52,25 +55,6 @@ class DispatchContext(object):
DispatchContext.current = self._old_ctx
class ApplyConfig(DispatchContext):
"""Apply a specific config entity during query.
Parameters
----------
config : ConfigSpace or ConfigEntity
The specific configuration we care about.
"""
def __init__(self, config):
super(ApplyConfig, self).__init__()
self._config = config
self.workload = None
def query(self, target, workload):
"""Override query"""
self.workload = workload
return self._config
def dispatcher(fworkload):
"""Wrap a workload dispatcher function.
@ -137,3 +121,124 @@ def dispatcher(fworkload):
fdecorate = decorate(fworkload, dispatch_func)
fdecorate.register = register
return fdecorate
class ApplyConfig(DispatchContext):
"""Apply a specific config entity during query.
Parameters
----------
config : ConfigSpace or ConfigEntity
The specific configuration we care about.
"""
def __init__(self, config):
super(ApplyConfig, self).__init__()
self._config = config
self.workload = None
def query(self, target, workload):
"""Override query"""
self.workload = workload
return self._config
class ApplyHistoryBest(DispatchContext):
"""
Apply the history best config
Parameters
----------
records : str or iterator of (MeasureInput, MeasureResult)
Collection of tuning records.
If it is a str, it should be the filename of a records log file.
Each row of this file is an encoded record pair.
Otherwise, it is an iterator.
default: ConfigEntity, optional
The default config to return when there are no history records
"""
def __init__(self, records, default=None):
super(ApplyHistoryBest, self).__init__()
self.best_by_targetkey = {}
self.best_by_model = {}
self._default = default
if records:
self.load(records)
def load(self, records):
"""Load records to this dispatch context
Parameters
----------
records : str or iterator of (MeasureInput, MeasureResult)
Collection of tuning records.
If it is a str, it should be the filename of a records log file.
Each row of this file is an encoded record pair.
Otherwise, it is an iterator.
"""
from ..record import load_from_file
if isinstance(records, str):
records = load_from_file(records)
if not records:
return
best_by_targetkey = self.best_by_targetkey
best_by_model = self.best_by_model
counter = 0
for inp, res in records:
counter += 1
if res.error_no != 0:
continue
# use target keys in tvm target system as key to build best map
for k in inp.target.keys:
key = (k, inp.task.workload)
if key not in best_by_targetkey:
best_by_targetkey[key] = (inp, res)
else:
_, other_res = best_by_targetkey[key]
if np.mean(other_res.costs) > np.mean(res.costs):
best_by_targetkey[key] = (inp, res)
# use model as key to build best map
for opt in inp.target.options:
if opt.startswith("-model"):
model = opt[7:]
key = (model, inp.task.workload)
if key not in best_by_model:
best_by_model[key] = (inp, res)
else:
_, other_res = best_by_model[key]
if np.mean(other_res.costs) > np.mean(res.costs):
best_by_model[key] = (inp, res)
break
logging.debug("Finish loading %d records", counter)
def query(self, target, workload):
if target is None:
raise RuntimeError("Need a target context to find the history best. "
"Hint: If your target is llvm, use `with tvm.target.create('llvm'):`"
" above the dispatcher call. So does other target. ")
# first try matching by model
for opt in target.options:
if opt.startswith("-model"):
model = opt[7:]
key = (model, workload)
if key in self.best_by_model:
return self.best_by_model[key][0].config
# then try matching by target key
for k in target.keys:
key = (k, workload)
if key in self.best_by_targetkey:
return self.best_by_targetkey[key][0].config
if self._default:
return self._default
raise RuntimeError(
"Cannot find config for target=%s, workload=%s" % (target, workload))


@ -0,0 +1,177 @@
# pylint: disable=unused-variable,invalid-name
"""
Decorator and utilities for the integration with TOPI and NNVM
"""
import warnings
from ... import tensor, placeholder, target as _target
from ..util import get_const_tuple
from .task import create, register
def serialize_args(args):
"""serialize arguments of a topi function to a hashable tuple.
Parameters
----------
args: list of hashable or Tensor
"""
ret = []
for t in args:
if isinstance(t, tensor.Tensor):
ret.append(('TENSOR', get_const_tuple(t.shape), t.dtype))
else:
ret.append(t)
return tuple(ret)
def deserialize_args(args):
"""The inverse function of :code:`serialize_args`.
Parameters
----------
args: list of hashable or Tensor
"""
ret = []
for t in args:
if isinstance(t, tuple) and t[0] == 'TENSOR':
ret.append(placeholder(shape=t[1], dtype=t[2]))
else:
ret.append(t)
return ret
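# Illustrative sketch (not from the patch): a Tensor argument round-trips through the
# ('TENSOR', shape, dtype) form and comes back as a fresh placeholder.
#   data = placeholder((1, 3, 224, 224), dtype='float32')
#   wkl = serialize_args([data, 'NCHW'])   # (('TENSOR', (1, 3, 224, 224), 'float32'), 'NCHW')
#   args = deserialize_args(wkl)           # [<new placeholder>, 'NCHW']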
# Task extractor for nnvm graph
class TaskExtractEnv:
"""Global environment for extracting tuning tasks from nnvm graph"""
current = None
def __init__(self):
import topi
import nnvm
self.symbol2topi = {
nnvm.sym.conv2d: [topi.nn.conv2d, topi.nn.depthwise_conv2d_nchw]
}
self.topi_to_task = {
topi.nn.conv2d: "topi_nn_conv2d",
topi.nn.depthwise_conv2d_nchw: "topi_nn_depthwise_conv2d_nchw",
}
self._register_dummy()
self._register_topi_task()
self.task_collection = []
def _register_dummy(self):
"""Register dummy function to track the topi function call"""
for func in self.topi_to_task:
def _local_scope(local_func):
"""build a scope to holds the function"""
@local_func.register("dummy", )
def _dummy_func(*args, **kwargs):
assert not kwargs, "Do not support extracting tuning tasks when" \
"kwargs is used in TOPI function call." \
"Please modify it to use only positional args."
if (self.topi_to_task[local_func], serialize_args(args)) \
not in self.task_collection:
self.task_collection.append((self.topi_to_task[local_func],
serialize_args(args)))
with _target.create("opencl"):
return local_func(*args)
_local_scope(func)
def _register_topi_task(self):
"""register tuning wrapper for topi function"""
import topi
# Tuning wrapper for topi functions
@register("topi_nn_conv2d")
def _topi_nn_conv2d(*args, **kwargs):
assert not kwargs, "Do not support kwargs in template function call"
args = deserialize_args(args)
A, W = args[:2]
layout = args[-2]
assert layout == 'NCHW', "only support NCHW currently"
C = topi.nn.conv2d(*args, **kwargs)
s = topi.generic.schedule_conv2d_nchw([C])
return s, [A, W, C]
@register("topi_nn_depthwise_conv2d_nchw")
def _topi_nn_depthwise_conv2d_nchw(*args, **kwargs):
assert not kwargs, "Do not support kwargs in template function call"
args = deserialize_args(args)
A, W = args[:2]
C = topi.nn.depthwise_conv2d_nchw(*args, **kwargs)
s = topi.generic.schedule_depthwise_conv2d_nchw([C])
return s, [A, W, C]
def reset(self):
"""Reset task collections"""
self.task_collection = []
def get_tasks(self):
"""Get collected tasks"""
return self.task_collection
@staticmethod
def get():
"""Get the single instance of TaskExtractEnv"""
if not TaskExtractEnv.current:
TaskExtractEnv.current = TaskExtractEnv()
return TaskExtractEnv.current
def extract_from_graph(graph, shape, dtype, target, symbols, target_host=None):
""" Extract tuning tasks from a nnvm graph.
This function collects tuning tasks by building the graph
with a "dummy" target and tracing all the calls to topi.
Parameters
----------
graph : Graph
The graph to tune
shape : dict of str to tuple, optional
The input shape to the graph
dtype : str or dict of str to str
The input types to the graph
target: tvm.target.Target
The compilation target
symbols : Array of nnvm.symbol
Array of nnvm symbols
target_host: tvm.target.Target
The host compilation target
Returns
-------
task: Array of autotvm.task.Task
collected tasks
"""
import nnvm.compiler
env = TaskExtractEnv.get()
topi_funcs = []
for sym_name in symbols:
if sym_name in env.symbol2topi:
topi_funcs.extend(env.symbol2topi[sym_name])
else:
warnings.warn("Symbol %s is not tunable, ignored" % sym_name)
# run compiler to collect all TOPI calls during compilation
env.reset()
dummy_target = _target.create("opencl -device=dummy")
nnvm.compiler.build(graph, target=dummy_target, shape=shape, dtype=dtype)
tasks = []
for task_name, args in env.get_tasks():
tasks.append(create(task_name, args,
target=target, target_host=target_host,
template_key='direct'))
return tasks
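# Illustrative sketch (not from the patch): extracting conv2d tasks from a network.
# The network and shapes are hypothetical values.
#   net, params = nnvm.testing.resnet.get_workload(num_layers=18, batch_size=1)
#   tasks = extract_from_graph(net, shape={'data': (1, 3, 224, 224)}, dtype='float32',
#                              target=tvm.target.create('llvm -device=arm_cpu'),
#                              symbols=(nnvm.sym.conv2d,))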


@ -21,6 +21,11 @@ from tvm.autotvm.util import get_const_int
Axis = namedtuple('Axis', ['space', 'index'])
try:
_long = long
except NameError:
_long = int
class InstantiationError(ValueError):
"""Actively detected error in instantiating a template with a config,
@ -103,7 +108,7 @@ class VirtualAxis(TransformSpace):
VirtualAxis.name_ct += 1
self.name = name
if isinstance(var, int):
if isinstance(var, (int, _long)):
self.length = var
elif isinstance(var, schedule.IterVar):
self.name = var.var.name
@ -114,7 +119,7 @@ class VirtualAxis(TransformSpace):
elif isinstance(var, VirtualAxis):
self.length = var.length
else:
raise RuntimeError("Invalid type of axis")
raise RuntimeError("Invalid type of axis: " + str(type(var)))
@staticmethod
def get_num_output(var, name=None):


@ -362,7 +362,7 @@ def compute_flop(sch):
exp = body[0]
ret += num_element * _count_flop(exp)
ret += traverse([sch[t].op for t in op.input_tensors])
ret += traverse([t.op for t in op.input_tensors])
elif isinstance(op, tensor.PlaceholderOp):
pass
@ -382,5 +382,4 @@ def compute_flop(sch):
raise RuntimeError("Cannot find float number operation in this operator. "
"Please use `cfg.add_flop` to manually set "
"FLOP for this operator")
return ret


@ -0,0 +1,193 @@
# pylint: disable=unused-variable,invalid-name
"""
Decorators for registering tunable templates to TOPI.
These decorators make a simple implementation able to use different configurations
for different workloads.
Here we directly use all arguments to the TOPI call as the "workload", so make sure all the
arguments (except tvm.Tensor) in your calls are hashable. For tvm.Tensor, we will serialize it
to a hashable tuple.
See tvm/topi/python/topi/arm_cpu/depthwise_conv2d.py for example usage.
"""
from ... import _api_internal, tensor
from ..util import get_func_name
from .task import args_to_workload, dispatcher
# A table that records all registered dispatcher for all targets
_REGISTED_DISPATHCER = {
}
def register_topi_compute(topi_compute, target_keys, template_keys, func=None):
"""Register a tunable template for a topi compute function.
After the registration, this topi compute becomes a configuration dispatcher. It uses
all its arguments as the workload and dispatches configurations according to the input workload.
It also stores this "workload" to its final ComputeOp, which can be used to reconstruct
"workload" in the following topi_schedule call.
Parameters
----------
topi_compute: GenericFunc
The topi compute function that will be overloaded
target_keys: str or list of str
The compilation target. The same as the argument of GenericFunc.register.
template_keys: str or list of str
The template key.
We might have several strategies for a single operator (e.g. direct, im2col, winograd).
The template key is used to identify the algorithm strategy.
Every operator must have a "direct" template, which is used by default.
func: None or callable
If it is None, return a decorator.
If it is callable, decorate this function.
Returns
-------
decorator: callable
A decorator
Examples
--------
See tvm/topi/python/topi/arm_cpu/depthwise_conv2d.py for example usage.
"""
fname = get_func_name(topi_compute)
def _decorator(f):
targets = [target_keys] if isinstance(target_keys, str) else target_keys
for target_key in targets:
if target_key not in _REGISTED_DISPATHCER:
_REGISTED_DISPATHCER[target_key] = {}
if topi_compute not in _REGISTED_DISPATHCER[target_key]:
@topi_compute.register(target_key)
@dispatcher
def config_dispatcher(*args, **kwargs):
"""override topi call as a config dispatcher"""
assert not kwargs, "Do not support kwargs in template function call"
return (fname, ) + args_to_workload(args)
_REGISTED_DISPATHCER[target_key][topi_compute] = config_dispatcher
config_dispatcher = _REGISTED_DISPATHCER[target_key][topi_compute]
@config_dispatcher.register(template_keys)
def template_call(cfg, *args, **kwargs):
"""call the topi func and attach workload to compute node"""
assert not kwargs, "Do not support kwargs in template function call"
if f == topi_compute.fdefault:
node = f(*args, **kwargs)
else:
node = f(cfg, *args, **kwargs)
# attach workload to return op
op = node.op
attrs = {}
for k, v in node.op.attrs.items():
attrs[k] = v
attrs['workload'] = (fname, ) + args_to_workload(args)
if isinstance(op, tensor.ComputeOp):
op = _api_internal._ComputeOp(
op.name, op.tag, attrs, op.axis, op.body)
elif isinstance(op, tensor.ExternOp):
op = _api_internal._ExternOp(
op.name, op.tag, attrs,
op.inputs, op.input_placeholders,
op.output_placeholders, op.body)
else:
raise RuntimeError("Unsupported op type: " + str(type(op)))
if isinstance(node, tensor.Tensor):
return op.output(0)
return [op.output(i) for i in range(len(node))]
return f
if func:
_decorator(func)
return _decorator
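# Illustrative sketch (not from the patch): registering a tunable conv2d compute for the
# 'arm_cpu' target. The function name and body are hypothetical; see
# topi/python/topi/arm_cpu/depthwise_conv2d.py for the real usage referenced above.
#   @register_topi_compute(topi.nn.conv2d, 'arm_cpu', 'direct')
#   def conv2d_arm_cpu(cfg, data, kernel, strides, padding, layout, out_dtype):
#       ...  # declare the compute using the knobs defined on cfg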
def register_topi_schedule(topi_schedule, target_keys, template_keys, func=None):
"""Register a tunable template for a topi schedule function.
After the registration, this topi schedule becomes a configuration dispatcher. It dispatches
configurations according to the input workload.
Note that this function will try to find "workload" from all the ComputeOp in the input.
You can attach "workload" to your compute op by using :any:`register_topi_compute`.
Parameters
----------
topi_schedule: GenericFunc
The topi schedule function that will be overloaded
target_keys: str or list of str
The compilation target
template_keys: str or list of str
The template key.
We might have several strategies for a single operator (e.g. direct, im2col, winograd).
The template key is used to identify the algorithm strategy.
Every operator must have a "direct" template, which is used by default.
func: None or callable
If it is None, return a decorator.
If it is callable, decorate this function.
Returns
-------
decorator: callable
A decorator
Examples
--------
See tvm/topi/python/topi/arm_cpu/depthwise_conv2d.py for example usage.
"""
def _decorator(f):
targets = [target_keys] if isinstance(target_keys, str) else target_keys
for target_key in targets:
if target_key not in _REGISTED_DISPATHCER:
_REGISTED_DISPATHCER[target_key] = {}
if topi_schedule not in _REGISTED_DISPATHCER[target_key]:
@topi_schedule.register(target_key)
@dispatcher
def config_dispatcher(outs):
"""override topi call as a workload dispatcher"""
def traverse(tensors):
"""traverse all ops to find attached workload"""
for t in tensors:
op = t.op
if 'workload' in op.attrs:
return op.attrs['workload']
wkl = traverse(op.input_tensors)
if wkl:
return wkl
return None
outs = [outs] if isinstance(outs, tensor.Tensor) else outs
workload = traverse(outs)
if workload is None:
raise RuntimeError("Cannot find workload in attribute of this schedule")
return args_to_workload(workload)
_REGISTED_DISPATHCER[target_key][topi_schedule] = config_dispatcher
config_dispatcher = _REGISTED_DISPATHCER[target_key][topi_schedule]
@config_dispatcher.register(template_keys)
def template_call(cfg, outs):
"""call the schedule func"""
if f == topi_schedule.fdefault:
return f(outs)
return f(cfg, outs)
return f
if func:
_decorator(func)
return _decorator
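# Illustrative sketch (not from the patch): the matching tunable schedule. The workload
# attached by register_topi_compute is recovered automatically from the output ops.
#   @register_topi_schedule(topi.generic.schedule_conv2d_nchw, 'arm_cpu', 'direct')
#   def schedule_conv2d_arm_cpu(cfg, outs):
#       ...  # build and return a tvm.schedule driven by the knobs on cfg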


@ -0,0 +1,123 @@
"""
TopHub: Tensor Operator Hub
To get the best performance, we typically need auto-tuning for specific devices.
TVM releases pre-tuned parameters in TopHub for some common networks and hardware targets.
TVM will download these parameters for you when you create the target for the first time.
"""
import logging
import os
import json
from .task import ApplyHistoryBest
from .. import target as _target
from ..contrib.util import tempdir
from ..contrib.download import download
AUTOTVM_TOPHUB_ROOT_PATH = os.path.join(os.path.expanduser('~'), ".tvm", "tophub")
def _alias(name):
"""convert alias for some packages"""
table = {
'vtacpu': 'vta',
}
return table.get(name, name)
def context(target, extra_files=None):
"""Return the dispatch context with pre-tuned parameters.
The corresponding downloaded *.log files under tophub root path will be loaded.
Users can also add their own files in argument `extra_files`.
Parameters
----------
target: Target
The compilation target
extra_files: list of str, optional
Extra log files to load
"""
rootpath = AUTOTVM_TOPHUB_ROOT_PATH
best_context = ApplyHistoryBest([])
if isinstance(target, str):
target = _target.create(target)
big_target = str(target).split()[0]
if os.path.isfile(os.path.join(rootpath, big_target + ".log")):
best_context.load(os.path.join(rootpath, big_target + ".log"))
for opt in target.options:
if opt.startswith("-device"):
model = _alias(opt[8:])
if os.path.isfile(os.path.join(rootpath, model) + ".log"):
best_context.load(os.path.join(rootpath, model) + ".log")
if extra_files:
for filename in extra_files:
best_context.load(filename)
return best_context
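# Illustrative sketch (not from the patch): compiling with pre-tuned TopHub parameters.
# The target string and network are hypothetical.
#   target = tvm.target.create("llvm -device=arm_cpu -model=rk3399 -target=aarch64-linux-gnu")
#   with context(target):
#       graph, lib, params = nnvm.compiler.build(net, target, shape={'data': (1, 3, 224, 224)})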
def download_package(backend):
"""Download pre-tuned parameters of operators for a backend
Parameters
----------
backend: str
The name of package
"""
rootpath = AUTOTVM_TOPHUB_ROOT_PATH
if not os.path.isdir(rootpath):
# make directory
splits = os.path.split(rootpath)
for j in range(1, len(splits)+1):
path = os.path.join(*splits[:j])
if not os.path.isdir(path):
os.mkdir(path)
backend = _alias(backend)
logging.info("Download pre-tuned parameters for %s", backend)
download("https://raw.githubusercontent.com/uwsaml/tvm-distro/master/tophub/%s.log" % backend,
os.path.join(rootpath, backend + ".log"), True, verbose=0)
def check_package(backend):
"""Check whether have pre-tuned parameters of the certain target.
If not, will download it.
Parameters
----------
backend: str
The name of package
"""
backend = _alias(backend)
if os.path.isfile(os.path.join(AUTOTVM_TOPHUB_ROOT_PATH, backend + ".log")):
return
download_package(backend)
def list_packages():
"""List all available pre-tuned op parameters for targets
Returns
-------
ret: List
All available packages
"""
path = tempdir()
filename = path.relpath("info.json")
logging.info("Download meta info for pre-tuned parameters")
download("https://raw.githubusercontent.com/uwsaml/tvm-distro/master/tophub/info.json",
filename, True, verbose=0)
with open(filename, "r") as fin:
text = "".join(fin.readlines())
info = json.loads(text)
keys = list(info.keys())
keys.sort()
return [(k, info[k]) for k in keys]


@ -1,10 +1,13 @@
# pylint: disable=consider-using-enumerate,invalid-name
"""Namespace of callback utilities of AutoTVM"""
import sys
import time
import numpy as np
from .. import record
def log_to_file(file_out, protocol='json'):
"""Log the tuning records into file.
The rows of the log are stored in the format of autotvm.record.encode.
@ -21,7 +24,6 @@ def log_to_file(file_out, protocol='json'):
callback : callable
Callback function to do the logging.
"""
def _callback(_, inputs, results):
"""Callback implementation"""
if isinstance(file_out, str):
@ -34,55 +36,21 @@ def log_to_file(file_out, protocol='json'):
return _callback
def save_tuner_state(prefix, save_every_sample=100):
"""Save the state of tuner
def log_to_database(db):
"""Save the tuning records to a database object.
Parameters
----------
prefix : srt
prefix of the filename to store state
save_every_sample: int
save the state every x samples
Returns
-------
callback : function
Callback function to do the auto saving.
db: Database
The database
"""
def _callback(tuner, inputs, results):
for _, __ in zip(inputs, results):
try:
ct = len(tuner.visited)
except AttributeError:
ct = 0
if ct % save_every_sample == 0:
tuner.save_state(prefix + "_%d.state" % ct)
return _callback
def log_to_redis(host="localhost", port=6379, dbn=11):
"""Record the tuning record to a redis DB.
Parameters
----------
host: str, optional
Host address of redis db
port: int, optional
Port of redis db
dbn: int, optional
which redis db to use, default 11
"""
# import here so only depend on redis when necessary
import redis
red = redis.StrictRedis(host=host, port=port, db=dbn)
def _callback(_, inputs, results):
"""Callback implementation"""
for inp, result in zip(inputs, results):
red.set(inp, result)
db.save(inp, result)
return _callback
class Monitor(object):
"""A monitor to collect statistic during tuning"""
def __init__(self):
@ -110,3 +78,47 @@ class Monitor(object):
def trial_timestamps(self):
"""get wall clock time stamp of all trials"""
return np.array(self.timestamps)
def progress_bar(total, prefix=''):
"""Display progress bar for tuning
Parameters
----------
total: int
The total number of trials
prefix: str
The prefix of output message
"""
class _Context:
"""Context to store local variables"""
def __init__(self):
self.best_flops = 0
self.cur_flops = 0
self.ct = 0
self.total = total
def __del__(self):
sys.stdout.write(' Done.\n')
ctx = _Context()
tic = time.time()
def _callback(tuner, inputs, results):
ctx.ct += len(inputs)
flops = 0
for inp, res in zip(inputs, results):
if res.error_no == 0:
flops = inp.task.flop / np.mean(res.costs)
ctx.cur_flops = flops
ctx.best_flops = tuner.best_flops
sys.stdout.write('\r%s Current/Best: %7.2f/%7.2f GFLOPS | Progress: (%d/%d) '
'| %.2f s' %
(prefix, ctx.cur_flops/1e9, ctx.best_flops/1e9, ctx.ct, ctx.total,
time.time() - tic))
sys.stdout.flush()
return _callback
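# Illustrative sketch (not from the patch): combining the logging callbacks during tuning.
# `tuner`, `measure_option` and the trial budget are hypothetical.
#   tuner.tune(n_trial=1000, measure_option=measure_option,
#              callbacks=[log_to_file('conv2d.log'), progress_bar(1000, prefix='[conv2d]')])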


@ -117,3 +117,6 @@ class GATuner(Tuner):
def has_next(self):
return len(self.visited) - (len(self.genes) - self.trial_pt) < len(self.space)
def load_history(self, data_set):
pass


@ -25,6 +25,9 @@ class GridSearchTuner(Tuner):
def has_next(self):
return self.counter < len(self.task.config_space)
def load_history(self, data_set):
pass
def __getstate__(self):
return {"counter": self.counter}
@ -56,6 +59,9 @@ class RandomTuner(Tuner):
def has_next(self):
return len(self.visited) < len(self.task.config_space)
def load_history(self, data_set):
pass
def __getstate__(self):
return {"visited": self.counter}


@ -242,7 +242,7 @@ class ModelBasedTuner(Tuner):
self.ys.append(flops)
else:
self.xs.append(index)
self.ys.append(0)
self.ys.append(0.0)
# if we have enough new training samples
if len(self.xs) >= self.plan_size * (self.train_ct + 1) \


@ -26,11 +26,11 @@ class SimulatedAnnealingOptimizer(ModelOptimizer):
If is an Array, then perform linear cooling from temp[0] to temp[1]
early_stop: int, optional
Stop the iteration if the optimal set does not change in `early_stop` rounds
verbose: int, optional
Print log every `verbose` iterations
log_interval: int, optional
Print log every `log_interval` iterations
"""
def __init__(self, task, n_iter=500, temp=(1, 0), persistent=True, parallel_size=128,
early_stop=50, verbose=50):
early_stop=50, log_interval=50):
super(SimulatedAnnealingOptimizer, self).__init__()
self.task = task
@ -41,12 +41,13 @@ class SimulatedAnnealingOptimizer(ModelOptimizer):
self.persistent = persistent
self.parallel_size = min(parallel_size, len(self.task.config_space))
self.early_stop = early_stop or 1e9
self.verbose = verbose
self.log_interval = log_interval
self.points = None
def find_maximums(self, model, num, exclusive):
tic = time.time()
temp, n_iter, early_stop, verbose = self.temp, self.n_iter, self.early_stop, self.verbose
temp, n_iter, early_stop, log_interval = \
self.temp, self.n_iter, self.early_stop, self.log_interval
if self.persistent and self.points is not None:
points = self.points
@ -100,19 +101,18 @@ class SimulatedAnnealingOptimizer(ModelOptimizer):
k += 1
t -= cool
if verbose >= 1 and k % verbose == 0:
if log_interval and k % log_interval == 0:
t_str = "%.2f" % t
logging.info("SA iter: %d\tlast_update: %d\tmax-0: %.2f\tmax-1: %.2f\ttemp: %s\t"
"elapsed: %.2f",
k, k_last_modify, heap_items[0][0],
np.max([v for v, _ in heap_items]), t_str,
time.time() - tic)
logging.debug("SA iter: %d\tlast_update: %d\tmax-0: %.2f\tmax-1: %.2f\ttemp: %s\t"
"elapsed: %.2f",
k, k_last_modify, heap_items[0][0],
np.max([v for v, _ in heap_items]), t_str,
time.time() - tic)
heap_items.sort(key=lambda item: -item[0])
if verbose:
logging.info("SA iter: %d\tlast_update: %d\tmax-0: %.2f\tmax-1: %.2f\telapsed: %.2f",
k, k_last_modify, heap_items[-1][0], heap_items[0][0], time.time() - tic)
logging.info("SA Maximums: %s", heap_items)
logging.debug("SA iter: %d\tlast_update: %d\tmax-0: %.2f\tmax-1: %.2f\telapsed: %.2f",
k, k_last_modify, heap_items[-1][0], heap_items[0][0], time.time() - tic)
logging.debug("SA Maximums: %s", heap_items)
if self.persistent:
self.points = points


@ -7,6 +7,7 @@ import numpy as np
from ..measure import MeasureInput
from ..measure import create_measure_batch
from ..env import GLOBAL_SCOPE
class Tuner(object):
"""Base class for tuners
@ -64,7 +65,7 @@ class Tuner(object):
"""
pass
def tune(self, n_trial, measure_option, early_stop=None, verbose=1, callbacks=()):
def tune(self, n_trial, measure_option, early_stopping=None, callbacks=()):
"""Begin tuning
Parameters
@ -74,11 +75,8 @@ class Tuner(object):
measure_option: dict
The options for how to measure generated code.
You should use the return value of autotvm.measure_option for this argument.
early_stop: int
early_stopping: int
Stop the tuning early when no better config is found in this number of trials
verbose: int
0: silent mode, no output
1: print every measurement result
callbacks: List of callable
A list of callback functions. The signature of callback function is
(Tuner, List of MeasureInput, List of MeasureResult)
@ -87,8 +85,9 @@ class Tuner(object):
"""
measure_batch = create_measure_batch(self.task, measure_option)
parallel_num = getattr(measure_batch, 'parallel_num', 1)
early_stop = early_stop or 1e9
early_stopping = early_stopping or 1e9
GLOBAL_SCOPE.in_tuning = True
i = 0
while i < n_trial:
if not self.has_next():
@ -99,23 +98,22 @@ class Tuner(object):
inputs = [MeasureInput(self.task.target, self.task, config) for config in configs]
results = measure_batch(inputs)
# print info
if verbose >= 1:
for k, (inp, res) in enumerate(zip(inputs, results)):
config = inp.config
if res.error_no == 0:
flops = inp.task.flop / np.mean(res.costs)
else:
flops = 0
if flops > self.best_flops:
self.best_flops = flops
self.best_config = config
self.best_measure_pair = (inp, res)
self.best_iter = i + k
# keep best config
for k, (inp, res) in enumerate(zip(inputs, results)):
config = inp.config
if res.error_no == 0:
flops = inp.task.flop / np.mean(res.costs)
else:
flops = 0
if flops > self.best_flops:
self.best_flops = flops
self.best_config = config
self.best_measure_pair = (inp, res)
self.best_iter = i + k
logging.info("No: %d\tGFLOPS: %.2f/%.2f\tresult: %s\t%s",
i + k + 1, flops / 1e9, self.best_flops / 1e9,
res, config)
logging.debug("No: %d\tGFLOPS: %.2f/%.2f\tresult: %s\t%s",
i + k + 1, flops / 1e9, self.best_flops / 1e9,
res, config)
i += len(results)
@ -124,10 +122,12 @@ class Tuner(object):
for callback in callbacks:
callback(self, inputs, results)
if i > self.best_iter + early_stop:
logging.info("Early stopped. Best iter: %d.", self.best_iter)
if i > self.best_iter + early_stopping:
logging.debug("Early stopped. Best iter: %d.", self.best_iter)
break
GLOBAL_SCOPE.in_tuning = False
del measure_batch
def reset(self):


@ -42,10 +42,10 @@ class XGBoostCostModel(CostModel):
The cost model predicts relative rank score.
num_threads: int, optional
The number of threads.
verbose: int, optional
If is not none, the cost model will print training log every `verbose` iterations.
log_interval: int, optional
If it is not None, the cost model will print a training log every `log_interval` iterations.
"""
def __init__(self, task, feature_type, loss_type, num_threads=None, verbose=20):
def __init__(self, task, feature_type, loss_type, num_threads=None, log_interval=25):
super(XGBoostCostModel, self).__init__()
if xgb is None:
@ -60,7 +60,7 @@ class XGBoostCostModel(CostModel):
self.fea_type = feature_type
self.loss_type = loss_type
self.num_threads = num_threads
self.verbose = verbose
self.log_interval = log_interval
if loss_type == 'reg':
self.xgb_params = {
@ -139,7 +139,8 @@ class XGBoostCostModel(CostModel):
x_train = self._get_feature(xs)
y_train = np.array(ys)
y_train = y_train / np.max(y_train)
y_max = np.max(y_train)
y_train = y_train / max(y_max, 1e-8)
valid_index = y_train > 1e-6
index = np.random.permutation(len(x_train))
@ -160,19 +161,20 @@ class XGBoostCostModel(CostModel):
fevals=[
xgb_average_recalln_curve_score(plan_size),
],
verbose_eval=self.verbose)])
verbose_eval=self.log_interval)])
logging.info("train: %.2f\tobs: %d\terror: %d\tn_cache: %d",
time.time() - tic, len(xs),
len(xs) - np.sum(valid_index),
self.feature_cache.size(self.fea_type))
logging.debug("XGB train: %.2f\tobs: %d\terror: %d\tn_cache: %d",
time.time() - tic, len(xs),
len(xs) - np.sum(valid_index),
self.feature_cache.size(self.fea_type))
def fit_log(self, records, plan_size):
tic = time.time()
self._reset_pool()
args = list(records)
logging.info("Load %d entries from history log file", len(args))
logging.debug("XGB load %d entries from history log file", len(args))
if self.fea_type == 'itervar':
feature_extract_func = _extract_itervar_feature_log
elif self.fea_type == 'knob':
@ -187,7 +189,8 @@ class XGBoostCostModel(CostModel):
x_train = xs
y_train = ys
y_train /= np.max(y_train)
y_max = np.max(y_train)
y_train = y_train / max(y_max, 1e-8)
index = np.random.permutation(len(x_train))
dtrain = xgb.DMatrix(x_train[index], y_train[index])
@ -203,9 +206,9 @@ class XGBoostCostModel(CostModel):
fevals=[
xgb_average_recalln_curve_score(plan_size),
],
verbose_eval=self.verbose)])
verbose_eval=self.log_interval)])
logging.info("train: %.2f\tobs: %d", time.time() - tic, len(xs))
logging.debug("XGB train: %.2f\tobs: %d", time.time() - tic, len(xs))
def predict(self, xs, output_margin=False):
feas = self._get_feature(xs)
@ -232,7 +235,7 @@ class XGBoostCostModel(CostModel):
def clone_new(self):
return XGBoostCostModel(self.task, self.fea_type, self.loss_type,
self.num_threads, self.verbose)
self.num_threads, self.log_interval)
def _get_feature(self, indexes):
"""get features for indexes, run extraction if we do not have cache for them"""
@ -282,7 +285,7 @@ def _extract_itervar_feature_log(arg):
if res.error_no == 0:
y = inp.task.flop / np.mean(res.costs)
else:
y = 0
y = 0.0
return x, y
def _extract_knob_feature_index(index):
@ -301,7 +304,7 @@ def _extract_knob_feature_log(arg):
inp.task.instantiate(config)
y = inp.task.flop / np.mean(res.costs)
else:
y = 0
y = 0.0
return x, y
def _extract_curve_feature_index(index):
@ -325,12 +328,11 @@ def _extract_curve_feature_log(arg):
if res.error_no == 0:
y = inp.task.flop / np.mean(res.costs)
else:
y = 0
y = 0.0
return x, y
def custom_callback(stopping_rounds, metric, fevals, evals=(), log_file=None,
save_file="xgb_checkpoint", save_every=None,
maximize=False, verbose_eval=True):
"""callback function for xgboost to support multiple custom evaluation functions"""
from xgboost.core import EarlyStopException
@ -400,18 +402,12 @@ def custom_callback(stopping_rounds, metric, fevals, evals=(), log_file=None,
continue
infos.append("%s: %.6f" % (item[0], item[1]))
if not isinstance(verbose_eval, bool) and i % verbose_eval == 0:
logging.info("\t".join(infos))
if not isinstance(verbose_eval, bool) and verbose_eval and i % verbose_eval == 0:
logging.debug("\t".join(infos))
if log_file:
with open(log_file, "a") as fout:
fout.write("\t".join(infos) + '\n')
##### save model #####
if save_every and i % save_every == 0:
filename = save_file + ".%05d.bst" % i
logging.info("save model to %s ...", filename)
bst.save_model(filename)
##### choose score and do early stopping #####
score = None
for item in eval_res:
@ -439,7 +435,7 @@ def custom_callback(stopping_rounds, metric, fevals, evals=(), log_file=None,
elif env.iteration - best_iteration >= stopping_rounds:
best_msg = state['best_msg']
if verbose_eval and env.rank == 0:
logging.info("Stopping. Best iteration: %s ", best_msg)
logging.debug("XGB stopped. Best iteration: %s ", best_msg)
raise EarlyStopException(best_iteration)
return callback


@ -40,16 +40,21 @@ class XGBTuner(ModelBasedTuner):
If is not None, the tuner will first select
top-(plan_size * diversity_filter_ratio) candidates according to the cost model
and then pick batch_size of them according to the diversity metric.
log_interval: int, optional
The interval of logging.
If it is 0, output nothing.
Otherwise, output debug information every `log_interval` iterations.
"""
def __init__(self, task, plan_size=32,
feature_type='itervar', loss_type='rank', num_threads=None,
optimizer='sa', diversity_filter_ratio=None):
optimizer='sa', diversity_filter_ratio=None, log_interval=50):
cost_model = XGBoostCostModel(task,
feature_type=feature_type,
loss_type=loss_type,
num_threads=num_threads)
num_threads=num_threads,
log_interval=log_interval // 2)
if optimizer == 'sa':
optimizer = SimulatedAnnealingOptimizer(task)
optimizer = SimulatedAnnealingOptimizer(task, log_interval=log_interval)
else:
assert isinstance(optimizer, ModelOptimizer), "Optimizer must be " \
"a supported name string" \


@ -8,6 +8,16 @@ import numpy as np
from .. import expr, ir_pass
class EmptyContext(object):
"""An empty context"""
def __enter__(self):
pass
def __exit__(self, exc_type, exc_val, exc_tb):
pass
def get_rank(values):
"""get rank of items


@ -6,7 +6,7 @@ import os
import sys
import time
def download(url, path, overwrite=False, size_compare=False):
def download(url, path, overwrite=False, size_compare=False, verbose=1):
"""Downloads the file from the internet.
Set the input options correctly to overwrite or do the size comparison
@ -23,9 +23,10 @@ def download(url, path, overwrite=False, size_compare=False):
size_compare : bool, optional
Whether to do size compare to check downloaded file.
"""
import requests
verbose: int, optional
Verbose level
"""
if sys.version_info >= (3,):
import urllib.request as urllib2
else:
@ -33,6 +34,7 @@ def download(url, path, overwrite=False, size_compare=False):
if os.path.isfile(path) and not overwrite:
if size_compare:
import requests
file_size = os.path.getsize(path)
res_head = requests.head(url)
res_get = requests.get(url, stream=True)
@ -45,7 +47,9 @@ def download(url, path, overwrite=False, size_compare=False):
return
print('File {} exists, skip.'.format(path))
return
print('Downloading from url {} to {}'.format(url, path))
if verbose >= 1:
print('Downloading from url {} to {}'.format(url, path))
# Stateful start time
start_time = time.time()


@ -142,3 +142,35 @@ def which(exec_name):
if os.path.isfile(full_path) and os.access(full_path, os.X_OK):
return full_path
return None
def get_lower_ir(s):
"""Get lower ir code of a schedule.
This is useful for debugging, since you don't have to find all the inputs/outputs
of a schedule in a fused subgraph.
Parameters
----------
s: Schedule
Returns
-------
ir: str
The lower ir
"""
from .. import tensor
from ..build_module import lower
outputs = s.outputs
inputs = []
def find_all(op):
if isinstance(op, tensor.PlaceholderOp):
inputs.append(op.output(0))
else:
for x in op.input_tensors:
find_all(x.op)
for out in outputs:
find_all(out)
return lower(s, inputs, simple_mode=True)
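# Illustrative sketch (not from the patch): dumping the lowered IR of a schedule produced
# inside a tuning template, without collecting its inputs by hand. `task` and `config`
# are hypothetical.
#   s, arg_bufs = task.instantiate(config)
#   print(get_lower_ir(s))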


@ -0,0 +1,44 @@
# pylint: disable=invalid-name
"""Pick best log entries from a large file and store them to a small file"""
import argparse
import os
import logging
import warnings
from .. import autotvm
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--act", type=str, choices=['pick-best'],
help="The action")
parser.add_argument("--i", type=str, help="The input file or directory")
parser.add_argument("--o", type=str, help="The output file")
args = parser.parse_args()
logging.basicConfig(level=logging.INFO)
if args.act == 'pick-best':
if os.path.isfile(args.i):
args.o = args.o or args.i + ".best.log"
autotvm.record.pick_best(args.i, args.o)
elif os.path.isdir(args.i):
args.o = args.o or "best.log"
tmp_filename = args.o + ".tmp"
with open(tmp_filename, 'w') as tmp_fout:
for filename in os.listdir(args.i):
if filename.endswith(".log"):
try:
autotvm.record.pick_best(os.path.join(args.i, filename), tmp_fout)
except Exception: # pylint: disable=broad-except
warnings.warn("Ignore invalid file %s" % filename)
logging.info("Run final filter...")
autotvm.record.pick_best(tmp_filename, args.o)
os.remove(tmp_filename)
logging.info("Output to %s ...", args.o)
else:
raise ValueError("Invalid input file: " + args.i)
else:
raise ValueError("Invalid action " + args.act)


@ -40,20 +40,21 @@ if __name__ == "__main__":
help='The port of the PRC')
parser.add_argument('--port-end', type=int, default=9199,
help='The end search port of the PRC')
parser.add_argument('--tracker', type=str,
help="The address of RPC tracker in host:port format. "
"e.g. (10.77.1.234:9190)")
parser.add_argument('--key', type=str, default="",
help="RPC key used to identify the connection type.")
parser.add_argument('--load-library', type=str, default="",
help="The key used to identify the device type in tracker.")
parser.add_argument('--silent', action='store_true',
help="Whether run in silent mode.")
parser.add_argument('--load-library', type=str,
help="Additional library to load")
parser.add_argument('--tracker', type=str, default="",
help="Report to RPC tracker")
parser.add_argument('--no-fork', dest='fork', action='store_false',
help="Use spawn mode to avoid fork. This option \
is able to avoid potential fork problems with Metal, OpenCL \
and ROCM compilers.")
parser.add_argument('--custom-addr', type=str,
help="Custom IP Address to Report to RPC Tracker")
parser.add_argument('--silent', action='store_true',
help="Whether run in silent mode.")
parser.set_defaults(fork=True)
args = parser.parse_args()


@ -6,13 +6,12 @@ import logging
import argparse
import multiprocessing
import sys
from ..rpc.tracker import Tracker
from .. import rpc
def main(args):
"""Main funciton"""
tracker = Tracker(args.host, port=args.port, port_end=args.port_end,
silent=args.silent)
tracker = rpc.Tracker(args.host, port=args.port, port_end=args.port_end,
silent=args.silent)
tracker.proc.join()

36
python/tvm/exec/tophub.py Normal file

@ -0,0 +1,36 @@
# pylint: disable=invalid-name
"""Download pre-tuned parameters of ops"""
import argparse
import logging
from ..autotvm.tophub import list_packages, download_package
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--download", type=str, nargs='+',
help="Target to download. Use 'all' to download for all targets")
parser.add_argument("-l", "--list", action='store_true', help="List available packages")
args = parser.parse_args()
logging.basicConfig(level=logging.INFO)
if args.list:
info = list_packages()
print("\n%-20s %-20s" % ("Target", "Size"))
print("-" * 41)
for target, info in info:
print("%-20s %-20s" % (target, "%.2f MB" % (info['size']/1000000)))
if args.download:
info = list_packages()
all_targets = [x[0] for x in info]
if 'all' in args.download:
targets = all_targets
else:
targets = args.download
for t in targets:
if t not in all_targets:
print("Warning : cannot find tuned parameters of " + t + ". (ignored)")
download_package(t)


@ -10,4 +10,6 @@ upload and run remote RPC server, get the result back to verify correctness.
"""
from .server import Server
from .tracker import Tracker
from .proxy import Proxy
from .client import RPCSession, LocalSession, TrackerSession, connect, connect_tracker


@ -225,18 +225,24 @@ class TrackerSession(object):
res += item["key"] + "\n"
res += "----------------------------\n"
res += "\n"
res += "Queue Status\n"
res += "----------------------------\n"
res += "key\tfree\tpending\n"
res += "----------------------------\n"
# compute max length of device key
queue_info = data['queue_info']
keys = list(queue_info.keys())
if keys:
keys.sort()
max_key_len = max([len(k) for k in keys])
for k in keys:
res += ("%%-%d" % max_key_len + "s\t%d\t%g\n") % \
(k, queue_info[k]["free"], queue_info[k]["pending"])
else:
max_key_len = 0
res += "Queue Status\n"
res += "----------------------------\n"
res += ("%%-%ds" % max_key_len + "\tfree\tpending\n") % 'key'
res += "----------------------------\n"
for k in keys:
res += ("%%-%ds" % max_key_len + "\t%d\t%g\n") % \
(k, queue_info[k]["free"], queue_info[k]["pending"])
res += "----------------------------\n"
return res


@ -460,6 +460,10 @@ class Proxy(object):
timeout_server : float, optional
Timeout of server until it sees a matching connection.
tracker_addr: Tuple (str, int), optional
The address of the RPC Tracker in (host, port) format.
If it is not None, the server will register itself to the tracker.
index_page : str, optional
Path to an index page that can be used to display at proxy index.


@ -20,6 +20,7 @@ import multiprocessing
import subprocess
import time
import sys
import signal
from .._ffi.function import register_func
from .._ffi.base import py_str
@ -257,7 +258,7 @@ def _popen(cmd):
class Server(object):
"""Start RPC server on a seperate process.
"""Start RPC server on a separate process.
This is a simple python implementation based on multi-processing.
It is also possible to implement a similar C based server with
@ -284,14 +285,21 @@ class Server(object):
This is recommended to switch on if we want to do local RPC demonstration
for GPU devices to avoid fork safety issues.
silent: bool, optional
Whether run this server in silent mode.
tracker_addr: Tuple (str, int), optional
The address of the RPC Tracker in (host, port) format.
If it is not None, the server will register itself to the tracker.
key : str, optional
The key used to identify the server in Proxy connection.
The key used to identify the device type in tracker.
load_library : str, optional
List of additional libraries to be loaded during execution.
custom_addr: str, optional
Custom IP Address to Report to RPC Tracker
silent: bool, optional
Whether run this server in silent mode.
"""
def __init__(self,
host,
@ -299,11 +307,11 @@ class Server(object):
port_end=9199,
is_proxy=False,
use_popen=False,
silent=False,
tracker_addr=None,
key="",
load_library=None,
custom_addr=None):
custom_addr=None,
silent=False):
try:
if base._ServerLoop is None:
raise RuntimeError("Please compile with USE_RPC=1")
@ -313,6 +321,7 @@ class Server(object):
self.port = port
self.libs = []
self.custom_addr = custom_addr
self.use_popen = use_popen
self.logger = logging.getLogger("RPCServer")
if silent:
@ -334,10 +343,7 @@ class Server(object):
if silent:
cmd += ["--silent"]
self.proc = multiprocessing.Process(
target=subprocess.check_call, args=(cmd,))
self.proc.deamon = True
self.proc.start()
self.proc = subprocess.Popen(cmd, preexec_fn=os.setsid)
time.sleep(0.5)
elif not is_proxy:
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
@ -371,9 +377,14 @@ class Server(object):
def terminate(self):
"""Terminate the server process"""
if self.proc:
self.proc.terminate()
self.proc = None
if self.use_popen:
if self.proc:
os.killpg(self.proc.pid, signal.SIGTERM)
self.proc = None
else:
if self.proc:
self.proc.terminate()
self.proc = None
def __del__(self):
self.terminate()


@ -40,6 +40,8 @@ We can also use other specific function in this module to create specific target
"""
from __future__ import absolute_import
import warnings
from ._ffi.base import _LIB_NAME
from ._ffi.node import NodeBase, register_node
from . import _api_internal
@ -51,7 +53,6 @@ except ImportError as err_msg:
if _LIB_NAME != "libtvm_runtime.so":
raise err_msg
def _merge_opts(opts, new_opts):
"""Helper function to merge options"""
if isinstance(new_opts, str):
@ -72,7 +73,7 @@ class Target(NodeBase):
Do not use class constructor, you can create target using the following functions
- :any:`tvm.target.create` create target from string
- :any:`tvm.target.rasp` create raspberry pi target
- :any:`tvm.target.arm_cpu` create arm_cpu target
- :any:`tvm.target.cuda` create CUDA target
- :any:`tvm.target.rocm` create ROCM target
- :any:`tvm.target.mali` create Mali target
@ -374,22 +375,6 @@ def rocm(options=None):
return _api_internal._TargetCreate("rocm", *options)
def rasp(options=None):
"""Returns a rasp target.
Parameters
----------
options : str or list of str
Additional options
"""
opts = ["-device=rasp",
"-mtriple=armv7l-none-linux-gnueabihf",
"-mcpu=cortex-a53",
"-mattr=+neon"]
opts = _merge_opts(opts, options)
return _api_internal._TargetCreate("llvm", *opts)
def mali(options=None):
"""Returns a ARM Mali GPU target.
@ -428,6 +413,52 @@ def opengl(options=None):
return _api_internal._TargetCreate("opengl", *options)
def arm_cpu(model='unknown', options=None):
"""Returns a ARM CPU target.
This function will also download pre-tuned op parameters when there is none.
Parameters
----------
model: str
SoC name or phone name of the arm board.
options : str or list of str
Additional options
"""
from . import autotvm
trans_table = {
"pixel2": ["-model=snapdragon835", "-target=arm64-linux-android"],
"mate10": ["-model=kirin970", "-target=arm64-linux-android"],
"mate10pro": ["-model=kirin970", "-target=arm64-linux-android"],
"p20": ["-model=kirin970", "-target=arm64-linux-android"],
"p20pro": ["-model=kirin970", "-target=arm64-linux-android"],
"rasp3b": ["-model=bcm2837", "-target=armv7l-linux-gnueabihf"],
"rk3399": ["-model=rk3399", "-target=aarch64-linux-gnu"],
"pynq": ["-model=pynq", "-target=armv7a-linux-eabi"],
}
pre_defined_opt = trans_table.get(model, ["-model=%s" % model])
# download pre-tuned parameters for arm_cpu if there is not any.
autotvm.tophub.check_package('arm_cpu')
opts = ["-device=arm_cpu"] + pre_defined_opt
opts = _merge_opts(opts, options)
return _api_internal._TargetCreate("llvm", *opts)
def rasp(options=None):
"""Return a Raspberry 3b target.
Parameters
----------
options : str or list of str
Additional options
"""
warnings.warn('tvm.target.rasp() is going to be deprecated. '
'Please use tvm.target.arm_cpu("rasp3b")')
return arm_cpu('rasp3b', options)
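# Illustrative sketch (not from the patch): creating ARM CPU targets.
#   tvm.target.arm_cpu('rk3399')    # known board: expands to -model=rk3399 plus a triple
#   tvm.target.arm_cpu('my_soc')    # unknown board: falls back to -model=my_soc only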
def create(target_str):
"""Get a target given target string.


@ -261,15 +261,6 @@ Target metal(const std::vector<std::string>& options) {
return CreateTarget("metal", options);
}
Target rasp(const std::vector<std::string>& options) {
return CreateTarget("llvm", MergeOptions(options, {
"-device=rasp",
"-mtriple=armv7l-none-linux-gnueabihf",
"-mcpu=cortex-a53",
"-mattr=+neon"
}));
}
Target mali(const std::vector<std::string>& options) {
return CreateTarget("opencl", MergeOptions(options, {
"-device=mali"
@ -731,11 +722,6 @@ TVM_REGISTER_API("_GetCurrentTarget")
TVM_REGISTER_API("_EnterTargetScope")
.set_body([](TVMArgs args, TVMRetValue* ret) {
Target target = args[0];
auto current = Target::current_target();
if (current.defined() && target->str() != current->str()) {
LOG(WARNING) << "Overriding target " << current->str()
<< " with new target scope " << target->str();
}
Target::EnterTargetScope(target);
});


@ -13,7 +13,6 @@ Module OpenCLModuleCreate(
std::string fmt,
std::unordered_map<std::string, FunctionInfo> fmap,
std::string source) {
LOG(WARNING) << "OpenCL runtime not enabled, return a source module...";
return codegen::DeviceSourceModuleCreate(data, fmt, fmap, "opencl");
}


@ -108,19 +108,19 @@ def test_task_tuner_without_measurement():
"""test task and tuner without measurement"""
task, target = get_sample_task()
def measure_batch(inputs):
def custom_measure(input_pack, build_func, build_args, number, repeat,
ref_input, ref_output):
from tvm.autotvm import MeasureResult
results = []
for inp in inputs:
for inp in input_pack:
tic = time.time()
# do nothing
time.sleep(0.001)
results.append(MeasureResult([time.time() - tic], 0,
time.time() - tic, time.time()))
return results
measure_option = autotvm.measure_option(mode='custom',
custom_measure_batch=measure_batch)
measure_option = autotvm.measure_option(custom_measure)
logging.info("%s", task.config_space)
@ -128,6 +128,7 @@ def test_task_tuner_without_measurement():
for tuner_class in [autotvm.tuner.RandomTuner, autotvm.tuner.GridSearchTuner]:
tuner = tuner_class(task)
tuner.tune(n_trial=10, measure_option=measure_option)
assert tuner.best_flops > 1
def test_tuning_with_measure():
def check(target, target_host):
@ -140,7 +141,7 @@ def test_tuning_with_measure():
task, target = get_sample_task(target, target_host)
logging.info("%s", task.config_space)
measure_option = autotvm.measure_option(mode='local',
measure_option = autotvm.measure_option('local',
timeout=4,
number=2)
@ -152,7 +153,8 @@ def test_tuning_with_measure():
if __name__ == "__main__":
# only print log when invoked from main
logging.basicConfig(level=logging.INFO)
logging.basicConfig(level=logging.DEBUG)
test_task_tuner_without_measurement()
test_tuning_with_measure()
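All of the `measure_option` edits in these tests follow the same migration; a condensed before/after for reference (argument values copied from the tests):

```python
from tvm import autotvm

# old API (removed): the mode was passed as a keyword argument
# measure_option = autotvm.measure_option(mode='local-nofork', timeout=2)

# new API: the mode is the first positional argument, and fork-free
# execution is requested explicitly with do_fork=False
measure_option = autotvm.measure_option('local', do_fork=False, timeout=2)
```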


@ -47,7 +47,7 @@ def test_db_filter():
batch_size = 2
measure_option = autotvm.measure_option(mode='local-nofork', timeout=2)
measure_option = autotvm.measure_option('local', do_fork=False, timeout=2)
measure_batch = autotvm.measure.create_measure_batch(task, measure_option)
ct = 0
@ -72,7 +72,7 @@ def test_db_filter():
db.flush()
# First setting, memoize one input at a time, check that each is saved and replayed
measure_option = autotvm.measure_option(mode='local-nofork', timeout=2, replay_db=db)
measure_option = autotvm.measure_option('local', do_fork=False, timeout=2, replay_db=db)
measure_batch = autotvm.measure.create_measure_batch(task, measure_option)
for i in range(len(all_inputs)+1):
@ -160,9 +160,10 @@ def test_db_save_replay():
if not ctx.exist:
logging.warning("Skip this test because there is no supported device for test")
measure_option = autotvm.measure_option(mode='local-nofork',
measure_option = autotvm.measure_option('local',
do_fork=False,
timeout=2,
replay_db=_db, save_to_replay_db=True)
replay_db=_db)
measure_batch = autotvm.measure.create_measure_batch(task, measure_option)
batch_size = 2
@ -182,6 +183,8 @@ def test_db_save_replay():
results = measure_batch(inputs)
all_results += results
ct += 1
callback = autotvm.callback.log_to_database(_db)
callback(None, all_inputs, all_results)
assert len(_db.db.keys()) == batch_size * TRIAL_LIMIT, \
"%d vs %d" % (len(_db.db.keys()), batch_size * TRIAL_LIMIT)
@ -207,7 +210,7 @@ def test_check_hashmismatch():
if not ctx.exist:
logging.warning("Skip this test because there is no supported device for test")
measure_option = autotvm.measure_option(mode='local-nofork')
measure_option = autotvm.measure_option('local', do_fork=False)
measure_batch = autotvm.measure.create_measure_batch(task, measure_option)
inputs = list()


@ -84,7 +84,7 @@ def test_feature_shape():
targets = [
tvm.target.cuda(),
tvm.target.mali(),
tvm.target.rasp(),
tvm.target.arm_cpu(),
]
for target in targets:


@ -28,7 +28,7 @@ def test_target_dispatch():
with tvm.target.create("cuda"):
assert mygeneric(1) == 3
with tvm.target.rasp():
with tvm.target.arm_cpu():
assert mygeneric(1) == 11
with tvm.target.create("metal"):


@ -2,6 +2,9 @@
export PYTHONPATH=python:nnvm/python:vta/python:topi/python
rm -rf python/tvm/*.pyc python/tvm/*/*.pyc python/tvm/*/*/*.pyc python/tvm/*/*/*/*.pyc
rm -rf ~/.tvm
echo "Running unittest..."
python -m nose -v vta/tests/python/unittest || exit -1
python3 -m nose -v vta/tests/python/unittest || exit -1


@ -24,7 +24,7 @@ from .broadcast import *
from . import nn
from . import x86
from . import cuda
from . import rasp
from . import arm_cpu
from . import mali
from . import intel_graphics
from . import opengl


@ -0,0 +1,5 @@
"""Schedule for ARM CPU"""
from . import conv2d
from . import depthwise_conv2d
from . import bitserial_conv2d


@ -43,7 +43,7 @@ _QUANTIZED_SCHEDULES_NCHW = [
SpatialPackNCHW(1, 1, 8, 1, 16),
]
@_get_schedule.register("rasp")
@_get_schedule.register("arm_cpu")
def _get_schedule_bitserial_conv2d(wkl, layout):
if wkl not in _WORKLOADS:
raise ValueError("no schedule for such workload: {}".format(wkl))
@ -55,7 +55,7 @@ def _get_schedule_bitserial_conv2d(wkl, layout):
return sch
@bitserial_conv2d.register("rasp")
@bitserial_conv2d.register("arm_cpu")
def _declaration_bitserial_conv2d(data, kernel, stride, padding, activation_bits, weight_bits,
layout='NCHW', pack_dtype=None, out_dtype=None, dorefa=False):
if out_dtype is None:
@ -323,7 +323,7 @@ def _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec,
s = s.normalize()
return s
@generic.schedule_bitserial_conv2d_nhwc.register(["rasp"])
@generic.schedule_bitserial_conv2d_nhwc.register(["arm_cpu"])
def schedule_bitserial_conv2d_nhwc(outs):
"""Raspverry pi schedule for bitserial conv2d"""
s = tvm.create_schedule([x.op for x in outs])


@ -0,0 +1,515 @@
# pylint: disable=invalid-name,unused-variable,no-else-return
"""Conv2D schedule for ARM CPU"""
from __future__ import absolute_import as _abs
import numpy as np
import tvm
from tvm import autotvm
from ..generic import schedule_conv2d_nchw, schedule_conv2d_winograd_without_weight_transform
from ..util import traverse_inline, get_const_tuple, const_matrix
from ..nn import pad, conv2d, conv2d_alter_layout, conv2d_winograd_without_weight_transform
from ..nn.util import get_const_int, get_pad_tuple
def _conv_arg_to_workload(data, kernel, strides, padding, layout, out_dtype):
"""convert argument to workload"""
if len(kernel.shape) == 4:
raw_kernel = kernel
else: # the input kernel is transformed by alter_op_layout
shape = get_const_tuple(kernel.shape)
raw_kernel = tvm.placeholder((shape[0] * shape[4], shape[1], shape[2], shape[3]),
dtype=kernel.dtype)
return ('conv2d', ) + autotvm.task.args_to_workload(
[data, raw_kernel, strides, padding, layout, out_dtype])
@conv2d.register('arm_cpu')
@autotvm.task.dispatcher
def conv2d_arm_cpu(data, kernel, strides, padding, layout, out_dtype):
"""TOPI compute callback. Mark this function as a dispatcher, so
this template can assign config according to workload"""
return _conv_arg_to_workload(data, kernel, strides, padding, layout, out_dtype)
@conv2d_arm_cpu.register(['direct'])
def decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype):
"""spatial packing template"""
return _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, num_tile=2)
@autotvm.task.register_topi_schedule(schedule_conv2d_nchw, 'arm_cpu', ['direct', 'winograd'])
def schedule_conv2d_nchw_arm_cpu(cfg, outs):
"""TOPI schedule callback"""
s = tvm.create_schedule([x.op for x in outs])
def _callback(op):
# schedule conv2d
if 'spatial_conv_output' in op.tag:
output = op.output(0)
conv = op.input_tensors[0]
data_vec = conv.op.input_tensors[0]
data_pad = data_vec.op.input_tensors[0]
s[data_pad].compute_inline()
kernel_vec = conv.op.input_tensors[1]
if kernel_vec.op.name == 'kernel_vec':
kernel = kernel_vec.op.input_tensors[0]
else:
kernel = kernel_vec
if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
s[kernel].compute_inline()
_schedule_spatial_pack(cfg, s, data_vec, kernel_vec, conv, output, outs[0])
if 'winograd_conv_output' in op.tag:
output = op.output(0)
_schedule_winograd(cfg, s, output, outs[0])
traverse_inline(s, outs[0].op, _callback)
return s
def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, num_tile):
assert layout == "NCHW", "Only support NCHW"
out_dtype = out_dtype or data.dtype
_, CI, IH, IW = get_const_tuple(data.shape)
if len(kernel.shape) == 4:
pre_packed = False
CO, _, KH, KW = get_const_tuple(kernel.shape)
else: # kernel tensor is pre packed
pre_packed = True
CO, _, KH, KW, VC = get_const_tuple(kernel.shape)
CO = CO * VC
pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, (KH, KW))
HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
N = 1
OH = (IH + pad_top + pad_down - KH) // HSTR + 1
OW = (IW + pad_left + pad_right - KW) // WSTR + 1
data_pad = pad(data, [0, 0, pad_top, pad_left], [0, 0, pad_down, pad_right])
# ==================== define configuration space ====================
n, co, oh, ow = cfg.axis(N), cfg.axis(CO), cfg.axis(OH), cfg.axis(OW)
ci, kh, kw = cfg.reduce_axis(CI), cfg.reduce_axis(KH), cfg.reduce_axis(KW)
if num_tile == 2: # for arm cpu
co, vc = cfg.define_split('tile_co', co, num_outputs=2)
oh, vh = cfg.define_split('tile_oh', oh, num_outputs=2)
ow, vw = cfg.define_split('tile_ow', ow, num_outputs=2)
elif num_tile == 3: # for mali gpu
co, _, vc = cfg.define_split('tile_co', co, num_outputs=3)
oh, _, vh = cfg.define_split('tile_oh', oh, num_outputs=3)
ow, _, vw = cfg.define_split('tile_ow', ow, num_outputs=3)
else:
raise RuntimeError("Invalid num_tile")
cfg.define_reorder("reorder_0",
[n, co, oh, ow, ci, kh, kw, vh, vw, vc],
policy='candidate', candidate=[
[n, co, oh, ow, ci, kh, kw, vh, vw, vc],
[n, co, oh, ow, ci, kh, kw, vc, vh, vw]])
cfg.define_annotate("ann_reduce", [kh, kw], policy='try_unroll')
cfg.define_annotate("ann_spatial", [vh, vw, vc], policy='try_unroll_vec')
# ====================================================================
VC = cfg["tile_co"].size[-1]
VH = cfg["tile_oh"].size[-1]
VW = cfg["tile_ow"].size[-1]
dvshape = (N, OH // VH, OW // VW, CI, VH*HSTR + KH-1, VW*WSTR + KW-1)
kvshape = (CO // VC, CI, KH, KW, VC)
ovshape = (N, CO // VC, OH // VH, OW // VW, VH, VW, VC)
oshape = (N, CO, OH, OW)
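# Each VH x VW output tile reads an overlapping input patch of
# (VH*HSTR + KH - 1) x (VW*WSTR + KW - 1) pixels, which is why data_vec is an
# explicit re-packing of data_pad rather than a plain reshape.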
data_vec = tvm.compute(dvshape, lambda n, h, w, ci, vh, vw:
data_pad[n][ci][h*VH*HSTR+vh][w*VW*WSTR+vw],
name='data_vec')
if pre_packed:
kernel_vec = kernel
else:
kernel_vec = tvm.compute(kvshape, lambda co, ci, kh, kw, vc:
kernel[co*VC+vc][ci][kh][kw],
name='kernel_vec')
ci = tvm.reduce_axis((0, CI), name='ci')
kh = tvm.reduce_axis((0, KH), name='kh')
kw = tvm.reduce_axis((0, KW), name='kw')
conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \
tvm.sum(data_vec[n, h, w, ci, vh*HSTR+kh, vw*WSTR+kw].astype(out_dtype) *
kernel_vec[co, ci, kh, kw, vc].astype(out_dtype),
axis=[ci, kh, kw]), name='conv')
output = tvm.compute(oshape, lambda n, co, h, w:
conv[n][co//VC][h//VH][w//VW][h%VH][w%VW][co%VC],
name='output_unpack', tag='spatial_conv_output',
attrs={'workload': _conv_arg_to_workload(data, kernel, strides, padding,
layout, out_dtype)})
return output
def _schedule_spatial_pack(cfg, s, data_vec, kernel_vec,
conv, output, last):
"""schedule implementation"""
n, co, oh, ow, vh, vw, vc = s[conv].op.axis
ci, kh, kw = s[conv].op.reduce_axis
# schedule conv
cfg["reorder_0"].apply(s, conv, [n, co, oh, ow, ci, kh, kw, vh, vw, vc])
cfg["ann_reduce"].apply(s, conv, [kh, kw],
axis_lens=[get_const_int(kh.dom.extent),
get_const_int(kw.dom.extent)],
max_unroll=16,
cfg=cfg)
cfg["ann_spatial"].apply(s, conv, [vh, vw, vc],
axis_lens=[cfg['tile_oh'].size[-1],
cfg['tile_ow'].size[-1],
cfg['tile_co'].size[-1]],
max_unroll=16,
cfg=cfg)
# schedule fusion
n, co, h, w = s[last].op.axis
co, vc = cfg['tile_co'].apply(s, last, co)
oh, vh = cfg['tile_oh'].apply(s, last, h)
ow, vw = cfg['tile_ow'].apply(s, last, w)
s[last].reorder(n, co, oh, ow, vh, vw, vc)
if last != output:
s[output].compute_inline()
cfg["ann_spatial"].apply(s, last, [vh, vw, vc],
axis_lens=[cfg['tile_oh'].size[-1],
cfg['tile_ow'].size[-1],
cfg['tile_co'].size[-1]],
max_unroll=16,
cfg=cfg)
s[conv].compute_at(s[last], ow)
# mark parallel
s[last].parallel(co)
_, h, _, _, _, _ = s[data_vec].op.axis
s[data_vec].parallel(h)
if kernel_vec.op.name == 'kernel_vec':
co, _, _, _, _ = s[kernel_vec].op.axis
if autotvm.GLOBAL_SCOPE.in_tuning:
# kernel packing will be pre-computed during compilation, so we skip
# this part to make tuning records correct
s[kernel_vec].pragma(co, 'debug_skip_region')
else:
s[kernel_vec].parallel(co)
return s
@conv2d_arm_cpu.register('winograd')
def decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype):
tile_size = 4
return _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype, tile_size)
def _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype, tile_size):
N, CI, IH, IW = get_const_tuple(data.shape)
if len(kernel.shape) == 4:
pre_computed = False
CO, _, KH, KW = get_const_tuple(kernel.shape)
else:
pre_computed = True
H_CAT, W_CAT, CO, CI, VC = get_const_tuple(kernel.shape)
CO *= VC
KH, KW = H_CAT - tile_size + 1, W_CAT - tile_size + 1
HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)
assert layout == 'NCHW'
assert KH == 3 and KW == 3 and HPAD == 1 and WPAD == 1 and HSTR == 1 and WSTR == 1
data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
if tile_size == 4:
G_data = np.array([
[1 / 4.0, 0, 0],
[-1 / 6.0, -1 / 6.0, -1 / 6.0],
[-1 / 6.0, 1 / 6.0, -1 / 6.0],
[1 / 24.0, 1 / 12.0, 1 / 6.0],
[1 / 24.0, -1 / 12.0, 1 / 6.0],
[0, 0, 1]], dtype=np.float32)
B_data = np.array([
[4, 0, 0, 0, 0, 0],
[0, -4, 4, -2, 2, 4],
[-5, -4, -4, -1, -1, 0],
[0, 1, -1, 2, -2, -5],
[1, 1, 1, 1, 1, 0],
[0, 0, 0, 0, 0, 1]], out_dtype)
A_data = np.array([
[1, 0, 0, 0],
[1, 1, 1, 1],
[1, -1, 1, -1],
[1, 2, 4, 8],
[1, -2, 4, -8],
[0, 0, 0, 1]], out_dtype)
elif tile_size == 2:
G_data = np.array([
[1, 0, 0],
[1.0/2, 1.0/2, 1.0/2],
[1.0/2, -1.0/2, 1.0/2],
[0, 0, 1]], np.float32)
B_data = np.array([
[1, 0, 0, 0],
[0, 1, -1, 1],
[-1, 1, 1, 0],
[0, 0, 0, -1]], out_dtype)
A_data = np.array([
[1, 0],
[1, 1],
[1, -1],
[0, -1]], out_dtype)
else:
raise ValueError("Unsupported tile size for winograd: " + str(tile_size))
m = A_data.shape[1]
r = 3
alpha = m + r - 1
K = CO
C = CI
H = (IH + 2 * HPAD - 3) // HSTR + 1
W = (IW + 2 * WPAD - 3) // WSTR + 1
nH, nW = (H + m-1) // m, (W + m-1) // m
P = N * nH * nW
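# Winograd F(m x m, 3 x 3): each output tile is m x m, the transformed tile is
# alpha x alpha with alpha = m + r - 1 (6 when tile_size = 4), and the image is
# covered by nH x nW tiles, so the batched GEMM below runs over P = N*nH*nW tiles.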
cfg.define_split('tile_p', cfg.axis(P), num_outputs=2, filter=lambda x: x.size[-1] <= 16)
cfg.define_split('tile_k', cfg.axis(K), num_outputs=2, filter=lambda x: x.size[-1] <= 16)
VP = cfg['tile_p'].size[-1]
VK = cfg['tile_k'].size[-1]
# pack input tile
input_tile = tvm.compute((C, P // VP, alpha, alpha, VP),
lambda c, b, eps, nu, bb:
data_pad[(b*VP+bb) // (nH*nW)][c][(b*VP+bb) // nW % nH * m + eps]
[(b*VP+bb) % nW * m + nu],
name='d')
# transform kernel
if pre_computed:
U = kernel
else:
G = const_matrix(G_data, 'G')
r_kh = tvm.reduce_axis((0, KH), 'r_kh')
r_kw = tvm.reduce_axis((0, KW), 'r_kw')
U = tvm.compute((alpha, alpha, K // VK, C, VK), lambda eps, nu, k, c, kk:
tvm.sum(kernel[k * VK + kk][c][r_kh][r_kw].astype(out_dtype) *
G[eps][r_kh] * G[nu][r_kw], axis=[r_kh, r_kw]), name='U')
# transform image
B = const_matrix(B_data, 'B')
r_eps = tvm.reduce_axis((0, alpha), 'r_eps')
r_nu = tvm.reduce_axis((0, alpha), 'r_nu')
V = tvm.compute((alpha, alpha, P // VP, C, VP), lambda eps, nu, b, c, bb:
tvm.sum(input_tile[c][b][r_eps][r_nu][bb].astype(out_dtype) *
B[r_eps][eps] * B[r_nu][nu], axis=[r_eps, r_nu]), name='V')
# batch gemm
c = tvm.reduce_axis((0, C), name='c')
M = tvm.compute((alpha, alpha, K, P), lambda eps, nu, k, b:
tvm.sum(U[eps][nu][k // VK][c][k % VK] *
V[eps][nu][b // VP][c][b % VP], axis=c), name='M')
# inverse transform
A = const_matrix(A_data, 'A')
r_eps = tvm.reduce_axis((0, alpha), 'r_eps')
r_nu = tvm.reduce_axis((0, alpha), 'r_nu')
Y = tvm.compute((K, P, m, m), lambda k, b, vh, vw:
tvm.sum(M[r_eps][r_nu][k][b] * A[r_eps][vh] * A[r_nu][vw],
axis=[r_eps, r_nu]), name='Y')
# unpack output
output = tvm.compute((N, K, H, W), lambda n, k, h, w:
Y[k][n * nH * nW + (h//m) * nW + w//m][h % m][w % m],
name='output', tag='winograd_conv_output',
attrs={'workload': _winograd_conv_arg_to_workload(
data, kernel, strides, padding, layout, out_dtype, tile_size)})
# we have to manually assign effective GFLOP for winograd
cfg.add_flop(2 * N * K * H * W * KH * KW * C)
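# (2 * N*K*H*W * KH*KW*C counts each multiply-add of the equivalent direct
# convolution as two FLOPs, so the reported GFLOPS stay comparable with the
# direct template instead of reflecting winograd's reduced arithmetic count.)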
return output
def _schedule_winograd(cfg, s, output, last):
Y = output.op.input_tensors[0]
M, A = Y.op.input_tensors
U, V = M.op.input_tensors
d, B = V.op.input_tensors
data_pad = d.op.input_tensors[0]
# padding
s[data_pad].compute_inline()
# pack input tiles
s[d].compute_inline()
# transform kernel
if isinstance(U.op, tvm.tensor.ComputeOp):
kernel, G = U.op.input_tensors
s[G].compute_inline()
eps, nu, k, c, kk, = s[U].op.axis
r_kh, r_kw = s[U].op.reduce_axis
s[U].reorder(k, c, eps, nu, r_kh, r_kw, kk)
s[U].unroll(eps)
s[U].unroll(nu)
s[U].unroll(r_kh)
s[U].unroll(r_kw)
s[U].vectorize(kk)
if autotvm.GLOBAL_SCOPE.in_tuning:
# kernel transformation will be pre-computed during compilation, so we skip
# this part to make tuning records correct
s[U].pragma(k, 'debug_skip_region')
else:
s[U].parallel(k)
# transform image
DD = s.cache_read(d, 'global', [V])
s[B].compute_inline()
eps, nu, b, c, bb = s[V].op.axis
r_eps, r_nu = s[V].op.reduce_axis
s[V].reorder(b, c, eps, nu, r_eps, r_nu, bb)
s[V].unroll(eps)
s[V].unroll(nu)
s[V].unroll(r_eps)
s[V].unroll(r_nu)
s[DD].compute_at(s[V], c)
s[V].vectorize(bb)
s[V].parallel(b)
# batch gemm
eps, nu, k, b = s[M].op.axis
c = s[M].op.reduce_axis[0]
cfg.define_split('tile_c', c, num_outputs=2, filter=lambda x: x.size[-1] <= 16)
co, ci = cfg['tile_c'].apply(s, M, c)
xo, xi = cfg['tile_p'].apply(s, M, b)
s[M].reorder(eps, nu, xo, co, k, ci, xi)
cfg.define_annotate('ann_reduce', [ci], policy='try_unroll')
cfg.define_annotate('ann_spatial', [k, xi], policy='try_unroll_vec')
cfg['ann_reduce'].apply(s, M, [ci],
axis_lens=[cfg['tile_c'].size[-1]],
max_unroll=16,
cfg=cfg)
cfg['ann_spatial'].apply(s, M, [k, xi])
# inverse transform
s[A].compute_inline()
k, b, vh, vw = s[Y].op.axis
r_eps, r_nu = s[Y].op.reduce_axis
s[Y].unroll(vh)
s[Y].unroll(vw)
s[Y].unroll(r_eps)
s[Y].unroll(r_nu)
# output
n, co, h, w = s[last].op.axis
co, coi = cfg['tile_k'].apply(s, last, co)
s[M].compute_at(s[last], co)
s[last].parallel(co)
MM = s.cache_read(M, 'global', [Y])
m = get_const_int(V.shape[0]) + 1 - 3
ho, wo, hi, wi = s[last].tile(h, w, m, m)
s[Y].compute_at(s[last], wo)
s[MM].compute_at(s[last], wo)
if output != last:
s[output].compute_inline()
def _winograd_conv_arg_to_workload(data, kernel, strides, padding, layout, out_dtype, tile_size):
"""convert argument to workload"""
K = 3
shape = get_const_tuple(kernel.shape)
alpha = tile_size + K - 1
if len(kernel.shape) == 4:
assert shape[2:] == (K, K)
CO, CI = shape[:2]
else:
assert shape[:2] == (alpha, alpha)
CO, CI, VCO = shape[2:]
CO *= VCO
raw_kernel = tvm.placeholder((CO, CI, K, K), dtype=kernel.dtype)
return ('conv2d', ) + autotvm.task.args_to_workload(
[data, raw_kernel, strides, padding, layout, out_dtype])
@conv2d_winograd_without_weight_transform.register(['arm_cpu'])
@autotvm.task.dispatcher
def winograd_ww_config_dispatcher_(data, kernel, strides, padding, layout, out_dtype, tile_size):
return _winograd_conv_arg_to_workload(data, kernel, strides, padding, layout, out_dtype,
tile_size)
@winograd_ww_config_dispatcher_.register(['winograd'])
def decl_winograd_ww(cfg, data, kernel, strides, padding, layout, out_dtype, tile_size):
return _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype,
tile_size)
@autotvm.task.register_topi_schedule(schedule_conv2d_winograd_without_weight_transform,
'arm_cpu', ['winograd'])
def schedule_conv2d_winograd_without_weight_transform_(cfg, outs):
"""TOPI schedule callback"""
s = tvm.create_schedule([x.op for x in outs])
def _callback(op):
if 'winograd_conv_output' in op.tag:
output = op.output(0)
_schedule_winograd(cfg, s, output, outs[0])
traverse_inline(s, outs[0].op, _callback)
return s
@conv2d_alter_layout.register(["arm_cpu", "mali"])
def _alter_conv2d_layout(attrs, inputs, tinfos):
"""Alter op layout for pre-computing kernel transformation"""
import nnvm.symbol as sym
copy_inputs = [s for s in inputs]
new_attrs = {k: attrs[k] for k in attrs.keys()}
assert attrs.get_int_tuple("dilation") == (1, 1), "Does not support dilation " \
"when alter_op_layout is enabled"
strides = attrs.get_int_tuple("strides")
padding = attrs.get_int_tuple("padding")
groups = attrs.get_int('groups')
layout = attrs["layout"]
out_dtype = attrs["out_dtype"]
out_dtype = tinfos[0].dtype if out_dtype == "same" else out_dtype
if groups == 1:
# query config of this workload
workload = _conv_arg_to_workload(tinfos[0], tinfos[1], strides, padding,
layout, out_dtype)
cfg = autotvm.task.DispatchContext.current.query(tvm.target.current_target(), workload)
if cfg.template_key == 'direct': # packing weight tensor
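# e.g. kernel_layout 'OIHW8o' when the innermost output-channel split is 8; this
# matches the (CO//VC, CI, KH, KW, VC) kernel_vec layout of the direct template,
# so the packing is folded into the stored weights instead of being done at runtime.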
new_attrs['kernel_layout'] = 'OIHW%do' % (cfg['tile_co'].size[-1])
return sym.conv2d(*copy_inputs, **new_attrs)
else: # pre-compute weight transformation in winograd
tile_size = 4
weight = sym.contrib.conv2d_winograd_weight_transform(copy_inputs[1],
tile_size=tile_size)
CO, CI, KH, KW = get_const_tuple(tinfos[1].shape)
VC = cfg['tile_k'].size[-1]
weight = sym.reshape(weight,
shape=(KH + tile_size - 1, KW + tile_size - 1, CO // VC, VC, CI))
weight = sym.transpose(weight, axes=[0, 1, 2, 4, 3])
copy_inputs[1] = weight
new_attrs['tile_size'] = tile_size
return sym.contrib.conv2d_winograd_without_weight_transform(*copy_inputs, **new_attrs)
# do nothing for depthwise convolution
return None


@ -0,0 +1,94 @@
# pylint: disable=invalid-name,unused-variable
"""Depthwise convolution schedule for ARM CPU"""
import tvm
from tvm import autotvm
from ..generic import schedule_depthwise_conv2d_nchw
from ..nn import depthwise_conv2d_nchw
from ..util import traverse_inline
# register original implementation of depthwise_conv2d_nchw since we don't need to change this part
autotvm.task.register_topi_compute(depthwise_conv2d_nchw, 'arm_cpu', 'direct',
depthwise_conv2d_nchw.fdefault)
# register customized schedule for arm cpu.
@autotvm.task.register_topi_schedule(schedule_depthwise_conv2d_nchw, 'arm_cpu', 'direct')
def schedule_depthwise_conv2d_nchw_(cfg, outs):
"""Schedule depthwise conv2d"""
outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
s = tvm.create_schedule([x.op for x in outs])
def _schedule(cfg, s, data, data_pad, kernel, output):
A, B, C = data, kernel, output
s[data_pad].compute_inline()
# define tile
n, c, h, w = s[output].op.axis
cfg.define_split('tile_c', c, num_outputs=2)
cfg.define_split('tile_h', h, num_outputs=2)
cfg.define_split('tile_w', w, num_outputs=2)
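# the innermost factor of tile_c becomes the vector lane (vc) used when
# re-packing data, kernel and output below; tile_h / tile_w control the
# spatial blocking of the compute stage.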
# pack data to vector form [n, c, h, w] -> [n, C, h, w, VC]
A0 = s.cache_read(data_pad, "global", C)
_, c, h, w = s[A0].op.axis
c, vc = cfg['tile_c'].apply(s, A0, c)
s[A0].reorder(c, h, w, vc)
A1 = s.cache_write(A0, 'global')
s[A0].compute_inline()
# pack kernel to vector form [co, ci, kh, kw] -> [CO, ci, kh, kw, VC]
B0 = s.cache_read(B, "global", C)
c, m, h, w = s[B0].op.axis
c, vc, = cfg['tile_c'].apply(s, B0, c)
s[B0].reorder(c, m, h, w, vc)
B1 = s.cache_write(B0, 'global')
s[B0].compute_inline()
_, c, h, w = s[C].op.axis
c, vc, = cfg['tile_c'].apply(s, C, c)
s[C].reorder(c, h, w, vc)
# depthwise conv
C0 = s.cache_write(C, 'global')
_, c, h, w, vc = s[C0].op.axis
dh, dw = s[C0].op.reduce_axis
oh, ih = cfg['tile_h'].apply(s, C0, h)
ow, iw = cfg['tile_w'].apply(s, C0, w)
s[C0].reorder(c, oh, ow, dh, dw, ih, iw, vc)
s[A1].compute_at(s[C0], oh)
# try unroll and vectorization
cfg.define_annotate('ann', [ih, iw, vc], policy='try_unroll_vec')
cfg['ann'].apply(s, C0, [ih, iw, vc],
axis_lens=[cfg['tile_h'].size[-1],
cfg['tile_w'].size[-1],
cfg['tile_c'].size[-1]],
max_unroll=16,
cfg=cfg)
# mark parallel
n, c, h, w = s[C].op.axis
s[C].parallel(c)
n, c, h, w, vc = s[C0].op.axis
s[C0].parallel(c)
c, m, h, w, vc = s[B1].op.axis
s[B1].parallel(c)
return s
def _callback(op):
if op.tag == 'depthwise_conv2d_nchw':
output = op.output(0)
kernel = op.input_tensors[1]
data = op.input_tensors[0]
data_pad = None
if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag:
data_pad = data
data = data_pad.op.input_tensors[0]
_schedule(cfg, s, data, data_pad, kernel, output)
traverse_inline(s, outs[0].op, _callback)
return s


@ -91,6 +91,54 @@ def schedule_conv2d_NCHWc(num_filter, kernel_size, strides,
return _default_schedule(outs, False)
@tvm.target.generic_func
def schedule_conv2d_winograd_weight_transform(outs):
"""Schedule for weight transformation of winograd
Parameters
----------
outs: Array of Tensor
The computation graph description of this operator
in the format of an array of tensors.
Returns
-------
sch: Schedule
The computation schedule for the op.
"""
# Typically this is computed in the nnvm PreCompute pass,
# so we provide a schedule here for the cpu (llvm) backend
s = tvm.create_schedule([x.op for x in outs])
output = outs[0]
_, G = s[output].op.input_tensors
s[G].compute_inline()
eps, nu, co, ci = s[output].op.axis
r_kh, r_kw = s[output].op.reduce_axis
s[output].reorder(co, ci, r_kh, r_kw, eps, nu)
for axis in [r_kh, r_kw, eps, nu]:
s[output].unroll(axis)
s[output].parallel(co)
return s
@tvm.target.generic_func
def schedule_conv2d_winograd_without_weight_transform(outs):
"""Schedule for winograd without weight transformation
Parameters
----------
outs: Array of Tensor
The computation graph description of this operator
in the format of an array of tensors.
Returns
-------
sch: Schedule
The computation schedule for the op.
"""
return _default_schedule(outs, False)
@tvm.target.generic_func
def schedule_conv2d_transpose_nchw(outs):
"""Schedule for conv2d_transpose_nchw


@ -3,78 +3,18 @@
"""Conv2D operators"""
from __future__ import absolute_import as _abs
from collections import namedtuple
import numpy as np
import tvm
from .pad import pad
from .util import get_pad_tuple
from ..util import simplify
from ..util import simplify, const_matrix, get_const_tuple
# workload description of conv2d
Workload = namedtuple('Workload',
['in_dtype', 'out_dtype', 'height', 'width', 'in_filter', 'out_filter',
'hkernel', 'wkernel', 'hpad', 'wpad', 'hstride', 'wstride'])
# schedule description of spatial
SpatialPack = namedtuple('SpatialPack',
['vh', 'vw', 'vc', 'ba', 'bc', 'unroll'])
# schedule description of im2col
Im2ColPack = namedtuple('Im2ColPack',
['vp', 'vq', 'ba', 'bc', 'unroll'])
_WORKLOADS = [
# workloads of resnet18 on imagenet
Workload('float32', 'float32', 224, 224, 3, 64, 7, 7, 3, 3, 2, 2),
Workload('float32', 'float32', 56, 56, 64, 64, 3, 3, 1, 1, 1, 1),
Workload('float32', 'float32', 56, 56, 64, 64, 1, 1, 0, 0, 1, 1),
Workload('float32', 'float32', 56, 56, 64, 128, 3, 3, 1, 1, 2, 2),
Workload('float32', 'float32', 56, 56, 64, 128, 1, 1, 0, 0, 2, 2),
Workload('float32', 'float32', 28, 28, 128, 128, 3, 3, 1, 1, 1, 1),
Workload('float32', 'float32', 28, 28, 128, 256, 3, 3, 1, 1, 2, 2),
Workload('float32', 'float32', 28, 28, 128, 256, 1, 1, 0, 0, 2, 2),
Workload('float32', 'float32', 14, 14, 256, 256, 3, 3, 1, 1, 1, 1),
Workload('float32', 'float32', 14, 14, 256, 512, 3, 3, 1, 1, 2, 2),
Workload('float32', 'float32', 14, 14, 256, 512, 1, 1, 0, 0, 2, 2),
Workload('float32', 'float32', 7, 7, 512, 512, 3, 3, 1, 1, 1, 1),
# workloads of mobile net on imagenet
Workload('float32', 'float32', 224, 224, 3, 32, 3, 3, 1, 1, 2, 2),
Workload('float32', 'float32', 112, 112, 32, 64, 1, 1, 0, 0, 1, 1),
Workload('float32', 'float32', 56, 56, 64, 128, 1, 1, 0, 0, 1, 1),
Workload('float32', 'float32', 56, 56, 128, 128, 1, 1, 0, 0, 1, 1),
Workload('float32', 'float32', 28, 28, 128, 256, 1, 1, 0, 0, 1, 1),
Workload('float32', 'float32', 28, 28, 256, 256, 1, 1, 0, 0, 1, 1),
Workload('float32', 'float32', 14, 14, 256, 512, 1, 1, 0, 0, 1, 1),
Workload('float32', 'float32', 14, 14, 512, 512, 1, 1, 0, 0, 1, 1),
Workload('float32', 'float32', 7, 7, 512, 1024, 1, 1, 0, 0, 1, 1),
Workload('float32', 'float32', 7, 7, 1024, 1024, 1, 1, 0, 0, 1, 1),
# workloads of resnet18 on imagenet (int16->int32 version)
Workload('int16', 'int32', 224, 224, 3, 64, 7, 7, 3, 3, 2, 2),
Workload('int16', 'int32', 56, 56, 64, 64, 3, 3, 1, 1, 1, 1),
Workload('int16', 'int32', 56, 56, 64, 64, 1, 1, 0, 0, 1, 1),
Workload('int16', 'int32', 56, 56, 64, 128, 3, 3, 1, 1, 2, 2),
Workload('int16', 'int32', 56, 56, 64, 128, 1, 1, 0, 0, 2, 2),
Workload('int16', 'int32', 28, 28, 128, 128, 3, 3, 1, 1, 1, 1),
Workload('int16', 'int32', 28, 28, 128, 256, 3, 3, 1, 1, 2, 2),
Workload('int16', 'int32', 28, 28, 128, 256, 1, 1, 0, 0, 2, 2),
Workload('int16', 'int32', 14, 14, 256, 256, 3, 3, 1, 1, 1, 1),
Workload('int16', 'int32', 14, 14, 256, 512, 3, 3, 1, 1, 2, 2),
Workload('int16', 'int32', 14, 14, 256, 512, 1, 1, 0, 0, 2, 2),
Workload('int16', 'int32', 7, 7, 512, 512, 3, 3, 1, 1, 1, 1),
# workloads of mobile net on imagenet (int16->int32 version)
Workload('int16', 'int32', 224, 224, 3, 32, 3, 3, 1, 1, 2, 2),
Workload('int16', 'int32', 112, 112, 32, 64, 1, 1, 0, 0, 1, 1),
Workload('int16', 'int32', 56, 56, 64, 128, 1, 1, 0, 0, 1, 1),
Workload('int16', 'int32', 56, 56, 128, 128, 1, 1, 0, 0, 1, 1),
Workload('int16', 'int32', 28, 28, 128, 256, 1, 1, 0, 0, 1, 1),
Workload('int16', 'int32', 28, 28, 256, 256, 1, 1, 0, 0, 1, 1),
Workload('int16', 'int32', 14, 14, 256, 512, 1, 1, 0, 0, 1, 1),
Workload('int16', 'int32', 14, 14, 512, 512, 1, 1, 0, 0, 1, 1),
Workload('int16', 'int32', 7, 7, 512, 1024, 1, 1, 0, 0, 1, 1),
Workload('int16', 'int32', 7, 7, 1024, 1024, 1, 1, 0, 0, 1, 1),
]
# platform specific schedule
_CONV_SCHEDULE = {}
@tvm.target.generic_func
def conv2d(input, filter, strides, padding, layout='NCHW', out_dtype=None):
"""Conv2D operator.
@ -178,137 +118,6 @@ def _get_schedule_NCHWc(wkl, layout, out_layout):
return wkl
def _spatial_pack(data, kernel, stride, padding, out_dtype=None):
""" Compute convolution with pack on spatial axes. """
if out_dtype is None:
out_dtype = data.dtype
assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1"
wkl = _get_workload(data, kernel, stride, padding, out_dtype)
sch = _get_schedule(wkl)
H, W = wkl.height, wkl.width
CI, CO = wkl.in_filter, wkl.out_filter
KH, KW = wkl.hkernel, wkl.wkernel
HPAD, WPAD = wkl.hpad, wkl.wpad
HSTR, WSTR = wkl.hstride, wkl.wstride
HCAT, WCAT = KH-1, KW-1
VH = sch.vh
VW = sch.vw
VC = sch.vc
UNROLL = sch.unroll
TH = H + 2*HPAD
TW = W + 2*WPAD
OH = (H + 2*HPAD - KH) // HSTR + 1
OW = (W + 2*WPAD - KW) // WSTR + 1
dshape = (1, CI, H, W)
dpshape = (1, CI, TH, TW)
dvshape = (1, TH//(VH*HSTR), TW//(VW*WSTR), CI, VH*HSTR+HCAT, VW*WSTR+WCAT)
kshape = (CO, CI, KH, KW)
kvshape = (CO/VC, CI, KH, KW, VC)
ovshape = (1, CO // VC, OH // VH, OW // VW, VH, VW, VC)
oshape = (1, CO, OH, OW)
DOPAD = (HPAD != 0 and WPAD != 0)
if DOPAD:
data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
else:
data_pad = data
data_vec = tvm.compute(dvshape, lambda n, h, w, ci, vh, vw: \
data_pad[n][ci][h*VH*HSTR+vh][w*VW*WSTR+vw], name='data_vec')
kernel_vec = tvm.compute(kvshape, lambda co, ci, dh, dw, vc: \
kernel[co*VC+vc][ci][dh][dw], name='kernel_vec')
ci = tvm.reduce_axis((0, CI), name='ci')
dh = tvm.reduce_axis((0, KH), name='dh')
dw = tvm.reduce_axis((0, KW), name='dw')
conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \
tvm.sum(data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw].astype(out_dtype) *
kernel_vec[co, ci, dh, dw, vc].astype(out_dtype),
axis=[ci, dh, dw]), name='conv')
output = tvm.compute(oshape, lambda n, co, h, w:
conv[n][co//VC][h/VH][w//VW][h%VH][w%VW][co%VC],
name='output_unpack', tag='spatial_conv_output')
return output
def _im2col_pack(data, kernel, stride, padding, out_dtype=None):
""" Compute convolution with im2col pack layout. """
if out_dtype is None:
out_dtype = data.dtype
assert data.shape[0].value == 1, "im2col pack convolution only support batch size=1"
wkl = _get_workload(data, kernel, stride, padding, out_dtype)
sch = _get_schedule(wkl)
N = 1
H, W = wkl.height, wkl.width
CI = wkl.in_filter
CO = wkl.out_filter
KH, KW = wkl.hkernel, wkl.wkernel
HPAD, WPAD = wkl.hpad, wkl.hpad
HSTR, WSTR = wkl.hstride, wkl.wstride
OH = (H + 2*HPAD - KH) // HSTR + 1
OW = (W + 2*WPAD - KW) // WSTR + 1
P = sch.vp
Q = sch.vq
UNROLL = sch.unroll
dshape = (N, CI, H, W)
dpshape = (N, CI, H+2*HPAD, W+2*WPAD)
dcshape = (N, OH, OW, CI, KH, KW)
dvshape = (N, OH * OW // P, CI, KH, KW, P)
kshape = (CO, CI, KH, KW)
kvshape = (CO // Q, CI, KH, KW, Q)
ovshape = (N, CO // Q, OH * OW // P, P, Q)
oshape = (N, CO, OH, OW)
############### declaration
DO_PAD = (wkl.hpad != 0 and wkl.wpad != 0)
if DO_PAD:
data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
else:
data_pad = data
data_col = tvm.compute(dcshape, lambda n, oh, ow, ci, hk, wk: \
data_pad[n][ci][oh*HSTR+hk][ow*WSTR+wk], name='data_col')
data_vec = tvm.compute(dvshape, lambda n, im, ci, hk, wk, vim: \
data_col[n][(im*P+vim)//OW][(im*P+vim)%OW][ci][hk][wk], name='data_vec')
kernel_vec = tvm.compute(kvshape, lambda co, ci, dh, dw, vc: \
kernel[co*Q+vc][ci][dh][dw], name='kernel_vec')
ci = tvm.reduce_axis((0, CI), name='ci')
hk = tvm.reduce_axis((0, KH), name='hk')
wk = tvm.reduce_axis((0, KW), name='wk')
conv = tvm.compute(ovshape, lambda n, co, im, vim, vco: \
tvm.sum(data_vec[n][im][ci][hk][wk][vim].astype(out_dtype) *
kernel_vec[co][ci][hk][wk][vco].astype(out_dtype),
axis=[ci, hk, wk]), name='conv')
output = tvm.compute(oshape, lambda n, co, h, w: \
conv[n][co//Q][(h*OW+w)//P][(h*OW+w)%P][co%Q],
name='output_vec', tag='im2col_conv_output')
return output
def conv2d_nchw(Input, Filter, stride, padding, out_dtype=None):
"""Convolution operator in NCHW layout.
@ -435,7 +244,7 @@ def conv2d_nhwc(Input, Filter, stride, padding, out_dtype='float32'):
Returns
-------
output : tvm.Tensor
4-D with shape [batch, out_height, out_width, out_channel]
4-D with shape [batch, out_height, out_width, out_channel]
"""
assert isinstance(stride, int) or len(stride) == 2
batch, in_height, in_width, in_channel = Input.shape
@ -465,6 +274,7 @@ def conv2d_nhwc(Input, Filter, stride, padding, out_dtype='float32'):
name="Conv2dOutput", tag="conv2d_nhwc")
return Output
@tvm.target.generic_func
def conv2d_NCHWc(data, kernel, num_filter, kernel_size, stride,
padding, layout, out_layout, out_dtype='float32'):
@ -510,8 +320,80 @@ def conv2d_NCHWc(data, kernel, num_filter, kernel_size, stride,
# default declaration
raise ValueError("missing register for topi.nn.conv2d_NCHWc")
# map from schedule type to declaration function
_SCH_TO_DECL_FUNC = {
SpatialPack: _spatial_pack,
Im2ColPack: _im2col_pack,
}
def conv2d_winograd_weight_transform(kernel, tile_size):
"""Weight transformation for winograd
Parameters
----------
kernel: Tensor
The raw kernel tensor with layout "NCHW". Only 3x3 kernel is supported for now
tile_size: int
Tile size of winograd transform. e.g. 2 for F(2x2, 3x3) and 4 for F(4x4, 3x3)
Returns
-------
output : tvm.Tensor
4-D with shape [alpha, alpha, CO, CI]
"""
K = 3
shape = get_const_tuple(kernel.shape)
assert shape[2:] == (K, K), "Only support 3x3 kernel"
r = tile_size + K - 1
shape = (r, r) + shape[:2]
if tile_size == 2:
G_data = np.array([
[1, 0, 0],
[1.0/2, 1.0/2, 1.0/2],
[1.0/2, -1.0/2, 1.0/2],
[0, 0, 1],
], dtype=kernel.dtype)
elif tile_size == 4:
G_data = np.array([
[1 / 4.0, 0, 0],
[-1 / 6.0, -1 / 6.0, -1 / 6.0],
[-1 / 6.0, 1 / 6.0, -1 / 6.0],
[1 / 24.0, 1 / 12.0, 1 / 6.0],
[1 / 24.0, -1 / 12.0, 1 / 6.0],
[0, 0, 1]
], dtype=kernel.dtype)
else:
raise ValueError("Unsupoorted tile size:" + tile_size)
G = const_matrix(G_data, 'G')
r_kh = tvm.reduce_axis((0, K), name='r_kh')
r_kw = tvm.reduce_axis((0, K), name='r_kw')
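# U[eps, nu, co, ci] = sum_{r_kh, r_kw} G[eps, r_kh] * kernel[co, ci, r_kh, r_kw] * G[nu, r_kw],
# i.e. the standard winograd weight transform U = G * W * G^T applied per (co, ci) pair.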
return tvm.compute(shape, lambda eps, nu, co, ci:
tvm.sum(kernel[co][ci][r_kh][r_kw] *
G[eps][r_kh] * G[nu][r_kw],
axis=[r_kh, r_kw]), name='transform_weight')
@tvm.target.generic_func
def conv2d_winograd_without_weight_transform(input, filter, strides, padding,
layout, out_dtype, tile_size):
"""Compute convolution in winograd algorithm. The filter is supposed to be transformed
in advance.
Parameters
----------
input : tvm.Tensor
4-D with shape [batch, in_height, in_width, in_channel]
filter : tvm.Tensor
4-D with shape [filter_height, filter_width, in_channel, num_filter]
strides : int or a list/tuple of two ints
Stride size, or [stride_height, stride_width]
padding : int or str
Padding size, or ['VALID', 'SAME']
tile_size: int
Tile size of winograd transform. e.g. 2 for F(2x2, 3x3) and 4 for F(4x4, 3x3)
Returns
-------
output : tvm.Tensor
4-D with shape [batch, out_height, out_width, out_channel]
"""
raise ValueError("missing register for topi.nn.conv2d_winograd_without_weight_transform")


@ -1,7 +0,0 @@
# pylint: disable=redefined-builtin, wildcard-import
"""Raspberry pi specific declaration and schedules."""
from __future__ import absolute_import as _abs
from .conv2d import schedule_conv2d_nchw
from .depthwise_conv2d import schedule_depthwise_conv2d_nchw
from .bitserial_conv2d import schedule_bitserial_conv2d_nhwc


@ -1,358 +0,0 @@
# pylint: disable=invalid-name,unused-variable,invalid-name
"""Conv2D schedule on raspberry pi"""
from __future__ import absolute_import as _abs
import tvm
from .. import tag
from ..nn.conv2d import conv2d as _conv2d, _get_schedule
from ..nn.conv2d import SpatialPack, Im2ColPack
from ..nn.conv2d import _WORKLOADS, _SCH_TO_DECL_FUNC
from ..nn.conv2d import _get_workload
from ..nn.util import infer_pad, infer_stride
from .. import generic
_SCHEDULES = [
# float32 imagenet
SpatialPack(1, 8, 4, 1, 4, True),
SpatialPack(1, 7, 4, 2, 4, True),
SpatialPack(1, 4, 8, 4, 1, True),
SpatialPack(1, 4, 4, 1, 16, False),
SpatialPack(1, 4, 8, 4, 8, False),
SpatialPack(1, 7, 4, 3, 8, True),
SpatialPack(1, 2, 8, 1, 8, True),
SpatialPack(2, 1, 16, 1, 4, True),
SpatialPack(1, 7, 4, 1, 1, True),
Im2ColPack(7, 4, 1, 16, True),
Im2ColPack(7, 4, 1, 8, False),
Im2ColPack(7, 4, 1, 16, False),
# float32 mobilenet
SpatialPack(2, 2, 4, 28, 1, True),
SpatialPack(1, 4, 8, 14, 1, False),
SpatialPack(1, 2, 16, 8, 1, True),
SpatialPack(1, 4, 8, 8, 8, True),
SpatialPack(2, 2, 8, 1, 1, False),
SpatialPack(1, 4, 8, 4, 8, False),
SpatialPack(2, 2, 8, 1, 4, False),
SpatialPack(2, 2, 8, 1, 8, False),
Im2ColPack(7, 4, 1, 16, False),
Im2ColPack(7, 4, 1, 4, True),
# int8 imagenet
SpatialPack(2, 2, 4, 19, 8, False),
SpatialPack(2, 2, 8, 1, 4, True),
SpatialPack(2, 2, 8, 7, 4, False),
SpatialPack(2, 4, 4, 7, 16, False),
SpatialPack(1, 7, 4, 14, 4, True),
SpatialPack(2, 2, 8, 5, 1, False),
SpatialPack(1, 2, 16, 3, 8, True),
SpatialPack(1, 7, 4, 1, 16, True),
SpatialPack(2, 2, 8, 2, 16, True),
SpatialPack(1, 1, 8, 4, 4, True),
SpatialPack(1, 1, 4, 1, 8, False),
SpatialPack(1, 1, 8, 1, 16, True),
# int8 mobilenet
SpatialPack(2, 2, 8, 8, 1, True),
SpatialPack(1, 7, 4, 16, 4, True),
SpatialPack(1, 4, 8, 1, 1, True),
SpatialPack(1, 4, 8, 1, 1, True),
SpatialPack(1, 4, 8, 4, 8, True),
SpatialPack(1, 4, 8, 7, 1, True),
SpatialPack(1, 2, 8, 2, 32, True),
SpatialPack(1, 2, 16, 2, 16, True),
SpatialPack(1, 1, 32, 1, 16, False),
SpatialPack(1, 1, 16, 1, 32, True),
]
@_get_schedule.register("rasp")
def _get_schedule_conv2d(wkl):
if wkl not in _WORKLOADS:
raise ValueError("no schedule for such workload: {}".format(wkl))
idx = _WORKLOADS.index(wkl)
sch = _SCHEDULES[idx]
return sch
@_conv2d.register("rasp")
def _declaration_conv2d(data, kernel, stride, padding, layout, out_dtype):
if out_dtype is None:
out_dtype = data.dtype
assert layout == 'NCHW', "only support NCHW convolution on rasp"
assert data.shape[0].value == 1, "only support batch size=1 convolution on rasp"
wkl = _get_workload(data, kernel, stride, padding, out_dtype)
sch = _get_schedule(wkl)
return _SCH_TO_DECL_FUNC[type(sch)](data, kernel, stride, padding, out_dtype)
def _schedule_spatial_conv2d(s, data, data_pad, data_vec,
kernel, kernel_vec,
conv_out, output, last):
# no stride and padding info here
padding = infer_pad(data, data_pad)
if data_pad is None:
stride = infer_stride(data, kernel, output)
else:
stride = infer_stride(data_pad, kernel, output)
wkl = _get_workload(data, kernel, stride, padding, output.dtype)
sch = _get_schedule(wkl)
H, W = wkl.height, wkl.width
CI, CO = wkl.in_filter, wkl.out_filter
HK, WK = wkl.hkernel, wkl.wkernel
HPAD, WPAD = wkl.hpad, wkl.wpad
HSTR, WSTR = wkl.hstride, wkl.wstride
HCAT, WCAT = HK-1, WK-1
DOPAD = (HPAD != 0 and WPAD != 0)
VH = sch.vh
VW = sch.vw
VC = sch.vc
UNROLL = sch.unroll
A, B, C = data, kernel, last
A0, A1 = data_pad, data_vec
B0 = kernel_vec
C0, C1 = conv_out, output
CC = s.cache_write(C0, "global")
_, co, oh, ow, vh, vw, vc = s[C0].op.axis
if UNROLL:
s[C0].unroll(vw)
s[C0].vectorize(vc)
s[CC].compute_at(s[C0], ow)
_, co, oh, ow, vh, vw, vc = s[CC].op.axis
ci, dh, dw = s[CC].op.reduce_axis
s[CC].reorder(ci, dh, vh, dw, vw, vc)
if UNROLL:
s[CC].unroll(vw)
s[CC].vectorize(vc)
##### Schedule A
if DOPAD:
s[A0].compute_inline()
_, h, _, _, _, _ = s[A1].op.axis
if sch.ba == 1:
oaxis = h
paxis = h
else:
oh, ih = s[A1].split(h, sch.ba)
oaxis = oh
paxis = ih
s[A1].parallel(paxis)
s[A1].pragma(oaxis, "parallel_launch_point")
s[A1].pragma(paxis, "parallel_stride_pattern")
s[A1].pragma(oaxis, "parallel_barrier_when_finish")
##### Schedule B
co, _, _, _, _ = s[B0].op.axis
if sch.bc == 1:
oaxis = co
paxis = co
else:
oco, ico = s[B0].split(co, sch.bc)
oaxis = oco
paxis = ico
s[B0].parallel(paxis)
s[B0].pragma(oaxis, "parallel_launch_point")
s[B0].pragma(paxis, "parallel_stride_pattern")
s[B0].pragma(oaxis, "parallel_barrier_when_finish")
##### Schedule C
n, co, h, w = s[C].op.axis
co, vc = s[C].split(co, VC)
oh, ow, vh, vw = s[C].tile(h, w, VH, VW)
s[C].reorder(n, co, oh, ow, vh, vw, vc)
if C != C1:
s[C1].compute_inline()
s[C0].compute_at(s[C], ow)
if sch.bc == 1:
oaxis = co
paxis = co
else:
oco, ico = s[C].split(co, sch.bc)
oaxis = oco
paxis = ico
s[C].parallel(paxis)
s[C].pragma(oaxis, "parallel_launch_point")
s[C].pragma(paxis, "parallel_stride_pattern")
s[C].pragma(oaxis, "parallel_barrier_when_finish")
return s
def _schedule_im2col_conv2d(s, data, data_pad, data_col, data_vec,
kernel, kernel_vec,
conv_out, output, last):
# no stride and padding info here
padding = infer_pad(data, data_pad)
if data_pad is None:
stride = infer_stride(data, kernel, output)
else:
stride = infer_stride(data_pad, kernel, output)
wkl = _get_workload(data, kernel, stride, padding, output.dtype)
sch = _get_schedule(wkl)
H, W = wkl.height, wkl.width
CI = wkl.in_filter
CO = wkl.out_filter
HK, WK = wkl.hkernel, wkl.wkernel
HPAD, WPAD = wkl.hpad, wkl.wpad
HSTR, WSTR = wkl.hstride, wkl.wstride
HCAT, WCAT = HK-1, WK-1
DOPAD = (HPAD != 0 and WPAD != 0)
P = sch.vp
Q = sch.vq
UNROLL = sch.unroll
A, B, C = data, kernel, last
A0, A1, A2 = data_pad, data_col, data_vec
B0 = kernel_vec
C0, C1 = conv_out, output
CC = s.cache_write(C0, "global")
AA = s.cache_read(A2, "global", [CC])
BB = s.cache_read(B0, "global", [CC])
##### Schedule CC
_, co, im, vim, vco = s[C0].op.axis
s[C0].unroll(vim)
s[C0].vectorize(vco)
s[CC].compute_at(s[C0], im)
_, co, im, vim, vco = s[CC].op.axis
ci, hk, wk = s[CC].op.reduce_axis
s[CC].reorder(ci, hk, wk, vim, vco)
s[CC].unroll(vim)
s[CC].vectorize(vco)
# s[CC].unroll(ccr)
### Schedule C
_, co, h, w = s[C].op.axis
im = s[C].fuse(h, w)
im, vim = s[C].split(im, P)
co, vco = s[C].split(co, Q)
s[C].reorder(co, im, vim, vco)
if sch.bc == 1:
oaxis = co
paxis = co
else:
oco, ico = s[C].split(co, sch.bc)
oaxis = oco
paxis = ico
s[C].parallel(paxis)
s[C].pragma(oaxis, "parallel_launch_point")
s[C].pragma(paxis, "parallel_stride_pattern")
s[C].pragma(oaxis, "parallel_barrier_when_finish")
if C1 != C:
s[C1].compute_inline()
s[C0].compute_at(s[C], paxis)
##### Schedule A
if DOPAD:
s[A0].compute_inline()
s[A1].compute_inline()
s[AA].compute_at(s[CC], wk)
s[AA].unroll(AA.op.axis[4])
_, im, _, _, _, _ = s[A2].op.axis
if sch.ba == 1:
oaxis = im
paxis = im
else:
oim, iim = s[A2].split(im, sch.ba)
oaxis = oim
paxis = iim
s[A2].parallel(paxis)
s[A2].pragma(oaxis, "parallel_launch_point")
s[A2].pragma(paxis, "parallel_stride_pattern")
s[A2].pragma(oaxis, "parallel_barrier_when_finish")
##### Schedule B
s[BB].compute_at(s[CC], wk)
s[BB].vectorize(BB.op.axis[4])
co, _, _, _, _ = s[B0].op.axis
if sch.bc == 1:
oaxis = co
paxis = co
else:
oco, ico = s[B0].split(co, sch.bc)
oaxis = oco
paxis = ico
s[B0].parallel(paxis)
s[B0].pragma(oaxis, "parallel_launch_point")
s[B0].pragma(paxis, "parallel_stride_pattern")
s[B0].pragma(oaxis, "parallel_barrier_when_finish")
return s
@generic.schedule_conv2d_nchw.register(["rasp"])
def schedule_conv2d_nchw(outs):
"""Create schedule for tensors"""
s = tvm.create_schedule([x.op for x in outs])
def traverse(op):
"""Traverse operators from computation graph"""
# inline all one-to-one-mapping operators except the last stage (output)
if tag.is_broadcast(op.tag):
if op not in s.outputs:
s[op].compute_inline()
for tensor in op.input_tensors:
if tensor.op.input_tensors:
traverse(tensor.op)
if 'spatial_conv_output' in op.tag:
output = op.output(0)
conv_out = op.input_tensors[0]
kernel_vec = conv_out.op.input_tensors[1]
kernel = kernel_vec.op.input_tensors[0]
if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
s[kernel].compute_inline()
data_vec = conv_out.op.input_tensors[0]
data = data_vec.op.input_tensors[0]
data_pad = None
if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag:
data_pad = data
data = data_pad.op.input_tensors[0]
_schedule_spatial_conv2d(s, data, data_pad, data_vec,
kernel, kernel_vec,
conv_out, output, outs[0])
if 'im2col_conv_output' in op.tag:
output = op.output(0)
conv_out = op.input_tensors[0]
kernel_vec = conv_out.op.input_tensors[1]
kernel = kernel_vec.op.input_tensors[0]
data_vec = conv_out.op.input_tensors[0]
data_col = data_vec.op.input_tensors[0]
data = data_col.op.input_tensors[0]
data_pad = None
if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag:
data_pad = data
data = data_pad.op.input_tensors[0]
_schedule_im2col_conv2d(s, data, data_pad, data_col, data_vec,
kernel, kernel_vec,
conv_out, output, outs[0])
traverse(outs[0].op)
return s


@ -1,207 +0,0 @@
# pylint: disable=invalid-name,unused-variable, unused-argument
"""Schedule for depthwise_conv2d with auto fusion"""
from __future__ import absolute_import as _abs
from collections import namedtuple
import tvm
from .. import tag
from ..nn.util import infer_pad, infer_stride, get_pad_tuple
from .. import generic
_Workload = namedtuple('Workload',
['in_dtype', 'out_dtype', 'height', 'width', 'channel', 'multiplier',
'hkernel', 'wkernel', 'hpad', 'wpad', 'hstride', 'wstride'])
_Schedule = namedtuple('Schedule', ['vh', 'vw', 'vc', 'bc', 'unroll'])
# workloads of depthwise conv mobile net on imagenet
_WORKLOADS = [
_Workload('float32', 'float32', 112, 112, 32, 1, 3, 3, 1, 1, 1, 1),
_Workload('float32', 'float32', 112, 112, 64, 1, 3, 3, 1, 1, 2, 2),
_Workload('float32', 'float32', 56, 56, 128, 1, 3, 3, 1, 1, 1, 1),
_Workload('float32', 'float32', 56, 56, 128, 1, 3, 3, 1, 1, 2, 2),
_Workload('float32', 'float32', 28, 28, 256, 1, 3, 3, 1, 1, 1, 1),
_Workload('float32', 'float32', 28, 28, 256, 1, 3, 3, 1, 1, 2, 2),
_Workload('float32', 'float32', 14, 14, 512, 1, 3, 3, 1, 1, 1, 1),
_Workload('float32', 'float32', 14, 14, 512, 1, 3, 3, 1, 1, 2, 2),
_Workload('float32', 'float32', 7, 7, 1024, 1, 3, 3, 1, 1, 1, 1),
_Workload('int16', 'int32', 112, 112, 32, 1, 3, 3, 1, 1, 1, 1),
_Workload('int16', 'int32', 112, 112, 64, 1, 3, 3, 1, 1, 2, 2),
_Workload('int16', 'int32', 56, 56, 128, 1, 3, 3, 1, 1, 1, 1),
_Workload('int16', 'int32', 56, 56, 128, 1, 3, 3, 1, 1, 2, 2),
_Workload('int16', 'int32', 28, 28, 256, 1, 3, 3, 1, 1, 1, 1),
_Workload('int16', 'int32', 28, 28, 256, 1, 3, 3, 1, 1, 2, 2),
_Workload('int16', 'int32', 14, 14, 512, 1, 3, 3, 1, 1, 1, 1),
_Workload('int16', 'int32', 14, 14, 512, 1, 3, 3, 1, 1, 2, 2),
_Workload('int16', 'int32', 7, 7, 1024, 1, 3, 3, 1, 1, 1, 1),
]
_SCHEDULES = [
_Schedule(2, 1, 4, 1, True),
_Schedule(2, 4, 4, 2, True),
_Schedule(2, 1, 4, 2, False),
_Schedule(2, 4, 4, 1, True),
_Schedule(4, 1, 4, 8, True),
_Schedule(1, 1, 4, 2, True),
_Schedule(1, 1, 8, 8, True),
_Schedule(1, 1, 4, 1, False),
_Schedule(1, 1, 4, 4, False),
_Schedule(2, 4, 4, 2, False),
_Schedule(2, 7, 4, 1, True),
_Schedule(2, 4, 4, 4, False),
_Schedule(2, 2, 4, 4, False),
_Schedule(2, 2, 8, 4, False),
_Schedule(2, 2, 4, 4, True),
_Schedule(2, 2, 8, 4, False),
_Schedule(1, 2, 8, 4, True),
_Schedule(1, 1, 4, 8, True),
]
def _get_workload(data, kernel, stride, padding, out_dtype):
_, C, IH, IW = [x.value for x in data.shape]
_, MT, KH, KW = [x.value for x in kernel.shape]
HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)
if isinstance(stride, (tuple, list)):
HSTR, WSTR = stride
else:
HSTR, WSTR = stride, stride
return _Workload(data.dtype, out_dtype, IH, IW, C, MT, KH, KW, HPAD, WPAD, HSTR, WSTR)
def _schedule(s, data, data_pad, kernel, output, last):
padding = infer_pad(data, data_pad)
if data_pad is None:
stride = infer_stride(data, kernel, output)
else:
stride = infer_stride(data_pad, kernel, output)
wkl = _get_workload(data, kernel, stride, padding, output.dtype)
if wkl not in _WORKLOADS:
return s
# use specified schedule
sch = _SCHEDULES[_WORKLOADS.index(wkl)]
H, W = wkl.height, wkl.width
CN = wkl.channel
MT = wkl.multiplier
HK, WK = wkl.hkernel, wkl.wkernel
HPAD, WPAD = wkl.hpad, wkl.wpad
HSTR, WSTR = wkl.hstride, wkl.wstride
VH, VW = sch.vh, sch.vw
BC = sch.bc
VC = sch.vc
TH = H + 2*HPAD
TW = W + 2*WPAD
OH = (H + 2*HPAD - HK) / HSTR + 1
OW = (W + 2*WPAD - WK) / WSTR + 1
A, B, C = data, kernel, output
A0 = data_pad
A1 = s.cache_read(A0, "global", C)
_, c, h, w = s[A1].op.axis
c, vc = s[A1].split(c, VC)
s[A1].reorder(c, h, w, vc)
A2 = s.cache_write(A1, 'global')
s[A0].compute_inline()
s[A1].compute_inline()
B0 = s.cache_read(B, "global", C)
c, m, h, w = s[B0].op.axis
c, vc = s[B0].split(c, VC)
s[B0].reorder(c, m, h, w, vc)
B1 = s.cache_write(B0, 'global')
s[B0].compute_inline()
_, c, h, w = s[C].op.axis
c, vc = s[C].split(c, VC)
s[C].reorder(c, h, w, vc)
C0 = s.cache_write(C, 'global')
_, c, h, w, vc = s[C0].op.axis
dh, dw = s[C0].op.reduce_axis
oh, ow, ih, iw = s[C0].tile(h, w, VH, VW)
s[C0].reorder(c, oh, ow, dh, dw, ih, iw, vc)
if sch.unroll:
s[C0].unroll(iw)
s[C0].vectorize(vc)
# # s[C0].compute_at(s[C0], ow)
launch, c, _, _ = s[C].op.axis
s[C].pragma(launch, "parallel_launch_point")
s[C].parallel(c)
s[C].pragma(c, "parallel_stride_pattern")
s[C].pragma(c, "parallel_barrier_when_finish")
s[C0].compute_at(s[C], launch)
_, c, h, w, vc = s[C0].op.axis
s[C0].parallel(c)
s[C0].pragma(c, "parallel_stride_pattern")
s[C0].pragma(c, "parallel_barrier_when_finish")
s[A2].compute_at(s[C0], oh)
# parallel(s[A2], s[A2].op.axis[1], BC)
# # s[B0].compute_at(s[C0], ow)
s[B1].compute_at(s[C], launch)
c, m, h, w, vc = s[B1].op.axis
s[B1].parallel(c)
s[B1].pragma(c, "parallel_stride_pattern")
s[B1].pragma(c, "parallel_barrier_when_finish")
return s
@generic.schedule_depthwise_conv2d_nchw.register(["cpu", "rasp"])
def schedule_depthwise_conv2d_nchw(outs):
"""Schedule for depthwise_conv2d nchw forward.
Parameters
----------
outs: Array of Tensor
The computation graph description of depthwise_conv2d
in the format of an array of tensors.
Returns
-------
s: Schedule
The computation schedule for depthwise_conv2d nchw.
"""
outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
s = tvm.create_schedule([x.op for x in outs])
def traverse(op):
"""Internal travserse function"""
# inline all one-to-one-mapping operators except the last stage (output)
if tag.is_broadcast(op.tag):
if op not in s.outputs:
s[op].compute_inline()
for tensor in op.input_tensors:
if tensor.op.input_tensors:
traverse(tensor.op)
# schedule depthwise_conv2d
if op.tag == 'depthwise_conv2d_nchw':
output = op.output(0)
kernel = op.input_tensors[1]
if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
s[kernel].compute_inline()
data = op.input_tensors[0]
data_pad = None
if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag:
data_pad = data
data = data_pad.op.input_tensors[0]
_schedule(s, data, data_pad, kernel, output, outs[0])
traverse(outs[0].op)
return s


@ -1,7 +1,30 @@
# pylint: disable=invalid-name
"""Common topi utilities"""
from __future__ import absolute_import as _abs
import tvm
from . import tag
def traverse_inline(s, op, callback):
"""Traverse computation graph and do auto inline
Parameters
----------
s: schedule
The schedule
op: Operation
The final output operator.
callback: callable
The callback function on each op
"""
if tag.is_injective(op.tag):
if op not in s.outputs:
s[op].compute_inline()
for tensor in op.input_tensors:
if tensor.op.input_tensors:
traverse_inline(s, tensor.op, callback)
callback(op)
def prod(x):
"""Get the product of every items in the tuple.
@ -151,3 +174,33 @@ def unravel_index(idx, shape):
idx = idx // shape[i]
indices = indices[::-1]
return indices
def const_matrix(matrix, name="const_matrix"):
"""convert a const numpy 2-dimensional matrix to tvm tensor
Parameters
----------
matrix: numpy.ndarray
Const input array
name: str, optional
The name of output op
Returns
-------
tensor: Tensor
The created tensor
"""
row, col = matrix.shape
dtype = str(matrix.dtype)
def select_array(i, j):
now = tvm.const(0.0, dtype)
for ii in range(row):
for jj in range(col):
now = tvm.select(tvm.all(i % row == ii, j % col == jj),
tvm.const(matrix[ii][jj], dtype),
now)
return now
return tvm.compute(matrix.shape, select_array, name=name)
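A small usage sketch (hypothetical values; the winograd templates above materialize their G/B/A matrices the same way):

```python
import numpy as np
from topi.util import const_matrix

a_np = np.array([[1.0, 0.0],
                 [1.0, 1.0]], dtype='float32')
A = const_matrix(a_np, name='A')
# A is a 2x2 tvm tensor whose entries are baked into the generated code
```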


@ -22,7 +22,7 @@ def verify_bitserial_conv2d_nhwc(batch, in_size, in_channel, num_filter, kernel,
input_type='uint32'
out_dtype='int32'
with tvm.target.rasp():
with tvm.target.arm_cpu('rasp3b'):
A = tvm.placeholder((batch, in_height, in_width, in_channel), dtype=input_type, name='A')
W = tvm.placeholder((kernel, kernel, in_channel, num_filter), dtype=input_type, name='W')
B = topi.nn.bitserial_conv2d(A, W, stride, padding, activation_bits, weight_bits, out_dtype=out_dtype,


@ -2,6 +2,7 @@
import os
import numpy as np
import tvm
from tvm import autotvm
import topi
import topi.testing
from tvm.contrib.pickle_memoize import memoize
@ -11,10 +12,10 @@ from topi.util import get_const_tuple
def verify_conv2d(batch, in_size, in_channel, num_filter, kernel, stride, padding):
in_height = in_width = in_size
with tvm.target.rasp():
with tvm.target.arm_cpu():
A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A')
W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W')
B = topi.nn.conv2d(A, W, stride, padding)
B = topi.nn.conv2d(A, W, (stride, stride), (padding, padding), 'NCHW', 'float32')
s = topi.generic.schedule_conv2d_nchw([B])
a_shape = get_const_tuple(A.shape)
@ -39,7 +40,8 @@ def verify_conv2d(batch, in_size, in_channel, num_filter, kernel, stride, paddin
np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
def test_conv2d():
verify_conv2d(1, 56, 64, 64, 3, 1, 1)
with autotvm.tophub.context(tvm.target.arm_cpu('rasp3b')):
verify_conv2d(1, 56, 64, 64, 3, 1, 1)
if __name__ == "__main__":
test_conv2d()


@ -8,6 +8,27 @@ NVIDIA GPU. By running auto-tuner on this template, we can outperform the
vendor provided library CuDNN in many cases.
"""
######################################################################
# Install dependencies
# ----------------------------------------
# To use autotvm package in tvm, we need to install some extra dependencies.
# (change "3" to "2" if you use python2):
#
# .. code-block:: bash
#
# pip3 install --user psutil xgboost
#
# To make tvm run faster during tuning, it is recommended to use cython
# as the FFI of tvm. In the root directory of tvm, execute
# (change "3" to "2" if you use python2):
#
# .. code-block:: bash
#
# pip3 install --user cython
# sudo make cython3
#
# Now return to python code. Import packages.
import logging
import sys
import numpy as np
@ -133,7 +154,7 @@ def conv2d_no_batching(N, H, W, CI, CO, KH, KW, stride, padding):
# for this template
# logging config (for printing tuning log to screen)
logging.basicConfig(level=logging.INFO, stream=sys.stdout)
logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)
# the last layer in resnet
N, H, W, CO, CI, KH, KW, strides, padding = 1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1)
@ -144,12 +165,12 @@ print(task.config_space)
# use local gpu, measure 5 times for every config to reduce variance
# run 8 parallel threads for compilation
measure_option = autotvm.measure_option(mode='local',
number=10,
measure_option = autotvm.measure_option('local',
number=5,
parallel_num=8,
timeout=20)
# begin tuning, log records to file `conv2d.tsv`
# begin tuning, log records to file `conv2d.log`
tuner = autotvm.tuner.XGBTuner(task)
tuner.tune(n_trial=20,
measure_option=measure_option,
@ -186,6 +207,6 @@ np.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)
# Evaluate running time. Here we choose a large repeat number (200) to reduce the noise
# and the overhead of kernel launch. You can also use nvprof to validate the result.
evaluator = func.time_evaluator(func.entry_name, ctx, number=200)
print('Time cost of this operator: %f' % evaluator(a_tvm, w_tvm, c_tvm).mean)


@ -0,0 +1,364 @@
"""
Auto-tuning a convolutional network for ARM CPU
====================================================
**Author**: `Lianmin Zheng <https://github.com/merrymercy>`_
Auto-tuning for a specific ARM device is critical for getting the best
performance. This is a tutorial about how to tune a whole convolutional
network.
The operator implementation for ARM CPU in TVM is written in template form.
It has many tunable knobs (tile factor, vectorization, unrolling, etc).
We will do tuning for all convolution and depthwise convolution operators
in the neural network. After the tuning, we can get a log file which stores
the best knob values for all required operators. When the tvm compiler compiles
these operators, it will query this log file to get the best knob values.
We also released pre-tuned parameters for some arm devices. You can go to
`ARM CPU Benchmark <https://github.com/dmlc/tvm/wiki/Benchmark#arm-cpu>`_
to see the results.
"""
######################################################################
# Install dependencies
# ----------------------------------------
# To use autotvm package in tvm, we need to install some extra dependencies.
# (change "3" to "2" if you use python2):
#
# .. code-block:: bash
#
# pip3 install --user psutil xgboost
#
# To make tvm run faster during tuning, it is recommended to use cython
# as the FFI of tvm. In the root directory of tvm, execute
# (change "3" to "2" if you use python2):
#
# .. code-block:: bash
#
# pip3 install --user cython
# sudo make cython3
#
# Now return to python code. Import packages.
import os
import numpy as np
import nnvm.testing
import nnvm.compiler
import tvm
from tvm import autotvm
from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
from tvm.contrib.util import tempdir
import tvm.contrib.graph_runtime as runtime
#################################################################
# Define network
# --------------
# First we need to define the network in nnvm symbol API.
# We can load some pre-defined network from :code:`nnvm.testing`.
# We can also load models from MXNet, ONNX and TensorFlow (see NNVM
# tutorials :ref:`tutorial-nnvm` for more details).
def get_network(name, batch_size):
"""Get the symbol definition and random weight of a network"""
shape = {"data": (batch_size, 3, 224, 224)}
output_shape = (batch_size, 1000)
if name =='resnet-18':
net, params = nnvm.testing.resnet.get_workload(num_layers=18, batch_size=batch_size)
elif name =='mobilenet':
net, params = nnvm.testing.mobilenet.get_workload(batch_size=batch_size)
elif name =='squeezenet v1.1':
net, params = nnvm.testing.squeezenet.get_workload(batch_size=batch_size, version='1.1')
elif name =='vgg-16':
net, params = nnvm.testing.vgg.get_workload(num_layers=16, batch_size=batch_size)
elif name =='custom':
# an example for custom network
from nnvm.testing import utils
net = nnvm.sym.Variable('data')
net = nnvm.sym.conv2d(net, channels=4, kernel_size=(3,3), padding=(1,1))
net = nnvm.sym.flatten(net)
net = nnvm.sym.dense(net, units=1000)
net, params = utils.create_workload(net, batch_size, (3, 224, 224))
elif name == 'mxnet':
# an example for mxnet model
from mxnet.gluon.model_zoo.vision import get_model
block = get_model('resnet18_v1', pretrained=True)
net, params = nnvm.frontend.from_mxnet(block)
net = nnvm.sym.softmax(net)
else:
raise ValueError("Unsupported network: " + name)
return net, params, shape, output_shape
#################################################################
# Start RPC Tracker
# -----------------
# TVM uses an RPC session to communicate with ARM boards.
# During tuning, the tuner will send the generated code to the board and
# measure the speed of the code on the board.
#
# To scale up the tuning, TVM uses RPC Tracker to manage distributed devices.
# The RPC Tracker is a centralized master node. We can register all devices to
# the tracker. For example, if we have 10 phones, we can register all of them
# to the tracker, then we can run 10 measurements in parallel, which accelerates
# the tuning process.
#
# To start an RPC tracker, run this command in the host machine. The tracker is
# required during the whole tuning process, so we need to open a new terminal for
# this command:
#
# .. code-block:: bash
#
# python -m tvm.exec.rpc_tracker --host=0.0.0.0 --port=9190
#
# The expected output is
#
# .. code-block:: bash
#
# INFO:RPCTracker:bind to 0.0.0.0:9190
#################################################################
# Register devices to RPC Tracker
# -----------------------------------
# Now we can register our devices to the tracker. The first step is to
# build tvm runtime for the ARM devices.
#
# * For Linux:
# Follow this section :ref:`build-tvm-runtime-on-device` to build
# tvm runtime on the device. Then register the device to tracker by
#
# .. code-block:: bash
#
# python -m tvm.exec.rpc_server --tracker=[HOST_IP]:9190 --key=rk3399
#
# (replace :code:`[HOST_IP]` with the IP address of your host machine)
#
# * For Android:
# Follow this `readme page <https://github.com/dmlc/tvm/tree/master/apps/android_rpc>`_ to
# install tvm rpc apk on the android device. Make sure you can pass the android rpc test.
#
# After registering devices, we can confirm it by querying rpc_tracker
#
# .. code-block:: bash
#
# python -m tvm.exec.query_rpc_tracker --host=0.0.0.0 --port=9190
#
# For example, if we have 2 Huawei mate10 pro, 11 Raspberry Pi 3B and 2 rk3399,
# the output can be
#
# .. code-block:: bash
#
# Queue Status
# ----------------------------
# key free pending
# ----------------------------
# mate10pro 2 0
# rk3399 2 0
# rpi3b 11 0
# ----------------------------
###########################################
# Set Tuning Options
# ------------------
# Before tuning, we should do some configurations. Here I use an RK3399 board
# in our environment as an example. In your setting, you should modify the target
# and device_key accordingly.
# Replace "aarch64-linux-gnu" with the correct target of your board.
# This target is used for cross compilation. You can query it by :code:`gcc -v` on your device.
target = tvm.target.create('llvm -device=arm_cpu -target=aarch64-linux-gnu')
# Also replace this with the device key in your tracker
device_key = 'rk3399'
# tuning option
network = 'resnet-18'
log_file = "%s.%s.log" % (device_key, network)
dtype = 'float32'
tuning_option = {
'log_filename': log_file,
'tuner':'xgb',
'n_trial': 1000,
'early_stopping': 200,
'measure_option': autotvm.measure_option(
autotvm.use_rpc(device_key, host='localhost', port=9190),
number=4,
parallel_num=1,
timeout=10),
'use_transfer_learning': True,
}
####################################################################
#
# .. note:: How to set tuning options
#
# In general, the default value provided here works well. It is the same
# value that we used to generate pre-tuned parameters.
# If you have multiple devices, you can set :code:`parallel_num` to
# the number of devices you have. (e.g. set it to 3 if you register 3 rk3399
# boards to the tracker).
# If you have a large time budget, you can set :code:`n_trial` and :code:`early_stopping` larger,
# which makes the tuning run longer.
# If your device is very slow or a single conv2d operator in your network has a large FLOP count,
# consider setting the timeout larger.
#
# **For android phone**, add :code:`build_func='ndk'` to the argument list of
# :code:`autotvm.measure_option` to use Android NDK for creating shared library.
#
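#
# A sketch of such a measure option for an Android phone (the device key
# 'mate10pro' below is only an example; use the key you registered):
#
# .. code-block:: python
#
#   measure_option = autotvm.measure_option(
#       autotvm.use_rpc('mate10pro', host='localhost', port=9190),
#       number=4,
#       parallel_num=1,
#       timeout=10,
#       build_func='ndk')
#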
###################################################################
# Begin Tuning
# ------------
# Now we can extract tuning tasks from the network and begin tuning.
# Here we provide a simple utility function to tune a list of tasks.
# This function is just an initial implementation which tunes them in sequential order.
# Later we will introduce a more sophisticated tuner scheduler.
# You can skip the implementation of this function for this tutorial.
def tune_tasks(tasks,
measure_option,
tuner='xgb',
n_trial=500,
early_stopping=200,
log_filename='tuning.log',
use_transfer_learning=True,
try_winograd=True):
if try_winograd:
for i in range(len(tasks)):
try: # try winograd template
tsk = autotvm.task.create(tasks[i].name, tasks[i].args,
tasks[i].target, tasks[i].target_host, 'winograd')
tasks.append(tsk)
except Exception:
pass
# create tmp log file
tmp_log_file = log_filename + ".tmp"
if os.path.exists(tmp_log_file):
os.remove(tmp_log_file)
for i, tsk in enumerate(tasks):
prefix = "[Task %2d/%2d] " %(i+1, len(tasks))
# create tuner
if tuner == 'xgb' or tuner == 'xgb-rank':
tuner_obj = XGBTuner(tsk, loss_type='rank')
elif tuner == 'ga':
tuner_obj = GATuner(tsk, pop_size=50)
elif tuner == 'random':
tuner_obj = RandomTuner(tsk)
elif tuner == 'gridsearch':
tuner_obj = GridSearchTuner(tsk)
else:
raise ValueError("Invalid tuner: " + tuner)
if use_transfer_learning:
if os.path.isfile(tmp_log_file):
tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file))
# do tuning
tuner_obj.tune(n_trial=min(n_trial, len(tsk.config_space)),
early_stopping=early_stopping,
measure_option=measure_option,
callbacks=[
autotvm.callback.progress_bar(n_trial, prefix=prefix),
autotvm.callback.log_to_file(tmp_log_file)])
# pick best records to a cache file
autotvm.record.pick_best(tmp_log_file, log_filename)
os.remove(tmp_log_file)
########################################################################
# Finally we launch tuning jobs and evaluate the end-to-end performance.
def tune_and_evaluate():
# extract workloads from nnvm graph
net, params, shape, out_shape = get_network(network, batch_size=1)
tasks = autotvm.task.extract_from_graph(net, shape=shape, dtype=dtype,
symbols=(nnvm.sym.conv2d,),
target=target)
# run tuning tasks
tune_tasks(tasks, **tuning_option)
# compile kernels with history best records
with autotvm.apply_history_best(log_file):
print("Compile...")
with nnvm.compiler.build_config(opt_level=2, add_pass=['AlterOpLayout']):
graph, lib, params = nnvm.compiler.build(
net, target=target,
shape=shape, params=params, dtype=dtype)
# export library
tmp = tempdir()
if tuning_option['measure_option']['build_func'] == 'ndk': # for android
from tvm.contrib import ndk
filename = "net.so"
lib.export_library(tmp.relpath(filename), ndk.create_shared)
else:
filename = "net.tar"
lib.export_library(tmp.relpath(filename))
# upload module to device
print("Upload...")
remote = autotvm.measure.request_remote(device_key, timeout=10000)
remote.upload(tmp.relpath(filename))
rlib = remote.load_module(filename)
# upload parameters to device
ctx = remote.context(str(target), 0)
rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}
data_tvm = tvm.nd.array((np.random.uniform(size=shape['data'])).astype(dtype))
module = runtime.create(graph, rlib, ctx)
module.set_input('data', data_tvm)
module.set_input(**rparams)
# evaluate
print("Evaluate inference time cost...")
ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=10)
prof_res = np.array(ftimer().results) * 1000 # convert to millisecond
print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
(np.mean(prof_res), np.std(prof_res)))
# We do not run the tuning on our webpage server since it takes too long.
# Uncomment the following line to run by yourself.
# tune_and_evaluate()
######################################################################
# Sample Output
# -------------
# The tuning needs to train xgboost models and use them for prediction.
# So a high performance CPU is recommended.
# It takes about 1.5 hours on a 32T AMD Ryzen CPU.
# One sample output is
#
# .. code-block:: bash
#
# [Task 1/16] Current/Best: 13.15/ 20.49 GFLOPS | Progress: (297/1000) | 348.51 s Done.
# [Task 2/16] Current/Best: 16.66/ 22.64 GFLOPS | Progress: (475/1000) | 415.42 s Done.
# [Task 3/16] Current/Best: 10.33/ 14.19 GFLOPS | Progress: (306/1000) | 239.61 s Done.
# [Task 4/16] Current/Best: 13.29/ 20.88 GFLOPS | Progress: (242/1000) | 227.48 s Done.
# [Task 5/16] Current/Best: 13.28/ 15.61 GFLOPS | Progress: (237/1000) | 191.56 s Done.
# [Task 6/16] Current/Best: 20.16/ 23.86 GFLOPS | Progress: (315/1000) | 304.31 s Done.
# [Task 7/16] Current/Best: 9.22/ 22.00 GFLOPS | Progress: (458/1000) | 433.26 s Done.
# [Task 8/16] Current/Best: 14.12/ 17.80 GFLOPS | Progress: (270/1000) | 240.73 s Done.
# [Task 9/16] Current/Best: 14.59/ 24.02 GFLOPS | Progress: (209/1000) | 213.61 s Done.
# [Task 10/16] Current/Best: 9.86/ 21.74 GFLOPS | Progress: (367/1000) | 359.93 s Done.
# [Task 11/16] Current/Best: 5.01/ 18.86 GFLOPS | Progress: (202/1000) | 191.18 s Done.
# [Task 12/16] Current/Best: 8.61/ 25.23 GFLOPS | Progress: (220/1000) | 220.74 s Done.
# [Task 13/16] Current/Best: 10.87/ 25.79 GFLOPS | Progress: (465/1000) | 902.14 s Done.
# [Task 14/16] Current/Best: 15.33/ 29.38 GFLOPS | Progress: (239/1000) | 481.33 s Done.
# [Task 15/16] Current/Best: 12.09/ 38.60 GFLOPS | Progress: (476/1000) | 928.35 s Done.
# [Task 16/16] Current/Best: 16.77/ 47.08 GFLOPS | Progress: (255/1000) | 439.91 s Done.
# Compile...
# Upload...
# Evaluate inference time cost...
# Mean inference time (std dev): 156.51 ms (0.89 ms)
#


@ -12,6 +12,27 @@ In this tutorial, you can learn how to perform these two steps in tvm.
The whole workflow is illustrated by a matrix multiplication example.
"""
######################################################################
# Install dependencies
# ----------------------------------------
# To use autotvm package in tvm, we need to install some extra dependencies.
# (change "3" to "2" if you use python2):
#
# .. code-block:: bash
#
# pip3 install --user psutil xgboost
#
# To make tvm run faster during tuning, it is recommended to use cython
# as the FFI of tvm. In the root directory of tvm, execute
# (change "3" to "2" if you use python2):
#
# .. code-block:: bash
#
# pip3 install --user cython
# sudo make cython3
#
# Now return to python code. Import packages.
import logging
import sys
@ -247,10 +268,10 @@ print(task.config_space)
# used to get the best config later.
# logging config (for printing tuning log to screen)
logging.basicConfig(level=logging.INFO, stream=sys.stdout)
logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)
# use local cpu, measure 5 times for every config to reduce variance
measure_option = autotvm.measure_option(mode='local',
measure_option = autotvm.measure_option('local',
number=5)
# begin tuning, log records to file `matmul.log`
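# A sketch of the tuning call this refers to (the tuner choice is illustrative;
# any autotvm tuner such as XGBTuner works the same way):
tuner = autotvm.tuner.RandomTuner(task)
tuner.tune(n_trial=10,
           measure_option=measure_option,
           callbacks=[autotvm.callback.log_to_file('matmul.log')])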


@ -3,113 +3,68 @@
Cross Compilation and RPC
=========================
**Author**: `Ziheng Jiang <https://github.com/ZihengJiang/>`_
**Author**: `Ziheng Jiang <https://github.com/ZihengJiang/>`_, `Lianmin Zheng <https://github.com/merrymercy/>`_
This tutorial introduces cross compilation and remote device
execution with RPC in TVM.
With cross compilation and RPC, you can **compile program on your
local machine then run it on remote device**. It is useful when the
resource of remote device is limited, like Raspberry Pi and mobile
platforms, so you do not wish to put the compilation procedure on
the device in order to save time and space.
In this tutorial, I will take Raspberry Pi as our target platform
for example.
local machine then run it on the remote device**. It is useful when
the resources of remote devices are limited, like Raspberry Pi and mobile
platforms. In this tutorial, we will take Raspberry Pi as the CPU example
and Firefly-RK3399 as the OpenCL example.
"""
######################################################################
# Build TVM Runtime on Device
# ---------------------------
#
# There're some prerequisites: similar as compiling TVM on your
# local machine, we need build runtime on remote device.
# The first step is to build tvm runtime on the remote device.
#
# .. note::
#
# All instructions in both this section and next section should be
# executed on the target device, e.g. Raspberry Pi. And we assume it
# has Linux running.
#
# Since we do compilation on local machine, the remote device is only used
# for running the generated code. We only need to build tvm runtime on
# the remote device.
#
# To get started, clone tvm repo from github. It is important to clone
# the submodules along with the --recursive option (Assuming you are in
# your home directory):
# .. code-block:: bash
#
# .. code-block:: bash
# git clone --recursive https://github.com/dmlc/tvm
# cd tvm
# make runtime -j2
#
# git clone --recursive https://github.com/dmlc/tvm
# After building runtime successfully, we need to set environment variables
# in :code:`~/.bashrc` file. We can edit :code:`~/.bashrc`
# using :code:`vi ~/.bashrc` and add the line below (Assuming your TVM
# directory is in :code:`~/tvm`):
#
# .. note::
# .. code-block:: bash
#
# Usually device has limited resources and we only need to build
# runtime. The idea is we will use TVM compiler on the local server
# to compile and upload the compiled program to the device and run
# the device function remotely.
# export PYTHONPATH=$PYTHONPATH:~/tvm/python
#
# .. code-block:: bash
#
# cd tvm
# cp make/config.mk .
# echo USE_RPC=1>> config.mk
#
# Also make sure that you have set :code:`USE_RPC=1` in your
# :code:`config.mk`. We don't need LLVM when building runtime, so
# :code:`LLVM_CONFIG = llvm-config` in :code:`config.mk` is commented
# out by default. After that, build runtime!
#
# .. code-block:: bash
#
# make runtime
#
# After building runtime successfully, we need to set environment varibles
# in :code:`~/.bashrc` file of yourself account or :code:`/etc/profile`
# of system enviroment variables. Assuming your TVM directory is in
# :code:`~/tvm` and set environment variables below your account.
#
# .. code-block:: bash
#
# vi ~/.bashrc
#
# We need to edit :code:`~/.bashrc` using :code:`vi ~/.bashrc` and add
# lines below (Assuming your TVM directory is in :code:`~/tvm`):
#
# .. code-block:: bash
#
# export TVM_HOME=~/tvm
# export PATH=$PATH:$TVM_HOME/lib
# export PYTHONPATH=$PYTHONPATH:$TVM_HOME/python
#
# To enable updated :code:`~/.bashrc`, execute :code:`source ~/.bashrc`.
# To update the environment variables, execute :code:`source ~/.bashrc`.
######################################################################
# Set Up RPC Server on Device
# ---------------------------
# To set up a TVM RPC server on the Raspberry Pi (our remote device),
# we have prepared a one-line script so you only need to run this
# command after following the installation guide to install TVM on
# your device:
# To start an RPC server, run the following command on your remote device
# (which is Raspberry Pi in this example).
#
# .. code-block:: bash
#
# python -m tvm.exec.rpc_server --host 0.0.0.0 --port=9090
#
# After executing the command above, if you see these lines below, it means
# the RPC server started successfully on your device.
# If you see the line below, it means the RPC server started
# successfully on your device.
#
# .. code-block:: bash
#
# Loading runtime library /home/YOURNAME/code/tvm/lib/libtvm_runtime.so... exec only
# INFO:root:RPCServer: bind to 0.0.0.0:9090
#
# In the following code block, we simply start an RPC server on the
# same machine, for demonstration. This line can be omitted if we
# started an remote server.
#
from __future__ import absolute_import, print_function
import tvm
import numpy as np
from tvm import rpc
from tvm.contrib import util
######################################################################
# Declare and Cross Compile Kernel on Local Machine
@ -117,36 +72,50 @@ from tvm.contrib import util
#
# .. note::
#
# Now we back to the local machine, which has a full TVM installed.
# Now we go back to the local machine, which has a full TVM installed
# (with LLVM).
#
# Here we will declare a simple kernel with TVM on the local machine:
# Here we will declare a simple kernel on the local machine:
import numpy as np
import tvm
from tvm import rpc
from tvm.contrib import util
n = tvm.convert(1024)
A = tvm.placeholder((n,), name='A')
B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B')
B = tvm.compute((n,), lambda i: A[i] + 1.0, name='B')
s = tvm.create_schedule(B.op)
######################################################################
# Then we cross compile the kernel:
#
# Then we cross compile the kernel.
# The target should be 'llvm -target=armv7l-linux-gnueabihf' for
# Raspberry Pi 3B, but we use 'llvm' here to make this tutorial runnable
# on our webpage building server. See the detailed note in the following block.
# the target here should be 'llvm -target=armv7l-none-linux-gnueabihf',
# and we use 'llvm' here to make example run locally, see the detailed
# note in the following block
f = tvm.build(s, [A, B], target='llvm', name='myadd')
# save the lib at local temp folder
local_demo = True
if local_demo:
target = 'llvm'
else:
target = 'llvm -target=armv7l-linux-gnueabihf'
func = tvm.build(s, [A, B], target=target, name='add_one')
# save the lib at a local temp folder
temp = util.tempdir()
path = temp.relpath('mylib.o')
f.save(path)
path = temp.relpath('lib.tar')
func.export_library(path)
######################################################################
# .. note::
#
# the argument :code:`target` in :code:`build` should be replaced
# :code:`'llvm'` with the target triple of your device, which might be
# different for different device. For example, it is
# :code:`'llvm -target=armv7l-none-linux-gnueabihf'` for my Raspberry
# Pi. Here we use :code:`'llvm'` directly to make the tutorial runable.
# To run this tutorial with a real remote device, change :code:`local_demo`
# to False and replace :code:`target` in :code:`build` with the true
# target triple of your device. The target triple might be
# different for different devices. For example, it is
# :code:`'llvm -target=armv7l-linux-gnueabihf'` for Raspberry Pi 3B and
# :code:`'llvm -target=aarch64-linux-gnu'` for RK3399.
#
# Usually, you can query the target by executing :code:`gcc -v` on your
# device, and look for the line starting with :code:`Target:`
@ -155,8 +124,6 @@ f.save(path)
# Besides :code:`-target`, you can also set other compilation options
# like:
#
# * -mtriple=<target triple>
# Specify the target triple, same as '-target'.
# * -mcpu=<cpuname>
# Specify a specific chip in the current architecture to generate code for. By default this is inferred from the target triple and autodetected to the current architecture.
# * -mattr=a1,+a2,-a3,...
@ -168,13 +135,6 @@ f.save(path)
# llc -mtriple=<your device target triple> -mattr=help
#
# These options are consistent with `llc <http://llvm.org/docs/CommandGuide/llc.html>`_.
# So for my board, to get the best performance, the complete compilation
# option would be:
#
# .. code-block:: bash
#
# llvm -mtriple=armv7l-none-linux-gnueabihf -mcpu=cortex-a53 -mattr=+neon
#
# It is recommended to set target triple and feature set to contain specific
# feature available, so we can take full advantage of the features of the
# board.
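#
# For example, a fully specified target for a Cortex-A53 board could look like
# the following (a sketch; adjust the triple and features for your own device):
#
# .. code-block:: bash
#
#   llvm -target=armv7l-linux-gnueabihf -mcpu=cortex-a53 -mattr=+neon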
@ -184,43 +144,40 @@ f.save(path)
######################################################################
# Run CPU Kernel Remotely by RPC
# ------------------------------
# Here we will show you how to run the kernel on the remote device:
#
# .. note::
# In order to have this tutorial runs locally to build the nice HTML, we
# start a RPC server on the local machine. You can ignore it if you already
# started the server on the target device. And then change host IP properly.
# We show how to run the generated CPU kernel on the remote device.
# First we obtain an RPC session from the remote device.
# Can be ignored if you already started the RPC server
server = rpc.Server(host='0.0.0.0', port=9090, use_popen=True)
host = '0.0.0.0' # Change to your target device IP
port = 9090
# connect the remote device
remote = rpc.connect(host, port)
if local_demo:
remote = rpc.LocalSession()
else:
# The following is my environment, change this to the IP address of your target device
host = '10.77.1.162'
port = 9090
remote = rpc.connect(host, port)
######################################################################
# Here we upload the lib to the remote device, then invoke a device local
# compiler for shared lib and load it into device memory. now `f` is a
# remote module object.
remote.upload(path)
f = remote.load_module('mylib.o')
# Upload the lib to the remote device, then invoke a device local
# compiler to relink them. Now `func` is a remote module object.
# create array on the remote device
ctx = remote.cpu(0)
remote.upload(path)
func = remote.load_module('lib.tar')
# create arrays on the remote device
ctx = remote.cpu()
a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
# the function will run on the remote device
f(a, b)
func(a, b)
np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
######################################################################
# When you want to evaluate the performance of the kernel on the remote
# device, it is important to avoid overhead of remote function calls.
# device, it is important to avoid network overhead.
# :code:`time_evaluator` returns a remote function that runs the
# function :code:`number` times, measures the cost per run on the remote
# device and returns the measured cost.
#
time_f = f.time_evaluator(f.entry_name, ctx, number=10)
# device and returns the measured cost. Network overhead is excluded.
time_f = func.time_evaluator(func.entry_name, ctx, number=10)
cost = time_f(a, b).mean
print('%g secs/op' % cost)
@ -228,69 +185,54 @@ print('%g secs/op' % cost)
# Run OpenCL Kernel Remotely by RPC
# ---------------------------------
# As for remote OpenCL devices, the workflow is almost the same as above.
# You can define the kernel, upload files, and run by RPC. The files
# include host object, kernel source code and module meta file. We rely
# on remote compiler to re-link them.
# You can define the kernel, upload files, and run by RPC.
#
# .. note::
#
# Raspberry Pi does not support OpenCL; the following code is tested on
# Firefly-RK3399. You may follow this `tutorial <https://gist.github.com/mli/585aed2cec0b5178b1a510f9f236afa2>`_
# to setup the RK3399 OS and OpenCL driver.
# to setup the OS and OpenCL driver for RK3399.
#
# The target_host should be 'llvm -target=aarch64-linux-gnu'.
# But here we set 'llvm' to enable this tutorial to run locally.
# Also we need to build the runtime with OpenCL enabled on rk3399 board. In the tvm
# root directory, execute
#
# Also we need to build the runtime with the flag `USE_OPENCL=1` to
# build the kernel (different from cpu, we need bind axis for OpenCL)
# .. code-block:: bash
#
# The following functions shows how we can deploy CL
def deploy_cl():
# cp cmake/config.cmake .
# sed -i "s/USE_OPENCL OFF/USE_OPENCL ON/" config.cmake
# make runtime -j4
#
# The following function shows how we run an OpenCL kernel remotely
def run_opencl():
# NOTE: This is the setting for my rk3399 board. You need to modify
# them according to your environment.
target_host = "llvm -target=aarch64-linux-gnu"
opencl_device_host = '10.77.1.145'
opencl_device_port = 9090
# create a schedule for the above "add one" compute declaration
s = tvm.create_schedule(B.op)
xo, xi = s[B].split(B.op.axis[0], factor=32)
s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
f = tvm.build(s, [A, B], "opencl", target_host="llvm", name="myadd")
func = tvm.build(s, [A, B], "opencl", target_host=target_host)
# save files
path_o = temp.relpath("myadd.o")
path_cl = temp.relpath("myadd.cl")
path_json = temp.relpath("myadd.tvm_meta.json")
f.save(path_o)
f.imported_modules[0].save(path_cl)
remote = rpc.connect(opencl_device_host, opencl_device_port)
# upload files
remote.upload(path_o)
remote.upload(path_cl)
remote.upload(path_json)
# load files on remote device
fhost = remote.load_module("myadd.o")
fdev = remote.load_module("myadd.cl")
fhost.import_module(fdev)
# export and upload
path = temp.relpath('lib_cl.tar')
func.export_library(path)
remote.upload(path)
func = remote.load_module('lib_cl.tar')
# run
ctx = remote.cl(0)
ctx = remote.cl()
a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
fhost(a, b)
func(a, b)
np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
#####################################################################
# Instead of uploading files separately, there is a more convinient way.
# You can export libraray as a tar ball.
# The following functions shows how we can deploy by tar ball
def deploy_cl_by_tar():
path_tar = temp.relpath("myadd.tar")
f.export_library(path_tar)
remote.upload(path_tar)
fhost = remote.load_module("myadd.tar")
fhost(a, b)
np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
# terminate the server after experiment
server.terminate()
print("OpenCP test passed!")
######################################################################
# Summary


@ -1,2 +1,4 @@
.. _tutorial-nnvm:
Compile Deep Learning Models
----------------------------


@ -6,14 +6,10 @@ Deploy the Pretrained Model on ARM Mali GPU
**Author**: `Lianmin Zheng <https://lmzheng.net/>`_, `Ziheng Jiang <https://ziheng.org/>`_
This is an example of using NNVM to compile a ResNet model and
deploy it on Firefly-RK3399 with ARM Mali GPU. We will use the
deploy it on Firefly-RK3399 with ARM Mali GPU. We will use the
Mali-T860 MP4 GPU on this board to accelerate the inference.
This tutorial is based on the tutorial for deploying on Raspberry Pi by `Ziheng Jiang <https://ziheng.org/>`_.
Great thanks to the original author, I only do several lines of modification.
To begin with, we import nnvm (for compilation) and TVM (for deployment).
"""
import tvm
import nnvm.compiler
import nnvm.testing
@ -24,92 +20,65 @@ from tvm.contrib import util, graph_runtime as runtime
# Build TVM Runtime on Device
# ---------------------------
#
# There're some prerequisites: we need build tvm runtime and set up
# a RPC server on remote device.
#
# To get started, clone tvm repo from github. It is important to clone
# the submodules along, with --recursive option (Assuming you are in
# your home directory):
#
# .. code-block:: bash
#
# git clone --recursive https://github.com/dmlc/tvm
# The first step is to build tvm runtime on the remote device.
#
# .. note::
#
# Usually device has limited resources and we only need to build
# runtime. The idea is we will use TVM compiler on the local server
# to compile and upload the compiled program to the device and run
# the device function remotely.
# All instructions in both this section and next section should be
# executed on the target device, e.g. RK3399. And we assume it
# has Linux running.
#
# Since we do compilation on local machine, the remote device is only used
# for running the generated code. We only need to build tvm runtime on
# the remote device. Make sure you have opencl driver in your board.
# You can refer to `tutorial <https://gist.github.com/mli/585aed2cec0b5178b1a510f9f236afa2>`_
# to setup OS and opencl driver for rk3399.
#
# .. code-block:: bash
# .. code-block:: bash
#
# make runtime
# git clone --recursive https://github.com/dmlc/tvm
# cd tvm
# cp cmake/config.cmake .
# sed -i "s/USE_OPENCL OFF/USE_OPENCL ON/" config.cmake
# make runtime -j4
#
# After success of buildind runtime, we need set environment varibles
# in :code:`~/.bashrc` file of yourself account or :code:`/etc/profile`
# of system enviroment variables. Assuming your TVM directory is in
# :code:`~/tvm` and set environment variables below your account.
# After building runtime successfully, we need to set environment variables
# in :code:`~/.bashrc` file. We can edit :code:`~/.bashrc`
# using :code:`vi ~/.bashrc` and add the line below (Assuming your TVM
# directory is in :code:`~/tvm`):
#
# .. code-block:: bash
# .. code-block:: bash
#
# vi ~/.bashrc
# export PYTHONPATH=$PYTHONPATH:~/tvm/python
#
# We need edit :code:`~/.bashrc` using :code:`vi ~/.bashrc` and add
# lines below (Assuming your TVM directory is in :code:`~/tvm`):
#
# .. code-block:: bash
#
# export TVM_HOME=~/tvm
# export PATH=$PATH:$TVM_HOME/lib
# export PYTHONPATH=$PYTHONPATH:$TVM_HOME/python
#
# To enable updated :code:`~/.bashrc`, execute :code:`source ~/.bashrc`.
# To update the environment variables, execute :code:`source ~/.bashrc`.
######################################################################
# Set Up RPC Server on Device
# ---------------------------
# To set up a TVM RPC server on the your ARM device (our remote device),
# we have prepared a one-line script so you only need to run this
# command after following the installation guide to install TVM on
# your device:
# To start an RPC server, run the following command on your remote device
# (which is RK3399 in our example).
#
# .. code-block:: bash
#
# python -m tvm.exec.rpc_server --host 0.0.0.0 --port=9090
#
# After executing command above, if you see these lines below, it's
# successful to start RPC server on your device.
# If you see the line below, it means the RPC server started
# successfully on your device.
#
# .. code-block:: bash
#
# Loading runtime library /home/YOURNAME/code/tvm/lib/libtvm_runtime.so... exec only
# INFO:root:RPCServer: bind to 0.0.0.0:9090
#
######################################################################
# For demonstration, we simply start an RPC server on the same machine,
# if :code:`use_mali` is False. If you have set up the remote
# environment, please change the three lines below: change the
# :code:`use_mali` to True, also change the :code:`host` and :code:`port`
# with your device's host address and port number.
use_mali = False
host = '10.42.0.96'
port = 9090
if not use_mali:
# run server locally
host = 'localhost'
port = 9095
server = rpc.Server(host=host, port=port, use_popen=True)
######################################################################
# Prepare the Pretrained Model
# ----------------------------
# Back to the host machine, firstly, we need to download a MXNet Gluon
# ResNet model from model zoo, which is pretrained on ImageNet. You
# can found more details about this part at `Compile MXNet Models`
# Prepare the Pre-trained Model
# -----------------------------
# Back to the host machine, which should have a full TVM installed (with LLVM).
#
# We will use a pre-trained model from
# `MXNet Gluon model zoo <https://mxnet.incubator.apache.org/api/python/gluon/model_zoo.html>`_.
# You can find more details about this part at tutorial :ref:`tutorial-from-mxnet`.
from mxnet.gluon.model_zoo.vision import get_model
from mxnet.gluon.utils import download
@ -135,7 +104,6 @@ def transform_image(image):
x = transform_image(image)
######################################################################
# synset is used to transform the label from the ImageNet class number to
# a word humans can understand.
@ -143,6 +111,7 @@ synset_url = ''.join(['https://gist.githubusercontent.com/zhreshold/',
'4d0b62f3d01426887599d4f7ede23ee5/raw/',
'596b27d23537e5a1b5751d2b0481ef172f58b539/',
'imagenet1000_clsid_to_human.txt'])
synset_name = 'synset.txt'
download(synset_url, synset_name)
with open(synset_name) as f:
@ -176,21 +145,29 @@ out_shape = (batch_size, num_classes)
# triplet for host ARM device by setting the parameter :code:`target_host`.
######################################################################
# If we run the example locally for demonstration, we can simply set
# it as :code:`llvm`. If to run it on the ARM device, you need to specify
# its instruction set. Here is the option I use for my Firefly-RK3399.
# If we run the example on our x86 server for demonstration, we can simply
# set it as :code:`llvm`. If running it on the RK3399, we need to
# specify its instruction set. Set :code:`local_demo` to False if you
# want to run this tutorial with a real device.
if use_mali:
target_host = "llvm -target=aarch64-linux-gnu -mattr=+neon"
target = tvm.target.mali()
else:
local_demo = True
if local_demo:
target_host = "llvm"
target = tvm.target.cuda()
target = "llvm"
else:
# Here is the setting for my rk3399 board
# If you don't use rk3399, you can query your target triple by
# executing `gcc -v` on your board.
target_host = "llvm -target=aarch64-linux-gnu"
# set target as `tvm.target.mali` instead of 'opencl' to enable
# target-specified optimization
graph, lib, params = nnvm.compiler.build(net, target=target,
shape={"data": data_shape}, params=params, target_host=target_host)
# set target as `tvm.target.mali` instead of 'opencl' to enable
# optimization for mali
target = tvm.target.mali()
with nnvm.compiler.build_config(opt_level=2):
graph, lib, params = nnvm.compiler.build(net, target=target,
shape={"data": data_shape}, params=params, target_host=target_host)
# After `nnvm.compiler.build`, you will get three return values: graph,
# library and the new parameter, since we do some optimization that will
@ -207,14 +184,20 @@ lib.export_library(lib_fname)
# With RPC, you can deploy the model remotely from your host machine
# to the remote device.
# connect the server
remote = rpc.connect(host, port)
# obtain an RPC session from remote device.
if local_demo:
remote = rpc.LocalSession()
else:
# The following is my environment, change this to the IP address of your target device
host = '10.77.1.145'
port = 9090
remote = rpc.connect(host, port)
# upload the library to remote device and load it
remote.upload(lib_fname)
rlib = remote.load_module('net.tar')
ctx = remote.cl(0) if use_mali else remote.gpu(0)
ctx = remote.cpu(0) if local_demo else remote.cl(0)
# upload the parameter
rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}
@ -231,7 +214,3 @@ out = module.get_output(0, tvm.nd.empty(out_shape, ctx=ctx))
# get top1 result
top1 = np.argmax(out.asnumpy())
print('TVM prediction top-1: {}'.format(synset[top1]))
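# Optionally, measure the remote inference time with the runtime's time
# evaluator (a sketch that follows the same pattern as the ARM CPU tuning
# tutorial above):
ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=10)
prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
print("Mean inference time: %.2f ms" % np.mean(prof_res))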
if not use_mali:
# terminate the local server
server.terminate()


@ -7,9 +7,8 @@ Deploy the Pretrained Model on Raspberry Pi
This is an example of using NNVM to compile a ResNet model and deploy
it on raspberry pi.
To begin with, we import nnvm (for compilation) and TVM (for deployment).
"""
import tvm
import nnvm.compiler
import nnvm.testing
@ -17,102 +16,73 @@ from tvm import rpc
from tvm.contrib import util, graph_runtime as runtime
######################################################################
# .. _build-tvm-runtime-on-device:
#
# Build TVM Runtime on Device
# ---------------------------
#
# There're some prerequisites: we need build tvm runtime and set up
# a RPC server on remote device.
#
# To get started, clone tvm repo from github. It is important to clone
# the submodules along, with --recursive option (Assuming you are in
# your home directory):
#
# .. code-block:: bash
#
# git clone --recursive https://github.com/dmlc/tvm
# The first step is to build tvm runtime on the remote device.
#
# .. note::
#
# Usually device has limited resources and we only need to build
# runtime. The idea is we will use TVM compiler on the local server
# to compile and upload the compiled program to the device and run
# the device function remotely.
# All instructions in both this section and next section should be
# executed on the target device, e.g. Raspberry Pi. And we assume it
# has Linux running.
#
# Since we do compilation on local machine, the remote device is only used
# for running the generated code. We only need to build tvm runtime on
# the remote device.
#
# .. code-block:: bash
# .. code-block:: bash
#
# make runtime
# git clone --recursive https://github.com/dmlc/tvm
# cd tvm
# make runtime -j4
#
# After success of buildind runtime, we need set environment varibles
# in :code:`~/.bashrc` file of yourself account or :code:`/etc/profile`
# of system enviroment variables. Assuming your TVM directory is in
# :code:`~/tvm` and set environment variables below your account.
# After building runtime successfully, we need to set environment variables
# in :code:`~/.bashrc` file. We can edit :code:`~/.bashrc`
# using :code:`vi ~/.bashrc` and add the line below (Assuming your TVM
# directory is in :code:`~/tvm`):
#
# .. code-block:: bash
# .. code-block:: bash
#
# vi ~/.bashrc
# export PYTHONPATH=$PYTHONPATH:~/tvm/python
#
# We need edit :code:`~/.bashrc` using :code:`vi ~/.bashrc` and add
# lines below (Assuming your TVM directory is in :code:`~/tvm`):
#
# .. code-block:: bash
#
# export TVM_HOME=~/tvm
# export PATH=$PATH:$TVM_HOME/lib
# export PYTHONPATH=$PYTHONPATH:$TVM_HOME/python
#
# To enable updated :code:`~/.bashrc`, execute :code:`source ~/.bashrc`.
# To update the environment variables, execute :code:`source ~/.bashrc`.
######################################################################
# Set Up RPC Server on Device
# ---------------------------
# To set up a TVM RPC server on the Raspberry Pi (our remote device),
# we have prepared a one-line script so you only need to run this
# command after following the installation guide to install TVM on
# your device:
# To start an RPC server, run the following command on your remote device
# (which is Raspberry Pi in our example).
#
# .. code-block:: bash
#
# python -m tvm.exec.rpc_server --host 0.0.0.0 --port=9090
#
# After executing command above, if you see these lines below, it's
# successful to start RPC server on your device.
# If you see the line below, it means the RPC server started
# successfully on your device.
#
# .. code-block:: bash
#
# Loading runtime library /home/YOURNAME/code/tvm/lib/libtvm_runtime.so... exec only
# INFO:root:RPCServer: bind to 0.0.0.0:9090
#
######################################################################
# For demonstration, we simply start an RPC server on the same machine,
# if :code:`use_rasp` is False. If you have set up the remote
# environment, please change the three lines below: change the
# :code:`use_rasp` to True, also change the :code:`host` and :code:`port`
# with your device's host address and port number.
use_rasp = False
host = 'rasp0'
port = 9090
if not use_rasp:
# run server locally
host = 'localhost'
port = 9091
server = rpc.Server(host=host, port=port, use_popen=True)
######################################################################
# Prepare the Pretrained Model
# ----------------------------
# Back to the host machine, firstly, we need to download a MXNet Gluon
# ResNet model from model zoo, which is pretrained on ImageNet. You
# can found more details about this part at `Compile MXNet Models`
# Prepare the Pre-trained Model
# -----------------------------
# Back to the host machine, which should have a full TVM installed (with LLVM).
#
# We will use a pre-trained model from
# `MXNet Gluon model zoo <https://mxnet.incubator.apache.org/api/python/gluon/model_zoo.html>`_.
# You can find more details about this part at tutorial :ref:`tutorial-from-mxnet`.
from mxnet.gluon.model_zoo.vision import get_model
from mxnet.gluon.utils import download
from PIL import Image
import numpy as np
# only one line to get the model
# one line to get the model
block = get_model('resnet18_v1', pretrained=True)
######################################################################
@ -131,7 +101,6 @@ def transform_image(image):
x = transform_image(image)
######################################################################
# synset is used to transform the label from the ImageNet class number to
# a word humans can understand.
@ -173,29 +142,32 @@ out_shape = (batch_size, num_classes)
# will lead to very different performance.
######################################################################
# If we run the example locally for demonstration, we can simply set
# it as :code:`llvm`. If to run it on the Raspberry Pi, you need to
# specify its instruction set. Here is the option I use for my Raspberry
# Pi, which has been proved as a good compilation configuration.
# If we run the example on our x86 server for demonstration, we can simply
# set it as :code:`llvm`. If running it on the Raspberry Pi, we need to
# specify its instruction set. Set :code:`local_demo` to False if you want
# to run this tutorial with a real device.
if use_rasp:
target = tvm.target.rasp()
else:
local_demo = True
if local_demo:
target = tvm.target.create('llvm')
else:
target = tvm.target.arm_cpu('rasp3b')
# The above line is a simple form of
# target = tvm.target.create('llvm -device=arm_cpu -target=armv7l-linux-gnueabihf')
graph, lib, params = nnvm.compiler.build(
net, target, shape={"data": data_shape}, params=params)
with nnvm.compiler.build_config(opt_level=2, add_pass=['AlterOpLayout']):
graph, lib, params = nnvm.compiler.build(
net, target, shape={"data": data_shape}, params=params)
# After `nnvm.compiler.build`, you will get three return values: graph,
# library and the new parameter, since we do some optimization that will
# change the parameters but keep the result of model as the same.
# Save the library at local temporary directory.
tmp = util.tempdir()
lib_fname = tmp.relpath('net.o')
lib.save(lib_fname)
lib_fname = tmp.relpath('net.tar')
lib.export_library(lib_fname)
######################################################################
# Deploy the Model Remotely by RPC
@ -203,15 +175,21 @@ lib.save(lib_fname)
# With RPC, you can deploy the model remotely from your host machine
# to the remote device.
# connect the server
remote = rpc.connect(host, port)
# obtain an RPC session from remote device.
if local_demo:
remote = rpc.LocalSession()
else:
# The following is my environment, change this to the IP address of your target device
host = '10.77.1.162'
port = 9090
remote = rpc.connect(host, port)
# upload the library to remote device and load it
remote.upload(lib_fname)
rlib = remote.load_module('net.o')
rlib = remote.load_module('net.tar')
# upload the parameter (this may take a while)
ctx = remote.cpu(0)
# upload the parameter
rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}
# create the remote runtime module
@ -227,7 +205,3 @@ out = module.get_output(0, tvm.nd.empty(out_shape, ctx=ctx))
# get top1 result
top1 = np.argmax(out.asnumpy())
print('TVM prediction top-1: {}'.format(synset[top1]))
if not use_rasp:
# terminate the local server
server.terminate()


@ -1,4 +1,6 @@
"""
.. _tutorial-from-mxnet:
Compile MXNet Models
====================
**Author**: `Joshua Z. Zhang <https://zhreshold.github.io/>`_


@ -1,89 +0,0 @@
"""
Compile GPU Inference
=====================
**Author**: `Yuwei Hu <https://huyuwei.github.io/>`_
This is an example of using NNVM to compile MobileNet/ResNet model and deploy its inference on GPU.
To begin with, we import nnvm(for compilation) and TVM(for deployment).
"""
import tvm
import numpy as np
from tvm.contrib import nvcc, graph_runtime
import nnvm.compiler
import nnvm.testing
######################################################################
# Register the NVCC Compiler Option
# ---------------------------------
# NNVM optimizes the graph and relies on TVM to generate fast GPU code.
# To get the maximum performance, we need to enable nvcc's compiler hook.
# This usually gives better performance than nvrtc mode.
@tvm.register_func("tvm_callback_cuda_compile", override=True)
def tvm_callback_cuda_compile(code):
ptx = nvcc.compile_cuda(code, target="ptx")
return ptx
######################################################################
# Prepare the Benchmark
# ---------------------
# We construct a standard imagenet inference benchmark.
# NNVM needs two things to compile a deep learning model:
#
# - net: the graph representation of the computation
# - params: a dictionary of str to parameters
#
# We use nnvm's testing utility to produce the model description and random parameters
# so that the example does not depend on a specific front-end framework.
#
# .. note::
#
# In a typical workflow, we can get this pair from :any:`nnvm.frontend`
#
target = "cuda"
ctx = tvm.gpu(0)
batch_size = 1
num_classes = 1000
image_shape = (3, 224, 224)
data_shape = (batch_size,) + image_shape
out_shape = (batch_size, num_classes)
# To use ResNet to do inference, run the following instead
#net, params = nnvm.testing.resnet.get_workload(
# batch_size=1, image_shape=image_shape)
net, params = nnvm.testing.mobilenet.get_workload(
batch_size=1, image_shape=image_shape)
######################################################################
# Compile the Graph
# -----------------
# To compile the graph, we call the build function with the graph
# configuration and parameters.
# When parameters are provided, NNVM will pre-compute certain part of the graph if possible (e.g. simplify batch normalization to scale shift),
# and return the updated parameters.
graph, lib, params = nnvm.compiler.build(
net, target, shape={"data": data_shape}, params=params)
######################################################################
# Run the Compiled Module
# -----------------------
#
# To deploy the module, we call :any:`tvm.contrib.graph_runtime.create` passing in the graph, the lib, and context.
# Thanks to TVM, we can deploy the compiled module to many platforms and languages.
# The deployment module is designed to contain minimum dependencies.
# This example runs on the same machine.
#
# Note that the code below no longer depends on NNVM, and only relies TVM's runtime to run(deploy).
data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
module = graph_runtime.create(graph, lib, ctx)
# set input
module.set_input(**params)
module.set_input("data", data)
# run
module.run()
# get output
out = module.get_output(0, tvm.nd.empty(out_shape))
# convert to numpy
out.asnumpy()


@ -6,9 +6,8 @@ Quick Start Tutorial for Compiling Deep Learning Models
**Author**: `Yao Wang <https://github.com/kevinthesun>`_
This example shows how to build a neural network with NNVM python frontend and
generate runtime library for Nvidia GPU and Raspberry Pi with TVM.
To run this notebook, you need to install tvm and nnvm.
Notice that you need to build tvm with cuda and llvm.
generate runtime library for Nvidia GPU with TVM.
Notice that you need to build TVM with cuda and llvm enabled.
"""
######################################################################
@ -22,10 +21,13 @@ Notice that you need to build tvm with cuda and llvm.
#
# In this tutorial, we'll choose cuda and llvm as target backends.
# To begin with, let's import NNVM and TVM.
import tvm
import numpy as np
import nnvm.compiler
import nnvm.testing
import tvm
from tvm.contrib import graph_runtime
######################################################################
# Define Neural Network in NNVM
@ -33,7 +35,8 @@ import nnvm.testing
# First, let's define a neural network with nnvm python frontend.
# For simplicity, we'll use pre-defined resnet-18 network in NNVM.
# Parameters are initialized with Xavier initializer.
# NNVM also supports other model formats such as MXNet, CoreML and ONNX.
# NNVM also supports other model formats such as MXNet, CoreML, ONNX and
# TensorFlow.
#
# In this tutorial, we assume we will do inference on our device
# and the batch size is set to be 1. Input images are RGB color
@ -46,7 +49,8 @@ image_shape = (3, 224, 224)
data_shape = (batch_size,) + image_shape
out_shape = (batch_size, num_class)
net, params = nnvm.testing.resnet.get_workload(batch_size=batch_size, image_shape=image_shape)
net, params = nnvm.testing.resnet.get_workload(num_layers=18,
batch_size=batch_size, image_shape=image_shape)
print(net.debug_str())
######################################################################
@ -54,10 +58,8 @@ print(net.debug_str())
# -----------
# Next step is to compile the model using the NNVM/TVM pipeline.
# Users can specify the optimization level of the compilation.
# Currently this value can be 0 to 2, which corresponds to
# "SimplifyInference", "OpFusion" and "PrecomputePrune" respectively.
# In this example we set optimization level to be 0
# and use Raspberry Pi as compile target.
# Currently this value can be 0 to 3. The optimization passes include
# operator fusion, pre-computation, layout transformation and so on.
#
# :any:`nnvm.compiler.build` returns three components: the execution graph in
# json format, the TVM module library of compiled functions specifically
@ -68,24 +70,50 @@ print(net.debug_str())
#
# We'll first compile for Nvidia GPU. Behind the scene, `nnvm.compiler.build`
# first does a number of graph-level optimizations, e.g. pruning, fusing, etc.,
# then registers the operators (i.e. the nodes of the optmized graphs) to
# then registers the operators (i.e. the nodes of the optimized graphs) to
# TVM implementations to generate a `tvm.module`.
# To generate the module library, TVM will first transfer the HLO IR into the lower
# intrinsic IR of the specified target backend, which is CUDA in this example.
# Then the machine code will be generated as the module library.
# To generate the module library, TVM will first transfer the High level IR
# into the lower intrinsic IR of the specified target backend, which is CUDA
# in this example. Then the machine code will be generated as the module library.
opt_level = 0
opt_level = 3
target = tvm.target.cuda()
with nnvm.compiler.build_config(opt_level=opt_level):
graph, lib, params = nnvm.compiler.build(
net, target, shape={"data": data_shape}, params=params)
######################################################################
# Save Compiled Module
# ----------------------------
# After compilation, we can save the graph, lib and params into separate files
# and deploy them to Nvidia GPU.
#####################################################################
# Run the generated library
# ------------------------
# Now we can create graph runtime and run the module on Nvidia GPU.
# create random input
ctx = tvm.gpu()
data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
# create module
module = graph_runtime.create(graph, lib, ctx)
# set input and parameters
module.set_input("data", data)
module.set_input(**params)
# run
module.run()
# get output
out = module.get_output(0, tvm.nd.empty(out_shape))
# convert to numpy
out.asnumpy()
# Print first 10 elements of output
print(out.asnumpy().flatten()[0:10])
######################################################################
# Save and Load Compiled Module
# -----------------------------
# We can also save the graph, lib and parameters into files and load them
# back in the development environment.
####################################################
# save the graph, lib and params into separate files
from tvm.contrib import util
temp = util.tempdir()
@ -97,95 +125,17 @@ with open(temp.relpath("deploy_param.params"), "wb") as fo:
fo.write(nnvm.compiler.save_param_dict(params))
print(temp.listdir())
######################################################################
# Deploy locally to Nvidia GPU
# ------------------------------
# Now we can load the module back.
####################################################
import numpy as np
from tvm.contrib import graph_runtime
loaded_lib = tvm.module.load(path_lib)
# load the module back.
loaded_json = open(temp.relpath("deploy_graph.json")).read()
loaded_lib = tvm.module.load(path_lib)
loaded_params = bytearray(open(temp.relpath("deploy_param.params"), "rb").read())
input_data = tvm.nd.array(np.random.uniform(size=data_shape).astype("float32"))
module = graph_runtime.create(loaded_json, loaded_lib, tvm.gpu(0))
module.load_params(loaded_params)
input_data = tvm.nd.array(np.random.uniform(size=data_shape).astype("float32"))
module.run(data=input_data)
out = module.get_output(0, out=tvm.nd.empty(out_shape))
# Print first 10 elements of output
print(out.asnumpy()[0][0:10])
######################################################################
# Compile and Deploy the Model to Raspberry Pi Remotely with RPC
# --------------------------------------------------------------
# Following the steps above, we can also compile the model for the Raspberry Pi.
# TVM provides an RPC module to help with remote deployment.
#
# For demonstration, we simply start an RPC server on the same machine
# when :code:`use_rasp` is False. If you have set up a remote device,
# set :code:`use_rasp` to True and change the host and port below to your
# device's address and port number.
# If we run the example locally for demonstration, we can simply set the
# compilation target to `llvm`.
# To run it on the Raspberry Pi, you need to specify its instruction set.
# `llvm -target=armv7l-none-linux-gnueabihf -mcpu=cortex-a53 -mattr=+neon`
# is the recommended compilation configuration, thanks to Ziheng's work.
from tvm import rpc
use_rasp = False
host = 'rasp0'
port = 9090
if not use_rasp:
# run server locally
host = 'localhost'
port = 9099
server = rpc.Server(host=host, port=port, use_popen=True)
# compile and save model library
if use_rasp:
target = "llvm -target=armv7l-none-linux-gnueabihf -mcpu=cortex-a53 -mattr=+neon"
else:
target = "llvm"
# use `with tvm.target.rasp()` for target-specific optimizations
with tvm.target.rasp():
graph, lib, params = nnvm.compiler.build(
net, target, shape={"data": data_shape}, params=params)
temp = util.tempdir()
path_lib = temp.relpath("deploy_lib_rasp.o")
lib.save(path_lib)
# connect to the server
remote = rpc.connect(host, port)
# upload the library to remote device and load it
remote.upload(path_lib)
rlib = remote.load_module('deploy_lib_rasp.o')
ctx = remote.cpu(0)
# upload the parameters
rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}
# create the remote runtime module
module = graph_runtime.create(graph, rlib, ctx)
# set the parameters
module.set_input(**rparams)
# set input data
input_data = np.random.uniform(size=data_shape)
module.set_input('data', tvm.nd.array(input_data.astype('float32')))
# run
module.run()
out = module.get_output(0, out=tvm.nd.empty(out_shape, ctx=ctx))
# Print first 10 elements of output
print(out.asnumpy()[0][0:10])
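# A hedged timing sketch for the remote run (an addition, not in the original
# tutorial): time_evaluator also works over RPC, so we can measure the cost
# of one full inference on the device.
ftimer = module.module.time_evaluator("run", ctx, number=3)
print("Remote mean inference time: %.2f ms" % (ftimer().mean * 1000))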
if not use_rasp:
# terminate the local server
server.terminate()


@ -1,57 +1,21 @@
# pylint: disable=invalid-name,unused-variable,invalid-name
"""Conv2D schedule ported from RASP
"""Reuse conv2d schedule from ARM CPU"""
Used for CPU conv2d
"""
from __future__ import absolute_import as _abs
import tvm
from topi.nn.conv2d import conv2d, _get_schedule
from topi.nn.conv2d import SpatialPack, Im2ColPack, Workload
from topi.nn import conv2d, conv2d_alter_layout
from topi import generic
_WORKLOADS = [
Workload('float32', 'float32', 224, 224, 3, 64, 7, 7, 3, 3, 2, 2),
Workload('int8', 'int32', 224, 224, 3, 64, 7, 7, 3, 3, 2, 2),
Workload('int8', 'int32', 56, 56, 64, 64, 3, 3, 1, 1, 1, 1),
Workload('int8', 'int32', 56, 56, 64, 64, 1, 1, 0, 0, 1, 1),
Workload('int8', 'int32', 56, 56, 64, 128, 3, 3, 1, 1, 2, 2),
Workload('int8', 'int32', 56, 56, 64, 128, 1, 1, 0, 0, 2, 2),
Workload('int8', 'int32', 28, 28, 128, 128, 3, 3, 1, 1, 1, 1),
Workload('int8', 'int32', 28, 28, 128, 256, 3, 3, 1, 1, 2, 2),
Workload('int8', 'int32', 28, 28, 128, 256, 1, 1, 0, 0, 2, 2),
Workload('int8', 'int32', 14, 14, 256, 256, 3, 3, 1, 1, 1, 1),
Workload('int8', 'int32', 14, 14, 256, 512, 3, 3, 1, 1, 2, 2),
Workload('int8', 'int32', 14, 14, 256, 512, 1, 1, 0, 0, 2, 2),
Workload('int8', 'int32', 7, 7, 512, 512, 3, 3, 1, 1, 1, 1),
]
_SCHEDULES = [
# float32 imagenet
SpatialPack(1, 8, 4, 1, 4, True),
SpatialPack(1, 8, 4, 1, 4, True),
SpatialPack(1, 7, 4, 2, 4, True),
SpatialPack(1, 4, 8, 4, 1, True),
SpatialPack(1, 4, 4, 1, 16, False),
SpatialPack(1, 4, 8, 4, 8, False),
SpatialPack(1, 7, 4, 3, 8, True),
SpatialPack(1, 2, 8, 1, 8, True),
SpatialPack(2, 1, 16, 1, 4, True),
SpatialPack(1, 7, 4, 1, 1, True),
Im2ColPack(7, 4, 1, 16, True),
Im2ColPack(7, 4, 1, 8, False),
Im2ColPack(7, 4, 1, 16, False),
]
@conv2d.register(["vtacpu", "vta"])
def compute(*args, **kwargs):
with tvm.target.arm_cpu("vtacpu"):
return conv2d(*args, **kwargs)
@_get_schedule.register(["vtacpu", "vta"])
def _schedule_conv2d(wkl):
if wkl not in _WORKLOADS:
raise ValueError("no schedule for such workload: {}".format(wkl))
idx = _WORKLOADS.index(wkl)
sch = _SCHEDULES[idx]
return sch
@generic.schedule_conv2d_nchw.register(["vtacpu", "vta"])
def schedule(*args, **kwargs):
with tvm.target.arm_cpu("vtacpu"):
return generic.schedule_conv2d_nchw(*args, **kwargs)
@conv2d_alter_layout.register(["vtacpu", "vta"])
def alter(*args, **kwargs):
with tvm.target.arm_cpu("vtacpu"):
return conv2d_alter_layout(*args, **kwargs)
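if __name__ == "__main__":
    # Illustrative only (an addition, not part of the original file): the
    # tvm.target.arm_cpu helper used above expands a model name into an llvm
    # target string with -device=arm_cpu, which is what routes scheduling of
    # the "vtacpu" build to the ARM CPU implementations.
    print(tvm.target.arm_cpu("vtacpu"))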


@ -244,8 +244,11 @@ def is_packed_layout(layout):
return False
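# For VTA's packed layouts the layout is kept unchanged (return None below);
# otherwise the request falls back to the generic NNVM conv2d layout
# transformation.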
@reg.register_alter_op_layout("conv2d", level=15)
def alter_conv2d_layout(attrs, inputs, out):
layout = attrs['layout']
if is_packed_layout(layout):
return None
return _nn.alter_conv2d_layout(attrs, inputs, out)
@reg.register_compute("conv2d", level=15)
@ -368,7 +371,6 @@ def schedule_packed_conv2d(outs):
oshape = topi.util.get_const_tuple(output.shape)
s = tvm.create_schedule(output.op)
# setup pad
if pad_data is not None:
cdata = pad_data
@ -394,7 +396,6 @@ def schedule_packed_conv2d(outs):
h_factor = (plan.h_factor if plan.h_factor else oshape[2])
w_factor = (plan.w_factor if plan.w_factor else oshape[3])
x_bo, x_co, x_i, x_j, x_bi, x_ci = s[output].op.axis
x_co0, x_co1 = s[output].split(x_co, factor=oc_factor)
x_i0, x_i1 = s[output].split(x_i, factor=h_factor)
@ -459,6 +460,7 @@ class Conv2DSchedule(object):
self.oc_nthread = oc_nthread
self.h_nthread = h_nthread
self.debug_sync = debug_sync
def __str__(self):
return "{}.{}.{}.{}.{}.{}.{}".format(
self.b_factor, self.oc_factor, self.ic_factor,
@ -483,7 +485,6 @@ RESNET = {
11: Workload(1, 7, 7, 512, 512, 3, 3, 1, 1, 1, 1),
}
_WL2PLAN = {}
for idx in RESNET:
scheds = find_schedules(RESNET[idx], vt_only=True, best_only=True)[0]
_WL2PLAN[RESNET[idx]] = scheds


@ -1,6 +1,7 @@
"""Testing if we can generate code in topi style"""
import tvm
from tvm import autotvm
from tvm.contrib import util
from tvm.contrib.pickle_memoize import memoize
import topi
@ -62,8 +63,7 @@ def test_cpu_conv2d():
def verify(s, check_correctness):
mod = tvm.build(s, [data, kernel, res],
"llvm -device=vtacpu",
target_host=env.target_host,
name="conv2d")
temp = util.tempdir()
mod.save(temp.relpath("conv2d.o"))
@ -126,7 +126,11 @@ def test_cpu_conv2d():
print(wl)
with tvm.target.create("llvm -device=vtacpu"):
run_cpu_conv2d(env, remote, key, batch_size, wl)
# load pre-tuned operator parameters for ARM CPU
autotvm.tophub.check_package('vta')
with autotvm.tophub.context('llvm -device=vtacpu'):
vta.testing.run(_run)
def test_vta_conv2d():
@ -172,7 +176,6 @@ def test_vta_conv2d():
a_np.astype(acc_dtype), w_np.astype(acc_dtype), stride, padding).astype(acc_dtype)
return a_np, w_np, b_np
def verify(s, check_correctness):
mod = vta.build(s, [data, kernel, bias, res], "ext_dev",
env.target_host, name="conv2d")


@ -8,7 +8,6 @@ onto the VTA accelerator design to perform ImageNet classification tasks.
"""
######################################################################
# Import Libraries
# ----------------
@ -17,27 +16,22 @@ onto the VTA accelerator design to perform ImageNet classification tasks.
from __future__ import absolute_import, print_function
import os
import sys
import json
import time
from io import BytesIO

import numpy as np
import requests
from matplotlib import pyplot as plt
from PIL import Image

import tvm
from tvm import rpc, autotvm
from tvm.contrib import graph_runtime, util
from tvm.contrib.download import download

import nnvm
import nnvm.compiler
from nnvm.compiler import graph_attr

import vta
import vta.testing
from vta.testing import simulator
# Load VTA parameters from the vta/config/vta_config.json file
env = vta.get_env()
@ -76,7 +70,6 @@ def classify(m, image):
# Takes in a path to a graph file, params file, and device target
# Returns the NNVM graph object, a compiled library object, and the params dict
def generate_graph(graph_fn, params_fn, device="vta"):
# Measure build start time
build_start = time.time()
@ -100,12 +93,6 @@ def generate_graph(graph_fn, params_fn, device="vta"):
shape_dict.update({k: v.shape for k, v in params.items()})
dtype_dict.update({k: str(v.dtype) for k, v in params.items()})
# Create NNVM graph
graph = nnvm.graph.create(sym)
graph_attr.set_shape_inputs(sym, shape_dict)
graph_attr.set_dtype_inputs(sym, dtype_dict)
graph = graph.apply("InferShape").apply("InferType")
# Apply NNVM graph optimization passes
sym = vta.graph.clean_cast(sym)
sym = vta.graph.clean_conv_fuse(sym)
@ -166,6 +153,9 @@ for file in [categ_fn, graph_fn, params_fn]:
# Read in ImageNet Categories
synset = eval(open(os.path.join(data_dir, categ_fn)).read())
# Download pre-tuned op parameters of conv2d for ARM CPU used in VTA
autotvm.tophub.check_package('vta')
######################################################################
# Setup the Pynq Board's RPC Server
@ -182,7 +172,6 @@ port = int(os.environ.get("VTA_PYNQ_RPC_PORT", "9091"))
# We configure both the bitstream and the runtime system on the Pynq
# to match the VTA configuration specified by the vta_config.json file.
if env.TARGET == "pynq":
# Make sure that TVM was compiled with RPC=1
assert tvm.module.enabled("rpc")
remote = rpc.connect(host, port)
@ -209,8 +198,8 @@ elif env.TARGET == "sim":
# ------------------------
# Build the ResNet graph runtime, and configure the parameters.
# Set ``device=vtacpu`` to run inference on the CPU,
# or ``device=vta`` to run inference on the FPGA.
device = "vta"
# Device context
@ -225,7 +214,6 @@ m = graph_runtime.create(graph, lib, ctx)
# Set the parameters
m.set_input(**params)
######################################################################
# Run ResNet-18 inference on a sample image
# -----------------------------------------