[WIP] C++ topi contributions (#312)

* [WIP] C++ topi contributions Summary: This diff implements C++ topi contributions for: - relu with parametric threshold - pad with generic padBefore / padAfter specification - matmult with transposes - conv2d_nchw, conv2d_hwcn with runtime constant padding and strides - depthwise_conv2d_nchw with runtime constant padding and strides - group_conv2d_ngchw with runtime constant padding and strides - broadcast_to a broadcastable shape - broadcast_bop where bop is a usual binary op (+ - * / %) Convolution padding is implemented using the pad operation. To avoid extra memory consumption, it is generally recommended to inline the padding with the autoinliner. Unfortunately in its current form the elemwise checks are too restrictive to allow inlining. So this diff also proposes an extension to LHS injective (i.e. no reduction axis in the current IR design) Test Plan: Tested in C++ testsuite in a separate repository, I am looking for suggestions to quickly spin up some tests for tvm. Reviewers: tqchen Subscribers: Tasks: Tags: Blame Revision: * Review + Lint + GSG C++
2017-08-14 00:50:25 +02:00 · 2017-08-14 00:50:25 +02:00 · f08de2b690
--- a/include/tvm/schedule_pass.h
+++ b/include/tvm/schedule_pass.h
@ -40,6 +40,16 @@ Stmt ScheduleOps(Schedule s, Map<IterVar, Range> dom_map);
 */
 void AutoInlineElemWise(Schedule sch);

+/*!
+ * \brief To automatically inline operations with injective writes
+ *   (i.e. writes without reduction or sequential loops). Note
+ *   that in this case, guarantees about contiguity, transpose, stride,
+ *   alignment and memory footprint in general do not hold.
+ *
+ * \param sch The schedule to be inlined.
+ */
+void AutoInlineInjective(Schedule sch);
+
 }  // namespace schedule
 }  // namespace tvm
 #endif  // TVM_SCHEDULE_PASS_H_
--- a/src/schedule/auto_inline_elem_wise.cc
+++ b/src/schedule/auto_inline_elem_wise.cc
@ -60,5 +60,38 @@ void AutoInlineElemWise(Schedule sch) {
  }
 }

+/*!
+ * \brief Placeholder predicate meant to recognise broadcast operations.
+ *
+ *  The detection logic is not implemented yet: every path through this
+ *  function currently returns false, so no operation is ever reported
+ *  as a broadcast.
+ *
+ * \param op The operation to inspect.
+ * \return false in all cases with the current implementation.
+ */
+bool IsBroadcast(const Operation& op) {
+  if (const ComputeOpNode* compute = op.as<ComputeOpNode>()) {
+    // A compute op with reduction axes is rejected up front.
+    if (compute->reduce_axis.size()) {
+      return false;
+    }
+    // TODO(nicolasvasilache): Implement Me
+  }
+  // Falls through for non-compute ops and for the unimplemented case.
+  return false;
+}
+
+/*!
+ * \brief Inline every unscheduled, non-output stage whose operation
+ *   satisfies IsBroadcast.
+ *
+ * \param sch The schedule whose stages are visited.
+ */
+void AutoInlineBroadcast(Schedule sch) {
+  for (Stage stage : sch->stages) {
+    // Stages that already carry a schedule are left alone.
+    if (stage.is_scheduled()) {
+      continue;
+    }
+    if (!IsBroadcast(stage->op)) {
+      continue;
+    }
+    // Output stages are never inlined.
+    if (stage->is_output) {
+      continue;
+    }
+    stage.compute_inline();
+  }
+}
+
+/*!
+ * \brief Report whether an operation is a compute op without any
+ *   reduction axis.
+ *
+ * \param op The operation to inspect.
+ * \return true for a ComputeOpNode whose reduce_axis is empty,
+ *   false for anything else.
+ */
+bool IsInjective(const Operation& op) {
+  const ComputeOpNode* compute = op.as<ComputeOpNode>();
+  if (compute == nullptr) {
+    return false;
+  }
+  return compute->reduce_axis.size() == 0;
+}
+
+/*!
+ * \brief Inline every unscheduled, non-output stage whose operation
+ *   satisfies IsInjective.
+ *
+ * \param sch The schedule whose stages are visited.
+ */
+void AutoInlineInjective(Schedule sch) {
+  for (Stage stage : sch->stages) {
+    // Stages that already carry a schedule are left alone.
+    if (stage.is_scheduled()) {
+      continue;
+    }
+    if (!IsInjective(stage->op)) {
+      continue;
+    }
+    // Output stages are never inlined.
+    if (stage->is_output) {
+      continue;
+    }
+    stage.compute_inline();
+  }
+}
+
 }  // namespace schedule
 }  // namespace tvm
--- a/topi/include/topi/broadcast.h
+++ b/topi/include/topi/broadcast.h
@ -0,0 +1,57 @@
+/*
+ *  Copyright (c) 2017 by Contributors
+ * \brief Broadcast op constructions
+ * \file broadcast.h
+ */
+#ifndef TOPI_BROADCAST_H_
+#define TOPI_BROADCAST_H_
+
+#include <topi/detail/broadcast.h>
+
+namespace topi {
+
+/*!
+ * \brief Build a tensor that reads I broadcast to output_shape.
+ *
+ *  The requested shape must have at least as many dimensions as I and
+ *  must be exactly the shape obtained by broadcasting output_shape
+ *  against I->shape; otherwise a CHECK fails.
+ *
+ * \param I The input tensor.
+ * \param output_shape The shape to broadcast to.
+ * \return A compute tensor of shape output_shape.
+ */
+inline tvm::Tensor broadcast_to(const tvm::Tensor& I,
+                                const tvm::Array<tvm::Expr>& output_shape) {
+  CHECK_GE(output_shape.size(), I->shape.size())
+      << "Not a broadcast, output dimensionality smaller than input.\noutput: "
+      << output_shape << "\nvs\ninput: " << I;
+  auto helper = detail::BroadcastShape(output_shape, I->shape);
+  // The broadcast result has to coincide with what the caller asked for.
+  CHECK_EQ(output_shape.size(), helper.common_shape.size());
+  for (size_t dim = 0; dim < output_shape.size(); ++dim) {
+    CHECK(tvm::ir::Equal(output_shape[dim], helper.common_shape[dim]));
+  }
+  tvm::Array<tvm::Expr> result_shape(helper.common_shape.begin(),
+                                     helper.common_shape.end());
+  // I was passed as the second shape, hence its variables live in vars2.
+  auto fcompute = [&](tvm::Array<tvm::Var> out_vars) {
+    return I(detail::InputIndexFromBroadcast(out_vars, I, helper.vars2,
+                                             helper.all_vars));
+  };
+  return tvm::compute(result_shape, fcompute);
+}
+
+/*! \brief A + B, with both operands broadcast to a common shape. */
+inline tvm::Tensor broadcast_add(const tvm::Tensor& A, const tvm::Tensor& B) {
+  return detail::WithBroadcast(
+      [](tvm::Expr lhs, tvm::Expr rhs) { return lhs + rhs; }, A, B);
+}
+
+/*! \brief A - B, with both operands broadcast to a common shape. */
+inline tvm::Tensor broadcast_sub(const tvm::Tensor& A, const tvm::Tensor& B) {
+  return detail::WithBroadcast(
+      [](tvm::Expr lhs, tvm::Expr rhs) { return lhs - rhs; }, A, B);
+}
+
+/*! \brief A * B, with both operands broadcast to a common shape. */
+inline tvm::Tensor broadcast_mul(const tvm::Tensor& A, const tvm::Tensor& B) {
+  return detail::WithBroadcast(
+      [](tvm::Expr lhs, tvm::Expr rhs) { return lhs * rhs; }, A, B);
+}
+
+/*! \brief A / B, with both operands broadcast to a common shape. */
+inline tvm::Tensor broadcast_div(const tvm::Tensor& A, const tvm::Tensor& B) {
+  return detail::WithBroadcast(
+      [](tvm::Expr lhs, tvm::Expr rhs) { return lhs / rhs; }, A, B);
+}
+
+/*! \brief A % B, with both operands broadcast to a common shape. */
+inline tvm::Tensor broadcast_mod(const tvm::Tensor& A, const tvm::Tensor& B) {
+  return detail::WithBroadcast(
+      [](tvm::Expr lhs, tvm::Expr rhs) { return lhs % rhs; }, A, B);
+}
+
+}  // namespace topi
+
+#endif  // TOPI_BROADCAST_H_
--- a/topi/include/topi/detail/broadcast.h
+++ b/topi/include/topi/detail/broadcast.h
@ -0,0 +1,107 @@
+/*
+ *  Copyright (c) 2017 by Contributors
+ * \brief Detail broadcast.
+ * \file broadcast.h
+ */
+#ifndef TOPI_DETAIL_BROADCAST_H_
+#define TOPI_DETAIL_BROADCAST_H_
+
+#include <algorithm>
+#include <deque>
+
+#include "tvm/ir_pass.h"
+#include "tvm/tvm.h"
+
+namespace topi {
+namespace detail {
+
+/*!
+ * \brief Result of broadcasting two shapes against each other, as
+ *   filled in by BroadcastShape.
+ */
+struct BroadcastHelper {
+  // Extents of the broadcast result, outermost dimension first.
+  std::deque<tvm::Expr> common_shape;
+  // One fresh variable per dimension of common_shape.
+  std::deque<tvm::Var> all_vars;
+  // Subset of all_vars recorded for the first shape.
+  std::deque<tvm::Var> vars1;
+  // Subset of all_vars recorded for the second shape.
+  std::deque<tvm::Var> vars2;
+};
+
+/*!
+ * \brief Compute the shape obtained by broadcasting two shapes.
+ *
+ *  Dimensions are matched starting from the last one. For each matched
+ *  pair the extents must be structurally equal, or one of them must be
+ *  the constant 1; anything else fails a CHECK. Leading dimensions that
+ *  exist in only one of the shapes are copied over unchanged.
+ *
+ *  A fresh variable is created per result dimension. It is recorded in
+ *  vars1 and/or vars2 depending on which operand keeps a non-broadcast
+ *  extent along that dimension.
+ *
+ * \param shape1 Shape of the first operand.
+ * \param shape2 Shape of the second operand.
+ * \return The common shape together with the variable bookkeeping.
+ */
+inline BroadcastHelper BroadcastShape(const tvm::Array<tvm::Expr>& shape1,
+                                      const tvm::Array<tvm::Expr>& shape2) {
+  BroadcastHelper bh;
+  int s1_size = shape1.size();
+  int s2_size = shape2.size();
+  tvm::Expr one(1);
+  // i counts dimensions from the back (1 == last dimension) and is
+  // deliberately shared with the second loop below.
+  int i;
+  for (i = 1; i <= std::min(s1_size, s2_size); ++i) {
+    // push_front keeps every deque in outermost-first order, so the
+    // variable just created is always at index 0.
+    bh.all_vars.push_front(tvm::Var());
+    if (tvm::ir::Equal(shape1[s1_size - i], shape2[s2_size - i])) {
+      bh.common_shape.push_front(shape1[s1_size - i]);
+      bh.vars1.push_front(bh.all_vars[0]);
+      bh.vars2.push_front(bh.all_vars[0]);
+    } else if (tvm::ir::Equal(one, shape1[s1_size - i])) {
+      // shape1 is broadcast along this dimension: only shape2 indexes it.
+      CHECK(!tvm::ir::Equal(one, shape2[s2_size - i]));
+      bh.common_shape.push_front(shape2[s2_size - i]);
+      bh.vars2.push_front(bh.all_vars[0]);
+    } else if (tvm::ir::Equal(one, shape2[s2_size - i])) {
+      // shape2 is broadcast along this dimension: only shape1 indexes it.
+      bh.common_shape.push_front(shape1[s1_size - i]);
+      bh.vars1.push_front(bh.all_vars[0]);
+    } else {
+      CHECK(false) << "Incompatible broadcast dims: " << shape1[s1_size - i]
+                   << " and " << shape2[s2_size - i] << " in: "
+                   << tvm::Array<tvm::Expr>(shape1.begin(), shape1.end())
+                   << " and "
+                   << tvm::Array<tvm::Expr>(shape2.begin(), shape2.end());
+    }
+  }
+  // Remaining dimensions whether on shape1 or shape2 can always be completed
+  auto max_size = std::max(s1_size, s2_size);
+  auto& shape = (s1_size > s2_size) ? shape1 : shape2;
+  auto& vars = (s1_size > s2_size) ? bh.vars1 : bh.vars2;
+  // Continue from where the first loop stopped.
+  for (; i <= max_size; ++i) {
+    bh.all_vars.push_front(tvm::Var());
+    bh.common_shape.push_front(shape[max_size - i]);
+    vars.push_front(bh.all_vars[0]);
+  }
+  return bh;
+}
+
+/*!
+ * \brief Derive the indices used to read an input tensor from the
+ *   iteration variables of a broadcast output.
+ *
+ *  Output position p contributes ovars[p] when all_vars[p] is one of
+ *  my_vars. Otherwise, if p lies within the trailing T->shape.size()
+ *  positions, a zero index is emitted for it; positions further out
+ *  contribute nothing.
+ *
+ * \param ovars Iteration variables of the output, one per dimension.
+ * \param T The input tensor being indexed.
+ * \param my_vars Broadcast variables recorded for T.
+ * \param all_vars Broadcast variables of every output dimension.
+ * \return One index expression per dimension of T.
+ */
+inline tvm::Array<tvm::Expr> InputIndexFromBroadcast(
+    const tvm::Array<tvm::Var>& ovars, const tvm::Tensor& T,
+    const std::deque<tvm::Var>& my_vars, const std::deque<tvm::Var>& all_vars) {
+  CHECK_EQ(ovars.size(), all_vars.size());
+  int expected_dims = T->shape.size();
+  tvm::Array<tvm::Expr> ivars;
+  // Quadratic membership scan; the deques are one entry per dimension.
+  for (size_t pos = 0; pos < ovars.size(); ++pos) {
+    const tvm::Var& candidate = all_vars[pos];
+    const bool owned = std::any_of(
+        my_vars.begin(), my_vars.end(),
+        [&candidate](const tvm::Var& v) { return candidate.same_as(v); });
+    if (owned) {
+      ivars.push_back(ovars[pos]);
+    } else if ((ovars.size() - pos) <= expected_dims) {
+      // Inside the rank of T but not one of its variables: emit index 0.
+      ivars.push_back(tvm::make_zero(ovars[pos].type()));
+    }
+  }
+  CHECK(expected_dims == ivars.size());
+  return ivars;
+}
+
+
+/*!
+ * \brief Apply a binary expression builder to two tensors after
+ *   broadcasting them to a common shape.
+ *
+ * \param op Callable combining the two element expressions.
+ * \param A Left operand.
+ * \param B Right operand.
+ * \return A compute tensor with the broadcast shape of A and B.
+ */
+template <typename FBinaryExpr>
+inline tvm::Tensor WithBroadcast(FBinaryExpr op, const tvm::Tensor& A,
+                                 const tvm::Tensor& B) {
+  auto helper = BroadcastShape(A->shape, B->shape);
+  tvm::Array<tvm::Expr> out_shape(helper.common_shape.begin(),
+                                  helper.common_shape.end());
+  auto fcompute = [&](tvm::Array<tvm::Var> out_vars) {
+    auto lhs = A(
+        InputIndexFromBroadcast(out_vars, A, helper.vars1, helper.all_vars));
+    auto rhs = B(
+        InputIndexFromBroadcast(out_vars, B, helper.vars2, helper.all_vars));
+    return op(lhs, rhs);
+  };
+  return tvm::compute(out_shape, fcompute);
+}
+
+}  // namespace detail
+}  // namespace topi
+
+#endif  // TOPI_DETAIL_BROADCAST_H_
--- a/topi/include/topi/ewise.h
+++ b/topi/include/topi/ewise.h
@ -1,6 +1,6 @@
 /*!
 *  Copyright (c) 2017 by Contributors
- * \file topi.h
+ * \file ewise.h
 * \brief Elementwise op constructions
 */
 #ifndef TOPI_EWISE_H_
@ -12,16 +12,17 @@ namespace topi {
 using namespace tvm;

 // Unary intrinsic operators
-#define TOPI_DECLARE_UNARY_OP(OpName)                                   \
-  inline Tensor OpName(const Tensor& x) {                               \
-    return compute(x->shape, [&](const Array<Var>& i) {                 \
-        return ::tvm::OpName(x(i));                                     \
-      });                                                               \
+#define TOPI_DECLARE_UNARY_OP(OpName)                   \
+  inline Tensor OpName(const Tensor& x) {               \
+    return compute(x->shape, [&](const Array<Var>& i) { \
+        return ::tvm::OpName(x(i));                     \
+      }, "tensor", "ewise");                            \
  }

 TOPI_DECLARE_UNARY_OP(exp);
 TOPI_DECLARE_UNARY_OP(tanh);
 TOPI_DECLARE_UNARY_OP(sigmoid);
 TOPI_DECLARE_UNARY_OP(sqrt);
+
 }  // namespace topi
 #endif  // TOPI_EWISE_H_
--- a/topi/include/topi/nn.h
+++ b/topi/include/topi/nn.h
@ -0,0 +1,207 @@
+/*
+ *  Copyright (c) 2017 by Contributors
+ * \brief NN op constructions
+ * \file nn.h
+ */
+#ifndef TOPI_NN_H_
+#define TOPI_NN_H_
+
+#include <algorithm>
+
+#include "tvm/ir.h"
+#include "tvm/ir_pass.h"
+#include "tvm/tvm.h"
+
+namespace topi {
+namespace detail {
+
+/*!
+ * \brief Left-fold a non-empty list of expressions with a binary op.
+ *
+ * \param exprs The expressions to combine; must hold at least one.
+ * \param op Binary callable applied as op(accumulated, next).
+ * \return exprs[0] when there is a single element, otherwise the
+ *   accumulated result of applying op from left to right.
+ */
+template <typename T>
+tvm::Expr Map(const tvm::Array<tvm::Expr>& exprs, T op) {
+  CHECK_GE(exprs.size(), 1);
+  tvm::Expr folded = exprs[0];
+  size_t next = 1;
+  while (next < exprs.size()) {
+    folded = op(folded, exprs[next]);
+    ++next;
+  }
+  return folded;
+}
+
+}  // namespace detail
+
+/*!
+ * \brief Elementwise max(x, threshold).
+ *
+ *  T cannot be deduced from the default argument, so a caller relying
+ *  on the default threshold has to name T explicitly, e.g.
+ *  relu<float>(x).
+ *
+ * \param x The input tensor.
+ * \param threshold Lower bound applied to every element.
+ * \return A compute tensor with the shape of x.
+ */
+template <typename T>
+inline tvm::Tensor relu(const tvm::Tensor& x, T threshold = static_cast<T>(0)) {
+  auto clamp = [&](const tvm::Array<tvm::Var>& idx) {
+    return tvm::max(x(idx), threshold);
+  };
+  return tvm::compute(x->shape, clamp, "tensor", "ewise");
+}
+
+/*!
+ * \brief Pad the leading dimensions of a tensor with zeros.
+ *
+ *  Entry i of pad_before / pad_after applies to dimension i of t;
+ *  dimensions beyond pad_before.size() are left untouched. When
+ *  pad_after is shorter than pad_before, the missing entries are taken
+ *  from pad_before at the same position.
+ *
+ * \param t The tensor to pad.
+ * \param pad_before Amount added in front of each leading dimension;
+ *   must hold at least one entry.
+ * \param pad_after Amount added behind each leading dimension.
+ * \return A compute tensor whose padded dimensions have extent
+ *   t->shape[i] + pad_before[i] + pad_after[i].
+ */
+inline tvm::Tensor pad(
+    const tvm::Tensor& t, const tvm::Array<tvm::Expr>& pad_before,
+    tvm::Array<tvm::Expr> pad_after = tvm::Array<tvm::Expr>()) {
+  if (pad_after.size() < pad_before.size()) {
+    for (int i = pad_after.size(); i < pad_before.size(); ++i) {
+      pad_after.push_back(pad_before[i]);
+    }
+  }
+  CHECK_GE(pad_before.size(), 1);
+  CHECK_EQ(pad_before.size(), pad_after.size());
+  tvm::Array<tvm::Expr> output_shape;
+  for (int i = 0; i < t->shape.size(); ++i) {
+    if (i >= pad_before.size()) {
+      output_shape.push_back(t->shape[i]);
+    } else {
+      output_shape.push_back(
+          tvm::ir::Simplify(t->shape[i] + pad_before[i] + pad_after[i]));
+    }
+  }
+  auto l = [&](tvm::Array<tvm::Var> ovars) -> tvm::Expr {
+    tvm::Array<tvm::Expr> indices;
+    // Conditions that must all hold for the output point to map onto t.
+    tvm::Array<tvm::Expr> sel;
+    for (int i = 0; i < t->shape.size(); ++i) {
+      if (i >= pad_before.size()) {
+        indices.push_back(ovars[i]);
+        continue;
+      }
+      // Bounds checks are only emitted for pads not equal to literal 0.
+      if (!tvm::ir::Equal(pad_before[i], 0)) {
+        sel.push_back(ovars[i] >= pad_before[i]);
+        indices.push_back(ovars[i] - pad_before[i]);
+      } else {
+        indices.push_back(ovars[i]);
+      }
+      if (!tvm::ir::Equal(pad_after[i], 0)) {
+        sel.push_back(tvm::ir::Simplify(ovars[i] < pad_before[i] + t->shape[i]));
+      }
+    }
+    // With all pads equal to zero there is nothing to select on, and
+    // detail::Map rejects an empty list: read the input directly.
+    if (sel.size() == 0) {
+      return t(indices);
+    }
+    return tvm::select(detail::Map(sel, tvm::ir::And::make), t(indices), 0);
+  };
+  return tvm::compute(output_shape, l, "tensor", "ewise");
+}
+
+// Returns a compute that calculates a matrix multiplication reduced over k:
+//   sum_k A(i, k) * B(k, j)   when neither operand is transposed;
+//   A is read as A(k, i) when trans_a is set and B is read as B(j, k)
+//   when trans_b is set, each flag applying independently.
+// The result has shape [A->shape[trans_a ? 1 : 0], B->shape[trans_b ? 0 : 1]].
+// Only the first two dimensions of A and B are read; their rank is not
+// checked here.
+inline tvm::Tensor matmult(const tvm::Tensor& A, const tvm::Tensor& B,
+                           bool trans_a = false, bool trans_b = false) {
+  tvm::Array<tvm::Expr> output_shape{A->shape[trans_a ? 1 : 0],
+                                     B->shape[trans_b ? 0 : 1]};
+  // The reduction extent is taken from A; B's matching extent is not
+  // compared against it.
+  auto k = tvm::reduce_axis(tvm::Range{0, A->shape[trans_a ? 0 : 1]}, "k");
+  auto l = [&](tvm::Var i, tvm::Var j) {
+    return tvm::sum((trans_a ? A[k][i] : A[i][k]) * (trans_b ? B[j][k] : B[k][j]),
+                    {k});
+  };
+  return tvm::compute(output_shape, l);
+}
+
+/*!
+ * \brief 2-D convolution over a 4-D input indexed as [B, C, H, W].
+ *
+ *  The weight tensor is indexed as W(c, o, kh, kw), i.e. input channel
+ *  first and output channel second. Padding, when non-zero, is applied
+ *  to the two trailing input dimensions through pad().
+ *
+ * \param I Input tensor, 4-D.
+ * \param W Weight tensor, 4-D.
+ * \param pad_h Padding applied on both sides of dimension 2.
+ * \param pad_w Padding applied on both sides of dimension 3.
+ * \param stride_h Stride along dimension 2.
+ * \param stride_w Stride along dimension 3.
+ * \return A compute tensor of shape [B, O, H_out, W_out] with
+ *   H_out = (H - KH + 2 * pad_h) / stride_h + 1 and likewise for W_out.
+ */
+inline tvm::Tensor conv2d_nchw(const tvm::Tensor& I, const tvm::Tensor& W,
+                               int pad_h = 0, int pad_w = 0, int stride_h = 1,
+                               int stride_w = 1) {
+  CHECK_EQ(4, I->shape.size());
+  CHECK_EQ(4, W->shape.size());
+  tvm::Array<tvm::Expr> output_shape{
+      I->shape[0],                                            // B
+      W->shape[1],                                            // O
+      (I->shape[2] - W->shape[2] + 2 * pad_h) / stride_h + 1,  // H
+      (I->shape[3] - W->shape[3] + 2 * pad_w) / stride_w + 1   // W
+  };
+  auto i = tvm::reduce_axis(tvm::Range{0, I->shape[1]}, "i");
+  auto kh = tvm::reduce_axis(tvm::Range{0, W->shape[2]}, "kh");
+  auto kw = tvm::reduce_axis(tvm::Range{0, W->shape[3]}, "kw");
+  // Skip the pad stage entirely when no padding was requested.
+  auto T = (pad_h == 0 && pad_w == 0)
+               ? I
+               : pad(I, {tvm::Expr(0), tvm::Expr(0), pad_h, pad_w});
+  auto l = [&](tvm::Var b, tvm::Var o, tvm::Var h, tvm::Var w) {
+    return tvm::sum(
+        T(b, i, stride_h * h + kh, stride_w * w + kw) * W(i, o, kh, kw),
+        {i, kh, kw});
+  };
+  return tvm::compute(output_shape, l);
+}
+
+/*!
+ * \brief 2-D convolution over a 4-D input indexed as [H, W, C, B].
+ *
+ *  The weight tensor is indexed as W(kh, kw, c, o). Padding, when
+ *  non-zero, is applied to the two leading input dimensions through
+ *  pad().
+ *
+ * \param I Input tensor, 4-D.
+ * \param W Weight tensor, 4-D.
+ * \param pad_h Padding applied on both sides of dimension 0.
+ * \param pad_w Padding applied on both sides of dimension 1.
+ * \param stride_h Stride along dimension 0.
+ * \param stride_w Stride along dimension 1.
+ * \return A compute tensor of shape [H_out, W_out, B, O] with
+ *   H_out = (H - KH + 2 * pad_h) / stride_h + 1 and likewise for W_out.
+ */
+inline tvm::Tensor conv2d_hwcn(const tvm::Tensor& I, const tvm::Tensor& W,
+                               int pad_h = 0, int pad_w = 0, int stride_h = 1,
+                               int stride_w = 1) {
+  CHECK_EQ(4, I->shape.size());
+  CHECK_EQ(4, W->shape.size());
+  // Spatial extents are dimensions 0 and 1 of both I and W in this
+  // layout; batch is dimension 3 and input channel is dimension 2 of I.
+  tvm::Array<tvm::Expr> output_shape{
+      (I->shape[0] - W->shape[0] + 2 * pad_h) / stride_h + 1,  // H
+      (I->shape[1] - W->shape[1] + 2 * pad_w) / stride_w + 1,  // W
+      I->shape[3],                                            // B
+      W->shape[3]                                             // O
+  };
+  auto i = tvm::reduce_axis(tvm::Range{0, I->shape[2]}, "i");
+  auto kh = tvm::reduce_axis(tvm::Range{0, W->shape[0]}, "kh");
+  auto kw = tvm::reduce_axis(tvm::Range{0, W->shape[1]}, "kw");
+  auto T = (pad_h == 0 && pad_w == 0) ? I : pad(I, {pad_h, pad_w});
+  // Parameters follow the order of output_shape: (H, W, B, O).
+  auto l = [&](tvm::Var h, tvm::Var w, tvm::Var b, tvm::Var o) {
+    return tvm::sum(
+        T(stride_h * h + kh, stride_w * w + kw, i, b) * W(kh, kw, i, o),
+        {i, kh, kw});
+  };
+  return tvm::compute(output_shape, l);
+}
+
+/*!
+ * \brief Depthwise-style 2-D convolution over a 4-D input indexed as
+ *   [B, C, H, W].
+ *
+ *  W->shape[1] is used as the channel multiplier and also as the
+ *  output channel extent. Padding, when non-zero, is applied to the
+ *  two trailing input dimensions through pad().
+ *
+ * \param I Input tensor, 4-D.
+ * \param W Weight tensor, 4-D.
+ * \param pad_h Padding applied on both sides of dimension 2.
+ * \param pad_w Padding applied on both sides of dimension 3.
+ * \param stride_h Stride along dimension 2.
+ * \param stride_w Stride along dimension 3.
+ * \return A compute tensor of shape [B, W->shape[1], H_out, W_out].
+ */
+inline tvm::Tensor depthwise_conv2d_nchw(const tvm::Tensor& I,
+                                         const tvm::Tensor& W, int pad_h = 0,
+                                         int pad_w = 0, int stride_h = 1,
+                                         int stride_w = 1) {
+  CHECK_EQ(4, I->shape.size());
+  CHECK_EQ(4, W->shape.size());
+  // pH and pW are not referenced again in this function.
+  auto pH = I->shape[2];
+  auto pW = I->shape[3];
+  auto pCM = W->shape[1];  // channel_multiplier
+  tvm::Array<tvm::Expr> output_shape{
+      I->shape[0],                                            // B
+      W->shape[1],                                            // O
+      (I->shape[2] - W->shape[2] + 2 * pad_h) / stride_h + 1,  // H
+      (I->shape[3] - W->shape[3] + 2 * pad_w) / stride_w + 1   // W
+  };
+  auto i = tvm::reduce_axis(tvm::Range{0, I->shape[1]}, "i");
+  auto kh = tvm::reduce_axis(tvm::Range{0, W->shape[2]}, "kh");
+  auto kw = tvm::reduce_axis(tvm::Range{0, W->shape[3]}, "kw");
+  auto T = (pad_h == 0 && pad_w == 0)
+               ? I
+               : pad(I, {tvm::Expr(0), tvm::Expr(0), pad_h, pad_w});
+  // NOTE(review): i is reduced over the full input channel extent, and
+  // o already ranges over pCM (see output_shape), so `o % pCM` looks
+  // like a no-op. Confirm this indexing matches the intended depthwise
+  // semantics.
+  auto l = [&](tvm::Var b, tvm::Var o, tvm::Var h, tvm::Var w) {
+    return tvm::sum(T(b, i / pCM, stride_h * h + kh, stride_w * w + kw) *
+                        W(i / pCM, o % pCM, kh, kw),
+                    {i, kh, kw});
+  };
+  return tvm::compute(output_shape, l);
+}
+
+/*!
+ * \brief Grouped 2-D convolution over a 5-D input indexed as
+ *   [B, G, C, H, W].
+ *
+ *  The weight tensor is indexed as W(g, c, o, kh, kw). Padding, when
+ *  non-zero, is applied to the two trailing input dimensions through
+ *  pad().
+ *
+ * \param I Input tensor, 5-D.
+ * \param W Weight tensor, 5-D.
+ * \param pad_h Padding applied on both sides of dimension 3.
+ * \param pad_w Padding applied on both sides of dimension 4.
+ * \param stride_h Stride along dimension 3.
+ * \param stride_w Stride along dimension 4.
+ * \return A compute tensor of shape [B, G, O, H_out, W_out] with
+ *   H_out = (H - KH + 2 * pad_h) / stride_h + 1 and likewise for W_out.
+ */
+inline tvm::Tensor group_conv2d_ngchw(const tvm::Tensor& I,
+                                      const tvm::Tensor& W, int pad_h = 0,
+                                      int pad_w = 0, int stride_h = 1,
+                                      int stride_w = 1) {
+  CHECK_EQ(5, I->shape.size());
+  CHECK_EQ(5, W->shape.size());
+  tvm::Array<tvm::Expr> output_shape{
+      I->shape[0],                                            // B
+      I->shape[1],                                            // G
+      W->shape[2],                                            // O
+      (I->shape[3] - W->shape[3] + 2 * pad_h) / stride_h + 1,  // H
+      (I->shape[4] - W->shape[4] + 2 * pad_w) / stride_w + 1   // W
+  };
+  auto i = tvm::reduce_axis(tvm::Range{0, I->shape[2]}, "i");
+  auto kh = tvm::reduce_axis(tvm::Range{0, W->shape[3]}, "kh");
+  auto kw = tvm::reduce_axis(tvm::Range{0, W->shape[4]}, "kw");
+
+  auto T = (pad_h == 0 && pad_w == 0)
+               ? I
+               : pad(I, {tvm::Expr(0), tvm::Expr(0), tvm::Expr(0), pad_h, pad_w});
+  // Read from the padded tensor T: the output extents above account
+  // for the padding, so indexing I directly would run past its bounds.
+  auto l = [&](tvm::Var b, tvm::Var g, tvm::Var o, tvm::Var h, tvm::Var w) {
+    return tvm::sum(
+        T(b, g, i, stride_h * h + kh, stride_w * w + kw) * W(g, i, o, kh, kw),
+        {i, kh, kw});
+  };
+  return tvm::compute(output_shape, l);
+}
+
+}  // namespace topi
+#endif  // TOPI_NN_H_