diff --git a/include/tvm/runtime/ndarray.h b/include/tvm/runtime/ndarray.h index 99329517..0bb4ca2d 100644 --- a/include/tvm/runtime/ndarray.h +++ b/include/tvm/runtime/ndarray.h @@ -99,6 +99,8 @@ class NDArray { bool defined() const { return data_ != nullptr; } + /*! \return If NDArray is allocated*/ + inline bool allocated() const; /*! \return If both NDArray reference the same container */ bool same_as(const NDArray& other) const { return data_ == other.data_; @@ -164,11 +166,13 @@ class NDArray { * \param shape The shape of the new array. * \param dtype The data type of the new array. * \param ctx The context of the Array. + * \param allocate Allocate memory if true. * \return The created Array */ TVM_DLL static NDArray Empty(std::vector shape, DLDataType dtype, - DLContext ctx); + DLContext ctx, + bool allocate = true); /*! * \brief Create a NDArray backed by a dlpack tensor. * @@ -354,6 +358,10 @@ inline void NDArray::reset() { } } +inline bool NDArray::allocated() const { + return defined() && data_->dl_tensor.data != nullptr; +} + /*! \brief return the size of data the DLTensor hold, in term of number of bytes * * \param arr the input DLTensor diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc index 38016ab8..391e3eeb 100644 --- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph/graph_runtime.cc @@ -54,7 +54,15 @@ inline size_t GetDataAlignment(const DLTensor& arr) { void GraphRuntime::Run() { // setup the array and requirements. for (size_t i = 0; i < op_execs_.size(); ++i) { - if (op_execs_[i]) op_execs_[i](); + if (op_execs_[i]) { + auto& op_arg = op_args_[i]; + if (op_arg) { + for (auto& arg : op_arg->args) { + CHECK(arg.data != nullptr) << "Un-initialized input!"; + } + } + op_execs_[i](); + } } } /*! @@ -106,6 +114,8 @@ int GraphRuntime::GetInputIndex(const std::string& name) { void GraphRuntime::SetInput(int index, DLTensor* data_in) { CHECK_LT(static_cast(index), input_nodes_.size()); uint32_t eid = this->entry_id(input_nodes_[index], 0); + CHECK(data_entry_[eid].allocated()) + << "Invoke 'set_input_zero_copy' for 'lazy_init_input' entry!"; data_entry_[eid].CopyFrom(data_in); } /*! @@ -255,7 +265,14 @@ void GraphRuntime::SetupStorage() { for (const std::string& s_type : attrs_.dltype) { vtype.push_back(tvm::runtime::String2TVMType(s_type)); } - + // get the entry id(s) of lazy initialized inputs + std::vector lazy_init_entries; + for (auto const& name : attrs_.lazy_init_input) { + int in_idx = GetInputIndex(name); + CHECK_GE(in_idx, 0) << "input \"" << name << "\" does not exist!"; + uint32_t eid = this->entry_id(input_nodes_[in_idx], 0); + lazy_init_entries.push_back(eid); + } // Size and device type of each storage pool entry. std::vector pool_entry; // Find the maximum space size. @@ -286,6 +303,8 @@ void GraphRuntime::SetupStorage() { } pool_entry[sid].size = std::max(pool_entry[sid].size, bytes); pool_entry[sid].device_type = device_type; + pool_entry[sid].lazy_init = (std::find(lazy_init_entries.begin(), + lazy_init_entries.end(), i) != lazy_init_entries.end()); } // Allocate the space. @@ -300,7 +319,7 @@ void GraphRuntime::SetupStorage() { TVMContext ctx = cit == ctxs_.end() ? ctxs_[0] : *cit; shape.push_back(static_cast(pit.size + 3) / 4); storage_pool_.push_back( - NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, ctx)); + NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, ctx, !pit.lazy_init)); } // Assign the pooled entries. A unified memory pool is used to simplifiy diff --git a/src/runtime/graph/graph_runtime.h b/src/runtime/graph/graph_runtime.h index e8097a83..6b771645 100644 --- a/src/runtime/graph/graph_runtime.h +++ b/src/runtime/graph/graph_runtime.h @@ -188,7 +188,8 @@ class GraphRuntime : public ModuleNode { struct PoolEntry { size_t size; int device_type; - PoolEntry(int s, int dev_type) : size(s), device_type(dev_type) {} + bool lazy_init; + PoolEntry(int s, int dev_type) : size(s), device_type(dev_type), lazy_init(false) {} }; // Node entry struct NodeEntry { @@ -277,6 +278,7 @@ class GraphRuntime : public ModuleNode { std::vector device_index; std::vector dltype; std::vector > shape; + std::vector lazy_init_input; // The graph attribute fields. void Load(dmlc::JSONReader *reader) { reader->BeginObject(); @@ -318,6 +320,14 @@ class GraphRuntime : public ModuleNode { CHECK(reader->NextArrayItem()); reader->Read(&device_index); CHECK(!reader->NextArrayItem()); + } else if (key == "lazy_init_input") { + reader->BeginArray(); + CHECK(reader->NextArrayItem()); + reader->Read(&type); + CHECK_EQ(type, "list_str"); + CHECK(reader->NextArrayItem()); + reader->Read(&lazy_init_input); + CHECK(!reader->NextArrayItem()); } else { reader->BeginArray(); CHECK(reader->NextArrayItem()); diff --git a/src/runtime/ndarray.cc b/src/runtime/ndarray.cc index 98e05a86..beae7550 100644 --- a/src/runtime/ndarray.cc +++ b/src/runtime/ndarray.cc @@ -142,14 +142,17 @@ DLManagedTensor* NDArray::ToDLPack() const { NDArray NDArray::Empty(std::vector shape, DLDataType dtype, - DLContext ctx) { + DLContext ctx, + bool allocate) { NDArray ret = Internal::Create(shape, dtype, ctx); - // setup memory content - size_t size = GetDataSize(ret.data_->dl_tensor); - size_t alignment = GetDataAlignment(ret.data_->dl_tensor); - ret.data_->dl_tensor.data = - DeviceAPI::Get(ret->ctx)->AllocDataSpace( - ret->ctx, size, alignment, ret->dtype); + if (allocate) { + // setup memory content + size_t size = GetDataSize(ret.data_->dl_tensor); + size_t alignment = GetDataAlignment(ret.data_->dl_tensor); + ret.data_->dl_tensor.data = + DeviceAPI::Get(ret->ctx)->AllocDataSpace( + ret->ctx, size, alignment, ret->dtype); + } return ret; } diff --git a/tests/cpp/build_module_test.cc b/tests/cpp/build_module_test.cc index 1a7f791f..1e83c178 100644 --- a/tests/cpp/build_module_test.cc +++ b/tests/cpp/build_module_test.cc @@ -189,6 +189,77 @@ TEST(BuildModule, Heterogeneous) { } } +TEST(BuildModule, LazyInitInput) { + using namespace tvm; + + const int n = 4; + Array shape{n}; + + auto A = placeholder(shape, Float(32), "A"); + auto B = placeholder(shape, Float(32), "B"); + + auto C = compute(A->shape, [&A, &B](Expr i) { + return A[i] + B[i]; + }, "C"); + + auto s = create_schedule({ C->op }); + auto args = Array({ A, B, C }); + std::unordered_map binds; + + auto config = BuildConfig::Create(); + auto target = target::llvm(); + + auto lowered = lower(s, args, "myadd", binds, config); + auto module = build(lowered, target, Target(), config); + + std::string json = + "{\"nodes\": [{\"op\": \"null\", \"name\": \"x\", \"inputs\": []}, {\"op\": \"null\", \"name\": \"y\", \"inputs\": []}, " + "{\"op\": \"tvm_op\", \"name\": \"add\", \"inputs\": [[0, 0, 0], [1, 0, 0]], \"attrs\": {\"func_name\": " + "\"myadd\", \"flatten_data\": \"1\", \"num_inputs\": \"2\", \"num_outputs\": \"1\"}}], " + "\"arg_nodes\": [0, 1], \"node_row_ptr\": [0, 1, 2, 3], \"heads\": [[2, 0, 0]], " + "\"attrs\": {\"shape\": [\"list_shape\", [[4], [4], [4]]], \"dltype\": [\"list_str\", [\"float32\", \"float32\", \"float32\"]], " + "\"storage_id\": [\"list_int\", [0, 1, 2]], \"lazy_init_input\": [\"list_str\", [\"y\"]]}}"; + + // Setup inputs. + auto a_val = runtime::NDArray::Empty({n}, {kDLFloat, 32, 1}, {kDLCPU, 0}); + auto b_val = runtime::NDArray::Empty({n}, {kDLFloat, 32, 1}, {kDLCPU, 0}); + + auto pa = (float*)a_val.ToDLPack()->dl_tensor.data; + auto pb = (float*)b_val.ToDLPack()->dl_tensor.data; + + // Assign values. + for (int i = 0; i < n; i++) { + pa[i] = pb[i] = i; + } + + // Initialize graph runtime. + int cpu_dev_ty = static_cast(kDLCPU); + int cpu_dev_id = 0; + + const runtime::PackedFunc* graph_runtime = + tvm::runtime::Registry::Get("tvm.graph_runtime.create"); + runtime::Module mod = (*graph_runtime)(json, module, cpu_dev_ty, cpu_dev_id); + + PackedFunc get_input = mod.GetFunction("get_input", false); + CHECK(((runtime::NDArray)get_input("x")).allocated()); + CHECK(!((runtime::NDArray)get_input("y")).allocated()); + + PackedFunc set_input = mod.GetFunction("set_input", false); + PackedFunc set_input_zero_copy = mod.GetFunction("set_input_zero_copy", false); + PackedFunc run = mod.GetFunction("run", false); + PackedFunc get_output = mod.GetFunction("get_output", false); + set_input("x", a_val); + set_input_zero_copy("y", b_val); + run(); + tvm::runtime::NDArray out = get_output(0); + float* p_out = (float*)out.ToDLPack()->dl_tensor.data; + + // Check correctness. + for (int i = 0; i < n; ++i) { + CHECK_LT(std::fabs(p_out[i] - i*2), 1e-5); + } +} + int main(int argc, char ** argv) { testing::InitGoogleTest(&argc, argv); testing::FLAGS_gtest_death_test_style = "threadsafe";