[Runtime] Allow parameter sharing between modules (#3489)
As GraphRuntime does not provide control-flow logic, we have to split our model into two parts, and we need to share parameters between the parts to reduce memory usage. Solution:

1) add "lazy_init_input" to the graph's "attrs":

   "attrs": {
     ...
     "lazy_init_input": ["list_str", ["p0"]]
   }

2) allow un-allocated NDArray entries in SetupStorage
3) use the "set_input_zero_copy" function to set such parameters
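For illustration, a minimal sketch of the intended usage (not part of this commit; json_a, json_b, and the shared parameter name "p0" are hypothetical graphs/names, both assumed to list "p0" under "lazy_init_input", and `module` is a compiled runtime module as in the test below):

  // Create two runtimes; SetupStorage leaves the "p0" entry of each
  // un-allocated because it is declared in "lazy_init_input".
  int cpu_dev_ty = static_cast<int>(kDLCPU);
  int cpu_dev_id = 0;
  const runtime::PackedFunc* create =
      tvm::runtime::Registry::Get("tvm.graph_runtime.create");
  runtime::Module mod_a = (*create)(json_a, module, cpu_dev_ty, cpu_dev_id);
  runtime::Module mod_b = (*create)(json_b, module, cpu_dev_ty, cpu_dev_id);

  // Allocate the shared parameter once ...
  auto p0 = runtime::NDArray::Empty({4}, {kDLFloat, 32, 1}, {kDLCPU, 0});

  // ... and bind it to both runtimes without copying, so the two graphs
  // read from the same buffer.
  mod_a.GetFunction("set_input_zero_copy", false)("p0", p0);
  mod_b.GetFunction("set_input_zero_copy", false)("p0", p0);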
Parent: 0fa308e9ef
Commit: 224cc243b4
@@ -99,6 +99,8 @@ class NDArray {
   bool defined() const {
     return data_ != nullptr;
   }
+  /*! \return If NDArray is allocated*/
+  inline bool allocated() const;
   /*! \return If both NDArray reference the same container */
   bool same_as(const NDArray& other) const {
     return data_ == other.data_;
@@ -164,11 +166,13 @@ class NDArray {
    * \param shape The shape of the new array.
    * \param dtype The data type of the new array.
    * \param ctx The context of the Array.
+   * \param allocate Allocate memory if true.
    * \return The created Array
    */
   TVM_DLL static NDArray Empty(std::vector<int64_t> shape,
                                DLDataType dtype,
-                               DLContext ctx);
+                               DLContext ctx,
+                               bool allocate = true);
   /*!
    * \brief Create a NDArray backed by a dlpack tensor.
    *
@@ -354,6 +358,10 @@ inline void NDArray::reset() {
   }
 }
 
+inline bool NDArray::allocated() const {
+  return defined() && data_->dl_tensor.data != nullptr;
+}
+
 /*! \brief return the size of data the DLTensor hold, in term of number of bytes
  *
  * \param arr the input DLTensor
@@ -54,7 +54,15 @@ inline size_t GetDataAlignment(const DLTensor& arr) {
 void GraphRuntime::Run() {
   // setup the array and requirements.
   for (size_t i = 0; i < op_execs_.size(); ++i) {
-    if (op_execs_[i]) op_execs_[i]();
+    if (op_execs_[i]) {
+      auto& op_arg = op_args_[i];
+      if (op_arg) {
+        for (auto& arg : op_arg->args) {
+          CHECK(arg.data != nullptr) << "Un-initialized input!";
+        }
+      }
+      op_execs_[i]();
+    }
   }
 }
 /*!
@@ -106,6 +114,8 @@ int GraphRuntime::GetInputIndex(const std::string& name) {
 void GraphRuntime::SetInput(int index, DLTensor* data_in) {
   CHECK_LT(static_cast<size_t>(index), input_nodes_.size());
   uint32_t eid = this->entry_id(input_nodes_[index], 0);
+  CHECK(data_entry_[eid].allocated())
+      << "Invoke 'set_input_zero_copy' for 'lazy_init_input' entry!";
   data_entry_[eid].CopyFrom(data_in);
 }
 /*!
@@ -255,7 +265,14 @@ void GraphRuntime::SetupStorage() {
   for (const std::string& s_type : attrs_.dltype) {
     vtype.push_back(tvm::runtime::String2TVMType(s_type));
   }
+  // get the entry id(s) of lazy initialized inputs
+  std::vector<uint32_t> lazy_init_entries;
+  for (auto const& name : attrs_.lazy_init_input) {
+    int in_idx = GetInputIndex(name);
+    CHECK_GE(in_idx, 0) << "input \"" << name << "\" does not exist!";
+    uint32_t eid = this->entry_id(input_nodes_[in_idx], 0);
+    lazy_init_entries.push_back(eid);
+  }
 
   // Size and device type of each storage pool entry.
   std::vector<PoolEntry> pool_entry;
   // Find the maximum space size.
@@ -286,6 +303,8 @@ void GraphRuntime::SetupStorage() {
     }
     pool_entry[sid].size = std::max(pool_entry[sid].size, bytes);
     pool_entry[sid].device_type = device_type;
+    pool_entry[sid].lazy_init = (std::find(lazy_init_entries.begin(),
+        lazy_init_entries.end(), i) != lazy_init_entries.end());
   }
 
   // Allocate the space.
@@ -300,7 +319,7 @@ void GraphRuntime::SetupStorage() {
     TVMContext ctx = cit == ctxs_.end() ? ctxs_[0] : *cit;
     shape.push_back(static_cast<int64_t>(pit.size + 3) / 4);
     storage_pool_.push_back(
-        NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, ctx));
+        NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, ctx, !pit.lazy_init));
   }
 
   // Assign the pooled entries. A unified memory pool is used to simplifiy
@@ -188,7 +188,8 @@ class GraphRuntime : public ModuleNode {
   struct PoolEntry {
     size_t size;
     int device_type;
-    PoolEntry(int s, int dev_type) : size(s), device_type(dev_type) {}
+    bool lazy_init;
+    PoolEntry(int s, int dev_type) : size(s), device_type(dev_type), lazy_init(false) {}
   };
   // Node entry
   struct NodeEntry {
@@ -277,6 +278,7 @@ class GraphRuntime : public ModuleNode {
     std::vector<int> device_index;
     std::vector<std::string> dltype;
     std::vector<std::vector<int64_t> > shape;
+    std::vector<std::string> lazy_init_input;
    // The graph attribute fields.
    void Load(dmlc::JSONReader *reader) {
      reader->BeginObject();
@@ -318,6 +320,14 @@ class GraphRuntime : public ModuleNode {
        CHECK(reader->NextArrayItem());
        reader->Read(&device_index);
        CHECK(!reader->NextArrayItem());
+      } else if (key == "lazy_init_input") {
+        reader->BeginArray();
+        CHECK(reader->NextArrayItem());
+        reader->Read(&type);
+        CHECK_EQ(type, "list_str");
+        CHECK(reader->NextArrayItem());
+        reader->Read(&lazy_init_input);
+        CHECK(!reader->NextArrayItem());
      } else {
        reader->BeginArray();
        CHECK(reader->NextArrayItem());
@@ -142,14 +142,17 @@ DLManagedTensor* NDArray::ToDLPack() const {
 
 NDArray NDArray::Empty(std::vector<int64_t> shape,
                        DLDataType dtype,
-                       DLContext ctx) {
+                       DLContext ctx,
+                       bool allocate) {
   NDArray ret = Internal::Create(shape, dtype, ctx);
-  // setup memory content
-  size_t size = GetDataSize(ret.data_->dl_tensor);
-  size_t alignment = GetDataAlignment(ret.data_->dl_tensor);
-  ret.data_->dl_tensor.data =
-      DeviceAPI::Get(ret->ctx)->AllocDataSpace(
-          ret->ctx, size, alignment, ret->dtype);
+  if (allocate) {
+    // setup memory content
+    size_t size = GetDataSize(ret.data_->dl_tensor);
+    size_t alignment = GetDataAlignment(ret.data_->dl_tensor);
+    ret.data_->dl_tensor.data =
+        DeviceAPI::Get(ret->ctx)->AllocDataSpace(
+            ret->ctx, size, alignment, ret->dtype);
+  }
   return ret;
 }
 
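A defined NDArray can now exist without a data buffer, and allocated() distinguishes the two states. A quick sketch of that contract (illustration only, not from this commit's tests):

  // An NDArray created with allocate = false has a container but no data
  // buffer until someone provides one (e.g. via set_input_zero_copy).
  auto arr = runtime::NDArray::Empty({4}, {kDLFloat, 32, 1}, {kDLCPU, 0}, false);
  CHECK(arr.defined());     // the container exists
  CHECK(!arr.allocated());  // but dl_tensor.data is still nullptr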
@@ -189,6 +189,77 @@ TEST(BuildModule, Heterogeneous) {
   }
 }
 
+TEST(BuildModule, LazyInitInput) {
+  using namespace tvm;
+
+  const int n = 4;
+  Array<Expr> shape{n};
+
+  auto A = placeholder(shape, Float(32), "A");
+  auto B = placeholder(shape, Float(32), "B");
+
+  auto C = compute(A->shape, [&A, &B](Expr i) {
+    return A[i] + B[i];
+  }, "C");
+
+  auto s = create_schedule({ C->op });
+  auto args = Array<Tensor>({ A, B, C });
+  std::unordered_map<Tensor, Buffer> binds;
+
+  auto config = BuildConfig::Create();
+  auto target = target::llvm();
+
+  auto lowered = lower(s, args, "myadd", binds, config);
+  auto module = build(lowered, target, Target(), config);
+
+  std::string json =
+      "{\"nodes\": [{\"op\": \"null\", \"name\": \"x\", \"inputs\": []}, {\"op\": \"null\", \"name\": \"y\", \"inputs\": []}, "
+      "{\"op\": \"tvm_op\", \"name\": \"add\", \"inputs\": [[0, 0, 0], [1, 0, 0]], \"attrs\": {\"func_name\": "
+      "\"myadd\", \"flatten_data\": \"1\", \"num_inputs\": \"2\", \"num_outputs\": \"1\"}}], "
+      "\"arg_nodes\": [0, 1], \"node_row_ptr\": [0, 1, 2, 3], \"heads\": [[2, 0, 0]], "
+      "\"attrs\": {\"shape\": [\"list_shape\", [[4], [4], [4]]], \"dltype\": [\"list_str\", [\"float32\", \"float32\", \"float32\"]], "
+      "\"storage_id\": [\"list_int\", [0, 1, 2]], \"lazy_init_input\": [\"list_str\", [\"y\"]]}}";
+
+  // Setup inputs.
+  auto a_val = runtime::NDArray::Empty({n}, {kDLFloat, 32, 1}, {kDLCPU, 0});
+  auto b_val = runtime::NDArray::Empty({n}, {kDLFloat, 32, 1}, {kDLCPU, 0});
+
+  auto pa = (float*)a_val.ToDLPack()->dl_tensor.data;
+  auto pb = (float*)b_val.ToDLPack()->dl_tensor.data;
+
+  // Assign values.
+  for (int i = 0; i < n; i++) {
+    pa[i] = pb[i] = i;
+  }
+
+  // Initialize graph runtime.
+  int cpu_dev_ty = static_cast<int>(kDLCPU);
+  int cpu_dev_id = 0;
+
+  const runtime::PackedFunc* graph_runtime =
+      tvm::runtime::Registry::Get("tvm.graph_runtime.create");
+  runtime::Module mod = (*graph_runtime)(json, module, cpu_dev_ty, cpu_dev_id);
+
+  PackedFunc get_input = mod.GetFunction("get_input", false);
+  CHECK(((runtime::NDArray)get_input("x")).allocated());
+  CHECK(!((runtime::NDArray)get_input("y")).allocated());
+
+  PackedFunc set_input = mod.GetFunction("set_input", false);
+  PackedFunc set_input_zero_copy = mod.GetFunction("set_input_zero_copy", false);
+  PackedFunc run = mod.GetFunction("run", false);
+  PackedFunc get_output = mod.GetFunction("get_output", false);
+  set_input("x", a_val);
+  set_input_zero_copy("y", b_val);
+  run();
+  tvm::runtime::NDArray out = get_output(0);
+  float* p_out = (float*)out.ToDLPack()->dl_tensor.data;
+
+  // Check correctness.
+  for (int i = 0; i < n; ++i) {
+    CHECK_LT(std::fabs(p_out[i] - i*2), 1e-5);
+  }
+}
+
 int main(int argc, char ** argv) {
   testing::InitGoogleTest(&argc, argv);
   testing::FLAGS_gtest_death_test_style = "threadsafe";