zmq hangs issue. a tricky solution
This commit is contained in:
Родитель
6091fbac06
Коммит
72fae9524b
|
@ -10,6 +10,8 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Test", "Test\Test.vcxproj",
|
||||||
EndProject
|
EndProject
|
||||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Multiverso", "src\Multiverso.vcxproj", "{16F14058-B116-49D9-8BA0-209F3AFFE849}"
|
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Multiverso", "src\Multiverso.vcxproj", "{16F14058-B116-49D9-8BA0-209F3AFFE849}"
|
||||||
EndProject
|
EndProject
|
||||||
|
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "app", "app", "{E89A2FAD-9A40-49CD-89C1-519721658DF0}"
|
||||||
|
EndProject
|
||||||
Global
|
Global
|
||||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||||
debug_zmq|x64 = debug_zmq|x64
|
debug_zmq|x64 = debug_zmq|x64
|
||||||
|
|
|
@ -29,11 +29,10 @@ public:
|
||||||
// get machine file
|
// get machine file
|
||||||
if (active_) return;
|
if (active_) return;
|
||||||
// CHECK(*argc > 2);
|
// CHECK(*argc > 2);
|
||||||
std::vector<std::string> machine_lists;
|
ParseMachineFile(MV_CONFIG_machine_file, &machine_lists_);
|
||||||
ParseMachineFile(MV_CONFIG_machine_file, &machine_lists);
|
|
||||||
int port = MV_CONFIG_port; // atoi(argv[2]);
|
int port = MV_CONFIG_port; // atoi(argv[2]);
|
||||||
|
|
||||||
size_ = static_cast<int>(machine_lists.size());
|
size_ = static_cast<int>(machine_lists_.size());
|
||||||
CHECK(size_ > 0);
|
CHECK(size_ > 0);
|
||||||
std::unordered_set<std::string> local_ip;
|
std::unordered_set<std::string> local_ip;
|
||||||
net::GetLocalIPAddress(&local_ip);
|
net::GetLocalIPAddress(&local_ip);
|
||||||
|
@ -41,7 +40,7 @@ public:
|
||||||
context_ = zmq_ctx_new();
|
context_ = zmq_ctx_new();
|
||||||
zmq_ctx_set(context_, ZMQ_MAX_SOCKETS, 256);
|
zmq_ctx_set(context_, ZMQ_MAX_SOCKETS, 256);
|
||||||
|
|
||||||
for (auto ip : machine_lists) {
|
for (auto ip : machine_lists_) {
|
||||||
if (local_ip.find(ip) != local_ip.end()) { // my rank
|
if (local_ip.find(ip) != local_ip.end()) { // my rank
|
||||||
rank_ = static_cast<int>(senders_.size());
|
rank_ = static_cast<int>(senders_.size());
|
||||||
senders_.push_back(nullptr);
|
senders_.push_back(nullptr);
|
||||||
|
@ -49,12 +48,16 @@ public:
|
||||||
int rc = zmq_bind(receiver_,
|
int rc = zmq_bind(receiver_,
|
||||||
("tcp://" + ip + ":" + std::to_string(port)).c_str());
|
("tcp://" + ip + ":" + std::to_string(port)).c_str());
|
||||||
CHECK(rc == 0);
|
CHECK(rc == 0);
|
||||||
|
int linger = 0;
|
||||||
|
CHECK(zmq_setsockopt(receiver_, ZMQ_LINGER, &linger, sizeof(linger)) == 0);
|
||||||
} else {
|
} else {
|
||||||
void* senders = zmq_socket(context_, ZMQ_DEALER);
|
void* sender = zmq_socket(context_, ZMQ_DEALER);
|
||||||
int rc = zmq_connect(senders,
|
int rc = zmq_connect(sender,
|
||||||
("tcp://" + ip + ":" + std::to_string(port)).c_str());
|
("tcp://" + ip + ":" + std::to_string(port)).c_str());
|
||||||
CHECK(rc == 0);
|
CHECK(rc == 0);
|
||||||
senders_.push_back(senders);
|
senders_.push_back(sender);
|
||||||
|
int linger = 0;
|
||||||
|
CHECK(zmq_setsockopt(sender, ZMQ_LINGER, &linger, sizeof(linger)) == 0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
CHECK_NOTNULL(receiver_);
|
CHECK_NOTNULL(receiver_);
|
||||||
|
@ -71,6 +74,8 @@ public:
|
||||||
receiver_ = zmq_socket(context_, ZMQ_DEALER);
|
receiver_ = zmq_socket(context_, ZMQ_DEALER);
|
||||||
int rc = zmq_bind(receiver_, ("tcp://" + ip_port).c_str());
|
int rc = zmq_bind(receiver_, ("tcp://" + ip_port).c_str());
|
||||||
if (rc == 0) {
|
if (rc == 0) {
|
||||||
|
int linger = 0;
|
||||||
|
CHECK(zmq_setsockopt(receiver_, ZMQ_LINGER, &linger, sizeof(linger)) == 0);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
|
@ -91,6 +96,7 @@ public:
|
||||||
if (rank == rank_) continue;
|
if (rank == rank_) continue;
|
||||||
std::string ip_port(endpoints[i]);
|
std::string ip_port(endpoints[i]);
|
||||||
senders_[rank] = zmq_socket(context_, ZMQ_DEALER);
|
senders_[rank] = zmq_socket(context_, ZMQ_DEALER);
|
||||||
|
// NOTE(feiga): set linger to 0, otherwise will hang
|
||||||
int rc = zmq_connect(senders_[rank], ("tcp://" + ip_port).c_str());
|
int rc = zmq_connect(senders_[rank], ("tcp://" + ip_port).c_str());
|
||||||
if (rc != 0) {
|
if (rc != 0) {
|
||||||
Log::Error("Failed to connect the socket for sender, rank = %d, "
|
Log::Error("Failed to connect the socket for sender, rank = %d, "
|
||||||
|
@ -104,10 +110,22 @@ public:
|
||||||
|
|
||||||
void Finalize() override {
|
void Finalize() override {
|
||||||
active_ = false;
|
active_ = false;
|
||||||
// zmq_term(context_);
|
for (int i = 0; i < senders_.size(); ++i) {
|
||||||
zmq_close(receiver_);
|
if (i != rank_) {
|
||||||
for (auto& p : senders_) if (p) zmq_close(p);
|
int linger = 0;
|
||||||
zmq_ctx_destroy(context_);
|
CHECK(zmq_setsockopt(senders_[i], ZMQ_LINGER, &linger, sizeof(linger)) == 0);
|
||||||
|
int rc = zmq_close(senders_[i]);
|
||||||
|
CHECK(rc == 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
int linger = 0;
|
||||||
|
CHECK(zmq_setsockopt(receiver_, ZMQ_LINGER, &linger, sizeof(linger)) == 0);
|
||||||
|
int rc = zmq_close(receiver_);
|
||||||
|
CHECK(rc == 0);
|
||||||
|
|
||||||
|
CHECK(zmq_ctx_shutdown(context_)==0);
|
||||||
|
// zmq_ctx_term(context_);
|
||||||
|
Log::Info("zmq finalize: close context\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
bool active() const override { return active_; }
|
bool active() const override { return active_; }
|
||||||
|
@ -151,7 +169,7 @@ public:
|
||||||
MessagePtr& msg = *msg_ptr;
|
MessagePtr& msg = *msg_ptr;
|
||||||
msg->data().clear();
|
msg->data().clear();
|
||||||
CHECK(msg.get());
|
CHECK(msg.get());
|
||||||
recv_size = zmq_recv(receiver_, msg->header(), Message::kHeaderSize, 0);
|
recv_size = zmq_recv(receiver_, msg->header(), Message::kHeaderSize, ZMQ_DONTWAIT);
|
||||||
if (recv_size < 0) { return -1; }
|
if (recv_size < 0) { return -1; }
|
||||||
CHECK(Message::kHeaderSize == recv_size);
|
CHECK(Message::kHeaderSize == recv_size);
|
||||||
|
|
||||||
|
@ -236,6 +254,7 @@ protected:
|
||||||
std::vector<void*> senders_;
|
std::vector<void*> senders_;
|
||||||
int rank_;
|
int rank_;
|
||||||
int size_;
|
int size_;
|
||||||
|
std::vector<std::string> machine_lists_;
|
||||||
};
|
};
|
||||||
} // namespace multiverso
|
} // namespace multiverso
|
||||||
|
|
||||||
|
|
|
@ -33,7 +33,7 @@ Communicator::Communicator() : Actor(actor::kCommunicator) {
|
||||||
}
|
}
|
||||||
|
|
||||||
Communicator::~Communicator() {
|
Communicator::~Communicator() {
|
||||||
|
recv_thread_->join();
|
||||||
}
|
}
|
||||||
|
|
||||||
void Communicator::Main() {
|
void Communicator::Main() {
|
||||||
|
@ -78,8 +78,9 @@ void Communicator::Communicate() {
|
||||||
MessagePtr msg(new Message());
|
MessagePtr msg(new Message());
|
||||||
size_t size = net_util_->Recv(&msg);
|
size_t size = net_util_->Recv(&msg);
|
||||||
if (size == -1) {
|
if (size == -1) {
|
||||||
Log::Debug("recv return -1\n");
|
continue;
|
||||||
break;
|
// Log::Debug("recv return -1\n");
|
||||||
|
// break;
|
||||||
}
|
}
|
||||||
if (size > 0) {
|
if (size > 0) {
|
||||||
// a message received
|
// a message received
|
||||||
|
|
|
@ -50,6 +50,8 @@ void Zoo::Stop(bool finalize_net) {
|
||||||
if (!MV_CONFIG_ma) { StopPS(); }
|
if (!MV_CONFIG_ma) { StopPS(); }
|
||||||
// Stop the network
|
// Stop the network
|
||||||
if (finalize_net) net_util_->Finalize();
|
if (finalize_net) net_util_->Finalize();
|
||||||
|
for (auto actor : zoo_) delete actor.second;
|
||||||
|
Log::Info("Multiverso Shutdown successfully\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
int Zoo::rank() const { return NetInterface::Get()->rank(); }
|
int Zoo::rank() const { return NetInterface::Get()->rank(); }
|
||||||
|
|
Загрузка…
Ссылка в новой задаче