зеркало из https://github.com/microsoft/CCF.git
Raft fixes: Rollback safety despite NACKs, and only ACK up to incoming AE (#6016)
This commit is contained in:
Родитель
74326effa7
Коммит
443309e10d
|
@ -2,4 +2,4 @@
|
||||||
(- -) (= =) | Y & +--?
|
(- -) (= =) | Y & +--?
|
||||||
( V ) / . \ | +---=---'
|
( V ) / . \ | +---=---'
|
||||||
/--x-m- /--n-n---xXx--/--yY------>>>----<<<>>]]{{}}---||-/\---..
|
/--x-m- /--n-n---xXx--/--yY------>>>----<<<>>]]{{}}---||-/\---..
|
||||||
2024
|
2024!
|
|
@ -1 +1 @@
|
||||||
.............
|
..............
|
12
CHANGELOG.md
12
CHANGELOG.md
|
@ -5,6 +5,18 @@ All notable changes to this project will be documented in this file.
|
||||||
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
|
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
|
||||||
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
|
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
|
||||||
|
|
||||||
|
## Unreleased
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
|
||||||
|
- Improvements to the Raft implementation, to retain commit safety and liveness despite message loss (#6016).
|
||||||
|
|
||||||
|
### Added
|
||||||
|
|
||||||
|
- Added 2 new log lines which may be helpful diagnostics in production deployments, both including a new `[rollback]` tag.
|
||||||
|
- `[rollback] ... Dropping conflicting branch` may be emitted after network partitions, and indicates that some `Pending` (non-committed) transactions have been lost. This is expected, but worth investigating if it occurs regularly - it is a sign of elections impacting service availability.
|
||||||
|
- `[rollback] ... Ignoring conflicting AppendEntries` could also be emitted after a network partition, but **should be reported to the CCF development team**. It is a sign of an unexpected execution path, which could lead to loss of liveness (inability to advance commit).
|
||||||
|
|
||||||
## [5.0.0-dev12]
|
## [5.0.0-dev12]
|
||||||
|
|
||||||
[5.0.0-dev12]: https://github.com/microsoft/CCF/releases/tag/ccf-5.0.0-dev12
|
[5.0.0-dev12]: https://github.com/microsoft/CCF/releases/tag/ccf-5.0.0-dev12
|
||||||
|
|
|
@ -63,6 +63,8 @@
|
||||||
#define RAFT_TRACE_JSON_OUT(json_object) \
|
#define RAFT_TRACE_JSON_OUT(json_object) \
|
||||||
CCF_LOG_OUT(DEBUG, "raft_trace") << json_object
|
CCF_LOG_OUT(DEBUG, "raft_trace") << json_object
|
||||||
|
|
||||||
|
#define LOG_ROLLBACK_INFO_FMT CCF_LOG_FMT(INFO, "rollback")
|
||||||
|
|
||||||
namespace aft
|
namespace aft
|
||||||
{
|
{
|
||||||
using Configuration = kv::Configuration;
|
using Configuration = kv::Configuration;
|
||||||
|
@ -1061,7 +1063,7 @@ namespace aft
|
||||||
from,
|
from,
|
||||||
state->current_view,
|
state->current_view,
|
||||||
r.term);
|
r.term);
|
||||||
send_append_entries_response(from, AppendEntriesResponseType::FAIL);
|
send_append_entries_response_nack(from);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1082,7 +1084,7 @@ namespace aft
|
||||||
state->node_id,
|
state->node_id,
|
||||||
from,
|
from,
|
||||||
r.prev_idx);
|
r.prev_idx);
|
||||||
send_append_entries_response(from, AppendEntriesResponseType::FAIL);
|
send_append_entries_response_nack(from);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -1095,8 +1097,7 @@ namespace aft
|
||||||
prev_term,
|
prev_term,
|
||||||
r.prev_term);
|
r.prev_term);
|
||||||
const ccf::TxID rejected_tx{r.prev_term, r.prev_idx};
|
const ccf::TxID rejected_tx{r.prev_term, r.prev_idx};
|
||||||
send_append_entries_response(
|
send_append_entries_response_nack(from, rejected_tx);
|
||||||
from, AppendEntriesResponseType::FAIL, rejected_tx);
|
|
||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -1109,7 +1110,7 @@ namespace aft
|
||||||
assert(state->retirement_committable_idx.has_value());
|
assert(state->retirement_committable_idx.has_value());
|
||||||
if (r.idx > state->retirement_committable_idx)
|
if (r.idx > state->retirement_committable_idx)
|
||||||
{
|
{
|
||||||
send_append_entries_response(from, AppendEntriesResponseType::FAIL);
|
send_append_entries_response_nack(from);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1155,26 +1156,6 @@ namespace aft
|
||||||
r.idx,
|
r.idx,
|
||||||
r.prev_idx);
|
r.prev_idx);
|
||||||
|
|
||||||
if (is_new_follower)
|
|
||||||
{
|
|
||||||
if (state->last_idx > r.prev_idx)
|
|
||||||
{
|
|
||||||
RAFT_DEBUG_FMT(
|
|
||||||
"New follower received first append entries with mismatch - "
|
|
||||||
"rolling back from {} to {}",
|
|
||||||
state->last_idx,
|
|
||||||
r.prev_idx);
|
|
||||||
auto rollback_level = r.prev_idx;
|
|
||||||
rollback(rollback_level);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
RAFT_DEBUG_FMT(
|
|
||||||
"New follower has no conflict with prev_idx {}", r.prev_idx);
|
|
||||||
}
|
|
||||||
is_new_follower = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<
|
std::vector<
|
||||||
std::tuple<std::unique_ptr<kv::AbstractExecutionWrapper>, kv::Version>>
|
std::tuple<std::unique_ptr<kv::AbstractExecutionWrapper>, kv::Version>>
|
||||||
append_entries;
|
append_entries;
|
||||||
|
@ -1182,12 +1163,64 @@ namespace aft
|
||||||
for (Index i = r.prev_idx + 1; i <= r.idx; i++)
|
for (Index i = r.prev_idx + 1; i <= r.idx; i++)
|
||||||
{
|
{
|
||||||
if (i <= state->last_idx)
|
if (i <= state->last_idx)
|
||||||
|
{
|
||||||
|
// NB: This is only safe as long as AppendEntries only contain a
|
||||||
|
// single term. If they cover multiple terms, then we would need to at
|
||||||
|
// least partially deserialise each entry to establish what term it is
|
||||||
|
// in (or report the terms in the header)
|
||||||
|
static_assert(
|
||||||
|
max_terms_per_append_entries == 1,
|
||||||
|
"AppendEntries rollback logic assumes single term");
|
||||||
|
const auto incoming_term = r.term_of_idx;
|
||||||
|
const auto local_term = state->view_history.view_at(i);
|
||||||
|
if (incoming_term != local_term)
|
||||||
|
{
|
||||||
|
if (is_new_follower)
|
||||||
|
{
|
||||||
|
auto rollback_level = i - 1;
|
||||||
|
RAFT_DEBUG_FMT(
|
||||||
|
"New follower received AppendEntries with conflict. Incoming "
|
||||||
|
"entry {}.{} conflicts with local {}.{}. Rolling back to {}.",
|
||||||
|
incoming_term,
|
||||||
|
i,
|
||||||
|
local_term,
|
||||||
|
i,
|
||||||
|
rollback_level);
|
||||||
|
LOG_ROLLBACK_INFO_FMT(
|
||||||
|
"Dropping conflicting branch. Rolling back {} entries, "
|
||||||
|
"beginning with {}.{}.",
|
||||||
|
state->last_idx - rollback_level,
|
||||||
|
local_term,
|
||||||
|
i);
|
||||||
|
rollback(rollback_level);
|
||||||
|
is_new_follower = false;
|
||||||
|
// Then continue to process this AE as normal
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// We have a node retaining a conflicting suffix, and refusing to
|
||||||
|
// roll it back. It will remain divergent (not contributing to
|
||||||
|
// commit) this term, and can only be brought in-sync in a future
|
||||||
|
// term.
|
||||||
|
// This log is emitted as a canary, for what we hope is an
|
||||||
|
// unreachable branch. If it is ever seen we should revisit this.
|
||||||
|
LOG_ROLLBACK_INFO_FMT(
|
||||||
|
"Ignoring conflicting AppendEntries. Retaining {} entries, "
|
||||||
|
"beginning with {}.{}.",
|
||||||
|
state->last_idx - (i - 1),
|
||||||
|
local_term,
|
||||||
|
i);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
{
|
{
|
||||||
// If the current entry has already been deserialised, skip the
|
// If the current entry has already been deserialised, skip the
|
||||||
// payload for that entry
|
// payload for that entry
|
||||||
ledger->skip_entry(data, size);
|
ledger->skip_entry(data, size);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
std::vector<uint8_t> entry;
|
std::vector<uint8_t> entry;
|
||||||
try
|
try
|
||||||
|
@ -1202,7 +1235,7 @@ namespace aft
|
||||||
state->node_id,
|
state->node_id,
|
||||||
from,
|
from,
|
||||||
e.what());
|
e.what());
|
||||||
send_append_entries_response(from, AppendEntriesResponseType::FAIL);
|
send_append_entries_response_nack(from);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1215,9 +1248,10 @@ namespace aft
|
||||||
"deserialised",
|
"deserialised",
|
||||||
state->node_id,
|
state->node_id,
|
||||||
from);
|
from);
|
||||||
send_append_entries_response(from, AppendEntriesResponseType::FAIL);
|
send_append_entries_response_nack(from);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
append_entries.push_back(std::make_tuple(std::move(ds), i));
|
append_entries.push_back(std::make_tuple(std::move(ds), i));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1251,7 +1285,7 @@ namespace aft
|
||||||
if (apply_success == kv::ApplyResult::FAIL)
|
if (apply_success == kv::ApplyResult::FAIL)
|
||||||
{
|
{
|
||||||
ledger->truncate(i - 1);
|
ledger->truncate(i - 1);
|
||||||
send_append_entries_response(from, AppendEntriesResponseType::FAIL);
|
send_append_entries_response_nack(from);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
state->last_idx = i;
|
state->last_idx = i;
|
||||||
|
@ -1280,7 +1314,7 @@ namespace aft
|
||||||
RAFT_FAIL_FMT("Follower failed to apply log entry: {}", i);
|
RAFT_FAIL_FMT("Follower failed to apply log entry: {}", i);
|
||||||
state->last_idx--;
|
state->last_idx--;
|
||||||
ledger->truncate(state->last_idx);
|
ledger->truncate(state->last_idx);
|
||||||
send_append_entries_response(from, AppendEntriesResponseType::FAIL);
|
send_append_entries_response_nack(from);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1315,6 +1349,7 @@ namespace aft
|
||||||
"term");
|
"term");
|
||||||
state->view_history.update(r.prev_idx + 1, ds->get_term());
|
state->view_history.update(r.prev_idx + 1, ds->get_term());
|
||||||
}
|
}
|
||||||
|
|
||||||
commit_if_possible(r.leader_commit_idx);
|
commit_if_possible(r.leader_commit_idx);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -1369,29 +1404,51 @@ namespace aft
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
send_append_entries_response(from, AppendEntriesResponseType::OK);
|
send_append_entries_response_ack(from, r);
|
||||||
|
}
|
||||||
|
|
||||||
|
void send_append_entries_response_ack(
|
||||||
|
ccf::NodeId to, const AppendEntries& ae)
|
||||||
|
{
|
||||||
|
// If we get to here, we have applied up to r.idx in this AppendEntries.
|
||||||
|
// We must only ACK this far, as we know nothing about the agreement of a
|
||||||
|
// suffix we may still hold _after_ r.idx with the leader's log
|
||||||
|
const auto response_idx = ae.idx;
|
||||||
|
send_append_entries_response(
|
||||||
|
to, AppendEntriesResponseType::OK, state->current_view, response_idx);
|
||||||
|
}
|
||||||
|
|
||||||
|
void send_append_entries_response_nack(
|
||||||
|
ccf::NodeId to, const ccf::TxID& rejected)
|
||||||
|
{
|
||||||
|
const auto response_idx = find_highest_possible_match(rejected);
|
||||||
|
const auto response_term = get_term_internal(response_idx);
|
||||||
|
|
||||||
|
send_append_entries_response(
|
||||||
|
to, AppendEntriesResponseType::FAIL, response_term, response_idx);
|
||||||
|
}
|
||||||
|
|
||||||
|
void send_append_entries_response_nack(ccf::NodeId to)
|
||||||
|
{
|
||||||
|
send_append_entries_response(
|
||||||
|
to,
|
||||||
|
AppendEntriesResponseType::FAIL,
|
||||||
|
state->current_view,
|
||||||
|
state->last_idx);
|
||||||
}
|
}
|
||||||
|
|
||||||
void send_append_entries_response(
|
void send_append_entries_response(
|
||||||
ccf::NodeId to,
|
ccf::NodeId to,
|
||||||
AppendEntriesResponseType answer,
|
AppendEntriesResponseType answer,
|
||||||
const std::optional<ccf::TxID>& rejected = std::nullopt)
|
aft::Term response_term,
|
||||||
|
aft::Index response_idx)
|
||||||
{
|
{
|
||||||
aft::Index response_idx = state->last_idx;
|
|
||||||
aft::Term response_term = state->current_view;
|
|
||||||
|
|
||||||
if (answer == AppendEntriesResponseType::FAIL && rejected.has_value())
|
|
||||||
{
|
|
||||||
response_idx = find_highest_possible_match(rejected.value());
|
|
||||||
response_term = get_term_internal(response_idx);
|
|
||||||
}
|
|
||||||
|
|
||||||
RAFT_DEBUG_FMT(
|
RAFT_DEBUG_FMT(
|
||||||
"Send append entries response from {} to {} for index {}: {}",
|
"Send append entries response from {} to {} for index {}: {}",
|
||||||
state->node_id,
|
state->node_id,
|
||||||
to,
|
to,
|
||||||
response_idx,
|
response_idx,
|
||||||
answer);
|
(answer == AppendEntriesResponseType::OK ? "ACK" : "NACK"));
|
||||||
|
|
||||||
AppendEntriesResponse response{
|
AppendEntriesResponse response{
|
||||||
.term = response_term,
|
.term = response_term,
|
||||||
|
@ -1516,20 +1573,10 @@ namespace aft
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
// Potentially unnecessary safety check - use min with last_idx, to
|
// max(...) because why would we ever want to go backwards on a success
|
||||||
// prevent matches past this node's local knowledge
|
// response?!
|
||||||
const auto proposed_match = std::min(r.last_log_idx, state->last_idx);
|
node->second.match_idx =
|
||||||
if (proposed_match < node->second.match_idx)
|
std::max(node->second.match_idx, r.last_log_idx);
|
||||||
{
|
|
||||||
RAFT_FAIL_FMT(
|
|
||||||
"Append entries response to {} from {} attempting to move "
|
|
||||||
"match_idx backwards ({} -> {})",
|
|
||||||
state->node_id,
|
|
||||||
from,
|
|
||||||
node->second.match_idx,
|
|
||||||
proposed_match);
|
|
||||||
}
|
|
||||||
node->second.match_idx = proposed_match;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
RAFT_DEBUG_FMT(
|
RAFT_DEBUG_FMT(
|
||||||
|
|
|
@ -191,6 +191,10 @@ int main(int argc, char** argv)
|
||||||
skip_invariants = true;
|
skip_invariants = true;
|
||||||
driver->summarise_logs_all();
|
driver->summarise_logs_all();
|
||||||
break;
|
break;
|
||||||
|
case shash("summarise_messages"):
|
||||||
|
assert(items.size() == 3);
|
||||||
|
driver->summarise_messages(items[1], items[2]);
|
||||||
|
break;
|
||||||
case shash("shuffle_one"):
|
case shash("shuffle_one"):
|
||||||
assert(items.size() == 2);
|
assert(items.size() == 2);
|
||||||
driver->shuffle_messages_one(items[1]);
|
driver->shuffle_messages_one(items[1]);
|
||||||
|
|
|
@ -572,6 +572,75 @@ public:
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::string get_message_summary(const std::vector<uint8_t>& contents)
|
||||||
|
{
|
||||||
|
const uint8_t* data = contents.data();
|
||||||
|
size_t size = contents.size();
|
||||||
|
|
||||||
|
const auto msg_type = serialized::peek<aft::RaftMsgType>(data, size);
|
||||||
|
switch (msg_type)
|
||||||
|
{
|
||||||
|
case (aft::RaftMsgType::raft_request_vote):
|
||||||
|
{
|
||||||
|
return "RV";
|
||||||
|
}
|
||||||
|
case (aft::RaftMsgType::raft_request_vote_response):
|
||||||
|
{
|
||||||
|
return "RVR";
|
||||||
|
}
|
||||||
|
case (aft::RaftMsgType::raft_append_entries):
|
||||||
|
{
|
||||||
|
auto ae = *(aft::AppendEntries*)data;
|
||||||
|
return fmt::format(
|
||||||
|
"AE(t{}, ({}.{}, {}.{}])",
|
||||||
|
ae.term,
|
||||||
|
ae.prev_term,
|
||||||
|
ae.prev_idx,
|
||||||
|
ae.term_of_idx,
|
||||||
|
ae.idx);
|
||||||
|
}
|
||||||
|
case (aft::RaftMsgType::raft_append_entries_response):
|
||||||
|
{
|
||||||
|
auto aer = *(aft::AppendEntriesResponse*)data;
|
||||||
|
return fmt::format(
|
||||||
|
"AER({}, t{}, i{})",
|
||||||
|
aer.success == aft::AppendEntriesResponseType::OK ? "ACK" : "NACK",
|
||||||
|
aer.term,
|
||||||
|
aer.last_log_idx);
|
||||||
|
}
|
||||||
|
case (aft::RaftMsgType::raft_propose_request_vote):
|
||||||
|
{
|
||||||
|
return "PRV";
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
{
|
||||||
|
throw std::runtime_error(
|
||||||
|
fmt::format("Unhandled RaftMsgType: {}", msg_type));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void summarise_messages(ccf::NodeId src, ccf::NodeId dst)
|
||||||
|
{
|
||||||
|
auto raft = _nodes.at(src).raft;
|
||||||
|
auto& messages = channel_stub_proxy(*raft)->messages;
|
||||||
|
std::vector<std::string> message_reps;
|
||||||
|
for (const auto& [target, raw_msg] : messages)
|
||||||
|
{
|
||||||
|
if (target == dst)
|
||||||
|
{
|
||||||
|
message_reps.push_back(get_message_summary(raw_msg));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
RAFT_DRIVER_PRINT(
|
||||||
|
"Note right of {}: {} message(s) to {} = [{}]",
|
||||||
|
src,
|
||||||
|
message_reps.size(),
|
||||||
|
dst,
|
||||||
|
fmt::join(message_reps, ", "));
|
||||||
|
}
|
||||||
|
|
||||||
void state_one(ccf::NodeId node_id)
|
void state_one(ccf::NodeId node_id)
|
||||||
{
|
{
|
||||||
auto raft = _nodes.at(node_id).raft;
|
auto raft = _nodes.at(node_id).raft;
|
||||||
|
|
|
@ -0,0 +1,128 @@
|
||||||
|
# Start a 5-node network
|
||||||
|
start_node,0
|
||||||
|
assert_detail,0,leadership_state,Leader
|
||||||
|
emit_signature,2
|
||||||
|
|
||||||
|
assert_detail,0,leadership_state,Leader
|
||||||
|
assert_commit_idx,0,2
|
||||||
|
|
||||||
|
trust_nodes,2,1,2,3,4
|
||||||
|
emit_signature,2
|
||||||
|
|
||||||
|
dispatch_all
|
||||||
|
periodic_all,10
|
||||||
|
dispatch_all
|
||||||
|
assert_commit_idx,0,4
|
||||||
|
periodic_all,10
|
||||||
|
dispatch_all
|
||||||
|
|
||||||
|
assert_state_sync
|
||||||
|
|
||||||
|
state_all
|
||||||
|
|
||||||
|
# Node 0 produces an unshared suffix
|
||||||
|
emit_signature,2
|
||||||
|
emit_signature,2
|
||||||
|
emit_signature,2
|
||||||
|
|
||||||
|
# Node 1 calls and wins an election
|
||||||
|
periodic_one,1,100
|
||||||
|
assert_detail,1,leadership_state,Candidate
|
||||||
|
dispatch_single,1,2
|
||||||
|
dispatch_single,2,1
|
||||||
|
dispatch_single,1,3
|
||||||
|
dispatch_single,3,1
|
||||||
|
assert_detail,1,leadership_state,Leader
|
||||||
|
|
||||||
|
# Node 1 produces a conflicting suffix
|
||||||
|
emit_signature,3
|
||||||
|
emit_signature,3
|
||||||
|
emit_signature,3
|
||||||
|
|
||||||
|
# Node 1 tries to share this
|
||||||
|
periodic_one,1,10
|
||||||
|
drop_pending,1 # First forget prior messages
|
||||||
|
periodic_one,1,10 # This is an empty heartbeat
|
||||||
|
dispatch_single,1,0
|
||||||
|
|
||||||
|
# The 0->1 channel now contains a NACK, which we take great care to retain for a while!
|
||||||
|
summarise_messages,0,1
|
||||||
|
|
||||||
|
# Node 1 shares their suffix successfully with node 2
|
||||||
|
dispatch_single,1,2
|
||||||
|
dispatch_single,2,1
|
||||||
|
periodic_one,1,10
|
||||||
|
dispatch_single,1,2
|
||||||
|
|
||||||
|
# Node 2 calls and wins an election
|
||||||
|
periodic_one,2,100
|
||||||
|
assert_detail,2,leadership_state,Candidate
|
||||||
|
dispatch_single,2,3
|
||||||
|
dispatch_single,3,2
|
||||||
|
dispatch_single,2,4
|
||||||
|
dispatch_single,4,2
|
||||||
|
assert_detail,1,leadership_state,Leader
|
||||||
|
|
||||||
|
# Node 2 updates node 0 with the suffix produced by node 1
|
||||||
|
drop_pending,2
|
||||||
|
periodic_one,2,10
|
||||||
|
dispatch_one,2
|
||||||
|
dispatch_single,0,2
|
||||||
|
periodic_one,2,10
|
||||||
|
dispatch_one,2
|
||||||
|
dispatch_single,0,2
|
||||||
|
periodic_one,2,10
|
||||||
|
dispatch_one,2
|
||||||
|
dispatch_single,0,2
|
||||||
|
|
||||||
|
# Node 2 extends this suffix, shares it with node 0
|
||||||
|
emit_signature,4
|
||||||
|
emit_signature,4
|
||||||
|
periodic_one,2,10
|
||||||
|
dispatch_single,2,0
|
||||||
|
|
||||||
|
# Node 1 calls and wins an election
|
||||||
|
drop_pending,1
|
||||||
|
periodic_one,1,100
|
||||||
|
assert_detail,1,leadership_state,Candidate
|
||||||
|
dispatch_single,1,3
|
||||||
|
dispatch_single,3,1
|
||||||
|
dispatch_single,1,4
|
||||||
|
dispatch_single,4,1
|
||||||
|
assert_detail,1,leadership_state,Leader
|
||||||
|
drop_pending,1
|
||||||
|
|
||||||
|
# Node 1 creates an alternative branch from 2's recent suffix
|
||||||
|
emit_signature,5
|
||||||
|
emit_signature,5
|
||||||
|
|
||||||
|
# Node 1 shares this with 3, so that it is only needs 1 more ACK to advance commit
|
||||||
|
periodic_one,1,10
|
||||||
|
dispatch_single,1,3
|
||||||
|
dispatch_single,3,1
|
||||||
|
periodic_one,1,10
|
||||||
|
dispatch_single,1,3
|
||||||
|
dispatch_single,1,3
|
||||||
|
dispatch_single,3,1
|
||||||
|
dispatch_single,3,1
|
||||||
|
|
||||||
|
# Node 1 finally receives that stale NACK from 0
|
||||||
|
drop_pending,1
|
||||||
|
state_all
|
||||||
|
summarise_messages,0,1
|
||||||
|
dispatch_single,0,1
|
||||||
|
summarise_messages,0,1
|
||||||
|
|
||||||
|
# Node 1 now has an underestimate of its match with 0, so will produce some redundant (matching) AEs
|
||||||
|
periodic_one,1,10
|
||||||
|
|
||||||
|
# The lead AE in the 1->0 should now be a NOP, containing entirely entries that 0 already has
|
||||||
|
dispatch_single,1,0
|
||||||
|
|
||||||
|
# What did 0 produce as the response, and how does node 1 handle that?
|
||||||
|
dispatch_single,0,1
|
||||||
|
|
||||||
|
summarise_logs_all
|
||||||
|
state_all
|
||||||
|
|
||||||
|
assert_commit_safety,1
|
|
@ -0,0 +1,50 @@
|
||||||
|
# Start a 3-node network
|
||||||
|
start_node,0
|
||||||
|
assert_detail,0,leadership_state,Leader
|
||||||
|
emit_signature,2
|
||||||
|
|
||||||
|
assert_detail,0,leadership_state,Leader
|
||||||
|
assert_commit_idx,0,2
|
||||||
|
|
||||||
|
trust_nodes,2,1,2
|
||||||
|
emit_signature,2
|
||||||
|
|
||||||
|
dispatch_all
|
||||||
|
periodic_all,10
|
||||||
|
dispatch_all
|
||||||
|
assert_commit_idx,0,4
|
||||||
|
periodic_all,10
|
||||||
|
dispatch_all
|
||||||
|
|
||||||
|
assert_state_sync
|
||||||
|
|
||||||
|
state_all
|
||||||
|
|
||||||
|
# Original primary node 0 produces a small suffix, doesn't get to share it
|
||||||
|
emit_signature,2
|
||||||
|
emit_signature,2
|
||||||
|
|
||||||
|
# Node 1 calls and wins an election
|
||||||
|
periodic_one,1,100
|
||||||
|
assert_detail,1,leadership_state,Candidate
|
||||||
|
dispatch_one,1
|
||||||
|
dispatch_one,2
|
||||||
|
assert_detail,1,leadership_state,Leader
|
||||||
|
|
||||||
|
# Node 1 produces multiple heartbeat AEs before emitting a signature!
|
||||||
|
periodic_one,1,10
|
||||||
|
periodic_one,1,10
|
||||||
|
emit_signature,3
|
||||||
|
|
||||||
|
# With a few iterations node 1 should be able to bring all nodes in-sync
|
||||||
|
periodic_all,10
|
||||||
|
dispatch_all
|
||||||
|
|
||||||
|
periodic_all,10
|
||||||
|
dispatch_all
|
||||||
|
|
||||||
|
periodic_all,10
|
||||||
|
dispatch_all
|
||||||
|
|
||||||
|
state_all
|
||||||
|
assert_state_sync
|
|
@ -0,0 +1,122 @@
|
||||||
|
# Set up 5 node network
|
||||||
|
start_node,0
|
||||||
|
assert_detail,0,leadership_state,Leader
|
||||||
|
emit_signature,2
|
||||||
|
|
||||||
|
assert_detail,0,leadership_state,Leader
|
||||||
|
assert_commit_idx,0,2
|
||||||
|
|
||||||
|
trust_nodes,2,1,2,3,4
|
||||||
|
emit_signature,2
|
||||||
|
|
||||||
|
dispatch_all
|
||||||
|
|
||||||
|
periodic_all,10
|
||||||
|
dispatch_all
|
||||||
|
|
||||||
|
periodic_all,10
|
||||||
|
dispatch_all
|
||||||
|
|
||||||
|
assert_commit_idx,0,4
|
||||||
|
|
||||||
|
assert_state_sync
|
||||||
|
# All nodes in agreement
|
||||||
|
|
||||||
|
# Nodes 3 and 4 are partitioned, so the remaining 3 need unanimity for commit
|
||||||
|
disconnect_node,3
|
||||||
|
disconnect_node,4
|
||||||
|
|
||||||
|
# Node 0 produces a suffix
|
||||||
|
replicate,2,hello
|
||||||
|
emit_signature,2
|
||||||
|
replicate,2,world
|
||||||
|
emit_signature,2
|
||||||
|
|
||||||
|
# Node 0 shares this suffix with 1
|
||||||
|
periodic_one,0,10
|
||||||
|
drop_pending_to,0,2
|
||||||
|
dispatch_one,0
|
||||||
|
|
||||||
|
# Node 1 calls and wins an election
|
||||||
|
periodic_one,1,100
|
||||||
|
assert_detail,1,leadership_state,Candidate
|
||||||
|
dispatch_one,1
|
||||||
|
dispatch_one,0
|
||||||
|
dispatch_one,2
|
||||||
|
assert_detail,1,leadership_state,Leader
|
||||||
|
|
||||||
|
# Node 1 sends a late heartbeat, producing a NACK
|
||||||
|
drop_pending_to,1,0
|
||||||
|
drop_pending_to,2,1 # Ensure nothing else is in the message queue
|
||||||
|
dispatch_one,1
|
||||||
|
|
||||||
|
# NB: The 2->1 channel now contains a NACK at 2.4.
|
||||||
|
# We need to keep this around, and don't touch this channel for a while
|
||||||
|
|
||||||
|
# Node 0 calls and wins an election
|
||||||
|
periodic_one,0,100
|
||||||
|
assert_detail,0,leadership_state,Candidate
|
||||||
|
dispatch_one,0
|
||||||
|
dispatch_one,1
|
||||||
|
dispatch_single,2,0
|
||||||
|
assert_detail,0,leadership_state,Leader
|
||||||
|
|
||||||
|
# Node 0 emits a fresh signature, so that they can advance commit
|
||||||
|
emit_signature,4
|
||||||
|
|
||||||
|
# Node 0 sends heartbeats
|
||||||
|
periodic_one,0,10
|
||||||
|
dispatch_one,0
|
||||||
|
|
||||||
|
# Node 0 receives responses, setting initial next_index
|
||||||
|
dispatch_one,1
|
||||||
|
dispatch_single,2,0 # NB: dispatch_single, to avoid touching the NACK for 1
|
||||||
|
dispatch_single,2,0
|
||||||
|
|
||||||
|
# Node 0 sends useful AEs
|
||||||
|
periodic_one,0,10
|
||||||
|
dispatch_one,0
|
||||||
|
|
||||||
|
# Node 0 receives response, advancing commit
|
||||||
|
dispatch_one,1
|
||||||
|
dispatch_single,2,0
|
||||||
|
dispatch_single,2,0
|
||||||
|
|
||||||
|
assert_commit_idx,0,9
|
||||||
|
|
||||||
|
# Note that the peers have not yet advanced commit
|
||||||
|
assert_commit_idx,1,4
|
||||||
|
assert_commit_idx,2,4
|
||||||
|
|
||||||
|
assert_commit_safety,0
|
||||||
|
|
||||||
|
# Node 1 calls and wins an election
|
||||||
|
# To win, it now needs a vote from one of the previous dead nodes (we can't touch the 2->1 channel!)
|
||||||
|
reconnect_node,3
|
||||||
|
periodic_one,1,100
|
||||||
|
assert_detail,1,leadership_state,Candidate
|
||||||
|
dispatch_one,1
|
||||||
|
dispatch_one,0
|
||||||
|
dispatch_one,3
|
||||||
|
assert_detail,1,leadership_state,Leader
|
||||||
|
|
||||||
|
# Don't need this heartbeat AE, drop it
|
||||||
|
drop_pending_to,1,2
|
||||||
|
|
||||||
|
# Now we deliver a stale NACK
|
||||||
|
dispatch_single,2,1
|
||||||
|
|
||||||
|
# 1 may now believe 2's log is too short!
|
||||||
|
|
||||||
|
# Node 1 produces AEs
|
||||||
|
periodic_one,1,10
|
||||||
|
|
||||||
|
state_all
|
||||||
|
assert_commit_safety,0 # Sanity check, we're fine here
|
||||||
|
|
||||||
|
# This AE reaches 2
|
||||||
|
dispatch_single,1,2
|
||||||
|
|
||||||
|
# 2 had better not rollback and break commit safety!
|
||||||
|
state_all
|
||||||
|
assert_commit_safety,0
|
|
@ -981,7 +981,6 @@ AppendEntriesAlreadyDone(i, j, index, m) ==
|
||||||
IF membershipState = RetirementSigned /\ commitIndex' > RetirementIndex(i)
|
IF membershipState = RetirementSigned /\ commitIndex' > RetirementIndex(i)
|
||||||
THEN RetirementCompleted
|
THEN RetirementCompleted
|
||||||
ELSE @]
|
ELSE @]
|
||||||
/\ isNewFollower' = [isNewFollower EXCEPT ![i] = FALSE]
|
|
||||||
/\ Reply([type |-> AppendEntriesResponse,
|
/\ Reply([type |-> AppendEntriesResponse,
|
||||||
term |-> currentTerm[i],
|
term |-> currentTerm[i],
|
||||||
success |-> TRUE,
|
success |-> TRUE,
|
||||||
|
@ -989,12 +988,15 @@ AppendEntriesAlreadyDone(i, j, index, m) ==
|
||||||
source |-> i,
|
source |-> i,
|
||||||
dest |-> j],
|
dest |-> j],
|
||||||
m)
|
m)
|
||||||
/\ UNCHANGED <<removedFromConfiguration, currentTerm, leadershipState, votedFor, log, candidateVars, leaderVars>>
|
/\ UNCHANGED <<removedFromConfiguration, currentTerm, leadershipState, votedFor, isNewFollower, log, candidateVars, leaderVars>>
|
||||||
|
|
||||||
\* Follower i receives an AppendEntries request m where it needs to roll back first
|
\* Follower i receives an AppendEntries request m where it needs to roll back first
|
||||||
\* This action rolls back the log and leaves m in messages for further processing
|
\* This action rolls back the log and leaves m in messages for further processing
|
||||||
ConflictAppendEntriesRequest(i, index, m) ==
|
ConflictAppendEntriesRequest(i, index, m) ==
|
||||||
/\ Len(log[i]) >= index
|
/\ m.entries /= << >>
|
||||||
|
/\ \E idx \in 1..Len(m.entries) :
|
||||||
|
/\ (index + (idx - 1)) \in DOMAIN log[i]
|
||||||
|
/\ log[i][index + (idx - 1)].term # m.entries[idx].term
|
||||||
/\ isNewFollower[i] = TRUE
|
/\ isNewFollower[i] = TRUE
|
||||||
/\ LET new_log == [index2 \in 1..m.prevLogIndex |-> log[i][index2]] \* Truncate log
|
/\ LET new_log == [index2 \in 1..m.prevLogIndex |-> log[i][index2]] \* Truncate log
|
||||||
IN /\ log' = [log EXCEPT ![i] = new_log]
|
IN /\ log' = [log EXCEPT ![i] = new_log]
|
||||||
|
@ -1038,7 +1040,6 @@ NoConflictAppendEntriesRequest(i, j, m) ==
|
||||||
ELSE UNCHANGED leadershipState
|
ELSE UNCHANGED leadershipState
|
||||||
\* Recalculate membership state based on log' and commitIndex'
|
\* Recalculate membership state based on log' and commitIndex'
|
||||||
/\ membershipState' = [membershipState EXCEPT ![i] = CalcMembershipState(log'[i], commitIndex'[i], i)]
|
/\ membershipState' = [membershipState EXCEPT ![i] = CalcMembershipState(log'[i], commitIndex'[i], i)]
|
||||||
/\ isNewFollower' = [isNewFollower EXCEPT ![i] = FALSE]
|
|
||||||
/\ Reply([type |-> AppendEntriesResponse,
|
/\ Reply([type |-> AppendEntriesResponse,
|
||||||
term |-> currentTerm[i],
|
term |-> currentTerm[i],
|
||||||
success |-> TRUE,
|
success |-> TRUE,
|
||||||
|
@ -1046,7 +1047,7 @@ NoConflictAppendEntriesRequest(i, j, m) ==
|
||||||
source |-> i,
|
source |-> i,
|
||||||
dest |-> j],
|
dest |-> j],
|
||||||
m)
|
m)
|
||||||
/\ UNCHANGED <<removedFromConfiguration, currentTerm, votedFor, candidateVars, leaderVars>>
|
/\ UNCHANGED <<removedFromConfiguration, currentTerm, votedFor, isNewFollower, candidateVars, leaderVars>>
|
||||||
|
|
||||||
AcceptAppendEntriesRequest(i, j, logOk, m) ==
|
AcceptAppendEntriesRequest(i, j, logOk, m) ==
|
||||||
\* accept request
|
\* accept request
|
||||||
|
|
Загрузка…
Ссылка в новой задаче