CCF/tests/election.py

136 строки
4.7 KiB
Python
Исходник Обычный вид История

2019-04-26 18:27:27 +03:00
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the Apache 2.0 License.
import time
import math
2020-07-07 17:46:44 +03:00
import infra.network
2019-04-26 18:27:27 +03:00
import infra.proc
import infra.e2e_args
2020-07-07 17:46:44 +03:00
import ccf.checker
2020-05-01 15:55:20 +03:00
import http
import suite.test_requirements as reqs
2019-04-26 18:27:27 +03:00
2020-07-07 17:46:44 +03:00
from ccf.tx_status import TxStatus
2019-04-26 18:27:27 +03:00
from loguru import logger as LOG
# This test starts from a given number of nodes (hosts), commits
# a transaction, stops the current primary, waits for an election and repeats
# this process until no progress can be made (i.e. no primary can be elected
# as F > N/2).
@reqs.description("Stopping current primary and waiting for a new one to be elected")
@reqs.can_kill_n_nodes(1)
def test_kill_primary(network, args, find_new_primary=True):
    """
    Stop the current primary, then sleep for one election duration.

    The primary is looked up with ``network.find_primary()`` and stopped.
    The function then sleeps for ``network.election_duration`` so that the
    remaining nodes have a chance to hold an election.

    ``args`` is accepted but not read by this function body.

    When ``find_new_primary`` is true, the primary is looked up a second time
    after the sleep and logged together with its term; any exception raised
    by that lookup propagates to the caller. When it is false, the lookup is
    skipped entirely.

    Returns the ``network`` object that was passed in.
    """
    stopped_primary, _ = network.find_primary()
    stopped_primary.stop()

    LOG.debug(
        f"Waiting {network.election_duration}s for a new primary to be elected..."
    )
    time.sleep(network.election_duration)

    # Callers that expect the election to fail skip the second lookup.
    if not find_new_primary:
        return network

    elected, term = network.find_primary()
    LOG.debug(f"New primary is {elected.node_id} in term {term}")
    return network
2020-05-22 16:56:21 +03:00
def wait_for_seqno_to_commit(seqno, view, nodes):
    """
    Wait for a specific seqno at a specific view to be committed on all nodes.

    Every node in ``nodes`` is asked for the status of transaction
    ``view``.``seqno`` through ``GET /node/tx``. The full set of nodes is
    polled up to ``infra.network.Network.replication_delay * 10`` times, with
    a 0.1s pause after each round in which at least one node has not yet
    reported the transaction as committed.

    Raises RuntimeError as soon as any node reports the transaction as
    Invalid. Raises AssertionError if a node answers with a non-OK HTTP
    status, or if, after the last round, not every node reports the
    transaction as Committed.
    """
    # Bound before the loop so that the final assertion reports a clear
    # failure, rather than an UnboundLocalError, if no polling round runs.
    up_to_date_f = []

    for _ in range(infra.network.Network.replication_delay * 10):
        # Start from scratch each round: only the latest answers count.
        up_to_date_f = []
        for node in nodes:
            with node.client() as c:
                r = c.get("/node/tx", {"view": view, "seqno": seqno})
                assert (
                    r.status == http.HTTPStatus.OK
                ), f"tx request returned HTTP status {r.status}"
                status = TxStatus(r.body["status"])
                if status == TxStatus.Committed:
                    up_to_date_f.append(node.node_id)
                elif status == TxStatus.Invalid:
                    # Invalid is final, so further polling is pointless.
                    raise RuntimeError(
                        f"Node {node.node_id} reports transaction ID {view}.{seqno} is invalid and will never be committed"
                    )
                # Any other status: not committed yet, retry next round.
        if len(up_to_date_f) == len(nodes):
            break
        time.sleep(0.1)

    assert len(up_to_date_f) == len(
        nodes
    ), f"Only {len(up_to_date_f)} out of {len(nodes)} nodes are up to date"
2019-04-26 18:27:27 +03:00
def run(args):
    """
    Commit a transaction and stop the primary, repeatedly, until no primary
    can be elected any more.

    A local network of 3 nodes (4 when ``args.consensus`` is "pbft") is
    started. For each of the F nodes to stop, the current primary is found,
    a private log entry is posted to it as "user0", the resulting seqno is
    awaited on all joined nodes, and the primary is stopped. Once F nodes
    are down, a further primary lookup is expected to raise
    ``infra.network.PrimaryNotFound``.

    ``args`` must provide ``consensus``, ``binary_dir``, ``debug_nodes``,
    ``perf_nodes`` and ``pdb``; it is also forwarded to
    ``network.start_and_join`` and ``test_kill_primary``.

    Raises AssertionError if a primary is still found after F nodes have
    been stopped.
    """
    is_pbft = args.consensus == "pbft"

    # Three nodes minimum to make sure that the raft network can still make progress
    # if one node stops
    hosts = ["localhost"] * (4 if is_pbft else 3)

    with infra.network.network(
        hosts, args.binary_dir, args.debug_nodes, args.perf_nodes, pdb=args.pdb
    ) as network:
        check = ccf.checker.Checker()

        network.start_and_join(args)

        # Number of nodes F to stop until network cannot make progress
        nodes_to_stop = math.ceil(len(hosts) / (3 if is_pbft else 2))

        for _ in range(nodes_to_stop):
            # Note that for the first iteration, the primary is known in advance anyway
            LOG.debug("Find freshly elected primary")
            # After a view change in pbft, finding the new primary takes longer
            primary, current_view = network.find_primary(
                request_timeout=(30 if is_pbft else 3)
            )

            LOG.debug(
                f"Commit new transactions, primary:{primary.node_id}, current_view:{current_view}"
            )
            with primary.client("user0") as c:
                res = c.post(
                    "/app/log/private",
                    {
                        "id": current_view,
                        "msg": f"This log is committed in view {current_view}",
                    },
                )
                check(res, result=True)
                seqno = res.seqno

            LOG.debug("Waiting for transaction to be committed by all nodes")
            wait_for_seqno_to_commit(seqno, current_view, network.get_joined_nodes())

            # The new primary is looked up at the top of the next iteration
            # (or is expected to be missing after the last one).
            test_kill_primary(network, args, find_new_primary=False)

        # More than F nodes have been stopped, trying to commit any message
        LOG.debug(
            f"No progress can be made as more than {nodes_to_stop} nodes have stopped"
        )
        try:
            network.find_primary()
        except infra.network.PrimaryNotFound:
            pass
        else:
            # Raised explicitly rather than via `assert False`, which would be
            # stripped under `python -O` and let this check pass silently.
            raise AssertionError("Primary should not be found")

        LOG.success(
            f"As expected, primary could not be found after election duration ({network.election_duration}s)."
        )
        LOG.success("Test ended successfully.")
2019-04-26 18:27:27 +03:00
if __name__ == "__main__":
    # Parse the end-to-end test command line, then select the application
    # package the nodes are started with before running the test.
    args = infra.e2e_args.cli_args()
    args.package = "liblogging"

    run(args)