2019-04-26 18:27:27 +03:00
|
|
|
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
|
|
# Licensed under the Apache 2.0 License.
|
2020-08-21 19:29:14 +03:00
|
|
|
from infra.network import PrimaryNotFound
|
2019-04-26 18:27:27 +03:00
|
|
|
import time
|
|
|
|
import math
|
2020-07-07 17:46:44 +03:00
|
|
|
import infra.network
|
2019-04-26 18:27:27 +03:00
|
|
|
import infra.proc
|
2020-01-29 18:09:28 +03:00
|
|
|
import infra.e2e_args
|
2020-07-27 19:34:16 +03:00
|
|
|
import infra.checker
|
2020-05-01 15:55:20 +03:00
|
|
|
import http
|
2020-07-03 14:02:26 +03:00
|
|
|
import suite.test_requirements as reqs
|
2019-04-26 18:27:27 +03:00
|
|
|
|
2020-07-07 17:46:44 +03:00
|
|
|
from ccf.tx_status import TxStatus
|
2020-09-17 18:56:53 +03:00
|
|
|
from ccf.log_capture import flush_info
|
2019-04-26 18:27:27 +03:00
|
|
|
from loguru import logger as LOG
|
|
|
|
|
|
|
|
# This test starts from a given number of nodes (hosts), commits
|
2019-08-15 19:52:43 +03:00
|
|
|
# a transaction, stops the current primary, waits for an election and repeats
|
|
|
|
# this process until no progress can be made (i.e. no primary can be elected
|
2019-04-26 18:27:27 +03:00
|
|
|
# as F > N/2).
|
|
|
|
|
|
|
|
|
2020-07-03 14:02:26 +03:00
|
|
|
@reqs.description("Stopping current primary and waiting for a new one to be elected")
|
|
|
|
@reqs.can_kill_n_nodes(1)
|
2020-08-21 19:29:14 +03:00
|
|
|
def test_kill_primary(network, args):
|
2020-07-03 14:02:26 +03:00
|
|
|
primary, _ = network.find_primary()
|
|
|
|
primary.stop()
|
2020-08-21 19:29:14 +03:00
|
|
|
new_primary, new_term = network.wait_for_new_primary(primary.node_id)
|
|
|
|
LOG.debug(f"New primary is {new_primary.node_id} in term {new_term}")
|
2020-07-03 14:02:26 +03:00
|
|
|
|
|
|
|
return network
|
|
|
|
|
|
|
|
|
2020-05-22 16:56:21 +03:00
|
|
|
def wait_for_seqno_to_commit(seqno, view, nodes):
|
2019-04-26 18:27:27 +03:00
|
|
|
"""
|
2020-05-22 16:56:21 +03:00
|
|
|
Wait for a specific seqno at a specific view to be committed on all nodes.
|
2019-04-26 18:27:27 +03:00
|
|
|
"""
|
2020-07-07 17:46:44 +03:00
|
|
|
for _ in range(infra.network.Network.replication_delay * 10):
|
2019-04-26 18:27:27 +03:00
|
|
|
up_to_date_f = []
|
2020-09-17 18:56:53 +03:00
|
|
|
logs = {}
|
2019-04-26 18:27:27 +03:00
|
|
|
for f in nodes:
|
2020-07-03 15:52:56 +03:00
|
|
|
with f.client() as c:
|
2020-09-17 18:56:53 +03:00
|
|
|
logs[f.node_id] = []
|
|
|
|
r = c.get(
|
|
|
|
f"/node/tx?view={view}&seqno={seqno}", log_capture=logs[f.node_id]
|
|
|
|
)
|
2020-05-01 15:55:20 +03:00
|
|
|
assert (
|
2020-07-27 19:34:16 +03:00
|
|
|
r.status_code == http.HTTPStatus.OK
|
|
|
|
), f"tx request returned HTTP status {r.status_code}"
|
2020-09-20 19:47:44 +03:00
|
|
|
status = TxStatus(r.body.json()["status"])
|
2020-05-01 15:55:20 +03:00
|
|
|
if status == TxStatus.Committed:
|
2019-04-26 18:27:27 +03:00
|
|
|
up_to_date_f.append(f.node_id)
|
2020-05-01 15:55:20 +03:00
|
|
|
elif status == TxStatus.Invalid:
|
2020-09-17 18:56:53 +03:00
|
|
|
flush_info(logs[f.node_id], None, 0)
|
2020-05-01 15:55:20 +03:00
|
|
|
raise RuntimeError(
|
2020-05-22 16:56:21 +03:00
|
|
|
f"Node {f.node_id} reports transaction ID {view}.{seqno} is invalid and will never be committed"
|
2020-05-01 15:55:20 +03:00
|
|
|
)
|
|
|
|
else:
|
|
|
|
pass
|
2019-04-26 18:27:27 +03:00
|
|
|
if len(up_to_date_f) == len(nodes):
|
|
|
|
break
|
2020-03-26 18:30:06 +03:00
|
|
|
time.sleep(0.1)
|
2020-09-17 18:56:53 +03:00
|
|
|
for lines in logs.values():
|
|
|
|
flush_info(lines, None, 0)
|
2019-04-26 18:27:27 +03:00
|
|
|
assert len(up_to_date_f) == len(
|
|
|
|
nodes
|
2020-05-01 15:55:20 +03:00
|
|
|
), "Only {} out of {} nodes are up to date".format(len(up_to_date_f), len(nodes))
|
2019-04-26 18:27:27 +03:00
|
|
|
|
|
|
|
|
|
|
|
def run(args):
|
2020-07-07 17:46:44 +03:00
|
|
|
with infra.network.network(
|
2020-10-22 16:54:43 +03:00
|
|
|
args.nodes, args.binary_dir, args.debug_nodes, args.perf_nodes, pdb=args.pdb
|
2019-04-26 18:27:27 +03:00
|
|
|
) as network:
|
2020-07-27 19:34:16 +03:00
|
|
|
check = infra.checker.Checker()
|
2019-04-26 18:27:27 +03:00
|
|
|
|
2019-11-08 12:33:47 +03:00
|
|
|
network.start_and_join(args)
|
2020-05-22 16:56:21 +03:00
|
|
|
current_view = None
|
2019-04-26 18:27:27 +03:00
|
|
|
|
|
|
|
# Number of nodes F to stop until network cannot make progress
|
2020-10-22 16:54:43 +03:00
|
|
|
nodes_to_stop = math.ceil(len(args.nodes) / 2)
|
2020-09-09 12:59:57 +03:00
|
|
|
if args.consensus == "bft":
|
2020-11-02 19:08:16 +03:00
|
|
|
# nodes_to_stop = math.ceil(len(args.nodes) / 3)
|
|
|
|
|
|
|
|
LOG.error("aaaaaaa {}", math.ceil(len(args.nodes) / 3))
|
|
|
|
|
|
|
|
# View Change implementation is in progress.
|
|
|
|
# https://github.com/microsoft/CCF/issues/1709
|
|
|
|
nodes_to_stop = 1
|
2019-04-26 18:27:27 +03:00
|
|
|
|
2020-08-21 19:29:14 +03:00
|
|
|
primary_is_known = True
|
|
|
|
for node_to_stop in range(nodes_to_stop):
|
2019-08-15 19:52:43 +03:00
|
|
|
# Note that for the first iteration, the primary is known in advance anyway
|
|
|
|
LOG.debug("Find freshly elected primary")
|
2020-01-27 16:53:23 +03:00
|
|
|
# After a view change in pbft, finding the new primary takes longer
|
2020-05-22 16:56:21 +03:00
|
|
|
primary, current_view = network.find_primary(
|
2020-09-09 12:59:57 +03:00
|
|
|
timeout=(30 if args.consensus == "bft" else 3)
|
2020-01-27 16:53:23 +03:00
|
|
|
)
|
2019-04-26 18:27:27 +03:00
|
|
|
|
2019-11-25 19:52:04 +03:00
|
|
|
LOG.debug(
|
2020-05-22 16:56:21 +03:00
|
|
|
"Commit new transactions, primary:{}, current_view:{}".format(
|
|
|
|
primary.node_id, current_view
|
2019-11-25 19:52:04 +03:00
|
|
|
)
|
|
|
|
)
|
2020-07-03 15:52:56 +03:00
|
|
|
with primary.client("user0") as c:
|
2020-07-20 16:07:33 +03:00
|
|
|
res = c.post(
|
2020-07-03 15:52:56 +03:00
|
|
|
"/app/log/private",
|
2019-04-26 18:27:27 +03:00
|
|
|
{
|
2020-05-22 16:56:21 +03:00
|
|
|
"id": current_view,
|
|
|
|
"msg": "This log is committed in view {}".format(current_view),
|
2019-04-26 18:27:27 +03:00
|
|
|
},
|
|
|
|
)
|
2020-03-23 13:53:05 +03:00
|
|
|
check(res, result=True)
|
2020-05-22 16:56:21 +03:00
|
|
|
seqno = res.seqno
|
2019-04-26 18:27:27 +03:00
|
|
|
|
|
|
|
LOG.debug("Waiting for transaction to be committed by all nodes")
|
2020-05-22 16:56:21 +03:00
|
|
|
wait_for_seqno_to_commit(seqno, current_view, network.get_joined_nodes())
|
2019-04-26 18:27:27 +03:00
|
|
|
|
2020-08-21 19:29:14 +03:00
|
|
|
try:
|
|
|
|
test_kill_primary(network, args)
|
|
|
|
except PrimaryNotFound:
|
|
|
|
if node_to_stop < nodes_to_stop - 1:
|
|
|
|
raise
|
|
|
|
else:
|
|
|
|
primary_is_known = False
|
2019-04-26 18:27:27 +03:00
|
|
|
|
2020-08-21 19:29:14 +03:00
|
|
|
assert not primary_is_known, "Primary is still known"
|
2020-05-26 16:11:49 +03:00
|
|
|
LOG.success("Test ended successfully.")
|
2019-04-26 18:27:27 +03:00
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
|
|
2020-01-29 18:09:28 +03:00
|
|
|
args = infra.e2e_args.cli_args()
|
2020-01-28 17:06:12 +03:00
|
|
|
args.package = "liblogging"
|
2020-10-22 16:54:43 +03:00
|
|
|
args.nodes = infra.e2e_args.min_nodes(args, f=1)
|
2019-04-26 18:27:27 +03:00
|
|
|
run(args)
|