# CCF/tests/infra/network.py
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the Apache 2.0 License.
import os
import time
import logging
from contextlib import contextmanager
from enum import Enum, IntEnum, auto
from ccf.clients import CCFConnectionException, flush_info
import infra.path
import infra.proc
import infra.node
import infra.consortium
import infra.partitions
import infra.crypto
from ccf.ledger import NodeStatus, Ledger
from ccf.tx_status import TxStatus
from ccf.tx_id import TxID
import random
from dataclasses import dataclass
from math import ceil
import http
import pprint
from loguru import logger as LOG
logging.getLogger("paramiko").setLevel(logging.WARNING)
# JOIN_TIMEOUT should be greater than the worst case quote verification time (~ 25 secs)
JOIN_TIMEOUT = 40
COMMON_FOLDER = "common"
class NodeRole(Enum):
ANY = auto()
PRIMARY = auto()
BACKUP = auto()
class ServiceStatus(Enum):
OPENING = "Opening"
OPEN = "Open"
CLOSED = "Closed"
class EllipticCurve(IntEnum):
secp384r1 = 0
secp256r1 = 1
def next(self):
return EllipticCurve((self.value + 1) % len(EllipticCurve))
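# Example: EllipticCurve.secp384r1.next() is EllipticCurve.secp256r1, and
# EllipticCurve.secp256r1.next() wraps back to secp384r1, cycling through the
# supported curves.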
class PrimaryNotFound(Exception):
pass
class CodeIdNotFound(Exception):
pass
class StartupSnapshotIsOld(Exception):
pass
class NodeShutdownError(Exception):
pass
def get_common_folder_name(workspace, label):
return os.path.join(workspace, f"{label}_{COMMON_FOLDER}")
@dataclass
class UserInfo:
local_id: int
service_id: str
class Network:
KEY_GEN = "keygenerator.sh"
SHARE_SCRIPT = "submit_recovery_share.sh"
node_args_to_forward = [
"enclave_type",
"host_log_level",
"sig_tx_interval",
"sig_ms_interval",
"raft_election_timeout_ms",
"bft_view_change_timeout_ms",
"consensus",
"memory_reserve_startup",
"log_format_json",
"constitution",
"join_timer",
"worker_threads",
"ledger_chunk_bytes",
"san",
"snapshot_tx_interval",
"max_open_sessions",
"max_open_sessions_hard",
"jwt_key_refresh_interval_s",
"common_read_only_ledger_dir",
"curve_id",
"client_connection_timeout_ms",
]
# Maximum delay (seconds) for updates to propagate from the primary to backups
replication_delay = 30
def __init__(
self,
hosts,
binary_dir=".",
dbg_nodes=None,
perf_nodes=None,
existing_network=None,
txs=None,
jwt_issuer=None,
library_dir=".",
init_partitioner=False,
version=None,
):
if existing_network is None:
self.consortium = None
self.users = []
self.next_node_id = 0
self.txs = txs
self.jwt_issuer = jwt_issuer
else:
self.consortium = existing_network.consortium
self.users = existing_network.users
self.next_node_id = existing_network.next_node_id
self.txs = existing_network.txs
self.jwt_issuer = existing_network.jwt_issuer
self.ignoring_shutdown_errors = False
self.nodes = []
self.hosts = hosts
self.status = ServiceStatus.CLOSED
self.binary_dir = binary_dir
self.library_dir = library_dir
self.common_dir = None
self.election_duration = None
self.key_generator = os.path.join(binary_dir, self.KEY_GEN)
self.share_script = os.path.join(binary_dir, self.SHARE_SCRIPT)
if not os.path.isfile(self.key_generator):
raise FileNotFoundError(
f"Could not find key generator script at '{self.key_generator}' - is binary directory set correctly?"
)
self.dbg_nodes = dbg_nodes
self.perf_nodes = perf_nodes
self.version = version
# Requires admin privileges
self.partitioner = (
infra.partitions.Partitioner(self) if init_partitioner else None
)
try:
os.remove("/tmp/vscode-gdb.sh")
except FileNotFoundError:
pass
for host in hosts:
self.create_node(host, version=self.version)
def _get_next_local_node_id(self):
next_node_id = self.next_node_id
self.next_node_id += 1
return next_node_id
def create_node(
self, host, binary_dir=None, library_dir=None, node_port=None, version=None
):
node_id = self._get_next_local_node_id()
debug = (
(str(node_id) in self.dbg_nodes) if self.dbg_nodes is not None else False
)
perf = (
(str(node_id) in self.perf_nodes) if self.perf_nodes is not None else False
)
node = infra.node.Node(
node_id,
host,
binary_dir or self.binary_dir,
library_dir or self.library_dir,
debug,
perf,
node_port=node_port,
version=version,
)
self.nodes.append(node)
return node
def _add_node(
self,
node,
lib_name,
args,
target_node=None,
recovery=False,
ledger_dir=None,
copy_ledger_read_only=True,
read_only_ledger_dir=None,
from_snapshot=True,
snapshot_dir=None,
):
forwarded_args = {
arg: getattr(args, arg)
for arg in infra.network.Network.node_args_to_forward
}
# Contact primary if no target node is set
if target_node is None:
target_node, _ = self.find_primary(
timeout=args.ledger_recovery_timeout if recovery else 3
)
LOG.info(f"Joining from target node {target_node.local_node_id}")
# Only retrieve snapshot from target node if the snapshot directory is not
# specified
if from_snapshot and snapshot_dir is None:
snapshot_dir = self.get_committed_snapshots(target_node)
committed_ledger_dir = None
current_ledger_dir = None
if from_snapshot:
if os.listdir(snapshot_dir):
LOG.info(f"Joining from snapshot directory: {snapshot_dir}")
                # Ledger directories are only retrieved from the target node
                # when joining from a snapshot and no directories were
                # specified. When joining without a snapshot, the primary node
                # retransmits the entire ledger anyway.
                current_ledger_dir = ledger_dir
                committed_ledger_dir = read_only_ledger_dir
if copy_ledger_read_only and read_only_ledger_dir is None:
current_ledger_dir, committed_ledger_dir = target_node.get_ledger(
include_read_only_dirs=True
)
else:
LOG.warning(
f"Attempting to join from snapshot but {snapshot_dir} is empty: defaulting to complete replay of transaction history"
)
else:
LOG.info(
"Joining without snapshot: complete transaction history will be replayed"
)
node.join(
lib_name=lib_name,
workspace=args.workspace,
label=args.label,
common_dir=self.common_dir,
target_rpc_address=f"{target_node.get_public_rpc_host()}:{target_node.rpc_port}",
snapshot_dir=snapshot_dir,
ledger_dir=current_ledger_dir,
read_only_ledger_dir=committed_ledger_dir,
**forwarded_args,
)
        # If the network is opening, nodes are trusted without consortium approval
if self.status == ServiceStatus.OPENING:
try:
node.wait_for_node_to_join(timeout=JOIN_TIMEOUT)
except TimeoutError:
LOG.error(f"New node {node.local_node_id} failed to join the network")
raise
def _start_all_nodes(
self,
args,
recovery=False,
ledger_dir=None,
read_only_ledger_dir=None,
snapshot_dir=None,
):
hosts = self.hosts
if not args.package:
raise ValueError("A package name must be specified.")
self.status = ServiceStatus.OPENING
LOG.info("Opening CCF service on {}".format(hosts))
forwarded_args = {
arg: getattr(args, arg)
for arg in infra.network.Network.node_args_to_forward
}
for i, node in enumerate(self.nodes):
try:
if i == 0:
if not recovery:
node.start(
lib_name=args.package,
workspace=args.workspace,
label=args.label,
common_dir=self.common_dir,
members_info=self.consortium.get_members_info(),
**forwarded_args,
)
else:
node.recover(
lib_name=args.package,
workspace=args.workspace,
label=args.label,
common_dir=self.common_dir,
ledger_dir=ledger_dir,
read_only_ledger_dir=read_only_ledger_dir,
snapshot_dir=snapshot_dir,
**forwarded_args,
)
self.wait_for_state(
node,
infra.node.State.PART_OF_PUBLIC_NETWORK.value,
timeout=args.ledger_recovery_timeout,
)
else:
# When a new service is started, initial nodes join without a snapshot
self._add_node(
node,
args.package,
args,
recovery=recovery,
ledger_dir=ledger_dir,
from_snapshot=snapshot_dir is not None,
read_only_ledger_dir=read_only_ledger_dir,
snapshot_dir=snapshot_dir,
)
except Exception:
LOG.exception("Failed to start node {}".format(node.local_node_id))
raise
self.election_duration = (
args.bft_view_change_timeout_ms / 1000
if args.consensus == "bft"
else args.raft_election_timeout_ms / 1000
) * 2
LOG.info("All nodes started")
        # At this point, recovery nodes might still be catching up and could
        # swamp the current primary, leaving it unable to serve user requests
primary, _ = self.find_primary(
timeout=args.ledger_recovery_timeout if recovery else 3
)
return primary
def _setup_common_folder(self, constitution):
LOG.info(f"Creating common folder: {self.common_dir}")
cmd = ["rm", "-rf", self.common_dir]
assert (
infra.proc.ccall(*cmd).returncode == 0
), f"Could not remove {self.common_dir} directory"
cmd = ["mkdir", "-p", self.common_dir]
assert (
infra.proc.ccall(*cmd).returncode == 0
), f"Could not create {self.common_dir} directory"
for fragment in constitution:
cmd = ["cp", fragment, self.common_dir]
assert (
infra.proc.ccall(*cmd).returncode == 0
), f"Could not copy governance {fragment} to {self.common_dir}"
# It is more convenient to create a symlink in the common directory than generate
# certs and keys in the top directory and move them across
cmd = ["ln", "-s", os.path.join(os.getcwd(), self.KEY_GEN), self.common_dir]
assert (
infra.proc.ccall(*cmd).returncode == 0
), f"Could not symlink {self.KEY_GEN} to {self.common_dir}"
def start_and_join(self, args):
"""
Starts a CCF network.
:param args: command line arguments to configure the CCF nodes.
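        Example (sketch, assuming ``args`` also carries ``package``,
        ``workspace``, ``label`` and the other forwarded node arguments):
            network.start_and_join(args)
            primary, backups = network.find_nodes()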
"""
self.common_dir = get_common_folder_name(args.workspace, args.label)
assert (
args.constitution
), "--constitution argument must be provided to start a network"
self._setup_common_folder(args.constitution)
        member_count = max(1, args.initial_member_count)
        initial_members_info = []
        for i in range(member_count):
            initial_members_info.append(
                (
                    i,
                    (i < args.initial_recovery_member_count),
                    {"is_operator": True}
                    if (i < args.initial_operator_count)
                    else None,
                )
            )
self.consortium = infra.consortium.Consortium(
self.common_dir,
self.key_generator,
self.share_script,
args.consensus,
initial_members_info,
args.participants_curve,
authenticate_session=not args.disable_member_session_auth,
)
        initial_users = [
            f"user{user_id}" for user_id in range(max(0, args.initial_user_count))
        ]
self.create_users(initial_users, args.participants_curve)
primary = self._start_all_nodes(args)
self.wait_for_all_nodes_to_commit(primary=primary)
LOG.success("All nodes joined network")
self.consortium.activate(self.find_random_node())
if args.js_app_bundle:
self.consortium.set_js_app(
remote_node=self.find_random_node(), app_bundle_path=args.js_app_bundle
)
for path in args.jwt_issuer:
self.consortium.set_jwt_issuer(
remote_node=self.find_random_node(), json_path=path
)
if self.jwt_issuer:
self.jwt_issuer.register(self)
self.consortium.add_users_and_transition_service_to_open(
self.find_random_node(), initial_users
)
self.status = ServiceStatus.OPEN
LOG.info(f"Initial set of users added: {len(initial_users)}")
LOG.success("***** Network is now open *****")
def start_in_recovery(
self,
args,
ledger_dir,
committed_ledger_dir=None,
snapshot_dir=None,
common_dir=None,
):
"""
Starts a CCF network in recovery mode.
:param args: command line arguments to configure the CCF nodes.
        :param ledger_dir: main ledger directory to recover from.
        :param committed_ledger_dir: optional read-only directory of committed ledger files.
        :param snapshot_dir: snapshot directory to recover from.
:param common_dir: common directory containing member and user keys and certs.
"""
self.common_dir = common_dir or get_common_folder_name(
args.workspace, args.label
)
ledger_dirs = [ledger_dir]
if committed_ledger_dir:
ledger_dirs.append(committed_ledger_dir)
ledger = Ledger(ledger_dirs, committed_only=False)
public_state, _ = ledger.get_latest_public_state()
primary = self._start_all_nodes(
args,
recovery=True,
ledger_dir=ledger_dir,
read_only_ledger_dir=committed_ledger_dir,
snapshot_dir=snapshot_dir,
)
# If a common directory was passed in, initialise the consortium from it
if common_dir is not None:
self.consortium = infra.consortium.Consortium(
common_dir,
self.key_generator,
self.share_script,
args.consensus,
public_state=public_state,
)
for node in self.get_joined_nodes():
self.wait_for_state(
node,
infra.node.State.PART_OF_PUBLIC_NETWORK.value,
timeout=args.ledger_recovery_timeout,
)
# Catch-up in recovery can take a long time, so extend this timeout
self.wait_for_all_nodes_to_commit(primary=primary, timeout=20)
LOG.success("All nodes joined public network")
def recover(self, args):
"""
Recovers a CCF network previously started in recovery mode.
:param args: command line arguments to configure the CCF nodes.
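        Typical end-to-end recovery (sketch; ``ledger_dir`` and
        ``committed_ledger_dir`` come from a node of the defunct service):
            recovered_network = infra.network.Network(
                network.hosts, existing_network=network
            )
            recovered_network.start_in_recovery(args, ledger_dir, committed_ledger_dir)
            recovered_network.recover(args)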
"""
self.consortium.check_for_service(
self.find_random_node(), status=ServiceStatus.OPENING
)
self.consortium.wait_for_all_nodes_to_be_trusted(
self.find_random_node(), self.nodes
)
self.consortium.transition_service_to_open(self.find_random_node())
self.consortium.recover_with_shares(self.find_random_node())
for node in self.get_joined_nodes():
self.wait_for_state(
node,
infra.node.State.PART_OF_NETWORK.value,
timeout=args.ledger_recovery_timeout,
)
self._wait_for_app_open(node)
self.consortium.check_for_service(self.find_random_node(), ServiceStatus.OPEN)
LOG.success("***** Recovered network is now open *****")
def ignore_errors_on_shutdown(self):
self.ignoring_shutdown_errors = True
def stop_all_nodes(self, skip_verification=False, verbose_verification=False):
if not skip_verification:
# Verify that all txs committed on the service can be read
if self.txs is not None:
log_capture = None if verbose_verification else []
self.txs.verify(self, log_capture=log_capture)
if verbose_verification:
flush_info(log_capture, None)
fatal_error_found = False
for node in self.nodes:
_, fatal_errors = node.stop()
if fatal_errors:
fatal_error_found = True
LOG.info("All nodes stopped")
if not skip_verification:
longest_ledger_seqno = 0
most_up_to_date_node = None
committed_ledger_dirs = {}
for node in self.nodes:
# Find stopped node with longest ledger
_, committed_ledger_dir = node.get_ledger(include_read_only_dirs=True)
ledger_end_seqno = 0
for ledger_file in os.listdir(committed_ledger_dir):
end_seqno = infra.node.get_committed_ledger_end_seqno(ledger_file)
if end_seqno > ledger_end_seqno:
ledger_end_seqno = end_seqno
if ledger_end_seqno > longest_ledger_seqno:
longest_ledger_seqno = ledger_end_seqno
most_up_to_date_node = node
committed_ledger_dirs[node.local_node_id] = [
committed_ledger_dir,
ledger_end_seqno,
]
# Verify that all ledger files on stopped nodes exist on most up-to-date node
# and are identical
if most_up_to_date_node:
longest_ledger_dir, _ = committed_ledger_dirs[
most_up_to_date_node.local_node_id
]
                for node_id, (committed_ledger_dir, _) in (
                    item
                    for item in committed_ledger_dirs.items()
                    if item[0] != most_up_to_date_node.local_node_id
                ):
for ledger_file in os.listdir(committed_ledger_dir):
if ledger_file not in os.listdir(longest_ledger_dir):
raise Exception(
f"Ledger file on node {node_id} does not exist on most up-to-date node {most_up_to_date_node.local_node_id}: {ledger_file}"
)
if infra.path.compute_file_checksum(
os.path.join(longest_ledger_dir, ledger_file)
) != infra.path.compute_file_checksum(
os.path.join(committed_ledger_dir, ledger_file)
):
                            raise Exception(
                                f"Ledger file checksums between node {node_id} and most up-to-date node {most_up_to_date_node.local_node_id} did not match: {ledger_file}"
                            )
LOG.success(
f"Verified ledger files consistency on all {len(self.nodes)} stopped nodes"
)
if fatal_error_found:
if self.ignoring_shutdown_errors:
LOG.warning("Ignoring shutdown errors")
else:
raise NodeShutdownError("Fatal error found during node shutdown")
def join_node(
self, node, lib_name, args, target_node=None, timeout=JOIN_TIMEOUT, **kwargs
):
self._add_node(node, lib_name, args, target_node, **kwargs)
primary, _ = self.find_primary()
try:
self.consortium.wait_for_node_to_exist_in_store(
primary,
node.node_id,
timeout=timeout,
node_status=(
NodeStatus.PENDING
if self.status == ServiceStatus.OPEN
else NodeStatus.TRUSTED
),
)
except TimeoutError as e:
LOG.error(f"New pending node {node.node_id} failed to join the network")
errors, _ = node.stop()
self.nodes.remove(node)
if errors:
                # Raise more precise exceptions if known errors are found in the node's output
for error in errors:
if "Quote does not contain known enclave measurement" in error:
raise CodeIdNotFound from e
if "StartupSnapshotIsOld" in error:
raise StartupSnapshotIsOld from e
raise
def trust_node(self, node, args):
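        """
        Mark a pending node as TRUSTED (via the consortium when the service is
        already open) and wait for it to join the network. Typical flow (sketch):
            new_node = network.create_node(host)
            network.join_node(new_node, args.package, args)
            network.trust_node(new_node, args)
        """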
primary, _ = self.find_primary()
try:
if self.status is ServiceStatus.OPEN:
self.consortium.trust_node(
primary,
node.node_id,
timeout=ceil(args.join_timer * 2 / 1000),
)
# Here, quote verification has already been run when the node
# was added as pending. Only wait for the join timer for the
# joining node to retrieve network secrets.
node.wait_for_node_to_join(timeout=ceil(args.join_timer * 2 / 1000))
except (ValueError, TimeoutError):
LOG.error(f"New trusted node {node.node_id} failed to join the network")
node.stop()
raise
node.network_state = infra.node.NodeNetworkState.joined
self.wait_for_all_nodes_to_commit(primary=primary)
def retire_node(self, remote_node, node_to_retire):
self.consortium.retire_node(remote_node, node_to_retire)
self.nodes.remove(node_to_retire)
def create_user(self, local_user_id, curve, record=True):
infra.proc.ccall(
self.key_generator,
"--name",
local_user_id,
"--curve",
f"{curve.name}",
path=self.common_dir,
log_output=False,
).check_returncode()
with open(
os.path.join(self.common_dir, f"{local_user_id}_cert.pem"), encoding="utf-8"
) as c:
service_user_id = infra.crypto.compute_cert_der_hash_hex_from_pem(c.read())
new_user = UserInfo(local_user_id, service_user_id)
if record:
self.users.append(new_user)
return new_user
def create_users(self, local_user_ids, curve):
for local_user_id in local_user_ids:
self.create_user(local_user_id, curve)
def get_members(self):
return self.consortium.members
def get_joined_nodes(self):
return [node for node in self.nodes if node.is_joined()]
def wait_for_state(self, node, state, timeout=3):
end_time = time.time() + timeout
while time.time() < end_time:
try:
with node.client(connection_timeout=timeout) as c:
r = c.get("/node/state")
if r.body.json()["state"] == state:
break
except ConnectionRefusedError:
pass
time.sleep(0.1)
else:
raise TimeoutError(
f"Timed out waiting for state {state} on node {node.node_id}"
)
if state == infra.node.State.PART_OF_NETWORK.value:
self.status = ServiceStatus.OPEN
def _wait_for_app_open(self, node, timeout=3):
end_time = time.time() + timeout
while time.time() < end_time:
# As an operator, query a well-known /app endpoint to find out
# if the app has been opened to users
with node.client() as c:
r = c.get("/app/commit")
                if r.status_code != http.HTTPStatus.NOT_FOUND.value:
return
time.sleep(0.1)
raise TimeoutError(f"Application frontend was not open after {timeout}s")
def _get_node_by_service_id(self, node_id):
return next((node for node in self.nodes if node.node_id == node_id), None)
def find_primary(self, nodes=None, timeout=3, log_capture=None):
"""
Find the identity of the primary in the network and return its identity
and the current view.
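        Example:
            primary, view = network.find_primary()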
"""
primary_id = None
view = None
logs = []
asked_nodes = nodes or self.get_joined_nodes()
end_time = time.time() + timeout
while time.time() < end_time:
for node in asked_nodes:
with node.client() as c:
try:
logs = []
res = c.get("/node/network", timeout=1, log_capture=logs)
assert res.status_code == http.HTTPStatus.OK.value, res
body = res.body.json()
view = body["current_view"]
primary_id = body["primary_id"]
if primary_id is not None:
break
except Exception:
LOG.warning(
f"Could not successfully connect to node {node.local_node_id}. Retrying..."
)
if primary_id is not None:
break
time.sleep(0.1)
if primary_id is None:
flush_info(logs, log_capture, 0)
raise PrimaryNotFound
flush_info(logs, log_capture, 0)
return (self._get_node_by_service_id(primary_id), view)
def find_backups(self, primary=None, timeout=3):
if primary is None:
primary, _ = self.find_primary(timeout=timeout)
return [n for n in self.get_joined_nodes() if n != primary]
def find_any_backup(self, primary=None, timeout=3):
return random.choice(self.find_backups(primary=primary, timeout=timeout))
def find_node_by_role(self, role=NodeRole.ANY):
        role_ = (
            random.choice([NodeRole.PRIMARY, NodeRole.BACKUP])
            if role == NodeRole.ANY
            else role
        )
if role_ == NodeRole.PRIMARY:
return self.find_primary()[0]
else:
return self.find_any_backup()
def find_random_node(self):
return random.choice(self.get_joined_nodes())
def find_nodes(self, timeout=3):
primary, _ = self.find_primary(timeout=timeout)
backups = self.find_backups(primary=primary, timeout=timeout)
return primary, backups
def find_primary_and_any_backup(self, timeout=3):
primary, backups = self.find_nodes(timeout)
backup = random.choice(backups)
return primary, backup
def wait_for_all_nodes_to_commit(self, primary=None, tx_id=None, timeout=10):
"""
Wait for all nodes to have joined the network and committed all transactions
executed on the primary.
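        Example:
            primary, _ = network.find_primary()
            network.wait_for_all_nodes_to_commit(primary=primary)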
"""
if not (primary or tx_id):
raise ValueError("Either a valid TxID or primary node should be specified")
end_time = time.time() + timeout
# If no TxID is specified, retrieve latest readable one
        if tx_id is None:
while time.time() < end_time:
with primary.client() as c:
resp = c.get(
"/node/network/nodes/self"
) # Well-known read-only endpoint
tx_id = TxID(resp.view, resp.seqno)
if tx_id.valid():
break
time.sleep(0.1)
assert (
tx_id.valid()
), f"Primary {primary.node_id} has not made any progress yet ({tx_id})"
caught_up_nodes = []
logs = {}
while time.time() < end_time:
caught_up_nodes = []
for node in self.get_joined_nodes():
with node.client() as c:
logs[node.node_id] = []
resp = c.get(
f"/node/local_tx?transaction_id={tx_id}",
log_capture=logs[node.node_id],
)
                    if resp.status_code != http.HTTPStatus.OK.value:
                        # Node may not have joined the network yet, try again
                        break
                    status = TxStatus(resp.body.json()["status"])
                    if status == TxStatus.Committed:
                        caught_up_nodes.append(node)
                    elif status == TxStatus.Invalid:
                        flush_info(logs[node.node_id], None, 0)
                        raise RuntimeError(
                            f"Node {node.node_id} reports transaction ID {tx_id} is invalid and will never be committed"
                        )
if len(caught_up_nodes) == len(self.get_joined_nodes()):
break
time.sleep(0.1)
for lines in logs.values():
flush_info(lines, None, 0)
assert len(caught_up_nodes) == len(
self.get_joined_nodes()
), f"Only {len(caught_up_nodes)} (out of {len(self.get_joined_nodes())}) nodes have joined the network"
def wait_for_node_commit_sync(self, timeout=3):
"""
        Wait for commit level to get in sync on all nodes. This is expected to
        happen once consensus is established, in the absence of new transactions.
"""
end_time = time.time() + timeout
while time.time() < end_time:
commits = []
for node in self.get_joined_nodes():
with node.client() as c:
r = c.get("/node/commit")
assert r.status_code == http.HTTPStatus.OK.value
body = r.body.json()
commits.append(body["transaction_id"])
if [commits[0]] * len(commits) == commits:
break
time.sleep(0.1)
        expected = [commits[0]] * len(commits)
        if expected != commits:
            for node in self.get_joined_nodes():
                with node.client() as c:
                    r = c.get("/node/consensus")
                    pprint.pprint(r.body.json())
        assert expected == commits, f"Multiple commit values: {commits}"
def wait_for_new_primary(self, old_primary, nodes=None, timeout_multiplier=2):
        # We arbitrarily pick twice the election duration to protect against
        # the not-so-rare case where the first election round fails (short
        # timeouts are particularly susceptible to this)
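        # For example, with raft_election_timeout_ms=4000, election_duration is
        # 8s (see _start_all_nodes), so the default wait here is 16s.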
timeout = self.election_duration * timeout_multiplier
LOG.info(
f"Waiting up to {timeout}s for a new primary different from {old_primary.local_node_id} ({old_primary.node_id}) to be elected..."
)
end_time = time.time() + timeout
error = TimeoutError
logs = []
backup = self.find_any_backup(old_primary)
if backup.get_consensus() == "bft":
try:
with backup.client("user0") as c:
_ = c.post(
"/app/log/private",
{"id": -1, "msg": "This is submitted to force a view change"},
)
time.sleep(1)
except CCFConnectionException:
LOG.warning(f"Could not successfully connect to node {backup.node_id}.")
while time.time() < end_time:
try:
logs = []
new_primary, new_term = self.find_primary(nodes=nodes, log_capture=logs)
if new_primary.node_id != old_primary.node_id:
flush_info(logs, None)
LOG.info(
f"New primary is {new_primary.local_node_id} ({new_primary.node_id}) in term {new_term}"
)
return (new_primary, new_term)
except PrimaryNotFound:
error = PrimaryNotFound
except Exception:
pass
time.sleep(0.1)
flush_info(logs, None)
raise error(f"A new primary was not elected after {timeout} seconds")
def wait_for_new_primary_in(
self, expected_node_ids, nodes=None, timeout_multiplier=2
):
        # We arbitrarily pick twice the election duration to protect against
        # the not-so-rare case where the first election round fails (short
        # timeouts are particularly susceptible to this)
timeout = self.election_duration * timeout_multiplier
LOG.info(
f"Waiting up to {timeout}s for a new primary in {expected_node_ids} to be elected..."
)
end_time = time.time() + timeout
error = TimeoutError
logs = []
while time.time() < end_time:
try:
logs = []
new_primary, new_term = self.find_primary(nodes=nodes, log_capture=logs)
if new_primary.node_id in expected_node_ids:
flush_info(logs, None)
LOG.info(
f"New primary is {new_primary.local_node_id} ({new_primary.node_id}) in term {new_term}"
)
return (new_primary, new_term)
except PrimaryNotFound:
error = PrimaryNotFound
except Exception:
pass
time.sleep(0.1)
flush_info(logs, None)
raise error(f"A new primary was not elected after {timeout} seconds")
def wait_for_commit_proof(self, node, seqno, timeout=3):
        # Wait until the target seqno has a commit proof on a specific node.
        # This is achieved by first waiting for a commit over seqno, issuing
        # a write request and then waiting for a commit over that
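        # Rationale (sketch): once the later write is committed, a signature
        # transaction covering seqno has been globally committed, which acts
        # as its commit proof.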
end_time = time.time() + timeout
while time.time() < end_time:
with node.client() as c:
r = c.get("/node/commit")
current_tx = TxID.from_str(r.body.json()["transaction_id"])
if current_tx.seqno >= seqno:
with node.client(
self.consortium.get_any_active_member().local_id
) as nc:
                        # Using update_state_digest here as a convenient
                        # app-agnostic write tx
r = nc.post("/gov/ack/update_state_digest")
assert (
r.status_code == http.HTTPStatus.OK.value
), f"Error ack/update_state_digest: {r}"
c.wait_for_commit(r)
return True
time.sleep(0.1)
raise TimeoutError(f"seqno {seqno} did not have commit proof after {timeout}s")
def wait_for_snapshot_committed_for(self, seqno, timeout=3):
# Check that snapshot exists for target seqno and if so, wait until
# snapshot evidence has commit proof (= commit rule for snapshots)
snapshot_evidence_seqno = None
primary, _ = self.find_primary()
for s in os.listdir(primary.get_snapshots()):
if infra.node.get_snapshot_seqnos(s)[0] > seqno:
snapshot_evidence_seqno = infra.node.get_snapshot_seqnos(s)[1]
if snapshot_evidence_seqno is None:
return False
return self.wait_for_commit_proof(primary, snapshot_evidence_seqno, timeout)
def get_committed_snapshots(self, node):
# Wait for all available snapshot files to be committed before
# copying snapshot directory, so that we always use the latest snapshot
def wait_for_snapshots_to_be_committed(src_dir, list_src_dir_func, timeout=6):
end_time = time.time() + timeout
committed = True
uncommitted_snapshots = []
while time.time() < end_time:
committed = True
uncommitted_snapshots = []
for f in list_src_dir_func(src_dir):
is_committed = infra.node.is_file_committed(f)
if not is_committed:
self.wait_for_commit_proof(
node, infra.node.get_snapshot_seqnos(f)[1]
)
uncommitted_snapshots.append(f)
committed &= is_committed
if committed:
break
time.sleep(0.1)
if not committed:
LOG.error(
f"Error: Not all snapshots were committed after {timeout}s in {src_dir}: {uncommitted_snapshots}"
)
return committed
return node.get_committed_snapshots(wait_for_snapshots_to_be_committed)
def _get_ledger_public_view_at(self, node, call, seqno, timeout):
end_time = time.time() + timeout
while time.time() < end_time:
try:
return call(seqno)
except Exception:
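                # Assumption: creating and withdrawing a large proposal forces
                # additional writes so that the target seqno is eventually
                # flushed to a readable ledger file.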
self.consortium.create_and_withdraw_large_proposal(node)
time.sleep(0.1)
raise TimeoutError(
f"Could not read transaction at seqno {seqno} from ledger {node.remote.ledger_paths()}"
)
def get_ledger_public_state_at(self, seqno, timeout=5):
primary, _ = self.find_primary()
return self._get_ledger_public_view_at(
primary, primary.get_ledger_public_tables_at, seqno, timeout
)
def get_latest_ledger_public_state(self, timeout=5):
primary, _ = self.find_primary()
with primary.client() as nc:
resp = nc.get("/node/commit")
body = resp.body.json()
tx_id = TxID.from_str(body["transaction_id"])
return self._get_ledger_public_view_at(
primary, primary.get_ledger_public_state_at, tx_id.seqno, timeout
)
@contextmanager
def network(
hosts,
binary_directory=".",
dbg_nodes=None,
perf_nodes=None,
pdb=False,
txs=None,
jwt_issuer=None,
library_directory=".",
init_partitioner=False,
version=None,
):
"""
Context manager for Network class.
:param hosts: a list of hostnames (localhost or remote hostnames)
:param binary_directory: the directory where CCF's binaries are located
:param library_directory: the directory where CCF's libraries are located
    :param dbg_nodes: default: []. List of node IDs that will not be started automatically (the user is prompted to start them manually)
    :param perf_nodes: default: []. List of node IDs that will run under perf record
    :param pdb: default: False. If True, drop into the pdb debugger when an exception is raised in the managed block
:param txs: default: None. Transactions committed on that network.
:return: a Network instance that can be used to create/access nodes, handle the genesis state (add members, create
node.json), and stop all the nodes that belong to the network
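    Example (sketch, assuming ``hosts`` and an ``args`` namespace from the
    test harness):
        with network(hosts, args.binary_dir) as net:
            net.start_and_join(args)
            primary, backups = net.find_nodes()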
"""
if dbg_nodes is None:
dbg_nodes = []
if perf_nodes is None:
perf_nodes = []
net = Network(
hosts=hosts,
binary_dir=binary_directory,
library_dir=library_directory,
dbg_nodes=dbg_nodes,
perf_nodes=perf_nodes,
txs=txs,
jwt_issuer=jwt_issuer,
init_partitioner=init_partitioner,
version=version,
)
try:
yield net
except Exception:
# Don't try to verify txs on Exception path
net.txs = None
if pdb:
import pdb
# pylint: disable=forgotten-debug-statement
pdb.set_trace()
else:
raise
finally:
LOG.info("Stopping network")
net.stop_all_nodes(skip_verification=True)
if init_partitioner:
net.partitioner.cleanup()