# CCF/tests/infra/network.py
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the Apache 2.0 License.
import os
import time
import logging
from contextlib import contextmanager
from enum import Enum, IntEnum, auto
from ccf.clients import CCFConnectionException, flush_info
import infra.path
import infra.proc
import infra.node
import infra.consortium
import infra.partitions
import infra.crypto
from ccf.ledger import NodeStatus, Ledger
from ccf.tx_status import TxStatus
from ccf.tx_id import TxID
import random
from dataclasses import dataclass
from math import ceil
import http
import pprint
from loguru import logger as LOG
logging.getLogger("paramiko").setLevel(logging.WARNING)
# JOIN_TIMEOUT should be greater than the worst case quote verification time (~ 25 secs)
JOIN_TIMEOUT = 40
COMMON_FOLDER = "common"
class NodeRole(Enum):
ANY = auto()
PRIMARY = auto()
BACKUP = auto()
class ServiceStatus(Enum):
OPENING = "Opening"
OPEN = "Open"
CLOSED = "Closed"
class EllipticCurve(IntEnum):
secp384r1 = 0
secp256r1 = 1
def next(self):
return EllipticCurve((self.value + 1) % len(EllipticCurve))
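# Example: EllipticCurve.secp384r1.next() is EllipticCurve.secp256r1, and
# EllipticCurve.secp256r1.next() wraps back to secp384r1, cycling through the
# supported curves.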
class PrimaryNotFound(Exception):
pass
class CodeIdNotFound(Exception):
pass
class StartupSnapshotIsOld(Exception):
pass
class NodeShutdownError(Exception):
pass
def get_common_folder_name(workspace, label):
return os.path.join(workspace, f"{label}_{COMMON_FOLDER}")
@dataclass
class UserInfo:
local_id: int
service_id: str
class Network:
KEY_GEN = "keygenerator.sh"
SHARE_SCRIPT = "submit_recovery_share.sh"
node_args_to_forward = [
"enclave_type",
"host_log_level",
"sig_tx_interval",
"sig_ms_interval",
"raft_election_timeout_ms",
"bft_view_change_timeout_ms",
"consensus",
"memory_reserve_startup",
"log_format_json",
"constitution",
"join_timer",
"worker_threads",
"ledger_chunk_bytes",
"san",
"snapshot_tx_interval",
"max_open_sessions",
"max_open_sessions_hard",
"jwt_key_refresh_interval_s",
"common_read_only_ledger_dir",
"curve_id",
"client_connection_timeout_ms",
]
# Maximum delay (seconds) for updates to propagate from the primary to backups
replication_delay = 30
def __init__(
self,
hosts,
binary_dir=".",
dbg_nodes=None,
perf_nodes=None,
existing_network=None,
txs=None,
jwt_issuer=None,
library_dir=".",
init_partitioner=False,
version=None,
):
if existing_network is None:
self.consortium = None
self.users = []
self.next_node_id = 0
self.txs = txs
self.jwt_issuer = jwt_issuer
else:
self.consortium = existing_network.consortium
self.users = existing_network.users
self.next_node_id = existing_network.next_node_id
self.txs = existing_network.txs
self.jwt_issuer = existing_network.jwt_issuer
self.ignoring_shutdown_errors = False
self.nodes = []
self.hosts = hosts
self.status = ServiceStatus.CLOSED
self.binary_dir = binary_dir
self.library_dir = library_dir
self.common_dir = None
self.election_duration = None
self.key_generator = os.path.join(binary_dir, self.KEY_GEN)
self.share_script = os.path.join(binary_dir, self.SHARE_SCRIPT)
if not os.path.isfile(self.key_generator):
raise FileNotFoundError(
f"Could not find key generator script at '{self.key_generator}' - is binary directory set correctly?"
)
self.dbg_nodes = dbg_nodes
self.perf_nodes = perf_nodes
self.version = version
# Requires admin privileges
self.partitioner = (
infra.partitions.Partitioner(self) if init_partitioner else None
)
try:
os.remove("/tmp/vscode-gdb.sh")
except FileNotFoundError:
pass
for host in hosts:
self.create_node(host, version=self.version)
def _get_next_local_node_id(self):
next_node_id = self.next_node_id
self.next_node_id += 1
return next_node_id
def create_node(
self, host, binary_dir=None, library_dir=None, node_port=None, version=None
):
node_id = self._get_next_local_node_id()
debug = (
(str(node_id) in self.dbg_nodes) if self.dbg_nodes is not None else False
)
perf = (
(str(node_id) in self.perf_nodes) if self.perf_nodes is not None else False
)
node = infra.node.Node(
node_id,
host,
binary_dir or self.binary_dir,
library_dir or self.library_dir,
debug,
perf,
node_port=node_port,
version=version,
)
self.nodes.append(node)
return node
def _add_node(
self,
node,
lib_name,
args,
target_node=None,
recovery=False,
ledger_dir=None,
copy_ledger_read_only=True,
read_only_ledger_dir=None,
from_snapshot=True,
snapshot_dir=None,
):
forwarded_args = {
arg: getattr(args, arg)
for arg in infra.network.Network.node_args_to_forward
}
# Contact primary if no target node is set
if target_node is None:
target_node, _ = self.find_primary(
timeout=args.ledger_recovery_timeout if recovery else 3
)
LOG.info(f"Joining from target node {target_node.local_node_id}")
# Only retrieve snapshot from target node if the snapshot directory is not
# specified
if from_snapshot and snapshot_dir is None:
snapshot_dir = self.get_committed_snapshots(target_node)
committed_ledger_dir = None
current_ledger_dir = None
if from_snapshot:
if os.listdir(snapshot_dir):
LOG.info(f"Joining from snapshot directory: {snapshot_dir}")
                # Ledger directories are only retrieved from the target node
                # when joining from a snapshot and no directories were
                # specified. When joining without a snapshot, the primary node
                # retransmits the entire ledger anyway.
                current_ledger_dir = ledger_dir
                committed_ledger_dir = read_only_ledger_dir
if copy_ledger_read_only and read_only_ledger_dir is None:
current_ledger_dir, committed_ledger_dir = target_node.get_ledger(
include_read_only_dirs=True
)
else:
LOG.warning(
f"Attempting to join from snapshot but {snapshot_dir} is empty: defaulting to complete replay of transaction history"
)
else:
LOG.info(
"Joining without snapshot: complete transaction history will be replayed"
)
node.join(
lib_name=lib_name,
workspace=args.workspace,
label=args.label,
common_dir=self.common_dir,
target_rpc_address=f"{target_node.get_public_rpc_host()}:{target_node.rpc_port}",
snapshot_dir=snapshot_dir,
ledger_dir=current_ledger_dir,
read_only_ledger_dir=committed_ledger_dir,
**forwarded_args,
)
        # If the network is opening, nodes are trusted without consortium approval
if self.status == ServiceStatus.OPENING:
try:
node.wait_for_node_to_join(timeout=JOIN_TIMEOUT)
except TimeoutError:
LOG.error(f"New node {node.local_node_id} failed to join the network")
raise
def _start_all_nodes(
self,
args,
recovery=False,
ledger_dir=None,
read_only_ledger_dir=None,
snapshot_dir=None,
):
hosts = self.hosts
if not args.package:
raise ValueError("A package name must be specified.")
self.status = ServiceStatus.OPENING
LOG.info("Opening CCF service on {}".format(hosts))
forwarded_args = {
arg: getattr(args, arg)
for arg in infra.network.Network.node_args_to_forward
}
for i, node in enumerate(self.nodes):
try:
if i == 0:
if not recovery:
node.start(
lib_name=args.package,
workspace=args.workspace,
label=args.label,
common_dir=self.common_dir,
members_info=self.consortium.get_members_info(),
**forwarded_args,
)
else:
node.recover(
lib_name=args.package,
workspace=args.workspace,
label=args.label,
common_dir=self.common_dir,
ledger_dir=ledger_dir,
read_only_ledger_dir=read_only_ledger_dir,
snapshot_dir=snapshot_dir,
**forwarded_args,
)
self.wait_for_state(
node,
infra.node.State.PART_OF_PUBLIC_NETWORK.value,
timeout=args.ledger_recovery_timeout,
)
else:
# When a new service is started, initial nodes join without a snapshot
self._add_node(
node,
args.package,
args,
recovery=recovery,
ledger_dir=ledger_dir,
from_snapshot=snapshot_dir is not None,
read_only_ledger_dir=read_only_ledger_dir,
snapshot_dir=snapshot_dir,
)
except Exception:
LOG.exception("Failed to start node {}".format(node.local_node_id))
raise
self.election_duration = (
args.bft_view_change_timeout_ms / 1000
if args.consensus == "bft"
else args.raft_election_timeout_ms / 1000
) * 2
LOG.info("All nodes started")
        # At this point, recovery nodes might still be catching up and could
        # swamp the current primary, leaving it unable to serve user requests
primary, _ = self.find_primary(
timeout=args.ledger_recovery_timeout if recovery else 3
)
return primary
def _setup_common_folder(self, constitution):
LOG.info(f"Creating common folder: {self.common_dir}")
cmd = ["rm", "-rf", self.common_dir]
assert (
infra.proc.ccall(*cmd).returncode == 0
), f"Could not remove {self.common_dir} directory"
cmd = ["mkdir", "-p", self.common_dir]
assert (
infra.proc.ccall(*cmd).returncode == 0
), f"Could not create {self.common_dir} directory"
for fragment in constitution:
cmd = ["cp", fragment, self.common_dir]
assert (
infra.proc.ccall(*cmd).returncode == 0
), f"Could not copy governance {fragment} to {self.common_dir}"
# It is more convenient to create a symlink in the common directory than generate
# certs and keys in the top directory and move them across
cmd = ["ln", "-s", os.path.join(os.getcwd(), self.KEY_GEN), self.common_dir]
assert (
infra.proc.ccall(*cmd).returncode == 0
), f"Could not symlink {self.KEY_GEN} to {self.common_dir}"
def start_and_join(self, args):
"""
Starts a CCF network.
:param args: command line arguments to configure the CCF nodes.
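        Example (sketch, assuming ``args`` also carries ``package``,
        ``workspace``, ``label`` and the other forwarded node arguments):
            network.start_and_join(args)
            primary, backups = network.find_nodes()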
"""
self.common_dir = get_common_folder_name(args.workspace, args.label)
assert (
args.constitution
), "--constitution argument must be provided to start a network"
self._setup_common_folder(args.constitution)
        member_count = max(1, args.initial_member_count)
        initial_members_info = []
        for i in range(member_count):
            initial_members_info.append(
                (
                    i,
                    (i < args.initial_recovery_member_count),
                    {"is_operator": True}
                    if (i < args.initial_operator_count)
                    else None,
                )
            )
self.consortium = infra.consortium.Consortium(
self.common_dir,
self.key_generator,
self.share_script,
args.consensus,
initial_members_info,
args.participants_curve,
authenticate_session=not args.disable_member_session_auth,
)
        initial_users = [
            f"user{user_id}" for user_id in range(max(0, args.initial_user_count))
        ]
self.create_users(initial_users, args.participants_curve)
primary = self._start_all_nodes(args)
self.wait_for_all_nodes_to_commit(primary=primary)
LOG.success("All nodes joined network")
self.consortium.activate(self.find_random_node())
if args.js_app_bundle:
self.consortium.set_js_app(
remote_node=self.find_random_node(), app_bundle_path=args.js_app_bundle
)
for path in args.jwt_issuer:
self.consortium.set_jwt_issuer(
remote_node=self.find_random_node(), json_path=path
)
if self.jwt_issuer:
self.jwt_issuer.register(self)
self.consortium.add_users_and_transition_service_to_open(
self.find_random_node(), initial_users
)
self.status = ServiceStatus.OPEN
LOG.info(f"Initial set of users added: {len(initial_users)}")
LOG.success("***** Network is now open *****")
def start_in_recovery(
self,
args,
ledger_dir,
committed_ledger_dir=None,
snapshot_dir=None,
common_dir=None,
):
"""
Starts a CCF network in recovery mode.
:param args: command line arguments to configure the CCF nodes.
        :param ledger_dir: main ledger directory to recover from.
        :param committed_ledger_dir: optional read-only directory of committed ledger files.
        :param snapshot_dir: snapshot directory to recover from.
:param common_dir: common directory containing member and user keys and certs.
"""
self.common_dir = common_dir or get_common_folder_name(
args.workspace, args.label
)
ledger_dirs = [ledger_dir]
if committed_ledger_dir:
ledger_dirs.append(committed_ledger_dir)
ledger = Ledger(ledger_dirs, committed_only=False)
public_state, _ = ledger.get_latest_public_state()
primary = self._start_all_nodes(
args,
recovery=True,
ledger_dir=ledger_dir,
read_only_ledger_dir=committed_ledger_dir,
snapshot_dir=snapshot_dir,
)
# If a common directory was passed in, initialise the consortium from it
if common_dir is not None:
self.consortium = infra.consortium.Consortium(
common_dir,
self.key_generator,
self.share_script,
args.consensus,
public_state=public_state,
)
for node in self.get_joined_nodes():
self.wait_for_state(
node,
infra.node.State.PART_OF_PUBLIC_NETWORK.value,
timeout=args.ledger_recovery_timeout,
)
# Catch-up in recovery can take a long time, so extend this timeout
self.wait_for_all_nodes_to_commit(primary=primary, timeout=20)
LOG.success("All nodes joined public network")
def recover(self, args):
"""
Recovers a CCF network previously started in recovery mode.
:param args: command line arguments to configure the CCF nodes.
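        Typical end-to-end recovery (sketch; ``ledger_dir`` and
        ``committed_ledger_dir`` come from a node of the defunct service):
            recovered_network = infra.network.Network(
                network.hosts, existing_network=network
            )
            recovered_network.start_in_recovery(args, ledger_dir, committed_ledger_dir)
            recovered_network.recover(args)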
"""
self.consortium.check_for_service(
self.find_random_node(), status=ServiceStatus.OPENING
)
self.consortium.wait_for_all_nodes_to_be_trusted(
self.find_random_node(), self.nodes
)
self.consortium.transition_service_to_open(self.find_random_node())
self.consortium.recover_with_shares(self.find_random_node())
for node in self.get_joined_nodes():
self.wait_for_state(
node,
infra.node.State.PART_OF_NETWORK.value,
timeout=args.ledger_recovery_timeout,
)
self._wait_for_app_open(node)
self.consortium.check_for_service(self.find_random_node(), ServiceStatus.OPEN)
LOG.success("***** Recovered network is now open *****")
def ignore_errors_on_shutdown(self):
self.ignoring_shutdown_errors = True
def stop_all_nodes(self, skip_verification=False, verbose_verification=False):
if not skip_verification:
# Verify that all txs committed on the service can be read
if self.txs is not None:
log_capture = None if verbose_verification else []
self.txs.verify(self, log_capture=log_capture)
if verbose_verification:
flush_info(log_capture, None)
fatal_error_found = False
for node in self.nodes:
_, fatal_errors = node.stop()
if fatal_errors:
fatal_error_found = True
LOG.info("All nodes stopped")
if not skip_verification:
longest_ledger_seqno = 0
most_up_to_date_node = None
committed_ledger_dirs = {}
for node in self.nodes:
# Find stopped node with longest ledger
_, committed_ledger_dir = node.get_ledger(include_read_only_dirs=True)
ledger_end_seqno = 0
for ledger_file in os.listdir(committed_ledger_dir):
end_seqno = infra.node.get_committed_ledger_end_seqno(ledger_file)
if end_seqno > ledger_end_seqno:
ledger_end_seqno = end_seqno
if ledger_end_seqno > longest_ledger_seqno:
longest_ledger_seqno = ledger_end_seqno
most_up_to_date_node = node
committed_ledger_dirs[node.local_node_id] = [
committed_ledger_dir,
ledger_end_seqno,
]
# Verify that all ledger files on stopped nodes exist on most up-to-date node
# and are identical
if most_up_to_date_node:
longest_ledger_dir, _ = committed_ledger_dirs[
most_up_to_date_node.local_node_id
]
                for node_id, (committed_ledger_dir, _) in (
                    item
                    for item in committed_ledger_dirs.items()
                    if item[0] != most_up_to_date_node.local_node_id
                ):
for ledger_file in os.listdir(committed_ledger_dir):
if ledger_file not in os.listdir(longest_ledger_dir):
raise Exception(
f"Ledger file on node {node_id} does not exist on most up-to-date node {most_up_to_date_node.local_node_id}: {ledger_file}"
)
if infra.path.compute_file_checksum(
os.path.join(longest_ledger_dir, ledger_file)
) != infra.path.compute_file_checksum(
os.path.join(committed_ledger_dir, ledger_file)
):
                            raise Exception(
                                f"Ledger file checksums between node {node_id} and most up-to-date node {most_up_to_date_node.local_node_id} did not match: {ledger_file}"
                            )
LOG.success(
f"Verified ledger files consistency on all {len(self.nodes)} stopped nodes"
)
if fatal_error_found:
if self.ignoring_shutdown_errors:
LOG.warning("Ignoring shutdown errors")
else:
raise NodeShutdownError("Fatal error found during node shutdown")
def join_node(
self, node, lib_name, args, target_node=None, timeout=JOIN_TIMEOUT, **kwargs
):
self._add_node(node, lib_name, args, target_node, **kwargs)
primary, _ = self.find_primary()
try:
self.consortium.wait_for_node_to_exist_in_store(
primary,
node.node_id,
timeout=timeout,
node_status=(
NodeStatus.PENDING
if self.status == ServiceStatus.OPEN
else NodeStatus.TRUSTED
),
)
except TimeoutError as e:
LOG.error(f"New pending node {node.node_id} failed to join the network")
errors, _ = node.stop()
self.nodes.remove(node)
if errors:
                # Raise more precise exceptions if known errors are found in the node's output
for error in errors:
if "Quote does not contain known enclave measurement" in error:
raise CodeIdNotFound from e
if "StartupSnapshotIsOld" in error:
raise StartupSnapshotIsOld from e
raise
def trust_node(self, node, args):
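        """
        Mark a pending node as TRUSTED (via the consortium when the service is
        already open) and wait for it to join the network. Typical flow (sketch):
            new_node = network.create_node(host)
            network.join_node(new_node, args.package, args)
            network.trust_node(new_node, args)
        """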
primary, _ = self.find_primary()
try:
if self.status is ServiceStatus.OPEN:
self.consortium.trust_node(
primary,
node.node_id,
timeout=ceil(args.join_timer * 2 / 1000),
)
# Here, quote verification has already been run when the node
# was added as pending. Only wait for the join timer for the
# joining node to retrieve network secrets.
node.wait_for_node_to_join(timeout=ceil(args.join_timer * 2 / 1000))
except (ValueError, TimeoutError):
LOG.error(f"New trusted node {node.node_id} failed to join the network")
node.stop()
raise
node.network_state = infra.node.NodeNetworkState.joined
self.wait_for_all_nodes_to_commit(primary=primary)
def retire_node(self, remote_node, node_to_retire):
self.consortium.retire_node(remote_node, node_to_retire)
self.nodes.remove(node_to_retire)
def create_user(self, local_user_id, curve, record=True):
infra.proc.ccall(
self.key_generator,
"--name",
local_user_id,
"--curve",
f"{curve.name}",
path=self.common_dir,
log_output=False,
).check_returncode()
with open(
os.path.join(self.common_dir, f"{local_user_id}_cert.pem"), encoding="utf-8"
) as c:
service_user_id = infra.crypto.compute_cert_der_hash_hex_from_pem(c.read())
new_user = UserInfo(local_user_id, service_user_id)
if record:
self.users.append(new_user)
return new_user
def create_users(self, local_user_ids, curve):
for local_user_id in local_user_ids:
self.create_user(local_user_id, curve)
def get_members(self):
return self.consortium.members
def get_joined_nodes(self):
return [node for node in self.nodes if node.is_joined()]
def wait_for_state(self, node, state, timeout=3):
end_time = time.time() + timeout
while time.time() < end_time:
try:
with node.client(connection_timeout=timeout) as c:
r = c.get("/node/state")
if r.body.json()["state"] == state:
break
except ConnectionRefusedError:
pass
time.sleep(0.1)
else:
raise TimeoutError(
f"Timed out waiting for state {state} on node {node.node_id}"
)
if state == infra.node.State.PART_OF_NETWORK.value:
self.status = ServiceStatus.OPEN
def _wait_for_app_open(self, node, timeout=3):
end_time = time.time() + timeout
while time.time() < end_time:
# As an operator, query a well-known /app endpoint to find out
# if the app has been opened to users
with node.client() as c:
r = c.get("/app/commit")
                if r.status_code != http.HTTPStatus.NOT_FOUND.value:
return
time.sleep(0.1)
raise TimeoutError(f"Application frontend was not open after {timeout}s")
def _get_node_by_service_id(self, node_id):
return next((node for node in self.nodes if node.node_id == node_id), None)
def find_primary(self, nodes=None, timeout=3, log_capture=None):
"""
Find the identity of the primary in the network and return its identity
and the current view.
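        Example:
            primary, view = network.find_primary()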
"""
primary_id = None
view = None
logs = []
asked_nodes = nodes or self.get_joined_nodes()
end_time = time.time() + timeout
while time.time() < end_time:
for node in asked_nodes:
with node.client() as c:
try:
logs = []
res = c.get("/node/network", timeout=1, log_capture=logs)
assert res.status_code == http.HTTPStatus.OK.value, res
body = res.body.json()
view = body["current_view"]
primary_id = body["primary_id"]
if primary_id is not None:
break
except Exception:
LOG.warning(
f"Could not successfully connect to node {node.local_node_id}. Retrying..."
)
if primary_id is not None:
break
time.sleep(0.1)
if primary_id is None:
flush_info(logs, log_capture, 0)
raise PrimaryNotFound
flush_info(logs, log_capture, 0)
return (self._get_node_by_service_id(primary_id), view)
def find_backups(self, primary=None, timeout=3):
if primary is None:
primary, _ = self.find_primary(timeout=timeout)
return [n for n in self.get_joined_nodes() if n != primary]
def find_any_backup(self, primary=None, timeout=3):
return random.choice(self.find_backups(primary=primary, timeout=timeout))
def find_node_by_role(self, role=NodeRole.ANY):
        role_ = (
            random.choice([NodeRole.PRIMARY, NodeRole.BACKUP])
            if role == NodeRole.ANY
            else role
        )
if role_ == NodeRole.PRIMARY:
return self.find_primary()[0]
else:
return self.find_any_backup()
def find_random_node(self):
return random.choice(self.get_joined_nodes())
def find_nodes(self, timeout=3):
primary, _ = self.find_primary(timeout=timeout)
backups = self.find_backups(primary=primary, timeout=timeout)
return primary, backups
def find_primary_and_any_backup(self, timeout=3):
primary, backups = self.find_nodes(timeout)
backup = random.choice(backups)
return primary, backup
def wait_for_all_nodes_to_commit(self, primary=None, tx_id=None, timeout=10):
"""
Wait for all nodes to have joined the network and committed all transactions
executed on the primary.
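        Example:
            primary, _ = network.find_primary()
            network.wait_for_all_nodes_to_commit(primary=primary)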
"""
if not (primary or tx_id):
raise ValueError("Either a valid TxID or primary node should be specified")
end_time = time.time() + timeout
# If no TxID is specified, retrieve latest readable one
        if tx_id is None:
while time.time() < end_time:
with primary.client() as c:
resp = c.get(
"/node/network/nodes/self"
) # Well-known read-only endpoint
tx_id = TxID(resp.view, resp.seqno)
if tx_id.valid():
break
time.sleep(0.1)
assert (
tx_id.valid()
), f"Primary {primary.node_id} has not made any progress yet ({tx_id})"
caught_up_nodes = []
logs = {}
while time.time() < end_time:
caught_up_nodes = []
for node in self.get_joined_nodes():
with node.client() as c:
logs[node.node_id] = []
resp = c.get(
f"/node/local_tx?transaction_id={tx_id}",
log_capture=logs[node.node_id],
)
                    if resp.status_code != http.HTTPStatus.OK.value:
                        # Node may not have joined the network yet, try again
                        break
                    status = TxStatus(resp.body.json()["status"])
                    if status == TxStatus.Committed:
                        caught_up_nodes.append(node)
                    elif status == TxStatus.Invalid:
                        flush_info(logs[node.node_id], None, 0)
                        raise RuntimeError(
                            f"Node {node.node_id} reports transaction ID {tx_id} is invalid and will never be committed"
                        )
if len(caught_up_nodes) == len(self.get_joined_nodes()):
break
time.sleep(0.1)
for lines in logs.values():
flush_info(lines, None, 0)
assert len(caught_up_nodes) == len(
self.get_joined_nodes()
), f"Only {len(caught_up_nodes)} (out of {len(self.get_joined_nodes())}) nodes have joined the network"
def wait_for_node_commit_sync(self, timeout=3):
"""
        Wait for commit level to get in sync on all nodes. This is expected to
        happen once consensus is established, in the absence of new transactions.
"""
end_time = time.time() + timeout
while time.time() < end_time:
commits = []
for node in self.get_joined_nodes():
with node.client() as c:
r = c.get("/node/commit")
assert r.status_code == http.HTTPStatus.OK.value
body = r.body.json()
commits.append(body["transaction_id"])
if [commits[0]] * len(commits) == commits:
break
time.sleep(0.1)
        expected = [commits[0]] * len(commits)
        if expected != commits:
            for node in self.get_joined_nodes():
                with node.client() as c:
                    r = c.get("/node/consensus")
                    pprint.pprint(r.body.json())
        assert expected == commits, f"Multiple commit values: {commits}"
def wait_for_new_primary(self, old_primary, nodes=None, timeout_multiplier=2):
        # We arbitrarily pick twice the election duration to protect against
        # the not-so-rare case where the first election round fails (short
        # timeouts are particularly susceptible to this)
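        # For example, with raft_election_timeout_ms=4000, election_duration is
        # 8s (see _start_all_nodes), so the default wait here is 16s.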
timeout = self.election_duration * timeout_multiplier
LOG.info(
f"Waiting up to {timeout}s for a new primary different from {old_primary.local_node_id} ({old_primary.node_id}) to be elected..."
)
end_time = time.time() + timeout
error = TimeoutError
logs = []
backup = self.find_any_backup(old_primary)
if backup.get_consensus() == "bft":
try:
with backup.client("user0") as c:
_ = c.post(
"/app/log/private",
{"id": -1, "msg": "This is submitted to force a view change"},
)
time.sleep(1)
except CCFConnectionException:
LOG.warning(f"Could not successfully connect to node {backup.node_id}.")
while time.time() < end_time:
try:
logs = []
new_primary, new_term = self.find_primary(nodes=nodes, log_capture=logs)
if new_primary.node_id != old_primary.node_id:
flush_info(logs, None)
LOG.info(
f"New primary is {new_primary.local_node_id} ({new_primary.node_id}) in term {new_term}"
)
return (new_primary, new_term)
except PrimaryNotFound:
error = PrimaryNotFound
except Exception:
pass
time.sleep(0.1)
flush_info(logs, None)
raise error(f"A new primary was not elected after {timeout} seconds")
def wait_for_new_primary_in(
self, expected_node_ids, nodes=None, timeout_multiplier=2
):
        # We arbitrarily pick twice the election duration to protect against
        # the not-so-rare case where the first election round fails (short
        # timeouts are particularly susceptible to this)
timeout = self.election_duration * timeout_multiplier
LOG.info(
f"Waiting up to {timeout}s for a new primary in {expected_node_ids} to be elected..."
)
end_time = time.time() + timeout
error = TimeoutError
logs = []
while time.time() < end_time:
try:
logs = []
new_primary, new_term = self.find_primary(nodes=nodes, log_capture=logs)
if new_primary.node_id in expected_node_ids:
flush_info(logs, None)
LOG.info(
f"New primary is {new_primary.local_node_id} ({new_primary.node_id}) in term {new_term}"
)
return (new_primary, new_term)
except PrimaryNotFound:
error = PrimaryNotFound
except Exception:
pass
time.sleep(0.1)
flush_info(logs, None)
raise error(f"A new primary was not elected after {timeout} seconds")
def wait_for_commit_proof(self, node, seqno, timeout=3):
        # Wait until the target seqno has a commit proof on a specific node.
        # This is achieved by first waiting for a commit over seqno, issuing
        # a write request and then waiting for a commit over that
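        # Rationale (sketch): once the later write is committed, a signature
        # transaction covering seqno has been globally committed, which acts
        # as its commit proof.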
end_time = time.time() + timeout
while time.time() < end_time:
with node.client() as c:
r = c.get("/node/commit")
current_tx = TxID.from_str(r.body.json()["transaction_id"])
if current_tx.seqno >= seqno:
with node.client(
self.consortium.get_any_active_member().local_id
) as nc:
                        # Using update_state_digest here as a convenient
                        # app-agnostic write tx
r = nc.post("/gov/ack/update_state_digest")
assert (
r.status_code == http.HTTPStatus.OK.value
), f"Error ack/update_state_digest: {r}"
c.wait_for_commit(r)
return True
time.sleep(0.1)
raise TimeoutError(f"seqno {seqno} did not have commit proof after {timeout}s")
def wait_for_snapshot_committed_for(self, seqno, timeout=3):
# Check that snapshot exists for target seqno and if so, wait until
# snapshot evidence has commit proof (= commit rule for snapshots)
snapshot_evidence_seqno = None
primary, _ = self.find_primary()
for s in os.listdir(primary.get_snapshots()):
if infra.node.get_snapshot_seqnos(s)[0] > seqno:
snapshot_evidence_seqno = infra.node.get_snapshot_seqnos(s)[1]
if snapshot_evidence_seqno is None:
return False
return self.wait_for_commit_proof(primary, snapshot_evidence_seqno, timeout)
def get_committed_snapshots(self, node):
# Wait for all available snapshot files to be committed before
# copying snapshot directory, so that we always use the latest snapshot
def wait_for_snapshots_to_be_committed(src_dir, list_src_dir_func, timeout=6):
end_time = time.time() + timeout
committed = True
uncommitted_snapshots = []
while time.time() < end_time:
committed = True
uncommitted_snapshots = []
for f in list_src_dir_func(src_dir):
is_committed = infra.node.is_file_committed(f)
if not is_committed:
self.wait_for_commit_proof(
node, infra.node.get_snapshot_seqnos(f)[1]
)
uncommitted_snapshots.append(f)
committed &= is_committed
if committed:
break
time.sleep(0.1)
if not committed:
LOG.error(
f"Error: Not all snapshots were committed after {timeout}s in {src_dir}: {uncommitted_snapshots}"
)
return committed
return node.get_committed_snapshots(wait_for_snapshots_to_be_committed)
def _get_ledger_public_view_at(self, node, call, seqno, timeout):
end_time = time.time() + timeout
while time.time() < end_time:
try:
return call(seqno)
except Exception:
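                # Assumption: creating and withdrawing a large proposal forces
                # additional writes so that the target seqno is eventually
                # flushed to a readable ledger file.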
self.consortium.create_and_withdraw_large_proposal(node)
time.sleep(0.1)
raise TimeoutError(
f"Could not read transaction at seqno {seqno} from ledger {node.remote.ledger_paths()}"
)
def get_ledger_public_state_at(self, seqno, timeout=5):
primary, _ = self.find_primary()
return self._get_ledger_public_view_at(
primary, primary.get_ledger_public_tables_at, seqno, timeout
)
def get_latest_ledger_public_state(self, timeout=5):
primary, _ = self.find_primary()
with primary.client() as nc:
resp = nc.get("/node/commit")
body = resp.body.json()
tx_id = TxID.from_str(body["transaction_id"])
return self._get_ledger_public_view_at(
primary, primary.get_ledger_public_state_at, tx_id.seqno, timeout
)
@contextmanager
def network(
hosts,
binary_directory=".",
dbg_nodes=None,
perf_nodes=None,
pdb=False,
txs=None,
jwt_issuer=None,
library_directory=".",
init_partitioner=False,
version=None,
):
"""
Context manager for Network class.
:param hosts: a list of hostnames (localhost or remote hostnames)
:param binary_directory: the directory where CCF's binaries are located
:param library_directory: the directory where CCF's libraries are located
    :param dbg_nodes: default: []. List of node IDs that will not be started automatically (the user is prompted to start them manually)
    :param perf_nodes: default: []. List of node IDs that will run under perf record
    :param pdb: default: False. If True, drop into the pdb debugger when an exception is raised in the managed block
:param txs: default: None. Transactions committed on that network.
:return: a Network instance that can be used to create/access nodes, handle the genesis state (add members, create
node.json), and stop all the nodes that belong to the network
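    Example (sketch, assuming ``hosts`` and an ``args`` namespace from the
    test harness):
        with network(hosts, args.binary_dir) as net:
            net.start_and_join(args)
            primary, backups = net.find_nodes()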
"""
if dbg_nodes is None:
dbg_nodes = []
if perf_nodes is None:
perf_nodes = []
net = Network(
hosts=hosts,
binary_dir=binary_directory,
library_dir=library_directory,
dbg_nodes=dbg_nodes,
perf_nodes=perf_nodes,
txs=txs,
jwt_issuer=jwt_issuer,
init_partitioner=init_partitioner,
version=version,
)
try:
yield net
except Exception:
# Don't try to verify txs on Exception path
net.txs = None
if pdb:
import pdb
# pylint: disable=forgotten-debug-statement
pdb.set_trace()
else:
raise
finally:
LOG.info("Stopping network")
net.stop_all_nodes(skip_verification=True)
if init_partitioner:
net.partitioner.cleanup()