# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the Apache 2.0 License.
import infra.e2e_args
import infra.network
import infra.proc
import infra.net
import infra.interfaces
import infra.proposal
import infra.logging_app as app
from infra.tx_status import TxStatus
import suite.test_requirements as reqs
import tempfile
from shutil import copy
from copy import deepcopy
import os
import time
import ccf.ledger
import json
import infra.crypto
from datetime import datetime
from infra.checker import check_can_progress
from governance_history import check_signatures
from infra.snp import IS_SNP
from infra.runner import ConcurrentRunner
import http
import random
from loguru import logger as LOG
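

# Helpers used by the retirement tests below: node_configs collects the /node/config
# view reported by each reachable joined node, and count_nodes checks that all nodes
# agree on the current (non-stopped) membership.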
def node_configs(network):
    configs = {}
    for node in network.get_joined_nodes():
        try:
            with node.client() as nc:
                configs[node.node_id] = nc.get("/node/config").body.json()
        except Exception:
            pass
    return configs


def count_nodes(configs, network):
    nodes = set(str(k) for k in configs.keys())
    stopped = {str(n.node_id) for n in network.nodes if n.is_stopped()}
    for node_id, node_config in configs.items():
        nodes_in_config = set(node_config.keys()) - stopped
        assert nodes == nodes_in_config, f"{nodes} {nodes_in_config} {node_id}"
    return len(nodes)
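

# Polls /node/consensus on every joined node until all of them report a single
# configuration with a common reconfiguration id (rid), or until the timeout expires.
# Retiring or still-joining nodes may be temporarily unreachable, which is tolerated.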
def wait_for_reconfiguration_to_complete(network, timeout=10):
    max_num_configs = 0
    max_rid = 0
    all_same_rid = False
    end_time = time.time() + timeout
    while max_num_configs > 1 or not all_same_rid:
        max_num_configs = 0
        all_same_rid = True
        for node in network.get_joined_nodes():
            with node.client(verify_ca=False) as c:
                try:
                    r = c.get("/node/consensus")
                    rj = r.body.json()
                    cfgs = rj["details"]["configs"]
                    num_configs = len(cfgs)
                    max_num_configs = max(max_num_configs, num_configs)
                    if num_configs == 1 and cfgs[0]["rid"] != max_rid:
                        max_rid = max(max_rid, cfgs[0]["rid"])
                        all_same_rid = False
                except Exception as ex:
                    # OK, retiring node may be gone or a joining node may not be ready yet
                    LOG.info(f"expected RPC failure because of: {ex}")
        time.sleep(0.5)
        LOG.info(f"max num configs: {max_num_configs}, max rid: {max_rid}")
        assert time.time() <= end_time, "Reconfiguration did not complete in time"


@reqs.description("Adding a node with invalid target service certificate")
def test_add_node_invalid_service_cert(network, args):
    primary, _ = network.find_primary()
    # Incorrect target service certificate file, in this case the primary's node
    # identity
    service_cert_file = os.path.join(primary.common_dir, f"{primary.local_node_id}.pem")
    new_node = network.create_node("local://localhost")
    try:
        network.join_node(
            new_node,
            args.package,
            args,
            service_cert_file=service_cert_file,
            timeout=3,
            stop_on_error=True,
        )
    except infra.network.ServiceCertificateInvalid:
        LOG.info(
            f"Node {new_node.local_node_id} with invalid service certificate failed to start, as expected"
        )
    else:
        assert (
            False
        ), f"Node {new_node.local_node_id} with invalid service certificate unexpectedly started"
    return network


@reqs.description("Adding a valid node")
def test_add_node(network, args, from_snapshot=True):
    # Note: host is supplied explicitly to avoid having differently
    # assigned IPs for the interfaces, something which the test infra doesn't
    # support widely yet.
    operator_rpc_interface = "operator_rpc_interface"
    host = infra.net.expand_localhost()
    new_node = network.create_node(
        infra.interfaces.HostSpec(
            rpc_interfaces={
                infra.interfaces.PRIMARY_RPC_INTERFACE: infra.interfaces.RPCInterface(
                    host=host
                ),
                operator_rpc_interface: infra.interfaces.RPCInterface(
                    host=host,
                    endorsement=infra.interfaces.Endorsement(
                        authority=infra.interfaces.EndorsementAuthority.Node
                    ),
                ),
            }
        )
    )
    network.join_node(new_node, args.package, args, from_snapshot=from_snapshot)

    # Verify self-signed node certificate validity period
    new_node.verify_certificate_validity_period(interface_name=operator_rpc_interface)

    network.trust_node(
        new_node,
        args,
        validity_period_days=args.maximum_node_certificate_validity_days // 2,
    )

    if not from_snapshot:
        with new_node.client() as c:
            s = c.get("/node/state")
            assert s.body.json()["node_id"] == new_node.node_id
            assert (
                s.body.json()["startup_seqno"] == 0
            ), "Node started without snapshot but reports startup seqno != 0"

    # Now that the node is trusted, verify endorsed certificate validity period
    new_node.verify_certificate_validity_period()
    return network


@reqs.description("Adding a node with an invalid certificate validity period")
def test_add_node_invalid_validity_period(network, args):
    new_node = network.create_node("local://localhost")
    network.join_node(new_node, args.package, args)
    try:
        network.trust_node(
            new_node,
            args,
            validity_period_days=args.maximum_node_certificate_validity_days + 1,
        )
    except infra.proposal.ProposalNotAccepted:
        LOG.info(
            "As expected, node could not be trusted since its certificate validity period is invalid"
        )
    else:
        raise AssertionError(
            "Node should not be trusted if its certificate validity period is invalid"
        )
    return network


def test_add_node_on_other_curve(network, args):
    original_curve = args.curve_id
    args.curve_id = (
        infra.network.EllipticCurve.secp256r1
        if original_curve is None
        else original_curve.next()
    )
    network = test_add_node(network, args)
    args.curve_id = original_curve
    return network


@reqs.description("Changing curve used for identity of new nodes and new services")
def test_change_curve(network, args):
    # NB: This doesn't actually test things, it just changes the configuration
    # for future tests. Expects to be part of an interesting suite
    original_curve = args.curve_id
    args.curve_id = (
        infra.network.EllipticCurve.secp256r1
        if original_curve is None
        else original_curve.next()
    )
    return network


@reqs.description("Adding a valid node from a backup")
@reqs.at_least_n_nodes(2)
def test_add_node_from_backup(network, args):
    new_node = network.create_node("local://localhost")
    network.join_node(
        new_node,
        args.package,
        args,
        target_node=network.find_any_backup(),
    )
    network.trust_node(new_node, args)
    return network


@reqs.description("Adding a node with endorsements retrieved from remote server")
def test_add_node_endorsements_endpoints(network, args):
    # By default, SEV-SNP endorsements are retrieved from the environment on ACI.
    # However, we still want to support fetching those from a remote server, which is
    # tested here.
    primary, _ = network.find_primary()
    if not IS_SNP:
        LOG.warning("Skipping test as running on non SEV-SNP")
        return network

    args_copy = deepcopy(args)
    test_vectors = [
        (["Azure:global.acccache.azure.net"], True),
        (["AMD:kdsintf.amd.com"], True),
        (["AMD:invalid.amd.com"], False),
        (["Azure:invalid.azure.com", "AMD:kdsintf.amd.com"], True),  # Fallback server
    ]
    for servers, expected_result in test_vectors:
        LOG.info(
            f"Joining new node with endorsement server {servers} (expect success: {expected_result})"
        )
        new_node = network.create_node("local://localhost")
        args_copy.snp_endorsements_servers = servers
        try:
            network.join_node(
                new_node,
                args.package,
                args_copy,
                set_snp_report_endorsements_envvar=None,
                timeout=15,
            )
        except TimeoutError:
            assert not expected_result
            LOG.info(
                f"Node with invalid quote endorsement servers {servers} could not join, as expected"
            )
        else:
            assert (
                expected_result
            ), f"Node with invalid quote endorsement servers joined unexpectedly: {servers}"
            network.retire_node(primary, new_node)
        new_node.stop()
    return network


@reqs.description("Adding a valid node from snapshot")
@reqs.at_least_n_nodes(2)
def test_add_node_from_snapshot(network, args, copy_ledger=True, from_backup=False):
    # Before adding the node from a snapshot, override at least one app entry
    # and wait for a new committed snapshot covering that entry, so that there
    # is at least one historical entry to verify.
    network.txs.issue(network, number_txs=1)
    idx, historical_entry = network.txs.get_last_tx(priv=True)
    network.txs.issue(network, number_txs=1, repeat=True)

    new_node = network.create_node("local://localhost")
    network.join_node(
        new_node,
        args.package,
        args,
        copy_ledger=copy_ledger,
        target_node=network.find_any_backup() if from_backup else None,
        from_snapshot=True,
    )
    network.trust_node(new_node, args)
    with new_node.client() as c:
        r = c.get("/node/state")
        assert (
            r.body.json()["startup_seqno"] != 0
        ), "Node started from snapshot but reports startup seqno of 0"

    # Finally, verify all app entries on the new node, skipping historical
    # entries if the ledger was not copied to the node.
    network.txs.verify(node=new_node, include_historical=copy_ledger)

    # Check that the historical entry can be retrieved (or not, if the new node
    # does not have access to historical ledger files).
    try:
        network.txs.verify_tx(
            node=new_node,
            idx=idx,
            msg=historical_entry["msg"],
            seqno=historical_entry["seqno"],
            view=historical_entry["view"],
            historical=True,
        )
    except infra.logging_app.LoggingTxsVerifyException:
        assert (
            not copy_ledger
        ), f"New node {new_node.local_node_id} without ledger should not be able to serve historical entries"
    else:
        assert (
            copy_ledger
        ), f"New node {new_node.local_node_id} with ledger should be able to serve historical entries"

    if not copy_ledger:
        # Pick some sequence numbers before the snapshot the new node started from, and for which
        # the new node does not have corresponding ledger chunks
        missing_txids = []
        with new_node.client("user0") as c:
            r = c.get("/node/state")
            assert r.status_code == http.HTTPStatus.OK, r
            startup_seqno = r.body.json()["startup_seqno"]
            assert startup_seqno != 0, startup_seqno
            possible_seqno_range = range(1, startup_seqno)
            num_samples = min(len(possible_seqno_range), 5)
            missing_seqnos = sorted(random.sample(possible_seqno_range, num_samples))
            LOG.info(f"Verifying status of transactions at seqnos: {missing_seqnos}")
            view = 2
            for seqno in missing_seqnos:
                assert seqno != 0, "0 is not a valid seqno"
                status = TxStatus.Invalid
                while status == TxStatus.Invalid:
                    r = c.get(f"/node/tx?transaction_id={view}.{seqno}")
                    assert r.status_code == http.HTTPStatus.OK, r
                    status = TxStatus(r.body.json()["status"])
                    if status == TxStatus.Committed:
                        missing_txids.append(f"{view}.{seqno}")
                    else:
                        # Should never happen, because we're looking at seqnos covered
                        # by a committed snapshot, which are therefore definitely committed.
                        assert status != TxStatus.Pending, status
                        view += 1
                        # Not likely to happen on purpose
                        assert view < 10, view

        LOG.info("Check historical queries return ACCEPTED")
        with new_node.client("user0") as c:
            for txid in missing_txids:
                # New node knows transactions are committed
                rc = c.get(f"/node/tx?transaction_id={txid}")
                status = TxStatus(rc.body.json()["status"])
                assert status == TxStatus.Committed
                # But can't read their contents
                rc = c.get(f"/app/receipt?transaction_id={txid}")
                assert rc.status_code == http.HTTPStatus.ACCEPTED, rc
                time.sleep(3)
                # Not even after giving the host enough time
                rc = c.get(f"/app/receipt?transaction_id={txid}")
                assert rc.status_code == http.HTTPStatus.ACCEPTED, rc

    primary, _ = network.find_primary()
    network.retire_node(primary, new_node)
    return network


@reqs.description("Adding as many pending nodes as current number of nodes")
@reqs.supports_methods("/app/log/private")
def test_add_as_many_pending_nodes(network, args):
    # Killing pending nodes should not change the Raft consensus rules
    primary, _ = network.find_primary()
    number_new_nodes = len(network.nodes)
    LOG.info(
        f"Adding {number_new_nodes} pending nodes - consensus rules should not change"
    )
    new_nodes = []
    for _ in range(number_new_nodes):
        new_node = network.create_node("local://localhost")
        network.join_node(new_node, args.package, args)
        new_nodes.append(new_node)

    for new_node in new_nodes:
        new_node.stop()

    # Even though pending nodes (half the number of nodes) are stopped,
    # the service can still make progress
    check_can_progress(primary)

    # Cleanup killed pending nodes
    for new_node in new_nodes:
        network.retire_node(primary, new_node)

    wait_for_reconfiguration_to_complete(network)
    return network


@reqs.description("Retiring a backup")
@reqs.at_least_n_nodes(2)
@reqs.can_kill_n_nodes(1)
def test_retire_backup(network, args):
    primary, _ = network.find_primary()
    backup_to_retire = network.find_any_backup()
    network.retire_node(primary, backup_to_retire)
    backup_to_retire.stop()
    check_can_progress(primary)
    wait_for_reconfiguration_to_complete(network)
    return network


@reqs.description("Retiring the primary")
@reqs.can_kill_n_nodes(1)
def test_retire_primary(network, args):
    pre_count = count_nodes(node_configs(network), network)
    primary, backup = network.find_primary_and_any_backup()
    network.retire_node(primary, primary, timeout=15)
    # Query this backup to find the new primary. If we ask any other
    # node, then this backup may not know the new primary by the
    # time we call check_can_progress.
    new_primary, _ = network.wait_for_new_primary(primary, nodes=[backup])
    # See https://github.com/microsoft/CCF/issues/1713
    check_can_progress(new_primary)
    check_can_progress(backup)
    post_count = count_nodes(node_configs(network), network)
    assert pre_count == post_count + 1
    primary.stop()
    wait_for_reconfiguration_to_complete(network)
    return network


@reqs.description("Test node filtering by status")
def test_node_filter(network, args):
    primary, _ = network.find_primary_and_any_backup()
    with primary.client() as c:

        def get_nodes(status):
            r = c.get(f"/node/network/nodes?status={status}")
            nodes = r.body.json()["nodes"]
            # Primary may change during operation, so do not check for primary equality
            for node in nodes:
                del node["primary"]
            return sorted(nodes, key=lambda node: node["node_id"])

        trusted_before = get_nodes("Trusted")
        pending_before = get_nodes("Pending")
        retired_before = get_nodes("Retired")
        new_node = network.create_node("local://localhost")
        network.join_node(new_node, args.package, args, target_node=primary)
        trusted_after = get_nodes("Trusted")
        pending_after = get_nodes("Pending")
        retired_after = get_nodes("Retired")
        assert trusted_before == trusted_after, (trusted_before, trusted_after)
        assert len(pending_before) + 1 == len(pending_after), (
            pending_before,
            pending_after,
        )
        assert retired_before == retired_after, (retired_before, retired_after)
        assert all(info["status"] == "Trusted" for info in trusted_after), trusted_after
        assert all(info["status"] == "Pending" for info in pending_after), pending_after
        assert all(info["status"] == "Retired" for info in retired_after), retired_after
    return network


@reqs.description("Get node CCF version")
def test_version(network, args):
    if args.ccf_version is None:
        LOG.warning(
            "Skipping network version check as no expected version is specified"
        )
        return
    nodes = network.get_joined_nodes()
    for node in nodes:
        with node.client() as c:
            r = c.get("/node/version")
            assert r.body.json()["ccf_version"] == args.ccf_version
            assert r.body.json()["unsafe"] == os.path.exists(
                os.path.join(args.binary_dir, "UNSAFE")
            )


@reqs.description("Issue fake join requests as untrusted client")
def test_issue_fake_join(network, args):
    primary, _ = network.find_primary()
    # Assemble dummy join request body
    net = {"bind_address": "0:0"}
    req = {}
    req["node_info_network"] = {
        "node_to_node_interface": net,
        "rpc_interfaces": {"name": net},
    }
    req["consensus_type"] = "CFT"
    req["startup_seqno"] = 0
    with open(
        os.path.join(network.common_dir, "member0_enc_pubk.pem"), "r", encoding="utf-8"
    ) as f:
        req["public_encryption_key"] = f.read()

    with primary.client(identity="user0") as c:
        # First, retrieve real quote from primary node
        own_quote = c.get("/node/quotes/self").body.json()

        LOG.info("Join with SGX dummy quote")
        req["quote_info"] = {"format": "OE_SGX_v1", "quote": "", "endorsements": ""}
        r = c.post("/node/join", body=req)
        assert r.status_code == http.HTTPStatus.UNAUTHORIZED
        assert (
            r.body.json()["error"]["code"] == "InvalidQuote"
        ), "Quote verification should fail when OE_SGX_v1 is specified"

        LOG.info("Join with SGX real quote, but different TLS key")
        req["quote_info"] = {
            "format": "OE_SGX_v1",
            "quote": own_quote["raw"],
            "endorsements": own_quote["endorsements"],
        }
        r = c.post("/node/join", body=req)
        assert r.status_code == http.HTTPStatus.UNAUTHORIZED
        assert r.body.json()["error"]["code"] == "InvalidQuote"
        if args.enclave_platform != "sgx":
            assert r.body.json()["error"]["message"] == "Quote could not be verified"
        else:
            assert (
                r.body.json()["error"]["message"]
                == "Quote report data does not contain node's public key hash"
            )

        LOG.info("Join with AMD SEV-SNP quote")
        req["quote_info"] = {
            "format": "AMD_SEV_SNP_v1",
            "quote": own_quote["raw"],
            "endorsements": own_quote["endorsements"],
        }
        r = c.post("/node/join", body=req)
        if args.enclave_platform != "snp":
            assert r.status_code == http.HTTPStatus.UNAUTHORIZED
            assert r.body.json()["error"]["code"] == "InvalidQuote"
            assert r.body.json()["error"]["message"] == "Quote could not be verified"
        else:
            assert (
                r.body.json()["error"]["message"]
                == "Quote report data does not contain node's public key hash"
            )

        LOG.info("Join with virtual quote")
        req["quote_info"] = {
            "format": "Insecure_Virtual",
            "quote": "",
            "endorsements": "",
        }
        r = c.post("/node/join", body=req)
        if args.enclave_platform == "virtual":
            assert r.status_code == http.HTTPStatus.OK
            assert r.body.json()["node_status"] == ccf.ledger.NodeStatus.PENDING.value
        else:
            assert r.status_code == http.HTTPStatus.UNAUTHORIZED
            assert (
                r.body.json()["error"]["code"] == "InvalidQuote"
            ), "Virtual node must never join non-virtual network"
    return network


@reqs.description("Replace a node on the same addresses")
@reqs.can_kill_n_nodes(1)
def test_node_replacement(network, args):
    primary, backups = network.find_nodes()
    node_to_replace = backups[-1]
    LOG.info(f"Retiring node {node_to_replace.local_node_id}")
    network.retire_node(primary, node_to_replace)
    node_to_replace.stop()
    check_can_progress(primary)

    LOG.info("Adding one node on same address as retired node")
    replacement_node = network.create_node(
        f"local://{node_to_replace.get_public_rpc_host()}:{node_to_replace.get_public_rpc_port()}",
        node_port=node_to_replace.n2n_interface.port,
    )
    network.join_node(replacement_node, args.package, args)
    network.trust_node(replacement_node, args)

    assert replacement_node.node_id != node_to_replace.node_id
    assert (
        replacement_node.get_public_rpc_host() == node_to_replace.get_public_rpc_host()
    )
    assert replacement_node.n2n_interface.port == node_to_replace.n2n_interface.port
    assert (
        replacement_node.get_public_rpc_port() == node_to_replace.get_public_rpc_port()
    )

    allowed_to_suspend_count = network.get_f() - len(network.get_stopped_nodes())
    backups_to_suspend = backups[:allowed_to_suspend_count]
    LOG.info(
        f"Suspending {len(backups_to_suspend)} other nodes to make progress depend on the replacement"
    )
    for other_backup in backups_to_suspend:
        other_backup.suspend()
    # Confirm the network can make progress
    check_can_progress(primary)
    for other_backup in backups_to_suspend:
        other_backup.resume()
    return network


@reqs.description("Join straddling a primary retirement")
@reqs.at_least_n_nodes(3)
def test_join_straddling_primary_replacement(network, args):
    # We need a fourth node before we attempt the replacement, otherwise
    # we will reach a situation where two out of four nodes in the voting quorum
    # are unable to participate (one retired and one not yet joined).
    test_add_node(network, args)
    primary, _ = network.find_primary()
    new_node = network.create_node("local://localhost")
    network.join_node(new_node, args.package, args)
    proposal_body = {
        "actions": [
            {
                "name": "transition_node_to_trusted",
                "args": {
                    "node_id": new_node.node_id,
                    "valid_from": str(datetime.utcnow()),
                },
            },
            {
                "name": "remove_node",
                "args": {"node_id": primary.node_id},
            },
        ]
    }
    proposal = network.consortium.get_any_active_member().propose(
        primary, proposal_body
    )
    network.consortium.vote_using_majority(
        primary,
        proposal,
        {"ballot": "export function vote (proposal, proposer_id) { return true }"},
        timeout=10,
    )
    network.wait_for_new_primary(primary)
    new_node.wait_for_node_to_join(timeout=10)
    primary.stop()
    network.nodes.remove(primary)
    wait_for_reconfiguration_to_complete(network)
    return network


@reqs.description("Test retired nodes have emitted at most one signature")
def test_retiring_nodes_emit_at_most_one_signature(network, args):
    primary, _ = network.find_primary()
    # Force ledger flush of all transactions so far
    network.get_latest_ledger_public_state()
    ledger = ccf.ledger.Ledger(primary.remote.ledger_paths())
    retiring_nodes = set()
    retired_nodes = set()
    for chunk in ledger:
        for tr in chunk:
            tables = tr.get_public_domain().get_tables()
            if ccf.ledger.NODES_TABLE_NAME in tables:
                nodes = tables[ccf.ledger.NODES_TABLE_NAME]
                for nid, info_ in nodes.items():
                    if info_ is None:
                        # Node was removed
                        continue
                    info = json.loads(info_)
                    if info["status"] == "Retired":
                        retiring_nodes.add(nid)
            if ccf.ledger.SIGNATURE_TX_TABLE_NAME in tables:
                sigs = tables[ccf.ledger.SIGNATURE_TX_TABLE_NAME]
                assert len(sigs) == 1, sigs.keys()
                (sig_,) = sigs.values()
                sig = json.loads(sig_)
                assert (
                    sig["node"] not in retired_nodes
                ), f"Unexpected signature from {sig['node']}"
                retired_nodes |= retiring_nodes
                retiring_nodes = set()
    assert not retiring_nodes, (retiring_nodes, retired_nodes)
    LOG.info("{} nodes retired throughout test", len(retired_nodes))
    wait_for_reconfiguration_to_complete(network)
    return network


@reqs.description("Adding a learner without snapshot")
def test_learner_catches_up(network, args):
    primary, _ = network.find_primary()
    num_nodes_before = 0
    with primary.client() as c:
        s = c.get("/node/consensus")
        rj = s.body.json()
        # At this point, there should be exactly one configuration
        assert len(rj["details"]["configs"]) == 1
        c0 = rj["details"]["configs"][0]["nodes"]
        num_nodes_before = len(c0)

    new_node = network.create_node("local://localhost")
    network.join_node(new_node, args.package, args)
    network.trust_node(new_node, args)

    with new_node.client() as c:
        s = c.get("/node/network/nodes/self")
        rj = s.body.json()
        assert rj["status"] == "Learner" or rj["status"] == "Trusted"

    network.wait_for_node_in_store(
        primary,
        new_node.node_id,
        node_status=(ccf.ledger.NodeStatus.TRUSTED),
        timeout=3,
    )

    with primary.client() as c:
        s = c.get("/node/consensus")
        rj = s.body.json()
        assert len(rj["details"]["learners"]) == 0
        # At this point, there should be exactly one configuration, which includes the new node.
        assert len(rj["details"]["configs"]) == 1
        c0 = rj["details"]["configs"][0]["nodes"]
        assert len(c0) == num_nodes_before + 1
        assert new_node.node_id in c0
    return network


@reqs.description("Test node certificates validity period")
def test_node_certificates_validity_period(network, args):
    for node in network.get_joined_nodes():
        node.verify_certificate_validity_period()
    return network


@reqs.description("Add a new node without a snapshot but with the historical ledger")
def test_add_node_with_read_only_ledger(network, args):
    network.txs.issue(network, number_txs=10)
    network.txs.issue(network, number_txs=2, repeat=True)
    new_node = network.create_node("local://localhost")
    network.join_node(
        new_node, args.package, args, from_snapshot=False, copy_ledger=True
    )
    network.trust_node(new_node, args)
    return network


@reqs.description("Test reconfiguration type in service config")
def test_service_config_endpoint(network, args):
    for n in network.get_joined_nodes():
        with n.client() as c:
            r = c.get("/node/service/configuration")
            rj = r.body.json()
            assert args.reconfiguration_type == rj["reconfiguration_type"]


@reqs.description("Confirm ledger contains expected entries")
def test_ledger_invariants(network, args):
    # Force ledger flush of all transactions so far
    network.get_latest_ledger_public_state()
    for node in network.nodes:
        LOG.info(f"Examining ledger on node {node.local_node_id}")
        ledger_directories = node.remote.ledger_paths()
        ledger = ccf.ledger.Ledger(ledger_directories)
        check_signatures(ledger)
    return network
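

# Main test sequence: starts a service with the logging app and runs the
# reconfiguration tests above against it in order, finishing with the
# stale-snapshot join checks.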
def run_all(args):
    txs = app.LoggingTxs("user0")
    with infra.network.network(
        args.nodes,
        args.binary_dir,
        args.debug_nodes,
        args.perf_nodes,
        pdb=args.pdb,
        txs=txs,
    ) as network:
        network.start_and_open(args)

        test_version(network, args)
        test_issue_fake_join(network, args)
        test_add_as_many_pending_nodes(network, args)
        test_add_node_invalid_service_cert(network, args)
        test_add_node(network, args, from_snapshot=False)
        test_add_node_with_read_only_ledger(network, args)
        test_join_straddling_primary_replacement(network, args)
        test_node_replacement(network, args)
        test_add_node_from_backup(network, args)
        test_add_node_endorsements_endpoints(network, args)
        test_add_node_on_other_curve(network, args)
        test_retire_backup(network, args)
        test_add_node(network, args)
        test_retire_primary(network, args)
        test_add_node_from_snapshot(network, args)
        test_add_node_from_snapshot(network, args, from_backup=True)
        test_add_node_from_snapshot(network, args, copy_ledger=False)
        test_node_filter(network, args)
        test_retiring_nodes_emit_at_most_one_signature(network, args)
        if args.reconfiguration_type == "TwoTransaction":
            test_learner_catches_up(network, args)
        test_service_config_endpoint(network, args)
        test_node_certificates_validity_period(network, args)
        test_add_node_invalid_validity_period(network, args)
        test_ledger_invariants(network, args)

    run_join_old_snapshot(args)
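

# Starts a fresh single-node service, saves an early committed snapshot, then checks
# that nodes attempting to join from that stale snapshot (or from no snapshot at all,
# once newer snapshots exist) are rejected with StartupSeqnoIsOld.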
def run_join_old_snapshot(args):
    txs = app.LoggingTxs("user0")
    nodes = ["local://localhost"]
    with tempfile.TemporaryDirectory() as tmp_dir:
        with infra.network.network(
            nodes,
            args.binary_dir,
            args.debug_nodes,
            args.perf_nodes,
            pdb=args.pdb,
            txs=txs,
        ) as network:
            network.start_and_open(args)
            primary, _ = network.find_primary()

            # First, retrieve and save one committed snapshot
            txs.issue(network, number_txs=args.snapshot_tx_interval)
            old_committed_snapshots = network.get_committed_snapshots(primary)
            copy(
                os.path.join(
                    old_committed_snapshots, os.listdir(old_committed_snapshots)[0]
                ),
                tmp_dir,
            )

            # Then generate another newer snapshot, and add two more nodes from it
            txs.issue(network, number_txs=args.snapshot_tx_interval)
            for _ in range(0, 2):
                new_node = network.create_node("local://localhost")
                network.join_node(
                    new_node,
                    args.package,
                    args,
                    from_snapshot=True,
                )
                network.trust_node(new_node, args)

            # Kill primary and wait for a new one: new primary is
            # guaranteed to have started from the new snapshot
            primary.stop()
            network.wait_for_new_primary(primary)

            # Start new node from the old snapshot
            try:
                new_node = network.create_node("local://localhost")
                network.join_node(
                    new_node,
                    args.package,
                    args,
                    from_snapshot=True,
                    snapshots_dir=tmp_dir,
                    timeout=3,
                )
            except infra.network.StartupSeqnoIsOld:
                LOG.info(
                    f"Node {new_node.local_node_id} started from old snapshot could not join the service, as expected"
                )
            else:
                raise RuntimeError(
                    f"Node {new_node.local_node_id} started from old snapshot unexpectedly joined the service"
                )

            # Start new node from no snapshot
            try:
                new_node = network.create_node("local://localhost")
                network.join_node(
                    new_node,
                    args.package,
                    args,
                    from_snapshot=False,
                    timeout=3,
                )
            except infra.network.StartupSeqnoIsOld:
                LOG.info(
                    f"Node {new_node.local_node_id} started without snapshot could not join the service, as expected"
                )
            else:
                raise RuntimeError(
                    f"Node {new_node.local_node_id} started without snapshot unexpectedly joined the service"
                )
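

# Returns the current contents of the public nodes table
# (public:ccf.gov.nodes.info) from the latest public ledger state, keyed by node id.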
def get_current_nodes_table(network):
    tables, _ = network.get_latest_ledger_public_state()
    tn = "public:ccf.gov.nodes.info"
    r = {}
    for nid, info in tables[tn].items():
        r[nid.decode()] = json.loads(info)
    return r
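

# Walks the ledger and records the seqnos at which the given node was marked
# Pending, Learner and Trusted, asserting that the two-transaction reconfiguration
# moved it through those states in that order.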
def check_2tx_ledger(ledger_paths, learner_id):
    pending_at = 0
    learner_at = 0
    trusted_at = 0
    ledger = ccf.ledger.Ledger(ledger_paths, committed_only=False)
    for chunk in ledger:
        for tr in chunk:
            tables = tr.get_public_domain().get_tables()
            if ccf.ledger.NODES_TABLE_NAME in tables:
                nodes = tables[ccf.ledger.NODES_TABLE_NAME]
                for nid, info_ in nodes.items():
                    info = json.loads(info_)
                    if nid.decode() == learner_id and "status" in info:
                        seq_no = tr.get_public_domain().get_seqno()
                        if info["status"] == "Pending":
                            pending_at = seq_no
                        elif info["status"] == "Learner":
                            learner_at = seq_no
                        elif info["status"] == "Trusted":
                            trusted_at = seq_no
    assert pending_at < learner_at < trusted_at


@reqs.description("Migrate from 1tx to 2tx reconfiguration scheme")
def test_migration_2tx_reconfiguration(
    network, args, initial_is_1tx=True, valid_from=None, **kwargs
):
    primary, _ = network.find_primary()

    # Check that the service config agrees that this is a 1tx network
    with primary.client() as c:
        s = c.get("/node/service/configuration").body.json()
        if initial_is_1tx:
            assert s["reconfiguration_type"] == "OneTransaction"

    network.consortium.submit_2tx_migration_proposal(primary)
    network.wait_for_all_nodes_to_commit(primary)

    # Check that the service config has been updated
    with primary.client() as c:
        rj = c.get("/node/service/configuration").body.json()
        assert rj["reconfiguration_type"] == "TwoTransaction"

    # Check that all nodes have updated their consensus parameters
    for node in network.nodes:
        with node.client() as c:
            rj = c.get("/node/consensus").body.json()
            assert "reconfiguration_type" in rj["details"]
            assert rj["details"]["reconfiguration_type"] == "TwoTransaction"
            assert len(rj["details"]["learners"]) == 0

    new_node = network.create_node("local://localhost", **kwargs)
    network.join_node(new_node, args.package, args)
    network.trust_node(new_node, args, valid_from=valid_from)

    # Check that the new node has the right consensus parameter
    with new_node.client() as c:
        rj = c.get("/node/consensus").body.json()
        assert "reconfiguration_type" in rj["details"]
        assert "learners" in rj["details"]
        assert rj["details"]["reconfiguration_type"] == "TwoTransaction"
        assert len(rj["details"]["learners"]) == 0
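

# Runs the 1tx-to-2tx migration test on its own network (only when the suite was
# started with the one-transaction scheme), then checks in the primary's ledger that
# the node added after migration went through the two-transaction states.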
def run_migration_tests(args):
    if args.reconfiguration_type != "OneTransaction":
        return

    with infra.network.network(
        args.nodes,
        args.binary_dir,
        args.debug_nodes,
        args.perf_nodes,
        pdb=args.pdb,
    ) as network:
        network.start_and_open(args)
        test_migration_2tx_reconfiguration(network, args)
        primary, _ = network.find_primary()
        new_node = network.nodes[-1]
        ledger_paths = primary.remote.ledger_paths()
        learner_id = new_node.node_id
        check_2tx_ledger(ledger_paths, learner_id)


if __name__ == "__main__":

    def add(parser):
        parser.add_argument(
            "--include-2tx-reconfig",
            help="Include tests for the 2-transaction reconfiguration scheme",
            default=False,
            action="store_true",
        )

    cr = ConcurrentRunner(add)

    cr.add(
        "1tx_reconfig",
        run_all,
        package="samples/apps/logging/liblogging",
        nodes=infra.e2e_args.min_nodes(cr.args, f=1),
        reconfiguration_type="OneTransaction",
    )

    if cr.args.include_2tx_reconfig:
        cr.add(
            "2tx_reconfig",
            run_all,
            package="samples/apps/logging/liblogging",
            nodes=infra.e2e_args.min_nodes(cr.args, f=1),
            reconfiguration_type="TwoTransaction",
        )

    cr.add(
        "migration",
        run_migration_tests,
        package="samples/apps/logging/liblogging",
        nodes=infra.e2e_args.min_nodes(cr.args, f=1),
        reconfiguration_type="OneTransaction",
    )

    cr.run()