# CCF/tests/lts_compatibility.py
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the Apache 2.0 License.
import infra.network
import infra.e2e_args
import infra.proc
import infra.logging_app as app
import infra.utils
import infra.github
import infra.jwt_issuer
import infra.crypto
import infra.node
import suite.test_requirements as reqs
import ccf.ledger
import os
import json
import time
import datetime
from e2e_logging import test_random_receipts
from governance import test_all_nodes_cert_renewal, test_service_cert_renewal
from reconfiguration import test_migration_2tx_reconfiguration
from loguru import logger as LOG
# Assumption:
# By default, this test assumes that the local checkout is not a non-release
# branch (e.g. main) that is older than the latest release branch. This
# simplifies the test by assuming that the latest release to test
# compatibility against is the latest available one.
# Set the CCF_LATEST_RELEASE_BRANCH_SUFFIX envvar below if this isn't the case.
ENV_VAR_LATEST_LTS_BRANCH_NAME = (
"CCF_LATEST_RELEASE_BRANCH_SUFFIX" # e.g. "release/1.x"
)
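# Example (hypothetical invocation): pin the branch to compare against, e.g.
#   CCF_LATEST_RELEASE_BRANCH_SUFFIX="release/1.x" python tests/lts_compatibility.py ...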
LOCAL_CHECKOUT_DIRECTORY = "."
# When a 2.x node joins a 1.x service, the node has to self-endorse
# its certificate, using a default value for the validity period
# hardcoded in CCF.
DEFAULT_NODE_CERTIFICATE_VALIDITY_DAYS = 365
def issue_activity_on_live_service(network, args):
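    """
    Issue a representative workload on a live service: enough transactions to
    span several snapshot intervals, plus at least one request exercising
    historical fetching and one exercising forwarding (see comments below).
    """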
log_capture = []
network.txs.issue(
network, number_txs=args.snapshot_tx_interval * 2, log_capture=log_capture
)
# At least one transaction that will require historical fetching
network.txs.issue(network, number_txs=1, repeat=True)
# At least one transaction that will require forwarding
network.txs.issue(network, number_txs=1, on_backup=True)
def get_new_constitution_for_install(args, install_path):
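    """
    Rewrite args.constitution to use the constitution fragments shipped with
    the given install path (keeping the local resolve.js, see note below).
    """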
constitution_directory = os.path.join(
install_path,
"../samples/constitutions/default"
if install_path == LOCAL_CHECKOUT_DIRECTORY
else "bin",
)
def replace_constitution_fragment(args, fragment_name):
args.constitution[:] = [
os.path.join(constitution_directory, fragment_name)
if fragment_name in f
else f
for f in args.constitution
]
    # Note: Keep the resolve.js script from the local checkout, as only the
    # trivial sandbox version is included in the installation
replace_constitution_fragment(args, "actions.js")
replace_constitution_fragment(args, "apply.js")
replace_constitution_fragment(args, "validate.js")
return args.constitution
def test_new_service(
network,
args,
install_path,
binary_dir,
library_dir,
version,
cycle_existing_nodes=False,
):
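    """
    Upgrade a service in place: install the new constitution, add nodes running
    the new version (optionally cycling out all existing nodes), renew node and
    service certificates, then verify transaction activity and receipts.
    """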
LOG.info("Update constitution")
primary, _ = network.find_primary()
new_constitution = get_new_constitution_for_install(args, install_path)
network.consortium.set_constitution(primary, new_constitution)
all_nodes = network.get_joined_nodes()
# Note: Changes to constitution between versions should be tested here
LOG.info(f"Add node to new service [cycle nodes: {cycle_existing_nodes}]")
nodes_to_cycle = network.get_joined_nodes() if cycle_existing_nodes else []
nodes_to_add_count = len(nodes_to_cycle) if cycle_existing_nodes else 1
# Pre-2.0 nodes require X509 time format
valid_from = str(infra.crypto.datetime_to_X509time(datetime.datetime.now()))
for _ in range(0, nodes_to_add_count):
new_node = network.create_node(
"local://localhost",
binary_dir=binary_dir,
library_dir=library_dir,
version=version,
)
network.join_node(new_node, args.package, args)
network.trust_node(
new_node,
args,
valid_from=valid_from,
)
new_node.verify_certificate_validity_period(
expected_validity_period_days=DEFAULT_NODE_CERTIFICATE_VALIDITY_DAYS
)
all_nodes.append(new_node)
for node in nodes_to_cycle:
network.retire_node(primary, node)
if primary == node:
primary, _ = network.wait_for_new_primary(primary)
            # Stopping a node immediately after its removal is
            # committed and an election has taken place is not safe:
            # the successor primary may need to re-establish commit on
            # a configuration that includes the retired node.
            # See https://github.com/microsoft/CCF/issues/1713
            # for more detail. Until a dedicated endpoint exposes this
            # safely, we work around it by submitting and waiting for
            # commit on another transaction.
network.txs.issue(network, number_txs=1, repeat=True)
node.stop()
test_all_nodes_cert_renewal(network, args, valid_from=valid_from)
test_service_cert_renewal(network, args, valid_from=valid_from)
if args.check_2tx_reconfig_migration:
test_migration_2tx_reconfiguration(
network,
args,
initial_is_1tx=False, # Reconfiguration type added in 2.x
binary_dir=binary_dir,
library_dir=library_dir,
version=version,
valid_from=valid_from,
)
LOG.info("Apply transactions to new nodes only")
issue_activity_on_live_service(network, args)
test_random_receipts(network, args, lts=True)
# Local build and install bin/ and lib/ directories differ
def get_bin_and_lib_dirs_for_install_path(install_path):
return (
[LOCAL_CHECKOUT_DIRECTORY] * 2
if install_path == LOCAL_CHECKOUT_DIRECTORY
else (os.path.join(install_path, "bin"), os.path.join(install_path, "lib"))
)
def set_js_args(args, from_install_path, to_install_path=None):
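    """
    Point args.js_app_bundle at the JS logging app shipped with
    from_install_path and, if to_install_path is given, record the bundle to
    upgrade to later in args.new_js_app_bundle.
    """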
# Use from_version's app and constitution as new JS features may not be available
# on older versions, but upgrade to the new constitution once the new network is ready
js_app_directory = (
"../samples/apps/logging/js"
if from_install_path == LOCAL_CHECKOUT_DIRECTORY
else "samples/logging/js"
)
args.js_app_bundle = os.path.join(from_install_path, js_app_directory)
if to_install_path:
args.new_js_app_bundle = os.path.join(
to_install_path, "../samples/apps/logging/js"
)
get_new_constitution_for_install(args, from_install_path)
def run_code_upgrade_from(
args,
from_install_path,
to_install_path,
from_version=None,
to_version=None,
from_container_image=None,
):
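    """
    Live code upgrade: start a service from the "from" install, add nodes
    built from the "to" install, retire the old nodes and code version, then
    verify the upgraded service (transactions, receipts, JWT auto-refresh and
    ledger parsing).
    """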
from_binary_dir, from_library_dir = get_bin_and_lib_dirs_for_install_path(
from_install_path
)
to_binary_dir, to_library_dir = get_bin_and_lib_dirs_for_install_path(
to_install_path
)
set_js_args(args, from_install_path, to_install_path)
jwt_issuer = infra.jwt_issuer.JwtIssuer(
"https://localhost", refresh_interval=args.jwt_key_refresh_interval_s
)
with jwt_issuer.start_openid_server():
txs = app.LoggingTxs(jwt_issuer=jwt_issuer)
with infra.network.network(
args.nodes,
binary_directory=from_binary_dir,
library_directory=from_library_dir,
pdb=args.pdb,
txs=txs,
jwt_issuer=jwt_issuer,
version=from_version,
) as network:
network.start_and_open(args, node_container_image=from_container_image)
old_nodes = network.get_joined_nodes()
primary, _ = network.find_primary()
from_major_version = primary.major_version
LOG.info("Apply transactions to old service")
issue_activity_on_live_service(network, args)
new_code_id = infra.utils.get_code_id(
args.enclave_type,
args.oe_binary,
args.package,
library_dir=to_library_dir,
)
network.consortium.add_new_code(primary, new_code_id)
# Note: alternate between joining from snapshot and replaying entire ledger
new_nodes = []
from_snapshot = True
for _ in range(0, len(old_nodes)):
new_node = network.create_node(
"local://localhost",
binary_dir=to_binary_dir,
library_dir=to_library_dir,
version=to_version,
)
network.join_node(
new_node, args.package, args, from_snapshot=from_snapshot
)
network.trust_node(
new_node,
args,
valid_from=str( # Pre-2.0 nodes require X509 time format
infra.crypto.datetime_to_X509time(datetime.datetime.now())
),
)
# For 2.x nodes joining a 1.x service before the constitution is updated,
# the node certificate validity period is set by the joining node itself
# as [node startup time, node startup time + 365 days]
new_node.verify_certificate_validity_period(
expected_validity_period_days=DEFAULT_NODE_CERTIFICATE_VALIDITY_DAYS,
ignore_proposal_valid_from=True,
)
from_snapshot = not from_snapshot
new_nodes.append(new_node)
# Verify that all nodes run the expected CCF version
for node in network.get_joined_nodes():
# Note: /node/version endpoint was added in 2.x
if not node.major_version or node.major_version > 1:
with node.client() as c:
r = c.get("/node/version")
expected_version = node.version or args.ccf_version
version = r.body.json()["ccf_version"]
                        assert (
                            version == expected_version
                        ), f"For node {node.local_node_id}, expected version {expected_version}, got {version}"
LOG.info("Apply transactions to hybrid network, with primary as old node")
issue_activity_on_live_service(network, args)
old_code_id = infra.utils.get_code_id(
args.enclave_type,
args.oe_binary,
args.package,
library_dir=from_library_dir,
)
primary, _ = network.find_primary()
network.consortium.retire_code(primary, old_code_id)
for index, node in enumerate(old_nodes):
network.retire_node(primary, node)
if primary == node:
primary, _ = network.wait_for_new_primary(primary)
# Submit tx and wait for commit after node retirement. See
# https://github.com/microsoft/CCF/issues/1713 for more detail.
network.txs.issue(network, number_txs=1, repeat=True)
# This block is here to test the transition period from a network that
# does not support custom claims to one that does. It can be removed after
# the transition is complete.
#
# The new build, being unreleased, doesn't have a version at all
if not primary.major_version:
LOG.info("Upgrade to new JS app")
# Upgrade to a version of the app containing an endpoint that
# registers custom claims
network.consortium.set_js_app_from_dir(
primary, args.new_js_app_bundle
)
LOG.info("Run transaction with additional claim")
# With wait_for_sync, the client checks that all nodes, including
# the minority of old ones, have acked the transaction
msg_idx = network.txs.idx + 1
txid = network.txs.issue(
network, number_txs=1, record_claim=True, wait_for_sync=True
)
assert len(network.txs.pub[msg_idx]) == 1
claims = network.txs.pub[msg_idx][-1]["msg"]
LOG.info(
"Check receipts are fine, including transaction with claims"
)
test_random_receipts(
network,
args,
lts=True,
additional_seqnos={txid.seqno: claims.encode()},
)
# Also check receipts on an old node
if index + 1 < len(old_nodes):
next_node = old_nodes[index + 1]
test_random_receipts(
network,
args,
lts=True,
additional_seqnos={txid.seqno: None},
node=next_node,
)
node.stop()
LOG.info("Service is now made of new nodes only")
primary, _ = network.find_nodes()
# Rollover JWKS so that new primary must read historical CA bundle table
# and retrieve new keys via auto refresh
if not os.getenv("CONTAINER_NODES"):
jwt_issuer.refresh_keys()
# Note: /gov/jwt_keys/all endpoint was added in 2.x
if not primary.major_version or primary.major_version > 1:
jwt_issuer.wait_for_refresh(network)
else:
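                    # 1.x has no endpoint to poll for refresh completion, so
                    # conservatively wait out the (short) refresh interval instead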
time.sleep(3)
else:
# https://github.com/microsoft/CCF/issues/2608#issuecomment-924785744
LOG.warning("Skipping JWT refresh as running nodes in container")
# Code update from 1.x to 2.x requires cycling the freshly-added 2.x nodes
# once. This is because 2.x nodes will not have an endorsed certificate
# recorded in the store and thus will not be able to have their certificate
# refreshed, etc.
test_new_service(
network,
args,
to_install_path,
to_binary_dir,
to_library_dir,
to_version,
cycle_existing_nodes=True,
)
# Check that the ledger can be parsed
            # Note: When upgrading from 1.x to 2.x, it is possible that ledger
            # chunks are not in sync between nodes, which may cause some chunks
            # to differ when starting from a snapshot.
            # See https://github.com/microsoft/ccf/issues/3613. In that case, we
            # only verify that the ledger can be parsed, even if some chunks are
            # duplicated. This can go once 2.0 is released.
insecure_ledger_verification = (
from_major_version == 1 and primary.version_after("ccf-2.0.0-rc7")
)
network.get_latest_ledger_public_state(
insecure=insecure_ledger_verification
)
@reqs.description("Run live compatibility with latest LTS")
def run_live_compatibility_with_latest(
args,
repo,
local_branch,
this_release_branch_only=False,
lts_install_path=None,
lts_container_image=None,
):
"""
Tests that a service from the latest LTS can be safely upgraded to the version of
the local checkout.
"""
if lts_install_path is None:
lts_version, lts_install_path = repo.install_latest_lts_for_branch(
os.getenv(ENV_VAR_LATEST_LTS_BRANCH_NAME, local_branch),
this_release_branch_only,
)
else:
lts_version = infra.github.get_version_from_install(lts_install_path)
if lts_version is None:
LOG.warning(
f"Latest LTS not found for {local_branch} branch (this_release_branch_only: {this_release_branch_only})"
)
return None
LOG.info(f"From LTS {lts_version} to local {local_branch} branch")
if not args.dry_run:
run_code_upgrade_from(
args,
from_install_path=lts_install_path,
to_install_path=LOCAL_CHECKOUT_DIRECTORY,
from_version=lts_version,
to_version=None,
from_container_image=lts_container_image,
)
return lts_version
@reqs.description("Run ledger compatibility since first LTS")
def run_ledger_compatibility_since_first(args, local_branch, use_snapshot):
"""
Tests that a service from the very first LTS can be recovered
to the next LTS, and so forth, until the version of the local checkout.
    The recovery process uses a snapshot if `use_snapshot` is True. Otherwise,
    the entire historical ledger is used.
"""
LOG.info("Use snapshot: {}", use_snapshot)
repo = infra.github.Repository()
lts_releases = repo.get_lts_releases(local_branch)
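    # Tracks whether any version in the recovery chain produced a pre-2.0.0-rc7
    # ledger, which affects ledger chunking (see note further down)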
has_pre_2_rc7_ledger = False
LOG.info(f"LTS releases: {[r[1] for r in lts_releases.items()]}")
lts_versions = []
    # Add an empty entry to the releases to indicate the local checkout
# Note: dicts are ordered from Python3.7
lts_releases[None] = None
jwt_issuer = infra.jwt_issuer.JwtIssuer(
"https://localhost", refresh_interval=args.jwt_key_refresh_interval_s
)
previous_version = None
with jwt_issuer.start_openid_server():
txs = app.LoggingTxs(jwt_issuer=jwt_issuer)
for idx, (_, lts_release) in enumerate(lts_releases.items()):
if lts_release:
version, install_path = repo.install_release(lts_release)
lts_versions.append(version)
set_js_args(args, install_path)
else:
version = args.ccf_version
install_path = LOCAL_CHECKOUT_DIRECTORY
get_new_constitution_for_install(args, install_path)
binary_dir, library_dir = get_bin_and_lib_dirs_for_install_path(
install_path
)
if not args.dry_run:
network_args = {
"hosts": args.nodes,
"binary_dir": binary_dir,
"library_dir": library_dir,
"txs": txs,
"jwt_issuer": jwt_issuer,
"version": version,
}
if idx == 0:
LOG.info(f"Starting new service (version: {version})")
network = infra.network.Network(**network_args)
network.start_and_open(args)
else:
LOG.info(f"Recovering service (new version: {version})")
network = infra.network.Network(
**network_args, existing_network=network
)
network.start_in_recovery(
args,
ledger_dir,
committed_ledger_dirs,
snapshots_dir=snapshots_dir,
)
# Recovery count is not stored in pre-2.0.3 ledgers
network.recover(
args,
expected_recovery_count=1
if not infra.node.version_after(previous_version, "ccf-2.0.3")
else None,
)
previous_version = version
nodes = network.get_joined_nodes()
primary, _ = network.find_primary()
# Verify that all nodes run the expected CCF version
for node in nodes:
# Note: /node/version endpoint and custom certificate validity
# were added in 2.x
if not node.major_version or node.major_version > 1:
with node.client() as c:
r = c.get("/node/version")
expected_version = node.version or args.ccf_version
version = r.body.json()["ccf_version"]
                            assert (
                                version == expected_version
                            ), f"For node {node.local_node_id}, expected version {expected_version}, got {version}"
node.verify_certificate_validity_period()
# Rollover JWKS so that new primary must read historical CA bundle table
# and retrieve new keys via auto refresh
jwt_issuer.refresh_keys()
# Note: /gov/jwt_keys/all endpoint was added in 2.x
primary, _ = network.find_nodes()
if not primary.major_version or primary.major_version > 1:
jwt_issuer.wait_for_refresh(network)
else:
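                    # 1.x has no endpoint to poll for refresh completion, so
                    # conservatively wait out the (short) refresh interval instead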
time.sleep(3)
if idx > 0:
test_new_service(
network,
args,
install_path,
binary_dir,
library_dir,
version,
)
                # We accept ledger chunk file differences during upgrades from a
                # pre-2.0.0-rc7 (1.x) ledger to a post-rc7 version. This is
                # necessary because the ledger files may not be chunked at the
                # same interval between those versions
                # (see https://github.com/microsoft/ccf/issues/3613; 1.x ledgers
                # do not contain the header flags to synchronize ledger chunks).
                # This can go once 2.0 is released.
current_version_past_2_rc7 = primary.version_after("ccf-2.0.0-rc7")
has_pre_2_rc7_ledger = (
not current_version_past_2_rc7 or has_pre_2_rc7_ledger
)
is_ledger_chunk_breaking = (
has_pre_2_rc7_ledger and current_version_past_2_rc7
)
snapshots_dir = (
network.get_committed_snapshots(primary) if use_snapshot else None
)
network.save_service_identity(args)
network.stop_all_nodes(
skip_verification=True,
accept_ledger_diff=is_ledger_chunk_breaking,
)
ledger_dir, committed_ledger_dirs = primary.get_ledger()
# Check that ledger and snapshots can be parsed
ccf.ledger.Ledger(committed_ledger_dirs).get_latest_public_state()
if snapshots_dir:
for s in os.listdir(snapshots_dir):
with ccf.ledger.Snapshot(
os.path.join(snapshots_dir, s)
) as snapshot:
snapshot.get_public_domain()
return lts_versions
if __name__ == "__main__":
def add(parser):
parser.add_argument("--check-ledger-compatibility", action="store_true")
parser.add_argument("--check-2tx-reconfig-migration", action="store_true")
parser.add_argument(
"--compatibility-report-file", type=str, default="compatibility_report.json"
)
# It is only possible to test compatibility with past releases since only the local infra
# is able to spawn old nodes
parser.add_argument(
"--release-install-path",
type=str,
help='Absolute path to existing CCF release, e.g. "/opt/ccf"',
default=None,
)
parser.add_argument(
"--release-install-image",
type=str,
help="If --release-install-path is set, specify a docker image to run release in (only if CONTAINER_NODES envvar is set) ",
default=None,
)
parser.add_argument("--dry-run", action="store_true")
args = infra.e2e_args.cli_args(add)
# JS generic is the only app included in CCF install
args.package = "libjs_generic"
args.nodes = infra.e2e_args.max_nodes(args, f=0)
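    # Short JWT refresh interval so key auto-refresh can be observed quickly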
args.jwt_key_refresh_interval_s = 3
args.sig_ms_interval = 1000 # Set to cchost default value
# Hardcoded because host only accepts info log on release builds
args.host_log_level = "info"
repo = infra.github.Repository()
local_branch = infra.github.GitEnv.local_branch()
if args.dry_run:
LOG.warning("Dry run: no compatibility check")
compatibility_report = {}
compatibility_report["version"] = args.ccf_version
compatibility_report["live compatibility"] = {}
if args.release_install_path:
version = run_live_compatibility_with_latest(
args,
repo,
local_branch,
lts_install_path=args.release_install_path,
lts_container_image=args.release_install_image,
)
compatibility_report["live compatibility"].update(
{f"with release ({args.release_install_path})": version}
)
else:
# Compatibility with previous LTS
# (e.g. when releasing 2.0.1, check compatibility with existing 1.0.17)
latest_lts_version = run_live_compatibility_with_latest(
args, repo, local_branch, this_release_branch_only=False
)
compatibility_report["live compatibility"].update(
{"with previous LTS": latest_lts_version}
)
# Compatibility with latest LTS on the same release branch
# (e.g. when releasing 2.0.1, check compatibility with existing 2.0.0)
latest_lts_version = run_live_compatibility_with_latest(
args, repo, local_branch, this_release_branch_only=True
)
compatibility_report["live compatibility"].update(
{"with same LTS": latest_lts_version}
)
if args.check_ledger_compatibility:
compatibility_report["data compatibility"] = {}
lts_versions = run_ledger_compatibility_since_first(
args, local_branch, use_snapshot=False
)
compatibility_report["data compatibility"].update(
{"with previous ledger": lts_versions}
)
lts_versions = run_ledger_compatibility_since_first(
args, local_branch, use_snapshot=True
)
compatibility_report["data compatibility"].update(
{"with previous snapshots": lts_versions}
)
if not args.dry_run:
with open(args.compatibility_report_file, "w", encoding="utf-8") as f:
json.dump(compatibility_report, f, indent=2)
LOG.info(
f"Compatibility report written to {args.compatibility_report_file}"
)
LOG.success(f"Compatibility report:\n {json.dumps(compatibility_report, indent=2)}")