This commit is contained in:
Julien Maffre 2020-12-18 09:37:39 +00:00 коммит произвёл GitHub
Родитель 7d11a18993
Коммит 016d5441b5
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
14 изменённых файлов: 117 добавлений и 146 удалений

Просмотреть файл

@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
## Unreleased
### Changed
- Snapshots are generated by default on the current primary node, every `10,000` committed transactions (#2029).
## [0.16.1]
### Added

Просмотреть файл

@ -515,22 +515,6 @@ if(BUILD_TESTS)
4000
)
add_e2e_test(
NAME recovery_snapshot_test
PYTHON_SCRIPT ${CMAKE_SOURCE_DIR}/tests/recovery.py
CONSENSUS cft
ADDITIONAL_ARGS
--recovery
2
# Shorten Raft election timeout to speed up test when it kills a node on
# purpose to check that a recovery network is robust to a view change.
--raft-election-timeout
4000
--snapshot-tx-interval
5
--use-snapshot
)
add_e2e_test(
NAME rekey_test
PYTHON_SCRIPT ${CMAKE_SOURCE_DIR}/tests/rekey.py
@ -559,7 +543,7 @@ if(BUILD_TESTS)
LABEL suite
ADDITIONAL_ARGS
--test-duration
150
200
--enforce-reqs
--test-suite
reconfiguration
@ -567,23 +551,6 @@ if(BUILD_TESTS)
4000
)
add_e2e_test(
NAME snapshots_test_suite
PYTHON_SCRIPT ${CMAKE_SOURCE_DIR}/tests/e2e_suite.py
CONSENSUS cft
LABEL suite
ADDITIONAL_ARGS
--test-duration
150
--enforce-reqs
--test-suite
snapshots
--raft-election-timeout
4000
--snapshot-tx-interval
5
)
add_e2e_test(
NAME full_test_suite
PYTHON_SCRIPT ${CMAKE_SOURCE_DIR}/tests/e2e_suite.py
@ -680,13 +647,6 @@ if(BUILD_TESTS)
ADDITIONAL_ARGS --raft-election-timeout 4000
)
add_e2e_test(
NAME reconfiguration_snapshot_test
PYTHON_SCRIPT ${CMAKE_SOURCE_DIR}/tests/reconfiguration.py
CONSENSUS cft
ADDITIONAL_ARGS --snapshot-tx-interval 10 --raft-election-timeout 4000
)
add_e2e_test(
NAME code_update_test
PYTHON_SCRIPT ${CMAKE_SOURCE_DIR}/tests/code_update.py
@ -744,11 +704,13 @@ if(BUILD_TESTS)
)
if(NOT SAN)
# Writing new ledger files and generating new snapshots use more file
# descriptors, so disable those for this test
add_e2e_test(
NAME connections_cft
PYTHON_SCRIPT ${CMAKE_SOURCE_DIR}/tests/connections.py
CONSENSUS cft
ADDITIONAL_ARGS --ledger-chunk-bytes 100Mib
ADDITIONAL_ARGS --ledger-chunk-bytes 100Mib --snapshot-tx-interval 10000
)
endif()
@ -842,7 +804,7 @@ if(BUILD_TESTS)
--max-writes-ahead
1000
--repetitions
1000
10000
--msg-ser-fmt
msgpack
)
@ -881,7 +843,7 @@ if(BUILD_TESTS)
--max-writes-ahead
1000
--repetitions
800
1000
--msg-ser-fmt
text
)

Просмотреть файл

@ -546,7 +546,8 @@ function(add_perf_test)
${PYTHON} ${PARSED_ARGS_PYTHON_SCRIPT} -b . -c ${PARSED_ARGS_CLIENT_BIN}
${CCF_NETWORK_TEST_ARGS} --consensus ${PARSED_ARGS_CONSENSUS} -g
${PARSED_ARGS_GOV_SCRIPT} --write-tx-times ${VERIFICATION_ARG} --label
${LABEL_ARG} ${PARSED_ARGS_ADDITIONAL_ARGS} ${NODES}
${LABEL_ARG} --snapshot-tx-interval 10000 ${PARSED_ARGS_ADDITIONAL_ARGS}
${NODES}
)
# Make python test client framework importable

Просмотреть файл

@ -50,9 +50,7 @@ To avoid this, it is possible for a new node to be added (or a service to be rec
Snapshot Generation
~~~~~~~~~~~~~~~~~~~
Snapshots are generated at regular intervals by the current primary node and stored under the directory specified via the ``--snapshot-dir`` CLI option (defaults to ``snapshots/``). The transaction interval at which snapshots are generated is specified via the ``--snapshot-tx-interval`` CLI option (defaults to no snapshot).
.. TODO: Change defaults once https://github.com/microsoft/CCF/issues/1956 is complete
Snapshots are generated at regular intervals by the current primary node and stored under the directory specified via the ``--snapshot-dir`` CLI option (defaults to ``snapshots/``). The transaction interval at which snapshots are generated is specified via the ``--snapshot-tx-interval`` CLI option (defaults to a new snapshot generated every ``10,000`` committed transactions).
.. note:: Because the generation of a snapshot requires a new ledger chunk to be created (see :ref:`operations/ledger_snapshot:File Layout`), all nodes in the network must be started with the same ``--snapshot-tx-interval`` value.

Просмотреть файл

@ -1115,6 +1115,7 @@ namespace aft
{
LOG_FAIL_FMT("Follower failed to apply log entry: {}", i);
state->last_idx--;
ledger->truncate(state->last_idx);
send_append_entries_response(
r.from_node, AppendEntriesResponseType::FAIL);
break;
@ -2025,8 +2026,9 @@ namespace aft
LOG_DEBUG_FMT("Compacting...");
snapshotter->commit(idx);
if (replica_state == Leader)
if (replica_state == Leader && consensus_type == ConsensusType::CFT)
{
// Snapshots are not yet supported with BFT
snapshotter->snapshot(idx);
}
store->compact(idx);

Просмотреть файл

@ -160,13 +160,12 @@ int main(int argc, char** argv)
->capture_default_str()
->transform(CLI::AsSizeValue(true)); // 1000 is kb
size_t snapshot_tx_interval = std::numeric_limits<std::size_t>::max();
size_t snapshot_tx_interval = 10'000;
app
.add_option(
"--snapshot-tx-interval",
snapshot_tx_interval,
"Number of transactions between snapshots (experimental). "
"Defaults to no snapshot.")
"Number of transactions between snapshots")
->capture_default_str();
logger::Level host_log_level{logger::Level::INFO};

Просмотреть файл

@ -169,7 +169,8 @@ namespace asynchost
get_snapshot_idx_from_file_name(file_name) == snapshot_idx)
{
LOG_INFO_FMT(
"Committing snapshot file \"{}\" with evidence proof committed at "
"Committing snapshot file \"{}\" with evidence proof committed "
"at "
"{}",
file_name,
evidence_commit_idx);

Просмотреть файл

@ -53,16 +53,24 @@ def test_verify_quotes(network, args):
@reqs.description("Node with bad code fails to join")
def test_add_node_with_bad_code(network, args):
if args.enclave_type == "virtual":
LOG.warning("Skipping test_add_node_with_bad_code with virtual enclave")
return network
replacement_package = (
"liblogging" if args.package == "libjs_generic" else "libjs_generic"
)
new_code_id = get_code_id(
args.oe_binary,
infra.path.build_lib_path(args.replacement_package, args.enclave_type),
infra.path.build_lib_path(replacement_package, args.enclave_type),
)
LOG.info(f"Adding a node with unsupported code id {new_code_id}")
code_not_found_exception = None
try:
network.create_and_add_pending_node(
args.replacement_package, "local://localhost", args, timeout=3
replacement_package, "local://localhost", args, timeout=3
)
except infra.network.CodeIdNotFound as err:
code_not_found_exception = err
@ -76,11 +84,15 @@ def test_add_node_with_bad_code(network, args):
@reqs.description("Update all nodes code")
def test_update_all_nodes(network, args):
replacement_package = (
"liblogging" if args.package == "libjs_generic" else "libjs_generic"
)
primary, _ = network.find_nodes()
first_code_id, new_code_id = [
get_code_id(args.oe_binary, infra.path.build_lib_path(pkg, args.enclave_type))
for pkg in [args.package, args.replacement_package]
for pkg in [args.package, replacement_package]
]
LOG.info("Add new code id")
@ -115,7 +127,7 @@ def test_update_all_nodes(network, args):
LOG.info("Start fresh nodes running new code")
for _ in range(0, len(network.nodes)):
new_node = network.create_and_trust_node(
args.replacement_package, "local://localhost", args
replacement_package, "local://localhost", args
)
assert new_node
@ -156,6 +168,5 @@ if __name__ == "__main__":
sys.exit()
args.package = "liblogging"
args.replacement_package = "libjs_generic"
args.nodes = infra.e2e_args.min_nodes(args, f=1)
run(args)

Просмотреть файл

@ -246,7 +246,8 @@ def cli_args(add=lambda x: None, parser=None, accept_unknown=False):
parser.add_argument(
"--snapshot-tx-interval",
help="Number of transactions between two snapshots",
default=None,
type=int,
default=10,
)
parser.add_argument(
"--jwt-key-refresh-interval-s",

Просмотреть файл

@ -179,7 +179,7 @@ class Network:
ledger_dir=None,
copy_ledger_read_only=False,
read_only_ledger_dir=None,
from_snapshot=False,
from_snapshot=True,
snapshot_dir=None,
):
forwarded_args = {
@ -198,13 +198,11 @@ class Network:
# specified
if from_snapshot and snapshot_dir is None:
snapshot_dir = self.get_committed_snapshots(target_node)
assert os.listdir(
snapshot_dir
), f"There are no snapshots to resume from in directory {snapshot_dir}"
committed_ledger_dir = None
current_ledger_dir = None
if snapshot_dir is not None:
if from_snapshot:
if os.listdir(snapshot_dir):
LOG.info(f"Joining from snapshot directory: {snapshot_dir}")
# Only when joining from snapshot, retrieve ledger dirs from target node
# if the ledger directories are not specified. When joining without snapshot,
@ -215,6 +213,14 @@ class Network:
current_ledger_dir, committed_ledger_dir = target_node.get_ledger(
include_read_only_dirs=True
)
else:
LOG.warning(
f"Attempting to join from snapshot but {snapshot_dir} is empty: defaulting to complete replay of transaction history"
)
else:
LOG.info(
"Joining without snapshot: complete transaction history will be replayed"
)
node.join(
lib_name=lib_name,
@ -293,12 +299,14 @@ class Network:
)
self._adjust_local_node_ids(node)
else:
# When a new service is started, initial nodes join without a snapshot
self._add_node(
node,
args.package,
args,
recovery=recovery,
ledger_dir=ledger_dir,
from_snapshot=snapshot_dir is not None,
read_only_ledger_dir=read_only_ledger_dir,
snapshot_dir=snapshot_dir,
)
@ -494,9 +502,8 @@ class Network:
host,
args,
target_node=None,
from_snapshot=False,
copy_ledger_read_only=False,
timeout=JOIN_TIMEOUT,
**kwargs,
):
"""
Create a new node and add it to the network. Note that the new node
@ -509,8 +516,7 @@ class Network:
lib_name,
args,
target_node,
from_snapshot=from_snapshot,
copy_ledger_read_only=copy_ledger_read_only,
**kwargs,
)
primary, _ = self.find_primary()
try:
@ -547,8 +553,7 @@ class Network:
host,
args,
target_node=None,
from_snapshot=False,
copy_ledger_read_only=False,
**kwargs,
):
"""
Create a new node, add it to the network and let members vote to trust
@ -559,8 +564,7 @@ class Network:
host,
args,
target_node,
from_snapshot,
copy_ledger_read_only,
**kwargs,
)
primary, _ = self.find_primary()
@ -665,7 +669,7 @@ class Network:
assert "Primary unknown" in res.body.text(), res
except CCFConnectionException:
LOG.warning(
f"Could not successful connect to node {node.node_id}. Retrying..."
f"Could not successfully connect to node {node.node_id}. Retrying..."
)
if primary_id is not None:
break

Просмотреть файл

@ -43,9 +43,14 @@ def check_can_progress(node, timeout=3):
assert False, f"Stuck at {r}"
@reqs.description("Adding a valid node from primary")
@reqs.description("Adding a valid node without snapshot")
def test_add_node(network, args):
new_node = network.create_and_trust_node(args.package, "local://localhost", args)
new_node = network.create_and_trust_node(
args.package,
"local://localhost",
args,
from_snapshot=False,
)
with new_node.client() as c:
s = c.get("/node/state")
assert s.body.json()["id"] == new_node.node_id
@ -56,14 +61,25 @@ def test_add_node(network, args):
@reqs.description("Adding a valid node from a backup")
@reqs.at_least_n_nodes(2)
def test_add_node_from_backup(network, args):
backup = network.find_any_backup()
primary, backup = network.find_primary_and_any_backup()
# Retrieve snapshot from primary as only primary node
# generates snapshots
snapshot_dir = network.get_committed_snapshots(primary)
new_node = network.create_and_trust_node(
args.package, "local://localhost", args, target_node=backup
args.package,
"local://localhost",
args,
target_node=backup,
snapshot_dir=snapshot_dir,
)
assert new_node
return network
# Note: this test cannot be included in the full test suite yet as
# add_from_snapshot() decorator makes use of historical queries (#1648)
@reqs.description("Adding a valid node from snapshot")
@reqs.at_least_n_nodes(2)
@reqs.add_from_snapshot()
@ -72,7 +88,6 @@ def test_add_node_from_snapshot(network, args, copy_ledger_read_only=True):
args.package,
"local://localhost",
args,
from_snapshot=True,
copy_ledger_read_only=copy_ledger_read_only,
)
assert new_node
@ -89,32 +104,12 @@ def test_add_as_many_pending_nodes(network, args):
)
for _ in range(number_new_nodes):
network.create_and_add_pending_node(args.package, "local://localhost", args)
check_can_progress(network.find_primary()[0])
return network
@reqs.description("Add node with untrusted code version")
def test_add_node_untrusted_code(network, args):
if args.enclave_type != "virtual":
LOG.info("Adding an invalid node (unknown code id)")
code_not_found_exception = None
try:
lib_name = (
"liblogging" if args.package == "libjs_generic" else "libjs_generic"
)
network.create_and_add_pending_node(
lib_name, "local://localhost", args, timeout=3
args.package,
"local://localhost",
args,
)
except infra.network.CodeIdNotFound as err:
code_not_found_exception = err
assert (
code_not_found_exception is not None
), "Adding node with unknown code id should fail"
else:
LOG.warning("Skipping unknown code id test with virtual enclave")
check_can_progress(network.find_primary()[0])
return network
@ -161,14 +156,12 @@ def run(args):
test_add_node_from_backup(network, args)
test_add_node(network, args)
test_add_node_untrusted_code(network, args)
test_retire_backup(network, args)
test_add_as_many_pending_nodes(network, args)
test_add_node(network, args)
test_retire_primary(network, args)
if args.snapshot_tx_interval is not None:
test_add_node_from_snapshot(network, args, copy_ledger_read_only=True)
test_add_node_from_snapshot(network, args)
test_add_node_from_snapshot(network, args, copy_ledger_read_only=False)
errors, _ = network.get_joined_nodes()[-1].stop()
if not any(

Просмотреть файл

@ -111,13 +111,14 @@ def run(args):
network.start_and_join(args)
for i in range(args.recovery):
# Alternate between recovery with primary change and stable primary-ship
# Alternate between recovery with primary change and stable primary-ship,
# with and without snapshots
if i % 2 == 0:
recovered_network = test_share_resilience(
network, args, args.use_snapshot
network, args, from_snapshot=True
)
else:
recovered_network = test(network, args, args.use_snapshot)
recovered_network = test(network, args, from_snapshot=False)
network.stop_all_nodes()
network = recovered_network
LOG.success("Recovery complete on all nodes")
@ -142,12 +143,6 @@ checked. Note that the key for each logging message is unique (per table).
type=int,
default=5,
)
parser.add_argument(
"--use-snapshot",
help="Use latest snapshot for faster recovery procedure",
action="store_true",
default=False,
)
args = infra.e2e_args.cli_args(add)
args.package = "liblogging"

Просмотреть файл

@ -82,5 +82,6 @@ exec python "${START_NETWORK_SCRIPT}" \
--initial-user-count 1 \
--gov-script "${GOV_SCRIPT}" \
--ledger-chunk-bytes 5MB \
--snapshot-tx-interval 10000 \
--label sandbox \
"${extra_args[@]}"

Просмотреть файл

@ -43,7 +43,16 @@ suites["membership_recovery"] = suite_membership_recovery
# This suite tests that nodes addition, deletion and primary changes
# can be interleaved
# Note: snapshot tests are not yet integrated in the main test suite
# as they test historical queries which do not yet work across rekey/recovery
# https://github.com/microsoft/CCF/issues/1648
suite_reconfiguration = [
reconfiguration.test_add_node_from_snapshot,
reconfiguration.test_add_node_from_snapshot,
election.test_kill_primary,
reconfiguration.test_add_node_from_snapshot,
reconfiguration.test_retire_primary,
e2e_logging.test_view_history,
reconfiguration.test_add_node,
reconfiguration.test_retire_primary,
reconfiguration.test_add_node,
@ -51,21 +60,9 @@ suite_reconfiguration = [
reconfiguration.test_add_node,
reconfiguration.test_add_node,
reconfiguration.test_retire_backup,
reconfiguration.test_add_node,
election.test_kill_primary,
]
suites["reconfiguration"] = suite_reconfiguration
# Temporary suite while snapshotting feature is being implemented
# https://github.com/microsoft/CCF/milestone/12
suite_snapshots = [
reconfiguration.test_add_node_from_snapshot,
election.test_kill_primary,
reconfiguration.test_add_node_from_snapshot,
e2e_logging.test_view_history,
]
suites["snapshots"] = suite_snapshots
all_tests_suite = [
# e2e_logging:
e2e_logging.test,
@ -93,7 +90,6 @@ all_tests_suite = [
reconfiguration.test_add_node,
reconfiguration.test_add_node_from_backup,
reconfiguration.test_add_as_many_pending_nodes,
reconfiguration.test_add_node_untrusted_code,
reconfiguration.test_retire_backup,
# recovery:
recovery.test,
@ -104,6 +100,7 @@ all_tests_suite = [
election.test_kill_primary,
# code update:
code_update.test_verify_quotes,
code_update.test_add_node_with_bad_code,
]
suites["all"] = all_tests_suite