зеркало из https://github.com/microsoft/CCF.git
[release/2.x] Cherry pick: Historical queries: Re-request seqnos if host fails to respond (#4976) (#4982)
This commit is contained in:
Родитель
75d81934c5
Коммит
60b3f8eba2
|
@ -5,6 +5,15 @@ All notable changes to this project will be documented in this file.
|
||||||
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
|
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
|
||||||
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
|
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
|
||||||
|
|
||||||
|
## [2.0.15]
|
||||||
|
|
||||||
|
[2.0.15]: https://github.com/microsoft/CCF/releases/tag/ccf-2.0.15
|
||||||
|
|
||||||
|
### Changed
|
||||||
|
|
||||||
|
- Additional logging of historical query flow in `UNSAFE` builds.
|
||||||
|
- Historical query system will re-request entries if the host fails to provide them within a fixed time.
|
||||||
|
|
||||||
## [2.0.14]
|
## [2.0.14]
|
||||||
|
|
||||||
[2.0.14]: https://github.com/microsoft/CCF/releases/tag/ccf-2.0.14
|
[2.0.14]: https://github.com/microsoft/CCF/releases/tag/ccf-2.0.14
|
||||||
|
|
|
@ -10,7 +10,7 @@ option(UNSAFE_VERSION "Produce build with unsafe logging levels" OFF)
|
||||||
set(CCF_PROJECT "ccf")
|
set(CCF_PROJECT "ccf")
|
||||||
if(UNSAFE_VERSION)
|
if(UNSAFE_VERSION)
|
||||||
set(CCF_PROJECT "${CCF_PROJECT}_unsafe")
|
set(CCF_PROJECT "${CCF_PROJECT}_unsafe")
|
||||||
add_compile_definitions(UNSAFE_VERSION)
|
add_compile_definitions(UNSAFE_VERSION ENABLE_HISTORICAL_VERBOSE_LOGGING)
|
||||||
file(WRITE ${CMAKE_BINARY_DIR}/UNSAFE "UNSAFE")
|
file(WRITE ${CMAKE_BINARY_DIR}/UNSAFE "UNSAFE")
|
||||||
install(FILES ${CMAKE_BINARY_DIR}/UNSAFE DESTINATION share)
|
install(FILES ${CMAKE_BINARY_DIR}/UNSAFE DESTINATION share)
|
||||||
endif()
|
endif()
|
||||||
|
|
|
@ -89,7 +89,7 @@ fi
|
||||||
|
|
||||||
source scripts/env/bin/activate
|
source scripts/env/bin/activate
|
||||||
pip install -U pip
|
pip install -U pip
|
||||||
pip install -U wheel black pylint mypy 1>/dev/null
|
pip install -U wheel black==22.12.0 pylint==2.15.10 mypy 1>/dev/null
|
||||||
|
|
||||||
echo "$CHECK_DELIMITER"
|
echo "$CHECK_DELIMITER"
|
||||||
echo "-- Python format"
|
echo "-- Python format"
|
||||||
|
|
|
@ -62,6 +62,8 @@ FMT_END_NAMESPACE
|
||||||
|
|
||||||
namespace ccf::historical
|
namespace ccf::historical
|
||||||
{
|
{
|
||||||
|
static constexpr auto slow_fetch_threshold = std::chrono::milliseconds(1000);
|
||||||
|
|
||||||
static std::optional<ccf::PrimarySignature> get_signature(
|
static std::optional<ccf::PrimarySignature> get_signature(
|
||||||
const kv::StorePtr& sig_store)
|
const kv::StorePtr& sig_store)
|
||||||
{
|
{
|
||||||
|
@ -293,17 +295,6 @@ namespace ccf::historical
|
||||||
return SeqNoCollection(newly_requested.begin(), newly_requested.end());
|
return SeqNoCollection(newly_requested.begin(), newly_requested.end());
|
||||||
}
|
}
|
||||||
|
|
||||||
enum class PopulateReceiptsResult
|
|
||||||
{
|
|
||||||
// Common result. The new seqno may have added receipts for some entries
|
|
||||||
Continue,
|
|
||||||
|
|
||||||
// Occasional result. The new seqno was at the end of the sequence (or
|
|
||||||
// an attempt at retrieving a trailing supporting signature), but we
|
|
||||||
// still have receiptless entries, so attempt to fetch the next
|
|
||||||
FetchNext,
|
|
||||||
};
|
|
||||||
|
|
||||||
std::optional<ccf::SeqNo> populate_receipts(ccf::SeqNo new_seqno)
|
std::optional<ccf::SeqNo> populate_receipts(ccf::SeqNo new_seqno)
|
||||||
{
|
{
|
||||||
HISTORICAL_LOG(
|
HISTORICAL_LOG(
|
||||||
|
@ -508,7 +499,11 @@ namespace ccf::historical
|
||||||
// Track all things currently requested by external callers
|
// Track all things currently requested by external callers
|
||||||
std::map<CompoundHandle, Request> requests;
|
std::map<CompoundHandle, Request> requests;
|
||||||
|
|
||||||
std::set<ccf::SeqNo> pending_fetches;
|
// Store each seqno that is currently being fetched by the host, to avoid
|
||||||
|
// spamming it with duplicate requests, and how long it has been fetched
|
||||||
|
// for. If this gap gets too large, we will log a warning and allow it to be
|
||||||
|
// re-fetched
|
||||||
|
std::unordered_map<ccf::SeqNo, std::chrono::milliseconds> pending_fetches;
|
||||||
|
|
||||||
ExpiryDuration default_expiry_duration = std::chrono::seconds(1800);
|
ExpiryDuration default_expiry_duration = std::chrono::seconds(1800);
|
||||||
|
|
||||||
|
@ -522,22 +517,33 @@ namespace ccf::historical
|
||||||
std::optional<ccf::SeqNo> unfetched_from = std::nullopt;
|
std::optional<ccf::SeqNo> unfetched_from = std::nullopt;
|
||||||
std::optional<ccf::SeqNo> unfetched_to = std::nullopt;
|
std::optional<ccf::SeqNo> unfetched_to = std::nullopt;
|
||||||
|
|
||||||
|
LOG_TRACE_FMT("fetch_entries_range({}, {})", from, to);
|
||||||
|
|
||||||
for (auto seqno = from; seqno <= to; ++seqno)
|
for (auto seqno = from; seqno <= to; ++seqno)
|
||||||
{
|
{
|
||||||
const auto ib = pending_fetches.insert(seqno);
|
const auto ib = pending_fetches.try_emplace(seqno, 0);
|
||||||
if (ib.second)
|
if (
|
||||||
|
// Newly requested fetch
|
||||||
|
ib.second ||
|
||||||
|
// Fetch in-progress, but looks slow enough to retry
|
||||||
|
ib.first->second >= slow_fetch_threshold)
|
||||||
{
|
{
|
||||||
if (!unfetched_from.has_value())
|
if (!unfetched_from.has_value())
|
||||||
{
|
{
|
||||||
unfetched_from = seqno;
|
unfetched_from = seqno;
|
||||||
}
|
}
|
||||||
unfetched_to = seqno;
|
unfetched_to = seqno;
|
||||||
|
ib.first->second = std::chrono::milliseconds(0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (unfetched_from.has_value())
|
if (unfetched_from.has_value())
|
||||||
{
|
{
|
||||||
// Newly requested seqnos
|
// Newly requested seqnos
|
||||||
|
LOG_TRACE_FMT(
|
||||||
|
"Writing to ringbuffer ledger_get_range({}, {})",
|
||||||
|
unfetched_from.value(),
|
||||||
|
unfetched_to.value());
|
||||||
RINGBUFFER_WRITE_MESSAGE(
|
RINGBUFFER_WRITE_MESSAGE(
|
||||||
consensus::ledger_get_range,
|
consensus::ledger_get_range,
|
||||||
to_host,
|
to_host,
|
||||||
|
@ -1101,6 +1107,8 @@ namespace ccf::historical
|
||||||
const uint8_t* data,
|
const uint8_t* data,
|
||||||
size_t size)
|
size_t size)
|
||||||
{
|
{
|
||||||
|
LOG_TRACE_FMT("handle_ledger_entries({}, {})", from_seqno, to_seqno);
|
||||||
|
|
||||||
auto seqno = from_seqno;
|
auto seqno = from_seqno;
|
||||||
bool all_accepted = true;
|
bool all_accepted = true;
|
||||||
while (size > 0)
|
while (size > 0)
|
||||||
|
@ -1129,6 +1137,8 @@ namespace ccf::historical
|
||||||
{
|
{
|
||||||
std::lock_guard<ccf::Mutex> guard(requests_lock);
|
std::lock_guard<ccf::Mutex> guard(requests_lock);
|
||||||
|
|
||||||
|
LOG_TRACE_FMT("handle_no_entry_range({}, {})", from_seqno, to_seqno);
|
||||||
|
|
||||||
for (auto seqno = from_seqno; seqno <= to_seqno; ++seqno)
|
for (auto seqno = from_seqno; seqno <= to_seqno; ++seqno)
|
||||||
{
|
{
|
||||||
// The host failed or refused to give this entry. Currently just
|
// The host failed or refused to give this entry. Currently just
|
||||||
|
@ -1231,6 +1241,20 @@ namespace ccf::historical
|
||||||
++it;
|
++it;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for (auto& [seqno, time_since_sent] : pending_fetches)
|
||||||
|
{
|
||||||
|
const auto time_after = time_since_sent + elapsed_ms;
|
||||||
|
// Log once, when we cross the time threshold
|
||||||
|
if (
|
||||||
|
time_since_sent < slow_fetch_threshold &&
|
||||||
|
time_after >= slow_fetch_threshold)
|
||||||
|
{
|
||||||
|
LOG_FAIL_FMT(
|
||||||
|
"Fetch for seqno {} is taking an unusually long time", seqno);
|
||||||
|
}
|
||||||
|
time_since_sent = time_after;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
|
|
||||||
#define OVERRIDE_MAX_HISTORY_LEN 4
|
#define OVERRIDE_MAX_HISTORY_LEN 4
|
||||||
// Uncomment this to aid debugging
|
// Uncomment this to aid debugging
|
||||||
//#define ENABLE_HISTORICAL_VERBOSE_LOGGING
|
// #define ENABLE_HISTORICAL_VERBOSE_LOGGING
|
||||||
|
|
||||||
#include "node/historical_queries.h"
|
#include "node/historical_queries.h"
|
||||||
|
|
||||||
|
@ -260,6 +260,7 @@ TEST_CASE("StateCache point queries")
|
||||||
static const ccf::historical::RequestHandle default_handle = 0;
|
static const ccf::historical::RequestHandle default_handle = 0;
|
||||||
static const ccf::historical::RequestHandle low_handle = 1;
|
static const ccf::historical::RequestHandle low_handle = 1;
|
||||||
static const ccf::historical::RequestHandle high_handle = 2;
|
static const ccf::historical::RequestHandle high_handle = 2;
|
||||||
|
static const ccf::historical::RequestHandle retry_handle = 3;
|
||||||
|
|
||||||
{
|
{
|
||||||
INFO(
|
INFO(
|
||||||
|
@ -274,6 +275,48 @@ TEST_CASE("StateCache point queries")
|
||||||
REQUIRE(cache.get_state_at(low_handle, low_seqno) == nullptr);
|
REQUIRE(cache.get_state_at(low_handle, low_seqno) == nullptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
INFO(
|
||||||
|
"If too much time passes without providing request seqnos, failures are "
|
||||||
|
"logged");
|
||||||
|
|
||||||
|
size_t fails_count = 0;
|
||||||
|
|
||||||
|
struct FailCountingStub : public logger::AbstractLogger
|
||||||
|
{
|
||||||
|
size_t& fails_count;
|
||||||
|
|
||||||
|
FailCountingStub(size_t& fc) : fails_count(fc) {}
|
||||||
|
|
||||||
|
void write(
|
||||||
|
const logger::LogLine& ll,
|
||||||
|
const std::optional<double>& enclave_offset = std::nullopt) override
|
||||||
|
{
|
||||||
|
if (ll.log_level == logger::FAIL)
|
||||||
|
{
|
||||||
|
++fails_count;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
logger::config::loggers().emplace_back(
|
||||||
|
std::make_unique<FailCountingStub>(fails_count));
|
||||||
|
|
||||||
|
// Less time than threshold has elapsed => No failed logging
|
||||||
|
cache.tick(ccf::historical::slow_fetch_threshold / 2);
|
||||||
|
REQUIRE(fails_count == 0);
|
||||||
|
|
||||||
|
// Threshold elapsed => Failed logging
|
||||||
|
cache.tick(ccf::historical::slow_fetch_threshold / 2);
|
||||||
|
REQUIRE(fails_count == 3);
|
||||||
|
|
||||||
|
// Much more time passes => No more logging
|
||||||
|
cache.tick(ccf::historical::slow_fetch_threshold * 4);
|
||||||
|
REQUIRE(fails_count == 3);
|
||||||
|
|
||||||
|
logger::config::loggers().clear();
|
||||||
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
INFO("The host sees requests for these indices");
|
INFO("The host sees requests for these indices");
|
||||||
REQUIRE(!stub_writer->writes.empty());
|
REQUIRE(!stub_writer->writes.empty());
|
||||||
|
@ -291,11 +334,37 @@ TEST_CASE("StateCache point queries")
|
||||||
auto& to_seqno = to_seqno_;
|
auto& to_seqno = to_seqno_;
|
||||||
REQUIRE(purpose == consensus::LedgerRequestPurpose::HistoricalQuery);
|
REQUIRE(purpose == consensus::LedgerRequestPurpose::HistoricalQuery);
|
||||||
REQUIRE(from_seqno == to_seqno);
|
REQUIRE(from_seqno == to_seqno);
|
||||||
actual.insert(from_seqno);
|
// Despite multiple calls for each seqno, the host only sees a single
|
||||||
|
// request for each
|
||||||
|
REQUIRE(actual.insert(from_seqno).second);
|
||||||
}
|
}
|
||||||
|
stub_writer->writes.clear();
|
||||||
REQUIRE(actual == expected);
|
REQUIRE(actual == expected);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
INFO("Once sufficient time has passed, re-requesting will re-fetch");
|
||||||
|
|
||||||
|
cache.tick(ccf::historical::slow_fetch_threshold);
|
||||||
|
REQUIRE(cache.get_state_at(default_handle, low_seqno) == nullptr);
|
||||||
|
REQUIRE(cache.get_state_at(default_handle, unsigned_seqno) == nullptr);
|
||||||
|
|
||||||
|
REQUIRE(!stub_writer->writes.empty());
|
||||||
|
const auto& write = stub_writer->writes[0];
|
||||||
|
const uint8_t* data = write.contents.data();
|
||||||
|
size_t size = write.contents.size();
|
||||||
|
REQUIRE(write.m == consensus::ledger_get_range);
|
||||||
|
auto [from_seqno_, to_seqno_, purpose_] =
|
||||||
|
ringbuffer::read_message<consensus::ledger_get_range>(data, size);
|
||||||
|
auto& purpose = purpose_;
|
||||||
|
auto& from_seqno = from_seqno_;
|
||||||
|
auto& to_seqno = to_seqno_;
|
||||||
|
REQUIRE(purpose == consensus::LedgerRequestPurpose::HistoricalQuery);
|
||||||
|
REQUIRE(from_seqno == to_seqno);
|
||||||
|
REQUIRE(from_seqno == low_seqno);
|
||||||
|
stub_writer->writes.clear();
|
||||||
|
}
|
||||||
|
|
||||||
auto provide_ledger_entry = [&](size_t i) {
|
auto provide_ledger_entry = [&](size_t i) {
|
||||||
bool accepted = cache.handle_ledger_entry(i, ledger.at(i));
|
bool accepted = cache.handle_ledger_entry(i, ledger.at(i));
|
||||||
return accepted;
|
return accepted;
|
||||||
|
|
Загрузка…
Ссылка в новой задаче