[release/2.x] Cherry pick: Historical queries: Re-request seqnos if host fails to respond (#4976) (#4982)

This commit is contained in:
Eddy Ashton 2023-02-10 09:13:25 +00:00 коммит произвёл GitHub
Родитель 75d81934c5
Коммит 60b3f8eba2
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
5 изменённых файлов: 120 добавлений и 18 удалений

Просмотреть файл

@ -5,6 +5,15 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
## [2.0.15]
[2.0.15]: https://github.com/microsoft/CCF/releases/tag/ccf-2.0.15
### Changed
- Additional logging of historical query flow in `UNSAFE` builds.
- Historical query system will re-request entries if the host fails to provide them within a fixed time.
## [2.0.14]
[2.0.14]: https://github.com/microsoft/CCF/releases/tag/ccf-2.0.14

Просмотреть файл

@ -10,7 +10,7 @@ option(UNSAFE_VERSION "Produce build with unsafe logging levels" OFF)
set(CCF_PROJECT "ccf")
if(UNSAFE_VERSION)
set(CCF_PROJECT "${CCF_PROJECT}_unsafe")
add_compile_definitions(UNSAFE_VERSION)
add_compile_definitions(UNSAFE_VERSION ENABLE_HISTORICAL_VERBOSE_LOGGING)
file(WRITE ${CMAKE_BINARY_DIR}/UNSAFE "UNSAFE")
install(FILES ${CMAKE_BINARY_DIR}/UNSAFE DESTINATION share)
endif()

Просмотреть файл

@ -89,7 +89,7 @@ fi
source scripts/env/bin/activate
pip install -U pip
pip install -U wheel black pylint mypy 1>/dev/null
pip install -U wheel black==22.12.0 pylint==2.15.10 mypy 1>/dev/null
echo "$CHECK_DELIMITER"
echo "-- Python format"

Просмотреть файл

@ -62,6 +62,8 @@ FMT_END_NAMESPACE
namespace ccf::historical
{
static constexpr auto slow_fetch_threshold = std::chrono::milliseconds(1000);
static std::optional<ccf::PrimarySignature> get_signature(
const kv::StorePtr& sig_store)
{
@ -293,17 +295,6 @@ namespace ccf::historical
return SeqNoCollection(newly_requested.begin(), newly_requested.end());
}
enum class PopulateReceiptsResult
{
// Common result. The new seqno may have added receipts for some entries
Continue,
// Occasional result. The new seqno was at the end of the sequence (or
// an attempt at retrieving a trailing supporting signature), but we
// still have receiptless entries, so attempt to fetch the next
FetchNext,
};
std::optional<ccf::SeqNo> populate_receipts(ccf::SeqNo new_seqno)
{
HISTORICAL_LOG(
@ -508,7 +499,11 @@ namespace ccf::historical
// Track all things currently requested by external callers
std::map<CompoundHandle, Request> requests;
std::set<ccf::SeqNo> pending_fetches;
// Store each seqno that is currently being fetched by the host, to avoid
// spamming it with duplicate requests, and how long it has been fetched
// for. If this gap gets too large, we will log a warning and allow it to be
// re-fetched
std::unordered_map<ccf::SeqNo, std::chrono::milliseconds> pending_fetches;
ExpiryDuration default_expiry_duration = std::chrono::seconds(1800);
@ -522,22 +517,33 @@ namespace ccf::historical
std::optional<ccf::SeqNo> unfetched_from = std::nullopt;
std::optional<ccf::SeqNo> unfetched_to = std::nullopt;
LOG_TRACE_FMT("fetch_entries_range({}, {})", from, to);
for (auto seqno = from; seqno <= to; ++seqno)
{
const auto ib = pending_fetches.insert(seqno);
if (ib.second)
const auto ib = pending_fetches.try_emplace(seqno, 0);
if (
// Newly requested fetch
ib.second ||
// Fetch in-progress, but looks slow enough to retry
ib.first->second >= slow_fetch_threshold)
{
if (!unfetched_from.has_value())
{
unfetched_from = seqno;
}
unfetched_to = seqno;
ib.first->second = std::chrono::milliseconds(0);
}
}
if (unfetched_from.has_value())
{
// Newly requested seqnos
LOG_TRACE_FMT(
"Writing to ringbuffer ledger_get_range({}, {})",
unfetched_from.value(),
unfetched_to.value());
RINGBUFFER_WRITE_MESSAGE(
consensus::ledger_get_range,
to_host,
@ -1101,6 +1107,8 @@ namespace ccf::historical
const uint8_t* data,
size_t size)
{
LOG_TRACE_FMT("handle_ledger_entries({}, {})", from_seqno, to_seqno);
auto seqno = from_seqno;
bool all_accepted = true;
while (size > 0)
@ -1129,6 +1137,8 @@ namespace ccf::historical
{
std::lock_guard<ccf::Mutex> guard(requests_lock);
LOG_TRACE_FMT("handle_no_entry_range({}, {})", from_seqno, to_seqno);
for (auto seqno = from_seqno; seqno <= to_seqno; ++seqno)
{
// The host failed or refused to give this entry. Currently just
@ -1231,6 +1241,20 @@ namespace ccf::historical
++it;
}
}
for (auto& [seqno, time_since_sent] : pending_fetches)
{
const auto time_after = time_since_sent + elapsed_ms;
// Log once, when we cross the time threshold
if (
time_since_sent < slow_fetch_threshold &&
time_after >= slow_fetch_threshold)
{
LOG_FAIL_FMT(
"Fetch for seqno {} is taking an unusually long time", seqno);
}
time_since_sent = time_after;
}
}
};

Просмотреть файл

@ -3,7 +3,7 @@
#define OVERRIDE_MAX_HISTORY_LEN 4
// Uncomment this to aid debugging
//#define ENABLE_HISTORICAL_VERBOSE_LOGGING
// #define ENABLE_HISTORICAL_VERBOSE_LOGGING
#include "node/historical_queries.h"
@ -260,6 +260,7 @@ TEST_CASE("StateCache point queries")
static const ccf::historical::RequestHandle default_handle = 0;
static const ccf::historical::RequestHandle low_handle = 1;
static const ccf::historical::RequestHandle high_handle = 2;
static const ccf::historical::RequestHandle retry_handle = 3;
{
INFO(
@ -274,6 +275,48 @@ TEST_CASE("StateCache point queries")
REQUIRE(cache.get_state_at(low_handle, low_seqno) == nullptr);
}
{
INFO(
"If too much time passes without providing request seqnos, failures are "
"logged");
size_t fails_count = 0;
struct FailCountingStub : public logger::AbstractLogger
{
size_t& fails_count;
FailCountingStub(size_t& fc) : fails_count(fc) {}
void write(
const logger::LogLine& ll,
const std::optional<double>& enclave_offset = std::nullopt) override
{
if (ll.log_level == logger::FAIL)
{
++fails_count;
}
}
};
logger::config::loggers().emplace_back(
std::make_unique<FailCountingStub>(fails_count));
// Less time than threshold has elapsed => No failed logging
cache.tick(ccf::historical::slow_fetch_threshold / 2);
REQUIRE(fails_count == 0);
// Threshold elapsed => Failed logging
cache.tick(ccf::historical::slow_fetch_threshold / 2);
REQUIRE(fails_count == 3);
// Much more time passes => No more logging
cache.tick(ccf::historical::slow_fetch_threshold * 4);
REQUIRE(fails_count == 3);
logger::config::loggers().clear();
}
{
INFO("The host sees requests for these indices");
REQUIRE(!stub_writer->writes.empty());
@ -291,11 +334,37 @@ TEST_CASE("StateCache point queries")
auto& to_seqno = to_seqno_;
REQUIRE(purpose == consensus::LedgerRequestPurpose::HistoricalQuery);
REQUIRE(from_seqno == to_seqno);
actual.insert(from_seqno);
// Despite multiple calls for each seqno, the host only sees a single
// request for each
REQUIRE(actual.insert(from_seqno).second);
}
stub_writer->writes.clear();
REQUIRE(actual == expected);
}
{
INFO("Once sufficient time has passed, re-requesting will re-fetch");
cache.tick(ccf::historical::slow_fetch_threshold);
REQUIRE(cache.get_state_at(default_handle, low_seqno) == nullptr);
REQUIRE(cache.get_state_at(default_handle, unsigned_seqno) == nullptr);
REQUIRE(!stub_writer->writes.empty());
const auto& write = stub_writer->writes[0];
const uint8_t* data = write.contents.data();
size_t size = write.contents.size();
REQUIRE(write.m == consensus::ledger_get_range);
auto [from_seqno_, to_seqno_, purpose_] =
ringbuffer::read_message<consensus::ledger_get_range>(data, size);
auto& purpose = purpose_;
auto& from_seqno = from_seqno_;
auto& to_seqno = to_seqno_;
REQUIRE(purpose == consensus::LedgerRequestPurpose::HistoricalQuery);
REQUIRE(from_seqno == to_seqno);
REQUIRE(from_seqno == low_seqno);
stub_writer->writes.clear();
}
auto provide_ledger_entry = [&](size_t i) {
bool accepted = cache.handle_ledger_entry(i, ledger.at(i));
return accepted;