[release/2.x] Cherry pick: Historical queries: Re-request seqnos if host fails to respond (#4976) (#4982)

2023-02-10 09:13:25 +00:00 · 2023-02-10 09:13:25 +00:00 · 60b3f8eba2
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -5,6 +5,15 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).

+## [2.0.15]
+
+[2.0.15]: https://github.com/microsoft/CCF/releases/tag/ccf-2.0.15
+
+### Changed
+
+- Additional logging of historical query flow in `UNSAFE` builds.
+- Historical query system will re-request entries if the host fails to provide them within a fixed time.
+
 ## [2.0.14]

 [2.0.14]: https://github.com/microsoft/CCF/releases/tag/ccf-2.0.14
--- a/cmake/version.cmake
+++ b/cmake/version.cmake
@ -10,7 +10,7 @@ option(UNSAFE_VERSION "Produce build with unsafe logging levels" OFF)
 set(CCF_PROJECT "ccf")
 if(UNSAFE_VERSION)
  set(CCF_PROJECT "${CCF_PROJECT}_unsafe")
-  add_compile_definitions(UNSAFE_VERSION)
+  add_compile_definitions(UNSAFE_VERSION ENABLE_HISTORICAL_VERBOSE_LOGGING)
  file(WRITE ${CMAKE_BINARY_DIR}/UNSAFE "UNSAFE")
  install(FILES ${CMAKE_BINARY_DIR}/UNSAFE DESTINATION share)
 endif()
--- a/scripts/ci-checks.sh
+++ b/scripts/ci-checks.sh
@ -89,7 +89,7 @@ fi

 source scripts/env/bin/activate
 pip install -U pip
-pip install -U wheel black pylint mypy 1>/dev/null
+pip install -U wheel black==22.12.0 pylint==2.15.10 mypy 1>/dev/null

 echo "$CHECK_DELIMITER"
 echo "-- Python format"
--- a/src/node/historical_queries.h
+++ b/src/node/historical_queries.h
@ -62,6 +62,8 @@ FMT_END_NAMESPACE

 namespace ccf::historical
 {
+  static constexpr auto slow_fetch_threshold = std::chrono::milliseconds(1000);
+
  static std::optional<ccf::PrimarySignature> get_signature(
    const kv::StorePtr& sig_store)
  {
@ -293,17 +295,6 @@ namespace ccf::historical
        return SeqNoCollection(newly_requested.begin(), newly_requested.end());
      }

-      enum class PopulateReceiptsResult
-      {
-        // Common result. The new seqno may have added receipts for some entries
-        Continue,
-
-        // Occasional result. The new seqno was at the end of the sequence (or
-        // an attempt at retrieving a trailing supporting signature), but we
-        // still have receiptless entries, so attempt to fetch the next
-        FetchNext,
-      };
-
      std::optional<ccf::SeqNo> populate_receipts(ccf::SeqNo new_seqno)
      {
        HISTORICAL_LOG(
@ -508,7 +499,11 @@ namespace ccf::historical
    // Track all things currently requested by external callers
    std::map<CompoundHandle, Request> requests;

-    std::set<ccf::SeqNo> pending_fetches;
+    // Store each seqno that is currently being fetched by the host, to avoid
+    // spamming it with duplicate requests, and how long it has been fetched
+    // for. If this gap gets too large, we will log a warning and allow it to be
+    // re-fetched
+    std::unordered_map<ccf::SeqNo, std::chrono::milliseconds> pending_fetches;

    ExpiryDuration default_expiry_duration = std::chrono::seconds(1800);

@ -522,22 +517,33 @@ namespace ccf::historical
      std::optional<ccf::SeqNo> unfetched_from = std::nullopt;
      std::optional<ccf::SeqNo> unfetched_to = std::nullopt;

+      LOG_TRACE_FMT("fetch_entries_range({}, {})", from, to);
+
      for (auto seqno = from; seqno <= to; ++seqno)
      {
-        const auto ib = pending_fetches.insert(seqno);
-        if (ib.second)
+        const auto ib = pending_fetches.try_emplace(seqno, 0);
+        if (
+          // Newly requested fetch
+          ib.second ||
+          // Fetch in-progress, but looks slow enough to retry
+          ib.first->second >= slow_fetch_threshold)
        {
          if (!unfetched_from.has_value())
          {
            unfetched_from = seqno;
          }
          unfetched_to = seqno;
+          ib.first->second = std::chrono::milliseconds(0);
        }
      }

      if (unfetched_from.has_value())
      {
        // Newly requested seqnos
+        LOG_TRACE_FMT(
+          "Writing to ringbuffer ledger_get_range({}, {})",
+          unfetched_from.value(),
+          unfetched_to.value());
        RINGBUFFER_WRITE_MESSAGE(
          consensus::ledger_get_range,
          to_host,
@ -1101,6 +1107,8 @@ namespace ccf::historical
      const uint8_t* data,
      size_t size)
    {
+      LOG_TRACE_FMT("handle_ledger_entries({}, {})", from_seqno, to_seqno);
+
      auto seqno = from_seqno;
      bool all_accepted = true;
      while (size > 0)
@ -1129,6 +1137,8 @@ namespace ccf::historical
    {
      std::lock_guard<ccf::Mutex> guard(requests_lock);

+      LOG_TRACE_FMT("handle_no_entry_range({}, {})", from_seqno, to_seqno);
+
      for (auto seqno = from_seqno; seqno <= to_seqno; ++seqno)
      {
        // The host failed or refused to give this entry. Currently just
@ -1231,6 +1241,20 @@ namespace ccf::historical
          ++it;
        }
      }
+
+      for (auto& [seqno, time_since_sent] : pending_fetches)
+      {
+        const auto time_after = time_since_sent + elapsed_ms;
+        // Log once, when we cross the time threshold
+        if (
+          time_since_sent < slow_fetch_threshold &&
+          time_after >= slow_fetch_threshold)
+        {
+          LOG_FAIL_FMT(
+            "Fetch for seqno {} is taking an unusually long time", seqno);
+        }
+        time_since_sent = time_after;
+      }
    }
  };

--- a/src/node/test/historical_queries.cpp
+++ b/src/node/test/historical_queries.cpp
@ -3,7 +3,7 @@

 #define OVERRIDE_MAX_HISTORY_LEN 4
 // Uncomment this to aid debugging
-//#define ENABLE_HISTORICAL_VERBOSE_LOGGING
+// #define ENABLE_HISTORICAL_VERBOSE_LOGGING

 #include "node/historical_queries.h"

@ -260,6 +260,7 @@ TEST_CASE("StateCache point queries")
  static const ccf::historical::RequestHandle default_handle = 0;
  static const ccf::historical::RequestHandle low_handle = 1;
  static const ccf::historical::RequestHandle high_handle = 2;
+  static const ccf::historical::RequestHandle retry_handle = 3;

  {
    INFO(
@ -274,6 +275,48 @@ TEST_CASE("StateCache point queries")
    REQUIRE(cache.get_state_at(low_handle, low_seqno) == nullptr);
  }

+  {
+    INFO(
+      "If too much time passes without providing request seqnos, failures are "
+      "logged");
+
+    size_t fails_count = 0;
+
+    struct FailCountingStub : public logger::AbstractLogger
+    {
+      size_t& fails_count;
+
+      FailCountingStub(size_t& fc) : fails_count(fc) {}
+
+      void write(
+        const logger::LogLine& ll,
+        const std::optional<double>& enclave_offset = std::nullopt) override
+      {
+        if (ll.log_level == logger::FAIL)
+        {
+          ++fails_count;
+        }
+      }
+    };
+
+    logger::config::loggers().emplace_back(
+      std::make_unique<FailCountingStub>(fails_count));
+
+    // Less time than threshold has elapsed => No failed logging
+    cache.tick(ccf::historical::slow_fetch_threshold / 2);
+    REQUIRE(fails_count == 0);
+
+    // Threshold elapsed => Failed logging
+    cache.tick(ccf::historical::slow_fetch_threshold / 2);
+    REQUIRE(fails_count == 3);
+
+    // Much more time passes => No more logging
+    cache.tick(ccf::historical::slow_fetch_threshold * 4);
+    REQUIRE(fails_count == 3);
+
+    logger::config::loggers().clear();
+  }
+
  {
    INFO("The host sees requests for these indices");
    REQUIRE(!stub_writer->writes.empty());
@ -291,11 +334,37 @@ TEST_CASE("StateCache point queries")
      auto& to_seqno = to_seqno_;
      REQUIRE(purpose == consensus::LedgerRequestPurpose::HistoricalQuery);
      REQUIRE(from_seqno == to_seqno);
-      actual.insert(from_seqno);
+      // Despite multiple calls for each seqno, the host only sees a single
+      // request for each
+      REQUIRE(actual.insert(from_seqno).second);
    }
+    stub_writer->writes.clear();
    REQUIRE(actual == expected);
  }

+  {
+    INFO("Once sufficient time has passed, re-requesting will re-fetch");
+
+    cache.tick(ccf::historical::slow_fetch_threshold);
+    REQUIRE(cache.get_state_at(default_handle, low_seqno) == nullptr);
+    REQUIRE(cache.get_state_at(default_handle, unsigned_seqno) == nullptr);
+
+    REQUIRE(!stub_writer->writes.empty());
+    const auto& write = stub_writer->writes[0];
+    const uint8_t* data = write.contents.data();
+    size_t size = write.contents.size();
+    REQUIRE(write.m == consensus::ledger_get_range);
+    auto [from_seqno_, to_seqno_, purpose_] =
+      ringbuffer::read_message<consensus::ledger_get_range>(data, size);
+    auto& purpose = purpose_;
+    auto& from_seqno = from_seqno_;
+    auto& to_seqno = to_seqno_;
+    REQUIRE(purpose == consensus::LedgerRequestPurpose::HistoricalQuery);
+    REQUIRE(from_seqno == to_seqno);
+    REQUIRE(from_seqno == low_seqno);
+    stub_writer->writes.clear();
+  }
+
  auto provide_ledger_entry = [&](size_t i) {
    bool accepted = cache.handle_ledger_entry(i, ledger.at(i));
    return accepted;