Bug 1931812 - Don't reingest the DB on each query benchmark

This addresses the bug and also makes one other change: I added a new method to
`BenchmarkWithInput` called `global_input()` that returns data that's then
passed by reference to each of the benchmark's iterations. That's a little
better than creating global data in benchmark constructors because only the
global data for the current benchmark is kept in memory, rather than the global
data for all benchmarks in the group. In the case of query benchmarks, the
global data is the store, so we avoid having N stores in memory at once, where
N is the number of benchmarks in the group. (I'm adding quite a few weather
benchmarks.) It also made sense to rename `generate_input()` to
`iteration_input()` to emphasize that it's called on each iteration.

I'm open to reverting this other change though if you disagree.
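
To make the shape concrete, here's a condensed sketch of the new trait and a
hand-rolled driver loop. The trait matches the one in the diff below, while
`run_one` is just a hypothetical stand-in for Criterion's `iter_batched`
setup/measure split:

```rust
// Condensed sketch of the reshaped trait; `run_one` stands in for Criterion.
pub trait BenchmarkWithInput {
    /// Built once per benchmark and shared by reference across iterations.
    type GlobalInput;
    /// Built fresh for every iteration (excluded from the measured time).
    type IterationInput;

    fn global_input(&self) -> Self::GlobalInput;
    fn iteration_input(&self) -> Self::IterationInput;
    fn benchmarked_code(&self, g_input: &Self::GlobalInput, i_input: Self::IterationInput);
}

fn run_one<B: BenchmarkWithInput>(benchmark: &B, iterations: usize) {
    // Global setup happens once, outside the measured loop...
    let g_input = benchmark.global_input();
    for _ in 0..iterations {
        // ...while per-iteration setup is redone every time.
        let i_input = benchmark.iteration_input();
        benchmark.benchmarked_code(&g_input, i_input);
    }
}
```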

Running `time cargo suggest-bench -- query/query`, this branch isn't
dramatically faster than main, but it's a little better:

```
main branch: ~83s
this branch: ~68s (~18% improvement)
```
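
For context, the heart of the fix is the `new_store()` helper in the diff
below: ingest once into a "starter" DB, checkpoint the WAL so the ingested data
lands in the main DB file, then give each benchmark a plain file copy. A
minimal sketch of that caching shape, with `ingest_all` and `wal_checkpoint` as
hypothetical stand-ins for the real `SuggestStore` calls:

```rust
use std::{
    fs,
    path::{Path, PathBuf},
    sync::OnceLock,
};

// Hypothetical stand-ins for the real SuggestStore calls; only the caching
// shape matters here.
fn ingest_all(_db_path: &Path) { /* expensive remote-settings ingestion */ }
fn wal_checkpoint(_db_path: &Path) { /* PRAGMA wal_checkpoint(TRUNCATE) */ }

static STARTER_DB: OnceLock<PathBuf> = OnceLock::new();

/// Returns a fresh DB path that already contains ingested data, without
/// re-running ingestion: ingest once, flush the WAL into the main DB file,
/// then give every caller a plain file copy.
fn fresh_ingested_db(scratch: &Path, name: &str) -> PathBuf {
    let starter = STARTER_DB.get_or_init(|| {
        let starter_path = scratch.join("starter.sqlite");
        ingest_all(&starter_path); // expensive: done exactly once per process
        // Without this, the file copy below would miss whatever is still
        // sitting in the write-ahead log.
        wal_checkpoint(&starter_path);
        starter_path
    });
    let copy = scratch.join(name);
    fs::copy(starter, &copy).expect("Error copying starter DB file");
    copy
}
```
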
Drew Willcoxon 2024-11-18 00:04:48 -08:00 committed by Drew
Parent 8e7aa1723c
Commit 8cc7b4bb3c
5 changed files with 87 additions and 38 deletions


```diff
@@ -24,10 +24,11 @@ fn run_benchmarks<B: BenchmarkWithInput, M: Measurement>(
     benchmarks: Vec<(&'static str, B)>,
 ) {
     for (name, benchmark) in benchmarks {
+        let g_input = benchmark.global_input();
         group.bench_function(name.to_string(), |b| {
             b.iter_batched(
-                || benchmark.generate_input(),
-                |input| benchmark.benchmarked_code(input),
+                || benchmark.iteration_input(),
+                |i_input| benchmark.benchmarked_code(&g_input, i_input),
                 // See https://docs.rs/criterion/latest/criterion/enum.BatchSize.html#variants for
                 // a discussion of this. PerIteration is chosen for these benchmarks because the
                 // input holds a database file handle
```


```diff
@@ -48,9 +48,12 @@ impl IngestBenchmark {
 pub struct InputType(SuggestStoreInner<RemoteSettingsBenchmarkClient>);
 impl BenchmarkWithInput for IngestBenchmark {
-    type Input = InputType;
+    type GlobalInput = ();
+    type IterationInput = InputType;
-    fn generate_input(&self) -> Self::Input {
+    fn global_input(&self) -> Self::GlobalInput {}
+    fn iteration_input(&self) -> Self::IterationInput {
         let data_path = self.temp_dir.path().join(unique_db_filename());
         let store = SuggestStoreInner::new(data_path, vec![], self.client.clone());
         store.ensure_db_initialized();
@@ -61,7 +64,7 @@ impl BenchmarkWithInput for IngestBenchmark {
         InputType(store)
     }
-    fn benchmarked_code(&self, input: Self::Input) {
+    fn benchmarked_code(&self, _: &Self::GlobalInput, input: Self::IterationInput) {
         let InputType(store) = input;
         store.ingest_records_by_type(self.record_type);
     }
```


```diff
@@ -10,7 +10,16 @@
 //!
 //! All benchmarks are defined as structs that implement either the [Benchmark] or [BenchmarkWithInput]
-use std::sync::atomic::{AtomicU32, Ordering};
+use std::{
+    path::PathBuf,
+    sync::{
+        atomic::{AtomicU32, Ordering},
+        OnceLock,
+    },
+};
+use tempfile::TempDir;
+use crate::{SuggestIngestionConstraints, SuggestStore};
 pub mod client;
 pub mod ingest;
@@ -28,21 +37,52 @@ pub trait Benchmark {
 /// Trait for benchmarks that require input
 ///
 /// This will run using Criterion's `iter_batched` function. Criterion will create a batch of
-/// inputs, then pass each one to benchmark.
+/// inputs, then pass each one to the benchmark's iterations.
 ///
-/// This supports simple benchmarks that don't require any input. Note: global setup can be done
-/// in the `new()` method for the struct.
+/// This supports simple benchmarks that don't require any input.
 pub trait BenchmarkWithInput {
-    type Input;
+    /// Input that will be created once and then passed by reference to each
+    /// of the benchmark's iterations.
+    type GlobalInput;
-    /// Generate the input (this is not included in the benchmark time)
-    fn generate_input(&self) -> Self::Input;
+    /// Input that will be created for each of the benchmark's iterations.
+    type IterationInput;
+    /// Generate the global input (not included in the benchmark time)
+    fn global_input(&self) -> Self::GlobalInput;
+    /// Generate the per-iteration input (not included in the benchmark time)
+    fn iteration_input(&self) -> Self::IterationInput;
     /// Perform the operations that we're benchmarking.
-    fn benchmarked_code(&self, input: Self::Input);
+    fn benchmarked_code(&self, g_input: &Self::GlobalInput, i_input: Self::IterationInput);
 }
 fn unique_db_filename() -> String {
     static COUNTER: AtomicU32 = AtomicU32::new(0);
     format!("db{}.sqlite", COUNTER.fetch_add(1, Ordering::Relaxed))
 }
+/// Creates a new store that will contain all provider data currently in remote
+/// settings.
+fn new_store() -> SuggestStore {
+    // Create a "starter" store that will do an initial ingest, and then
+    // initialize every returned store with a copy of its DB so that each one
+    // doesn't need to reingest.
+    static STARTER: OnceLock<(TempDir, PathBuf)> = OnceLock::new();
+    let (starter_dir, starter_db_path) = STARTER.get_or_init(|| {
+        let temp_dir = tempfile::tempdir().unwrap();
+        let db_path = temp_dir.path().join(unique_db_filename());
+        let store =
+            SuggestStore::new(&db_path.to_string_lossy(), None).expect("Error building store");
+        store
+            .ingest(SuggestIngestionConstraints::all_providers())
+            .expect("Error during ingestion");
+        store.checkpoint();
+        (temp_dir, db_path)
+    });
+    let db_path = starter_dir.path().join(unique_db_filename());
+    std::fs::copy(starter_db_path, &db_path).expect("Error copying starter DB file");
+    SuggestStore::new(&db_path.to_string_lossy(), None).expect("Error building store")
+}
```


```diff
@@ -3,50 +3,39 @@
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 use crate::{
-    benchmarks::{unique_db_filename, BenchmarkWithInput},
-    SuggestIngestionConstraints, SuggestStore, SuggestionProvider, SuggestionQuery,
+    benchmarks::{new_store, BenchmarkWithInput},
+    SuggestStore, SuggestionProvider, SuggestionQuery,
 };
 pub struct QueryBenchmark {
-    store: SuggestStore,
     provider: SuggestionProvider,
     query: &'static str,
 }
 impl QueryBenchmark {
     pub fn new(provider: SuggestionProvider, query: &'static str) -> Self {
-        let temp_dir = tempfile::tempdir().unwrap();
-        let data_path = temp_dir.path().join(unique_db_filename());
-        let store =
-            SuggestStore::new(&data_path.to_string_lossy(), None).expect("Error building store");
-        store
-            .ingest(SuggestIngestionConstraints::all_providers())
-            .expect("Error during ingestion");
-        Self {
-            store,
-            provider,
-            query,
-        }
+        Self { provider, query }
     }
 }
-// The input for each benchmark a query to pass to the store
-pub struct InputType(SuggestionQuery);
 impl BenchmarkWithInput for QueryBenchmark {
-    type Input = InputType;
+    type GlobalInput = SuggestStore;
+    type IterationInput = SuggestionQuery;
-    fn generate_input(&self) -> Self::Input {
-        InputType(SuggestionQuery {
+    fn global_input(&self) -> Self::GlobalInput {
+        new_store()
+    }
+    fn iteration_input(&self) -> Self::IterationInput {
+        SuggestionQuery {
             providers: vec![self.provider],
             keyword: self.query.to_string(),
             ..SuggestionQuery::default()
-        })
+        }
     }
-    fn benchmarked_code(&self, input: Self::Input) {
-        let InputType(query) = input;
-        self.store
+    fn benchmarked_code(&self, store: &Self::GlobalInput, query: Self::IterationInput) {
+        store
             .query(query)
             .unwrap_or_else(|e| panic!("Error querying store: {e}"));
     }
```


```diff
@@ -266,6 +266,16 @@ impl SuggestStore {
     }
 }
+#[cfg(feature = "benchmark_api")]
+impl SuggestStore {
+    /// Creates a WAL checkpoint. This will cause changes in the write-ahead log
+    /// to be written to the DB. See:
+    /// https://sqlite.org/pragma.html#pragma_wal_checkpoint
+    pub fn checkpoint(&self) {
+        self.inner.checkpoint();
+    }
+}
 /// Constraints limit which suggestions to ingest from Remote Settings.
 #[derive(Clone, Default, Debug, uniffi::Record)]
 pub struct SuggestIngestionConstraints {
@@ -751,6 +761,12 @@ where
         self.dbs().unwrap();
     }
+    fn checkpoint(&self) {
+        let conn = self.dbs().unwrap().writer.conn.lock();
+        conn.pragma_update(None, "wal_checkpoint", "TRUNCATE")
+            .expect("Error performing checkpoint");
+    }
     pub fn ingest_records_by_type(&self, ingest_record_type: SuggestRecordType) {
         let writer = &self.dbs().unwrap().writer;
         let mut context = MetricsContext::default();
```