Bug 1931812 - Don't reingest the DB on each query benchmark

This addresses the bug and also makes one other change: I added a new method to
`BenchmarkWithInput` called `global_input()` that returns data that's then
passed by reference to each of the benchmark's iterations. That's a little
better than creating global data in benchmark constructors because only the
global data for the current benchmark is kept in memory, rather than the global
data for all benchmarks in the group. In the case of query benchmarks, the
global data is the store, so we avoid having N stores in memory at once, where
N is the number of benchmarks in the group. (I'm adding quite a few weather
benchmarks.) It also made sense to rename `generate_input()` to
`iteration_input()` to emphasize that it's called on each iteration.

I'm open to reverting this other change though if you disagree.
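
To make the shape concrete, here's a condensed sketch of the new trait and a
hand-rolled driver loop. The trait matches the one in the diff below, while
`run_one` is just a hypothetical stand-in for Criterion's `iter_batched`
setup/measure split:

```rust
// Condensed sketch of the reshaped trait; `run_one` stands in for Criterion.
pub trait BenchmarkWithInput {
    /// Built once per benchmark and shared by reference across iterations.
    type GlobalInput;
    /// Built fresh for every iteration (excluded from the measured time).
    type IterationInput;

    fn global_input(&self) -> Self::GlobalInput;
    fn iteration_input(&self) -> Self::IterationInput;
    fn benchmarked_code(&self, g_input: &Self::GlobalInput, i_input: Self::IterationInput);
}

fn run_one<B: BenchmarkWithInput>(benchmark: &B, iterations: usize) {
    // Global setup happens once, outside the measured loop...
    let g_input = benchmark.global_input();
    for _ in 0..iterations {
        // ...while per-iteration setup is redone every time.
        let i_input = benchmark.iteration_input();
        benchmark.benchmarked_code(&g_input, i_input);
    }
}
```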

Running `time cargo suggest-bench -- query/query`, this branch isn't
dramatically faster than main, but it's a little better:

```
main branch: ~83s
this branch: ~68s (~18% improvement)
```
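
For context, the heart of the fix is the `new_store()` helper in the diff
below: ingest once into a "starter" DB, checkpoint the WAL so the ingested data
lands in the main DB file, then give each benchmark a plain file copy. A
minimal sketch of that caching shape, with `ingest_all` and `wal_checkpoint` as
hypothetical stand-ins for the real `SuggestStore` calls:

```rust
use std::{
    fs,
    path::{Path, PathBuf},
    sync::OnceLock,
};

// Hypothetical stand-ins for the real SuggestStore calls; only the caching
// shape matters here.
fn ingest_all(_db_path: &Path) { /* expensive remote-settings ingestion */ }
fn wal_checkpoint(_db_path: &Path) { /* PRAGMA wal_checkpoint(TRUNCATE) */ }

static STARTER_DB: OnceLock<PathBuf> = OnceLock::new();

/// Returns a fresh DB path that already contains ingested data, without
/// re-running ingestion: ingest once, flush the WAL into the main DB file,
/// then give every caller a plain file copy.
fn fresh_ingested_db(scratch: &Path, name: &str) -> PathBuf {
    let starter = STARTER_DB.get_or_init(|| {
        let starter_path = scratch.join("starter.sqlite");
        ingest_all(&starter_path); // expensive: done exactly once per process
        // Without this, the file copy below would miss whatever is still
        // sitting in the write-ahead log.
        wal_checkpoint(&starter_path);
        starter_path
    });
    let copy = scratch.join(name);
    fs::copy(starter, &copy).expect("Error copying starter DB file");
    copy
}
```
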
Drew Willcoxon 2024-11-18 00:04:48 -08:00 committed by Drew
Parent 8e7aa1723c
Commit 8cc7b4bb3c
5 changed files with 87 additions and 38 deletions


```diff
@@ -24,10 +24,11 @@ fn run_benchmarks<B: BenchmarkWithInput, M: Measurement>(
     benchmarks: Vec<(&'static str, B)>,
 ) {
     for (name, benchmark) in benchmarks {
+        let g_input = benchmark.global_input();
         group.bench_function(name.to_string(), |b| {
             b.iter_batched(
-                || benchmark.generate_input(),
-                |input| benchmark.benchmarked_code(input),
+                || benchmark.iteration_input(),
+                |i_input| benchmark.benchmarked_code(&g_input, i_input),
                 // See https://docs.rs/criterion/latest/criterion/enum.BatchSize.html#variants for
                 // a discussion of this. PerIteration is chosen for these benchmarks because the
                 // input holds a database file handle
```


```diff
@@ -48,9 +48,12 @@ impl IngestBenchmark {
 pub struct InputType(SuggestStoreInner<RemoteSettingsBenchmarkClient>);
 impl BenchmarkWithInput for IngestBenchmark {
-    type Input = InputType;
+    type GlobalInput = ();
+    type IterationInput = InputType;
-    fn generate_input(&self) -> Self::Input {
+    fn global_input(&self) -> Self::GlobalInput {}
+    fn iteration_input(&self) -> Self::IterationInput {
         let data_path = self.temp_dir.path().join(unique_db_filename());
         let store = SuggestStoreInner::new(data_path, vec![], self.client.clone());
         store.ensure_db_initialized();
@@ -61,7 +64,7 @@ impl BenchmarkWithInput for IngestBenchmark {
         InputType(store)
     }
-    fn benchmarked_code(&self, input: Self::Input) {
+    fn benchmarked_code(&self, _: &Self::GlobalInput, input: Self::IterationInput) {
         let InputType(store) = input;
         store.ingest_records_by_type(self.record_type);
     }
```


```diff
@@ -10,7 +10,16 @@
 //!
 //! All benchmarks are defined as structs that implement either the [Benchmark] or [BenchmarkWithInput]
-use std::sync::atomic::{AtomicU32, Ordering};
+use std::{
+    path::PathBuf,
+    sync::{
+        atomic::{AtomicU32, Ordering},
+        OnceLock,
+    },
+};
+use tempfile::TempDir;
+use crate::{SuggestIngestionConstraints, SuggestStore};
 pub mod client;
 pub mod ingest;
@@ -28,21 +37,52 @@ pub trait Benchmark {
 /// Trait for benchmarks that require input
 ///
 /// This will run using Criterion's `iter_batched` function. Criterion will create a batch of
-/// inputs, then pass each one to benchmark.
+/// inputs, then pass each one to the benchmark's iterations.
 ///
-/// This supports simple benchmarks that don't require any input. Note: global setup can be done
-/// in the `new()` method for the struct.
+/// This supports simple benchmarks that don't require any input.
 pub trait BenchmarkWithInput {
-    type Input;
+    /// Input that will be created once and then passed by reference to each
+    /// of the benchmark's iterations.
+    type GlobalInput;
-    /// Generate the input (this is not included in the benchmark time)
-    fn generate_input(&self) -> Self::Input;
+    /// Input that will be created for each of the benchmark's iterations.
+    type IterationInput;
+    /// Generate the global input (not included in the benchmark time)
+    fn global_input(&self) -> Self::GlobalInput;
+    /// Generate the per-iteration input (not included in the benchmark time)
+    fn iteration_input(&self) -> Self::IterationInput;
     /// Perform the operations that we're benchmarking.
-    fn benchmarked_code(&self, input: Self::Input);
+    fn benchmarked_code(&self, g_input: &Self::GlobalInput, i_input: Self::IterationInput);
 }
 fn unique_db_filename() -> String {
     static COUNTER: AtomicU32 = AtomicU32::new(0);
     format!("db{}.sqlite", COUNTER.fetch_add(1, Ordering::Relaxed))
 }
+/// Creates a new store that will contain all provider data currently in remote
+/// settings.
+fn new_store() -> SuggestStore {
+    // Create a "starter" store that will do an initial ingest, and then
+    // initialize every returned store with a copy of its DB so that each one
+    // doesn't need to reingest.
+    static STARTER: OnceLock<(TempDir, PathBuf)> = OnceLock::new();
+    let (starter_dir, starter_db_path) = STARTER.get_or_init(|| {
+        let temp_dir = tempfile::tempdir().unwrap();
+        let db_path = temp_dir.path().join(unique_db_filename());
+        let store =
+            SuggestStore::new(&db_path.to_string_lossy(), None).expect("Error building store");
+        store
+            .ingest(SuggestIngestionConstraints::all_providers())
+            .expect("Error during ingestion");
+        store.checkpoint();
+        (temp_dir, db_path)
+    });
+    let db_path = starter_dir.path().join(unique_db_filename());
+    std::fs::copy(starter_db_path, &db_path).expect("Error copying starter DB file");
+    SuggestStore::new(&db_path.to_string_lossy(), None).expect("Error building store")
+}
```


```diff
@@ -3,50 +3,39 @@
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 use crate::{
-    benchmarks::{unique_db_filename, BenchmarkWithInput},
-    SuggestIngestionConstraints, SuggestStore, SuggestionProvider, SuggestionQuery,
+    benchmarks::{new_store, BenchmarkWithInput},
+    SuggestStore, SuggestionProvider, SuggestionQuery,
 };
 pub struct QueryBenchmark {
-    store: SuggestStore,
     provider: SuggestionProvider,
     query: &'static str,
 }
 impl QueryBenchmark {
     pub fn new(provider: SuggestionProvider, query: &'static str) -> Self {
-        let temp_dir = tempfile::tempdir().unwrap();
-        let data_path = temp_dir.path().join(unique_db_filename());
-        let store =
-            SuggestStore::new(&data_path.to_string_lossy(), None).expect("Error building store");
-        store
-            .ingest(SuggestIngestionConstraints::all_providers())
-            .expect("Error during ingestion");
-        Self {
-            store,
-            provider,
-            query,
-        }
+        Self { provider, query }
     }
 }
-// The input for each benchmark a query to pass to the store
-pub struct InputType(SuggestionQuery);
 impl BenchmarkWithInput for QueryBenchmark {
-    type Input = InputType;
+    type GlobalInput = SuggestStore;
+    type IterationInput = SuggestionQuery;
-    fn generate_input(&self) -> Self::Input {
-        InputType(SuggestionQuery {
+    fn global_input(&self) -> Self::GlobalInput {
+        new_store()
+    }
+    fn iteration_input(&self) -> Self::IterationInput {
+        SuggestionQuery {
             providers: vec![self.provider],
             keyword: self.query.to_string(),
             ..SuggestionQuery::default()
-        })
+        }
     }
-    fn benchmarked_code(&self, input: Self::Input) {
-        let InputType(query) = input;
-        self.store
+    fn benchmarked_code(&self, store: &Self::GlobalInput, query: Self::IterationInput) {
+        store
             .query(query)
             .unwrap_or_else(|e| panic!("Error querying store: {e}"));
     }
```


```diff
@@ -266,6 +266,16 @@ impl SuggestStore {
     }
 }
+#[cfg(feature = "benchmark_api")]
+impl SuggestStore {
+    /// Creates a WAL checkpoint. This will cause changes in the write-ahead log
+    /// to be written to the DB. See:
+    /// https://sqlite.org/pragma.html#pragma_wal_checkpoint
+    pub fn checkpoint(&self) {
+        self.inner.checkpoint();
+    }
+}
 /// Constraints limit which suggestions to ingest from Remote Settings.
 #[derive(Clone, Default, Debug, uniffi::Record)]
 pub struct SuggestIngestionConstraints {
@@ -751,6 +761,12 @@ where
         self.dbs().unwrap();
     }
+    fn checkpoint(&self) {
+        let conn = self.dbs().unwrap().writer.conn.lock();
+        conn.pragma_update(None, "wal_checkpoint", "TRUNCATE")
+            .expect("Error performing checkpoint");
+    }
     pub fn ingest_records_by_type(&self, ingest_record_type: SuggestRecordType) {
         let writer = &self.dbs().unwrap().writer;
         let mut context = MetricsContext::default();
```