Adding command to print out ingestion sizes

Knowing these sizes can help us optimize things.  I'm going to use the
row counts as guidance to improve our ingestion benchmarks.
This commit is contained in:
Ben Dean-Kawamura 2024-03-22 16:03:16 -04:00 коммит произвёл bendk
Родитель d6eeec7590
Коммит be6735f29c
7 изменённых файлов: 84 добавлений и 4 удалений

Просмотреть файл

@ -5,3 +5,4 @@ dev-install = ["install", "asdev"]
verify_env = ["asdev", "verify_env"]
fxa = ["run", "-p", "examples-fxa-client", "--"]
suggest-bench = ["bench", "-p", "suggest", "--features", "benchmark_api"]
suggest-debug-ingestion-sizes = ["run", "-p", "suggest", "--bin", "debug_ingestion_sizes", "--features", "benchmark_api"]

Просмотреть файл

@ -20,6 +20,7 @@ serde_json = "1"
error-support = { path = "../support/error" }
sql-support = { path = "../support/sql" }
viaduct = { path = "../viaduct" }
viaduct-reqwest = { path = "../support/viaduct-reqwest", optional=true }
tempfile = { version = "3.2.0", optional = true }
thiserror = "1"
uniffi = { workspace = true }
@ -29,7 +30,6 @@ url = { version = "2.1", features = ["serde"] }
criterion = "0.5"
env_logger = { version = "0.10", default-features = false }
expect-test = "1.4"
viaduct-reqwest = { path = "../support/viaduct-reqwest" }
hex = "0.4"
rc_crypto = { path = "../support/rc_crypto" }
@ -38,9 +38,13 @@ uniffi = { workspace = true, features = ["build"] }
[features]
# Required for the benchmarks to work, wasted bytes otherwise.
benchmark_api = ["tempfile"]
benchmark_api = ["tempfile", "viaduct-reqwest"]
[[bench]]
name = "benchmark_all"
harness = false
required-features = ["benchmark_api"]
[[bin]]
name = "debug_ingestion_sizes"
required-features = ["benchmark_api"]

Просмотреть файл

@ -17,3 +17,12 @@ from these measurements.
- Ingestion with synthetic data. This would isolate the benchmark from changes to the RS database.
- Fetching suggestions
## cargo suggest-debug-ingestion-sizes
Run this to get row counts for all database tables. This can be very useful for improving
benchmarks, since targeting the tables with the largest number of rows will usually lead to the
largest improvements.
The command also prints out the size of all remote-settings attachments, which can be good to
optimize on its own since it represents the amount of data users need to download.

Просмотреть файл

@ -14,8 +14,8 @@ use std::collections::HashMap;
/// [RemoteSettingsBenchmarkClient] implements [SuggestRemoteSettingsClient] by getting data from a HashMap rather than hitting the network.
///
/// NOTE(review): the sentence above names `RemoteSettingsBenchmarkClient`, but it is attached to
/// `RemoteSettingsWarmUpClient` -- confirm which type it is meant to describe.
pub struct RemoteSettingsWarmUpClient {
client: Client,
get_records_responses: Mutex<HashMap<GetItemsOptions, RemoteSettingsResponse>>,
get_attachment_responses: Mutex<HashMap<String, Vec<u8>>>,
// The response maps are `pub` so that code outside this module can read them directly;
// `print_debug_ingestion_sizes` sums the byte lengths stored in `get_attachment_responses`.
pub get_records_responses: Mutex<HashMap<GetItemsOptions, RemoteSettingsResponse>>,
pub get_attachment_responses: Mutex<HashMap<String, Vec<u8>>>,
}
impl RemoteSettingsWarmUpClient {

Просмотреть файл

@ -9,6 +9,7 @@ use crate::{
},
rs::SuggestRecordType,
store::SuggestStoreInner,
SuggestIngestionConstraints,
};
use std::sync::atomic::{AtomicU32, Ordering};
@ -83,3 +84,33 @@ pub fn all_benchmarks() -> Vec<(&'static str, IngestBenchmark)> {
),
]
}
/// Runs a full ingestion and prints size statistics to stdout.
///
/// The store is backed by a shared-cache, in-memory SQLite database and a
/// [`RemoteSettingsWarmUpClient`]. After ingesting with the default constraints, this prints:
///
/// - the combined byte length of every attachment response held by the client, rounded to the
///   nearest 1000 bytes, and
/// - the row count of each database table, in the order returned by `table_row_counts`.
///
/// # Panics
///
/// Panics if ingestion fails; this is a debugging aid, so errors are not propagated.
pub fn print_debug_ingestion_sizes() {
    viaduct_reqwest::use_reqwest_backend();

    let store = SuggestStoreInner::new(
        "file:debug_ingestion_sizes?mode=memory&cache=shared",
        RemoteSettingsWarmUpClient::new(),
    );
    let constraints = SuggestIngestionConstraints::default();
    store.ingest(constraints).unwrap();

    // Collect the row counts first: `into_settings_client` consumes the store.
    let row_counts = store.table_row_counts();
    let settings_client = store.into_settings_client();

    let attachment_bytes: usize = settings_client
        .get_attachment_responses
        .lock()
        .values()
        .map(Vec::len)
        .sum();
    // Adding 500 before the integer division rounds to the nearest unit instead of truncating.
    let attachment_kb = (attachment_bytes + 500) / 1000;

    println!("Total attachment size: {}kb", attachment_kb);
    println!();
    println!("Database table row counts");
    println!("-------------------------");
    row_counts
        .into_iter()
        .for_each(|(name, count)| println!("{name:30}: {count}"));
}

Просмотреть файл

@ -0,0 +1,9 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
use suggest::benchmarks::ingest;
/// Entry point for the `debug_ingestion_sizes` binary: delegates to
/// `ingest::print_debug_ingestion_sizes`, which does all of the work.
fn main() {
    ingest::print_debug_ingestion_sizes();
}

Просмотреть файл

@ -623,6 +623,32 @@ where
)
.unwrap()
}
/// Returns `(table name, row count)` for every `sqlite_master` entry of type `table`, ordered
/// from the largest row count to the smallest.
///
/// # Panics
///
/// Panics if the database cannot be opened or any query fails. This is only used for
/// debugging, so `unwrap` is used to keep the error handling simple.
pub fn table_row_counts(&self) -> Vec<(String, u32)> {
    use sql_support::ConnExt;

    let dbs = self.dbs().unwrap();
    let conn = dbs.reader.conn.lock();

    let table_names: Vec<String> = conn
        .query_rows_and_then(
            "SELECT name FROM sqlite_master where type = 'table'",
            (),
            |row| row.get(0),
        )
        .unwrap();

    let mut counts: Vec<(String, u32)> = Vec::with_capacity(table_names.len());
    for name in table_names {
        let sql = format!("SELECT COUNT(*) FROM {name}");
        let count: u32 = conn.query_one(&sql).unwrap();
        counts.push((name, count));
    }

    // Stable sort, descending by count: tables with equal counts keep their query order.
    counts.sort_by_key(|entry| std::cmp::Reverse(entry.1));
    counts
}
}
/// Holds a store's open connections to the Suggest database.