Adding command to print out ingestion sizes
Knowing these sizes can help us optimize things. I'm going to use the row counts as guidance to improve our ingestion benchmarks.
This commit is contained in:
Родитель
d6eeec7590
Коммит
be6735f29c
|
@ -5,3 +5,4 @@ dev-install = ["install", "asdev"]
|
|||
verify_env = ["asdev", "verify_env"]
|
||||
fxa = ["run", "-p", "examples-fxa-client", "--"]
|
||||
suggest-bench = ["bench", "-p", "suggest", "--features", "benchmark_api"]
|
||||
suggest-debug-ingestion-sizes = ["run", "-p", "suggest", "--bin", "debug_ingestion_sizes", "--features", "benchmark_api"]
|
||||
|
|
|
@ -20,6 +20,7 @@ serde_json = "1"
|
|||
error-support = { path = "../support/error" }
|
||||
sql-support = { path = "../support/sql" }
|
||||
viaduct = { path = "../viaduct" }
|
||||
viaduct-reqwest = { path = "../support/viaduct-reqwest", optional=true }
|
||||
tempfile = { version = "3.2.0", optional = true }
|
||||
thiserror = "1"
|
||||
uniffi = { workspace = true }
|
||||
|
@ -29,7 +30,6 @@ url = { version = "2.1", features = ["serde"] }
|
|||
criterion = "0.5"
|
||||
env_logger = { version = "0.10", default-features = false }
|
||||
expect-test = "1.4"
|
||||
viaduct-reqwest = { path = "../support/viaduct-reqwest" }
|
||||
hex = "0.4"
|
||||
rc_crypto = { path = "../support/rc_crypto" }
|
||||
|
||||
|
@ -38,9 +38,13 @@ uniffi = { workspace = true, features = ["build"] }
|
|||
|
||||
[features]
|
||||
# Required for the benchmarks to work, wasted bytes otherwise.
|
||||
benchmark_api = ["tempfile"]
|
||||
benchmark_api = ["tempfile", "viaduct-reqwest"]
|
||||
|
||||
[[bench]]
|
||||
name = "benchmark_all"
|
||||
harness = false
|
||||
required-features = ["benchmark_api"]
|
||||
|
||||
[[bin]]
|
||||
name = "debug_ingestion_sizes"
|
||||
required-features = ["benchmark_api"]
|
||||
|
|
|
@ -17,3 +17,12 @@ from these measurements.
|
|||
|
||||
- Ingestion with synthetic data. This would isolate the benchmark from changes to the RS database.
|
||||
- Fetching suggestions
|
||||
|
||||
## cargo suggest-debug-ingestion-sizes
|
||||
|
||||
Run this to get row counts for all database tables. This can be very useful for improving
|
||||
benchmarks, since targeting the tables with the largest number of rows will usually lead to the
|
||||
largest improvements.
|
||||
|
||||
The command also prints out the size of all remote-settings attachments, which can be good to
|
||||
optimize on its own since it represents the amount of data users need to download.
|
||||
|
|
|
@ -14,8 +14,8 @@ use std::collections::HashMap;
|
|||
/// [RemoteSettingsBenchmarkClient] implements [SuggestRemoteSettingsClient] by getting data from a HashMap rather than hitting the network.
|
||||
pub struct RemoteSettingsWarmUpClient {
|
||||
client: Client,
|
||||
get_records_responses: Mutex<HashMap<GetItemsOptions, RemoteSettingsResponse>>,
|
||||
get_attachment_responses: Mutex<HashMap<String, Vec<u8>>>,
|
||||
pub get_records_responses: Mutex<HashMap<GetItemsOptions, RemoteSettingsResponse>>,
|
||||
pub get_attachment_responses: Mutex<HashMap<String, Vec<u8>>>,
|
||||
}
|
||||
|
||||
impl RemoteSettingsWarmUpClient {
|
||||
|
|
|
@ -9,6 +9,7 @@ use crate::{
|
|||
},
|
||||
rs::SuggestRecordType,
|
||||
store::SuggestStoreInner,
|
||||
SuggestIngestionConstraints,
|
||||
};
|
||||
use std::sync::atomic::{AtomicU32, Ordering};
|
||||
|
||||
|
@ -83,3 +84,33 @@ pub fn all_benchmarks() -> Vec<(&'static str, IngestBenchmark)> {
|
|||
),
|
||||
]
|
||||
}
|
||||
|
||||
pub fn print_debug_ingestion_sizes() {
|
||||
viaduct_reqwest::use_reqwest_backend();
|
||||
let store = SuggestStoreInner::new(
|
||||
"file:debug_ingestion_sizes?mode=memory&cache=shared",
|
||||
RemoteSettingsWarmUpClient::new(),
|
||||
);
|
||||
store
|
||||
.ingest(SuggestIngestionConstraints::default())
|
||||
.unwrap();
|
||||
let table_row_counts = store.table_row_counts();
|
||||
let client = store.into_settings_client();
|
||||
let total_attachment_size: usize = client
|
||||
.get_attachment_responses
|
||||
.lock()
|
||||
.values()
|
||||
.map(|data| data.len())
|
||||
.sum();
|
||||
|
||||
println!(
|
||||
"Total attachment size: {}kb",
|
||||
(total_attachment_size + 500) / 1000
|
||||
);
|
||||
println!();
|
||||
println!("Database table row counts");
|
||||
println!("-------------------------");
|
||||
for (name, count) in table_row_counts {
|
||||
println!("{name:30}: {count}");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,9 @@
|
|||
/* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
use suggest::benchmarks::ingest;
|
||||
|
||||
fn main() {
|
||||
ingest::print_debug_ingestion_sizes()
|
||||
}
|
|
@ -623,6 +623,32 @@ where
|
|||
)
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
pub fn table_row_counts(&self) -> Vec<(String, u32)> {
|
||||
use sql_support::ConnExt;
|
||||
|
||||
// Note: since this is just used for debugging, use unwrap to simplify the error handling.
|
||||
let reader = &self.dbs().unwrap().reader;
|
||||
let conn = reader.conn.lock();
|
||||
let table_names: Vec<String> = conn
|
||||
.query_rows_and_then(
|
||||
"SELECT name FROM sqlite_master where type = 'table'",
|
||||
(),
|
||||
|row| row.get(0),
|
||||
)
|
||||
.unwrap();
|
||||
let mut table_names_with_counts: Vec<(String, u32)> = table_names
|
||||
.into_iter()
|
||||
.map(|name| {
|
||||
let count: u32 = conn
|
||||
.query_one(&format!("SELECT COUNT(*) FROM {name}"))
|
||||
.unwrap();
|
||||
(name, count)
|
||||
})
|
||||
.collect();
|
||||
table_names_with_counts.sort_by(|a, b| (b.1.cmp(&a.1)));
|
||||
table_names_with_counts
|
||||
}
|
||||
}
|
||||
|
||||
/// Holds a store's open connections to the Suggest database.
|
||||
|
|
Загрузка…
Ссылка в новой задаче