Bug 1931811 - Weather queries are slow due to lack of index on geonames_alternates(geoname_id)

This adds an index on `geonames_alternates(geoname_id)` plus some geonames and
weather benchmarks that I used to verify the change. The bug has some benchmark
numbers and discussion. The index speeds up weather queries by as much as ~98%,
going from hundreds of ms in the worst case to ~10ms.
This commit is contained in:
Drew Willcoxon 2024-11-18 18:24:26 -08:00 коммит произвёл Drew
Родитель 91c55bf4f3
Коммит 4389742afc
6 изменённых файлов: 870 добавлений и 18 удалений

Просмотреть файл

@ -2,7 +2,13 @@ use criterion::{
criterion_group, criterion_main, measurement::Measurement, BatchSize, BenchmarkGroup, Criterion,
};
use std::sync::Once;
use suggest::benchmarks::{ingest, query, BenchmarkWithInput};
use suggest::benchmarks::{geoname, ingest, query, BenchmarkWithInput};
pub fn geoname(c: &mut Criterion) {
setup_viaduct();
let group = c.benchmark_group("geoname");
run_benchmarks(group, geoname::all_benchmarks())
}
pub fn ingest(c: &mut Criterion) {
setup_viaduct();
@ -44,5 +50,5 @@ fn setup_viaduct() {
INIT.call_once(viaduct_reqwest::use_reqwest_backend);
}
criterion_group!(benches, ingest, query);
criterion_group!(benches, geoname, ingest, query);
criterion_main!(benches);

Просмотреть файл

@ -0,0 +1,449 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
use crate::{
benchmarks::{new_store, BenchmarkWithInput},
geoname::{Geoname, GeonameType},
SuggestStore,
};
pub struct GeonameBenchmark {
args: FetchGeonamesArgs,
should_match: bool,
}
#[derive(Clone, Debug)]
pub struct FetchGeonamesArgs {
query: &'static str,
match_name_prefix: bool,
geoname_type: Option<GeonameType>,
filter: Option<Vec<&'static Geoname>>,
}
pub struct IterationInput {
fetch_args: FetchGeonamesArgs,
should_match_message: String,
}
impl BenchmarkWithInput for GeonameBenchmark {
type GlobalInput = SuggestStore;
type IterationInput = IterationInput;
fn global_input(&self) -> Self::GlobalInput {
new_store()
}
fn iteration_input(&self) -> Self::IterationInput {
let fetch_args = self.args.clone();
// Format the message now so it doesn't take up time in the benchmark.
let should_match_message = format!("should_match for fetch: {:?}", fetch_args);
IterationInput {
fetch_args,
should_match_message,
}
}
fn benchmarked_code(&self, store: &Self::GlobalInput, i_input: Self::IterationInput) {
let matches = store
.fetch_geonames(
i_input.fetch_args.query,
i_input.fetch_args.match_name_prefix,
i_input.fetch_args.geoname_type,
i_input.fetch_args.filter,
)
.unwrap_or_else(|e| panic!("Error fetching geonames: {e}"));
// Make sure matches were returned or not as expected. Otherwise the
// benchmark might not be testing what it's intended to test.
assert_eq!(
!matches.is_empty(),
self.should_match,
"{}",
i_input.should_match_message,
);
}
}
pub fn all_benchmarks() -> Vec<(&'static str, GeonameBenchmark)> {
static NY_STATE: std::sync::OnceLock<Geoname> = std::sync::OnceLock::new();
let ny_state = NY_STATE.get_or_init(|| Geoname {
geoname_id: 8,
name: "New York".to_string(),
latitude: 43.00035,
longitude: -75.4999,
country_code: "US".to_string(),
admin1_code: "NY".to_string(),
population: 19274244,
});
vec![
// no matches
(
"geoname-fetch-no-match-1",
GeonameBenchmark {
args: FetchGeonamesArgs {
query: "nomatch",
match_name_prefix: false,
geoname_type: None,
filter: None,
},
should_match: false,
}
),
(
"geoname-fetch-no-match-2",
GeonameBenchmark {
args: FetchGeonamesArgs {
query: "no match",
match_name_prefix: false,
geoname_type: None,
filter: None,
},
should_match: false,
}
),
(
"geoname-fetch-no-match-3",
GeonameBenchmark {
args: FetchGeonamesArgs {
query: "no match either",
match_name_prefix: false,
geoname_type: None,
filter: None,
},
should_match: false,
}
),
(
"geoname-fetch-no-match-long",
GeonameBenchmark {
args: FetchGeonamesArgs {
query: "this is a very long string that does not match anything in the geonames database but it sure is very long",
match_name_prefix: false,
geoname_type: None,
filter: None,
},
should_match: false,
}
),
// no matches w/ prefix matching
(
"geoname-fetch-no-match-1-prefix",
GeonameBenchmark {
args: FetchGeonamesArgs {
query: "nomatch",
match_name_prefix: true,
geoname_type: None,
filter: None,
},
should_match: false,
}
),
(
"geoname-fetch-no-match-2-prefix",
GeonameBenchmark {
args: FetchGeonamesArgs {
query: "no match",
match_name_prefix: true,
geoname_type: None,
filter: None,
},
should_match: false,
}
),
(
"geoname-fetch-no-match-3-prefix",
GeonameBenchmark {
args: FetchGeonamesArgs {
query: "no match either",
match_name_prefix: true,
geoname_type: None,
filter: None,
},
should_match: false,
}
),
(
"geoname-fetch-no-match-long-prefix",
GeonameBenchmark {
args: FetchGeonamesArgs {
query: "this is a very long string that does not match anything in the geonames database but it sure is very long",
match_name_prefix: true,
geoname_type: None,
filter: None,
},
should_match: false,
}
),
// abbreviations and airport codes
(
"geoname-fetch-abbr-ny",
GeonameBenchmark {
args: FetchGeonamesArgs {
query: "ny",
match_name_prefix: false,
geoname_type: None,
filter: None,
},
should_match: true,
}
),
(
"geoname-fetch-abbr-nyc",
GeonameBenchmark {
args: FetchGeonamesArgs {
query: "nyc",
match_name_prefix: false,
geoname_type: None,
filter: None,
},
should_match: true,
}
),
(
"geoname-fetch-abbr-ca",
GeonameBenchmark {
args: FetchGeonamesArgs {
query: "ca",
match_name_prefix: false,
geoname_type: None,
filter: None,
},
should_match: true,
}
),
(
"geoname-fetch-airport-pdx",
GeonameBenchmark {
args: FetchGeonamesArgs {
query: "pdx",
match_name_prefix: false,
geoname_type: None,
filter: None,
},
should_match: true,
}
),
(
"geoname-fetch-airport-roc",
GeonameBenchmark {
args: FetchGeonamesArgs {
query: "roc",
match_name_prefix: false,
geoname_type: None,
filter: None,
},
should_match: true,
}
),
// abbreviations and airport codes w/ prefix matching
(
"geoname-fetch-abbr-prefix-ny",
GeonameBenchmark {
args: FetchGeonamesArgs {
query: "ny",
match_name_prefix: true,
geoname_type: None,
filter: None,
},
should_match: true,
}
),
(
"geoname-fetch-abbr-prefix-nyc",
GeonameBenchmark {
args: FetchGeonamesArgs {
query: "nyc",
match_name_prefix: true,
geoname_type: None,
filter: None,
},
should_match: true,
}
),
(
"geoname-fetch-abbr-prefix-ca",
GeonameBenchmark {
args: FetchGeonamesArgs {
query: "ca",
match_name_prefix: true,
geoname_type: None,
filter: None,
},
should_match: true,
}
),
(
"geoname-fetch-airport-prefix-pdx",
GeonameBenchmark {
args: FetchGeonamesArgs {
query: "pdx",
match_name_prefix: true,
geoname_type: None,
filter: None,
},
should_match: true,
}
),
(
"geoname-fetch-airport-prefix-roc",
GeonameBenchmark {
args: FetchGeonamesArgs {
query: "roc",
match_name_prefix: true,
geoname_type: None,
filter: None,
},
should_match: true,
}
),
// full names
(
"geoname-fetch-name-new-york",
GeonameBenchmark {
args: FetchGeonamesArgs {
query: "new york",
match_name_prefix: false,
geoname_type: None,
filter: None,
},
should_match: true,
}
),
(
"geoname-fetch-name-rochester",
GeonameBenchmark {
args: FetchGeonamesArgs {
query: "rochester",
match_name_prefix: false,
geoname_type: None,
filter: None,
},
should_match: true,
}
),
// full names w/ prefix matching
(
"geoname-fetch-name-prefix-new-york",
GeonameBenchmark {
args: FetchGeonamesArgs {
query: "new york",
match_name_prefix: true,
geoname_type: None,
filter: None,
},
should_match: true,
}
),
(
"geoname-fetch-name-prefix-rochester",
GeonameBenchmark {
args: FetchGeonamesArgs {
query: "rochester",
match_name_prefix: true,
geoname_type: None,
filter: None,
},
should_match: true,
}
),
// restricting to a geoname type
(
"geoname-fetch-type-city-ny",
GeonameBenchmark {
args: FetchGeonamesArgs {
query: "ny",
match_name_prefix: false,
geoname_type: Some(GeonameType::City),
filter: None,
},
should_match: true,
}
),
(
"geoname-fetch-type-region-ny",
GeonameBenchmark {
args: FetchGeonamesArgs {
query: "ny",
match_name_prefix: false,
geoname_type: Some(GeonameType::Region),
filter: None,
},
should_match: true,
}
),
// filtering
(
"geoname-fetch-filter-ny",
GeonameBenchmark {
args: FetchGeonamesArgs {
query: "ny",
match_name_prefix: false,
geoname_type: None,
filter: Some(vec![&ny_state]),
},
should_match: true,
}
),
// restricting to a geoname type + filtering
(
"geoname-fetch-type-filter-city-ny",
GeonameBenchmark {
args: FetchGeonamesArgs {
query: "ny",
match_name_prefix: false,
geoname_type: Some(GeonameType::City),
filter: Some(vec![&ny_state]),
},
should_match: true,
}
),
(
"geoname-fetch-type-filter-region-ny",
GeonameBenchmark {
args: FetchGeonamesArgs {
query: "ny",
match_name_prefix: false,
geoname_type: Some(GeonameType::Region),
filter: Some(vec![&ny_state]),
},
should_match: true,
}
),
// restricting to a geoname type + filtering w/ prefix matching
(
"geoname-fetch-type-filter-prefix-city-ny",
GeonameBenchmark {
args: FetchGeonamesArgs {
query: "ny",
match_name_prefix: true,
geoname_type: Some(GeonameType::City),
filter: Some(vec![&ny_state]),
},
should_match: true,
}
),
(
"geoname-fetch-type-filter-prefix-region-ny",
GeonameBenchmark {
args: FetchGeonamesArgs {
query: "ny",
match_name_prefix: true,
geoname_type: Some(GeonameType::Region),
filter: Some(vec![&ny_state]),
},
should_match: true,
}
),
]
}

Просмотреть файл

@ -22,6 +22,7 @@ use tempfile::TempDir;
use crate::{SuggestIngestionConstraints, SuggestStore};
pub mod client;
pub mod geoname;
pub mod ingest;
pub mod query;

Просмотреть файл

@ -10,34 +10,49 @@ use crate::{
pub struct QueryBenchmark {
provider: SuggestionProvider,
query: &'static str,
should_match: bool,
}
impl QueryBenchmark {
pub fn new(provider: SuggestionProvider, query: &'static str) -> Self {
Self { provider, query }
}
pub struct IterationInput {
query: SuggestionQuery,
should_match_message: String,
}
impl BenchmarkWithInput for QueryBenchmark {
type GlobalInput = SuggestStore;
type IterationInput = SuggestionQuery;
type IterationInput = IterationInput;
fn global_input(&self) -> Self::GlobalInput {
new_store()
}
fn iteration_input(&self) -> Self::IterationInput {
SuggestionQuery {
let query = SuggestionQuery {
providers: vec![self.provider],
keyword: self.query.to_string(),
..SuggestionQuery::default()
};
// Format the message now so it doesn't take up time in the benchmark.
let should_match_message = format!("should_match for query: {:?}", query);
IterationInput {
query,
should_match_message,
}
}
fn benchmarked_code(&self, store: &Self::GlobalInput, query: Self::IterationInput) {
store
.query(query)
fn benchmarked_code(&self, store: &Self::GlobalInput, i_input: Self::IterationInput) {
let suggestions = store
.query(i_input.query)
.unwrap_or_else(|e| panic!("Error querying store: {e}"));
// Make sure matches were returned or not as expected. Otherwise the
// benchmark might not be testing what it's intended to test.
assert_eq!(
!suggestions.is_empty(),
self.should_match,
"{}",
i_input.should_match_message,
);
}
}
@ -50,27 +65,371 @@ pub fn all_benchmarks() -> Vec<(&'static str, QueryBenchmark)> {
// Therefore, to test shorter prefixes we use 2-term queries.
(
"query-fakespot-hand-s",
QueryBenchmark::new(SuggestionProvider::Fakespot, "hand s"),
QueryBenchmark {
provider: SuggestionProvider::Fakespot,
query: "hand s",
should_match: true,
}
),
(
"query-fakespot-hand-sa",
QueryBenchmark::new(SuggestionProvider::Fakespot, "hand sa"),
QueryBenchmark {
provider: SuggestionProvider::Fakespot,
query: "hand sa",
should_match: true,
}
),
(
"query-fakespot-hand-san",
QueryBenchmark::new(SuggestionProvider::Fakespot, "hand san"),
QueryBenchmark {
provider: SuggestionProvider::Fakespot,
query: "hand san",
should_match: true,
}
),
(
"query-fakespot-sani",
QueryBenchmark::new(SuggestionProvider::Fakespot, "sani"),
QueryBenchmark {
provider: SuggestionProvider::Fakespot,
query: "sani",
should_match: true,
}
),
(
"query-fakespot-sanit",
QueryBenchmark::new(SuggestionProvider::Fakespot, "sanit"),
QueryBenchmark {
provider: SuggestionProvider::Fakespot,
query: "sanit",
should_match: true,
}
),
(
"query-fakespot-saniti",
QueryBenchmark::new(SuggestionProvider::Fakespot, "saniti"),
QueryBenchmark {
provider: SuggestionProvider::Fakespot,
query: "saniti",
should_match: false,
},
),
// weather: no matches
(
"query-weather-no-match-1",
QueryBenchmark {
provider: SuggestionProvider::Weather,
query: "nomatch",
should_match: false,
},
),
(
"query-weather-no-match-2",
QueryBenchmark {
provider: SuggestionProvider::Weather,
query: "no match",
should_match: false,
},
),
(
"query-weather-no-match-3",
QueryBenchmark {
provider: SuggestionProvider::Weather,
query: "no match either",
should_match: false,
},
),
(
"query-weather-no-match-long-1",
QueryBenchmark {
provider: SuggestionProvider::Weather,
query: "city1 city2 state1 state2 keyword1 keyword2 keyword3",
should_match: false,
},
),
(
"query-weather-no-match-long-2",
QueryBenchmark {
provider: SuggestionProvider::Weather,
query: "this does not match anything especially not a weather suggestion but nevertheless it is a very long query which as previously mentioned doesn't match anything at all",
should_match: false,
},
),
(
"query-weather-no-match-keyword-prefix",
QueryBenchmark {
provider: SuggestionProvider::Weather,
query: "wea",
should_match: false,
},
),
(
"query-weather-no-match-city-abbr",
QueryBenchmark {
provider: SuggestionProvider::Weather,
query: "ny",
should_match: false,
},
),
(
"query-weather-no-match-airport-code",
QueryBenchmark {
provider: SuggestionProvider::Weather,
query: "pdx",
should_match: false,
},
),
(
"query-weather-no-match-airport-code-region",
QueryBenchmark {
provider: SuggestionProvider::Weather,
query: "pdx or",
should_match: false,
},
),
// weather: keyword only
(
"query-weather-keyword-only",
QueryBenchmark {
provider: SuggestionProvider::Weather,
query: "weather",
should_match: true,
},
),
// weather: city only
(
"query-weather-city-only",
QueryBenchmark {
provider: SuggestionProvider::Weather,
query: "new york",
should_match: true,
},
),
// weather: city + region
(
"query-weather-city-region-los-angeles-c",
QueryBenchmark {
provider: SuggestionProvider::Weather,
query: "los angeles c",
should_match: true,
},
),
(
"query-weather-city-region-los-angeles-ca",
QueryBenchmark {
provider: SuggestionProvider::Weather,
query: "los angeles ca",
should_match: true,
},
),
(
"query-weather-city-region-la-ca",
QueryBenchmark {
provider: SuggestionProvider::Weather,
query: "la ca",
should_match: true,
},
),
(
"query-weather-city-region-ny-ny",
QueryBenchmark {
provider: SuggestionProvider::Weather,
query: "ny ny",
should_match: true,
},
),
// weather: keyword + city
(
"query-weather-keyword-city-n",
QueryBenchmark {
provider: SuggestionProvider::Weather,
query: "weather n",
should_match: true,
},
),
(
"query-weather-keyword-city-ne",
QueryBenchmark {
provider: SuggestionProvider::Weather,
query: "weather ne",
should_match: true,
},
),
(
"query-weather-keyword-city-new",
QueryBenchmark {
provider: SuggestionProvider::Weather,
query: "weather new",
should_match: true,
},
),
(
"query-weather-keyword-city-new-york",
QueryBenchmark {
provider: SuggestionProvider::Weather,
query: "weather new york",
should_match: true,
},
),
(
"query-weather-keyword-city-ny",
QueryBenchmark {
provider: SuggestionProvider::Weather,
query: "weather ny",
should_match: true,
},
),
(
"query-weather-keyword-city-pdx",
QueryBenchmark {
provider: SuggestionProvider::Weather,
query: "weather pdx",
should_match: true,
},
),
// weather: keyword + city + region
(
"query-weather-keyword-city-region-los-angeles-c",
QueryBenchmark {
provider: SuggestionProvider::Weather,
query: "weather los angeles c",
should_match: true,
},
),
(
"query-weather-keyword-city-region-los-angeles-ca",
QueryBenchmark {
provider: SuggestionProvider::Weather,
query: "weather los angeles ca",
should_match: true,
},
),
(
"query-weather-keyword-city-region-la-ca",
QueryBenchmark {
provider: SuggestionProvider::Weather,
query: "weather la ca",
should_match: true,
},
),
(
"query-weather-keyword-city-region-ny-ny",
QueryBenchmark {
provider: SuggestionProvider::Weather,
query: "weather ny ny",
should_match: true,
},
),
(
"query-weather-keyword-city-region-pdx-or",
QueryBenchmark {
provider: SuggestionProvider::Weather,
query: "weather pdx or",
should_match: true,
},
),
// weather: city + keyword
(
"query-weather-city-keyword-new-york-w",
QueryBenchmark {
provider: SuggestionProvider::Weather,
query: "new york w",
should_match: true,
},
),
(
"query-weather-city-keyword-new-york-we",
QueryBenchmark {
provider: SuggestionProvider::Weather,
query: "new york we",
should_match: true,
},
),
(
"query-weather-city-keyword-new-york-wea",
QueryBenchmark {
provider: SuggestionProvider::Weather,
query: "new york wea",
should_match: true,
},
),
(
"query-weather-city-keyword-new-york-weather",
QueryBenchmark {
provider: SuggestionProvider::Weather,
query: "new york weather",
should_match: true,
},
),
(
"query-weather-city-keyword-ny-w",
QueryBenchmark {
provider: SuggestionProvider::Weather,
query: "ny w",
should_match: true,
},
),
(
"query-weather-city-keyword-ny-weather",
QueryBenchmark {
provider: SuggestionProvider::Weather,
query: "ny weather",
should_match: true,
},
),
// weather: city + region + keyword
(
"query-weather-city-region-keyword-los-angeles-w",
QueryBenchmark {
provider: SuggestionProvider::Weather,
query: "los angeles ca w",
should_match: true,
},
),
(
"query-weather-city-region-keyword-los-angeles-we",
QueryBenchmark {
provider: SuggestionProvider::Weather,
query: "los angeles ca we",
should_match: true,
},
),
(
"query-weather-city-region-keyword-los-angeles-wea",
QueryBenchmark {
provider: SuggestionProvider::Weather,
query: "los angeles ca wea",
should_match: true,
},
),
(
"query-weather-city-region-keyword-los-angeles-weather",
QueryBenchmark {
provider: SuggestionProvider::Weather,
query: "los angeles ca weather",
should_match: true,
},
),
(
"query-weather-city-region-keyword-la-ca-weather",
QueryBenchmark {
provider: SuggestionProvider::Weather,
query: "la ca weather",
should_match: true,
},
),
(
"query-weather-city-region-keyword-ny-ny-weather",
QueryBenchmark {
provider: SuggestionProvider::Weather,
query: "ny ny weather",
should_match: true,
},
),
]
}

Просмотреть файл

@ -23,7 +23,7 @@ use sql_support::{
/// `clear_database()` by adding their names to `conditional_tables`, unless
/// they are cleared via a deletion trigger or there's some other good
/// reason not to do so.
pub const VERSION: u32 = 30;
pub const VERSION: u32 = 31;
/// The current Suggest database schema.
pub const SQL: &str = "
@ -218,6 +218,7 @@ CREATE TABLE geonames_alternates(
PRIMARY KEY (name, geoname_id),
FOREIGN KEY(geoname_id) REFERENCES geonames(id) ON DELETE CASCADE
) WITHOUT ROWID;
CREATE INDEX geonames_alternates_geoname_id ON geonames_alternates(geoname_id);
CREATE TABLE geonames_metrics(
record_id TEXT NOT NULL PRIMARY KEY,
@ -586,6 +587,16 @@ CREATE TABLE geonames_alternates(
clear_database(tx)?;
Ok(())
}
30 => {
// Add the `geonames_alternates_geoname_id` index.
clear_database(tx)?;
tx.execute_batch(
"
CREATE INDEX geonames_alternates_geoname_id ON geonames_alternates(geoname_id);
",
)?;
Ok(())
}
_ => Err(open_database::Error::IncompatibleVersion(version)),
}
}

Просмотреть файл

@ -30,6 +30,9 @@ use crate::{
QueryWithMetricsResult, Result, SuggestApiResult, Suggestion, SuggestionQuery,
};
#[cfg(feature = "benchmark_api")]
use crate::geoname::{Geoname, GeonameMatch, GeonameType};
/// Builder for [SuggestStore]
///
/// Using a builder is preferred to calling the constructor directly since it's harder to confuse
@ -274,6 +277,17 @@ impl SuggestStore {
pub fn checkpoint(&self) {
self.inner.checkpoint();
}
pub fn fetch_geonames(
&self,
query: &str,
match_name_prefix: bool,
geoname_type: Option<GeonameType>,
filter: Option<Vec<&Geoname>>,
) -> Result<Vec<GeonameMatch>> {
self.inner
.fetch_geonames(query, match_name_prefix, geoname_type, filter)
}
}
/// Constraints limit which suggestions to ingest from Remote Settings.
@ -833,6 +847,18 @@ where
conn.query_one("SELECT page_size * page_count FROM pragma_page_count(), pragma_page_size()")
.unwrap()
}
pub fn fetch_geonames(
&self,
query: &str,
match_name_prefix: bool,
geoname_type: Option<GeonameType>,
filter: Option<Vec<&Geoname>>,
) -> Result<Vec<GeonameMatch>> {
self.dbs()?
.reader
.read(|dao| dao.fetch_geonames(query, match_name_prefix, geoname_type, filter))
}
}
/// Holds a store's open connections to the Suggest database.