Improve matching logic for city abbreviations and airport codes

This commit is contained in:
Drew Willcoxon 2024-11-02 22:16:06 -07:00 коммит произвёл Drew
Родитель 578e31c21d
Коммит ea0654349e
3 изменённых файлов: 836 добавлений и 386 удалений

Просмотреть файл

@ -82,6 +82,33 @@ impl Hash for Geoname {
}
}
/// Value returned by `fetch_geonames()`.
///
/// Pairs a matched geoname with the kind of name the query matched and a flag
/// saying whether that match was a prefix match.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct GeonameMatch {
/// The geoname that matched the query.
pub geoname: Geoname,
/// The kind of name (abbreviation, airport code, or other name) that
/// matched.
pub match_type: GeonameMatchType,
/// Whether the query matched a prefix of a name of the given type rather
/// than the name itself.
pub prefix: bool,
}
/// The kind of name through which a geoname matched a query.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum GeonameMatchType {
/// For U.S. states, abbreviations are the usual two-letter codes ("CA").
Abbreviation,
/// An airport code. `fetch_geonames()` reports this for alternate names
/// whose `iso_language` is `iata`, `icao`, or `faac`.
AirportCode,
/// This includes any names that aren't abbreviations or airport codes.
Name,
}
impl GeonameMatchType {
pub fn is_abbreviation(&self) -> bool {
matches!(self, GeonameMatchType::Abbreviation)
}
pub fn is_name(&self) -> bool {
matches!(self, GeonameMatchType::Name)
}
}
/// This data is used to service every query handled by the weather provider and
/// potentially other providers, so we cache it from the DB.
#[derive(Debug, Default)]
@ -162,13 +189,11 @@ impl SuggestDao<'_> {
/// Fetches geonames that have at least one name matching the `query`
/// string.
///
/// `match_prefixes` determines whether prefix matching is performed. If
/// `true`, returned geonames will have at least one name prefixed by
/// `query`. If `false`, returned geonames will have at least one name equal
/// to `query`.
///
/// `match_abbreviations` determines whether abbreviations and airport codes
/// are matched.
/// `match_name_prefix` determines whether prefix matching is performed on
/// names that aren't abbreviations and airport codes. When `true`, names
/// that start with `query` will match. When `false`, names that equal `query`
/// will match. Prefix matching is never performed on abbreviations and
/// airport codes because we don't currently have a use case for that.
///
/// `geoname_type` restricts returned geonames to the specified type. `None`
/// restricts geonames to cities and regions. There's no way to return
@ -182,14 +207,20 @@ impl SuggestDao<'_> {
/// since city and region names are not unique. `filter` is disjunctive: If
/// any item in `filter` matches a geoname, the geoname will be filtered in.
/// If `filter` is empty, all geonames will be filtered out.
///
/// The returned matches will include all matching types for a geoname, one
/// match per type per geoname. For example, if the query matches both a
/// geoname's name and abbreviation, two matches for that geoname will be
/// returned: one with a `match_type` of `GeonameMatchType::Name` and one
/// with a `match_type` of `GeonameMatchType::Abbreviation`. `prefix` is set
/// according to whether the query matched a prefix of the given type.
pub fn fetch_geonames(
&self,
query: &str,
match_prefixes: bool,
match_abbreviations: bool,
match_name_prefix: bool,
geoname_type: Option<GeonameType>,
filter: Option<Vec<&Geoname>>,
) -> Result<Vec<Geoname>> {
) -> Result<Vec<GeonameMatch>> {
let city_pred = "(g.feature_class = 'P')";
let region_pred = "(g.feature_class = 'A' AND g.feature_code = 'ADM1')";
let type_pred = match geoname_type {
@ -210,56 +241,67 @@ impl SuggestDao<'_> {
g.feature_class,
g.country_code,
g.admin1_code,
g.population
g.population,
a.name != :name AS prefix,
(SELECT CASE
-- abbreviation
WHEN a.iso_language = 'abbr' THEN 1
-- airport code
WHEN a.iso_language IN ('iata', 'icao', 'faac') THEN 2
-- name
ELSE 3
END
) AS match_type
FROM
geonames g
JOIN
geonames_alternates a ON g.id = a.geoname_id
WHERE
{}
AND g.id IN (
SELECT DISTINCT
geoname_id
FROM
geonames_alternates
WHERE
CASE :abbr
WHEN TRUE THEN 1
ELSE (
iso_language ISNULL
OR iso_language NOT IN ('iata', 'icao', 'faac', 'abbr')
) END
AND CASE :prefix
WHEN TRUE THEN (name BETWEEN :name AND :name || X'FFFF')
ELSE name = :name
END
)
AND CASE :prefix
WHEN FALSE THEN a.name = :name
ELSE (a.name = :name OR (
(a.name BETWEEN :name AND :name || X'FFFF')
AND match_type = 3
))
END
GROUP BY
g.id, match_type
ORDER BY
g.feature_class = 'P' DESC, g.population DESC, g.id ASC
g.feature_class = 'P' DESC, g.population DESC, g.id ASC, a.iso_language ASC
"#,
type_pred
),
named_params! {
":name": query.to_lowercase(),
":abbr": match_abbreviations,
":prefix": match_prefixes,
":prefix": match_name_prefix,
},
|row| -> Result<Option<Geoname>> {
let geoname = Geoname {
geoname_id: row.get("id")?,
name: row.get("name")?,
latitude: row.get("latitude")?,
longitude: row.get("longitude")?,
country_code: row.get("country_code")?,
admin1_code: row.get("admin1_code")?,
population: row.get("population")?,
|row| -> Result<Option<GeonameMatch>> {
let g_match = GeonameMatch {
geoname: Geoname {
geoname_id: row.get("id")?,
name: row.get("name")?,
latitude: row.get("latitude")?,
longitude: row.get("longitude")?,
country_code: row.get("country_code")?,
admin1_code: row.get("admin1_code")?,
population: row.get("population")?,
},
prefix: row.get("prefix")?,
match_type: match row.get::<_, i32>("match_type")? {
1 => GeonameMatchType::Abbreviation,
2 => GeonameMatchType::AirportCode,
_ => GeonameMatchType::Name,
},
};
if let Some(geonames) = &filter {
geonames
.iter()
.find(|g| g.has_same_region(&geoname))
.map(|_| Ok(Some(geoname)))
.find(|g| g.has_same_region(&g_match.geoname))
.map(|_| Ok(Some(g_match)))
.unwrap_or(Ok(None))
} else {
Ok(Some(geoname))
Ok(Some(g_match))
}
},
)?
@ -586,6 +628,39 @@ pub(crate) mod tests {
{ "name": "ny", "iso_language": "abbr" },
],
},
// Waco, TX: Has a surprising IATA airport code that's a
// common English word and not a prefix of the city name
{
"id": 9,
"name": "Waco",
"latitude": "31.54933",
"longitude": "-97.14667",
"feature_class": "P",
"feature_code": "PPLA2",
"country_code": "US",
"admin1_code": "TX",
"population": 132356,
"alternate_names_2": [
{ "name": "waco" },
{ "name": "act", "iso_language": "iata" },
],
},
// TX
{
"id": 10,
"name": "Texas",
"latitude": "31.25044",
"longitude": "-99.25061",
"feature_class": "A",
"feature_code": "ADM1",
"country_code": "US",
"admin1_code": "TX",
"population": 22875689,
"alternate_names_2": [
{ "name": "texas" },
{ "name": "tx", "iso_language": "abbr" },
],
},
// Made-up city with a long name
{
"id": 999,
@ -655,6 +730,18 @@ pub(crate) mod tests {
}
}
/// Test fixture: the `Geoname` for Waco, TX (geoname ID 9), mirroring the
/// values of the corresponding record in the test data.
pub(crate) fn waco() -> Geoname {
    let name = String::from("Waco");
    let country_code = String::from("US");
    let admin1_code = String::from("TX");
    Geoname {
        geoname_id: 9,
        name,
        latitude: 31.54933,
        longitude: -97.14667,
        country_code,
        admin1_code,
        population: 132356,
    }
}
pub(crate) fn long_name_city() -> Geoname {
Geoname {
geoname_id: 999,
@ -715,404 +802,455 @@ pub(crate) mod tests {
..SuggestIngestionConstraints::all_providers()
});
#[derive(Debug)]
struct Test {
query: &'static str,
match_prefixes: bool,
match_abbreviations: bool,
match_name_prefix: bool,
geoname_type: Option<GeonameType>,
filter: Option<Vec<Geoname>>,
expected: Vec<Geoname>,
expected: Vec<GeonameMatch>,
}
let tests = [
Test {
query: "ia",
match_prefixes: false,
match_abbreviations: true,
match_name_prefix: false,
geoname_type: None,
filter: None,
expected: vec![ia()],
expected: vec![GeonameMatch {
geoname: ia(),
match_type: GeonameMatchType::Abbreviation,
prefix: false,
}],
},
Test {
query: "ia",
match_prefixes: true,
match_abbreviations: true,
match_name_prefix: true,
geoname_type: None,
filter: None,
expected: vec![ia()],
expected: vec![GeonameMatch {
geoname: ia(),
match_type: GeonameMatchType::Abbreviation,
prefix: false,
}],
},
Test {
query: "ia",
match_prefixes: false,
match_abbreviations: false,
geoname_type: None,
filter: None,
expected: vec![],
},
Test {
query: "ia",
match_prefixes: true,
match_abbreviations: false,
geoname_type: None,
filter: None,
expected: vec![],
},
Test {
query: "ia",
match_prefixes: false,
match_abbreviations: true,
match_name_prefix: false,
geoname_type: None,
filter: Some(vec![waterloo_ia(), waterloo_al()]),
expected: vec![ia()],
expected: vec![GeonameMatch {
geoname: ia(),
match_type: GeonameMatchType::Abbreviation,
prefix: false,
}],
},
Test {
query: "ia",
match_prefixes: false,
match_abbreviations: true,
match_name_prefix: false,
geoname_type: None,
filter: Some(vec![waterloo_ia()]),
expected: vec![ia()],
expected: vec![GeonameMatch {
geoname: ia(),
match_type: GeonameMatchType::Abbreviation,
prefix: false,
}],
},
Test {
query: "ia",
match_prefixes: false,
match_abbreviations: true,
match_name_prefix: false,
geoname_type: None,
filter: Some(vec![waterloo_al()]),
expected: vec![],
},
Test {
query: "ia",
match_prefixes: false,
match_abbreviations: true,
match_name_prefix: false,
geoname_type: Some(GeonameType::City),
filter: None,
expected: vec![],
},
Test {
query: "ia",
match_prefixes: false,
match_abbreviations: true,
match_name_prefix: false,
geoname_type: Some(GeonameType::Region),
filter: None,
expected: vec![ia()],
expected: vec![GeonameMatch {
geoname: ia(),
match_type: GeonameMatchType::Abbreviation,
prefix: false,
}],
},
Test {
query: "iowa",
match_prefixes: false,
match_abbreviations: false,
match_name_prefix: false,
geoname_type: None,
filter: None,
expected: vec![ia()],
},
Test {
query: "iowa",
match_prefixes: false,
match_abbreviations: true,
geoname_type: None,
filter: None,
expected: vec![ia()],
expected: vec![GeonameMatch {
geoname: ia(),
match_type: GeonameMatchType::Name,
prefix: false,
}],
},
Test {
query: "al",
match_prefixes: false,
match_abbreviations: true,
match_name_prefix: false,
geoname_type: None,
filter: None,
expected: vec![al()],
expected: vec![GeonameMatch {
geoname: al(),
match_type: GeonameMatchType::Abbreviation,
prefix: false,
}],
},
// "al" is both a name prefix and an abbreviation.
Test {
query: "al",
match_prefixes: true,
match_abbreviations: true,
match_name_prefix: true,
geoname_type: None,
filter: None,
expected: vec![al()],
},
Test {
query: "al",
match_prefixes: false,
match_abbreviations: false,
geoname_type: None,
filter: None,
expected: vec![],
},
// "al" is both an abbreviation and a prefix, so disabling
// abbreviations but enabling prefixes should match it.
Test {
query: "al",
match_prefixes: true,
match_abbreviations: false,
geoname_type: None,
filter: None,
expected: vec![al()],
expected: vec![
GeonameMatch {
geoname: al(),
match_type: GeonameMatchType::Name,
prefix: true,
},
GeonameMatch {
geoname: al(),
match_type: GeonameMatchType::Abbreviation,
prefix: false,
},
],
},
Test {
query: "waterloo",
match_prefixes: false,
match_abbreviations: true,
match_name_prefix: false,
geoname_type: None,
filter: Some(vec![ia()]),
expected: vec![waterloo_ia()],
expected: vec![GeonameMatch {
geoname: waterloo_ia(),
match_type: GeonameMatchType::Name,
prefix: false,
}],
},
Test {
query: "waterloo",
match_prefixes: false,
match_abbreviations: true,
match_name_prefix: false,
geoname_type: None,
filter: Some(vec![al()]),
expected: vec![waterloo_al()],
expected: vec![GeonameMatch {
geoname: waterloo_al(),
match_type: GeonameMatchType::Name,
prefix: false,
}],
},
Test {
query: "waterloo",
match_prefixes: false,
match_abbreviations: true,
match_name_prefix: false,
geoname_type: None,
filter: Some(vec![ny_state()]),
expected: vec![],
},
Test {
query: "waterloo",
match_prefixes: false,
match_abbreviations: true,
match_name_prefix: false,
geoname_type: None,
filter: None,
// Waterloo, IA should be first since it has a larger
// population.
expected: vec![waterloo_ia(), waterloo_al()],
expected: vec![
GeonameMatch {
geoname: waterloo_ia(),
match_type: GeonameMatchType::Name,
prefix: false,
},
GeonameMatch {
geoname: waterloo_al(),
match_type: GeonameMatchType::Name,
prefix: false,
},
],
},
Test {
query: "water",
match_prefixes: true,
match_abbreviations: true,
match_name_prefix: true,
geoname_type: None,
filter: None,
expected: vec![waterloo_ia(), waterloo_al()],
expected: vec![
GeonameMatch {
geoname: waterloo_ia(),
match_type: GeonameMatchType::Name,
prefix: true,
},
GeonameMatch {
geoname: waterloo_al(),
match_type: GeonameMatchType::Name,
prefix: true,
},
],
},
Test {
query: "water",
match_prefixes: false,
match_abbreviations: true,
match_name_prefix: false,
geoname_type: None,
filter: None,
expected: vec![],
},
Test {
query: "ny",
match_prefixes: false,
match_abbreviations: true,
match_name_prefix: false,
geoname_type: None,
filter: None,
// NYC should be first since cities are ordered before regions.
expected: vec![nyc(), ny_state()],
expected: vec![
GeonameMatch {
geoname: nyc(),
match_type: GeonameMatchType::Abbreviation,
prefix: false,
},
GeonameMatch {
geoname: ny_state(),
match_type: GeonameMatchType::Abbreviation,
prefix: false,
},
],
},
Test {
query: "ny",
match_prefixes: true,
match_abbreviations: true,
match_name_prefix: true,
geoname_type: None,
filter: None,
expected: vec![nyc(), ny_state()],
expected: vec![
GeonameMatch {
geoname: nyc(),
match_type: GeonameMatchType::Abbreviation,
prefix: false,
},
GeonameMatch {
geoname: ny_state(),
match_type: GeonameMatchType::Abbreviation,
prefix: false,
},
],
},
Test {
query: "ny",
match_prefixes: false,
match_abbreviations: false,
geoname_type: None,
filter: None,
expected: vec![],
},
Test {
query: "ny",
match_prefixes: true,
match_abbreviations: false,
geoname_type: None,
filter: None,
expected: vec![],
},
Test {
query: "ny",
match_prefixes: false,
match_abbreviations: true,
match_name_prefix: false,
geoname_type: None,
filter: Some(vec![nyc()]),
expected: vec![nyc(), ny_state()],
expected: vec![
GeonameMatch {
geoname: nyc(),
match_type: GeonameMatchType::Abbreviation,
prefix: false,
},
GeonameMatch {
geoname: ny_state(),
match_type: GeonameMatchType::Abbreviation,
prefix: false,
},
],
},
Test {
query: "ny",
match_prefixes: false,
match_abbreviations: true,
match_name_prefix: false,
geoname_type: None,
filter: Some(vec![ny_state()]),
expected: vec![nyc(), ny_state()],
expected: vec![
GeonameMatch {
geoname: nyc(),
match_type: GeonameMatchType::Abbreviation,
prefix: false,
},
GeonameMatch {
geoname: ny_state(),
match_type: GeonameMatchType::Abbreviation,
prefix: false,
},
],
},
Test {
query: "ny",
match_prefixes: false,
match_abbreviations: true,
match_name_prefix: false,
geoname_type: Some(GeonameType::City),
filter: None,
expected: vec![nyc()],
expected: vec![GeonameMatch {
geoname: nyc(),
match_type: GeonameMatchType::Abbreviation,
prefix: false,
}],
},
Test {
query: "ny",
match_prefixes: false,
match_abbreviations: true,
match_name_prefix: false,
geoname_type: Some(GeonameType::Region),
filter: None,
expected: vec![ny_state()],
expected: vec![GeonameMatch {
geoname: ny_state(),
match_type: GeonameMatchType::Abbreviation,
prefix: false,
}],
},
Test {
query: "NeW YoRk",
match_prefixes: false,
match_abbreviations: true,
match_name_prefix: false,
geoname_type: None,
filter: None,
expected: vec![nyc(), ny_state()],
expected: vec![
GeonameMatch {
geoname: nyc(),
match_type: GeonameMatchType::Name,
prefix: false,
},
GeonameMatch {
geoname: ny_state(),
match_type: GeonameMatchType::Name,
prefix: false,
},
],
},
Test {
query: "NY",
match_prefixes: false,
match_abbreviations: true,
match_name_prefix: false,
geoname_type: None,
filter: None,
expected: vec![nyc(), ny_state()],
expected: vec![
GeonameMatch {
geoname: nyc(),
match_type: GeonameMatchType::Abbreviation,
prefix: false,
},
GeonameMatch {
geoname: ny_state(),
match_type: GeonameMatchType::Abbreviation,
prefix: false,
},
],
},
Test {
query: "new",
match_prefixes: false,
match_abbreviations: true,
match_name_prefix: false,
geoname_type: None,
filter: None,
expected: vec![],
},
Test {
query: "new",
match_prefixes: true,
match_abbreviations: true,
match_name_prefix: true,
geoname_type: None,
filter: None,
expected: vec![nyc(), ny_state()],
expected: vec![
GeonameMatch {
geoname: nyc(),
match_type: GeonameMatchType::Name,
prefix: true,
},
GeonameMatch {
geoname: ny_state(),
match_type: GeonameMatchType::Name,
prefix: true,
},
],
},
Test {
query: "new york foo",
match_prefixes: false,
match_abbreviations: true,
match_name_prefix: false,
geoname_type: None,
filter: None,
expected: vec![],
},
Test {
query: "new york foo",
match_prefixes: true,
match_abbreviations: true,
match_name_prefix: true,
geoname_type: None,
filter: None,
expected: vec![],
},
Test {
query: "new foo",
match_prefixes: true,
match_abbreviations: true,
match_name_prefix: true,
geoname_type: None,
filter: None,
expected: vec![],
},
Test {
query: "foo new york",
match_prefixes: false,
match_abbreviations: true,
match_name_prefix: false,
geoname_type: None,
filter: None,
expected: vec![],
},
Test {
query: "foo new york",
match_prefixes: true,
match_abbreviations: true,
match_name_prefix: true,
geoname_type: None,
filter: None,
expected: vec![],
},
Test {
query: "foo new",
match_prefixes: true,
match_abbreviations: true,
match_name_prefix: true,
geoname_type: None,
filter: None,
expected: vec![],
},
Test {
query: "roc",
match_prefixes: false,
match_abbreviations: true,
match_name_prefix: false,
geoname_type: None,
filter: None,
expected: vec![rochester()],
expected: vec![GeonameMatch {
geoname: rochester(),
match_type: GeonameMatchType::AirportCode,
prefix: false,
}],
},
// "roc" is both a name prefix and an airport code.
Test {
query: "roc",
match_prefixes: false,
match_abbreviations: false,
match_name_prefix: true,
geoname_type: None,
filter: None,
expected: vec![],
},
Test {
query: "roc",
match_prefixes: true,
match_abbreviations: true,
geoname_type: None,
filter: None,
expected: vec![rochester()],
},
// "roc" is both an airport code and a prefix, so disabling
// abbreviations but enabling prefixes should match it.
Test {
query: "roc",
match_prefixes: true,
match_abbreviations: false,
geoname_type: None,
filter: None,
expected: vec![rochester()],
expected: vec![
GeonameMatch {
geoname: rochester(),
match_type: GeonameMatchType::Name,
prefix: true,
},
GeonameMatch {
geoname: rochester(),
match_type: GeonameMatchType::AirportCode,
prefix: false,
},
],
},
Test {
query: "long name",
match_prefixes: false,
match_abbreviations: true,
match_name_prefix: false,
geoname_type: None,
filter: None,
expected: vec![Geoname {
geoname_id: 999,
name: "Long Name".to_string(),
latitude: 38.06084,
longitude: -97.92977,
country_code: "US".to_string(),
admin1_code: "NY".to_string(),
population: 2,
expected: vec![GeonameMatch {
geoname: long_name_city(),
match_type: GeonameMatchType::Name,
prefix: false,
}],
},
Test {
query: LONG_NAME,
match_prefixes: false,
match_abbreviations: true,
match_name_prefix: false,
geoname_type: None,
filter: None,
expected: vec![Geoname {
geoname_id: 999,
name: "Long Name".to_string(),
latitude: 38.06084,
longitude: -97.92977,
country_code: "US".to_string(),
admin1_code: "NY".to_string(),
population: 2,
expected: vec![GeonameMatch {
geoname: long_name_city(),
match_type: GeonameMatchType::Name,
prefix: false,
}],
},
];
store.read(|dao| {
for t in tests {
let gs = t.filter.unwrap_or_default();
let gs = t.filter.clone().unwrap_or_default();
let gs_refs: Vec<_> = gs.iter().collect();
let filters = if gs_refs.is_empty() {
None
@ -1122,12 +1260,13 @@ pub(crate) mod tests {
assert_eq!(
dao.fetch_geonames(
t.query,
t.match_prefixes,
t.match_abbreviations,
t.geoname_type,
t.match_name_prefix,
t.geoname_type.clone(),
filters
)?,
t.expected
t.expected,
"Test: {:?}",
t
);
}
Ok(())
@ -1230,8 +1369,19 @@ pub(crate) mod tests {
// Make sure we have a match.
store.read(|dao| {
assert_eq!(
dao.fetch_geonames("waterloo", false, true, None, None)?,
vec![waterloo_ia(), waterloo_al()],
dao.fetch_geonames("waterloo", false, None, None)?,
vec![
GeonameMatch {
geoname: waterloo_ia(),
match_type: GeonameMatchType::Name,
prefix: false,
},
GeonameMatch {
geoname: waterloo_al(),
match_type: GeonameMatchType::Name,
prefix: false,
},
],
);
Ok(())
})?;
@ -1248,10 +1398,7 @@ pub(crate) mod tests {
// The same query shouldn't match anymore and the tables should be
// empty.
store.read(|dao| {
assert_eq!(
dao.fetch_geonames("waterloo", false, true, None, None)?,
vec![],
);
assert_eq!(dao.fetch_geonames("waterloo", false, None, None)?, vec![],);
let g_ids = dao.conn.query_rows_and_then(
"SELECT id FROM geonames",

Просмотреть файл

@ -83,7 +83,7 @@ pub fn full_keyword(query: &str, keywords: &[impl AsRef<str>]) -> String {
///
/// ```
/// # use suggest::util::filter_map_chunks;
/// let paths = filter_map_chunks(&["a", "b", "c"], 3, |chunk, _, _, _| {
/// let paths = filter_map_chunks(&["a", "b", "c"], 3, |chunk, _, _| {
/// Ok(Some(vec![chunk.to_uppercase()]))
/// });
/// assert_eq!(paths.unwrap(), vec![
@ -99,7 +99,7 @@ pub fn full_keyword(query: &str, keywords: &[impl AsRef<str>]) -> String {
///
/// ```
/// # use suggest::util::filter_map_chunks;
/// let paths = filter_map_chunks(&["a", "b", "c"], 3, |chunk, chunk_index, _, _| {
/// let paths = filter_map_chunks(&["a", "b", "c"], 3, |chunk, chunk_index, _| {
/// if chunk_index > 0 || chunk == "a" {
/// Ok(Some(vec![chunk.to_uppercase()]))
/// } else {
@ -117,7 +117,7 @@ pub fn full_keyword(query: &str, keywords: &[impl AsRef<str>]) -> String {
///
/// ```
/// # use suggest::util::filter_map_chunks;
/// let paths = filter_map_chunks(&["a", "b", "c"], 3, |chunk, _, _, path| {
/// let paths = filter_map_chunks(&["a", "b", "c"], 3, |chunk, _, path| {
/// if path.iter().any(|value| value == "A B") {
/// Ok(None)
/// } else {
@ -135,7 +135,7 @@ pub fn full_keyword(query: &str, keywords: &[impl AsRef<str>]) -> String {
///
/// ```
/// # use suggest::util::filter_map_chunks;
/// let paths = filter_map_chunks(&["a", "b", "c"], 3, |chunk, _, _, _| {
/// let paths = filter_map_chunks(&["a", "b", "c"], 3, |chunk, _, _| {
/// Ok(Some(vec![format!("{chunk}0"), format!("{chunk}1")]))
/// });
/// assert_eq!(paths.unwrap(), vec![
@ -162,7 +162,7 @@ pub fn full_keyword(query: &str, keywords: &[impl AsRef<str>]) -> String {
pub fn filter_map_chunks<T: Clone>(
words: &[&str],
max_chunk_size: usize,
f: impl Fn(&str, usize, usize, &[T]) -> Result<Option<Vec<T>>>,
f: impl Fn(&str, usize, &[T]) -> Result<Option<Vec<T>>>,
) -> Result<Vec<Vec<T>>> {
let normalized_query = words.join(" ");
filter_map_chunks_recurse(words, &normalized_query, &mut vec![], 0, max_chunk_size, &f)
@ -182,7 +182,7 @@ fn filter_map_chunks_recurse<T: Clone>(
path: &mut Vec<T>,
chunk_index: usize,
max_chunk_size: usize,
f: &impl Fn(&str, usize, usize, &[T]) -> Result<Option<Vec<T>>>,
f: &impl Fn(&str, usize, &[T]) -> Result<Option<Vec<T>>>,
) -> Result<Vec<Vec<T>>> {
// Filtered-in (non-pruned) paths that will be returned from this step of
// the traversal.
@ -206,7 +206,7 @@ fn filter_map_chunks_recurse<T: Clone>(
let chunk = &remaining_query[..chunk_char_len];
// Call the mapper function.
if let Some(mapped_values) = f(chunk, chunk_index, chunk_size, &path[..])? {
if let Some(mapped_values) = f(chunk, chunk_index, &path[..])? {
for value in mapped_values {
if chunk_size == remaining_words.len() {
// This is the final chunk in the path. Stop recursing.
@ -302,7 +302,7 @@ mod tests {
fn fmc<T: Clone>(
query: &str,
max_chunk_size: usize,
f: impl Fn(&str, usize, usize, &[T]) -> Result<Option<Vec<T>>>,
f: impl Fn(&str, usize, &[T]) -> Result<Option<Vec<T>>>,
) -> Result<Vec<Vec<T>>> {
let words: Vec<_> = query.split_whitespace().collect();
filter_map_chunks(&words, max_chunk_size, f)
@ -323,7 +323,7 @@ mod tests {
#[test]
fn filter_map_chunks_1() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 1, |chunk, chunk_index, _, _| {
let paths = fmc("a b c d e", 1, |chunk, chunk_index, _| {
Ok(Some(vec![(chunk.to_string(), chunk_index)]))
})?;
check_paths(
@ -335,7 +335,7 @@ mod tests {
#[test]
fn filter_map_chunks_2() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 2, |chunk, chunk_index, _, _| {
let paths = fmc("a b c d e", 2, |chunk, chunk_index, _| {
Ok(Some(vec![(chunk.to_string(), chunk_index)]))
})?;
check_paths(
@ -356,7 +356,7 @@ mod tests {
#[test]
fn filter_map_chunks_3() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 3, |chunk, chunk_index, _, _| {
let paths = fmc("a b c d e", 3, |chunk, chunk_index, _| {
Ok(Some(vec![(chunk.to_string(), chunk_index)]))
})?;
check_paths(
@ -382,7 +382,7 @@ mod tests {
#[test]
fn filter_map_chunks_4() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 4, |chunk, chunk_index, _, _| {
let paths = fmc("a b c d e", 4, |chunk, chunk_index, _| {
Ok(Some(vec![(chunk.to_string(), chunk_index)]))
})?;
check_paths(
@ -410,7 +410,7 @@ mod tests {
#[test]
fn filter_map_chunks_5() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 5, |chunk, chunk_index, _, _| {
let paths = fmc("a b c d e", 5, |chunk, chunk_index, _| {
Ok(Some(vec![(chunk.to_string(), chunk_index)]))
})?;
check_paths(
@ -439,7 +439,7 @@ mod tests {
#[test]
fn filter_map_chunks_1_map_many() -> anyhow::Result<()> {
let paths = fmc("a b c", 1, |chunk, _, _, _| {
let paths = fmc("a b c", 1, |chunk, _, _| {
Ok(Some((0..3).map(|i| format!("{chunk}{i}")).collect()))
})?;
assert_eq!(
@ -479,7 +479,7 @@ mod tests {
#[test]
fn filter_map_chunks_2_map_many() -> anyhow::Result<()> {
let paths = fmc("a b c", 2, |chunk, _, _, _| {
let paths = fmc("a b c", 2, |chunk, _, _| {
Ok(Some((0..3).map(|i| format!("{chunk}{i}")).collect()))
})?;
assert_eq!(
@ -535,35 +535,9 @@ mod tests {
Ok(())
}
// Checks the value passed as the mapper's third argument (`chunk_size`): each
// expected entry pairs a chunk with the number of words that chunk contains.
#[test]
fn filter_map_chunks_3_chunk_size() -> anyhow::Result<()> {
// Max chunk size is 3, so no expected chunk below has more than three words.
let paths = fmc("a b c d e", 3, |chunk, _, chunk_size, _| {
Ok(Some(vec![(chunk.to_string(), chunk_size)]))
})?;
check_paths(
paths,
vec![
vec![("a", 1), ("b", 1), ("c", 1), ("d", 1), ("e", 1)],
vec![("a", 1), ("b", 1), ("c", 1), ("d e", 2)],
vec![("a", 1), ("b", 1), ("c d", 2), ("e", 1)],
vec![("a", 1), ("b", 1), ("c d e", 3)],
vec![("a", 1), ("b c", 2), ("d", 1), ("e", 1)],
vec![("a", 1), ("b c", 2), ("d e", 2)],
vec![("a", 1), ("b c d", 3), ("e", 1)],
vec![("a b", 2), ("c", 1), ("d", 1), ("e", 1)],
vec![("a b", 2), ("c", 1), ("d e", 2)],
vec![("a b", 2), ("c d", 2), ("e", 1)],
vec![("a b", 2), ("c d e", 3)],
vec![("a b c", 3), ("d", 1), ("e", 1)],
vec![("a b c", 3), ("d e", 2)],
],
);
Ok(())
}
#[test]
fn filter_map_chunks_1_prune_a() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 1, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 1, |chunk, chunk_index, _| match chunk {
"a" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -573,7 +547,7 @@ mod tests {
#[test]
fn filter_map_chunks_1_prune_b() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 1, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 1, |chunk, chunk_index, _| match chunk {
"b" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -583,7 +557,7 @@ mod tests {
#[test]
fn filter_map_chunks_1_prune_c() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 1, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 1, |chunk, chunk_index, _| match chunk {
"c" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -593,7 +567,7 @@ mod tests {
#[test]
fn filter_map_chunks_1_prune_d() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 1, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 1, |chunk, chunk_index, _| match chunk {
"d" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -603,7 +577,7 @@ mod tests {
#[test]
fn filter_map_chunks_1_prune_e() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 1, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 1, |chunk, chunk_index, _| match chunk {
"e" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -613,7 +587,7 @@ mod tests {
#[test]
fn filter_map_chunks_2_prune_a() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 2, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 2, |chunk, chunk_index, _| match chunk {
"a" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -630,7 +604,7 @@ mod tests {
#[test]
fn filter_map_chunks_2_prune_b() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 2, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 2, |chunk, chunk_index, _| match chunk {
"b" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -649,7 +623,7 @@ mod tests {
#[test]
fn filter_map_chunks_2_prune_c() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 2, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 2, |chunk, chunk_index, _| match chunk {
"c" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -667,7 +641,7 @@ mod tests {
#[test]
fn filter_map_chunks_2_prune_d() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 2, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 2, |chunk, chunk_index, _| match chunk {
"d" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -686,7 +660,7 @@ mod tests {
#[test]
fn filter_map_chunks_2_prune_e() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 2, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 2, |chunk, chunk_index, _| match chunk {
"e" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -703,7 +677,7 @@ mod tests {
#[test]
fn filter_map_chunks_2_prune_ab() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 2, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 2, |chunk, chunk_index, _| match chunk {
"a b" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -722,7 +696,7 @@ mod tests {
#[test]
fn filter_map_chunks_2_prune_bc() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 2, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 2, |chunk, chunk_index, _| match chunk {
"b c" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -742,7 +716,7 @@ mod tests {
#[test]
fn filter_map_chunks_2_prune_cd() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 2, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 2, |chunk, chunk_index, _| match chunk {
"c d" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -762,7 +736,7 @@ mod tests {
#[test]
fn filter_map_chunks_2_prune_de() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 2, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 2, |chunk, chunk_index, _| match chunk {
"d e" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -781,7 +755,7 @@ mod tests {
#[test]
fn filter_map_chunks_2_prune_a_bc() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 2, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 2, |chunk, chunk_index, _| match chunk {
"a" | "b c" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -798,7 +772,7 @@ mod tests {
#[test]
fn filter_map_chunks_2_prune_a_cd() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 2, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 2, |chunk, chunk_index, _| match chunk {
"a" | "c d" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -814,7 +788,7 @@ mod tests {
#[test]
fn filter_map_chunks_2_prune_bc_cd() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 2, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 2, |chunk, chunk_index, _| match chunk {
"b c" | "c d" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -832,7 +806,7 @@ mod tests {
#[test]
fn filter_map_chunks_2_prune_bc_de() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 2, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 2, |chunk, chunk_index, _| match chunk {
"b c" | "d e" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -850,7 +824,7 @@ mod tests {
#[test]
fn filter_map_chunks_3_prune_a() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 3, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 3, |chunk, chunk_index, _| match chunk {
"a" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -870,7 +844,7 @@ mod tests {
#[test]
fn filter_map_chunks_3_prune_b() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 3, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 3, |chunk, chunk_index, _| match chunk {
"b" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -893,7 +867,7 @@ mod tests {
#[test]
fn filter_map_chunks_3_prune_c() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 3, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 3, |chunk, chunk_index, _| match chunk {
"c" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -916,7 +890,7 @@ mod tests {
#[test]
fn filter_map_chunks_3_prune_d() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 3, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 3, |chunk, chunk_index, _| match chunk {
"d" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -939,7 +913,7 @@ mod tests {
#[test]
fn filter_map_chunks_3_prune_e() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 3, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 3, |chunk, chunk_index, _| match chunk {
"e" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -959,7 +933,7 @@ mod tests {
#[test]
fn filter_map_chunks_3_prune_ab() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 3, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 3, |chunk, chunk_index, _| match chunk {
"a b" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -982,7 +956,7 @@ mod tests {
#[test]
fn filter_map_chunks_3_prune_bc() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 3, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 3, |chunk, chunk_index, _| match chunk {
"b c" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -1007,7 +981,7 @@ mod tests {
#[test]
fn filter_map_chunks_3_prune_cd() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 3, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 3, |chunk, chunk_index, _| match chunk {
"c d" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -1032,7 +1006,7 @@ mod tests {
#[test]
fn filter_map_chunks_3_prune_de() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 3, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 3, |chunk, chunk_index, _| match chunk {
"d e" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -1055,7 +1029,7 @@ mod tests {
#[test]
fn filter_map_chunks_3_prune_abc() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 3, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 3, |chunk, chunk_index, _| match chunk {
"a b c" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -1080,7 +1054,7 @@ mod tests {
#[test]
fn filter_map_chunks_3_prune_bcd() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 3, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 3, |chunk, chunk_index, _| match chunk {
"b c d" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -1106,7 +1080,7 @@ mod tests {
#[test]
fn filter_map_chunks_3_prune_cde() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 3, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 3, |chunk, chunk_index, _| match chunk {
"c d e" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -1131,7 +1105,7 @@ mod tests {
#[test]
fn filter_map_chunks_3_prune_a_bc_cde() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 3, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 3, |chunk, chunk_index, _| match chunk {
"a" | "b c" | "c d e" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -1150,7 +1124,7 @@ mod tests {
#[test]
fn filter_map_chunks_4_prune_a() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 4, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 4, |chunk, chunk_index, _| match chunk {
"a" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -1171,7 +1145,7 @@ mod tests {
#[test]
fn filter_map_chunks_4_prune_b() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 4, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 4, |chunk, chunk_index, _| match chunk {
"b" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -1196,7 +1170,7 @@ mod tests {
#[test]
fn filter_map_chunks_4_prune_c() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 4, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 4, |chunk, chunk_index, _| match chunk {
"c" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -1221,7 +1195,7 @@ mod tests {
#[test]
fn filter_map_chunks_4_prune_d() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 4, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 4, |chunk, chunk_index, _| match chunk {
"d" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -1246,7 +1220,7 @@ mod tests {
#[test]
fn filter_map_chunks_4_prune_e() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 4, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 4, |chunk, chunk_index, _| match chunk {
"e" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -1267,7 +1241,7 @@ mod tests {
#[test]
fn filter_map_chunks_4_prune_ab() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 4, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 4, |chunk, chunk_index, _| match chunk {
"a b" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -1292,7 +1266,7 @@ mod tests {
#[test]
fn filter_map_chunks_4_prune_bc() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 4, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 4, |chunk, chunk_index, _| match chunk {
"b c" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -1319,7 +1293,7 @@ mod tests {
#[test]
fn filter_map_chunks_4_prune_cd() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 4, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 4, |chunk, chunk_index, _| match chunk {
"c d" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -1346,7 +1320,7 @@ mod tests {
#[test]
fn filter_map_chunks_4_prune_de() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 4, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 4, |chunk, chunk_index, _| match chunk {
"d e" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -1371,7 +1345,7 @@ mod tests {
#[test]
fn filter_map_chunks_4_prune_abc() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 4, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 4, |chunk, chunk_index, _| match chunk {
"a b c" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -1398,7 +1372,7 @@ mod tests {
#[test]
fn filter_map_chunks_4_prune_bcd() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 4, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 4, |chunk, chunk_index, _| match chunk {
"b c d" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -1426,7 +1400,7 @@ mod tests {
#[test]
fn filter_map_chunks_4_prune_cde() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 4, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 4, |chunk, chunk_index, _| match chunk {
"c d e" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -1453,7 +1427,7 @@ mod tests {
#[test]
fn filter_map_chunks_4_prune_abcd() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 4, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 4, |chunk, chunk_index, _| match chunk {
"a b c d" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -1481,7 +1455,7 @@ mod tests {
#[test]
fn filter_map_chunks_4_prune_bcde() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 4, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 4, |chunk, chunk_index, _| match chunk {
"b c d e" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -1509,7 +1483,7 @@ mod tests {
#[test]
fn filter_map_chunks_4_prune_a_bc_de() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 4, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 4, |chunk, chunk_index, _| match chunk {
"a" | "b c" | "d e" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -1528,7 +1502,7 @@ mod tests {
#[test]
fn filter_map_chunks_4_prune_a_bc_cde() -> anyhow::Result<()> {
let paths = fmc("a b c d e", 4, |chunk, chunk_index, _, _| match chunk {
let paths = fmc("a b c d e", 4, |chunk, chunk_index, _| match chunk {
"a" | "b c" | "c d e" => Ok(None),
_ => Ok(Some(vec![(chunk.to_string(), chunk_index)])),
})?;
@ -1548,7 +1522,7 @@ mod tests {
#[test]
fn filter_map_chunks_spaces() -> anyhow::Result<()> {
let paths = fmc(" a b c d e ", 2, |chunk, chunk_index, _, _| {
let paths = fmc(" a b c d e ", 2, |chunk, chunk_index, _| {
Ok(Some(vec![(chunk.to_string(), chunk_index)]))
})?;
check_paths(

Просмотреть файл

@ -15,7 +15,7 @@ use crate::{
KeywordInsertStatement, KeywordMetricsInsertStatement, SuggestDao,
SuggestionInsertStatement, DEFAULT_SUGGESTION_SCORE,
},
geoname::{Geoname, GeonameType},
geoname::{GeonameMatch, GeonameType},
metrics::MetricsContext,
provider::SuggestionProvider,
rs::{Client, Record, SuggestRecordId},
@ -101,7 +101,8 @@ impl SuggestDao<'_> {
.collect();
let mut matches =
filter_map_chunks::<Token>(&words, max_chunk_size, |chunk, c_i, c_size, path| {
// Step 2: Parse the query words into a list of token paths.
filter_map_chunks::<Token>(&words, max_chunk_size, |chunk, chunk_i, path| {
// Match the chunk to token types that haven't already been matched
// in this path. `all_tokens` will remain `None` until a token is
// matched.
@ -112,21 +113,7 @@ impl SuggestDao<'_> {
TokenType::WeatherKeyword,
] {
if !path.iter().any(|t| t.token_type() == tt) {
let is_first_chunk = c_i == 0;
let is_last_chunk = c_i + c_size == words.len();
// Match prefixes if the chunk isn't the first term in
// the query.
let match_prefixes = !is_first_chunk;
// Match abbreviations if the chunk isn't the only term
// in the query.
let match_abbreviations = !is_first_chunk || !is_last_chunk;
let mut tokens = self.match_weather_tokens(
tt,
path,
chunk,
match_prefixes,
match_abbreviations,
)?;
let mut tokens = self.match_weather_tokens(tt, path, chunk, chunk_i == 0)?;
if !tokens.is_empty() {
let mut ts = all_tokens.take().unwrap_or_default();
ts.append(&mut tokens);
@ -138,8 +125,8 @@ impl SuggestDao<'_> {
Ok(all_tokens)
})?
.into_iter()
// Map each token path to a tuple that represents a matched city,
// region, and keyword (each optional). Since paths are vecs,
// Step 3: Map each token path to a tuple that represents a matched
// city, region, and keyword (each optional). Since paths are vecs,
// they're ordered, so we may end up with duplicate tuples after
// this step. e.g., the paths `[<Waterloo IA>, <IA>]` and `[<IA>,
// <Waterloo IA>]` map to the same match.
@ -160,17 +147,34 @@ impl SuggestDao<'_> {
match_tuple
})
})
// Dedupe the matches by collecting them into a set.
// Step 4: Discard matches that don't have the right combination of
// tokens or that are otherwise invalid. Along with step 1, this is
// the core of the matching logic. In general, allow a match if it
// has (a) a city name typed in full or (b) a weather keyword at
// least as long as the config's min keyword length, since that
// indicates a weather intent.
.filter(|(city_match, region_match, kw_match)| {
match (city_match, region_match, kw_match) {
(None, None, Some(_)) => true,
(None, _, None) | (None, Some(_), Some(_)) => false,
(Some(city), region, kw) => {
(city.match_type.is_name() && !city.prefix)
// Allow city abbreviations without a weather
// keyword but only if the region was typed in full.
|| (city.match_type.is_abbreviation()
&& !city.prefix
&& region.as_ref().map(|r| !r.prefix).unwrap_or(false))
|| kw.as_ref().map(|k| k.is_min_keyword_length).unwrap_or(false)
}
}
})
// Step 5: Map the match objects to their underlying values.
.map(|(city, region, kw)| {
(city.map(|c| c.geoname), region.map(|r| r.geoname), kw.map(|k| k.keyword))
})
// Step 6: Dedupe the values by collecting them into a set.
.collect::<HashSet<_>>()
.into_iter()
// Filter out matches that don't have the right combination of
// tokens.
.filter(|(city, region, kw)| {
!matches!(
(city, region, kw),
(None, _, None) | (None, Some(_), Some(_))
)
})
.collect::<Vec<_>>();
// Sort the matches so cities with larger populations are first.
@ -207,19 +211,20 @@ impl SuggestDao<'_> {
token_type: TokenType,
path: &[Token],
candidate: &str,
match_prefixes: bool,
match_abbreviations: bool,
is_first_chunk: bool,
) -> Result<Vec<Token>> {
match token_type {
TokenType::City => {
// Fetch matching cities, and filter them to regions we've
// already matched in this path.
let regions: Vec<_> = path.iter().filter_map(|t| t.region()).collect();
let regions: Vec<_> = path
.iter()
.filter_map(|t| t.region().map(|m| &m.geoname))
.collect();
Ok(self
.fetch_geonames(
candidate,
match_prefixes,
match_abbreviations,
!is_first_chunk,
Some(GeonameType::City),
if regions.is_empty() {
None
@ -234,12 +239,14 @@ impl SuggestDao<'_> {
TokenType::Region => {
// Fetch matching regions, and filter them to cities we've
// already matched in this path.
let cities: Vec<_> = path.iter().filter_map(|t| t.city()).collect();
let cities: Vec<_> = path
.iter()
.filter_map(|t| t.city().map(|m| &m.geoname))
.collect();
Ok(self
.fetch_geonames(
candidate,
match_prefixes,
match_abbreviations,
!is_first_chunk,
Some(GeonameType::Region),
if cities.is_empty() {
None
@ -255,11 +262,10 @@ impl SuggestDao<'_> {
// Fetch matching keywords. `min_keyword_length == 0` in the
// config means that the config doesn't allow prefix matching.
// `min_keyword_length > 0` means that the keyword must be at
// least that long when it's the first term in the query. We
// assume that `match_prefixes == false` means the candidate is
// the first term in the query.
// least that long when there's not already a city name present
// in the query.
let len = self.weather_cache().min_keyword_length;
if !match_prefixes && (candidate.len() as i32) < len {
if is_first_chunk && (candidate.len() as i32) < len {
// The candidate is the first term in the query and it's too
// short.
Ok(vec![])
@ -268,9 +274,14 @@ impl SuggestDao<'_> {
// first term in the query or if the config allows prefix
// matching.
Ok(self
.match_weather_keywords(candidate, match_prefixes || len > 0)?
.match_weather_keywords(candidate, !is_first_chunk || len > 0)?
.into_iter()
.map(Token::WeatherKeyword)
.map(|keyword| {
Token::WeatherKeyword(WeatherKeywordMatch {
keyword,
is_min_keyword_length: (len as usize) <= candidate.len(),
})
})
.collect())
}
}
@ -282,7 +293,8 @@ impl SuggestDao<'_> {
r#"
SELECT
k.keyword,
s.score
s.score,
k.keyword != :keyword AS matched_prefix
FROM
suggestions s
JOIN
@ -420,20 +432,20 @@ enum TokenType {
#[derive(Clone, Debug)]
enum Token {
City(Geoname),
Region(Geoname),
WeatherKeyword(String),
City(GeonameMatch),
Region(GeonameMatch),
WeatherKeyword(WeatherKeywordMatch),
}
impl Token {
fn city(&self) -> Option<&Geoname> {
fn city(&self) -> Option<&GeonameMatch> {
match self {
Self::City(g) => Some(g),
_ => None,
}
}
fn region(&self) -> Option<&Geoname> {
fn region(&self) -> Option<&GeonameMatch> {
match self {
Self::Region(g) => Some(g),
_ => None,
@ -449,10 +461,18 @@ impl Token {
}
}
/// A weather keyword matched against a chunk of the query, together with a
/// flag recording whether the typed chunk met the configured minimum keyword
/// length.
#[derive(Clone, Debug, Default, Eq, Hash, PartialEq)]
struct WeatherKeywordMatch {
/// The matched keyword, as returned by `match_weather_keywords()`.
keyword: String,
/// `true` when the typed candidate is at least as long as the weather
/// config's `min_keyword_length`.
is_min_keyword_length: bool,
}
#[cfg(test)]
mod tests {
use super::*;
use crate::{geoname, store::tests::TestStore, testing::*, SuggestIngestionConstraints};
use crate::{
geoname, geoname::Geoname, store::tests::TestStore, testing::*, SuggestIngestionConstraints,
};
impl From<Geoname> for Suggestion {
fn from(g: Geoname) -> Self {
@ -627,7 +647,7 @@ mod tests {
"weather-1",
json!({
"keywords": ["ab", "xyz", "weather"],
"min_keyword_length": 3,
"min_keyword_length": 5,
"max_keyword_length": "weather".len(),
"max_keyword_word_count": 1,
"score": 0.24
@ -640,6 +660,244 @@ mod tests {
});
let tests: &[(&str, Vec<Suggestion>)] = &[
(
"act",
vec![],
),
(
"act w",
vec![],
),
(
"act we",
vec![],
),
(
"act wea",
vec![],
),
(
"act weat",
vec![],
),
(
// `min_keyword_length` = 5, so there should be a match.
"act weath",
vec![geoname::tests::waco().into()],
),
(
"act weathe",
vec![geoname::tests::waco().into()],
),
(
"act weather",
vec![geoname::tests::waco().into()],
),
(
"weather a",
// The made-up long-name city starts with A.
vec![geoname::tests::long_name_city().into()],
),
(
"weather ac",
vec![],
),
(
"weather act",
vec![geoname::tests::waco().into()],
),
(
"act t",
vec![],
),
(
"act tx",
vec![],
),
(
"act tx w",
vec![],
),
(
"act tx we",
vec![],
),
(
"act tx wea",
vec![],
),
(
"act tx weat",
vec![],
),
(
// `min_keyword_length` = 5, so there should be a match.
"act tx weath",
vec![geoname::tests::waco().into()],
),
(
"act tx weathe",
vec![geoname::tests::waco().into()],
),
(
"act tx weather",
vec![geoname::tests::waco().into()],
),
(
"tx a",
vec![],
),
(
"tx ac",
vec![],
),
(
"tx act",
vec![],
),
(
"tx act w",
vec![],
),
(
"tx act we",
vec![],
),
(
"tx act wea",
vec![],
),
(
"tx act weat",
vec![],
),
(
// `min_keyword_length` = 5, so there should be a match.
"tx act weath",
vec![geoname::tests::waco().into()],
),
(
"tx act weathe",
vec![geoname::tests::waco().into()],
),
(
"tx act weather",
vec![geoname::tests::waco().into()],
),
(
"act te",
vec![],
),
(
"act tex",
vec![],
),
(
"act texa",
vec![],
),
(
"act texas",
vec![],
),
(
"act texas w",
vec![],
),
(
"act texas we",
vec![],
),
(
"act texas wea",
vec![],
),
(
"act texas weat",
vec![],
),
(
// `min_keyword_length` = 5, so there should be a match.
"act texas weath",
vec![geoname::tests::waco().into()],
),
(
"act texas weathe",
vec![geoname::tests::waco().into()],
),
(
"act texas weather",
vec![geoname::tests::waco().into()],
),
(
"texas a",
vec![],
),
(
"texas ac",
vec![],
),
(
"texas act",
vec![],
),
(
"texas act w",
vec![],
),
(
"texas act we",
vec![],
),
(
"texas act wea",
vec![],
),
(
"texas act weat",
vec![],
),
(
// `min_keyword_length` = 5, so there should be a match.
"texas act weath",
vec![geoname::tests::waco().into()],
),
(
"texas act weathe",
vec![geoname::tests::waco().into()],
),
(
"texas act weather",
vec![geoname::tests::waco().into()],
),
(
"ia w",
vec![],
),
(
"ia wa",
vec![],
),
(
"ia wat",
vec![],
),
(
"ia wate",
vec![],
),
(
"ia water",
vec![],
),
(
"ia waterl",
vec![],
),
(
"ia waterlo",
vec![],
),
(
"waterloo",
vec![
@ -649,10 +907,26 @@ mod tests {
geoname::tests::waterloo_al().into(),
],
),
(
"waterloo i",
vec![geoname::tests::waterloo_ia().into()],
),
(
"waterloo ia",
vec![geoname::tests::waterloo_ia().into()],
),
(
"waterloo io",
vec![geoname::tests::waterloo_ia().into()],
),
(
"waterloo iow",
vec![geoname::tests::waterloo_ia().into()],
),
(
"waterloo iowa",
vec![geoname::tests::waterloo_ia().into()],
),
(
"ia waterloo",
vec![geoname::tests::waterloo_ia().into()],
@ -696,6 +970,34 @@ mod tests {
vec![geoname::tests::nyc().into()],
),
("ny ny ny", vec![]),
(
"ny n",
vec![],
),
(
"ny ne",
vec![],
),
(
"ny new",
vec![],
),
(
"ny new ",
vec![],
),
(
"ny new y",
vec![],
),
(
"ny new yo",
vec![],
),
(
"ny new yor",
vec![],
),
(
"ny new york",
vec![geoname::tests::nyc().into()],
@ -712,6 +1014,31 @@ mod tests {
"ny weather",
vec![geoname::tests::nyc().into()],
),
(
"ny w",
vec![],
),
(
"ny we",
vec![],
),
(
"ny wea",
vec![],
),
(
"ny weat",
vec![],
),
(
// `min_keyword_length` = 5, so there should be a match.
"ny weath",
vec![geoname::tests::nyc().into()],
),
(
"ny weathe",
vec![geoname::tests::nyc().into()],
),
(
"weather ny ny",
vec![geoname::tests::nyc().into()],
@ -827,11 +1154,11 @@ mod tests {
),
(
"roc ny",
vec![geoname::tests::rochester().into()],
vec![],
),
(
"ny roc",
vec![geoname::tests::rochester().into()],
vec![],
),
(
"nyc weather",
@ -1129,7 +1456,9 @@ mod tests {
for (query, expected_suggestions) in tests {
assert_eq!(
&store.fetch_suggestions(SuggestionQuery::weather(query)),
expected_suggestions
expected_suggestions,
"Query: {:?}",
query
);
}