Switch relevancy component from UDL to proc-macros
This commit is contained in:
Родитель
8be45cd323
Коммит
220af9af6c
|
@ -1,8 +0,0 @@
|
|||
/* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
*/
|
||||
|
||||
fn main() {
|
||||
uniffi::generate_scaffolding("./src/relevancy.udl").unwrap();
|
||||
}
|
|
@ -6,7 +6,7 @@
|
|||
use error_support::{ErrorHandling, GetErrorHandling};
|
||||
|
||||
/// Errors we return via the public interface.
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
#[derive(Debug, thiserror::Error, uniffi::Error)]
|
||||
pub enum RelevancyApiError {
|
||||
#[error("Unexpected Error: {reason}")]
|
||||
Unexpected { reason: String },
|
||||
|
|
|
@ -34,7 +34,7 @@ impl ToSql for InterestVectorKind {
|
|||
/// List of possible interests for a domain. Domains can be associated with one or multiple
|
||||
/// interests. `Inconclusive` is used for domains in the user's top sites that we can't classify
|
||||
/// because there's no corresponding entry in the interest database.
|
||||
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
|
||||
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord, uniffi::Enum)]
|
||||
#[repr(u32)]
|
||||
pub enum Interest {
|
||||
// Note: if you change these codes, make sure to update the `TryFrom<u32>` implementation and
|
||||
|
@ -149,7 +149,7 @@ impl ToSql for Interest {
|
|||
///
|
||||
/// Here "vector" refers to the mathematical object, not a Rust `Vec`. It always has a fixed
|
||||
/// number of elements.
|
||||
#[derive(Debug, Default, PartialEq, Eq)]
|
||||
#[derive(Debug, Default, PartialEq, Eq, uniffi::Record)]
|
||||
pub struct InterestVector {
|
||||
pub inconclusive: u32,
|
||||
pub animals: u32,
|
||||
|
|
|
@ -25,33 +25,39 @@ pub use ranker::score;
|
|||
|
||||
use error_support::handle_error;
|
||||
|
||||
uniffi::setup_scaffolding!();
|
||||
|
||||
#[derive(uniffi::Object)]
|
||||
pub struct RelevancyStore {
|
||||
db: RelevancyDb,
|
||||
}
|
||||
|
||||
/// Top-level API for the Relevancy component
|
||||
// Impl block to be exported via `UniFFI`.
|
||||
#[uniffi::export]
|
||||
impl RelevancyStore {
|
||||
/// Construct a new RelevancyStore
|
||||
///
|
||||
/// This is non-blocking since databases and other resources are lazily opened.
|
||||
#[uniffi::constructor]
|
||||
pub fn new(db_path: String) -> Self {
|
||||
Self {
|
||||
db: RelevancyDb::new(db_path),
|
||||
}
|
||||
}
|
||||
|
||||
/// Close any open resources (for example databases)
|
||||
///
|
||||
/// Calling `close` will interrupt any in-progress queries on other threads.
|
||||
pub fn close(&self) {
|
||||
self.db.close()
|
||||
}
|
||||
|
||||
/// Interrupt any current database queries
|
||||
pub fn interrupt(&self) {
|
||||
self.db.interrupt()
|
||||
}
|
||||
|
||||
/// Download the interest data from remote settings if needed
|
||||
#[handle_error(Error)]
|
||||
pub fn ensure_interest_data_populated(&self) -> ApiResult<()> {
|
||||
ingest::ensure_interest_data_populated(&self.db)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Ingest top URLs to build the user's interest vector.
|
||||
///
|
||||
/// Consumer should pass a list of the user's top URLs by frecency to this method. It will
|
||||
|
@ -72,17 +78,6 @@ impl RelevancyStore {
|
|||
Ok(interest_vec)
|
||||
}
|
||||
|
||||
pub fn classify(&self, top_urls_by_frecency: Vec<String>) -> Result<InterestVector> {
|
||||
let mut interest_vector = InterestVector::default();
|
||||
for url in top_urls_by_frecency {
|
||||
let interest_count = self.db.read(|dao| dao.get_url_interest_vector(&url))?;
|
||||
log::trace!("classified: {url} {}", interest_count.summary());
|
||||
interest_vector = interest_vector + interest_count;
|
||||
}
|
||||
|
||||
Ok(interest_vector)
|
||||
}
|
||||
|
||||
/// Calculate metrics for the validation phase
|
||||
///
|
||||
/// This runs after [Self::ingest]. It takes the interest vector that ingest created and
|
||||
|
@ -102,14 +97,50 @@ impl RelevancyStore {
|
|||
}
|
||||
}
|
||||
|
||||
/// Interest metric data. See `relevancy.udl` for details.
|
||||
pub struct InterestMetrics {
|
||||
pub top_single_interest_similarity: u32,
|
||||
pub top_2interest_similarity: u32,
|
||||
pub top_3interest_similarity: u32,
|
||||
impl RelevancyStore {
|
||||
/// Download the interest data from remote settings if needed
|
||||
#[handle_error(Error)]
|
||||
pub fn ensure_interest_data_populated(&self) -> ApiResult<()> {
|
||||
ingest::ensure_interest_data_populated(&self.db)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn classify(&self, top_urls_by_frecency: Vec<String>) -> Result<InterestVector> {
|
||||
let mut interest_vector = InterestVector::default();
|
||||
for url in top_urls_by_frecency {
|
||||
let interest_count = self.db.read(|dao| dao.get_url_interest_vector(&url))?;
|
||||
log::trace!("classified: {url} {}", interest_count.summary());
|
||||
interest_vector = interest_vector + interest_count;
|
||||
}
|
||||
Ok(interest_vector)
|
||||
}
|
||||
}
|
||||
|
||||
uniffi::include_scaffolding!("relevancy");
|
||||
/// Interest metrics that we want to send to Glean as part of the validation process. These contain
|
||||
/// the cosine similarity when comparing the user's interest against various interest vectors that
|
||||
/// consumers may use.
|
||||
///
|
||||
/// Cosine similarity was chosen because it seems easy to calculate. This was then matched against
|
||||
/// some semi-plausible real-world interest vectors that consumers might use. This is all up for
|
||||
/// debate and we may decide to switch to some other metrics.
|
||||
///
|
||||
/// Similarity values are transformed to integers by multiplying the floating point value by 1000 and
|
||||
/// rounding. This is to make them compatible with Glean's distribution metrics.
|
||||
#[derive(uniffi::Record)]
|
||||
pub struct InterestMetrics {
|
||||
/// Similarity between the user's interest vector and an interest vector where the element for
|
||||
/// the user's top interest is copied, but all other interests are set to zero. This measures
|
||||
/// the highest possible similarity with consumers that used interest vectors with a single
|
||||
/// interest set.
|
||||
pub top_single_interest_similarity: u32,
|
||||
/// The same as before, but the top 2 interests are copied. This measures the highest possible
|
||||
/// similarity with consumers that used interest vectors with two interests (note: this means
|
||||
/// they would need to choose the user's top two interests and have the exact same proportion
|
||||
/// between them as the user).
|
||||
pub top_2interest_similarity: u32,
|
||||
/// The same as before, but the top 3 interests are copied.
|
||||
pub top_3interest_similarity: u32,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
|
|
|
@ -14,6 +14,7 @@ use crate::interest::{Interest, InterestVector};
|
|||
/// - `content_categories`: a list of categories (interests) of the given content.
|
||||
/// Return:
|
||||
/// - A score in the range [0, 1].
|
||||
#[uniffi::export]
|
||||
pub fn score(interest_vector: InterestVector, content_categories: Vec<Interest>) -> f64 {
|
||||
let n = content_categories
|
||||
.iter()
|
||||
|
|
|
@ -1,125 +0,0 @@
|
|||
namespace relevancy {
|
||||
// Calculate score for a piece of categorized content based on a user interest vector.
|
||||
//
|
||||
// Params:
|
||||
// - `interest_vector`: a user interest vector that can be fetched via
|
||||
// `RelevancyStore::user_interest_vector()`.
|
||||
// - `content_categories`: a list of categories (interests) of the given content.
|
||||
// Return:
|
||||
// - A score ranges in [0, 1].
|
||||
double score(InterestVector interest_vector, sequence<Interest> content_categories);
|
||||
};
|
||||
|
||||
[Error]
|
||||
interface RelevancyApiError {
|
||||
Unexpected(string reason);
|
||||
};
|
||||
|
||||
// Top-level class for the Relevancy component
|
||||
interface RelevancyStore {
|
||||
/// Construct a new RelevancyStore
|
||||
///
|
||||
/// This is non-blocking since databases and other resources are lazily opened.
|
||||
constructor(string dbpath);
|
||||
|
||||
/// Close any open resources (for example databases)
|
||||
///
|
||||
/// Calling `close` will interrupt any in-progress queries on other threads.
|
||||
void close();
|
||||
|
||||
/// Interrupt any current database queries
|
||||
void interrupt();
|
||||
|
||||
/// Ingest the top URLs by frecency to build up the user's interest vector
|
||||
[Throws=RelevancyApiError]
|
||||
InterestVector ingest(sequence<string> top_urls);
|
||||
|
||||
/// Calculate metrics for the user's interest vector in order to measure how strongly we're
|
||||
/// identifying interests. See the `InterestMetrics` struct for details.
|
||||
[Throws=RelevancyApiError]
|
||||
InterestMetrics calculate_metrics();
|
||||
|
||||
/// Get the interest vector for the user.
|
||||
///
|
||||
/// This is intended to be shown to the user in an `about:` page so that users can judge if it
|
||||
/// feels correct.
|
||||
[Throws=RelevancyApiError]
|
||||
InterestVector user_interest_vector();
|
||||
};
|
||||
|
||||
enum Interest {
|
||||
"Animals",
|
||||
"Arts",
|
||||
"Autos",
|
||||
"Business",
|
||||
"Career",
|
||||
"Education",
|
||||
"Fashion",
|
||||
"Finance",
|
||||
"Food",
|
||||
"Government",
|
||||
// "Health",
|
||||
"Hobbies",
|
||||
"Home",
|
||||
"News",
|
||||
"RealEstate",
|
||||
"Society",
|
||||
"Sports",
|
||||
"Tech",
|
||||
"Travel",
|
||||
"Inconclusive",
|
||||
};
|
||||
|
||||
/// Interest metrics that we want to send to Glean as part of the validation process. These contain
|
||||
/// the cosine similarity when comparing the user's interest against various interest vectors that
|
||||
/// consumers may use.
|
||||
///
|
||||
/// Cosine similarity was chosen because it seems easy to calculate. This was then matched against
|
||||
/// some semi-plausible real-world interest vectors that consumers might use. This is all up for
|
||||
/// debate and we may decide to switch to some other metrics.
|
||||
///
|
||||
/// Similarity values are transformed to integers by multiplying the floating point value by 1000 and
|
||||
/// rounding. This is to make them compatible with Glean's distribution metrics.
|
||||
dictionary InterestMetrics {
|
||||
/// Similarity between the user's interest vector and an interest vector where the element for
|
||||
/// the user's top interest is copied, but all other interests are set to zero. This measures
|
||||
/// the highest possible similarity with consumers that used interest vectors with a single
|
||||
/// interest set.
|
||||
u32 top_single_interest_similarity;
|
||||
|
||||
/// The same as before, but the top 2 interests are copied. This measures the highest possible
|
||||
/// similarity with consumers that used interest vectors with two interests (note: this means
|
||||
/// they would need to choose the user's top two interests and have the exact same proportion
|
||||
/// between them as the user).
|
||||
u32 top_2interest_similarity;
|
||||
|
||||
/// The same as before, but the top 3 interests are copied.
|
||||
u32 top_3interest_similarity;
|
||||
};
|
||||
|
||||
/// Vector storing a count value for each interest
|
||||
///
|
||||
/// Here "vector" refers to the mathematical object, not a Rust `Vec`. It always has a fixed
|
||||
/// number of elements.
|
||||
dictionary InterestVector {
|
||||
u32 animals;
|
||||
u32 arts;
|
||||
u32 autos;
|
||||
u32 business;
|
||||
u32 career;
|
||||
u32 education;
|
||||
u32 fashion;
|
||||
u32 finance;
|
||||
u32 food;
|
||||
u32 government;
|
||||
// u32 health;
|
||||
u32 hobbies;
|
||||
u32 home;
|
||||
u32 news;
|
||||
u32 real_estate;
|
||||
u32 society;
|
||||
u32 sports;
|
||||
u32 tech;
|
||||
u32 travel;
|
||||
u32 inconclusive;
|
||||
};
|
Загрузка…
Ссылка в новой задаче