Switch relevancy component from UDL to proc-macros

This commit is contained in:
Bastian Gruber 2024-09-18 09:56:53 -03:00
Родитель 8be45cd323
Коммит 220af9af6c
6 изменённых файлов: 59 добавлений и 160 удалений

Просмотреть файл

@ -1,8 +0,0 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/
/// Generates the UniFFI scaffolding from `./src/relevancy.udl`.
///
/// Panics if scaffolding generation returns an error.
fn main() {
    let udl_file = "./src/relevancy.udl";
    uniffi::generate_scaffolding(udl_file).unwrap();
}

Просмотреть файл

@ -6,7 +6,7 @@
use error_support::{ErrorHandling, GetErrorHandling};
/// Errors we return via the public interface.
#[derive(Debug, thiserror::Error)]
#[derive(Debug, thiserror::Error, uniffi::Error)]
pub enum RelevancyApiError {
#[error("Unexpected Error: {reason}")]
Unexpected { reason: String },

Просмотреть файл

@ -34,7 +34,7 @@ impl ToSql for InterestVectorKind {
/// List of possible interests for a domain. Domains can be associated with one or multiple
/// interests. `Inconclusive` is used for domains in the user's top sites that we can't classify
/// because there's no corresponding entry in the interest database.
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord, uniffi::Enum)]
#[repr(u32)]
pub enum Interest {
// Note: if you change these codes, make sure to update the `TryFrom<u32>` implementation and
@ -149,7 +149,7 @@ impl ToSql for Interest {
///
/// Here "vector" refers to the mathematical object, not a Rust `Vec`. It always has a fixed
/// number of elements.
#[derive(Debug, Default, PartialEq, Eq)]
#[derive(Debug, Default, PartialEq, Eq, uniffi::Record)]
pub struct InterestVector {
pub inconclusive: u32,
pub animals: u32,

Просмотреть файл

@ -25,33 +25,39 @@ pub use ranker::score;
use error_support::handle_error;
uniffi::setup_scaffolding!();
/// Store object exported through UniFFI (`uniffi::Object`); wraps a [`RelevancyDb`].
#[derive(uniffi::Object)]
pub struct RelevancyStore {
/// Database handle owned by this store.
db: RelevancyDb,
}
/// Top-level API for the Relevancy component
// Impl block to be exported via `UniFFI`.
#[uniffi::export]
impl RelevancyStore {
/// Create a `RelevancyStore` whose database is constructed from `db_path`.
///
/// This is non-blocking since databases and other resources are lazily opened.
#[uniffi::constructor]
pub fn new(db_path: String) -> Self {
    let db = RelevancyDb::new(db_path);
    Self { db }
}
/// Close any open resources (for example databases) by closing the underlying database handle.
///
/// Calling `close` will interrupt any in-progress queries on other threads.
pub fn close(&self) {
    self.db.close();
}
/// Interrupt any database queries that are currently running.
pub fn interrupt(&self) {
    self.db.interrupt();
}
/// Download the interest data from remote settings if needed
///
/// Delegates to `ingest::ensure_interest_data_populated` with this store's database and
/// propagates any error from that call via `?`.
// NOTE(review): `#[handle_error(Error)]` presumably converts the internal `Error` into the
// error type carried by `ApiResult` — confirm against `error_support`.
#[handle_error(Error)]
pub fn ensure_interest_data_populated(&self) -> ApiResult<()> {
ingest::ensure_interest_data_populated(&self.db)?;
Ok(())
}
/// Ingest top URLs to build the user's interest vector.
///
/// Consumer should pass a list of the user's top URLs by frecency to this method. It will
@ -72,17 +78,6 @@ impl RelevancyStore {
Ok(interest_vec)
}
/// Sum the per-URL interest vectors for `top_urls_by_frecency` into a single vector.
///
/// URLs are processed in order, starting from `InterestVector::default()`. The first failed
/// database read stops the iteration and is returned as the error.
pub fn classify(&self, top_urls_by_frecency: Vec<String>) -> Result<InterestVector> {
    top_urls_by_frecency
        .into_iter()
        .try_fold(InterestVector::default(), |total, url| {
            let url_interests = self.db.read(|dao| dao.get_url_interest_vector(&url))?;
            log::trace!("classified: {url} {}", url_interests.summary());
            Ok(total + url_interests)
        })
}
/// Calculate metrics for the validation phase
///
/// This runs after [Self::ingest]. It takes the interest vector that ingest created and
@ -102,14 +97,50 @@ impl RelevancyStore {
}
}
/// Interest metric data. See `relevancy.udl` for details.
pub struct InterestMetrics {
pub top_single_interest_similarity: u32,
pub top_2interest_similarity: u32,
pub top_3interest_similarity: u32,
impl RelevancyStore {
    /// Download the interest data from remote settings if needed.
    ///
    /// Delegates to `ingest::ensure_interest_data_populated` with this store's database.
    #[handle_error(Error)]
    pub fn ensure_interest_data_populated(&self) -> ApiResult<()> {
        ingest::ensure_interest_data_populated(&self.db)?;
        Ok(())
    }

    /// Sum the per-URL interest vectors for `top_urls_by_frecency` into a single vector.
    ///
    /// URLs are processed in order, starting from `InterestVector::default()`. The first failed
    /// database read stops the iteration and is returned as the error.
    pub fn classify(&self, top_urls_by_frecency: Vec<String>) -> Result<InterestVector> {
        top_urls_by_frecency
            .into_iter()
            .try_fold(InterestVector::default(), |total, url| {
                let url_interests = self.db.read(|dao| dao.get_url_interest_vector(&url))?;
                log::trace!("classified: {url} {}", url_interests.summary());
                Ok(total + url_interests)
            })
    }
}
uniffi::include_scaffolding!("relevancy");
/// Interest metrics that we want to send to Glean as part of the validation process. These contain
/// the cosine similarity when comparing the user's interest against various interest vectors that
/// consumers may use.
///
/// Cosine similarity was chosen because it seems easy to calculate. This was then matched against
/// some semi-plausible real-world interest vectors that consumers might use. This is all up for
/// debate and we may decide to switch to some other metrics.
///
/// Similarity values are transformed to integers by multiplying the floating point value by 1000 and
/// rounding. This is to make them compatible with Glean's distribution metrics.
#[derive(uniffi::Record)]
pub struct InterestMetrics {
/// Similarity between the user's interest vector and an interest vector where the element for
/// the user's top interest is copied, but all other interests are set to zero. This measures
/// the highest possible similarity with consumers that used interest vectors with a single
/// interest set.
pub top_single_interest_similarity: u32,
/// The same as before, but the top 2 interests are copied. This measures the highest possible
/// similarity with consumers that used interest vectors with two interests (note: this means
/// they would need to choose the user's top two interests and have the exact same proportion
/// between them as the user).
pub top_2interest_similarity: u32,
/// The same as before, but the top 3 interests are copied.
pub top_3interest_similarity: u32,
}
#[cfg(test)]
mod test {

Просмотреть файл

@ -14,6 +14,7 @@ use crate::interest::{Interest, InterestVector};
/// - `content_categories`: a list of categories (interests) of the given content.
/// Return:
/// - A score in the range [0, 1].
#[uniffi::export]
pub fn score(interest_vector: InterestVector, content_categories: Vec<Interest>) -> f64 {
let n = content_categories
.iter()

Просмотреть файл

@ -1,125 +0,0 @@
namespace relevancy {
// Calculate score for a piece of categorized content based on a user interest vector.
//
// Params:
// - `interest_vector`: a user interest vector that can be fetched via
// `RelevancyStore::user_interest_vector()`.
// - `content_categories`: a list of categories (interests) of the given content.
// Return:
// - A score in the range [0, 1].
double score(InterestVector interest_vector, sequence<Interest> content_categories);
};
// Error type thrown by the `RelevancyStore` methods marked `[Throws=RelevancyApiError]`.
// `Unexpected` carries a human-readable `reason` string.
[Error]
interface RelevancyApiError {
Unexpected(string reason);
};
// Top-level class for the Relevancy component
interface RelevancyStore {
/// Construct a new RelevancyStore
///
/// This is non-blocking since databases and other resources are lazily opened.
constructor(string dbpath);
/// Close any open resources (for example databases)
///
/// Calling `close` will interrupt any in-progress queries on other threads.
void close();
/// Interrupt any current database queries
void interrupt();
/// Ingest the top URLs by frequency to build up the user's interest vector
[Throws=RelevancyApiError]
InterestVector ingest(sequence<string> top_urls);
/// Calculate metrics for the user's interest vector in order to measure how strongly we're
/// identifying interests. See the `InterestMetrics` struct for details.
[Throws=RelevancyApiError]
InterestMetrics calculate_metrics();
/// Get the interest vector for the user.
///
/// This is intended to be shown to the user in an `about:` page so that users can judge if it
/// feels correct.
[Throws=RelevancyApiError]
InterestVector user_interest_vector();
};
// Interest categories accepted by `score()` as `content_categories`.
// NOTE(review): the `Health` entry is commented out here, as is the `health` field of
// `InterestVector`; the reason is not visible in this file — confirm before re-enabling.
enum Interest {
"Animals",
"Arts",
"Autos",
"Business",
"Career",
"Education",
"Fashion",
"Finance",
"Food",
"Government",
// "Health",
"Hobbies",
"Home",
"News",
"RealEstate",
"Society",
"Sports",
"Tech",
"Travel",
"Inconclusive",
};
/// Interest metrics that we want to send to Glean as part of the validation process. These contain
/// the cosine similarity when comparing the user's interest against various interest vectors that
/// consumers may use.
///
/// Cosine similarity was chosen because it seems easy to calculate. This was then matched against
/// some semi-plausible real-world interest vectors that consumers might use. This is all up for
/// debate and we may decide to switch to some other metrics.
///
/// Similarity values are transformed to integers by multiplying the floating point value by 1000 and
/// rounding. This is to make them compatible with Glean's distribution metrics.
dictionary InterestMetrics {
/// Similarity between the user's interest vector and an interest vector where the element for
/// the user's top interest is copied, but all other interests are set to zero. This measures
/// the highest possible similarity with consumers that used interest vectors with a single
/// interest set.
u32 top_single_interest_similarity;
/// The same as before, but the top 2 interests are copied. This measures the highest possible
/// similarity with consumers that used interest vectors with two interests (note: this means
/// they would need to choose the user's top two interests and have the exact same proportion
/// between them as the user).
u32 top_2interest_similarity;
/// The same as before, but the top 3 interests are copied.
u32 top_3interest_similarity;
};
/// Vector storing a count value for each interest
///
/// Here "vector" refers to the mathematical object, not a Rust `Vec`. It always has a fixed
/// number of elements.
dictionary InterestVector {
u32 animals;
u32 arts;
u32 autos;
u32 business;
u32 career;
u32 education;
u32 fashion;
u32 finance;
u32 food;
u32 government;
// NOTE(review): `health` is disabled here, matching the commented-out `Health` entry in the
// `Interest` enum; the reason is not visible in this file.
// u32 health;
u32 hobbies;
u32 home;
u32 news;
u32 real_estate;
u32 society;
u32 sports;
u32 tech;
u32 travel;
u32 inconclusive;
};