* Migrated taar locale recommender to use redis

* added a noop fixture loader module for tests

* Converted TAAR Collaborative recommender to use redis

* Dropped hybrid recommender

* Ported similarity recommender to use redis

* Ported ensemble and recommendation manager to use redis

* dropped LazyJSONLoader

* dropped moto dependency

* Renamed AddonsCoinstallCache to TAARCache

* renamed bin/taarlite-redis to bin/taar-redis

* Execute data preprocess step on redis ptr change

* bumped to 0.7.4
This commit is contained in:
Victor Ng 2020-09-01 21:39:09 -04:00 committed by GitHub
Parent 9773053739
Commit b33a1b684c
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
28 changed files: 1378 additions and 1528 deletions
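
Taken together, these changes move every recommender's model data behind a single Redis-backed cache with a double-buffered layout: db0 holds bookkeeping (the ACTIVE_DB pointer and the UPDATE_CHECK mutex) while db1 and db2 alternate as the live data store. A minimal sketch of that pattern, with simplified names and hypothetical wiring (the real implementation lives in taar/recommenders/redis_cache.py below):

import json
import redis

ACTIVE_DB = "active_db"  # bookkeeping key in db0 naming the live data db (1 or 2)

class DoubleBufferedCache:
    def __init__(self, host="localhost", port=6379):
        self._r0 = redis.StrictRedis(host, port, db=0)  # bookkeeping
        self._r1 = redis.StrictRedis(host, port, db=1)  # data buffer A
        self._r2 = redis.StrictRedis(host, port, db=2)  # data buffer B
        self._last_db = None

    def _db(self):
        # Return the live data db; rerun preprocessing when the pointer moves.
        db_num = int(self._r0.get(ACTIVE_DB) or 1)
        live = self._r1 if db_num == 1 else self._r2
        if db_num != self._last_db:
            self._last_db = db_num  # the "data preprocess step on redis ptr change"
        return live

    def load(self, payload):
        # Write into the inactive buffer, then flip the pointer for readers.
        next_num = 2 if self._last_db == 1 else 1
        target = self._r1 if next_num == 1 else self._r2
        target.flushdb()
        for key, value in payload.items():
            target.set(key, json.dumps(value))
        self._r0.set(ACTIVE_DB, next_num)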

bin/pipstrap.py Executable file → Normal file

@ -28,6 +28,7 @@ from shutil import rmtree
from subprocess import check_output
from sys import exit
from tempfile import mkdtemp
try:
from urllib2 import build_opener, HTTPHandler, HTTPSHandler
except ImportError:
@ -40,26 +41,34 @@ except ImportError:
PACKAGES = [
# Pip has no dependencies, as it vendors everything:
('https://pypi.python.org/packages/source/p/pip/pip-8.0.2.tar.gz',
'46f4bd0d8dfd51125a554568d646fe4200a3c2c6c36b9f2d06d2212148439521'),
(
"https://pypi.python.org/packages/source/p/pip/pip-8.0.2.tar.gz",
"46f4bd0d8dfd51125a554568d646fe4200a3c2c6c36b9f2d06d2212148439521",
),
# This version of setuptools has only optional dependencies:
('https://pypi.python.org/packages/source/s/setuptools/'
'setuptools-19.4.tar.gz',
'214bf29933f47cf25e6faa569f710731728a07a19cae91ea64f826051f68a8cf'),
(
"https://pypi.python.org/packages/source/s/setuptools/"
"setuptools-19.4.tar.gz",
"214bf29933f47cf25e6faa569f710731728a07a19cae91ea64f826051f68a8cf",
),
# We require Python 2.7 or later because we don't support wheel's
# conditional dep on argparse. This version of wheel has no other
# dependencies:
('https://pypi.python.org/packages/source/w/wheel/wheel-0.26.0.tar.gz',
'eaad353805c180a47545a256e6508835b65a8e830ba1093ed8162f19a50a530c')
(
"https://pypi.python.org/packages/source/w/wheel/wheel-0.26.0.tar.gz",
"eaad353805c180a47545a256e6508835b65a8e830ba1093ed8162f19a50a530c",
),
]
class HashError(Exception):
def __str__(self):
url, path, actual, expected = self.args
return ('{url} did not match the expected hash {expected}. Instead, '
'it was {actual}. The file (left at {path}) may have been '
'tampered with.'.format(**locals()))
return (
"{url} did not match the expected hash {expected}. Instead, "
"it was {actual}. The file (left at {path}) may have been "
"tampered with.".format(**locals())
)
def hashed_download(url, temp, digest):
@ -82,9 +91,9 @@ def hashed_download(url, temp, digest):
yield chunk
response = opener().open(url)
path = join(temp, urlparse(url).path.split('/')[-1])
path = join(temp, urlparse(url).path.split("/")[-1])
actual_hash = sha256()
with open(path, 'wb') as file:
with open(path, "wb") as file:
for chunk in read_chunks(response, 4096):
file.write(chunk)
actual_hash.update(chunk)
@ -96,13 +105,14 @@ def hashed_download(url, temp, digest):
def main():
temp = mkdtemp(prefix='pipstrap-')
temp = mkdtemp(prefix="pipstrap-")
try:
downloads = [hashed_download(url, temp, digest)
for url, digest in PACKAGES]
check_output('pip install --no-index --no-deps -U ' +
' '.join(quote(d) for d in downloads),
shell=True)
downloads = [hashed_download(url, temp, digest) for url, digest in PACKAGES]
check_output(
"pip install --no-index --no-deps -U "
+ " ".join(quote(d) for d in downloads),
shell=True,
)
except HashError as exc:
print(exc)
except Exception:
@ -114,5 +124,5 @@ def main():
return 1
if __name__ == '__main__':
if __name__ == "__main__":
exit(main())

bin/taarlite-redis.py → bin/taar-redis.py

@ -1,6 +1,6 @@
#!/usr/bin/env python
from taar.recommenders.redis_cache import AddonsCoinstallCache
from taar.recommenders.redis_cache import TAARCache
from taar.context import default_context
import click
@ -11,7 +11,7 @@ import click
@click.option("--info", is_flag=True, help="Display information about the cache state")
def main(reset, load, info):
"""
Manage the TAARLite redis cache.
Manage the TAAR+TAARLite redis cache.
This expects that the following environment variables are set:
@ -23,7 +23,7 @@ def main(reset, load, info):
return
ctx = default_context()
cache = AddonsCoinstallCache(ctx)
cache = TAARCache.get_instance(ctx)
if reset:
if cache.reset():
print("Successfully flushed db0 bookkeeping database.")

environment.yml

@ -83,7 +83,6 @@ dependencies:
- markus[datadog]==2.2.0
- mock==2.0.0
- more-itertools==4.2.0
- moto==1.3.14
- mozilla-srgutil==0.1.7
- mozilla-jsoncache==0.1.7
- networkx==2.4

setup.py

@ -3,7 +3,7 @@ from setuptools import find_packages, setup
setup(
name="mozilla-taar3",
use_scm_version=False,
version="0.7.3",
version="0.7.4",
setup_requires=["setuptools_scm", "pytest-runner"],
tests_require=["pytest"],
include_package_data=True,
@ -29,6 +29,6 @@ setup(
[taarapi_app]
app=taar.plugin:configure_plugin
""",
scripts=["bin/taarlite-redis.py"],
scripts=["bin/taar-redis.py"],
zip_safe=False,
)

taar/__init__.py

@ -1,4 +1,4 @@
from .profile_fetcher import ProfileFetcher # noqa
from .profile_fetcher import ProfileFetcher # noqa
import pkg_resources
__version__ = pkg_resources.require("mozilla-taar3")[0].version

taar/plugin.py

@ -20,8 +20,7 @@ PLUGIN = config("TAAR_API_PLUGIN", default=None)
sentry_sdk.init(
dsn=config("SENTRY_DSN", ''),
integrations=[FlaskIntegration()],
dsn=config("SENTRY_DSN", ""), integrations=[FlaskIntegration()],
)
# There should only be a single registered app for the taar-api

taar/profile_fetcher.py

@ -132,9 +132,7 @@ class ProfileFetcher:
"locale": profile_data.get("locale", ""),
"os": profile_data.get("os", ""),
"installed_addons": addon_ids,
"disabled_addons_ids": profile_data.get(
"disabled_addons_ids", []
),
"disabled_addons_ids": profile_data.get("disabled_addons_ids", []),
"bookmark_count": profile_data.get("places_bookmarks_count", 0),
"tab_open_count": profile_data.get(
"scalar_parent_browser_engagement_tab_open_event_count", 0

taar/recommenders/collaborative_recommender.py

@ -3,38 +3,18 @@
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
from srgutil.interfaces import IMozLogging
from .lazys3 import LazyJSONLoader
import numpy as np
import operator as op
import functools
import threading
from .base_recommender import AbstractRecommender
from taar.settings import (
TAAR_ITEM_MATRIX_BUCKET,
TAAR_ITEM_MATRIX_KEY,
TAAR_ADDON_MAPPING_BUCKET,
TAAR_ADDON_MAPPING_KEY,
)
from taar.recommenders.redis_cache import TAARCache
import markus
metrics = markus.get_metrics("taar")
def synchronized(wrapped):
""" Synchronization decorator. """
@functools.wraps(wrapped)
def wrapper(*args, **kwargs):
self = args[0]
with self._lock:
return wrapped(*args, **kwargs)
return wrapper
def java_string_hashcode(s):
h = 0
for c in s:
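
The hunk above elides the body of java_string_hashcode; for reference, a standard Python rendering of Java's String.hashCode (a sketch, assuming the usual 31*h + ord(c) recurrence wrapped to a signed 32-bit int):

def java_string_hashcode(s):
    h = 0
    for c in s:
        h = (31 * h + ord(c)) & 0xFFFFFFFF  # accumulate, clamp to 32 bits
    return ((h + 0x80000000) & 0xFFFFFFFF) - 0x80000000  # reinterpret as signed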
@ -58,31 +38,20 @@ class CollaborativeRecommender(AbstractRecommender):
def __init__(self, ctx):
self._ctx = ctx
self._lock = threading.RLock()
self._addon_mapping = LazyJSONLoader(
self._ctx,
TAAR_ADDON_MAPPING_BUCKET,
TAAR_ADDON_MAPPING_KEY,
"addon_mapping",
)
self._raw_item_matrix = LazyJSONLoader(
self._ctx, TAAR_ITEM_MATRIX_BUCKET, TAAR_ITEM_MATRIX_KEY, "item_matrix",
)
self.logger = self._ctx[IMozLogging].get_logger("taar")
self._redis_cache = TAARCache.get_instance(self._ctx)
self.model = None
@property
def addon_mapping(self):
return self._addon_mapping.get()[0]
return self._redis_cache.collab_addon_mapping()
@property
def raw_item_matrix(self):
val, new_copy = self._raw_item_matrix.get()
if val is not None and new_copy:
val = self._redis_cache.collab_raw_item_matrix()
if val not in (None, ""):
# Build a dense numpy matrix out of it.
num_rows = len(val)
num_cols = len(val[0]["features"])
@ -90,27 +59,10 @@ class CollaborativeRecommender(AbstractRecommender):
self.model = np.zeros(shape=(num_rows, num_cols))
for index, row in enumerate(val):
self.model[index, :] = row["features"]
elif val is None and new_copy:
else:
self.model = None
return val
def _load_json_models(self):
# Download the addon mappings.
if self.addon_mapping is None:
self.logger.error(
"Cannot download the addon mapping file {} {}".format(
TAAR_ADDON_MAPPING_BUCKET, TAAR_ADDON_MAPPING_KEY
)
)
if self.addon_mapping is None:
self.logger.error(
"Cannot download the model file {} {}".format(
TAAR_ITEM_MATRIX_BUCKET, TAAR_ITEM_MATRIX_KEY
)
)
@synchronized
def can_recommend(self, client_data, extra_data={}):
# We can't recommend if we don't have our data files.
if (
@ -178,22 +130,18 @@ class CollaborativeRecommender(AbstractRecommender):
@metrics.timer_decorator("collaborative_recommend")
def recommend(self, client_data, limit, extra_data={}):
# Addons identifiers are stored as positive hash values within the model.
with self._lock:
try:
recommendations = self._recommend(client_data, limit, extra_data)
except Exception as e:
recommendations = []
try:
recommendations = self._recommend(client_data, limit, extra_data)
except Exception as e:
recommendations = []
self._addon_mapping.force_expiry()
self._raw_item_matrix.force_expiry()
metrics.incr("error_collaborative", value=1)
self.logger.exception(
"Collaborative recommender crashed for {}".format(
client_data.get("client_id", "no-client-id")
),
e,
)
metrics.incr("error_collaborative", value=1)
self.logger.exception(
"Collaborative recommender crashed for {}".format(
client_data.get("client_id", "no-client-id")
),
e,
)
log_data = (
client_data["client_id"],

taar/recommenders/ensemble_recommender.py

@ -5,16 +5,9 @@
from srgutil.interfaces import IMozLogging
import itertools
from .base_recommender import AbstractRecommender
from .lazys3 import LazyJSONLoader
from taar.settings import (
TAAR_WHITELIST_BUCKET,
TAAR_WHITELIST_KEY,
TAAR_ENSEMBLE_BUCKET,
TAAR_ENSEMBLE_KEY,
)
from taar.utils import hasher
from taar.recommenders.redis_cache import TAARCache
import markus
@ -27,18 +20,6 @@ def is_test_client(client_id):
return len(set(client_id.replace("-", ""))) == 1
class WeightCache:
def __init__(self, ctx):
self._ctx = ctx
self._weights = LazyJSONLoader(
self._ctx, TAAR_ENSEMBLE_BUCKET, TAAR_ENSEMBLE_KEY, "ensemble"
)
def getWeights(self):
return self._weights.get()[0]["ensemble_weights"]
class EnsembleRecommender(AbstractRecommender):
"""
The EnsembleRecommender is a collection of recommenders where the
@ -50,12 +31,17 @@ class EnsembleRecommender(AbstractRecommender):
def __init__(self, ctx):
self.RECOMMENDER_KEYS = ["collaborative", "similarity", "locale"]
self._ctx = ctx
self._redis_cache = TAARCache.get_instance(self._ctx)
self.logger = self._ctx[IMozLogging].get_logger("taar.ensemble")
assert "recommender_factory" in self._ctx
self._init_from_ctx()
def getWeights(self):
return self._redis_cache.ensemble_weights()
def _init_from_ctx(self):
# Copy the map of the recommenders
self._recommender_map = {}
@ -64,11 +50,6 @@ class EnsembleRecommender(AbstractRecommender):
for rkey in self.RECOMMENDER_KEYS:
self._recommender_map[rkey] = recommender_factory.create(rkey)
self._whitelist_data = LazyJSONLoader(
self._ctx, TAAR_WHITELIST_BUCKET, TAAR_WHITELIST_KEY, "whitelist"
)
self._weight_cache = WeightCache(self._ctx.child())
self.logger.info("EnsembleRecommender initialized")
def can_recommend(self, client_data, extra_data={}):
@ -88,7 +69,7 @@ class EnsembleRecommender(AbstractRecommender):
client_id = client_data.get("client_id", "no-client-id")
if is_test_client(client_id):
whitelist = self._whitelist_data.get()[0]
whitelist = self._redis_cache.whitelist_data()
samples = whitelist[:limit]
self.logger.info("Test ID detected [{}]".format(client_id))
@ -102,7 +83,6 @@ class EnsembleRecommender(AbstractRecommender):
results = self._recommend(client_data, limit, extra_data)
except Exception as e:
results = []
self._weight_cache._weights.force_expiry()
self.logger.exception(
"Ensemble recommender crashed for {}".format(client_id), e
)
@ -130,7 +110,7 @@ class EnsembleRecommender(AbstractRecommender):
extended_limit = limit + len(preinstalled_addon_ids)
flattened_results = []
ensemble_weights = self._weight_cache.getWeights()
ensemble_weights = self._redis_cache.ensemble_weights()
for rkey in self.RECOMMENDER_KEYS:
recommender = self._recommender_map[rkey]

taar/recommenders/guid_based_recommender.py

@ -10,7 +10,7 @@ from srgutil.interfaces import IMozLogging
import markus
from taar.recommenders.redis_cache import AddonsCoinstallCache
from taar.recommenders.redis_cache import TAARCache
metrics = markus.get_metrics("taar")
@ -76,7 +76,7 @@ class GuidBasedRecommender:
self._ctx = ctx
self.logger = self._ctx[IMozLogging].get_logger("taarlite")
self._redis_cache = AddonsCoinstallCache(self._ctx)
self._redis_cache = TAARCache.get_instance(self._ctx)
self.logger.info("GUIDBasedRecommender is initialized")
def cache_ready(self):

taar/recommenders/hybrid_recommender.py (deleted)

@ -1,178 +0,0 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
from .base_recommender import AbstractRecommender
from .lazys3 import LazyJSONLoader
from srgutil.interfaces import IMozLogging
import operator as op
import random
from taar.settings import TAAR_WHITELIST_BUCKET, TAAR_WHITELIST_KEY
import markus
metrics = markus.get_metrics("taar")
class CuratedWhitelistCache:
"""
This fetches the curated whitelist from S3.
"""
def __init__(self, ctx):
self._ctx = ctx
self._data = LazyJSONLoader(
self._ctx, TAAR_WHITELIST_BUCKET, TAAR_WHITELIST_KEY, "whitelist",
)
def get_whitelist(self):
return self._data.get()[0]
def get_randomized_guid_sample(self, item_count):
""" Fetch a subset of randomzied GUIDs from the whitelist """
dataset = self.get_whitelist()
random.shuffle(dataset)
return dataset[:item_count]
class CuratedRecommender(AbstractRecommender):
"""
The curated recommender just delegates to the whitelist
that is provided by the AMO team.
This recommender simply provides a randomized sample of
pre-approved addons for recommendation. It does not use any other
external data to generate recommendations, nor does it use any
information from the Firefox agent.
"""
def __init__(self, ctx):
self._ctx = ctx
self.logger = self._ctx[IMozLogging].get_logger("taar.curated")
self._curated_wl = CuratedWhitelistCache(self._ctx)
def can_recommend(self, client_data, extra_data={}):
"""The Curated recommender will always be able to recommend
something"""
self.logger.info("Curated can_recommend: {}".format(True))
return True
@metrics.timer_decorator("hybrid_recommend")
def recommend(self, client_data, limit, extra_data={}):
"""
Curated recommendations are just random selections
from the whitelist and we explicitly set the weighting to 1.0
"""
guids = self._curated_wl.get_randomized_guid_sample(limit)
results = [(guid, 1.0) for guid in guids]
log_data = (client_data["client_id"], str(guids))
self.logger.info(
"Curated recommendations client_id: [%s], guids: [%s]" % log_data
)
return results
class HybridRecommender(AbstractRecommender):
"""
The EnsembleRecommender is a collection of recommenders where the
results from each recommendation is amplified or dampened by a
factor. The aggregate results are combines and used to recommend
addons for users.
"""
def __init__(self, ctx):
self._ctx = ctx
self.logger = self._ctx[IMozLogging].get_logger("taar")
self._ensemble_recommender = self._ctx["ensemble_recommender"]
self._curated_recommender = CuratedRecommender(self._ctx.child())
def can_recommend(self, client_data, extra_data={}):
"""The ensemble recommender is always going to be
available if at least one recommender is available"""
ensemble_recommend = self._ensemble_recommender.can_recommend(
client_data, extra_data
)
curated_recommend = self._curated_recommender.can_recommend(
client_data, extra_data
)
result = ensemble_recommend and curated_recommend
self.logger.info("Hybrid can_recommend: {}".format(result))
return result
def recommend(self, client_data, limit, extra_data={}):
"""
Hybrid recommendations simply select half recommendations from
the ensemble recommender, and half from the curated one.
Duplicate recommendations are accomodated by rank ordering
by weight.
"""
preinstalled_addon_ids = client_data.get("installed_addons", [])
# Compute an extended limit by adding the length of
# the list of any preinstalled addons.
extended_limit = limit + len(preinstalled_addon_ids)
ensemble_suggestions = self._ensemble_recommender.recommend(
client_data, extended_limit, extra_data
)
curated_suggestions = self._curated_recommender.recommend(
client_data, extended_limit, extra_data
)
# Generate a set of results from each of the composite
# recommenders. We select one item from each recommender
# sequentially so that we do not bias one recommender over the
# other.
merged_results = set()
while (
len(merged_results) < limit
and len(ensemble_suggestions) > 0
and len(curated_suggestions) > 0
):
r1 = ensemble_suggestions.pop()
if r1[0] not in [temp[0] for temp in merged_results]:
merged_results.add(r1)
# Terminate early if we have an odd number for the limit
if not (
len(merged_results) < limit
and len(ensemble_suggestions) > 0
and len(curated_suggestions) > 0
):
break
r2 = curated_suggestions.pop()
if r2[0] not in [temp[0] for temp in merged_results]:
merged_results.add(r2)
if len(merged_results) < limit:
msg = (
"Defaulting to empty results. Insufficient recommendations found for client: %s"
% client_data["client_id"]
)
self.logger.info(msg)
return []
sorted_results = sorted(
list(merged_results), key=op.itemgetter(1), reverse=True
)
log_data = (
client_data["client_id"],
str([r[0] for r in sorted_results]),
)
self.logger.info(
"Hybrid recommendations client_id: [%s], guids: [%s]" % log_data
)
return sorted_results

taar/recommenders/lazys3.py (deleted)

@ -1,139 +0,0 @@
import boto3
from botocore.client import Config
from srgutil.interfaces import IMozLogging, IClock
import json
import threading
import time
import markus
metrics = markus.get_metrics("taar")
class LazyJSONLoader:
def __init__(self, ctx, s3_bucket, s3_key, metric_name="", ttl=14400):
self._ctx = ctx
self.logger = self._ctx[IMozLogging].get_logger("taar")
self._clock = self._ctx[IClock]
self._s3_bucket = s3_bucket
self._s3_key = s3_key
self._metric_name = metric_name
self._ttl = int(ttl)
self._expiry_time = 0
self._key_str = "{}|{}".format(self._s3_bucket, self._s3_key)
self._cached_copy = None
msg = "Cache expiry of {} is set to TTL of {} seconds".format(
self._key_str, self._ttl
)
self.logger.info(msg)
self._lock = threading.RLock()
self.logger.info("{} loader is initialized".format(self._key_str))
def force_expiry(self):
msg = "Existing model for {} reset to 0. Model was:".format(
self._key_str, str(self._cached_copy)
)
self.logger.info(msg)
self._expiry_time = 0
def has_expired(self):
return self._clock.time() > self._expiry_time
def get(self, transform=None):
"""
Return the JSON defined at the S3 location in the constructor.
The get method will reload the S3 object after the TTL has
expired.
Fetch the JSON object from cache or S3 if necessary
"""
if not self.has_expired() and self._cached_copy is not None:
return self._cached_copy, False
return self._refresh_cache(transform), True
def _refresh_cache(self, transform=None):
with self._lock:
# If some requests get stale data while the S3 bucket is
# being reloaded - it's not the end of the world.
#
# Likewise when the TTL expires, it's possible for
# multiple threads to concurrently lock and update the
# cache. Again - not world ending.
#
# Immediately update the expiry time as we don't want other
# threads to wait on the lock while we update the
# cached_copy
#
self._expiry_time = self._clock.time() + self._ttl
raw_data = None
raw_bytes = None
try:
# We need to force a data reload from S3
config = Config(connect_timeout=10, retries={"max_attempts": 3})
s3 = boto3.resource("s3", config=config)
start_load = time.time()
raw_bytes = (
s3.Object(self._s3_bucket, self._s3_key).get()["Body"].read()
)
end_load = time.time()
load_time = end_load - start_load
raw_data = raw_bytes.decode("utf-8")
msg = "Loaded S3: {}. Byte count: {:d}. Time to Load: {:0.3f}"
msg_params = self._key_str, len(raw_bytes), load_time
self.logger.info(msg.format(*msg_params))
# It is possible to have corrupted files in S3, so
# protect against that.
try:
tmp = json.loads(raw_data)
if transform is not None:
tmp = transform(tmp)
self._cached_copy = tmp
metrics.timing(
self._metric_name,
value=load_time * 1000,
tags=[
f"store:s3",
f"bucket:{self._s3_bucket}",
f"key:{self._s3_key}",
],
)
except ValueError:
# In the event of an error, we want to try to reload
# the data so force the expiry to 0, but leave the
# existing cached data alone so we can still service
# requests.
self._expiry_time = 0
self.logger.error(
"Cannot parse JSON resource from S3",
extra={"bucket": self._s3_bucket, "key": self._s3_key},
)
return self._cached_copy
except Exception:
# In the event of an error, we want to try to reload
# the data so force the expiry to 0, but leave the
# existing cached data alone so we can still service
# requests.
self._expiry_time = 0
self.logger.exception(
"Failed to download from S3",
extra={"bucket": self._s3_bucket, "key": self._s3_key},
)
return self._cached_copy

taar/recommenders/locale_recommender.py

@ -2,14 +2,13 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
from srgutil.interfaces import IMozLogging
from .base_recommender import AbstractRecommender
from .lazys3 import LazyJSONLoader
from taar.settings import TAAR_LOCALE_BUCKET, TAAR_LOCALE_KEY
import markus
from srgutil.interfaces import IMozLogging
from .base_recommender import AbstractRecommender
from taar.recommenders.redis_cache import TAARCache
metrics = markus.get_metrics("taar")
@ -29,27 +28,12 @@ class LocaleRecommender(AbstractRecommender):
self.logger = self._ctx[IMozLogging].get_logger("taar")
self._top_addons_per_locale = LazyJSONLoader(
self._ctx, TAAR_LOCALE_BUCKET, TAAR_LOCALE_KEY, "locale"
)
self._init_from_ctx()
self._redis_cache = TAARCache.get_instance(self._ctx)
# DONE removed
@property
def top_addons_per_locale(self):
def presort_locale(data):
result = {}
for locale, guid_list in data.items():
result[locale] = sorted(guid_list, key=lambda x: x[1], reverse=True)
return result
return self._top_addons_per_locale.get(transform=presort_locale)[0]
def _init_from_ctx(self):
if self.top_addons_per_locale is None:
self.logger.error(
"Cannot download the top per locale file {}".format(TAAR_LOCALE_KEY)
)
return self._redis_cache.top_addons_per_locale()
def can_recommend(self, client_data, extra_data={}):
# We can't recommend if we don't have our data files.

taar/recommenders/recommendation_manager.py

@ -8,13 +8,9 @@ from taar.recommenders.ensemble_recommender import (
)
from taar.recommenders.randomizer import in_experiment, reorder_guids
from srgutil.interfaces import IMozLogging
from .lazys3 import LazyJSONLoader
from taar.recommenders.redis_cache import TAARCache
from taar.settings import (
TAAR_WHITELIST_BUCKET,
TAAR_WHITELIST_KEY,
TAAR_EXPERIMENT_PROB,
)
from taar.settings import TAAR_EXPERIMENT_PROB
import markus
@ -61,9 +57,7 @@ class RecommendationManager:
# The whitelist data is only used for test client IDs
self._whitelist_data = LazyJSONLoader(
self._ctx, TAAR_WHITELIST_BUCKET, TAAR_WHITELIST_KEY, "whitelist"
)
self._redis_cache = TAARCache.get_instance(self._ctx)
self._experiment_prob = ctx.get("TAAR_EXPERIMENT_PROB", TAAR_EXPERIMENT_PROB)
@ -98,7 +92,7 @@ class RecommendationManager:
# Fetch back all possible whitelisted addons for this
# client
extra_data["guid_randomization"] = True
whitelist = self._whitelist_data.get()[0]
whitelist = self._redis_cache.whitelist_data()
results = self._ensemble_recommender.recommend(
client_info, len(whitelist), extra_data
)

taar/recommenders/redis_cache.py

@ -8,39 +8,103 @@ import threading
import redis
import numpy as np
from srgutil.interfaces import IMozLogging
from taar.settings import (
REDIS_HOST,
REDIS_PORT,
)
# TAARLite configuration
from taar.settings import (
TAARLITE_GUID_COINSTALL_BUCKET,
TAARLITE_GUID_COINSTALL_KEY,
TAARLITE_GUID_RANKING_KEY,
TAARLITE_TTL,
TAARLITE_TRUNCATE,
TAARLITE_MUTEX_TTL,
)
# TAAR configuration
from taar.settings import (
# Locale
TAAR_LOCALE_BUCKET,
TAAR_LOCALE_KEY,
# Collaborative data
TAAR_ADDON_MAPPING_BUCKET,
TAAR_ADDON_MAPPING_KEY,
TAAR_ITEM_MATRIX_BUCKET,
TAAR_ITEM_MATRIX_KEY,
# Similarity data
TAAR_SIMILARITY_BUCKET,
TAAR_SIMILARITY_DONOR_KEY,
TAAR_SIMILARITY_LRCURVES_KEY,
# Ensemble data
TAAR_ENSEMBLE_BUCKET,
TAAR_ENSEMBLE_KEY,
# Whitelist data
TAAR_WHITELIST_BUCKET,
TAAR_WHITELIST_KEY,
)
from jsoncache.loader import s3_json_loader
# This marks which of the redis databases is currently
# active for read
ACTIVE_DB = "active_db"
# This is a mutex to block multiple writers from redis
UPDATE_CHECK = "update_mutex|"
# taarlite guid guid coinstallation matrix
COINSTALL_PREFIX = "coinstall|"
# taarlite guid guid coinstallation matrix filtered by
# minimum installation thresholds
FILTERED_COINSTALL_PREFIX = "filtered_coinstall|"
# taarlite ranking data
RANKING_PREFIX = "ranking|"
# taarlite minimum installation threshold
MIN_INSTALLS_PREFIX = "min_installs|"
# This is a map is guid->sum of coinstall counts
# taarlite map of guid->(sum of coinstall counts)
NORMDATA_COUNT_MAP_PREFIX = "normdata_count_map_prefix|"
# Capture the number of times a GUID shows up per row
# taarlite number of times a GUID shows up per row
# of coinstallation data.
NORMDATA_ROWCOUNT_PREFIX = "normdata_rowcount_prefix|"
# taarlite row normalization data
NORMDATA_GUID_ROW_NORM_PREFIX = "normdata_guid_row_norm_prefix|"
# TAAR: Locale data
LOCALE_DATA = "taar_locale_data|"
# TAAR: collaborative data
COLLAB_MAPPING_DATA = "taar_collab_mapping|"
COLLAB_ITEM_MATRIX = "taar_collab_item_matrix|"
# TAAR: similarity data
SIMILARITY_DONORS = "taar_similarity_donors|"
SIMILARITY_LRCURVES = "taar_similarity_lrcurves|"
# TAAR: similarity preprocessed data
SIMILARITY_NUM_DONORS = "taar_similarity_num_donors|"
SIMILARITY_CONTINUOUS_FEATURES = "taar_similarity_continuous_features|"
SIMILARITY_CATEGORICAL_FEATURES = "taar_similarity_categorical_features|"
# TAAR: ensemble weights
ENSEMBLE_WEIGHTS = "taar_ensemble_weights|"
# TAAR: whitelist data
WHITELIST_DATA = "taar_whitelist_data|"
class PrefixStripper:
def __init__(self, prefix, iterator, cast_to_str=False):
self._prefix = prefix
@ -58,19 +122,42 @@ class PrefixStripper:
return result
class AddonsCoinstallCache:
class TAARCache:
"""
This class manages a redis instance to hold onto the taar-lite
GUID->GUID co-installation data
"""
def __init__(self, ctx, ttl=TAARLITE_TTL):
_instance = None
@classmethod
def get_instance(cls, ctx):
if cls._instance is None:
cls._instance = TAARCache(ctx, i_didnt_read_the_docs=False)
return cls._instance
def __init__(self, ctx, i_didnt_read_the_docs=True):
"""
Don't call this directly - use get_instance instead
"""
if i_didnt_read_the_docs:
raise RuntimeError(
"You cannot call this method directly - use get_instance"
)
self._ctx = ctx
self._last_db = None
self.logger = self._ctx[IMozLogging].get_logger("taar")
self._ttl = ttl
# Keep an integer handle (or None) on the last known database
self._last_db = None
self._similarity_num_donors = 0
self._similarity_continuous_features = None
self._similarity_categorical_features = None
rcon = self.init_redis_connections()
self._r0 = rcon[0]
self._r1 = rcon[1]
self._r2 = rcon[2]
@ -136,9 +223,6 @@ class AddonsCoinstallCache:
self._r0.delete(UPDATE_CHECK)
self.logger.info("UPDATE_CHECK field is cleared")
def fetch_ranking_data(self):
return s3_json_loader(TAARLITE_GUID_COINSTALL_BUCKET, TAARLITE_GUID_RANKING_KEY)
def guid_maps_count_map(self, guid, default=None):
tmp = self._db().get(NORMDATA_COUNT_MAP_PREFIX + guid)
if tmp:
@ -167,11 +251,6 @@ class AddonsCoinstallCache:
return 0
return float(result.decode("utf8"))
def fetch_coinstall_data(self):
return s3_json_loader(
TAARLITE_GUID_COINSTALL_BUCKET, TAARLITE_GUID_COINSTALL_KEY
)
def get_filtered_coinstall(self, guid, default=None):
tmp = self._db().get(FILTERED_COINSTALL_PREFIX + guid)
if tmp:
@ -224,7 +303,92 @@ class AddonsCoinstallCache:
# Any value in ACTIVE_DB indicates that data is live
return self._r0.get(ACTIVE_DB) is not None
# Private methods below
def top_addons_per_locale(self):
"""
Get locale data
"""
tmp = self._db().get(LOCALE_DATA)
if tmp:
return json.loads(tmp.decode("utf8"))
return None
def collab_raw_item_matrix(self):
"""
Get the taar collaborative item matrix
"""
tmp = self._db().get(COLLAB_ITEM_MATRIX)
if tmp:
return json.loads(tmp.decode("utf8"))
return None
def collab_addon_mapping(self):
"""
Get the taar collaborative addon mapping
"""
tmp = self._db().get(COLLAB_MAPPING_DATA)
if tmp:
return json.loads(tmp.decode("utf8"))
return None
def similarity_donors(self):
"""
Get the taar similarity donors
"""
tmp = self._db().get(SIMILARITY_DONORS)
if tmp:
return json.loads(tmp.decode("utf8"))
return None
def similarity_lrcurves(self):
"""
Get the taar similarity lrcurves
"""
tmp = self._db().get(SIMILARITY_LRCURVES)
if tmp:
return json.loads(tmp.decode("utf8"))
return None
def similarity_continuous_features(self):
"""
precomputed similarity recommender continuous features cache
"""
_ = self._db() # make sure we've computed data from the live redis instance
return self._similarity_continuous_features
def similarity_categorical_features(self):
"""
precomputed similarity recommender categorical features cache
"""
_ = self._db() # make sure we've computed data from the live redis instance
return self._similarity_categorical_features
@property
def similarity_num_donors(self):
"""
precomputed similarity recommender donor count
"""
_ = self._db() # make sure we've computed data from the live redis instance
return self._similarity_num_donors
def ensemble_weights(self):
tmp = self._db().get(ENSEMBLE_WEIGHTS)
if tmp:
return json.loads(tmp)
return None
def whitelist_data(self):
tmp = self._db().get(WHITELIST_DATA)
if tmp:
return json.loads(tmp)
return None
"""
################################
Private methods below
"""
def _db(self):
"""
@ -232,21 +396,166 @@ class AddonsCoinstallCache:
active redis instance
"""
active_db = self._r0.get(ACTIVE_DB)
if active_db is not None:
db = int(active_db.decode("utf8"))
if db == 1:
return self._r1
# Run all callback functions to preprocess model data
live_db = self._r1
elif db == 2:
return self._r2
live_db = self._r2
self._update_data_callback(db, live_db)
return live_db
def _update_data_callback(self, db_num, db):
"""
Preprocess data when the current redis instance does not match
the last known instance.
"""
if db_num == self._last_db:
return
self._last_db = db_num
self._build_similarity_features_caches(db)
self.logger.info("Completed precomputing normalized data")
def _build_similarity_features_caches(self, db):
"""
This function builds two feature cache matrices and sets the
number of donors (self.similarity_num_donors)
That's the self.categorical_features and
self.continuous_features attributes.
One matrix is for the continuous features and the other is for
the categorical features. This is needed to speed up the similarity
recommendation process."""
from taar.recommenders.similarity_recommender import (
CONTINUOUS_FEATURES,
CATEGORICAL_FEATURES,
)
tmp = db.get(SIMILARITY_DONORS)
if tmp is None:
return
donors_pool = json.loads(tmp.decode("utf8"))
self._similarity_num_donors = len(donors_pool)
# Build a numpy matrix cache for the continuous features.
continuous_features = np.zeros(
(self.similarity_num_donors, len(CONTINUOUS_FEATURES))
)
for idx, d in enumerate(donors_pool):
features = [d.get(specified_key) for specified_key in CONTINUOUS_FEATURES]
continuous_features[idx] = features
self._similarity_continuous_features = continuous_features
# Build the cache for categorical features.
categorical_features = np.zeros(
(self.similarity_num_donors, len(CATEGORICAL_FEATURES)), dtype="object",
)
for idx, d in enumerate(donors_pool):
features = [d.get(specified_key) for specified_key in CATEGORICAL_FEATURES]
categorical_features[idx] = np.array([features], dtype="object")
self._similarity_categorical_features = categorical_features
self.logger.info("Reconstructed matrices for similarity recommender")
@property
def _ident(self):
""" pid/thread identity """
return f"{os.getpid()}_{threading.get_ident()}"
def _update_coinstall_data(self, db):
def _fetch_coinstall_data(self):
return s3_json_loader(
TAARLITE_GUID_COINSTALL_BUCKET, TAARLITE_GUID_COINSTALL_KEY
)
data = self.fetch_coinstall_data()
def _fetch_ranking_data(self):
return s3_json_loader(TAARLITE_GUID_COINSTALL_BUCKET, TAARLITE_GUID_RANKING_KEY)
def _fetch_locale_data(self):
return s3_json_loader(TAAR_LOCALE_BUCKET, TAAR_LOCALE_KEY)
def _fetch_collaborative_mapping_data(self):
return s3_json_loader(TAAR_ADDON_MAPPING_BUCKET, TAAR_ADDON_MAPPING_KEY)
def _fetch_collaborative_item_matrix(self):
return s3_json_loader(TAAR_ITEM_MATRIX_BUCKET, TAAR_ITEM_MATRIX_KEY)
def _fetch_similarity_donors(self):
return s3_json_loader(TAAR_SIMILARITY_BUCKET, TAAR_SIMILARITY_DONOR_KEY,)
def _fetch_similarity_lrcurves(self):
return s3_json_loader(TAAR_SIMILARITY_BUCKET, TAAR_SIMILARITY_LRCURVES_KEY,)
def _fetch_ensemble_weights(self):
return s3_json_loader(TAAR_ENSEMBLE_BUCKET, TAAR_ENSEMBLE_KEY)
def _fetch_whitelist(self):
return s3_json_loader(TAAR_WHITELIST_BUCKET, TAAR_WHITELIST_KEY)
def _update_whitelist_data(self, db):
"""
Load the TAAR whitelist data
"""
tmp = self._fetch_whitelist()
if tmp:
db.set(WHITELIST_DATA, json.dumps(tmp))
def _update_ensemble_data(self, db):
"""
Load the TAAR ensemble data
"""
tmp = self._fetch_ensemble_weights()
if tmp:
db.set(ENSEMBLE_WEIGHTS, json.dumps(tmp["ensemble_weights"]))
def _update_similarity_data(self, db):
"""
Load the TAAR similarity data
"""
donors = self._fetch_similarity_donors()
lrcurves = self._fetch_similarity_lrcurves()
db.set(SIMILARITY_DONORS, json.dumps(donors))
db.set(SIMILARITY_LRCURVES, json.dumps(lrcurves))
def _update_collab_data(self, db):
"""
Load the TAAR collaborative data. This is two parts: an item
matrix and a mapping of GUIDs
"""
# Load the item matrix into redis
item_matrix = self._fetch_collaborative_item_matrix()
db.set(COLLAB_ITEM_MATRIX, json.dumps(item_matrix))
# Load the taar collaborative mapping data
mapping_data = self._fetch_collaborative_mapping_data()
db.set(COLLAB_MAPPING_DATA, json.dumps(mapping_data))
def _update_locale_data(self, db):
"""
Load the TAAR locale data
"""
data = self._fetch_locale_data()
result = {}
for locale, guid_list in data.items():
result[locale] = sorted(guid_list, key=lambda x: x[1], reverse=True)
db.set(LOCALE_DATA, json.dumps(result))
def _update_coinstall_data(self, db):
"""
Load the TAAR Lite GUID GUID coinstallation data
"""
data = self._fetch_coinstall_data()
items = data.items()
len_items = len(items)
@ -302,7 +611,7 @@ class AddonsCoinstallCache:
def _update_rank_data(self, db):
data = self.fetch_ranking_data()
data = self._fetch_ranking_data()
items = data.items()
len_items = len(items)
@ -330,10 +639,6 @@ class AddonsCoinstallCache:
self._copy_data(next_active_db)
self.logger.info("Completed precomputing normalized data")
# TODO: should this autoexpire to help indicate that no fresh
# data has loaded? Maybe N * update TTL time?
self._r0.set(ACTIVE_DB, next_active_db)
self.logger.info(f"Active DB is set to {next_active_db}")
@ -345,5 +650,22 @@ class AddonsCoinstallCache:
# Clear this database before we do anything with it
db.flushdb()
# Update TAARlite
self._update_rank_data(db)
self._update_coinstall_data(db)
# Update TAAR locale data
self._update_locale_data(db)
# Update TAAR collaborative data
self._update_collab_data(db)
# Update TAAR similarity data
self._update_similarity_data(db)
# Update TAAR ensemble data
self._update_ensemble_data(db)
# Update TAAR whitelist data
self._update_whitelist_data(db)
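
A reader-side sketch of what this buys consumers: every recommender shares one TAARCache via get_instance, reads come from whichever data db is live, and the expensive numpy reconstruction reruns only when ACTIVE_DB flips. Method names below are from this diff; the surrounding wiring is simplified:

from taar.context import default_context
from taar.recommenders.redis_cache import TAARCache

ctx = default_context()
cache = TAARCache.get_instance(ctx)  # singleton; __init__ raises if called directly

weights = cache.ensemble_weights()         # JSON blob read from the live data db
num_donors = cache.similarity_num_donors   # precomputed on redis pointer change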

taar/recommenders/similarity_recommender.py

@ -7,13 +7,7 @@ from itertools import groupby
from scipy.spatial import distance
from srgutil.interfaces import IMozLogging
import numpy as np
from .lazys3 import LazyJSONLoader
from taar.settings import (
TAAR_SIMILARITY_BUCKET,
TAAR_SIMILARITY_DONOR_KEY,
TAAR_SIMILARITY_LRCURVES_KEY,
)
from taar.recommenders.redis_cache import TAARCache
import markus
@ -52,99 +46,29 @@ class SimilarityRecommender(AbstractRecommender):
def __init__(self, ctx):
self._ctx = ctx
if "similarity_donors_pool" in self._ctx:
self._donors_pool = self._ctx["similarity_donors_pool"]
else:
self._donors_pool = LazyJSONLoader(
self._ctx,
TAAR_SIMILARITY_BUCKET,
TAAR_SIMILARITY_DONOR_KEY,
"similarity_donor",
)
if "similarity_lr_curves" in self._ctx:
self._lr_curves = self._ctx["similarity_lr_curves"]
else:
self._lr_curves = LazyJSONLoader(
self._ctx,
TAAR_SIMILARITY_BUCKET,
TAAR_SIMILARITY_LRCURVES_KEY,
"similarity_curves",
)
self._redis_cache = TAARCache.get_instance(self._ctx)
self.logger = self._ctx[IMozLogging].get_logger("taar")
self._init_from_ctx()
@property
def categorical_features(self):
return self._redis_cache.similarity_categorical_features()
@property
def continuous_features(self):
return self._redis_cache.similarity_continuous_features()
@property
def num_donors(self):
return self._redis_cache.similarity_num_donors
@property
def donors_pool(self):
result, status = self._donors_pool.get()
if status:
# Force a reconstruction of the features cache on new
# donor pool data
self._build_features_caches()
return result
return self._redis_cache.similarity_donors()
@property
def lr_curves(self):
result, status = self._lr_curves.get()
if status:
# Force a reconstruction of the features cache on new
# curve data
self._build_features_caches()
return result
def _init_from_ctx(self):
# Download the addon donors list.
if self.donors_pool is None:
self.logger.info(
"Similarity donors pool has not been fetched from S3: {}".format(
TAAR_SIMILARITY_DONOR_KEY
)
)
# Download the probability mapping curves from similarity to likelihood of being a good donor.
if self.lr_curves is None:
self.logger.error(
"Similarity LR Curves have not been fetched from S3: {}".format(
TAAR_SIMILARITY_LRCURVES_KEY
)
)
def _build_features_caches(self):
"""This function build two feature cache matrices.
That's the self.categorical_features and
self.continuous_features attributes.
One matrix is for the continuous features and the other is for
the categorical features. This is needed to speed up the similarity
recommendation process."""
_donors_pool = self._donors_pool.get()[0]
_lr_curves = self._lr_curves.get()[0]
if _donors_pool is None or _lr_curves is None:
# We need to have both donors_pool and lr_curves defined
# to reconstruct the matrices
return None
self.num_donors = len(_donors_pool)
# Build a numpy matrix cache for the continuous features.
self.continuous_features = np.zeros((self.num_donors, len(CONTINUOUS_FEATURES)))
for idx, d in enumerate(_donors_pool):
features = [d.get(specified_key) for specified_key in CONTINUOUS_FEATURES]
self.continuous_features[idx] = features
# Build the cache for categorical features.
self.categorical_features = np.zeros(
(self.num_donors, len(CATEGORICAL_FEATURES)), dtype="object"
)
for idx, d in enumerate(_donors_pool):
features = [d.get(specified_key) for specified_key in CATEGORICAL_FEATURES]
self.categorical_features[idx] = np.array([features], dtype="object")
self.logger.info("Reconstructed matrices for similarity recommender")
return self._redis_cache.similarity_lrcurves()
def can_recommend(self, client_data, extra_data={}):
# We can't recommend if we don't have our data files.
@ -301,8 +225,6 @@ class SimilarityRecommender(AbstractRecommender):
recommendations_out = self._recommend(client_data, limit, extra_data)
except Exception as e:
recommendations_out = []
self._donors_pool.force_expiry()
self._lr_curves.force_expiry()
metrics.incr("error_similarity", value=1)
self.logger.exception(


@ -1,6 +1,6 @@
import re
RE_PLATFORM = re.compile('(linux|windows|macintosh|android|fxios).*firefox')
RE_PLATFORM = re.compile("(linux|windows|macintosh|android|fxios).*firefox")
LINUX = 1
WINDOWS = 2
@ -8,11 +8,13 @@ MACINTOSH = 3
ANDROID = 4
FXIOS = 5
OSNAME_TO_ID = {'linux': LINUX,
'windows': WINDOWS,
'macintosh': MACINTOSH,
'android': ANDROID,
'fxios': FXIOS}
OSNAME_TO_ID = {
"linux": LINUX,
"windows": WINDOWS,
"macintosh": MACINTOSH,
"android": ANDROID,
"fxios": FXIOS,
}
def parse_ua(user_agent):

tests/noop_fixtures.py Normal file

@ -0,0 +1,54 @@
"""
Noop helpers
"""
import mock
from taar.recommenders.redis_cache import TAARCache
def noop_taarlite_dataload(stack):
# no-op the taarlite rankdata
stack.enter_context(
mock.patch.object(TAARCache, "_update_rank_data", return_value=None)
)
# no-op the taarlite guidguid data
stack.enter_context(
mock.patch.object(TAARCache, "_update_coinstall_data", return_value=None,)
)
return stack
def noop_taarlocale_dataload(stack):
# no-op the taar locale data
stack.enter_context(
mock.patch.object(TAARCache, "_update_locale_data", return_value=None)
)
return stack
def noop_taarcollab_dataload(stack):
# no-op the taar collab
stack.enter_context(
mock.patch.object(TAARCache, "_update_collab_data", return_value=None)
)
return stack
def noop_taarsimilarity_dataload(stack):
# no-op the taar similarity data
stack.enter_context(
mock.patch.object(TAARCache, "_update_similarity_data", return_value=None)
)
return stack
def noop_taarensemble_dataload(stack):
# no-op the taar ensemble data
stack.enter_context(
mock.patch.object(TAARCache, "_update_ensemble_data", return_value=None)
)
stack.enter_context(
mock.patch.object(TAARCache, "_update_whitelist_data", return_value=None)
)
return stack
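
A hedged usage sketch, mirroring the test modules below: stack the no-ops for every loader a test does not exercise, so safe_load_data() skips those S3 fetches entirely:

import contextlib

from .noop_fixtures import (  # inside a tests/ module
    noop_taarlite_dataload,
    noop_taarcollab_dataload,
    noop_taarsimilarity_dataload,
    noop_taarensemble_dataload,
)

with contextlib.ExitStack() as stack:
    # silence everything except the loader under test (e.g. locale)
    stack = noop_taarlite_dataload(stack)
    stack = noop_taarcollab_dataload(stack)
    stack = noop_taarsimilarity_dataload(stack)
    stack = noop_taarensemble_dataload(stack)
    # ...patch init_redis_connections with fakeredis, then:
    # TAARCache.get_instance(ctx).safe_load_data()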

tests/similarity_data.py

@ -7,7 +7,12 @@
CONTINUOUS_FEATURE_FIXTURE_DATA = [
{
"active_addons": ["{test-guid-1}", "{test-guid-2}", "{test-guid-3}", "{test-guid-4}"],
"active_addons": [
"{test-guid-1}",
"{test-guid-2}",
"{test-guid-3}",
"{test-guid-4}",
],
"geo_city": "brasilia-br",
"subsession_length": 4911,
"locale": "br-PT",
@ -15,10 +20,15 @@ CONTINUOUS_FEATURE_FIXTURE_DATA = [
"bookmark_count": 7,
"tab_open_count": 4,
"total_uri": 190,
"unique_tlds": 21
"unique_tlds": 21,
},
{
"active_addons": ["{test-guid-5}", "{test-guid-6}", "{test-guid-1}", "{test-guid-8}"],
"active_addons": [
"{test-guid-5}",
"{test-guid-6}",
"{test-guid-1}",
"{test-guid-8}",
],
"geo_city": "brasilia-br",
"subsession_length": 4911,
"locale": "br-PT",
@ -26,10 +36,15 @@ CONTINUOUS_FEATURE_FIXTURE_DATA = [
"bookmark_count": 7,
"tab_open_count": 4,
"total_uri": 200,
"unique_tlds": 21
"unique_tlds": 21,
},
{
"active_addons": ["{test-guid-9}", "{test-guid-10}", "{test-guid-11}", "{test-guid-12}"],
"active_addons": [
"{test-guid-9}",
"{test-guid-10}",
"{test-guid-11}",
"{test-guid-12}",
],
"geo_city": "brasilia-br",
"subsession_length": 4911,
"locale": "br-PT",
@ -37,7 +52,7 @@ CONTINUOUS_FEATURE_FIXTURE_DATA = [
"bookmark_count": 7,
"tab_open_count": 4,
"total_uri": 222,
"unique_tlds": 21
"unique_tlds": 21,
},
{
"active_addons": ["{test-guid-13}", "{test-guid-14}"],
@ -48,8 +63,8 @@ CONTINUOUS_FEATURE_FIXTURE_DATA = [
"bookmark_count": 7,
"tab_open_count": 4,
"total_uri": 210,
"unique_tlds": 21
}
"unique_tlds": 21,
},
]
# Match the fixture taar client, but vary the geo_city to test only
@ -60,7 +75,12 @@ CONTINUOUS_FEATURE_FIXTURE_DATA = [
CATEGORICAL_FEATURE_FIXTURE_DATA = [
{
"active_addons": ["{test-guid-1}", "{test-guid-2}", "{test-guid-3}", "{test-guid-4}"],
"active_addons": [
"{test-guid-1}",
"{test-guid-2}",
"{test-guid-3}",
"{test-guid-4}",
],
"geo_city": "brasilia-br",
"subsession_length": 4911,
"locale": "br-PT",
@ -68,11 +88,16 @@ CATEGORICAL_FEATURE_FIXTURE_DATA = [
"bookmark_count": 7,
"tab_open_count": 4,
"total_uri": 222,
"unique_tlds": 21
"unique_tlds": 21,
},
{
# "{test-guid-1}" appears in duplicate here.
"active_addons": ["{test-guid-5}", "{test-guid-6}", "{test-guid-1}", "{test-guid-8}"],
"active_addons": [
"{test-guid-5}",
"{test-guid-6}",
"{test-guid-1}",
"{test-guid-8}",
],
"geo_city": "toronto-ca",
"subsession_length": 4911,
"locale": "br-PT",
@ -80,10 +105,15 @@ CATEGORICAL_FEATURE_FIXTURE_DATA = [
"bookmark_count": 7,
"tab_open_count": 4,
"total_uri": 222,
"unique_tlds": 21
"unique_tlds": 21,
},
{
"active_addons": ["{test-guid-9}", "{test-guid-10}", "{test-guid-11}", "{test-guid-12}"],
"active_addons": [
"{test-guid-9}",
"{test-guid-10}",
"{test-guid-11}",
"{test-guid-12}",
],
"geo_city": "brasilia-br",
"subsession_length": 4911,
"locale": "br-PT",
@ -91,7 +121,7 @@ CATEGORICAL_FEATURE_FIXTURE_DATA = [
"bookmark_count": 7,
"tab_open_count": 4,
"total_uri": 222,
"unique_tlds": 21
"unique_tlds": 21,
},
{
"active_addons": ["{test-guid-13}", "{test-guid-1}"],
@ -102,6 +132,6 @@ CATEGORICAL_FEATURE_FIXTURE_DATA = [
"bookmark_count": 7,
"tab_open_count": 4,
"total_uri": 222,
"unique_tlds": 21
}
"unique_tlds": 21,
},
]

tests/test_collaborativerecommender.py

@ -8,22 +8,25 @@ Test cases for the TAAR CollaborativeRecommender
import numpy
from moto import mock_s3
import boto3
from taar.recommenders.collaborative_recommender import (
TAAR_ITEM_MATRIX_BUCKET,
TAAR_ITEM_MATRIX_KEY,
TAAR_ADDON_MAPPING_BUCKET,
TAAR_ADDON_MAPPING_KEY,
)
import fakeredis
import mock
import contextlib
from taar.recommenders.redis_cache import TAARCache
from taar.recommenders.collaborative_recommender import CollaborativeRecommender
from taar.recommenders.collaborative_recommender import positive_hash
import json
from markus import TIMING
from markus.testing import MetricsMock
from .noop_fixtures import (
noop_taarlocale_dataload,
noop_taarlite_dataload,
noop_taarensemble_dataload,
noop_taarsimilarity_dataload,
)
"""
We need to generate a synthetic list of addons and relative weights
@ -33,29 +36,56 @@ the Java hash function.
"""
def install_none_mock_data(ctx):
def noop_other_recommenders(stack):
stack = noop_taarlocale_dataload(stack)
stack = noop_taarlite_dataload(stack)
stack = noop_taarsimilarity_dataload(stack)
stack = noop_taarensemble_dataload(stack)
return stack
@contextlib.contextmanager
def mock_install_none_mock_data(ctx):
"""
Overload the 'real' addon model and mapping URLs responses so that
we always get 404 errors.
"""
conn = boto3.resource("s3", region_name="us-west-2")
with contextlib.ExitStack() as stack:
TAARCache._instance = None
conn.create_bucket(Bucket=TAAR_ITEM_MATRIX_BUCKET)
conn.Object(TAAR_ITEM_MATRIX_BUCKET, TAAR_ITEM_MATRIX_KEY).put(Body="")
stack.enter_context(
mock.patch.object(
TAARCache, "_fetch_collaborative_item_matrix", return_value="",
)
)
stack.enter_context(
mock.patch.object(
TAARCache, "_fetch_collaborative_mapping_data", return_value="",
)
)
# Don't reuse connections with moto. badness happens
conn = boto3.resource("s3", region_name="us-west-2")
conn.create_bucket(Bucket=TAAR_ADDON_MAPPING_BUCKET)
conn.Object(TAAR_ADDON_MAPPING_BUCKET, TAAR_ADDON_MAPPING_KEY).put(Body="")
return ctx
stack = noop_other_recommenders(stack)
# Patch fakeredis in
stack.enter_context(
mock.patch.object(
TAARCache,
"init_redis_connections",
return_value={
0: fakeredis.FakeStrictRedis(db=0),
1: fakeredis.FakeStrictRedis(db=1),
2: fakeredis.FakeStrictRedis(db=2),
},
)
)
# Initialize redis
TAARCache.get_instance(ctx).safe_load_data()
yield stack
def install_mock_data(ctx):
"""
Overload the 'real' addon model and mapping URLs responses so that
we always the fixture data at the top of this test module.
"""
@contextlib.contextmanager
def mock_install_mock_data(ctx):
addon_space = [
{"id": "addon1.id", "name": "addon1.name", "isWebextension": True},
{"id": "addon2.id", "name": "addon2.name", "isWebextension": True},
@ -66,7 +96,10 @@ def install_mock_data(ctx):
fake_addon_matrix = []
for i, addon in enumerate(addon_space):
row = {"id": positive_hash(addon["id"]), "features": [0, 0.2, 0.0, 0.1, 0.15]}
row = {
"id": positive_hash(addon["id"]),
"features": [0, 0.2, 0.0, 0.1, 0.15],
}
row["features"][i] = 1.0
fake_addon_matrix.append(row)
@ -75,74 +108,123 @@ def install_mock_data(ctx):
java_hash = positive_hash(addon["id"])
fake_mapping[str(java_hash)] = addon
conn = boto3.resource("s3", region_name="us-west-2")
conn.create_bucket(Bucket=TAAR_ITEM_MATRIX_BUCKET)
conn.Object(TAAR_ITEM_MATRIX_BUCKET, TAAR_ITEM_MATRIX_KEY).put(
Body=json.dumps(fake_addon_matrix)
)
with contextlib.ExitStack() as stack:
TAARCache._instance = None
stack.enter_context(
mock.patch.object(
TAARCache,
"_fetch_collaborative_item_matrix",
return_value=fake_addon_matrix,
)
)
stack.enter_context(
mock.patch.object(
TAARCache,
"_fetch_collaborative_mapping_data",
return_value=fake_mapping,
)
)
conn = boto3.resource("s3", region_name="us-west-2")
conn.create_bucket(Bucket=TAAR_ADDON_MAPPING_BUCKET)
conn.Object(TAAR_ADDON_MAPPING_BUCKET, TAAR_ADDON_MAPPING_KEY).put(
Body=json.dumps(fake_mapping)
)
stack = noop_other_recommenders(stack)
return ctx
# Patch fakeredis in
stack.enter_context(
mock.patch.object(
TAARCache,
"init_redis_connections",
return_value={
0: fakeredis.FakeStrictRedis(db=0),
1: fakeredis.FakeStrictRedis(db=1),
2: fakeredis.FakeStrictRedis(db=2),
},
)
)
# Initialize redis
TAARCache.get_instance(ctx).safe_load_data()
yield stack
@mock_s3
def test_cant_recommend(test_ctx):
ctx = install_mock_data(test_ctx)
r = CollaborativeRecommender(ctx)
with mock_install_mock_data(test_ctx):
r = CollaborativeRecommender(test_ctx)
# Test that we can't recommend if we have not enough client info.
assert not r.can_recommend({})
assert not r.can_recommend({"installed_addons": []})
# Test that we can't recommend if we have not enough client info.
assert not r.can_recommend({})
assert not r.can_recommend({"installed_addons": []})
@mock_s3
def test_can_recommend(test_ctx):
ctx = install_mock_data(test_ctx)
r = CollaborativeRecommender(ctx)
with mock_install_mock_data(test_ctx):
r = CollaborativeRecommender(test_ctx)
# For some reason, moto doesn't like to play nice with this call
# Check that we can recommend if we the user has at least an addon.
assert r.can_recommend(
{"installed_addons": ["uBlock0@raymondhill.net"], "client_id": "test-client"}
)
# Check that we can recommend if the user has at least an addon.
assert r.can_recommend(
{
"installed_addons": ["uBlock0@raymondhill.net"],
"client_id": "test-client",
}
)
@mock_s3
def test_can_recommend_no_model(test_ctx):
ctx = install_none_mock_data(test_ctx)
r = CollaborativeRecommender(ctx)
with mock_install_none_mock_data(test_ctx):
r = CollaborativeRecommender(test_ctx)
# We should never be able to recommend if something went wrong with the model.
assert not r.can_recommend({})
assert not r.can_recommend({"installed_addons": []})
assert not r.can_recommend({"installed_addons": ["uBlock0@raymondhill.net"]})
# We should never be able to recommend if something went wrong with the model.
assert not r.can_recommend({})
assert not r.can_recommend({"installed_addons": []})
assert not r.can_recommend({"installed_addons": ["uBlock0@raymondhill.net"]})
@mock_s3
def test_empty_recommendations(test_ctx):
# Tests that the empty recommender always recommends an empty list
# of addons if we have no addons
ctx = install_none_mock_data(test_ctx)
r = CollaborativeRecommender(ctx)
assert not r.can_recommend({})
with mock_install_none_mock_data(test_ctx):
r = CollaborativeRecommender(test_ctx)
assert not r.can_recommend({})
# Note that calling recommend() if can_recommend has failed is not
# defined.
# Note that calling recommend() if can_recommend has failed is not
# defined.
@mock_s3
def test_best_recommendation(test_ctx):
with MetricsMock() as mm:
# Make sure the structure of the recommendations is correct and that we
# recommended the the right addon.
ctx = install_mock_data(test_ctx)
r = CollaborativeRecommender(ctx)
with mock_install_mock_data(test_ctx):
r = CollaborativeRecommender(test_ctx)
# A non-empty set of addons should give a list of recommendations
fixture_client_data = {
"installed_addons": ["addon4.id"],
"client_id": "test_client",
}
assert r.can_recommend(fixture_client_data)
recommendations = r.recommend(fixture_client_data, 1)
assert isinstance(recommendations, list)
assert len(recommendations) == 1
# Verify that addon2 - the most heavily weighted addon - was
# recommended
result = recommendations[0]
assert type(result) is tuple
assert len(result) == 2
assert result[0] == "addon2.id"
assert type(result[1]) is numpy.float64
assert numpy.isclose(result[1], numpy.float64("0.3225"))
assert mm.has_record(TIMING, stat="taar.collaborative_recommend")
def test_recommendation_weights(test_ctx):
"""
Weights should be ordered greatest to lowest
"""
with mock_install_mock_data(test_ctx):
r = CollaborativeRecommender(test_ctx)
# A non-empty set of addons should give a list of recommendations
fixture_client_data = {
@ -150,10 +232,9 @@ def test_best_recommendation(test_ctx):
"client_id": "test_client",
}
assert r.can_recommend(fixture_client_data)
recommendations = r.recommend(fixture_client_data, 1)
recommendations = r.recommend(fixture_client_data, 2)
assert isinstance(recommendations, list)
assert len(recommendations) == 1
assert len(recommendations) == 2
# Verify that addon2 - the most heavily weighted addon - was
# recommended
@ -164,43 +245,11 @@ def test_best_recommendation(test_ctx):
assert type(result[1]) is numpy.float64
assert numpy.isclose(result[1], numpy.float64("0.3225"))
assert mm.has_record(TIMING, stat="taar.item_matrix")
assert mm.has_record(TIMING, stat="taar.addon_mapping")
assert mm.has_record(TIMING, stat="taar.collaborative_recommend")
@mock_s3
def test_recommendation_weights(test_ctx):
"""
Weights should be ordered greatest to lowest
"""
ctx = install_mock_data(test_ctx)
r = CollaborativeRecommender(ctx)
# A non-empty set of addons should give a list of recommendations
fixture_client_data = {
"installed_addons": ["addon4.id"],
"client_id": "test_client",
}
assert r.can_recommend(fixture_client_data)
recommendations = r.recommend(fixture_client_data, 2)
assert isinstance(recommendations, list)
assert len(recommendations) == 2
# Verify that addon2 - the most heavy weighted addon was
# recommended
result = recommendations[0]
assert type(result) is tuple
assert len(result) == 2
assert result[0] == "addon2.id"
assert type(result[1]) is numpy.float64
assert numpy.isclose(result[1], numpy.float64("0.3225"))
# Verify that addon5 - the second most heavily weighted addon - was
# recommended
result = recommendations[1]
assert type(result) is tuple
assert len(result) == 2
assert result[0] == "addon5.id"
assert type(result[1]) is numpy.float64
assert numpy.isclose(result[1], numpy.float64("0.29"))
# Verify that addon5 - the second most heavily weighted addon - was
# recommended
result = recommendations[1]
assert type(result) is tuple
assert len(result) == 2
assert result[0] == "addon5.id"
assert type(result[1]) is numpy.float64
assert numpy.isclose(result[1], numpy.float64("0.29"))

tests/test_ensemblerecommender.py

@ -2,19 +2,17 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
from taar.recommenders.ensemble_recommender import (
WeightCache,
EnsembleRecommender,
from taar.recommenders.ensemble_recommender import EnsembleRecommender
import mock
import contextlib
import fakeredis
from taar.recommenders.redis_cache import TAARCache
from .noop_fixtures import (
noop_taarlocale_dataload,
noop_taarcollab_dataload,
noop_taarlite_dataload,
noop_taarsimilarity_dataload,
)
from taar.settings import (
TAAR_ENSEMBLE_BUCKET,
TAAR_ENSEMBLE_KEY,
TAAR_WHITELIST_BUCKET,
TAAR_WHITELIST_KEY,
)
from moto import mock_s3
import boto3
import json
from .mocks import MockRecommenderFactory
from markus import TIMING
@ -23,146 +21,177 @@ from markus.testing import MetricsMock
EXPECTED = {"collaborative": 1000, "similarity": 100, "locale": 10}
def install_mock_ensemble_data(ctx):
def noop_loaders(stack):
stack = noop_taarlocale_dataload(stack)
stack = noop_taarcollab_dataload(stack)
stack = noop_taarlite_dataload(stack)
stack = noop_taarsimilarity_dataload(stack)
return stack
@contextlib.contextmanager
def mock_install_mock_ensemble_data(ctx):
DATA = {"ensemble_weights": EXPECTED}
conn = boto3.resource("s3", region_name="us-west-2")
conn.create_bucket(Bucket=TAAR_ENSEMBLE_BUCKET)
conn.Object(TAAR_ENSEMBLE_BUCKET, TAAR_ENSEMBLE_KEY).put(Body=json.dumps(DATA))
WHITELIST_DATA = [
"2.0@disconnect.me",
"@contain-facebook",
"@testpilot-containers",
"CookieAutoDelete@kennydo.com",
"FirefoxColor@mozilla.com",
"adblockultimate@adblockultimate.net",
"addon@darkreader.org",
"adguardadblocker@adguard.com",
"adnauseam@rednoise.org",
"clearcache@michel.de.almeida",
"copyplaintext@eros.man",
"default-bookmark-folder@gustiaux.com",
"enhancerforyoutube@maximerf.addons.mozilla.org",
"extension@one-tab.com",
"extension@tabliss.io",
"firefox-addon@myki.co",
"firefox@ghostery.com",
"forecastfox@s3_fix_version",
"forget-me-not@lusito.info",
"foxyproxy@eric.h.jung",
"foxytab@eros.man",
"gmailnoads@mywebber.com",
]
conn.create_bucket(Bucket=TAAR_WHITELIST_BUCKET)
conn.Object(TAAR_WHITELIST_BUCKET, TAAR_WHITELIST_KEY).put(
Body=json.dumps(
[
"2.0@disconnect.me",
"@contain-facebook",
"@testpilot-containers",
"CookieAutoDelete@kennydo.com",
"FirefoxColor@mozilla.com",
"adblockultimate@adblockultimate.net",
"addon@darkreader.org",
"adguardadblocker@adguard.com",
"adnauseam@rednoise.org",
"clearcache@michel.de.almeida",
"copyplaintext@eros.man",
"default-bookmark-folder@gustiaux.com",
"enhancerforyoutube@maximerf.addons.mozilla.org",
"extension@one-tab.com",
"extension@tabliss.io",
"firefox-addon@myki.co",
"firefox@ghostery.com",
"forecastfox@s3_fix_version",
"forget-me-not@lusito.info",
"foxyproxy@eric.h.jung",
"foxytab@eros.man",
"gmailnoads@mywebber.com",
]
with contextlib.ExitStack() as stack:
TAARCache._instance = None
stack.enter_context(
mock.patch.object(TAARCache, "_fetch_ensemble_weights", return_value=DATA,)
)
)
return ctx
stack.enter_context(
mock.patch.object(
TAARCache, "_fetch_whitelist", return_value=WHITELIST_DATA,
)
)
stack = noop_loaders(stack)
# Patch fakeredis in
stack.enter_context(
mock.patch.object(
TAARCache,
"init_redis_connections",
return_value={
0: fakeredis.FakeStrictRedis(db=0),
1: fakeredis.FakeStrictRedis(db=1),
2: fakeredis.FakeStrictRedis(db=2),
},
)
)
# Initialize redis
TAARCache.get_instance(ctx).safe_load_data()
yield stack
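
The noop_fixtures module imported above is added elsewhere in this commit
and is not shown in this diff. Based on how its helpers are chained here,
each one takes the ExitStack, registers a patch that stubs out one
recommender's TAARCache data load, and returns the stack. A minimal
sketch of what one such helper presumably looks like (the stubbed return
value is an assumption):

import mock
from taar.recommenders.redis_cache import TAARCache

def noop_taarlocale_dataload(stack):
    # Stub the locale fetch so tests that do not exercise the locale
    # recommender never touch S3; the real fixture may use a different
    # sentinel value.
    stack.enter_context(
        mock.patch.object(TAARCache, "_fetch_locale_data", return_value={})
    )
    return stack
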
@mock_s3
def test_weight_cache(test_ctx):
ctx = install_mock_ensemble_data(test_ctx)
wc = WeightCache(ctx)
actual = wc.getWeights()
assert EXPECTED == actual
@mock_s3
def test_recommendations(test_ctx):
with MetricsMock() as mm:
ctx = install_mock_ensemble_data(test_ctx)
EXPECTED_RESULTS = [
("ghi", 3430.0),
("def", 3320.0),
("ijk", 3200.0),
("hij", 3100.0),
("lmn", 420.0),
]
with mock_install_mock_ensemble_data(test_ctx):
factory = MockRecommenderFactory()
ctx["recommender_factory"] = factory
test_ctx["recommender_factory"] = factory
ctx["recommender_map"] = {
test_ctx["recommender_map"] = {
"collaborative": factory.create("collaborative"),
"similarity": factory.create("similarity"),
"locale": factory.create("locale"),
}
r = EnsembleRecommender(ctx.child())
client = {"client_id": "12345"} # Anything will work here
r = EnsembleRecommender(test_ctx)
actual = r.getWeights()
assert EXPECTED == actual
def test_recommendations(test_ctx):
with MetricsMock() as mm:
with mock_install_mock_ensemble_data(test_ctx):
EXPECTED_RESULTS = [
("ghi", 3430.0),
("def", 3320.0),
("ijk", 3200.0),
("hij", 3100.0),
("lmn", 420.0),
]
factory = MockRecommenderFactory()
test_ctx["recommender_factory"] = factory
test_ctx["recommender_map"] = {
"collaborative": factory.create("collaborative"),
"similarity": factory.create("similarity"),
"locale": factory.create("locale"),
}
r = EnsembleRecommender(test_ctx)
client = {"client_id": "12345"} # Anything will work here
recommendation_list = r.recommend(client, 5)
assert isinstance(recommendation_list, list)
assert recommendation_list == EXPECTED_RESULTS
assert mm.has_record(TIMING, "taar.ensemble_recommend")
def test_preinstalled_guids(test_ctx):
with mock_install_mock_ensemble_data(test_ctx):
EXPECTED_RESULTS = [
("ghi", 3430.0),
("ijk", 3200.0),
("lmn", 420.0),
("klm", 409.99999999999994),
("abc", 23.0),
]
factory = MockRecommenderFactory()
test_ctx["recommender_factory"] = factory
test_ctx["recommender_map"] = {
"collaborative": factory.create("collaborative"),
"similarity": factory.create("similarity"),
"locale": factory.create("locale"),
}
r = EnsembleRecommender(test_ctx)
# 'hij' should be excluded from the suggestions list
# The other two addon GUIDs 'def' and 'jkl' will never be
# recommended anyway and should have no impact on results
client = {"client_id": "12345", "installed_addons": ["def", "hij", "jkl"]}
recommendation_list = r.recommend(client, 5)
print(recommendation_list)
assert isinstance(recommendation_list, list)
assert recommendation_list == EXPECTED_RESULTS
def test_mock_client_ids(test_ctx):
with mock_install_mock_ensemble_data(test_ctx):
EXPECTED_RESULTS = [
("2.0@disconnect.me", 0.17),
("@contain-facebook", 0.25),
("@testpilot-containers", 0.72),
("CookieAutoDelete@kennydo.com", 0.37),
("FirefoxColor@mozilla.com", 0.32),
]
factory = MockRecommenderFactory()
test_ctx["recommender_factory"] = factory
test_ctx["recommender_map"] = {
"collaborative": factory.create("collaborative"),
"similarity": factory.create("similarity"),
"locale": factory.create("locale"),
}
r = EnsembleRecommender(test_ctx)
# 'hij' should be excluded from the suggestions list
# The other two addon GUIDs 'def' and 'jkl' will never be
# recommended anyway and should have no impact on results
client = {"client_id": "11111"}
recommendation_list = r.recommend(client, 5)
assert isinstance(recommendation_list, list)
assert recommendation_list == EXPECTED_RESULTS
assert mm.has_record(TIMING, "taar.ensemble")
assert mm.has_record(TIMING, "taar.ensemble_recommend")
@mock_s3
def test_preinstalled_guids(test_ctx):
ctx = install_mock_ensemble_data(test_ctx)
EXPECTED_RESULTS = [
("ghi", 3430.0),
("ijk", 3200.0),
("lmn", 420.0),
("klm", 409.99999999999994),
("abc", 23.0),
]
factory = MockRecommenderFactory()
ctx["recommender_factory"] = factory
ctx["recommender_map"] = {
"collaborative": factory.create("collaborative"),
"similarity": factory.create("similarity"),
"locale": factory.create("locale"),
}
r = EnsembleRecommender(ctx.child())
# 'hij' should be excluded from the suggestions list
# The other two addon GUIDs 'def' and 'jkl' will never be
# recommended anyway and should have no impact on results
client = {"client_id": "12345", "installed_addons": ["def", "hij", "jkl"]}
recommendation_list = r.recommend(client, 5)
print(recommendation_list)
assert isinstance(recommendation_list, list)
assert recommendation_list == EXPECTED_RESULTS
@mock_s3
def test_mock_client_ids(test_ctx):
ctx = install_mock_ensemble_data(test_ctx)
EXPECTED_RESULTS = [
("2.0@disconnect.me", 0.17),
("@contain-facebook", 0.25),
("@testpilot-containers", 0.72),
("CookieAutoDelete@kennydo.com", 0.37),
("FirefoxColor@mozilla.com", 0.32),
]
factory = MockRecommenderFactory()
ctx["recommender_factory"] = factory
ctx["recommender_map"] = {
"collaborative": factory.create("collaborative"),
"similarity": factory.create("similarity"),
"locale": factory.create("locale"),
}
r = EnsembleRecommender(ctx.child())
# 'hij' should be excluded from the suggestions list
# The other two addon GUIDs 'def' and 'jkl' will never be
# recommended anyway and should have no impact on results
client = {"client_id": "11111"}
recommendation_list = r.recommend(client, 5)
assert isinstance(recommendation_list, list)
assert recommendation_list == EXPECTED_RESULTS


@@ -5,8 +5,15 @@ import pytest
import mock
import contextlib
from .noop_fixtures import (
noop_taarlocale_dataload,
noop_taarcollab_dataload,
noop_taarsimilarity_dataload,
noop_taarensemble_dataload,
)
from taar.recommenders.guid_based_recommender import GuidBasedRecommender
from taar.recommenders.redis_cache import AddonsCoinstallCache
from taar.recommenders.redis_cache import TAARCache
from taar.recommenders.redis_cache import NORMDATA_GUID_ROW_NORM_PREFIX
@@ -85,23 +92,28 @@ RESULTS = {
def mock_coinstall_ranking_context(ctx, mock_coinstall, mock_ranking):
with contextlib.ExitStack() as stack:
TAARCache._instance = None
stack.enter_context(
mock.patch.object(
AddonsCoinstallCache, "fetch_ranking_data", return_value=mock_ranking,
TAARCache, "_fetch_ranking_data", return_value=mock_ranking,
)
)
stack.enter_context(
mock.patch.object(
AddonsCoinstallCache,
"fetch_coinstall_data",
return_value=mock_coinstall,
TAARCache, "_fetch_coinstall_data", return_value=mock_coinstall,
)
)
stack = noop_taarlocale_dataload(stack)
stack = noop_taarcollab_dataload(stack)
stack = noop_taarsimilarity_dataload(stack)
stack = noop_taarensemble_dataload(stack)
# Patch fakeredis in
stack.enter_context(
mock.patch.object(
AddonsCoinstallCache,
TAARCache,
"init_redis_connections",
return_value={
0: fakeredis.FakeStrictRedis(db=0),
@@ -112,7 +124,7 @@ def mock_coinstall_ranking_context(ctx, mock_coinstall, mock_ranking):
)
# Initialize redis
AddonsCoinstallCache(ctx).safe_load_data()
TAARCache.get_instance(ctx).safe_load_data()
yield stack
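
Note that every mocked loader first clears TAARCache._instance before
patching: TAARCache is now process-wide shared state, and a stale
instance left over from a previous test would bypass the fakeredis
patches. A minimal sketch of the singleton accessor this implies (the
real implementation lives in taar/recommenders/redis_cache.py and may
differ):

class TAARCache:
    _instance = None

    def __init__(self, ctx):
        self._ctx = ctx

    @classmethod
    def get_instance(cls, ctx):
        # Construct the cache once and share it across recommenders;
        # tests null out _instance so each test starts from a clean,
        # fakeredis-backed cache.
        if cls._instance is None:
            cls._instance = cls(ctx)
        return cls._instance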


@@ -1,138 +0,0 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""
Test cases for the TAAR Hybrid recommender
"""
import pytest
from taar.recommenders.hybrid_recommender import CuratedRecommender
from taar.recommenders.hybrid_recommender import HybridRecommender
from taar.recommenders.ensemble_recommender import EnsembleRecommender
from taar.settings import TAAR_WHITELIST_BUCKET, TAAR_WHITELIST_KEY
# from taar.recommenders.hybrid_recommender import ENSEMBLE_WEIGHTS
from .test_ensemblerecommender import install_mock_ensemble_data
from .mocks import MockRecommenderFactory
import json
from moto import mock_s3
import boto3
from markus import TIMING
from markus.testing import MetricsMock
def install_no_curated_data(ctx):
ctx = ctx.child()
conn = boto3.resource("s3", region_name="us-west-2")
conn.create_bucket(Bucket=TAAR_WHITELIST_BUCKET)
conn.Object(TAAR_WHITELIST_BUCKET, TAAR_WHITELIST_KEY).put(Body="")
return ctx
def install_mock_curated_data(ctx):
mock_data = []
for i in range(20):
mock_data.append(str(i) * 16)
ctx = ctx.child()
conn = boto3.resource("s3", region_name="us-west-2")
conn.create_bucket(Bucket=TAAR_WHITELIST_BUCKET)
conn.Object(TAAR_WHITELIST_BUCKET, TAAR_WHITELIST_KEY).put(
Body=json.dumps(mock_data)
)
return ctx
def install_ensemble_fixtures(ctx):
ctx = install_mock_ensemble_data(ctx)
factory = MockRecommenderFactory()
ctx["recommender_factory"] = factory
ctx["recommender_map"] = {
"collaborative": factory.create("collaborative"),
"similarity": factory.create("similarity"),
"locale": factory.create("locale"),
}
ctx["ensemble_recommender"] = EnsembleRecommender(ctx.child())
return ctx
@mock_s3
def test_curated_can_recommend(test_ctx):
ctx = install_no_curated_data(test_ctx)
r = CuratedRecommender(ctx)
# CuratedRecommender will always recommend something no matter
# what
assert r.can_recommend({})
assert r.can_recommend({"installed_addons": []})
@mock_s3
def test_curated_recommendations(test_ctx):
with MetricsMock() as mm:
ctx = install_mock_curated_data(test_ctx)
r = CuratedRecommender(ctx)
# CuratedRecommender will always recommend something no matter
# what
for LIMIT in range(1, 5):
guid_list = r.recommend({"client_id": "000000"}, limit=LIMIT)
# The curated recommendations should always return with some kind
# of recommendations
assert len(guid_list) == LIMIT
assert mm.has_record(TIMING, "taar.whitelist")
assert mm.has_record(TIMING, "taar.hybrid_recommend")
@pytest.mark.skip(reason="this test seems to break sporadically")
@mock_s3
def test_hybrid_recommendations(test_ctx):
# verify that the recommendations mix the curated and
# ensemble results
ctx = install_mock_curated_data(test_ctx)
ctx = install_ensemble_fixtures(ctx)
r = HybridRecommender(ctx)
# Test that we can generate lists of results
for LIMIT in range(4, 8):
guid_list = r.recommend({"client_id": "000000"}, limit=LIMIT)
# The curated recommendations should always return with some kind
# of recommendations
assert len(guid_list) == LIMIT
@pytest.mark.skip(reason="this test seems to break sporadically")
@mock_s3
def test_stable_hybrid_results(test_ctx):
# verify that the recommendations mix the curated and
# ensemble results
ctx = install_mock_curated_data(test_ctx)
ctx = install_ensemble_fixtures(ctx)
r = HybridRecommender(ctx)
# Test that the results are actually mixed
guid_list = r.recommend({"client_id": "000000"}, limit=4)
assert len(guid_list) == 4
# A mixed list will have two recommendations with weight = 1.0
# (curated) and 2 with exactly weight < 1.0 from the ensemble list
assert guid_list[0][1] == 1.0
assert guid_list[1][1] == 1.0
assert guid_list[2][1] < 1.0
assert guid_list[3][1] < 1.0


@@ -1,94 +0,0 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
import json
from taar.recommenders.lazys3 import LazyJSONLoader
import boto3
from moto import mock_s3
from taar.settings import (
TAAR_SIMILARITY_BUCKET,
TAAR_SIMILARITY_DONOR_KEY,
)
def install_categorical_data(ctx):
ctx = ctx.child()
conn = boto3.resource("s3", region_name="us-west-2")
try:
conn.create_bucket(Bucket=TAAR_SIMILARITY_BUCKET)
except Exception:
pass
conn.Object(TAAR_SIMILARITY_BUCKET, TAAR_SIMILARITY_DONOR_KEY).put(
Body=json.dumps({"test": "donor_key"})
)
ctx["similarity_donors_pool"] = LazyJSONLoader(
ctx, TAAR_SIMILARITY_BUCKET, TAAR_SIMILARITY_DONOR_KEY
)
return ctx
@mock_s3
def test_does_it_load(test_ctx):
ctx = install_categorical_data(test_ctx)
jdata, status = ctx["similarity_donors_pool"].get()
assert jdata["test"] == "donor_key"
check_jdata_status(jdata, status)
@mock_s3
def test_cached_load(test_ctx):
ctx = install_categorical_data(test_ctx)
loader = ctx["similarity_donors_pool"]
jdata, status = loader.get()
check_jdata_status(jdata, status)
jdata, status = loader.get()
assert not status
@mock_s3
def test_reload_on_expiry(test_ctx):
ctx = install_categorical_data(test_ctx)
loader = ctx["similarity_donors_pool"]
jdata, status = loader.get()
check_jdata_status(jdata, status)
jdata, status = loader.get()
assert not status
# Force expiry time to be 10 seconds ago
loader._expiry_time = loader._clock.time() - 10
jdata, status = loader.get()
check_jdata_status(jdata, status)
@mock_s3
def test_force_expiry(test_ctx):
ctx = install_categorical_data(test_ctx)
loader = ctx["similarity_donors_pool"]
jdata, status = loader.get()
check_jdata_status(jdata, status)
jdata, status = loader.get()
assert not status
# Force expiry time to be 10 seconds ago
loader.force_expiry()
jdata, status = loader.get()
check_jdata_status(jdata, status)
def check_jdata_status(jdata, status):
assert jdata == {"test": "donor_key"}
assert status


@@ -2,9 +2,18 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
from moto import mock_s3
import boto3
import mock
import contextlib
import fakeredis
from taar.recommenders.redis_cache import TAARCache
from .noop_fixtures import (
noop_taarcollab_dataload,
noop_taarlite_dataload,
noop_taarsimilarity_dataload,
noop_taarensemble_dataload,
)
import json
@@ -14,6 +23,7 @@ from taar.settings import TAAR_LOCALE_KEY, TAAR_LOCALE_BUCKET
from markus import TIMING
from markus.testing import MetricsMock
FAKE_LOCALE_DATA = {
"te-ST": [
["{1e6b8bce-7dc8-481c-9f19-123e41332b72}", 0.1],
@@ -37,32 +47,62 @@ def install_mock_data(ctx):
return ctx
@mock_s3
@contextlib.contextmanager
def mock_locale_data(ctx):
with contextlib.ExitStack() as stack:
TAARCache._instance = None
stack.enter_context(
mock.patch.object(
TAARCache, "_fetch_locale_data", return_value=FAKE_LOCALE_DATA,
)
)
stack = noop_taarlite_dataload(stack)
stack = noop_taarcollab_dataload(stack)
stack = noop_taarsimilarity_dataload(stack)
stack = noop_taarensemble_dataload(stack)
# Patch fakeredis in
stack.enter_context(
mock.patch.object(
TAARCache,
"init_redis_connections",
return_value={
0: fakeredis.FakeStrictRedis(db=0),
1: fakeredis.FakeStrictRedis(db=1),
2: fakeredis.FakeStrictRedis(db=2),
},
)
)
# Initialize redis
TAARCache.get_instance(ctx).safe_load_data()
yield stack
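
Patching init_redis_connections to return fakeredis handles means
safe_load_data() writes the mocked fixture data into in-memory stores
rather than a live Redis server, keeping the tests hermetic. A quick
standalone illustration of the substitution (the three db numbers mirror
the connections the cache requests):

import fakeredis

r0 = fakeredis.FakeStrictRedis(db=0)
r0.set("example-key", "example-value")
# fakeredis mimics redis-py semantics, including bytes responses.
assert r0.get("example-key") == b"example-value"
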
def test_can_recommend(test_ctx):
ctx = install_mock_data(test_ctx)
r = LocaleRecommender(ctx)
with mock_locale_data(test_ctx):
r = LocaleRecommender(test_ctx)
# Test that we can't recommend if we don't have enough client info.
assert not r.can_recommend({})
assert not r.can_recommend({"locale": []})
# Test that we can't recommend if we don't have enough client info.
assert not r.can_recommend({})
assert not r.can_recommend({"locale": []})
# Check that we can recommend if the user has at least an addon.
assert r.can_recommend({"locale": "en"})
# Check that we can recommend if the user has at least an addon.
assert r.can_recommend({"locale": "en"})
@mock_s3
def test_can_recommend_no_model(test_ctx):
ctx = install_mock_data(test_ctx)
r = LocaleRecommender(ctx)
with mock_locale_data(test_ctx):
r = LocaleRecommender(test_ctx)
# We should never be able to recommend if something went
# wrong with the model.
assert not r.can_recommend({})
assert not r.can_recommend({"locale": []})
assert not r.can_recommend({"locale": "it"})
# We should never be able to recommend if something went
# wrong with the model.
assert not r.can_recommend({})
assert not r.can_recommend({"locale": []})
assert not r.can_recommend({"locale": "it"})
@mock_s3
def test_recommendations(test_ctx):
"""Test that the locale recommender returns the correct
locale dependent addons.
@@ -71,27 +111,26 @@ def test_recommendations(test_ctx):
of (GUID, weight).
"""
with MetricsMock() as mm:
ctx = install_mock_data(test_ctx)
r = LocaleRecommender(ctx)
recommendations = r.recommend({"locale": "en"}, 10)
with mock_locale_data(test_ctx):
r = LocaleRecommender(test_ctx)
# Make sure the structure of the recommendations is correct and that we
# recommended the right addon.
assert isinstance(recommendations, list)
assert len(recommendations) == len(FAKE_LOCALE_DATA["en"])
recommendations = r.recommend({"locale": "en"}, 10)
# Make sure that the reported addons are the one from the fake data.
for (addon_id, weight), (expected_id, expected_weight) in zip(
recommendations, FAKE_LOCALE_DATA["en"]
):
assert addon_id == expected_id
assert weight == expected_weight
# Make sure the structure of the recommendations is correct and that we
# recommended the right addon.
assert isinstance(recommendations, list)
assert len(recommendations) == len(FAKE_LOCALE_DATA["en"])
assert mm.has_record(TIMING, "taar.locale")
assert mm.has_record(TIMING, "taar.locale_recommend")
# Make sure that the reported addons are the one from the fake data.
for (addon_id, weight), (expected_id, expected_weight) in zip(
recommendations, FAKE_LOCALE_DATA["en"]
):
assert addon_id == expected_id
assert weight == expected_weight
assert mm.has_record(TIMING, "taar.locale_recommend")
@mock_s3
def test_recommender_extra_data(test_ctx):
# Test that the recommender uses locale data from the "extra"
# section if available.
@@ -109,11 +148,13 @@ def test_recommender_extra_data(test_ctx):
assert addon_id == expected_id
assert weight == expected_weight
ctx = install_mock_data(test_ctx)
r = LocaleRecommender(ctx)
recommendations = r.recommend({}, 10, extra_data={"locale": "en"})
validate_recommendations(recommendations, "en")
with mock_locale_data(test_ctx):
r = LocaleRecommender(test_ctx)
recommendations = r.recommend({}, 10, extra_data={"locale": "en"})
validate_recommendations(recommendations, "en")
# Make sure that we favour client data over the extra data.
recommendations = r.recommend({"locale": "en"}, 10, extra_data={"locale": "te-ST"})
validate_recommendations(recommendations, "en")
# Make sure that we favour client data over the extra data.
recommendations = r.recommend(
{"locale": "en"}, 10, extra_data={"locale": "te-ST"}
)
validate_recommendations(recommendations, "en")


@@ -52,10 +52,7 @@ MOCK_DATA = {
"tab_open_count": 46,
"total_uri": 791,
"unique_tlds": 11,
"installed_addons": [
"e10srollout@mozilla.org",
"firefox@getpocket.com",
],
"installed_addons": ["e10srollout@mozilla.org", "firefox@getpocket.com",],
"locale": "it-IT",
},
}


@@ -2,20 +2,17 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
import boto3
import json
from moto import mock_s3
from taar.recommenders import RecommendationManager
from taar.recommenders.base_recommender import AbstractRecommender
from taar.recommenders.ensemble_recommender import (
TAAR_ENSEMBLE_BUCKET,
TAAR_ENSEMBLE_KEY,
from .noop_fixtures import (
noop_taarlocale_dataload,
noop_taarcollab_dataload,
noop_taarsimilarity_dataload,
noop_taarlite_dataload,
)
from .mocks import MockRecommenderFactory
from .test_hybrid_recommender import install_mock_curated_data
import operator
from functools import reduce
@@ -23,6 +20,72 @@ from functools import reduce
from markus import TIMING
from markus.testing import MetricsMock
import mock
import contextlib
import fakeredis
from taar.recommenders.redis_cache import TAARCache
@contextlib.contextmanager
def mock_install_mock_curated_data(ctx):
mock_data = []
for i in range(20):
mock_data.append(str(i) * 16)
mock_ensemble_weights = {
"ensemble_weights": {"collaborative": 1000, "similarity": 100, "locale": 10,}
}
with contextlib.ExitStack() as stack:
TAARCache._instance = None
stack.enter_context(
mock.patch.object(TAARCache, "_fetch_whitelist", return_value=mock_data)
)
stack.enter_context(
mock.patch.object(
TAARCache,
"_fetch_ensemble_weights",
return_value=mock_ensemble_weights,
)
)
stack = noop_taarlite_dataload(stack)
stack = noop_taarcollab_dataload(stack)
stack = noop_taarlocale_dataload(stack)
stack = noop_taarsimilarity_dataload(stack)
# Patch fakeredis in
stack.enter_context(
mock.patch.object(
TAARCache,
"init_redis_connections",
return_value={
0: fakeredis.FakeStrictRedis(db=0),
1: fakeredis.FakeStrictRedis(db=1),
2: fakeredis.FakeStrictRedis(db=2),
},
)
)
class DefaultMockProfileFetcher:
def get(self, client_id):
return {"client_id": client_id}
mock_fetcher = DefaultMockProfileFetcher()
ctx["profile_fetcher"] = mock_fetcher
ctx["recommender_factory"] = MockRecommenderFactory()
# Initialize redis
TAARCache.get_instance(ctx).safe_load_data()
yield stack
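
The profile_fetcher contract these tests rely on: get(client_id) returns
a dict of client attributes, or None when no profile exists, in which
case RecommendationManager short-circuits to an empty result (see
test_none_profile_returns_empty_list below). A minimal conforming fake,
beyond the ones defined inline here (hypothetical, for illustration
only):

class StaticProfileFetcher:
    def __init__(self, profiles):
        # Map of client_id -> profile dict; anything absent reads as None.
        self._profiles = profiles

    def get(self, client_id):
        return self._profiles.get(client_id)
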
class StubRecommender(AbstractRecommender):
""" A shared, stub recommender that can be used for testing.
@@ -39,129 +102,94 @@ class StubRecommender(AbstractRecommender):
return self._recommendations
def install_mocks(ctx, mock_fetcher=None):
ctx = ctx.child()
class DefaultMockProfileFetcher:
def get(self, client_id):
return {"client_id": client_id}
if mock_fetcher is None:
mock_fetcher = DefaultMockProfileFetcher()
ctx["profile_fetcher"] = mock_fetcher
ctx["recommender_factory"] = MockRecommenderFactory()
DATA = {
"ensemble_weights": {"collaborative": 1000, "similarity": 100, "locale": 10,}
}
conn = boto3.resource("s3", region_name="us-west-2")
conn.create_bucket(Bucket=TAAR_ENSEMBLE_BUCKET)
conn.Object(TAAR_ENSEMBLE_BUCKET, TAAR_ENSEMBLE_KEY).put(Body=json.dumps(DATA))
return ctx
@mock_s3
def test_none_profile_returns_empty_list(test_ctx):
ctx = install_mocks(test_ctx)
with mock_install_mock_curated_data(test_ctx):
class MockProfileFetcher:
def get(self, client_id):
return None
class MockProfileFetcher:
def get(self, client_id):
return None
ctx["profile_fetcher"] = MockProfileFetcher()
test_ctx["profile_fetcher"] = MockProfileFetcher()
rec_manager = RecommendationManager(ctx)
assert rec_manager.recommend("random-client-id", 10) == []
rec_manager = RecommendationManager(test_ctx)
assert rec_manager.recommend("random-client-id", 10) == []
@mock_s3
def test_simple_recommendation(test_ctx):
ctx = install_mocks(test_ctx)
with mock_install_mock_curated_data(test_ctx):
EXPECTED_RESULTS = [
("ghi", 3430.0),
("def", 3320.0),
("ijk", 3200.0),
("hij", 3100.0),
("lmn", 420.0),
("klm", 409.99999999999994),
("jkl", 400.0),
("abc", 23.0),
("fgh", 22.0),
("efg", 21.0),
]
EXPECTED_RESULTS = [
("ghi", 3430.0),
("def", 3320.0),
("ijk", 3200.0),
("hij", 3100.0),
("lmn", 420.0),
("klm", 409.99999999999994),
("jkl", 400.0),
("abc", 23.0),
("fgh", 22.0),
("efg", 21.0),
]
with MetricsMock() as mm:
manager = RecommendationManager(ctx.child())
recommendation_list = manager.recommend("some_ignored_id", 10)
with MetricsMock() as mm:
manager = RecommendationManager(test_ctx)
recommendation_list = manager.recommend("some_ignored_id", 10)
assert isinstance(recommendation_list, list)
assert recommendation_list == EXPECTED_RESULTS
assert isinstance(recommendation_list, list)
assert recommendation_list == EXPECTED_RESULTS
assert mm.has_record(TIMING, stat="taar.ensemble")
assert mm.has_record(TIMING, stat="taar.profile_recommendation")
assert mm.has_record(TIMING, stat="taar.profile_recommendation")
@mock_s3
def test_fixed_client_id_valid(test_ctx):
ctx = install_mocks(test_ctx)
ctx = install_mock_curated_data(ctx)
manager = RecommendationManager(ctx.child())
recommendation_list = manager.recommend("111111", 10)
assert len(recommendation_list) == 10
with mock_install_mock_curated_data(test_ctx):
manager = RecommendationManager(test_ctx)
recommendation_list = manager.recommend("111111", 10)
assert len(recommendation_list) == 10
@mock_s3
def test_fixed_client_id_empty_list(test_ctx):
class NoClientFetcher:
def get(self, client_id):
return None
ctx = install_mocks(test_ctx, mock_fetcher=NoClientFetcher())
with mock_install_mock_curated_data(test_ctx):
test_ctx["profile_fetcher"] = NoClientFetcher()
ctx = install_mock_curated_data(ctx)
manager = RecommendationManager(test_ctx)
recommendation_list = manager.recommend("not_a_real_client_id", 10)
manager = RecommendationManager(ctx.child())
recommendation_list = manager.recommend("not_a_real_client_id", 10)
assert len(recommendation_list) == 0
assert len(recommendation_list) == 0
@mock_s3
def test_experimental_randomization(test_ctx):
ctx = install_mocks(test_ctx)
ctx = install_mock_curated_data(ctx)
with mock_install_mock_curated_data(test_ctx):
manager = RecommendationManager(ctx.child())
raw_list = manager.recommend("111111", 10)
manager = RecommendationManager(test_ctx)
raw_list = manager.recommend("111111", 10)
# Clobber the experiment probability to be 100% to force a
# reordering.
ctx["TAAR_EXPERIMENT_PROB"] = 1.0
# Clobber the experiment probability to be 100% to force a
# reordering.
test_ctx["TAAR_EXPERIMENT_PROB"] = 1.0
manager = RecommendationManager(ctx.child())
rand_list = manager.recommend("111111", 10)
manager = RecommendationManager(test_ctx)
rand_list = manager.recommend("111111", 10)
"""
The two lists should be:
"""
The two lists should be:
* different (guid, weight) lists (possibly just order)
* same length
"""
assert (
reduce(
operator.and_,
[
(t1[0] == t2[0] and t1[1] == t2[1])
for t1, t2 in zip(rand_list, raw_list)
],
* different (guid, weight) lists (possibly just order)
* same length
"""
assert (
reduce(
operator.and_,
[
(t1[0] == t2[0] and t1[1] == t2[1])
for t1, t2 in zip(rand_list, raw_list)
],
)
is False
)
is False
)
assert len(rand_list) == len(raw_list)
assert len(rand_list) == len(raw_list)
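
The reduce expression folds pairwise tuple equality with operator.and_,
so asserting it is False says "at least one (guid, weight) pair
differs". An equivalent, arguably more direct spelling of the same
check:

# Not part of the diff; an equivalent formulation of the assertion.
assert any(t1 != t2 for t1, t2 in zip(rand_list, raw_list))
assert len(rand_list) == len(raw_list)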


@@ -6,12 +6,9 @@ import json
import six
import logging
import numpy as np
import scipy.stats
from taar.recommenders.lazys3 import LazyJSONLoader
import boto3
from moto import mock_s3
from taar.recommenders.similarity_recommender import (
CATEGORICAL_FEATURES,
@@ -25,11 +22,24 @@ from .similarity_data import CATEGORICAL_FEATURE_FIXTURE_DATA
from markus import TIMING
from markus.testing import MetricsMock
from taar.settings import (
TAAR_SIMILARITY_BUCKET,
TAAR_SIMILARITY_DONOR_KEY,
TAAR_SIMILARITY_LRCURVES_KEY,
import fakeredis
import mock
import contextlib
from .noop_fixtures import (
noop_taarcollab_dataload,
noop_taarlite_dataload,
noop_taarlocale_dataload,
noop_taarensemble_dataload,
)
from taar.recommenders.redis_cache import TAARCache
def noop_loaders(stack):
stack = noop_taarlocale_dataload(stack)
stack = noop_taarcollab_dataload(stack)
stack = noop_taarensemble_dataload(stack)
stack = noop_taarlite_dataload(stack)
return stack
def generate_fake_lr_curves(num_elements, ceiling=10.0):
@@ -68,311 +78,326 @@ def generate_a_fake_taar_client():
}
def install_no_data(ctx):
ctx = ctx.child()
conn = boto3.resource("s3", region_name="us-west-2")
@contextlib.contextmanager
def mock_install_no_data(ctx):
conn.create_bucket(Bucket=TAAR_SIMILARITY_BUCKET)
conn.Object(TAAR_SIMILARITY_BUCKET, TAAR_SIMILARITY_DONOR_KEY).put(Body="")
with contextlib.ExitStack() as stack:
TAARCache._instance = None
stack.enter_context(
mock.patch.object(TAARCache, "_fetch_similarity_donors", return_value="",)
)
conn.Object(TAAR_SIMILARITY_BUCKET, TAAR_SIMILARITY_LRCURVES_KEY).put(Body="")
stack.enter_context(
mock.patch.object(TAARCache, "_fetch_similarity_lrcurves", return_value="",)
)
ctx["similarity_donors_pool"] = LazyJSONLoader(
ctx, TAAR_SIMILARITY_BUCKET, TAAR_SIMILARITY_DONOR_KEY, "similarity_donor",
)
stack = noop_loaders(stack)
ctx["similarity_lr_curves"] = LazyJSONLoader(
ctx, TAAR_SIMILARITY_BUCKET, TAAR_SIMILARITY_LRCURVES_KEY, "similarity_curves",
)
# Patch fakeredis in
stack.enter_context(
mock.patch.object(
TAARCache,
"init_redis_connections",
return_value={
0: fakeredis.FakeStrictRedis(db=0),
1: fakeredis.FakeStrictRedis(db=1),
2: fakeredis.FakeStrictRedis(db=2),
},
)
)
return ctx
# Initialize redis
TAARCache.get_instance(ctx).safe_load_data()
yield stack
def install_categorical_data(ctx):
ctx = ctx.child()
conn = boto3.resource("s3", region_name="us-west-2")
@contextlib.contextmanager
def mock_install_categorical_data(ctx):
try:
conn.create_bucket(Bucket=TAAR_SIMILARITY_BUCKET)
except Exception:
pass
conn.Object(TAAR_SIMILARITY_BUCKET, TAAR_SIMILARITY_DONOR_KEY).put(
Body=json.dumps(CATEGORICAL_FEATURE_FIXTURE_DATA)
)
with contextlib.ExitStack() as stack:
TAARCache._instance = None
stack.enter_context(
mock.patch.object(
TAARCache,
"_fetch_similarity_donors",
return_value=CATEGORICAL_FEATURE_FIXTURE_DATA,
)
)
conn.Object(TAAR_SIMILARITY_BUCKET, TAAR_SIMILARITY_LRCURVES_KEY).put(
Body=json.dumps(generate_fake_lr_curves(1000))
)
stack.enter_context(
mock.patch.object(
TAARCache,
"_fetch_similarity_lrcurves",
return_value=generate_fake_lr_curves(1000),
)
)
stack = noop_loaders(stack)
ctx["similarity_donors_pool"] = LazyJSONLoader(
ctx, TAAR_SIMILARITY_BUCKET, TAAR_SIMILARITY_DONOR_KEY, "similarity_donor",
)
# Patch fakeredis in
stack.enter_context(
mock.patch.object(
TAARCache,
"init_redis_connections",
return_value={
0: fakeredis.FakeStrictRedis(db=0),
1: fakeredis.FakeStrictRedis(db=1),
2: fakeredis.FakeStrictRedis(db=2),
},
)
)
ctx["similarity_lr_curves"] = LazyJSONLoader(
ctx, TAAR_SIMILARITY_BUCKET, TAAR_SIMILARITY_LRCURVES_KEY, "similarity_curves",
)
return ctx
# Initialize redis
TAARCache.get_instance(ctx).safe_load_data()
yield stack
def install_continuous_data(ctx):
ctx = ctx.child()
cts_data = json.dumps(CONTINUOUS_FEATURE_FIXTURE_DATA)
lrs_data = json.dumps(generate_fake_lr_curves(1000))
@contextlib.contextmanager
def mock_install_continuous_data(ctx):
cts_data = CONTINUOUS_FEATURE_FIXTURE_DATA
lrs_data = generate_fake_lr_curves(1000)
conn = boto3.resource("s3", region_name="us-west-2")
with contextlib.ExitStack() as stack:
TAARCache._instance = None
stack.enter_context(
mock.patch.object(
TAARCache, "_fetch_similarity_donors", return_value=cts_data,
)
)
try:
conn.create_bucket(Bucket=TAAR_SIMILARITY_BUCKET)
except Exception:
pass
conn.Object(TAAR_SIMILARITY_BUCKET, TAAR_SIMILARITY_DONOR_KEY).put(Body=cts_data)
stack.enter_context(
mock.patch.object(
TAARCache, "_fetch_similarity_lrcurves", return_value=lrs_data,
)
)
stack = noop_loaders(stack)
conn.Object(TAAR_SIMILARITY_BUCKET, TAAR_SIMILARITY_LRCURVES_KEY).put(Body=lrs_data)
# Patch fakeredis in
stack.enter_context(
mock.patch.object(
TAARCache,
"init_redis_connections",
return_value={
0: fakeredis.FakeStrictRedis(db=0),
1: fakeredis.FakeStrictRedis(db=1),
2: fakeredis.FakeStrictRedis(db=2),
},
)
)
ctx["similarity_donors_pool"] = LazyJSONLoader(
ctx, TAAR_SIMILARITY_BUCKET, TAAR_SIMILARITY_DONOR_KEY, "similarity_donor",
)
ctx["similarity_lr_curves"] = LazyJSONLoader(
ctx, TAAR_SIMILARITY_BUCKET, TAAR_SIMILARITY_LRCURVES_KEY, "similarity_curves",
)
return ctx
# Initialize redis
TAARCache.get_instance(ctx).safe_load_data()
yield stack
def check_matrix_built(caplog):
msg = "Reconstructed matrices for similarity recommender"
return sum([msg in str(s) for s in caplog.records]) > 0
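
check_matrix_built treats the log stream as the observable signal: the
similarity recommender emits a fixed message whenever it rebuilds its
matrices, so tests assert on its presence in caplog. The sum(...) > 0
idiom is equivalent to the arguably clearer:

def check_matrix_built(caplog):
    msg = "Reconstructed matrices for similarity recommender"
    # any() short-circuits on the first matching record.
    return any(msg in str(record) for record in caplog.records)
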
@mock_s3
def test_soft_fail(test_ctx, caplog):
# Create a new instance of a SimilarityRecommender.
ctx = install_no_data(test_ctx)
r = SimilarityRecommender(ctx)
with mock_install_no_data(test_ctx):
r = SimilarityRecommender(test_ctx)
# Don't recommend if the source files cannot be found.
assert not r.can_recommend({})
assert not check_matrix_built(caplog)
# Don't recommend if the source files cannot be found.
assert not r.can_recommend({})
@mock_s3
def test_can_recommend(test_ctx, caplog):
caplog.set_level(logging.INFO)
# Create a new instance of a SimilarityRecommender.
ctx = install_continuous_data(test_ctx)
r = SimilarityRecommender(ctx)
with mock_install_continuous_data(test_ctx):
r = SimilarityRecommender(test_ctx)
assert check_matrix_built(caplog)
# Test that we can't recommend if we don't have enough client info.
assert not r.can_recommend({})
# Test that we can't recommend if we don't have enough client info.
assert not r.can_recommend({})
# Test that we can recommend for a normal client.
assert r.can_recommend(generate_a_fake_taar_client())
# Test that we can recommend for a normal client.
assert r.can_recommend(generate_a_fake_taar_client())
# Check that we can not recommend if any required client field is missing.
required_fields = CATEGORICAL_FEATURES + CONTINUOUS_FEATURES
# Check that we can not recommend if any required client field is missing.
required_fields = CATEGORICAL_FEATURES + CONTINUOUS_FEATURES
for required_field in required_fields:
profile_without_x = generate_a_fake_taar_client()
for required_field in required_fields:
profile_without_x = generate_a_fake_taar_client()
# Make an empty value in a required field in the client info dict.
profile_without_x[required_field] = None
assert not r.can_recommend(profile_without_x)
# Make an empty value in a required field in the client info dict.
profile_without_x[required_field] = None
assert not r.can_recommend(profile_without_x)
# Completely remove (in place) the entire required field from the dict.
del profile_without_x[required_field]
assert not r.can_recommend(profile_without_x)
# Completely remove (in place) the entire required field from the dict.
del profile_without_x[required_field]
assert not r.can_recommend(profile_without_x)
@mock_s3
def test_recommendations(test_ctx):
with MetricsMock() as mm:
# Create a new instance of a SimilarityRecommender.
ctx = install_continuous_data(test_ctx)
r = SimilarityRecommender(ctx)
with mock_install_continuous_data(test_ctx):
r = SimilarityRecommender(test_ctx)
recommendation_list = r.recommend(generate_a_fake_taar_client(), 1)
recommendation_list = r.recommend(generate_a_fake_taar_client(), 1)
assert isinstance(recommendation_list, list)
assert len(recommendation_list) == 1
assert isinstance(recommendation_list, list)
assert len(recommendation_list) == 1
recommendation, weight = recommendation_list[0]
recommendation, weight = recommendation_list[0]
# Make sure that the reported addons are the expected ones from the most similar donor.
assert "{test-guid-1}" == recommendation
assert type(weight) == np.float64
# Make sure that the reported addons are the expected ones from the most similar donor.
assert "{test-guid-1}" == recommendation
assert type(weight) == np.float64
assert mm.has_record(TIMING, stat="taar.similarity_donor")
assert mm.has_record(TIMING, stat="taar.similarity_curves")
assert mm.has_record(TIMING, stat="taar.similarity_recommend")
assert mm.has_record(TIMING, stat="taar.similarity_recommend")
@mock_s3
def test_recommender_str(test_ctx):
# Tests that the string representation of the recommender is correct.
ctx = install_continuous_data(test_ctx)
r = SimilarityRecommender(ctx)
assert str(r) == "SimilarityRecommender"
@mock_s3
def test_get_lr(test_ctx):
# Tests that the likelihood ratio values are not empty for extreme values and are realistic.
ctx = install_continuous_data(test_ctx)
r = SimilarityRecommender(ctx)
assert r.get_lr(0.0001) is not None
assert r.get_lr(10.0) is not None
assert r.get_lr(0.001) > r.get_lr(5.0)
with mock_install_continuous_data(test_ctx):
r = SimilarityRecommender(test_ctx)
assert r.get_lr(0.0001) is not None
assert r.get_lr(10.0) is not None
assert r.get_lr(0.001) > r.get_lr(5.0)
@mock_s3
def test_compute_clients_dist(test_ctx):
# Test the distance function computation.
ctx = install_continuous_data(test_ctx)
r = SimilarityRecommender(ctx)
test_clients = [
{
"client_id": "test-client-002",
"activeAddons": [],
"geo_city": "sfo-us",
"subsession_length": 1,
"locale": "en-US",
"os": "windows",
"bookmark_count": 1,
"tab_open_count": 1,
"total_uri": 1,
"unique_tlds": 1,
},
{
"client_id": "test-client-003",
"activeAddons": [],
"geo_city": "brasilia-br",
"subsession_length": 1,
"locale": "br-PT",
"os": "windows",
"bookmark_count": 10,
"tab_open_count": 1,
"total_uri": 1,
"unique_tlds": 1,
},
{
"client_id": "test-client-004",
"activeAddons": [],
"geo_city": "brasilia-br",
"subsession_length": 100,
"locale": "br-PT",
"os": "windows",
"bookmark_count": 10,
"tab_open_count": 10,
"total_uri": 100,
"unique_tlds": 10,
},
]
per_client_test = []
with mock_install_continuous_data(test_ctx):
r = SimilarityRecommender(test_ctx)
test_clients = [
{
"client_id": "test-client-002",
"activeAddons": [],
"geo_city": "sfo-us",
"subsession_length": 1,
"locale": "en-US",
"os": "windows",
"bookmark_count": 1,
"tab_open_count": 1,
"total_uri": 1,
"unique_tlds": 1,
},
{
"client_id": "test-client-003",
"activeAddons": [],
"geo_city": "brasilia-br",
"subsession_length": 1,
"locale": "br-PT",
"os": "windows",
"bookmark_count": 10,
"tab_open_count": 1,
"total_uri": 1,
"unique_tlds": 1,
},
{
"client_id": "test-client-004",
"activeAddons": [],
"geo_city": "brasilia-br",
"subsession_length": 100,
"locale": "br-PT",
"os": "windows",
"bookmark_count": 10,
"tab_open_count": 10,
"total_uri": 100,
"unique_tlds": 10,
},
]
per_client_test = []
# Compute a different set of distances for each set of clients.
for tc in test_clients:
test_distances = r.compute_clients_dist(tc)
assert len(test_distances) == len(CONTINUOUS_FEATURE_FIXTURE_DATA)
per_client_test.append(test_distances[2][0])
# Compute a different set of distances for each set of clients.
for tc in test_clients:
test_distances = r.compute_clients_dist(tc)
assert len(test_distances) == len(CONTINUOUS_FEATURE_FIXTURE_DATA)
per_client_test.append(test_distances[2][0])
# Ensure the different clients also had different distances to a specific donor.
assert per_client_test[0] >= per_client_test[1] >= per_client_test[2]
# Ensure the different clients also had different distances to a specific donor.
assert per_client_test[0] >= per_client_test[1] >= per_client_test[2]
@mock_s3
def test_distance_functions(test_ctx):
# Tests the similarity functions via expected output when passing modified client data.
ctx = install_continuous_data(test_ctx)
r = SimilarityRecommender(ctx)
# Tests the similarity functions via expected output when passing
# modified client data.
with mock_install_continuous_data(test_ctx):
r = SimilarityRecommender(test_ctx)
# Generate a fake client.
test_client = generate_a_fake_taar_client()
recs = r.recommend(test_client, 10)
assert len(recs) > 0
# Generate a fake client.
test_client = generate_a_fake_taar_client()
recs = r.recommend(test_client, 10)
assert len(recs) > 0
# Make it a generally poor match for the donors.
test_client.update({"total_uri": 10, "bookmark_count": 2, "subsession_length": 10})
# Make it a generally poor match for the donors.
test_client.update(
{"total_uri": 10, "bookmark_count": 2, "subsession_length": 10}
)
all_client_values_zero = test_client
# Make all categorical variables non-matching with any donor.
all_client_values_zero.update(
{key: "zero" for key in test_client.keys() if key in CATEGORICAL_FEATURES}
)
recs = r.recommend(all_client_values_zero, 10)
assert len(recs) == 0
all_client_values_zero = test_client
# Make all categorical variables non-matching with any donor.
all_client_values_zero.update(
{key: "zero" for key in test_client.keys() if key in CATEGORICAL_FEATURES}
)
recs = r.recommend(all_client_values_zero, 10)
assert len(recs) == 0
# Make all continuous variables equal to zero.
all_client_values_zero.update(
{key: 0 for key in test_client.keys() if key in CONTINUOUS_FEATURES}
)
recs = r.recommend(all_client_values_zero, 10)
assert len(recs) == 0
# Make all continuous variables equal to zero.
all_client_values_zero.update(
{key: 0 for key in test_client.keys() if key in CONTINUOUS_FEATURES}
)
recs = r.recommend(all_client_values_zero, 10)
assert len(recs) == 0
# Make all categorical variables non-matching with any donor.
all_client_values_high = test_client
all_client_values_high.update(
{
key: "one billion"
for key in test_client.keys()
if key in CATEGORICAL_FEATURES
}
)
recs = r.recommend(all_client_values_high, 10)
assert len(recs) == 0
# Make all categorical variables non-matching with any donor.
all_client_values_high = test_client
all_client_values_high.update(
{
key: "one billion"
for key in test_client.keys()
if key in CATEGORICAL_FEATURES
}
)
recs = r.recommend(all_client_values_high, 10)
assert len(recs) == 0
# Make all continuous variables equal to a very high numerical value.
all_client_values_high.update(
{key: 1e60 for key in test_client.keys() if key in CONTINUOUS_FEATURES}
)
recs = r.recommend(all_client_values_high, 10)
assert len(recs) == 0
# Make all continuous variables equal to a very high numerical value.
all_client_values_high.update(
{key: 1e60 for key in test_client.keys() if key in CONTINUOUS_FEATURES}
)
recs = r.recommend(all_client_values_high, 10)
assert len(recs) == 0
# Test for 0.0 values if j_c is not normalized and j_d is fine.
j_c = 0.0
j_d = 0.42
assert abs(j_c * j_d) == 0.0
assert abs((j_c + 0.01) * j_d) != 0.0
# Test for 0.0 values if j_c is not normalized and j_d is fine.
j_c = 0.0
j_d = 0.42
assert abs(j_c * j_d) == 0.0
assert abs((j_c + 0.01) * j_d) != 0.0
@mock_s3
def test_weights_continuous(test_ctx):
# Create a new instance of a SimilarityRecommender.
ctx = install_continuous_data(test_ctx)
r = SimilarityRecommender(ctx)
with mock_install_continuous_data(test_ctx):
r = SimilarityRecommender(test_ctx)
# In the ensemble method recommendations should be a sorted list of tuples
# containing [(guid, weight), (guid, weight)... (guid, weight)].
recommendation_list = r.recommend(generate_a_fake_taar_client(), 2)
with open("/tmp/similarity_recommender.json", "w") as fout:
fout.write(json.dumps(recommendation_list))
# In the ensemble method recommendations should be a sorted list of tuples
# containing [(guid, weight), (guid, weight)... (guid, weight)].
recommendation_list = r.recommend(generate_a_fake_taar_client(), 2)
with open("/tmp/similarity_recommender.json", "w") as fout:
fout.write(json.dumps(recommendation_list))
# Make sure the structure of the recommendations is correct and
# that we recommended the right addons.
# Make sure the structure of the recommendations is correct and
# that we recommended the right addons.
assert len(recommendation_list) == 2
for recommendation, weight in recommendation_list:
assert isinstance(recommendation, six.string_types)
assert isinstance(weight, float)
assert len(recommendation_list) == 2
for recommendation, weight in recommendation_list:
assert isinstance(recommendation, six.string_types)
assert isinstance(weight, float)
# Test that sorting is appropriate.
rec0 = recommendation_list[0]
rec1 = recommendation_list[1]
# Test that sorting is appropriate.
rec0 = recommendation_list[0]
rec1 = recommendation_list[1]
rec0_weight = rec0[1]
rec1_weight = rec1[1]
rec0_weight = rec0[1]
rec1_weight = rec1[1]
# Duplicate presence of test-guid-1 should mean rec0_weight is double
# rec1_weight, and both should be greater than 1.0
# Duplicate presence of test-guid-1 should mean rec0_weight is double
# rec1_weight, and both should be greater than 1.0
assert rec0_weight > rec1_weight > 1.0
assert rec0_weight > rec1_weight > 1.0
@mock_s3
def test_weights_categorical(test_ctx):
"""
This should get :
@@ -383,48 +408,24 @@ def test_weights_categorical(test_ctx):
"""
# Create a new instance of a SimilarityRecommender.
cat_ctx = install_categorical_data(test_ctx)
cts_ctx = install_continuous_data(test_ctx)
with mock_install_categorical_data(test_ctx):
r = SimilarityRecommender(test_ctx)
wrapped = cts_ctx.wrap(cat_ctx)
r = SimilarityRecommender(wrapped)
# In the ensemble method recommendations should be a sorted list of tuples
# containing [(guid, weight), (guid, weight)... (guid, weight)].
recommendation_list = r.recommend(generate_a_fake_taar_client(), 2)
# In the ensemble method recommendations should be a sorted list of tuples
# containing [(guid, weight), (guid, weight)... (guid, weight)].
recommendation_list = r.recommend(generate_a_fake_taar_client(), 2)
assert len(recommendation_list) == 2
# Make sure the structure of the recommendations is correct and that we recommended the right addons.
for recommendation, weight in recommendation_list:
assert isinstance(recommendation, six.string_types)
assert isinstance(weight, float)
assert len(recommendation_list) == 2
# Make sure the structure of the recommendations is correct and that we recommended the right addons.
for recommendation, weight in recommendation_list:
assert isinstance(recommendation, six.string_types)
assert isinstance(weight, float)
# Test that sorting is appropriate.
rec0 = recommendation_list[0]
rec1 = recommendation_list[1]
# Test that sorting is appropriate.
rec0 = recommendation_list[0]
rec1 = recommendation_list[1]
rec0_weight = rec0[1]
rec1_weight = rec1[1]
rec0_weight = rec0[1]
rec1_weight = rec1[1]
assert rec0_weight > rec1_weight > 0
@mock_s3
def test_recompute_matrices(test_ctx, caplog):
caplog.set_level(logging.INFO)
# Create a new instance of a SimilarityRecommender.
ctx = install_continuous_data(test_ctx)
r = SimilarityRecommender(ctx)
# Reloading the donors pool should reconstruct the matrices
caplog.clear()
r._donors_pool.force_expiry()
r.donors_pool
assert check_matrix_built(caplog)
# Reloading the LR curves should reconstruct the matrices
caplog.clear()
r._lr_curves.force_expiry()
r.lr_curves
assert check_matrix_built(caplog)
assert rec0_weight > rec1_weight > 0