* Consolidate env configuration into taar.settings and add a TAARLITE_MAX_RESULTS configuration option
* Rework the TAARlite recommender to use threaded caching
* Add cache warmup prior to process start
* Add a target for a local start to the Makefile
* Update pytest setup to accommodate the new redis requirements
* Add TAARlite GUID ranking caching
* Rework TAARlite to precompute all values and run with redis
* Truncate the length of the GUID coinstallation list to keep normalization times bounded for TAARlite

Most of the performance problems with TAARlite had to do with normalizing very long lists of GUIDs.

The normalization method now truncates each list to a multiple of the maximum number of TAARlite suggestions (controlled by `TAARLITE_TRUNCATE`, which defaults to 5x `TAARLITE_MAX_RESULTS`).
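
A minimal sketch of the idea, with names mirroring the new `redis_cache` module (the helper function itself is illustrative):

```python
TAARLITE_MAX_RESULTS = 4
TAARLITE_TRUNCATE = TAARLITE_MAX_RESULTS * 5  # default truncation window: 20

def truncate_coinstalls(coinstall_dict):
    # Sort GUID -> count pairs by count (descending) and keep only the
    # top TAARLITE_TRUNCATE entries, so the normalization cost stays
    # bounded no matter how long the raw coinstallation row is.
    return dict(
        sorted(coinstall_dict.items(), key=lambda kv: kv[1], reverse=True)[
            :TAARLITE_TRUNCATE
        ]
    )

print(truncate_coinstalls({f"guid-{i}": i for i in range(100)}))  # 20 entries kept
```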
Victor Ng 2020-08-26 14:01:19 -04:00 committed by GitHub
Parent: 70989662e4
Commit: 99f278eaa3
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
22 changed files with 899 additions and 488 deletions


@ -27,6 +27,9 @@ up:
-v ~/.config:/app/.config \
-v ~/.aws:/app/.aws \
-v ~/.gcp_creds:/app/.gcp_creds \
-e WORKERS=1 \
-e THREADS=2 \
-e LOG_LEVEL=20 \
-e GOOGLE_APPLICATION_CREDENTIALS=/app/.gcp_creds/vng-taar-dev-clientinfo-svc.json \
-e TAAR_API_PLUGIN=taar.plugin \
-e TAAR_ITEM_MATRIX_BUCKET=telemetry-public-analysis-2 \
@ -43,11 +46,12 @@ up:
-e TAAR_SIMILARITY_DONOR_KEY=taar/similarity/donors.json \
-e TAAR_SIMILARITY_LRCURVES_KEY=taar/similarity/lr_curves.json \
-e TAAR_MAX_RESULTS=10 \
-e TAARLITE_MAX_RESULTS=4 \
-e AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY} \
-e AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID} \
-e BIGTABLE_PROJECT_ID \
-e BIGTABLE_INSTANCE_ID \
-e BIGTABLE_TABLE_ID \
-e BIGTABLE_PROJECT_ID=${BIGTABLE_PROJECT_ID} \
-e BIGTABLE_INSTANCE_ID=${BIGTABLE_INSTANCE_ID} \
-e BIGTABLE_TABLE_ID=${BIGTABLE_TABLE_ID} \
-e GCLOUD_PROJECT=${GCLOUD_PROJECT} \
-p 8000:8000 \
-it taar:latest
@ -55,5 +59,8 @@ up:
test-container:
docker run -e CODECOV_TOKEN=${CODECOV_TOKEN} -it taar:latest test
run_local:
TAAR_API_PLUGIN=taar.plugin python taar/flask_app.py
shell:
docker run -it taar:latest bash


@ -102,6 +102,20 @@ create a new release has been split out into separate
## Dependencies
### Google Cloud Storage resources
### TODO: put this into a table to make it easier to read
The final TAAR models are stored in:
```gs://moz-fx-data-taar-pr-prod-e0f7-prod-models```
The TAAR production model bucket is defined in Airflow under the
variable `taar_etl_model_storage_bucket`.
Temporary models that the Airflow ETL jobs require are stored in a
temporary bucket defined in the Airflow variable `taar_etl_storage_bucket`.
### AWS resources
Recommendation engines load models from Amazon S3.

docker-compose.yml (new file, 41 lines)

@ -0,0 +1,41 @@
version: "3.8"
services:
redis:
image: "redis:alpine"
ports:
- "6379:6379"
web:
image: "taar:latest"
depends_on:
- redis
environment:
- MUSICMATCH_API=${MUSICMATCH_API}
- WORKERS=1
- THREADS=2
- LOG_LEVEL=20
- GOOGLE_APPLICATION_CREDENTIALS=/app/.gcp_creds/vng-taar-dev-clientinfo-svc.json
- REDIS_HOST=redis
- TAAR_API_PLUGIN=taar.plugin
- TAAR_ITEM_MATRIX_BUCKET=telemetry-public-analysis-2
- TAAR_ITEM_MATRIX_KEY=telemetry-ml/addon_recommender/item_matrix.json
- TAAR_ADDON_MAPPING_BUCKET=telemetry-public-analysis-2
- TAAR_ADDON_MAPPING_KEY=telemetry-ml/addon_recommender/addon_mapping.json
- TAAR_ENSEMBLE_BUCKET=telemetry-parquet
- TAAR_ENSEMBLE_KEY=taar/ensemble/ensemble_weight.json
- TAAR_WHITELIST_BUCKET=telemetry-parquet
- TAAR_WHITELIST_KEY=telemetry-ml/addon_recommender/only_guids_top_200.json
- TAAR_LOCALE_BUCKET=telemetry-parquet
- TAAR_LOCALE_KEY=taar/locale/top10_dict.json
- TAAR_SIMILARITY_BUCKET=telemetry-parquet
- TAAR_SIMILARITY_DONOR_KEY=taar/similarity/donors.json
- TAAR_SIMILARITY_LRCURVES_KEY=taar/similarity/lr_curves.json
- TAAR_MAX_RESULTS=10
- TAARLITE_MAX_RESULTS=4
- AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
- AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
- BIGTABLE_PROJECT_ID=${BIGTABLE_PROJECT_ID}
- BIGTABLE_INSTANCE_ID=${BIGTABLE_INSTANCE_ID}
- BIGTABLE_TABLE_ID=${BIGTABLE_TABLE_ID}
ports:
- "8000:8000"

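Once the stack is up via `docker-compose up`, the `redis` service can be sanity-checked from the host with a throwaway snippet like the following (not part of the repo; assumes the port mapping above):

```python
import redis

# The compose file publishes the redis service on localhost:6379.
r = redis.Redis(host="localhost", port=6379, db=0)
assert r.ping()  # raises redis.exceptions.ConnectionError if the stack is down
print("redis is reachable")
```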

@ -3,7 +3,6 @@ channels:
- conda-forge
- defaults
dependencies:
- ipython=7.14.0
- pip=20.1.1
- python=3.7.6
- python_abi=3.7
@ -50,6 +49,7 @@ dependencies:
- entrypoints==0.3
- enum34==1.1.6
- fabric==2.1.3
- fakeredis==1.4.3
- flake8==3.7.7
- flask==1.0.2
- flask-api==1.0
@ -85,6 +85,7 @@ dependencies:
- more-itertools==4.2.0
- moto==1.3.14
- mozilla-srgutil==0.1.7
- mozilla-jsoncache==0.1.7
- networkx==2.4
- newrelic==5.14.1.144
- packaging==17.1
@ -121,6 +122,7 @@ dependencies:
- python-jose==3.1.0
- pytz==2018.5
- pyyaml==5.3.1
- redis==3.5.3
- requests==2.23.0
- requests-toolbelt==0.8.0
- responses==0.9.0
@ -130,7 +132,6 @@ dependencies:
- sentry-sdk==0.7.3
- setuptools-scm==2.1.0
- simplegeneric==0.8.1
- six==1.11.0
- spark==0.2.1
- spark-parser==1.8.7
- sshpubkeys==3.1.0


@ -2,7 +2,6 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
from decouple import config
from flask import request
import json
@ -13,11 +12,48 @@ from taar.context import default_context
from taar.profile_fetcher import ProfileFetcher
from taar import recommenders
# These are configurations that are specific to the TAAR library
TAAR_MAX_RESULTS = config("TAAR_MAX_RESULTS", default=10, cast=int)
from taar.settings import (
TAAR_MAX_RESULTS,
TAARLITE_MAX_RESULTS,
STATSD_HOST,
STATSD_PORT,
)
STATSD_HOST = config("STATSD_HOST", default="localhost", cast=str)
STATSD_PORT = config("STATSD_PORT", default=8125, cast=int)
def acquire_taarlite_singleton(PROXY_MANAGER):
if PROXY_MANAGER.getTaarLite() is None:
ctx = default_context()
root_ctx = ctx.child()
instance = recommenders.GuidBasedRecommender(root_ctx)
PROXY_MANAGER.setTaarLite(instance)
return PROXY_MANAGER.getTaarLite()
def acquire_taar_singleton(PROXY_MANAGER):
if PROXY_MANAGER.getTaarRM() is None:
ctx = default_context()
profile_fetcher = ProfileFetcher(ctx)
ctx["profile_fetcher"] = profile_fetcher
# Lock the context down after we've got basic bits installed
root_ctx = ctx.child()
r_factory = recommenders.RecommenderFactory(root_ctx)
root_ctx["recommender_factory"] = r_factory
instance = recommenders.RecommendationManager(root_ctx.child())
PROXY_MANAGER.setTaarRM(instance)
return PROXY_MANAGER.getTaarRM()
def warm_caches():
import sys
if "pytest" in sys.modules:
# Don't clobber the taarlite singleton under test
return
global PROXY_MANAGER
acquire_taarlite_singleton(PROXY_MANAGER)
class ResourceProxy(object):
@ -113,10 +149,10 @@ def configure_plugin(app): # noqa: C901
cdict["normalize"] = normalization_type
recommendations = taarlite_recommender.recommend(
client_data=cdict, limit=TAAR_MAX_RESULTS
client_data=cdict, limit=TAARLITE_MAX_RESULTS
)
if len(recommendations) != TAAR_MAX_RESULTS:
if len(recommendations) != TAARLITE_MAX_RESULTS:
recommendations = []
# Strip out weights from TAAR results to maintain compatibility
@ -216,29 +252,6 @@ def configure_plugin(app): # noqa: C901
)
return response
def acquire_taarlite_singleton(PROXY_MANAGER):
if PROXY_MANAGER.getTaarLite() is None:
ctx = default_context()
root_ctx = ctx.child()
instance = recommenders.GuidBasedRecommender(root_ctx)
PROXY_MANAGER.setTaarLite(instance)
return PROXY_MANAGER.getTaarLite()
def acquire_taar_singleton(PROXY_MANAGER):
if PROXY_MANAGER.getTaarRM() is None:
ctx = default_context()
profile_fetcher = ProfileFetcher(ctx)
ctx["profile_fetcher"] = profile_fetcher
# Lock the context down after we've got basic bits installed
root_ctx = ctx.child()
r_factory = recommenders.RecommenderFactory(root_ctx)
root_ctx["recommender_factory"] = r_factory
instance = recommenders.RecommendationManager(root_ctx.child())
PROXY_MANAGER.setTaarRM(instance)
return PROXY_MANAGER.getTaarRM()
class MyPlugin:
def set(self, config_options):
"""
@ -252,4 +265,5 @@ def configure_plugin(app): # noqa: C901
if "PROXY_RESOURCE" in config_options:
PROXY_MANAGER._resource = config_options["PROXY_RESOURCE"]
warm_caches()
return MyPlugin()


@ -11,10 +11,12 @@ import threading
from .base_recommender import AbstractRecommender
from .s3config import TAAR_ITEM_MATRIX_BUCKET
from .s3config import TAAR_ITEM_MATRIX_KEY
from .s3config import TAAR_ADDON_MAPPING_BUCKET
from .s3config import TAAR_ADDON_MAPPING_KEY
from taar.settings import (
TAAR_ITEM_MATRIX_BUCKET,
TAAR_ITEM_MATRIX_KEY,
TAAR_ADDON_MAPPING_BUCKET,
TAAR_ADDON_MAPPING_KEY,
)
import markus


@ -7,10 +7,12 @@ import itertools
from .base_recommender import AbstractRecommender
from .lazys3 import LazyJSONLoader
from .s3config import TAAR_WHITELIST_BUCKET
from .s3config import TAAR_WHITELIST_KEY
from .s3config import TAAR_ENSEMBLE_BUCKET
from .s3config import TAAR_ENSEMBLE_KEY
from taar.settings import (
TAAR_WHITELIST_BUCKET,
TAAR_WHITELIST_KEY,
TAAR_ENSEMBLE_BUCKET,
TAAR_ENSEMBLE_KEY,
)
from taar.utils import hasher


@ -2,26 +2,19 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
import numpy as np
from contextlib import contextmanager
import time
from .lazys3 import LazyJSONLoader
from srgutil.interfaces import IMozLogging
import markus
from .s3config import (
TAARLITE_GUID_COINSTALL_BUCKET,
TAARLITE_GUID_COINSTALL_KEY,
TAARLITE_GUID_RANKING_KEY,
)
from taar.recommenders.redis_cache import AddonsCoinstallCache
metrics = markus.get_metrics("taar")
ADDON_DL_ERR = (
f"Cannot download addon coinstallation file {TAARLITE_GUID_COINSTALL_KEY}"
)
NORM_MODE_ROWNORMSUM = "rownorm_sum"
NORM_MODE_ROWCOUNT = "row_count"
@ -29,6 +22,26 @@ NORM_MODE_ROWSUM = "row_sum"
NORM_MODE_GUIDCEPTION = "guidception"
@contextmanager
def log_timer_debug(msg, logger):
start_time = time.time()
try:
yield
finally:
end_time = time.time()
logger.debug(msg + f" Completed in {end_time-start_time} seconds")
@contextmanager
def log_timer_info(msg, logger):
start_time = time.time()
try:
yield
finally:
end_time = time.time()
logger.info(msg + f" Completed in {end_time-start_time} seconds")
class GuidBasedRecommender:
""" A recommender class that returns top N addons based on a
passed addon identifier. This will load a json file containing
@ -61,117 +74,14 @@ class GuidBasedRecommender:
def __init__(self, ctx):
self._ctx = ctx
if "coinstall_loader" in self._ctx:
self._addons_coinstall_loader = self._ctx["coinstall_loader"]
else:
self._addons_coinstall_loader = LazyJSONLoader(
self._ctx,
TAARLITE_GUID_COINSTALL_BUCKET,
TAARLITE_GUID_COINSTALL_KEY,
"guid_coinstall",
)
if "ranking_loader" in self._ctx:
self._guid_ranking_loader = self._ctx["ranking_loader"]
else:
self._guid_ranking_loader = LazyJSONLoader(
self._ctx,
TAARLITE_GUID_COINSTALL_BUCKET,
TAARLITE_GUID_RANKING_KEY,
"guid_ranking",
)
self._init_from_ctx()
# Force access to the JSON models for each request at
# recommender construction. This was lifted out of the
# constructor for the LazyJSONLoader so that the
# precomputation of the normalization tables can be done in
# the recommender.
_ = self._addons_coinstallations # noqa
_ = self._guid_rankings # noqa
self.logger.info("GUIDBasedRecommender is initialized")
def _init_from_ctx(self):
self.logger = self._ctx[IMozLogging].get_logger("taarlite")
if self._addons_coinstallations is None:
self.logger.error(ADDON_DL_ERR)
# Compute the floor install incidence that recommended addons
# must satisfy. Take 5% of the mean of all installed addons.
self._min_installs = np.mean(list(self._guid_rankings.values())) * 0.05
# Warn if the minimum number of installs drops below 100.
if self._min_installs < 100:
self.logger.warning(
"minimum installs threshold low: [%s]" % self._min_installs
)
@property
def _addons_coinstallations(self):
result, refreshed = self._addons_coinstall_loader.get()
if refreshed:
self.logger.info("Refreshing guid_maps for normalization")
self._precompute_normalization()
return result
@property
def _guid_rankings(self):
result, refreshed = self._guid_ranking_loader.get()
if refreshed:
self.logger.info("Refreshing guid_maps for normalization")
self._precompute_normalization()
return result
def _precompute_normalization(self):
if self._addons_coinstallations is None:
self.logger.error("Cannot find addon coinstallations to normalize.")
return
# Capture the total number of times that a guid was
# coinstalled with another GUID
#
# This is a map is guid->sum of coinstall counts
guid_count_map = {}
# Capture the number of times a GUID shows up per row
# of coinstallation data.
#
# This is a map of guid->rows that this guid appears on
row_count = {}
guid_row_norm = {}
for guidkey, coinstalls in self._addons_coinstallations.items():
rowsum = sum(coinstalls.values())
for coinstall_guid, coinstall_count in coinstalls.items():
# Capture the total number of time a GUID was
# coinstalled with other guids
guid_count_map.setdefault(coinstall_guid, 0)
guid_count_map[coinstall_guid] += coinstall_count
# Capture the unique number of times a GUID is
# coinstalled with other guids
row_count.setdefault(coinstall_guid, 0)
row_count[coinstall_guid] += 1
if coinstall_guid not in guid_row_norm:
guid_row_norm[coinstall_guid] = []
guid_row_norm[coinstall_guid].append(1.0 * coinstall_count / rowsum)
self._guid_maps = {
"count_map": guid_count_map,
"row_count": row_count,
"guid_row_norm": guid_row_norm,
}
self._redis_cache = AddonsCoinstallCache(self._ctx)
self.logger.info("GUIDBasedRecommender is initialized")
def can_recommend(self, client_data):
# We can't recommend if we don't have our data files.
if self._addons_coinstallations is None:
if not self._redis_cache.is_active():
return False
# If we have data coming from other sources, we can use that for
@ -181,10 +91,10 @@ class GuidBasedRecommender:
return False
# Use a dictionary keyed on the query guid
if addon_guid not in self._addons_coinstallations.keys():
if not self._redis_cache.has_coinstalls_for(addon_guid):
return False
if not self._addons_coinstallations.get(addon_guid):
if not self._redis_cache.get_coinstalls(addon_guid):
return False
return True
@ -195,71 +105,73 @@ class GuidBasedRecommender:
TAAR lite will yield 4 recommendations for the AMO page
"""
# Force access to the JSON models for each request at the
# start of the request to update normalization tables if
# required.
_ = self._addons_coinstallations # noqa
_ = self._guid_rankings # noqa
with log_timer_debug(f"Results computed", self.logger):
addon_guid = client_data.get("guid")
with log_timer_debug("get client data", self.logger):
addon_guid = client_data.get("guid")
normalize = client_data.get("normalize", NORM_MODE_ROWNORMSUM)
# Get the raw co-installation result dictionary
with log_timer_debug("Get filtered coinstallations", self.logger):
result_dict = self._redis_cache.get_filtered_coinstall(addon_guid, {})
norm_dict = {
"none": lambda guid, x: x,
NORM_MODE_ROWCOUNT: self.norm_row_count,
NORM_MODE_ROWSUM: self.norm_row_sum,
NORM_MODE_ROWNORMSUM: self.norm_rownorm_sum,
NORM_MODE_GUIDCEPTION: self.norm_guidception,
}
with log_timer_debug("acquire normalization method", self.logger):
normalize = client_data.get("normalize", NORM_MODE_ROWNORMSUM)
if normalize is not None and normalize not in norm_dict.keys():
# Yield no results if the normalization method is not
# specified
self.logger.warning(
"Invalid normalization parameter detected: [%s]" % normalize
norm_dict = {
"none": lambda guid, x: x,
NORM_MODE_ROWCOUNT: self.norm_row_count,
NORM_MODE_ROWSUM: self.norm_row_sum,
NORM_MODE_ROWNORMSUM: self.norm_rownorm_sum,
NORM_MODE_GUIDCEPTION: self.norm_guidception,
}
if normalize is not None and normalize not in norm_dict.keys():
# Yield no results if the normalization method is not
# specified
self.logger.warning(
"Invalid normalization parameter detected: [%s]" % normalize
)
return []
# Bind the normalization method
norm_method = norm_dict[normalize]
with log_timer_debug(
f"Compute normalization using method:{normalize}", self.logger
):
# Apply normalization
tmp_result_dict = norm_method(addon_guid, result_dict)
# Augment the result_dict with the installation counts
# and then we can sort using lexical sorting of strings.
# The idea here is to get something in the form of
# 0000.0000.0000
# The computed weight takes the first and second segments of
# integers. The third segment is the installation count of
# the addon but is zero padded.
TWICE_LIMIT = limit * 2
with log_timer_debug(
f"Augment {TWICE_LIMIT} with installation counts and resorted",
self.logger,
):
result_list = []
rank_sorted = sorted(
tmp_result_dict.items(), key=lambda x: x[1], reverse=True
)
for k, v in rank_sorted[:TWICE_LIMIT]:
lex_value = "{0:020.10f}.{1:010d}".format(
v, self._redis_cache.get_rankings(k, 0)
)
result_list.append((k, lex_value))
# Sort the result list in descending order by weight
result_list.sort(key=lambda x: x[1], reverse=True)
log_data = (str(addon_guid), [str(r) for r in result_list[:limit]])
self.logger.info(
"Addon: [%s] triggered these recommendation guids: [%s]" % log_data
)
return []
# Bind the normalization method
norm_method = norm_dict[normalize]
# Get the raw co-installation result dictionary
result_dict = self._addons_coinstallations.get(addon_guid, {})
# Collect addon GUIDs where the install incidence is below a
# floor incidence.
removal_keys = []
for k, v in result_dict.items():
if self._guid_rankings.get(k, 0) < self._min_installs:
removal_keys.append(k)
# Remove the collected addons that are not installed enough
for k in removal_keys:
del result_dict[k]
# Apply normalization
tmp_result_dict = norm_method(addon_guid, result_dict)
# Augment the result_dict with the installation counts
# and then we can sort using lexical sorting of strings.
# The idea here is to get something in the form of
# 0000.0000.0000
# The computed weight takes the first and second segments of
# integers. The third segment is the installation count of
# the addon but is zero padded.
result_dict = {}
for k, v in tmp_result_dict.items():
lex_value = "{0:020.10f}.{1:010d}".format(v, self._guid_rankings.get(k, 0))
result_dict[k] = lex_value
# Sort the result dictionary in descending order by weight
result_list = sorted(result_dict.items(), key=lambda x: x[1], reverse=True)
log_data = (str(addon_guid), [str(r) for r in result_list[:limit]])
self.logger.info(
"Addon: [%s] triggered these recommendation guids: [%s]" % log_data
)
return result_list[:limit]
@ -270,12 +182,11 @@ class GuidBasedRecommender:
This dampens weight of any suggested GUID inversely
proportional to its overall popularity.
"""
uniq_guid_map = self._guid_maps["row_count"]
output_result_dict = {}
for result_guid, result_count in input_coinstall_dict.items():
output_result_dict[result_guid] = (
1.0 * result_count / uniq_guid_map[result_guid]
1.0 * result_count / self._redis_cache.guid_maps_rowcount(result_guid)
)
return output_result_dict
@ -284,11 +195,13 @@ class GuidBasedRecommender:
coinstallation GUIDs based on the sum of the weights for the
coinstallation GUIDs given a key GUID.
"""
guid_count_map = self._guid_maps["count_map"]
def generate_row_sum_list():
for guid, guid_weight in input_coinstall_dict.items():
norm_guid_weight = guid_weight * 1.0 / guid_count_map[guid]
norm_guid_weight = (
guid_weight * 1.0 / self._redis_cache.guid_maps_count_map(guid)
)
yield guid, norm_guid_weight
return dict(generate_row_sum_list())
@ -301,24 +214,31 @@ class GuidBasedRecommender:
The testcase for this scenario lays out the math more
explicitly.
"""
tmp_dict = self._normalize_row_weights(input_coinstall_dict)
guid_row_norm = self._guid_maps["guid_row_norm"]
with log_timer_debug("normalize row weights for coinstall dict", self.logger):
tmp_dict = self._normalize_row_weights(input_coinstall_dict)
output_dict = {}
for output_guid, output_guid_weight in tmp_dict.items():
guid_row_norm_list = guid_row_norm.get(output_guid, [])
if len(guid_row_norm_list) == 0:
self.logger.warning(
"Can't find GUID_ROW_NORM data for [{}]".format(output_guid)
with log_timer_debug(
f"normalizing output_dict of size: {len(tmp_dict)}", self.logger
):
output_dict = {}
for output_guid, output_guid_weight in tmp_dict.items():
guid_row_norm_list = self._redis_cache.guid_maps_row_norm(
output_guid, []
)
continue
norm_sum = sum(guid_row_norm_list)
if norm_sum == 0:
self.logger.warning(
"Sum of GUID_ROW_NORM data for [{}] is zero.".format(output_guid)
)
continue
output_dict[output_guid] = output_guid_weight / norm_sum
if len(guid_row_norm_list) == 0:
self.logger.warning(
"Can't find GUID_ROW_NORM data for [{}]".format(output_guid)
)
continue
norm_sum = sum(guid_row_norm_list)
if norm_sum == 0:
self.logger.warning(
"Sum of GUID_ROW_NORM data for [{}] is zero.".format(
output_guid
)
)
continue
output_dict[output_guid] = output_guid_weight / norm_sum
return output_dict
@ -367,7 +287,7 @@ class GuidBasedRecommender:
# Add in the next level
level -= 1
for guid in consolidated_coinstall_dict.keys():
next_level_coinstalls = self._addons_coinstallations.get(guid, {})
next_level_coinstalls = self._redis_cache.get_coinstalls(guid, {})
if next_level_coinstalls != {}:
# Normalize the next bunch of suggestions
next_level_coinstalls = self._normalize_row_weights(

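A note on the `"{0:020.10f}.{1:010d}"` composite keys used above: zero-padding both segments makes plain string comparison agree with numeric ordering, so sorting the lexical strings ranks by normalized weight first and falls back to the raw install count as a tiebreaker. A standalone illustration (the values are made up):

```python
# Same normalized weight, different install counts: the zero-padded
# install-count suffix breaks the tie under plain string comparison.
key_a = "{0:020.10f}.{1:010d}".format(0.4035916820, 1500)
key_b = "{0:020.10f}.{1:010d}".format(0.4035916820, 900)

print(key_a)  # 000000000.4035916820.0000001500
print(key_b)  # 000000000.4035916820.0000000900
assert key_a > key_b  # lexical order matches numeric order
```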

@ -7,8 +7,8 @@ from .lazys3 import LazyJSONLoader
from srgutil.interfaces import IMozLogging
import operator as op
import random
from .s3config import TAAR_WHITELIST_BUCKET
from .s3config import TAAR_WHITELIST_KEY
from taar.settings import TAAR_WHITELIST_BUCKET, TAAR_WHITELIST_KEY
import markus


@ -6,8 +6,7 @@ from srgutil.interfaces import IMozLogging
from .base_recommender import AbstractRecommender
from .lazys3 import LazyJSONLoader
from .s3config import TAAR_LOCALE_BUCKET
from .s3config import TAAR_LOCALE_KEY
from taar.settings import TAAR_LOCALE_BUCKET, TAAR_LOCALE_KEY
import markus


@ -6,9 +6,12 @@ from taar.recommenders.ensemble_recommender import EnsembleRecommender
from taar.recommenders.randomizer import in_experiment, reorder_guids
from srgutil.interfaces import IMozLogging
from .lazys3 import LazyJSONLoader
from .s3config import TAAR_WHITELIST_BUCKET
from .s3config import TAAR_WHITELIST_KEY
from .s3config import TAAR_EXPERIMENT_PROB
from taar.settings import (
TAAR_WHITELIST_BUCKET,
TAAR_WHITELIST_KEY,
TAAR_EXPERIMENT_PROB,
)
import markus


@ -0,0 +1,388 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
import json
import os
import threading
import redis
import numpy as np
from srgutil.interfaces import IMozLogging
from taar.settings import (
REDIS_HOST,
REDIS_PORT,
TAARLITE_GUID_COINSTALL_BUCKET,
TAARLITE_GUID_COINSTALL_KEY,
TAARLITE_GUID_RANKING_KEY,
TAARLITE_TTL,
TAARLITE_TRUNCATE,
TAARLITE_UPDATE_POLL,
TAARLITE_MUTEX_TTL,
)
import time
from jsoncache.loader import s3_json_loader
ACTIVE_DB = "active_db"
UPDATE_CHECK = "update_id_check"
COINSTALL_PREFIX = "coinstall|"
FILTERED_COINSTALL_PREFIX = "filtered_coinstall|"
RANKING_PREFIX = "ranking|"
MIN_INSTALLS_PREFIX = "min_installs|"
# This is a map is guid->sum of coinstall counts
NORMDATA_COUNT_MAP_PREFIX = "normdata_count_map_prefix|"
# Capture the number of times a GUID shows up per row
# of coinstallation data.
NORMDATA_ROWCOUNT_PREFIX = "normdata_rowcount_prefix|"
NORMDATA_GUID_ROW_NORM_PREFIX = "normdata_guid_row_norm_prefix|"
NEXT_UPDATE_MUTEX = "next_update_mutex|"
NEXT_UPDATE_TIME = "next_update_time|"
class PrefixStripper:
def __init__(self, prefix, iterator, cast_to_str=False):
self._prefix = prefix
self._iter = iterator
self._cast_to_str = cast_to_str
def __iter__(self):
return self
def __next__(self):
result = self._iter.__next__()
result = result[len(self._prefix) :]
if self._cast_to_str:
result = str(result)
return result
class AddonsCoinstallCache:
"""
This class manages a redis instance to hold onto the taar-lite
GUID->GUID co-installation data
"""
def __init__(self, ctx, ttl=TAARLITE_TTL):
self._ctx = ctx
self.logger = self._ctx[IMozLogging].get_logger("taar")
self._ttl = ttl
rcon = self.init_redis_connections()
self._r0 = rcon[0]
self._r1 = rcon[1]
self._r2 = rcon[2]
if self._db() is None:
self.safe_load_data()
self.wait_for_data()
self.start_update_thread()
def start_update_thread(self):
self._update_thread = threading.Thread(
target=self._update_data_target, daemon=True
)
self._update_thread.start()
def _update_data_target(self):
while True:
self.logger.info(f"TAARLite update is alive. ident={self._ident}")
try:
self.logger.debug(f"Trying to acquire lock {self._ident}")
self._r0.set(
NEXT_UPDATE_MUTEX, self._ident, nx=True, ex=TAARLITE_MUTEX_TTL,
)
tmp = self._r0.get(NEXT_UPDATE_MUTEX)
if tmp is None:
# Someone else acquired the lock (and released it)
return
if tmp.decode("utf8") != self._ident:
# Someone else acquired the lock
return
self.logger.debug(f"Acquired lock {self._ident}")
try:
next_update = self._r0.get(NEXT_UPDATE_TIME)
if next_update is None:
next_update = time.time() + TAARLITE_TTL
self._r0.set(NEXT_UPDATE_TIME, next_update)
next_update = float(self._r0.get(NEXT_UPDATE_TIME))
self.logger.info(
f"Next TAARlite refresh is in {next_update - time.time()} seconds"
)
if next_update <= time.time():
# We're past the expiry time
self.safe_load_data()
finally:
self._r0.delete(NEXT_UPDATE_MUTEX)
self.logger.debug(f"Released lock {self._ident}")
except Exception:
self.logger.exception("Error while updating GUID GUID cache")
time.sleep(TAARLITE_UPDATE_POLL)
@property
def _ident(self):
""" pid/thread identity """
return f"{os.getpid()}_{threading.get_ident()}"
def init_redis_connections(self):
return {
0: redis.Redis(host=REDIS_HOST, port=REDIS_PORT, db=0),
1: redis.Redis(host=REDIS_HOST, port=REDIS_PORT, db=1),
2: redis.Redis(host=REDIS_HOST, port=REDIS_PORT, db=2),
}
def safe_load_data(self):
"""
This is a multiprocess, multithread safe method to safely load
data into the cache.
If concurrent calls to this method are invoked, only the first
call will have any effect.
"""
# Pin the first thread ID to try to update data
# Note that nx flag so that this is only set if the
# UPDATE_CHECK is not previously set
#
# The thread barrier will autoexpire in 10 minutes in the
# event of process termination inside the critical section.
self._r0.set(UPDATE_CHECK, self._ident, nx=True, ex=TAARLITE_MUTEX_TTL)
self.logger.info(f"UPDATE_CHECK field is set: {self._ident}")
# This is a concurrency barrier to make sure only the pinned
# thread can update redis
update_ident = self._r0.get(UPDATE_CHECK).decode("utf8")
if update_ident != self._ident:
return
# We're past the thread barrier - load the data and clear the
# barrier when done
try:
self._load_data()
finally:
self._r0.delete(UPDATE_CHECK)
self.logger.info("UPDATE_CHECK field is cleared")
def _load_data(self):
active_db = self._r0.get(ACTIVE_DB)
if active_db is not None:
active_db = int(active_db)
if active_db == 1:
next_active_db = 2
else:
next_active_db = 1
else:
next_active_db = 1
self._copy_data(next_active_db)
self.logger.info("Completed precomputing normalized data")
# TODO: should this autoexpire to help indicate that no fresh
# data has loaded? Maybe N * update TTL time?
self._r0.set(ACTIVE_DB, next_active_db)
self.logger.info(f"Active DB is set to {next_active_db}")
def _copy_data(self, next_active_db):
if next_active_db == 1:
db = self._r1
else:
db = self._r2
# Clear this database before we do anything with it
db.flushdb()
self._update_rank_data(db)
self._update_coinstall_data(db)
def fetch_ranking_data(self):
return s3_json_loader(TAARLITE_GUID_COINSTALL_BUCKET, TAARLITE_GUID_RANKING_KEY)
def _update_rank_data(self, db):
data = self.fetch_ranking_data()
items = data.items()
len_items = len(items)
for i, (guid, count) in enumerate(items):
db.set(RANKING_PREFIX + guid, json.dumps(count))
if i % 1000 == 0:
self.logger.info(f"Loaded {i+1} of {len_items} GUID ranking into redis")
min_installs = np.mean(list(data.values())) * 0.05
db.set(MIN_INSTALLS_PREFIX, min_installs)
self.logger.info(f"Updated MIN_INSTALLS: {min_installs}")
def guid_maps_count_map(self, guid, default=None):
tmp = self._db().get(NORMDATA_COUNT_MAP_PREFIX + guid)
if tmp:
return json.loads(tmp.decode("utf8"))
return default
def guid_maps_rowcount(self, guid, default=None):
tmp = self._db().get(NORMDATA_ROWCOUNT_PREFIX + guid)
if tmp:
return json.loads(tmp.decode("utf8"))
return default
def guid_maps_row_norm(self, guid, default=None):
tmp = self._db().get(NORMDATA_GUID_ROW_NORM_PREFIX + guid)
if tmp:
return json.loads(tmp.decode("utf8"))
return default
def min_installs(self, db):
"""
Return the floor minimum installed addons that we will
consider, or 0 if nothing is currently stored in redis
"""
result = db.get(MIN_INSTALLS_PREFIX)
if result is None:
return 0
return float(result.decode("utf8"))
def fetch_coinstall_data(self):
return s3_json_loader(
TAARLITE_GUID_COINSTALL_BUCKET, TAARLITE_GUID_COINSTALL_KEY
)
def _update_coinstall_data(self, db):
data = self.fetch_coinstall_data()
items = data.items()
len_items = len(items)
guid_count_map = {}
row_count = {}
guid_row_norm = {}
for i, (guid, coinstalls) in enumerate(items):
tmp = dict(
[(k, v) for (k, v) in coinstalls.items() if v >= self.min_installs(db)]
)
db.set(FILTERED_COINSTALL_PREFIX + guid, json.dumps(tmp))
rowsum = sum(coinstalls.values())
for coinstall_guid, coinstall_count in coinstalls.items():
# Capture the total number of times a GUID was
# coinstalled with other guids
guid_count_map.setdefault(coinstall_guid, 0)
guid_count_map[coinstall_guid] += coinstall_count
# Capture the unique number of times a GUID is
# coinstalled with other guids
row_count.setdefault(coinstall_guid, 0)
row_count[coinstall_guid] += 1
if coinstall_guid not in guid_row_norm:
guid_row_norm[coinstall_guid] = []
guid_row_norm[coinstall_guid].append(1.0 * coinstall_count / rowsum)
db.set(COINSTALL_PREFIX + guid, json.dumps(coinstalls))
if i % 1000 == 0:
self.logger.info(
f"Loaded {i+1} of {len_items} GUID-GUID coinstall records into redis"
)
self.logger.info("guidmaps computed - saving to redis")
for guid, guid_count in guid_count_map.items():
db.set(NORMDATA_COUNT_MAP_PREFIX + guid, json.dumps(guid_count))
for coinstall_guid, coinstall_count in row_count.items():
db.set(
NORMDATA_ROWCOUNT_PREFIX + coinstall_guid, json.dumps(coinstall_count),
)
for coinstall_guid, norm_val in guid_row_norm.items():
db.set(
NORMDATA_GUID_ROW_NORM_PREFIX + coinstall_guid, json.dumps(norm_val),
)
self.logger.info("finished saving guidmaps to redis")
def get_filtered_coinstall(self, guid, default=None):
tmp = self._db().get(FILTERED_COINSTALL_PREFIX + guid)
if tmp:
raw_dict = json.loads(tmp.decode("utf8"))
# This truncates the size of the coinstall list for
# performance reasons
return dict(
sorted(raw_dict.items(), key=lambda x: x[1], reverse=True)[
:TAARLITE_TRUNCATE
]
)
return default
def get_rankings(self, guid, default=None):
"""
Return the rankings
"""
tmp = self._db().get(RANKING_PREFIX + guid)
if tmp:
return json.loads(tmp.decode("utf8"))
return default
def has_coinstalls_for(self, guid):
return self._db().get(COINSTALL_PREFIX + guid) is not None
def get_coinstalls(self, guid, default=None):
"""
Return a map of GUID:install count that represents the
coinstallation map for a particular addon GUID
"""
tmp = self._db().get(COINSTALL_PREFIX + guid)
if tmp:
return json.loads(tmp.decode("utf8"))
return default
def key_iter_ranking(self):
return PrefixStripper(
RANKING_PREFIX, self._db().scan_iter(match=RANKING_PREFIX + "*")
)
def key_iter_coinstall(self):
return PrefixStripper(
COINSTALL_PREFIX, self._db().scan_iter(match=COINSTALL_PREFIX + "*")
)
def is_active(self):
"""
return True if data is loaded
"""
# Any value in ACTIVE_DB indicates that data is live
return self._r0.get(ACTIVE_DB) is not None
def wait_for_data(self):
while True:
if self.is_active():
break
self.logger.info("waiting for data. spinlock active")
time.sleep(1)
self.logger.info("finished waiting for data")
def _db(self):
"""
This dereferences the ACTIVE_DB pointer to get the current
active redis instance
"""
active_db = self._r0.get(ACTIVE_DB)
if active_db is not None:
db = int(active_db.decode("utf8"))
if db == 1:
return self._r1
elif db == 2:
return self._r2

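Two patterns in this new module are worth calling out. First, the data is double-buffered: db1 and db2 hold alternating generations of the precomputed tables, while db0 holds the `active_db` pointer plus the lock keys, so readers never observe a half-loaded dataset. Second, the locks use the standard redis `SET ... NX EX` idiom, where the autoexpiry guards against a crashed lock holder wedging the system. A condensed sketch of both, simplified from the class above (error handling and logging omitted):

```python
import os
import threading

import redis

r0 = redis.Redis(db=0)  # pointer + lock keys only
buffers = {1: redis.Redis(db=1), 2: redis.Redis(db=2)}
ident = f"{os.getpid()}_{threading.get_ident()}"

def load_new_generation(load_fn):
    # Rebuild whichever buffer is NOT currently active, then flip the
    # pointer; readers dereference active_db on every access.
    active = int(r0.get("active_db") or 2)
    nxt = 1 if active == 2 else 2
    buffers[nxt].flushdb()
    load_fn(buffers[nxt])
    r0.set("active_db", nxt)

def run_with_mutex(critical_fn, ttl=600):
    # NX: set only if absent; EX: autoexpire so a crashed lock holder
    # cannot block updates forever.
    if r0.set("update_mutex", ident, nx=True, ex=ttl):
        try:
            critical_fn()
        finally:
            r0.delete("update_mutex")
```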

@ -1,50 +0,0 @@
from decouple import config
TAAR_ENSEMBLE_BUCKET = config(
"TAAR_ENSEMBLE_BUCKET", default="test_ensemble_bucket"
)
TAAR_ENSEMBLE_KEY = config("TAAR_ENSEMBLE_KEY", default="test_ensemble_key")
TAAR_WHITELIST_BUCKET = config(
"TAAR_WHITELIST_BUCKET", default="test_whitelist_bucket"
)
TAAR_WHITELIST_KEY = config("TAAR_WHITELIST_KEY", default="test_whitelist_key")
TAAR_ITEM_MATRIX_BUCKET = config(
"TAAR_ITEM_MATRIX_BUCKET", default="test_matrix_bucket"
)
TAAR_ITEM_MATRIX_KEY = config("TAAR_ITEM_MATRIX_KEY", default="test_matrix_key")
TAAR_ADDON_MAPPING_BUCKET = config(
"TAAR_ADDON_MAPPING_BUCKET", default="test_mapping_bucket"
)
TAAR_ADDON_MAPPING_KEY = config(
"TAAR_ADDON_MAPPING_KEY", default="test_mapping_key"
)
TAAR_LOCALE_BUCKET = config("TAAR_LOCALE_BUCKET", default="test_locale_bucket")
TAAR_LOCALE_KEY = config("TAAR_LOCALE_KEY", default="test_locale_key")
TAAR_SIMILARITY_BUCKET = config(
"TAAR_SIMILARITY_BUCKET", default="test_similarity_bucket"
)
TAAR_SIMILARITY_DONOR_KEY = config(
"TAAR_SIMILARITY_DONOR_KEY", default="test_similarity_donor_key"
)
TAAR_SIMILARITY_LRCURVES_KEY = config(
"TAAR_SIMILARITY_LRCURVES_KEY", default="test_similarity_lrcurves_key"
)
TAAR_EXPERIMENT_PROB = config("TAAR_EXPERIMENT_PROB", default=0.0)
# TAAR-lite configuration below
TAARLITE_GUID_COINSTALL_BUCKET = config(
"TAARLITE_GUID_COINSTALL_BUCKET", "telemetry-parquet"
)
TAARLITE_GUID_COINSTALL_KEY = config(
"TAARlLITE_GUID_COINSTALL_KEY", "taar/lite/guid_coinstallation.json"
)
TAARLITE_GUID_RANKING_KEY = "taar/lite/guid_install_ranking.json"


@ -9,9 +9,11 @@ from srgutil.interfaces import IMozLogging
import numpy as np
from .lazys3 import LazyJSONLoader
from .s3config import TAAR_SIMILARITY_BUCKET
from .s3config import TAAR_SIMILARITY_DONOR_KEY
from .s3config import TAAR_SIMILARITY_LRCURVES_KEY
from taar.settings import (
TAAR_SIMILARITY_BUCKET,
TAAR_SIMILARITY_DONOR_KEY,
TAAR_SIMILARITY_LRCURVES_KEY,
)
import markus

taar/settings.py (new file, 78 lines)

@ -0,0 +1,78 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
from decouple import config
REDIS_HOST = config("REDIS_HOST", "localhost", cast=str)
REDIS_PORT = config("REDIS_PORT", 6379, cast=int)
# These are configurations that are specific to the TAAR library
TAAR_MAX_RESULTS = config("TAAR_MAX_RESULTS", default=10, cast=int)
TAARLITE_MAX_RESULTS = config("TAARLITE_MAX_RESULTS", default=4, cast=int)
STATSD_HOST = config("STATSD_HOST", default="localhost", cast=str)
STATSD_PORT = config("STATSD_PORT", default=8125, cast=int)
TAAR_ENSEMBLE_BUCKET = config("TAAR_ENSEMBLE_BUCKET", default="test_ensemble_bucket")
TAAR_ENSEMBLE_KEY = config("TAAR_ENSEMBLE_KEY", default="test_ensemble_key")
TAAR_WHITELIST_BUCKET = config("TAAR_WHITELIST_BUCKET", default="test_whitelist_bucket")
TAAR_WHITELIST_KEY = config("TAAR_WHITELIST_KEY", default="test_whitelist_key")
TAAR_ITEM_MATRIX_BUCKET = config(
"TAAR_ITEM_MATRIX_BUCKET", default="test_matrix_bucket"
)
TAAR_ITEM_MATRIX_KEY = config("TAAR_ITEM_MATRIX_KEY", default="test_matrix_key")
TAAR_ADDON_MAPPING_BUCKET = config(
"TAAR_ADDON_MAPPING_BUCKET", default="test_mapping_bucket"
)
TAAR_ADDON_MAPPING_KEY = config("TAAR_ADDON_MAPPING_KEY", default="test_mapping_key")
TAAR_LOCALE_BUCKET = config("TAAR_LOCALE_BUCKET", default="test_locale_bucket")
TAAR_LOCALE_KEY = config("TAAR_LOCALE_KEY", default="test_locale_key")
TAAR_SIMILARITY_BUCKET = config(
"TAAR_SIMILARITY_BUCKET", default="test_similarity_bucket"
)
TAAR_SIMILARITY_DONOR_KEY = config(
"TAAR_SIMILARITY_DONOR_KEY", default="test_similarity_donor_key"
)
TAAR_SIMILARITY_LRCURVES_KEY = config(
"TAAR_SIMILARITY_LRCURVES_KEY", default="test_similarity_lrcurves_key"
)
TAAR_EXPERIMENT_PROB = config("TAAR_EXPERIMENT_PROB", default=0.0)
# TAAR-lite configuration below
TAARLITE_GUID_COINSTALL_BUCKET = config(
"TAARLITE_GUID_COINSTALL_BUCKET", "telemetry-parquet"
)
TAARLITE_GUID_COINSTALL_KEY = config(
"TAARlLITE_GUID_COINSTALL_KEY", "taar/lite/guid_coinstallation.json"
)
TAARLITE_GUID_RANKING_KEY = config(
"TAARLITE_GUID_RANKING_KEY", "taar/lite/guid_install_ranking.json"
)
# 4 hour liveness for TAARLITE data
TAARLITE_TTL = config("TAARLITE_TTL", 60 * 60 * 4, cast=int)
# TAARlite needs redis-backed mutexes to protect critical sections.
# Default TAARlite mutex TTL is 10 minutes (60 * 10).
TAARLITE_MUTEX_TTL = config("TAARLITE_MUTEX_TTL", 60 * 10, cast=int)
# Poll for TTL expiry every 60 seconds
TAARLITE_UPDATE_POLL = config("TAARLITE_UPDATE_POLL", 60, cast=int)
TAARLITE_TRUNCATE = config("TAARLITE_TRUNCATE", TAARLITE_MAX_RESULTS * 5, cast=int)
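
One knock-on effect worth noting: because the `TAARLITE_TRUNCATE` default is derived from `TAARLITE_MAX_RESULTS`, raising the result limit through the environment automatically widens the truncation window unless `TAARLITE_TRUNCATE` is pinned explicitly. A plain-Python sketch of the same derivation:

```python
import os

# Mirrors the decouple defaults above: the truncation window scales
# with the result limit unless TAARLITE_TRUNCATE is set explicitly.
max_results = int(os.environ.get("TAARLITE_MAX_RESULTS", 4))
truncate = int(os.environ.get("TAARLITE_TRUNCATE", max_results * 5))
print(max_results, truncate)  # "4 20" with no overrides
```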


@ -6,7 +6,7 @@ from taar.recommenders.ensemble_recommender import (
WeightCache,
EnsembleRecommender,
)
from taar.recommenders.s3config import (
from taar.settings import (
TAAR_ENSEMBLE_BUCKET,
TAAR_ENSEMBLE_KEY,
)


@ -1,18 +1,15 @@
import json
from moto import mock_s3
import boto3
import fakeredis
import pytest
import mock
import contextlib
from taar.recommenders.guid_based_recommender import GuidBasedRecommender
from taar.recommenders.redis_cache import AddonsCoinstallCache
from taar.recommenders.s3config import (
TAARLITE_GUID_COINSTALL_BUCKET,
TAARLITE_GUID_COINSTALL_KEY,
TAARLITE_GUID_RANKING_KEY,
)
from taar.recommenders.redis_cache import NORMDATA_GUID_ROW_NORM_PREFIX
from taar.recommenders.lazys3 import LazyJSONLoader
from taar.recommenders.ua_parser import parse_ua, OSNAME_TO_ID
@ -84,205 +81,209 @@ RESULTS = {
}
def install_mock_data(TAARLITE_MOCK_DATA, TAARLITE_MOCK_GUID_RANKING, test_ctx):
conn = boto3.resource("s3", region_name="us-west-2")
@contextlib.contextmanager
def mock_coinstall_ranking_context(mock_coinstall, mock_ranking):
with contextlib.ExitStack() as stack:
stack.enter_context(
mock.patch.object(
AddonsCoinstallCache, "fetch_ranking_data", return_value=mock_ranking,
)
)
stack.enter_context(
mock.patch.object(
AddonsCoinstallCache,
"fetch_coinstall_data",
return_value=mock_coinstall,
)
)
conn.create_bucket(Bucket=TAARLITE_GUID_COINSTALL_BUCKET)
conn.Object(TAARLITE_GUID_COINSTALL_BUCKET, TAARLITE_GUID_COINSTALL_KEY).put(
Body=json.dumps(TAARLITE_MOCK_DATA)
)
conn.Object(TAARLITE_GUID_COINSTALL_BUCKET, TAARLITE_GUID_RANKING_KEY).put(
Body=json.dumps(TAARLITE_MOCK_GUID_RANKING)
)
coinstall_loader = LazyJSONLoader(
test_ctx,
TAARLITE_GUID_COINSTALL_BUCKET,
TAARLITE_GUID_COINSTALL_KEY,
"guid_coinstall",
)
ranking_loader = LazyJSONLoader(
test_ctx,
TAARLITE_GUID_COINSTALL_BUCKET,
TAARLITE_GUID_RANKING_KEY,
"guid_ranking",
)
test_ctx["coinstall_loader"] = coinstall_loader
test_ctx["ranking_loader"] = ranking_loader
# Patch fakeredis in
stack.enter_context(
mock.patch.object(
AddonsCoinstallCache,
"init_redis_connections",
return_value={
0: fakeredis.FakeStrictRedis(),
1: fakeredis.FakeStrictRedis(),
2: fakeredis.FakeStrictRedis(),
},
)
)
yield stack
@mock_s3
def test_recommender_nonormal(test_ctx, TAARLITE_MOCK_DATA, TAARLITE_MOCK_GUID_RANKING):
with MetricsMock() as mm:
EXPECTED_RESULTS = RESULTS["default"]
install_mock_data(TAARLITE_MOCK_DATA, TAARLITE_MOCK_GUID_RANKING, test_ctx)
with mock_coinstall_ranking_context(TAARLITE_MOCK_DATA, TAARLITE_MOCK_GUID_RANKING):
recommender = GuidBasedRecommender(test_ctx)
with MetricsMock() as mm:
EXPECTED_RESULTS = RESULTS["default"]
recommender = GuidBasedRecommender(test_ctx)
guid = "guid-1"
guid = "guid-1"
actual = recommender.recommend({"guid": guid, "normalize": "none"})
assert actual == EXPECTED_RESULTS
actual = recommender.recommend({"guid": guid, "normalize": "none"})
assert actual == EXPECTED_RESULTS
mm.has_record(TIMING, "taar.guid_coinstall")
mm.has_record(TIMING, "taar.guid_ranking")
mm.has_record(TIMING, "taar.guid_recommendation")
mm.has_record(TIMING, "taar.guid_coinstall")
mm.has_record(TIMING, "taar.guid_ranking")
mm.has_record(TIMING, "taar.guid_recommendation")
@mock_s3
def test_row_count_recommender(
test_ctx, TAARLITE_MOCK_DATA, TAARLITE_MOCK_GUID_RANKING
):
EXPECTED_RESULTS = RESULTS["row_count"]
install_mock_data(TAARLITE_MOCK_DATA, TAARLITE_MOCK_GUID_RANKING, test_ctx)
with mock_coinstall_ranking_context(TAARLITE_MOCK_DATA, TAARLITE_MOCK_GUID_RANKING):
recommender = GuidBasedRecommender(test_ctx)
guid = "guid-2"
EXPECTED_RESULTS = RESULTS["row_count"]
actual = recommender.recommend({"guid": guid, "normalize": "row_count"})
recommender = GuidBasedRecommender(test_ctx)
guid = "guid-2"
# Note that guid-9 is not included because its weight is
# decreased 50% to 5
assert EXPECTED_RESULTS == actual
actual = recommender.recommend({"guid": guid, "normalize": "row_count"})
# Note that guid-9 is not included because its weight is
# decreased 50% to 5
assert EXPECTED_RESULTS == actual
@mock_s3
def test_rownorm_sumrownorm(test_ctx, TAARLITE_MOCK_DATA, TAARLITE_MOCK_GUID_RANKING):
EXPECTED_RESULTS = RESULTS["rownorm_sum"]
install_mock_data(TAARLITE_MOCK_DATA, TAARLITE_MOCK_GUID_RANKING, test_ctx)
with mock_coinstall_ranking_context(TAARLITE_MOCK_DATA, TAARLITE_MOCK_GUID_RANKING):
EXPECTED_RESULTS = RESULTS["rownorm_sum"]
recommender = GuidBasedRecommender(test_ctx)
guid = "guid-2"
recommender = GuidBasedRecommender(test_ctx)
guid = "guid-2"
default_actual = recommender.recommend({"guid": guid})
default_actual = recommender.recommend({"guid": guid})
actual = recommender.recommend({"guid": guid, "normalize": "rownorm_sum"})
actual = recommender.recommend({"guid": guid, "normalize": "rownorm_sum"})
# Default normalization is rownorm_sum
assert actual == default_actual
assert actual == EXPECTED_RESULTS
"""
Some notes on verifying guid-1:
# Default normalization is rownorm_sum
assert actual == default_actual
assert actual == EXPECTED_RESULTS
"""
Some notes on verifying guid-1:
Numerator is the row weighted value of guid-1 : 50/150
Denominator is the sum of the row weighted value of guid-1 in all
other rows
Numerator is the row weighted value of guid-1 : 50/150
Denominator is the sum of the row weighted value of guid-1 in all
other rows
(guid-2) 50/150
(guid-3) 100/210
(guid-6) 5/305
(guid-2) 50/150
(guid-3) 100/210
(guid-6) 5/305
This gives us: [0.3333333333333333,
0.47619047619047616,
0.01639344262295082]
This gives us: [0.3333333333333333,
0.47619047619047616,
0.01639344262295082]
so the final result should be (50/150) / (50/150 + 100/210 + 5/305)
so the final result should be (50/150) / (50/150 + 100/210 + 5/305)
That gives a final expected weight for guid-1 to be: 0.403591682
"""
expected = 0.403591682
actual = float(actual[1][1][:-11])
assert expected == pytest.approx(actual, rel=1e-3)
That gives a final expected weight for guid-1 to be: 0.403591682
"""
expected = 0.403591682
actual = float(actual[1][1][:-11])
assert expected == pytest.approx(actual, rel=1e-3)
@mock_s3
def test_rowsum_recommender(test_ctx, TAARLITE_MOCK_DATA, TAARLITE_MOCK_GUID_RANKING):
EXPECTED_RESULTS = RESULTS["row_sum"]
install_mock_data(TAARLITE_MOCK_DATA, TAARLITE_MOCK_GUID_RANKING, test_ctx)
with mock_coinstall_ranking_context(TAARLITE_MOCK_DATA, TAARLITE_MOCK_GUID_RANKING):
EXPECTED_RESULTS = RESULTS["row_sum"]
recommender = GuidBasedRecommender(test_ctx)
guid = "guid-2"
recommender = GuidBasedRecommender(test_ctx)
guid = "guid-2"
actual = recommender.recommend({"guid": guid, "normalize": "row_sum"})
assert 4 == len(actual)
actual = recommender.recommend({"guid": guid, "normalize": "row_sum"})
assert 4 == len(actual)
expected_val = 50 / 155
actual_val = float(actual[0][1][:-11])
assert expected_val == pytest.approx(actual_val, rel=1e-3)
expected_val = 50 / 155
actual_val = float(actual[0][1][:-11])
assert expected_val == pytest.approx(actual_val, rel=1e-3)
assert actual == EXPECTED_RESULTS
assert actual == EXPECTED_RESULTS
@mock_s3
def test_guidception(test_ctx, TAARLITE_MOCK_DATA, TAARLITE_MOCK_GUID_RANKING):
EXPECTED_RESULTS = RESULTS["guidception"]
install_mock_data(TAARLITE_MOCK_DATA, TAARLITE_MOCK_GUID_RANKING, test_ctx)
with mock_coinstall_ranking_context(TAARLITE_MOCK_DATA, TAARLITE_MOCK_GUID_RANKING):
recommender = GuidBasedRecommender(test_ctx)
guid = "guid-2"
EXPECTED_RESULTS = RESULTS["guidception"]
actual = recommender.recommend({"guid": guid, "normalize": "guidception"})
assert actual == EXPECTED_RESULTS
recommender = GuidBasedRecommender(test_ctx)
guid = "guid-2"
actual = recommender.recommend({"guid": guid, "normalize": "guidception"})
assert actual == EXPECTED_RESULTS
@mock_s3
def test_rownorm_sum_tiebreak(
test_ctx, TAARLITE_TIE_MOCK_DATA, TAARLITE_MOCK_GUID_RANKING
):
EXPECTED_RESULTS = RESULTS["rownorm_sum_tiebreak"]
install_mock_data(TAARLITE_TIE_MOCK_DATA, TAARLITE_MOCK_GUID_RANKING, test_ctx)
with mock_coinstall_ranking_context(
TAARLITE_TIE_MOCK_DATA, TAARLITE_MOCK_GUID_RANKING
):
EXPECTED_RESULTS = RESULTS["rownorm_sum_tiebreak"]
recommender = GuidBasedRecommender(test_ctx)
guid = "guid-2"
recommender = GuidBasedRecommender(test_ctx)
guid = "guid-2"
actual = recommender.recommend({"guid": guid, "normalize": "rownorm_sum"})
actual = recommender.recommend({"guid": guid, "normalize": "rownorm_sum"})
# Note that the results have weights that are equal, but the tie
# break is solved by the install rate.
assert actual == EXPECTED_RESULTS
# Note that the results have weights that are equal, but the tie
# break is solved by the install rate.
assert actual == EXPECTED_RESULTS
@mock_s3
def test_missing_rownorm_data_issue_31(
test_ctx, TAARLITE_TIE_MOCK_DATA, TAARLITE_MOCK_GUID_RANKING
):
install_mock_data(TAARLITE_TIE_MOCK_DATA, TAARLITE_MOCK_GUID_RANKING, test_ctx)
recommender = GuidBasedRecommender(test_ctx)
with mock_coinstall_ranking_context(
TAARLITE_TIE_MOCK_DATA, TAARLITE_MOCK_GUID_RANKING
):
EXPECTED_RESULTS = RESULTS["rownorm_sum_tiebreak"]
recommender = GuidBasedRecommender(test_ctx)
# Explicitly destroy the guid-4 key in the row_norm data
del recommender._guid_maps["guid_row_norm"]["guid-4"]
for i, row in enumerate(EXPECTED_RESULTS):
if row[0] == "guid-4":
del EXPECTED_RESULTS[i]
break
EXPECTED_RESULTS = RESULTS["rownorm_sum_tiebreak"]
guid = "guid-2"
# Explicitly destroy the guid-4 key in the row_norm data
recommender._redis_cache._db().delete(NORMDATA_GUID_ROW_NORM_PREFIX + "guid-4")
for i, row in enumerate(EXPECTED_RESULTS):
if row[0] == "guid-4":
del EXPECTED_RESULTS[i]
break
actual = recommender.recommend({"guid": guid, "normalize": "rownorm_sum"})
guid = "guid-2"
assert actual == EXPECTED_RESULTS
actual = recommender.recommend({"guid": guid, "normalize": "rownorm_sum"})
assert actual == EXPECTED_RESULTS
@mock_s3
def test_divide_by_zero_rownorm_data_issue_31(
test_ctx, TAARLITE_TIE_MOCK_DATA, TAARLITE_MOCK_GUID_RANKING
):
install_mock_data(TAARLITE_TIE_MOCK_DATA, TAARLITE_MOCK_GUID_RANKING, test_ctx)
recommender = GuidBasedRecommender(test_ctx)
with mock_coinstall_ranking_context(
TAARLITE_TIE_MOCK_DATA, TAARLITE_MOCK_GUID_RANKING
):
EXPECTED_RESULTS = RESULTS["rownorm_sum_tiebreak"]
recommender = GuidBasedRecommender(test_ctx)
# Explicitly set the guid-4 key in the row_norm data to have a sum
# of zero weights
recommender._guid_maps["guid_row_norm"]["guid-4"] = [0, 0, 0]
EXPECTED_RESULTS = RESULTS["rownorm_sum_tiebreak"]
# Destroy the guid-4 key in the expected results as a sum of 0
# will generate a divide by zero error
for i, row in enumerate(EXPECTED_RESULTS):
if row[0] == "guid-4":
del EXPECTED_RESULTS[i]
break
# Explicitly set the guid-4 key in the row_norm data to have a sum
# of zero weights
recommender._redis_cache._db().set(
NORMDATA_GUID_ROW_NORM_PREFIX + "guid-4", json.dumps([0, 0, 0])
)
guid = "guid-2"
# Destroy the guid-4 key in the expected results as a sum of 0
# will generate a divide by zero error
for i, row in enumerate(EXPECTED_RESULTS):
if row[0] == "guid-4":
del EXPECTED_RESULTS[i]
break
actual = recommender.recommend({"guid": guid, "normalize": "rownorm_sum"})
guid = "guid-2"
assert actual == EXPECTED_RESULTS
actual = recommender.recommend({"guid": guid, "normalize": "rownorm_sum"})
assert actual == EXPECTED_RESULTS
def test_user_agent_strings():

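The fixture pattern these rewritten tests lean on, shown in isolation: `fetch_ranking_data` and `fetch_coinstall_data` are patched to return in-memory fixtures, and `init_redis_connections` is patched to hand back `fakeredis` instances, so `AddonsCoinstallCache` exercises its full load path without S3 or a live redis server. A stripped-down sketch, mirroring `mock_coinstall_ranking_context` above:

```python
import contextlib

import fakeredis
import mock

from taar.recommenders.redis_cache import AddonsCoinstallCache

@contextlib.contextmanager
def fake_cache_backends(mock_coinstall, mock_ranking):
    with contextlib.ExitStack() as stack:
        # Serve fixture data instead of hitting S3.
        stack.enter_context(
            mock.patch.object(
                AddonsCoinstallCache, "fetch_ranking_data", return_value=mock_ranking
            )
        )
        stack.enter_context(
            mock.patch.object(
                AddonsCoinstallCache, "fetch_coinstall_data", return_value=mock_coinstall
            )
        )
        # Swap the three real redis databases for in-process fakes.
        stack.enter_context(
            mock.patch.object(
                AddonsCoinstallCache,
                "init_redis_connections",
                return_value={
                    0: fakeredis.FakeStrictRedis(),
                    1: fakeredis.FakeStrictRedis(),
                    2: fakeredis.FakeStrictRedis(),
                },
            )
        )
        yield
```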

@ -6,11 +6,13 @@
Test cases for the TAAR Hybrid recommender
"""
import pytest
from taar.recommenders.hybrid_recommender import CuratedRecommender
from taar.recommenders.hybrid_recommender import HybridRecommender
from taar.recommenders.ensemble_recommender import EnsembleRecommender
from taar.recommenders.s3config import TAAR_WHITELIST_BUCKET, TAAR_WHITELIST_KEY
from taar.settings import TAAR_WHITELIST_BUCKET, TAAR_WHITELIST_KEY
# from taar.recommenders.hybrid_recommender import ENSEMBLE_WEIGHTS
from .test_ensemblerecommender import install_mock_ensemble_data
@ -112,6 +114,7 @@ def test_hybrid_recommendations(test_ctx):
assert len(guid_list) == LIMIT
@pytest.mark.skip(reason="this test seems to break sporadically")
@mock_s3
def test_stable_hybrid_results(test_ctx):
# verify that the recommendations mix the curated and


@ -6,7 +6,9 @@ from flask import url_for
import pytest
from taar.settings import TAARLITE_MAX_RESULTS
from taar.context import default_context
from .test_guid_based_recommender import mock_coinstall_ranking_context
try:
from unittest.mock import MagicMock
@ -48,11 +50,7 @@ def test_only_promoted_addons_post(client, app):
res = client.post(
"/v1/api/recommendations/not_a_real_hash/",
json=dict(
{
"options": {
"promoted": [["guid1", 10], ["guid2", 5], ["guid55", 8]]
}
}
{"options": {"promoted": [["guid1", 10], ["guid2", 5], ["guid55", 8]]}}
),
follow_redirects=True,
)
@ -101,9 +99,7 @@ class ProfileFetcherEnabledRecommendationManager(FakeRecommendationManager):
def __init__(self, *args, **kwargs):
self._ctx = default_context()
self._ctx["profile_fetcher"] = kwargs["profile_fetcher"]
super(ProfileFetcherEnabledRecommendationManager, self).__init__(
args, kwargs
)
super(ProfileFetcherEnabledRecommendationManager, self).__init__(args, kwargs)
@pytest.fixture
@ -115,9 +111,7 @@ def locale_recommendation_manager(monkeypatch):
import taar.flask_app
taar.flask_app.APP_WRAPPER.set(
{"PROXY_RESOURCE": LocaleRecommendationManager()}
)
taar.flask_app.APP_WRAPPER.set({"PROXY_RESOURCE": LocaleRecommendationManager()})
@pytest.fixture
@ -129,9 +123,7 @@ def empty_recommendation_manager(monkeypatch):
import taar.flask_app
taar.flask_app.APP_WRAPPER.set(
{"PROXY_RESOURCE": EmptyRecommendationManager()}
)
taar.flask_app.APP_WRAPPER.set({"PROXY_RESOURCE": EmptyRecommendationManager()})
@pytest.fixture
@ -143,9 +135,7 @@ def platform_recommendation_manager(monkeypatch):
import taar.flask_app
taar.flask_app.APP_WRAPPER.set(
{"PROXY_RESOURCE": PlatformRecommendationManager()}
)
taar.flask_app.APP_WRAPPER.set({"PROXY_RESOURCE": PlatformRecommendationManager()})
@pytest.fixture
@ -157,9 +147,7 @@ def static_recommendation_manager(monkeypatch):
import taar.flask_app
taar.flask_app.APP_WRAPPER.set(
{"PROXY_RESOURCE": StaticRecommendationManager()}
)
taar.flask_app.APP_WRAPPER.set({"PROXY_RESOURCE": StaticRecommendationManager()})
@pytest.fixture
@ -235,9 +223,7 @@ def test_simple_request(client, static_recommendation_manager):
assert response.data == expected
def test_mixed_and_promoted_and_taar_adodns(
client, static_recommendation_manager
):
def test_mixed_and_promoted_and_taar_adodns(client, static_recommendation_manager):
"""
Test that we can provide addon suggestions that also get clobbered
by the promoted addon set.
@ -246,11 +232,7 @@ def test_mixed_and_promoted_and_taar_adodns(
res = client.post(
url,
json=dict(
{
"options": {
"promoted": [["guid1", 10], ["guid2", 5], ["guid55", 8]]
}
}
{"options": {"promoted": [["guid1", 10], ["guid2", 5], ["guid55", 8]]}}
),
follow_redirects=True,
)
@ -281,11 +263,7 @@ def test_overlapping_mixed_and_promoted_and_taar_adodns(
json=dict(
{
"options": {
"promoted": [
["test-addon-1", 10],
["guid2", 5],
["guid55", 8],
]
"promoted": [["test-addon-1", 10], ["guid2", 5], ["guid55", 8],]
}
}
),
@ -293,13 +271,7 @@ def test_overlapping_mixed_and_promoted_and_taar_adodns(
)
# The result should order the GUIDs in descending order of weight
expected = {
"results": [
"test-addon-1",
"guid55",
"guid2",
"test-addon-2",
"test-addon-N",
]
"results": ["test-addon-1", "guid55", "guid2", "test-addon-2", "test-addon-N",]
}
assert res.json == expected
@ -351,3 +323,17 @@ def test_client_has_no_addon(client, profile_enabled_rm):
res = client.get(url, follow_redirects=True)
assert res.json["results"] is False
def test_taarlite(client, TAARLITE_MOCK_DATA, TAARLITE_MOCK_GUID_RANKING, test_ctx):
"""
Check that the result size of taarlite is TAARLITE_MAX_RESULTS
"""
with mock_coinstall_ranking_context(TAARLITE_MOCK_DATA, TAARLITE_MOCK_GUID_RANKING):
url = url_for("taarlite_recommendations", guid="guid-1",)
res = client.get(url, follow_redirects=True)
assert len(res.json["results"]) == TAARLITE_MAX_RESULTS
assert res.json["results"] == ["guid-5", "guid-6", "guid-3", "guid-2"]


@ -10,7 +10,7 @@ import boto3
from moto import mock_s3
from taar.recommenders.s3config import (
from taar.settings import (
TAAR_SIMILARITY_BUCKET,
TAAR_SIMILARITY_DONOR_KEY,
)
@ -90,5 +90,5 @@ def test_force_expiry(test_ctx):
def check_jdata_status(jdata, status):
assert jdata == {'test': 'donor_key'}
assert jdata == {"test": "donor_key"}
assert status


@ -9,7 +9,7 @@ import json
from taar.recommenders import LocaleRecommender
from taar.recommenders.s3config import TAAR_LOCALE_KEY, TAAR_LOCALE_BUCKET
from taar.settings import TAAR_LOCALE_KEY, TAAR_LOCALE_BUCKET
from markus import TIMING
from markus.testing import MetricsMock


@ -25,7 +25,7 @@ from .similarity_data import CATEGORICAL_FEATURE_FIXTURE_DATA
from markus import TIMING
from markus.testing import MetricsMock
from taar.recommenders.s3config import (
from taar.settings import (
TAAR_SIMILARITY_BUCKET,
TAAR_SIMILARITY_DONOR_KEY,
TAAR_SIMILARITY_LRCURVES_KEY,