Remerge #157 for weighted randomization (#171)

* Unified patch for #157 * Update ETL job links Update documentation and removed unnecessary env variables. Split up some test cases
2020-07-06 14:19:18 -04:00 · 2020-07-06 14:19:18 -04:00 · b185583d42
--- a/README.md
+++ b/README.md
@ -51,9 +51,9 @@ This is the ordered list of the currently supported models:
 | Order | Model | Description | Conditions | Generator job |
 |-------|-------|-------------|------------|---------------|
 | 1 | [Collaborative](taar/recommenders/collaborative_recommender.py) | recommends add-ons based on add-ons installed by other users (i.e. [collaborative filtering](https://en.wikipedia.org/wiki/Collaborative_filtering))|Telemetry data is available for the user and the user has at least one enabled add-on|[source](https://github.com/mozilla/telemetry-batch-view/blob/master/src/main/scala/com/mozilla/telemetry/ml/AddonRecommender.scala)|
-| 2 | [Similarity](taar/recommenders/similarity_recommender.py) | recommends add-ons based on add-ons installed by similar representative users|Telemetry data is available for the user and a suitable representative donor can be found|[source](https://github.com/mozilla/python_mozetl/blob/master/mozetl/taar/taar_similarity.py)|
-| 3 | [Locale](taar/recommenders/locale_recommender.py) |recommends add-ons based on the top addons for the user's locale|Telemetry data is available for the user and the locale has enough users|[source](https://github.com/mozilla/python_mozetl/blob/master/mozetl/taar/taar_locale.py)|
-| 4 | [Ensemble](taar/recommenders/ensemble_recommender.py) &#42;|recommends add-ons based on the combined (by [stacked generalization](https://en.wikipedia.org/wiki/Ensemble_learning#Stacking)) recomendations of other available recommender modules.|More than one of the other Models are available to provide recommendations.|[source](https://github.com/mozilla/python_mozetl/blob/master/mozetl/taar/taar_ensemble.py)|
+| 2 | [Similarity](taar/recommenders/similarity_recommender.py) | recommends add-ons based on add-ons installed by similar representative users|Telemetry data is available for the user and a suitable representative donor can be found|[source](https://github.com/mozilla/telemetry-airflow/blob/master/jobs/taar_similarity.py)|
+| 3 | [Locale](taar/recommenders/locale_recommender.py) |recommends add-ons based on the top addons for the user's locale|Telemetry data is available for the user and the locale has enough users|[source](https://github.com/mozilla/telemetry-airflow/blob/master/jobs/taar_locale.py)|
+| 4 | [Ensemble](taar/recommenders/ensemble_recommender.py) &#42;|recommends add-ons based on the combined (by [stacked generalization](https://en.wikipedia.org/wiki/Ensemble_learning#Stacking)) recommendations of other available recommender modules.|More than one of the other Models are available to provide recommendations.|[source](https://github.com/mozilla/telemetry-airflow/blob/master/jobs/taar_ensemble.py)|

 All jobs are scheduled in Mozilla's instance of
 [Airflow](https://github.com/mozilla/telemetry-airflow).  The
--- a/docs/randomized_tails.md
+++ b/docs/randomized_tails.md
@ -0,0 +1,16 @@
+# Randomized tail selection of addons
+
+The `TAAR_EXPERIMENT_PROB` setting is the probability that a user is
+placed in an experiment and gets randomized recommendations.
+
+"Randomized" does not mean that the recommendations are fully
+random.  The weight of each recommendation is first normalized
+so that the weights sum to 1.0.
+
+Using `numpy.random.choice` - we then select a non-uniform random
+sample from the list of suggestions without replacement.  Weights are
+used to define a vector of probabilities.
+
+
+By default - TAAR_EXPERIMENT_PROB is set to 0.0 which in effect
+disables the randomization feature.
--- a/taar/profile_fetcher.py
+++ b/taar/profile_fetcher.py
@ -11,7 +11,6 @@ import json
 import zlib
 import datetime

-
 BIGTABLE_PROJECT_ID = config(
    "BIGTABLE_PROJECT_ID", default="cfr-personalization-experiment"
 )
--- a/taar/recommenders/init.py
+++ b/taar/recommenders/init.py
@ -2,12 +2,14 @@ from .collaborative_recommender import CollaborativeRecommender
 from .locale_recommender import LocaleRecommender
 from .similarity_recommender import SimilarityRecommender
 from .recommendation_manager import RecommendationManager, RecommenderFactory
+from .fixtures import hasher  # noqa


 __all__ = [
-    'CollaborativeRecommender',
-    'LocaleRecommender',
-    'SimilarityRecommender',
-    'RecommendationManager',
-    'RecommenderFactory',
+    "CollaborativeRecommender",
+    "LocaleRecommender",
+    "SimilarityRecommender",
+    "RecommendationManager",
+    "RecommenderFactory",
+    "hasher",
 ]
--- a/taar/recommenders/ensemble_recommender.py
+++ b/taar/recommenders/ensemble_recommender.py
@ -7,9 +7,17 @@ import itertools
 from .base_recommender import AbstractRecommender
 from .lazys3 import LazyJSONLoader

+from .s3config import TAAR_WHITELIST_BUCKET
+from .s3config import TAAR_WHITELIST_KEY
 from .s3config import TAAR_ENSEMBLE_BUCKET
 from .s3config import TAAR_ENSEMBLE_KEY

+from .fixtures import hasher
+
+
+def is_test_client(client_id):
+    """
+    Return True when every character of `client_id`, ignoring dashes,
+    is the same character (for example "11111111-1111").
+    """
+    distinct_chars = set(client_id.replace("-", ""))
+    return len(distinct_chars) == 1
+

 class WeightCache:
    def __init__(self, ctx):
@ -48,6 +56,10 @@ class EnsembleRecommender(AbstractRecommender):
        for rkey in self.RECOMMENDER_KEYS:
            self._recommender_map[rkey] = recommender_factory.create(rkey)

+        self._whitelist_data = LazyJSONLoader(
+            self._ctx, TAAR_WHITELIST_BUCKET, TAAR_WHITELIST_KEY
+        )
+
        self._weight_cache = WeightCache(self._ctx.child())
        self.logger.info("EnsembleRecommender initialized")

@ -64,18 +76,26 @@ class EnsembleRecommender(AbstractRecommender):
        return result

    def recommend(self, client_data, limit, extra_data={}):
-        try:
-            results = self._recommend(client_data, limit, extra_data)
-        except Exception as e:
-            results = []
-            self._weight_cache._weights.force_expiry()
-            self.logger.exception(
-                "Ensemble recommender crashed for {}".format(
-                    client_data.get("client_id", "no-client-id")
-                ),
-                e,
-            )
+        """
+        Return the recommendations for the client in `client_data`.
+
+        Client IDs made of a single repeated character (see
+        is_test_client) bypass the ensemble: they get the first
+        `limit` whitelisted GUIDs, each paired with a weight between
+        0.0 and 0.99 derived from the sha256 hash of the GUID, so the
+        result is stable between calls.
+
+        Every other client is delegated to _recommend().  If that
+        raises, the cached ensemble weights are expired, the error is
+        logged and an empty list is returned.
+        """
+        client_id = client_data.get("client_id", "no-client-id")

+        if is_test_client(client_id):
+            whitelist = self._whitelist_data.get()[0]
+            samples = whitelist[:limit]
+            self.logger.info("Test ID detected [{}]".format(client_id))
+
+            # Compute a stable weight for any whitelisted addon based
+            # on the sha256 hash of the GUID
+            p = [(int(hasher(s), 16) % 100) / 100.0 for s in samples]
+            results = list(zip(samples, p))
+        else:
+            try:
+                results = self._recommend(client_data, limit, extra_data)
+            except Exception:
+                results = []
+                self._weight_cache._weights.force_expiry()
+                # logger.exception() records the active traceback by
+                # itself.  The exception must not be passed as an
+                # extra positional argument: the message has no
+                # %-placeholder for it, so formatting the record
+                # would fail and the message would be lost.
+                # NOTE(review): assumes self.logger behaves like a
+                # stdlib logging.Logger - confirm.
+                self.logger.exception(
+                    "Ensemble recommender crashed for {}".format(client_id)
+                )
        return results

    def _recommend(self, client_data, limit, extra_data={}):
@ -120,7 +140,9 @@ class EnsembleRecommender(AbstractRecommender):

        # group by the guid, sum up the weights for recurring GUID
        # suggestions across all recommenders
-        guid_grouper = itertools.groupby(flattened_results, lambda item: item[0])
+        guid_grouper = itertools.groupby(
+            flattened_results, lambda item: item[0]
+        )

        ensemble_suggestions = []
        for (guid, guid_group) in guid_grouper:
@ -141,10 +163,12 @@ class EnsembleRecommender(AbstractRecommender):

        log_data = (
            client_data["client_id"],
+            extra_data.get("guid_randomization", False),
            str(ensemble_weights),
            str([r[0] for r in results]),
        )
        self.logger.info(
-            "client_id: [%s], ensemble_weight: [%s], guids: [%s]" % log_data
+            "client_id: [%s], guid_randomization: [%s], ensemble_weight: [%s], guids: [%s]"
+            % log_data
        )
        return results
--- a/taar/recommenders/fixtures.py
+++ b/taar/recommenders/fixtures.py
@ -0,0 +1,14 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+"""
+These are fixtures that are used for testing TAAR in a production
+environment with known stable client_ids
+"""
+
+import hashlib
+
+
+def hasher(client_id):
+    """Return the hex encoded sha256 digest of the UTF-8 encoded `client_id`."""
+    digest = hashlib.sha256()
+    digest.update(client_id.encode("utf8"))
+    return digest.hexdigest()
--- a/taar/recommenders/hybrid_recommender.py
+++ b/taar/recommenders/hybrid_recommender.py
@ -58,6 +58,7 @@ class CuratedRecommender(AbstractRecommender):
    def recommend(self, client_data, limit, extra_data={}):
        """
        Curated recommendations are just random selections
+        from the whitelist and we explicitly set the weighting to 1.0
        """
        guids = self._curated_wl.get_randomized_guid_sample(limit)

--- a/taar/recommenders/randomizer.py
+++ b/taar/recommenders/randomizer.py
@ -0,0 +1,40 @@
+"""
+This module re-orders the (GUID, weight) 2-tuples using
+numpy.random.choice
+"""
+
+import numpy as np
+
+
+def in_experiment(client_id, xp_prob=0.5):
+    """
+    Return whether or not this client_id is in the experiment.
+
+    The hexadecimal digits of client_id are read as one integer and
+    reduced modulo 100 to a bucket between 0 and 99.  The client is in
+    the experiment when its bucket is strictly below xp_prob * 100, so
+    the answer is stable for a given client_id and xp_prob.
+
+    xp_prob is a probability between 0.0 and 1.0 which is the
+    chance that the experimental branch is selected.  0.0 selects
+    nobody and 1.0 selects everybody.  Any value accepted by float()
+    (for example a string read from the environment) is converted.
+
+    Returns 1 when the client is in the experiment, otherwise 0.
+    """
+    hex_client = "".join(
+        c for c in client_id.lower() if c in "abcdef0123456789"
+    )
+    # A client_id without a single hex digit falls into bucket 0
+    # instead of raising ValueError from int("", 16).
+    int_client = int(hex_client, 16) if hex_client else 0
+    # Strict comparison: with `<=` bucket 0 was selected even for
+    # xp_prob == 0.0 and every probability was one bucket too large.
+    return int((int_client % 100) < (float(xp_prob) * 100))
+
+
+def reorder_guids(guid_weight_tuples, size=None):
+    """
+    This reorders (GUID, weight) 2-tuples based on the weight using
+    random selection, without replacement.
+
+    @guid_weight_tuples is a sequence of (GUID, weight) 2-tuples with
+    non-negative weights.
+
+    @size denotes the length of the output.  It defaults to, and is
+    capped at, the number of input tuples.
+
+    Returns a list of the input tuples.  Tuples with a non-zero weight
+    are drawn first, with a probability proportional to their weight;
+    zero weighted tuples only fill the remaining slots, in uniformly
+    random order.  An empty input yields an empty list.
+    """
+    items = list(guid_weight_tuples)
+    if size is None or size > len(items):
+        # Sampling without replacement cannot return more than it gets
+        size = len(items)
+    if size <= 0:
+        return []
+
+    weight_list = [weight for (_guid, weight) in items]
+    zero_weighted = [i for i, weight in enumerate(weight_list) if weight == 0]
+    weighted_size = min(size, len(items) - len(zero_weighted))
+
+    order = []
+    if weighted_size > 0:
+        # Normalize the weights so that they're probabilities
+        total_weight = sum(weight_list)
+        probabilities = [w * 1.0 / total_weight for w in weight_list]
+
+        # Sample indices rather than GUIDs so that the input tuples are
+        # returned as they are, even when a GUID occurs more than once.
+        order.extend(
+            np.random.choice(
+                len(items), size=weighted_size, replace=False, p=probabilities
+            )
+        )
+
+    if weighted_size < size:
+        # numpy refuses to draw more samples than there are non-zero
+        # probabilities, so zero weighted tuples are shuffled separately.
+        filler = np.random.permutation(zero_weighted)
+        order.extend(filler[: size - weighted_size])
+
+    return [items[int(i)] for i in order]
--- a/taar/recommenders/recommendation_manager.py
+++ b/taar/recommenders/recommendation_manager.py
@ -3,42 +3,22 @@
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.

 from taar.recommenders.ensemble_recommender import EnsembleRecommender
+from taar.recommenders.randomizer import in_experiment, reorder_guids
 from srgutil.interfaces import IMozLogging

 from taar.context import default_context

 from .lazys3 import LazyJSONLoader
-import random

 from .s3config import TAAR_WHITELIST_BUCKET
 from .s3config import TAAR_WHITELIST_KEY
-
-import hashlib
+from .s3config import TAAR_EXPERIMENT_PROB

 # We need to build a default logger for the schema validation as there
 # is no class to bind to yet.
 ctx = default_context()


-def hasher(client_id):
-    return hashlib.new("sha256", client_id.encode("utf8")).hexdigest()
-
-
-TEST_CLIENT_IDS = [
-    hasher("00000000-0000-0000-0000-000000000000"),
-    hasher("11111111-1111-1111-1111-111111111111"),
-    hasher("22222222-2222-2222-2222-222222222222"),
-    hasher("33333333-3333-3333-3333-333333333333"),
-]
-
-EMPTY_TEST_CLIENT_IDS = [
-    hasher("00000000-aaaa-0000-0000-000000000000"),
-    hasher("11111111-aaaa-1111-1111-111111111111"),
-    hasher("22222222-aaaa-2222-2222-222222222222"),
-    hasher("33333333-aaaa-3333-3333-333333333333"),
-]
-
-
 class RecommenderFactory:
    """
    A RecommenderFactory provides support to create recommenders.
@ -82,6 +62,10 @@ class RecommendationManager:
            self._ctx, TAAR_WHITELIST_BUCKET, TAAR_WHITELIST_KEY
        )

+        self._experiment_prob = ctx.get(
+            "TAAR_EXPERIMENT_PROB", TAAR_EXPERIMENT_PROB
+        )
+
    def recommend(self, client_id, limit, extra_data={}):
        """Return recommendations for the given client.

@ -93,24 +77,29 @@ class RecommendationManager:
        :param extra_data: a dictionary with extra client data.
        """

-        if client_id in TEST_CLIENT_IDS:
-            data = self._whitelist_data.get()[0]
-            random.shuffle(data)
-            samples = data[:limit]
-            self.logger.info("Test ID detected [{}]".format(client_id))
-            return [(s, 1.1) for s in samples]
-
-        if client_id in EMPTY_TEST_CLIENT_IDS:
-            self.logger.info("Empty Test ID detected [{}]".format(client_id))
-            return []
+        results = None

        client_info = self.profile_fetcher.get(client_id)
        if client_info is None:
            self.logger.info(
                "Defaulting to empty results.  No client info fetched from storage backend."
            )
-            return []
+            results = []

-        results = self._ensemble_recommender.recommend(client_info, limit, extra_data)
+        if in_experiment(client_id, self._experiment_prob):
+            if results is None:
+                # Fetch back all possible whitelisted addons for this
+                # client
+                extra_data["guid_randomization"] = True
+                whitelist = self._whitelist_data.get()[0]
+                results = self._ensemble_recommender.recommend(
+                    client_info, len(whitelist), extra_data
+                )
+            results = reorder_guids(results, limit)
+        else:
+            if results is None:
+                results = self._ensemble_recommender.recommend(
+                    client_info, limit, extra_data
+                )

        return results
--- a/taar/recommenders/s3config.py
+++ b/taar/recommenders/s3config.py
@ -28,3 +28,5 @@ TAAR_SIMILARITY_DONOR_KEY = config(
 TAAR_SIMILARITY_LRCURVES_KEY = config(
    "TAAR_SIMILARITY_LRCURVES_KEY", default="test_similarity_lrcurves_key"
 )
+
+# Probability that a client is placed in the randomized experiment;
+# the default of 0.0 is meant to switch the feature off.
+# NOTE(review): no cast is given here - if config() hands back values
+# from the environment as strings, consumers have to convert this to
+# a float before doing arithmetic with it.  Confirm.
+TAAR_EXPERIMENT_PROB = config("TAAR_EXPERIMENT_PROB", default=0.0)
--- a/tests/test_hybrid_recommender.py
+++ b/tests/test_hybrid_recommender.py
@ -104,13 +104,24 @@ def test_hybrid_recommendations(test_ctx):
        # of recommendations
        assert len(guid_list) == LIMIT

+
+@mock_s3
+def test_stable_hybrid_results(test_ctx):
+    # verify that the recommendations mix the curated and
+    # ensemble results
+    ctx = install_mock_curated_data(test_ctx)
+    ctx = install_ensemble_fixtures(ctx)
+
+    r = HybridRecommender(ctx)
    # Test that the results are actually mixed
    guid_list = r.recommend({"client_id": "000000"}, limit=4)

-    # A mixed list will have two recommendations with weight > 1.0
-    # (ensemble) and 2 with exactly weight 1.0 from the curated list
+    assert len(guid_list) == 4

-    assert guid_list[0][1] > 1.0
-    assert guid_list[1][1] > 1.0
-    assert guid_list[2][1] == 1.0
-    assert guid_list[3][1] == 1.0
+    # A mixed list will have two recommendations with weight = 1.0
+    # (curated) and 2 with exactly weight < 1.0 from the ensemble list
+
+    assert guid_list[0][1] == 1.0
+    assert guid_list[1][1] == 1.0
+    assert guid_list[2][1] < 1.0
+    assert guid_list[3][1] < 1.0
--- a/tests/test_randomizer.py
+++ b/tests/test_randomizer.py
@ -0,0 +1,61 @@
+"""
+Test that we can reorder (GUID, weight) tuples using random
+selection weighted by probability.
+"""
+
+from taar.recommenders.randomizer import reorder_guids
+from taar.recommenders.randomizer import in_experiment
+
+import numpy as np
+from collections import Counter
+
+
+def most_frequent(List):
+    """Return the element that occurs most often in `List`."""
+    # most_common(1) yields a single (element, count) pair
+    top_entry = Counter(List).most_common(1)[0]
+    return top_entry[0]
+
+
+def test_reorder_guids():
+    """
+    Reorder four weighted GUIDs 100 times with a fixed seed and check
+    that, position by position, the most common outcome is the GUIDs
+    in descending order of weight.
+    """
+    # The weights are deliberately far apart so that the weighted
+    # random selection is stable 'enough' over a sufficiently large
+    # sample, and the seed is pinned so that every test run sees the
+    # same sequence of random numbers.
+    np.random.seed(seed=42)
+
+    weighted_guids = [
+        ("guid1", 0.01),
+        ("guid2", 0.09),
+        ("guid3", 0.30),
+        ("guid4", 0.60),
+    ]
+
+    # 100 independent reorderings to get the average ordering
+    runs = [reorder_guids(weighted_guids) for _ in range(100)]
+
+    # row[position] is a (guid, weight) tuple; keep the GUID of the
+    # tuple seen most often at each position
+    typical_order = [
+        most_frequent([row[position] for row in runs])[0]
+        for position in range(4)
+    ]
+    assert typical_order == ["guid4", "guid3", "guid2", "guid1"]
+
+
+def test_experimental_branch_guid():
+    """
+    Test the experimental cutoff selection code.
+
+    The evaluation should be stable for a given probability and
+    client_id: 100 repeated calls must all agree.
+    """
+    for bucket in range(10, 100, 10):
+        # hex() output without its "0x" prefix serves as the client id
+        client_id = hex(bucket)[2:]
+        cutoff = (bucket + 9.0) / 100
+
+        # A cutoff above the bucket always selects the client ...
+        selected = sum(in_experiment(client_id, cutoff) for _ in range(100))
+        assert selected == 100
+
+        # ... and a cutoff below the bucket never does
+        selected = sum(
+            in_experiment(client_id, cutoff - 0.1) for _ in range(100)
+        )
+        assert selected == 0
--- a/tests/test_recommendation_manager.py
+++ b/tests/test_recommendation_manager.py
@ -6,8 +6,6 @@ import boto3
 import json
 from moto import mock_s3
 from taar.recommenders import RecommendationManager
-from taar.recommenders.recommendation_manager import TEST_CLIENT_IDS
-from taar.recommenders.recommendation_manager import EMPTY_TEST_CLIENT_IDS
 from taar.recommenders.base_recommender import AbstractRecommender

 from taar.recommenders.ensemble_recommender import (
@ -19,6 +17,9 @@ from taar.recommenders.ensemble_recommender import (
 from .mocks import MockRecommenderFactory
 from .test_hybrid_recommender import install_mock_curated_data

+import operator
+from functools import reduce
+

 class StubRecommender(AbstractRecommender):
    """ A shared, stub recommender that can be used for testing.
@ -35,23 +36,32 @@ class StubRecommender(AbstractRecommender):
        return self._recommendations


-def install_mocks(ctx):
+def install_mocks(ctx, mock_fetcher=None):
    ctx = ctx.child()

-    class MockProfileFetcher:
+    class DefaultMockProfileFetcher:
        def get(self, client_id):
            return {"client_id": client_id}

-    ctx["profile_fetcher"] = MockProfileFetcher()
+    if mock_fetcher is None:
+        mock_fetcher = DefaultMockProfileFetcher()
+
+    ctx["profile_fetcher"] = mock_fetcher
    ctx["recommender_factory"] = MockRecommenderFactory()

    DATA = {
-        "ensemble_weights": {"collaborative": 1000, "similarity": 100, "locale": 10}
+        "ensemble_weights": {
+            "collaborative": 1000,
+            "similarity": 100,
+            "locale": 10,
+        }
    }

    conn = boto3.resource("s3", region_name="us-west-2")
    conn.create_bucket(Bucket=TAAR_ENSEMBLE_BUCKET)
-    conn.Object(TAAR_ENSEMBLE_BUCKET, TAAR_ENSEMBLE_KEY).put(Body=json.dumps(DATA))
+    conn.Object(TAAR_ENSEMBLE_BUCKET, TAAR_ENSEMBLE_KEY).put(
+        Body=json.dumps(DATA)
+    )

    return ctx

@ -63,6 +73,7 @@ def test_none_profile_returns_empty_list(test_ctx):
    class MockProfileFetcher:
        def get(self, client_id):
            return None
+
    ctx["profile_fetcher"] = MockProfileFetcher()

    rec_manager = RecommendationManager(ctx)
@ -87,9 +98,7 @@ def test_simple_recommendation(test_ctx):
    ]

    manager = RecommendationManager(ctx.child())
-    recommendation_list = manager.recommend(
-        "some_ignored_id", 10
-    )
+    recommendation_list = manager.recommend("some_ignored_id", 10)

    assert isinstance(recommendation_list, list)
    assert recommendation_list == EXPECTED_RESULTS
@ -101,21 +110,56 @@ def test_fixed_client_id_valid(test_ctx):
    ctx = install_mock_curated_data(ctx)

    manager = RecommendationManager(ctx.child())
-    recommendation_list = manager.recommend(
-        TEST_CLIENT_IDS[0], 10
-    )
+    recommendation_list = manager.recommend('111111', 10)

    assert len(recommendation_list) == 10


@mock_s3
 def test_fixed_client_id_empty_list(test_ctx):
+    class NoClientFetcher:
+        def get(self, client_id):
+            return None
+
+    ctx = install_mocks(test_ctx, mock_fetcher=NoClientFetcher())
+
+    ctx = install_mock_curated_data(ctx)
+
+    manager = RecommendationManager(ctx.child())
+    recommendation_list = manager.recommend("not_a_real_client_id", 10)
+
+    assert len(recommendation_list) == 0
+
+
+@mock_s3
+def test_experimental_randomization(test_ctx):
    ctx = install_mocks(test_ctx)
    ctx = install_mock_curated_data(ctx)

    manager = RecommendationManager(ctx.child())
-    recommendation_list = manager.recommend(
-        EMPTY_TEST_CLIENT_IDS[0], 10
-    )
+    raw_list = manager.recommend('111111', 10)

-    assert len(recommendation_list) == 0
+    # Clobber the experiment probability to be 100% to force a
+    # reordering.
+    ctx["TAAR_EXPERIMENT_PROB"] = 1.0
+
+    manager = RecommendationManager(ctx.child())
+    rand_list = manager.recommend('111111', 10)
+
+    """
+    The two lists should be :
+
+    * different (guid, weight) lists (possibly just order)
+    * same length
+    """
+    assert (
+        reduce(
+            operator.and_,
+            [
+                (t1[0] == t2[0] and t1[1] == t2[1])
+                for t1, t2 in zip(rand_list, raw_list)
+            ],
+        )
+        is False
+    )
+    assert len(rand_list) == len(raw_list)