# Mirror of https://github.com/mozilla/taar.git
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
import json

import six
import pytest
import numpy as np
import scipy.stats

from taar.cache import JSONCache, Clock

from taar.recommenders.similarity_recommender import \
    CATEGORICAL_FEATURES, CONTINUOUS_FEATURES, DONOR_LIST_KEY, LR_CURVES_SIMILARITY_TO_PROBABILITY, \
    SimilarityRecommender

from .similarity_data import CONTINUOUS_FEATURE_FIXTURE_DATA
from .similarity_data import CATEGORICAL_FEATURE_FIXTURE_DATA

def generate_fake_lr_curves(num_elements, ceiling=10.0):
    """Build a synthetic likelihood-ratio (LR) curve for tests.

    Returns a list of ``(lr_index, (numerator_density, denominator_density))``
    tuples with ``num_elements`` points evenly spaced over ``[0, ceiling]``.
    """
    lr_index = list(np.linspace(0, ceiling, num_elements))

    # Numerator curve: normal distribution with mean 0.5 and std dev 0.5.
    numerator = [scipy.stats.norm.pdf(float(idx), 0.5, 0.5) for idx in lr_index]

    # Denominator curve: normal distribution with mean 5.0 and std dev 1.5.
    # It is right-shifted and wider than the numerator curve, so the two
    # overlap only slightly; sweeping the index from roughly 0.5 up to 5.0
    # therefore moves the ratio from large values down to small ones.
    denominator = [scipy.stats.norm.pdf(float(idx), 5.0, 1.5) for idx in lr_index]

    return list(zip(lr_index, zip(numerator, denominator)))


def generate_a_fake_taar_client():
    """Return a fresh, fully-populated fake TAAR client profile."""
    profile = dict(
        client_id="test-client-001",
        activeAddons=[],
        geo_city="brasilia-br",
        subsession_length=4911,
        locale="br-PT",
        os="mac",
        bookmark_count=7,
        tab_open_count=4,
        total_uri=222,
        unique_tlds=21,
    )
    return profile


class MockNoDataUtils:
    """Stub S3 accessor that simulates completely missing source data."""

    def get_s3_json_content(self, *args, **kwargs):
        # Behave as if the requested S3 object does not exist.
        return None


class MockCategoricalData:
    """Stub S3 accessor serving categorical donor data and fake LR curves."""

    # The json round-trip produces an independent deep copy and normalizes
    # tuples to lists, matching what a real S3 JSON fetch would return.
    cat_data = json.loads(json.dumps(CATEGORICAL_FEATURE_FIXTURE_DATA))
    lrs_data = json.loads(json.dumps(generate_fake_lr_curves(1000)))

    def get_s3_json_content(self, bucket, key):
        # Dispatch on the S3 key; unknown keys yield None.
        payloads = {
            DONOR_LIST_KEY: self.cat_data,
            LR_CURVES_SIMILARITY_TO_PROBABILITY: self.lrs_data,
        }
        return payloads.get(key)


class MockContinuousData:
    """Stub S3 accessor serving continuous donor data and fake LR curves."""

    # The json round-trip produces an independent deep copy and normalizes
    # tuples to lists, matching what a real S3 JSON fetch would return.
    cts_data = json.loads(json.dumps(CONTINUOUS_FEATURE_FIXTURE_DATA))
    lrs_data = json.loads(json.dumps(generate_fake_lr_curves(1000)))

    def get_s3_json_content(self, bucket, key):
        # Dispatch on the S3 key; unknown keys yield None.
        payloads = {
            DONOR_LIST_KEY: self.cts_data,
            LR_CURVES_SIMILARITY_TO_PROBABILITY: self.lrs_data,
        }
        return payloads.get(key)


@pytest.fixture
def cat_test_ctx(test_ctx):
    """Child context wired with categorical donor fixture data."""
    test_ctx['utils'] = MockCategoricalData()
    test_ctx['clock'] = Clock()
    # The cache pulls 'utils' and 'clock' out of the same context.
    test_ctx['cache'] = JSONCache(test_ctx)
    return test_ctx.child()


@pytest.fixture
def cts_test_ctx(test_ctx):
    """Child context wired with continuous donor fixture data."""
    test_ctx['utils'] = MockContinuousData()
    test_ctx['clock'] = Clock()
    # The cache pulls 'utils' and 'clock' out of the same context.
    test_ctx['cache'] = JSONCache(test_ctx)
    return test_ctx.child()


def test_soft_fail(test_ctx):
    """The recommender must decline gracefully when S3 data is missing."""
    test_ctx['utils'] = MockNoDataUtils()
    test_ctx['clock'] = Clock()
    test_ctx['cache'] = JSONCache(test_ctx)
    recommender = SimilarityRecommender(test_ctx)

    # With no source files available, no client is recommendable.
    assert not recommender.can_recommend({})


def test_can_recommend(cts_test_ctx):
    """can_recommend requires every feature field to be present and non-null."""
    recommender = SimilarityRecommender(cts_test_ctx)

    # An empty profile carries no usable client info.
    assert not recommender.can_recommend({})

    # A fully-populated profile is recommendable.
    assert recommender.can_recommend(generate_a_fake_taar_client())

    # Every required feature must be both present and non-null.
    for field in CATEGORICAL_FEATURES + CONTINUOUS_FEATURES:
        partial_profile = generate_a_fake_taar_client()

        # First null out the required field...
        partial_profile[field] = None
        assert not recommender.can_recommend(partial_profile)

        # ...then remove it from the profile entirely.
        del partial_profile[field]
        assert not recommender.can_recommend(partial_profile)


def test_recommendations(cts_test_ctx):
    """A single recommendation should come from the most similar donor."""
    ctx = cts_test_ctx
    r = SimilarityRecommender(ctx)

    # TODO: clobber the SimilarityRecommender::lr_curves

    recommendation_list = r.recommend(generate_a_fake_taar_client(), 1)

    assert isinstance(recommendation_list, list)
    assert len(recommendation_list) == 1

    recommendation, weight = recommendation_list[0]

    # Make sure that the reported addons are the expected ones from the
    # most similar donor.
    assert recommendation == "{test-guid-1}"
    # Use isinstance rather than an exact `type(...) ==` comparison; the
    # intent is "the weight is a numpy double", not "exactly this class".
    assert isinstance(weight, np.float64)


def test_recommender_str(cts_test_ctx):
    """str() of the recommender must be its class name."""
    recommender = SimilarityRecommender(cts_test_ctx)
    assert str(recommender) == "SimilarityRecommender"


def test_get_lr(cts_test_ctx):
    """LR lookups must cover extreme scores and yield realistic ordering."""
    recommender = SimilarityRecommender(cts_test_ctx)

    # Both ends of the curve's domain must still produce a value.
    for score in (0.0001, 10.0):
        assert recommender.get_lr(score) is not None

    # A near-zero distance should map to a larger LR than a large one.
    assert recommender.get_lr(0.001) > recommender.get_lr(5.0)


def test_compute_clients_dist(cts_test_ctx):
    """Distinct clients should be at distinct distances from a fixed donor."""
    recommender = SimilarityRecommender(cts_test_ctx)

    # Three clients that become progressively more similar to donor #2.
    base_profile = {
        "activeAddons": [],
        "os": "windows",
        "subsession_length": 1,
        "bookmark_count": 1,
        "tab_open_count": 1,
        "total_uri": 1,
        "unique_tlds": 1,
    }
    test_clients = [
        dict(
            base_profile,
            client_id="test-client-002",
            geo_city="sfo-us",
            locale="en-US",
        ),
        dict(
            base_profile,
            client_id="test-client-003",
            geo_city="brasilia-br",
            locale="br-PT",
            bookmark_count=10,
        ),
        dict(
            base_profile,
            client_id="test-client-004",
            geo_city="brasilia-br",
            locale="br-PT",
            bookmark_count=10,
            subsession_length=100,
            tab_open_count=10,
            total_uri=100,
            unique_tlds=10,
        ),
    ]

    donor_distances = []
    for client in test_clients:
        distances = recommender.compute_clients_dist(client)
        # One distance entry per donor in the fixture data.
        assert len(distances) == len(CONTINUOUS_FEATURE_FIXTURE_DATA)
        donor_distances.append(distances[2][0])

    # More similar clients must not be further from the reference donor.
    assert donor_distances[0] >= donor_distances[1] >= donor_distances[2]


def test_distance_functions(cts_test_ctx):
    """Exercise the similarity functions with degraded client profiles."""
    recommender = SimilarityRecommender(cts_test_ctx)

    # A normal fake client should produce at least one recommendation.
    client = generate_a_fake_taar_client()
    assert len(recommender.recommend(client, 10)) > 0

    # Degrade the continuous features so the client is a poor match overall.
    client.update({'total_uri': 10, 'bookmark_count': 2, 'subsession_length': 10})

    # Make every categorical feature non-matching with any donor.
    client.update({k: 'zero' for k in client.keys() if k in CATEGORICAL_FEATURES})
    assert len(recommender.recommend(client, 10)) == 0

    # Zero out every continuous feature as well.
    client.update({k: 0 for k in client.keys() if k in CONTINUOUS_FEATURES})
    assert len(recommender.recommend(client, 10)) == 0

    # Non-matching categorical values of a different flavour.
    client.update({k: 'one billion' for k in client.keys() if k in CATEGORICAL_FEATURES})
    assert len(recommender.recommend(client, 10)) == 0

    # Push every continuous feature to an astronomically high value.
    client.update({k: 1e60 for k in client.keys() if k in CONTINUOUS_FEATURES})
    assert len(recommender.recommend(client, 10)) == 0

    # A zero categorical similarity (j_c) must annihilate the product with
    # a sane continuous similarity (j_d); a small non-zero j_c must not.
    j_c = 0.0
    j_d = 0.42
    assert abs(j_c * j_d) == 0.0
    assert abs((j_c + 0.01) * j_d) != 0.0


def test_weights_continuous(cts_test_ctx):
    """Recommendations are (guid, weight) tuples sorted by weight."""
    ctx = cts_test_ctx
    r = SimilarityRecommender(ctx)

    # In the ensemble method recommendations should be a sorted list of
    # tuples containing [(guid, weight), (guid, weight)... (guid, weight)].
    recommendation_list = r.recommend(generate_a_fake_taar_client(), 2)
    # NOTE: removed a leftover debug dump that wrote the recommendation
    # list to a hard-coded /tmp path on every run — non-portable and a
    # needless filesystem side effect in a unit test.

    # Make sure the structure of the recommendations is correct and
    # that we recommended the right addons.
    assert len(recommendation_list) == 2
    for recommendation, weight in recommendation_list:
        assert isinstance(recommendation, six.string_types)
        assert isinstance(weight, float)

    # Test that sorting is appropriate.
    rec0_weight = recommendation_list[0][1]
    rec1_weight = recommendation_list[1][1]

    # Duplicate presence of test-guid-1 should mean rec0_weight is double
    # rec1_weight, and both should be greater than 1.0
    assert rec0_weight > rec1_weight > 1.0


def test_weights_categorical(cat_test_ctx, cts_test_ctx):
    """
    Expect recommendations drawn from the first two donor entries in the
    categorical sample data, i.e. from:
    ["{test-guid-1}", "{test-guid-2}", "{test-guid-3}", "{test-guid-4}"],
    ["{test-guid-9}", "{test-guid-10}", "{test-guid-11}", "{test-guid-12}"]
    """
    # Layer the continuous context over the categorical one so the
    # recommender sees both data sets.
    wrapped = cts_test_ctx.wrap(cat_test_ctx)
    recommender = SimilarityRecommender(wrapped)

    # In the ensemble method recommendations should be a sorted list of
    # tuples containing [(guid, weight), (guid, weight)... (guid, weight)].
    recommendation_list = recommender.recommend(generate_a_fake_taar_client(), 2)

    # Verify the structure of the recommendations and that the right
    # addons were recommended.
    assert len(recommendation_list) == 2
    for recommendation, weight in recommendation_list:
        assert isinstance(recommendation, six.string_types)
        assert isinstance(weight, float)

    # Weights must come back sorted descending and strictly positive.
    rec0_weight = recommendation_list[0][1]
    rec1_weight = recommendation_list[1][1]
    assert rec0_weight > rec1_weight > 0