493 строки
16 KiB
Python
493 строки
16 KiB
Python
import functools
|
|
import pytest
|
|
from collections import namedtuple
|
|
from pyspark.sql.types import (
|
|
StructField,
|
|
ArrayType,
|
|
BooleanType,
|
|
StringType,
|
|
LongType,
|
|
MapType,
|
|
StructType,
|
|
DoubleType,
|
|
)
|
|
from mozetl.search.aggregates import (
|
|
search_aggregates,
|
|
search_clients_daily,
|
|
explode_search_counts,
|
|
add_derived_columns,
|
|
MAX_CLIENT_SEARCH_COUNT,
|
|
)
|
|
|
|
|
|
# Some boilerplate to help define example dataframes for testing
|
|
|
|
# Record type describing one column of a test dataframe: the column name,
# its default value, its type, and whether the column is nullable.
dataframe_field = namedtuple(
    "dataframe_field", "name default_value type nullable"
)
|
|
|
|
|
|
def to_field(field_tuple):
    """Build a dataframe_field from a plain tuple.

    The tuple must hold the four values in field order:
    (name, default_value, type, nullable).
    """
    return dataframe_field._make(field_tuple)
|
|
|
|
|
|
def get_dataframe_factory_config(fields):
    """Parse dataframe_fields into a schema and a set of default values.

    Args:
        fields: iterable of dataframe_field records. It is consumed in a
            single pass, so one-shot iterators (for example a bare
            ``map(to_field, ...)``) are handled correctly.

    Returns:
        A ``(schema, default_sample)`` tuple: ``schema`` is a StructType with
        one StructField per field, in input order, and ``default_sample``
        maps each field name to its default value.
    """
    struct_fields = []
    default_sample = {}
    # Build both results in one loop: iterating `fields` twice would leave
    # the defaults empty whenever `fields` is an iterator rather than a list.
    for field in fields:
        struct_fields.append(StructField(field.name, field.type, field.nullable))
        default_sample[field.name] = field.default_value

    return StructType(struct_fields), default_sample
|
|
|
|
|
|
@pytest.fixture
def define_dataframe_factory(dataframe_factory):
    """Provide a function that turns field configs into a dataframe factory.

    The returned function takes dataframe_fields and gives back
    ``dataframe_factory.create_dataframe`` with the matching ``schema`` and
    the per-field defaults (as ``base``) already bound.
    """

    def make_factory(fields):
        """Create a dataframe_factory from a set of field configs"""
        schema, defaults = get_dataframe_factory_config(fields)
        create = dataframe_factory.create_dataframe
        return functools.partial(create, schema=schema, base=defaults)

    return make_factory
|
|
|
|
|
|
# Boilerplate for generating example main_summary tables
|
|
def generate_search_count(engine="google", source="urlbar", count=4):
    """Return one search_counts entry as a dict.

    With no arguments the entry is 4 searches on "google" from "urlbar".
    """
    return dict(engine=engine, source=source, count=count)
|
|
|
|
|
|
# Spark type of the active_addons column: an array of addon structs.
# Only addon_id is declared non-nullable.
addons_type = ArrayType(
    StructType(
        [
            StructField(field_name, field_type(), is_nullable)
            for field_name, field_type, is_nullable in (
                ("addon_id", StringType, False),
                ("blocklisted", BooleanType, True),
                ("name", StringType, True),
                ("user_disabled", BooleanType, True),
                ("app_disabled", BooleanType, True),
                ("version", StringType, True),
                ("scope", LongType, True),
                ("type", StringType, True),
                ("foreign_install", BooleanType, True),
                ("has_binary_components", BooleanType, True),
                ("install_day", LongType, True),
                ("update_day", LongType, True),
                ("signed_state", LongType, True),
                ("is_system", BooleanType, True),
                ("is_web_extension", BooleanType, True),
                ("multiprocess_compatible", BooleanType, True),
            )
        ]
    )
)
|
|
|
|
|
|
def generate_addon(addon_id, name, version):
    """Return an addon record holding just its id, name and version."""
    return dict(addon_id=addon_id, name=name, version=version)
|
|
|
|
|
|
# Default active_addons value: an arbitrary addon plus the follow-on search
# telemetry addon.
active_addons = [
    generate_addon(*addon)
    for addon in (
        ("random@mozilla.com", "random", "0.1"),
        ("followonsearch@mozilla.com", "Follow-on Search Telemetry", "0.9.5"),
    )
]
|
|
|
|
|
|
# Spark type of the search_counts column: an array of
# (engine, source, count) structs whose members are all non-nullable.
search_type = ArrayType(
    StructType(
        [
            StructField(field_name, field_type(), False)
            for field_name, field_type in (
                ("engine", StringType),
                ("source", StringType),
                ("count", LongType),
            )
        ]
    )
)
|
|
|
|
|
|
# Column specs for the example main_summary table. Each entry is a
# (name, default_value, type, nullable) tuple, i.e. the positional layout
# of dataframe_field. The default values are handed to the dataframe
# factory as its base row.
main_summary_schema = [
    ("client_id", "a", StringType(), False),
    ("sample_id", "42", StringType(), False),
    ("submission_date", "20170101", StringType(), False),
    ("os", "windows", StringType(), True),
    ("os_version", "10.0", StringType(), True),
    ("channel", "release", StringType(), True),
    ("country", "DE", StringType(), True),
    ("locale", "de", StringType(), True),
    ("user_pref_browser_search_region", "DE", StringType(), True),
    ("search_cohort", None, StringType(), True),
    ("app_version", "54.0.1", StringType(), True),
    ("distribution_id", None, StringType(), True),
    ("subsession_counter", 1, LongType(), True),
    # By default a single search entry: 4 google searches from the urlbar
    ("search_counts", [generate_search_count()], search_type, True),
    # The two ad-related scalar maps default to null
    (
        "scalar_parent_browser_search_ad_clicks",
        None,
        MapType(StringType(), LongType()),
        True,
    ),
    (
        "scalar_parent_browser_search_with_ads",
        None,
        MapType(StringType(), LongType()),
        True,
    ),
    ("active_addons", active_addons, addons_type, True),
    # 30 minutes in active_ticks (30min * 60sec/min / 5sec/tick)
    ("active_ticks", 360, LongType(), True),
    ("scalar_parent_browser_engagement_tab_open_event_count", 5, LongType(), True),
    ("scalar_parent_browser_engagement_max_concurrent_tab_count", 10, LongType(), True),
    ("subsession_start_date", "2017-01-01 10:00", StringType(), False),
    # One hour per ping
    ("subsession_length", 60 * 60, LongType(), False),
    # Roughly 2016-01-01 (presumably counted in days since the Unix epoch)
    ("profile_creation_date", 16801, LongType(), False),
    ("default_search_engine", "google", StringType(), False),
    (
        "default_search_engine_data_load_path",
        "jar:[app]/omni.ja!browser/google.xml",
        StringType(),
        False,
    ),
    (
        "default_search_engine_data_submission_url",
        "https://www.google.com/search?q=&ie=utf-8&oe=utf-8&client=firefox-b",
        StringType(),
        False,
    ),
]
|
|
|
|
# Column specs for the exploded table: every main_summary column except the
# raw search columns, followed by one engine/source/count triple.
exploded_schema = [
    column
    for column in main_summary_schema
    if column[0]
    not in {
        "search_counts",
        "scalar_parent_browser_search_with_ads",
        "scalar_parent_browser_search_ad_clicks",
    }
]
exploded_schema.extend(
    [
        ("engine", "google", StringType(), False),
        ("source", "urlbar", StringType(), False),
        ("count", 4, LongType(), False),
    ]
)
|
|
|
|
# Column specs used for the expected output of add_derived_columns: the
# exploded columns followed by type and addon_version.
derived_schema = list(exploded_schema)
derived_schema += [
    ("type", "chrome-sap", StringType(), False),
    ("addon_version", "0.9.5", StringType(), False),
]
|
|
|
|
|
|
@pytest.fixture()
def generate_main_summary_data(define_dataframe_factory):
    """Factory fixture for dataframes built from main_summary_schema."""
    fields = [to_field(column) for column in main_summary_schema]
    return define_dataframe_factory(fields)
|
|
|
|
|
|
@pytest.fixture
def main_summary(generate_main_summary_data):
    """Example main_summary dataframe used by the aggregation tests.

    It holds a few rows that each override some columns, five identical
    default rows, and one client without any search counts.
    """
    overridden_rows = [
        {"client_id": "b", "country": "US"},
        {"app_version": "52.0.3"},
        {"distribution_id": "totally not null"},
        {
            "search_counts": [
                generate_search_count(engine=engine) for engine in ("bing", "yahoo")
            ]
        },
    ]
    # Some duplicate default rows to test aggregation
    duplicate_rows = [{}] * 5
    # Client with no searches
    no_search_rows = [
        {
            "client_id": "c",
            "search_counts": None,
            "user_pref_browser_search_region": None,
        }
    ]

    all_rows = overridden_rows + duplicate_rows + no_search_rows
    return generate_main_summary_data(all_rows)
|
|
|
|
|
|
@pytest.fixture
def simple_main_summary(generate_main_summary_data):
    """Single-row main_summary with one bing and one yahoo search entry."""
    searches = [
        generate_search_count(engine=engine) for engine in ("bing", "yahoo")
    ]
    return generate_main_summary_data([{"search_counts": searches}])
|
|
|
|
|
|
@pytest.fixture()
def generate_exploded_data(define_dataframe_factory):
    """Factory fixture for dataframes built from exploded_schema."""
    fields = [to_field(column) for column in exploded_schema]
    return define_dataframe_factory(fields)
|
|
|
|
|
|
@pytest.fixture()
def exploded_simple_main_summary(generate_exploded_data):
    """Expected explosion of simple_main_summary: one row per engine."""
    rows = [{"engine": engine} for engine in ("yahoo", "bing")]
    return generate_exploded_data(rows)
|
|
|
|
|
|
@pytest.fixture()
def exploded_data_for_derived_cols(generate_exploded_data):
    """Exploded rows, one per source string, used as add_derived_columns input."""
    sources = (
        "sap:urlbar:SomeCodeHere",
        "follow-on:urlbar:SomeCodeHere",
        "in-content:sap:SomeCodeHere",
        "in-content:sap-follow-on:SomeCodeHere",
        "in-content:organic:something",
        "unknowngarbagestring",
        "urlbar",
    )
    return generate_exploded_data([{"source": source} for source in sources])
|
|
|
|
|
|
@pytest.fixture()
def derived_columns(define_dataframe_factory):
    """Expected add_derived_columns output: each source with its type."""
    # template for the expected results
    fields = [to_field(column) for column in derived_schema]
    factory = define_dataframe_factory(fields)

    # (source, expected type) pairs, in the same order as the input fixture
    expected_types = (
        ("sap:urlbar:SomeCodeHere", "tagged-sap"),
        ("follow-on:urlbar:SomeCodeHere", "tagged-follow-on"),
        ("in-content:sap:SomeCodeHere", "tagged-sap"),
        ("in-content:sap-follow-on:SomeCodeHere", "tagged-follow-on"),
        ("in-content:organic:something", "organic"),
        ("unknowngarbagestring", "unknown"),
        ("urlbar", "sap"),
    )
    rows = [{"source": source, "type": kind} for source, kind in expected_types]
    return factory(rows)
|
|
|
|
|
|
@pytest.fixture()
def expected_search_dashboard_data(define_dataframe_factory):
    """Dataframe that test_basic_aggregation expects from search_aggregates."""
    # template for the expected results
    columns = [
        ("submission_date", "20170101", StringType(), False),
        ("country", "DE", StringType(), True),
        ("locale", "de", StringType(), True),
        ("search_cohort", None, StringType(), True),
        ("app_version", "54.0.1", StringType(), True),
        ("distribution_id", None, StringType(), True),
        ("os", "windows", StringType(), True),
        ("os_version", "10.0", StringType(), True),
        ("addon_version", "0.9.5", StringType(), False),
        ("default_search_engine", "google", StringType(), False),
        ("engine", "google", StringType(), False),
        ("source", "urlbar", StringType(), False),
        ("tagged-sap", None, LongType(), True),
        ("tagged-follow-on", None, LongType(), True),
        ("tagged_sap", None, LongType(), True),
        ("tagged_follow_on", None, LongType(), True),
        ("sap", 4, LongType(), True),
        ("organic", None, LongType(), True),
        ("ad-click", None, LongType(), True),
        ("search-with-ads", None, LongType(), True),
        ("ad_click", None, LongType(), True),
        ("search_with_ads", None, LongType(), True),
        ("unknown", None, LongType(), True),
    ]
    factory = define_dataframe_factory([to_field(column) for column in columns])

    # One expected row per entry; each overrides the template defaults above
    overrides = [
        {"country": "US"},
        {"app_version": "52.0.3"},
        {"distribution_id": "totally not null"},
        {"engine": "yahoo"},
        {"engine": "bing"},
        {"sap": 20},
    ]
    return factory(overrides)
|
|
|
|
|
|
@pytest.fixture()
def expected_search_clients_daily_data(define_dataframe_factory):
    """Dataframe that test_search_clients_daily expects from search_clients_daily."""
    # template for the expected results
    columns = [
        ("client_id", "a", StringType(), False),
        ("sample_id", "42", StringType(), False),
        ("submission_date", "20170101", StringType(), False),
        ("os", "windows", StringType(), True),
        ("os_version", "10.0", StringType(), True),
        ("channel", "release", StringType(), True),
        ("country", "DE", StringType(), True),
        ("locale", "de", StringType(), True),
        ("user_pref_browser_search_region", "DE", StringType(), True),
        ("search_cohort", None, StringType(), True),
        ("app_version", "54.0.1", StringType(), True),
        ("distribution_id", None, StringType(), True),
        ("addon_version", "0.9.5", StringType(), False),
        ("engine", "google", StringType(), True),
        ("source", "urlbar", StringType(), True),
        ("tagged-sap", None, LongType(), True),
        ("tagged-follow-on", None, LongType(), True),
        ("tagged_sap", None, LongType(), True),
        ("tagged_follow_on", None, LongType(), True),
        ("sap", 4, LongType(), True),
        ("organic", None, LongType(), True),
        ("ad-click", None, LongType(), True),
        ("ad_click", None, LongType(), True),
        ("search-with-ads", None, LongType(), True),
        ("search_with_ads", None, LongType(), True),
        ("unknown", None, LongType(), True),
        # Roughly 2016-01-01
        ("profile_creation_date", 16801, LongType(), False),
        ("default_search_engine", "google", StringType(), False),
        (
            "default_search_engine_data_load_path",
            "jar:[app]/omni.ja!browser/google.xml",
            StringType(),
            False,
        ),
        (
            "default_search_engine_data_submission_url",
            "https://www.google.com/search?q=&ie=utf-8&oe=utf-8&client=firefox-b",
            StringType(),
            False,
        ),
        ("sessions_started_on_this_day", 1, LongType(), True),
        ("profile_age_in_days", 366, LongType(), True),
        ("subsession_hours_sum", 1.0, DoubleType(), True),
        ("active_addons_count_mean", 2.0, DoubleType(), True),
        ("max_concurrent_tab_count_max", 10, LongType(), True),
        ("tab_open_event_count_sum", 5, LongType(), True),
        ("active_hours_sum", 0.5, DoubleType(), True),
    ]
    factory = define_dataframe_factory([to_field(column) for column in columns])

    client_b_row = {"client_id": "b", "country": "US"}
    # Covers 5 dupe rows and custom app_version, distribution_id rows
    aggregated_row = {
        "app_version": "52.0.3",
        "sap": 28,
        "sessions_started_on_this_day": 7,
        "subsession_hours_sum": 7.0,
        "tab_open_event_count_sum": 35,
        "active_hours_sum": 3.5,
    }
    # Expected row for client "c": null engine/source and a sap count of 0
    client_c_row = {
        "client_id": "c",
        "unknown": None,
        "sap": 0,
        "tagged-sap": None,
        "tagged-follow-on": None,
        "tagged_sap": None,
        "tagged_follow_on": None,
        "source": None,
        "engine": None,
        "user_pref_browser_search_region": None,
    }

    return factory(
        [
            client_b_row,
            aggregated_row,
            {"engine": "bing"},
            {"engine": "yahoo"},
            client_c_row,
        ]
    )
|
|
|
|
|
|
# Testing functions
|
|
|
|
|
|
def test_explode_search_counts(
    simple_main_summary, exploded_simple_main_summary, df_equals
):
    """explode_search_counts should yield one row per search_counts entry."""
    assert df_equals(
        explode_search_counts(simple_main_summary), exploded_simple_main_summary
    )
|
|
|
|
|
|
def test_explode_search_counts_bing_absurd(
    generate_main_summary_data, generate_exploded_data, df_equals
):
    """An entry counting more than MAX_CLIENT_SEARCH_COUNT should be dropped."""
    absurd_bing = generate_search_count(
        engine="bing", count=(MAX_CLIENT_SEARCH_COUNT + 1)
    )
    plain_yahoo = generate_search_count(engine="yahoo")
    main_summary_bing_absurd = generate_main_summary_data(
        [{"search_counts": [absurd_bing, plain_yahoo]}]
    )

    # expected result only includes yahoo, because the bing entry had an absurd
    # number of searches
    expected = generate_exploded_data([{"engine": "yahoo"}])

    assert df_equals(expected, explode_search_counts(main_summary_bing_absurd))
|
|
|
|
|
|
def test_explode_ad_click_counts(
    generate_main_summary_data, generate_exploded_data, df_equals
):
    """Ad-click and search-with-ads scalars should explode into their own rows."""
    ad_scalars = {
        "scalar_parent_browser_search_ad_clicks": {"google": 1},
        "scalar_parent_browser_search_with_ads": {"google": 1},
    }
    main_summary_with_ad_click_counts = generate_main_summary_data([ad_scalars])

    # The default search row, plus one row for each of the two ad scalars
    expected_rows = [{}]
    expected_rows.append({"source": "ad-click:", "count": 1})
    expected_rows.append({"source": "search-with-ads:", "count": 1})
    expected = generate_exploded_data(expected_rows)

    exploded = explode_search_counts(main_summary_with_ad_click_counts)

    assert df_equals(exploded, expected)
|
|
|
|
|
|
def test_add_derived_columns(
    exploded_data_for_derived_cols, derived_columns, df_equals
):
    """add_derived_columns output should match the derived_columns fixture."""
    assert df_equals(
        add_derived_columns(exploded_data_for_derived_cols), derived_columns
    )
|
|
|
|
|
|
def test_basic_aggregation(main_summary, expected_search_dashboard_data, df_equals):
    """search_aggregates over main_summary should match the expected rows."""
    aggregated = search_aggregates(main_summary)

    assert df_equals(aggregated, expected_search_dashboard_data)
|
|
|
|
|
|
def test_search_clients_daily(
    main_summary, expected_search_clients_daily_data, df_equals
):
    """search_clients_daily over main_summary should match the expected rows."""
    assert df_equals(
        search_clients_daily(main_summary), expected_search_clients_daily_data
    )
|