import functools import pytest from collections import namedtuple from pyspark.sql.types import ( StructField, ArrayType, BooleanType, StringType, LongType, MapType, StructType, DoubleType, ) from mozetl.search.aggregates import ( search_aggregates, search_clients_daily, explode_search_counts, add_derived_columns, MAX_CLIENT_SEARCH_COUNT, ) # Some boilerplate to help define example dataframes for testing # A helper class for declaratively creating dataframe factories dataframe_field = namedtuple( "dataframe_field", ["name", "default_value", "type", "nullable"] ) def to_field(field_tuple): """Create a dataframe_field from a tuple""" return dataframe_field(*field_tuple) def get_dataframe_factory_config(fields): """Parse a list of dataframe_fields to a schema and set of default values""" schema = StructType( [StructField(field.name, field.type, field.nullable) for field in fields] ) default_sample = {field.name: field.default_value for field in fields} return schema, default_sample @pytest.fixture def define_dataframe_factory(dataframe_factory): def partial(fields): """Create a dataframe_factory from a set of field configs""" schema, default_sample = get_dataframe_factory_config(fields) return functools.partial( dataframe_factory.create_dataframe, base=default_sample, schema=schema ) return partial # Boilerplate for generating example main_summary tables def generate_search_count(engine="google", source="urlbar", count=4): return {"engine": engine, "source": source, "count": count} addons_type = ArrayType( StructType( [ StructField("addon_id", StringType(), False), StructField("blocklisted", BooleanType(), True), StructField("name", StringType(), True), StructField("user_disabled", BooleanType(), True), StructField("app_disabled", BooleanType(), True), StructField("version", StringType(), True), StructField("scope", LongType(), True), StructField("type", StringType(), True), StructField("foreign_install", BooleanType(), True), StructField("has_binary_components", BooleanType(), True), StructField("install_day", LongType(), True), StructField("update_day", LongType(), True), StructField("signed_state", LongType(), True), StructField("is_system", BooleanType(), True), StructField("is_web_extension", BooleanType(), True), StructField("multiprocess_compatible", BooleanType(), True), ] ) ) def generate_addon(addon_id, name, version): return {"addon_id": addon_id, "name": name, "version": version} active_addons = [ generate_addon("random@mozilla.com", "random", "0.1"), generate_addon("followonsearch@mozilla.com", "Follow-on Search Telemetry", "0.9.5"), ] search_type = ArrayType( StructType( [ StructField("engine", StringType(), False), StructField("source", StringType(), False), StructField("count", LongType(), False), ] ) ) main_summary_schema = [ ("client_id", "a", StringType(), False), ("sample_id", "42", StringType(), False), ("submission_date", "20170101", StringType(), False), ("os", "windows", StringType(), True), ("os_version", "10.0", StringType(), True), ("channel", "release", StringType(), True), ("country", "DE", StringType(), True), ("locale", "de", StringType(), True), ("user_pref_browser_search_region", "DE", StringType(), True), ("search_cohort", None, StringType(), True), ("app_version", "54.0.1", StringType(), True), ("distribution_id", None, StringType(), True), ("subsession_counter", 1, LongType(), True), ("search_counts", [generate_search_count()], search_type, True), ( "scalar_parent_browser_search_ad_clicks", None, MapType(StringType(), LongType()), True, ), ( "scalar_parent_browser_search_with_ads", None, MapType(StringType(), LongType()), True, ), ("active_addons", active_addons, addons_type, True), # 30 minutes in active_ticks (30min * 60sec/min / 5sec/tick) ("active_ticks", 360, LongType(), True), ("scalar_parent_browser_engagement_tab_open_event_count", 5, LongType(), True), ("scalar_parent_browser_engagement_max_concurrent_tab_count", 10, LongType(), True), ("subsession_start_date", "2017-01-01 10:00", StringType(), False), # One hour per ping ("subsession_length", 60 * 60, LongType(), False), # Roughly 2016-01-01 ("profile_creation_date", 16801, LongType(), False), ("default_search_engine", "google", StringType(), False), ( "default_search_engine_data_load_path", "jar:[app]/omni.ja!browser/google.xml", StringType(), False, ), ( "default_search_engine_data_submission_url", "https://www.google.com/search?q=&ie=utf-8&oe=utf-8&client=firefox-b", StringType(), False, ), ] exploded_schema = [ x for x in main_summary_schema if x[0] not in ( "search_counts", "scalar_parent_browser_search_with_ads", "scalar_parent_browser_search_ad_clicks", ) ] + [ ("engine", "google", StringType(), False), ("source", "urlbar", StringType(), False), ("count", 4, LongType(), False), ] derived_schema = exploded_schema + [ ("type", "chrome-sap", StringType(), False), ("addon_version", "0.9.5", StringType(), False), ] @pytest.fixture() def generate_main_summary_data(define_dataframe_factory): return define_dataframe_factory(list(map(to_field, main_summary_schema))) @pytest.fixture def main_summary(generate_main_summary_data): return generate_main_summary_data( [ {"client_id": "b", "country": "US"}, {"app_version": "52.0.3"}, {"distribution_id": "totally not null"}, { "search_counts": [ generate_search_count(engine="bing"), generate_search_count(engine="yahoo"), ] }, ] + # Some duplicate default rows to test aggregation [{}] * 5 + # Client with no searches [ { "client_id": "c", "search_counts": None, "user_pref_browser_search_region": None, } ] ) @pytest.fixture def simple_main_summary(generate_main_summary_data): return generate_main_summary_data( [ { "search_counts": [ generate_search_count(engine="bing"), generate_search_count(engine="yahoo"), ] } ] ) @pytest.fixture() def generate_exploded_data(define_dataframe_factory): return define_dataframe_factory(list(map(to_field, exploded_schema))) @pytest.fixture() def exploded_simple_main_summary(generate_exploded_data): return generate_exploded_data([{"engine": "yahoo"}, {"engine": "bing"}]) @pytest.fixture() def exploded_data_for_derived_cols(generate_exploded_data): return generate_exploded_data( [ {"source": "sap:urlbar:SomeCodeHere"}, {"source": "follow-on:urlbar:SomeCodeHere"}, {"source": "in-content:sap:SomeCodeHere"}, {"source": "in-content:sap-follow-on:SomeCodeHere"}, {"source": "in-content:organic:something"}, {"source": "unknowngarbagestring"}, {"source": "urlbar"}, ] ) @pytest.fixture() def derived_columns(define_dataframe_factory): # template for the expected results factory = define_dataframe_factory(list(map(to_field, derived_schema))) return factory( [ {"source": "sap:urlbar:SomeCodeHere", "type": "tagged-sap"}, {"source": "follow-on:urlbar:SomeCodeHere", "type": "tagged-follow-on"}, {"source": "in-content:sap:SomeCodeHere", "type": "tagged-sap"}, { "source": "in-content:sap-follow-on:SomeCodeHere", "type": "tagged-follow-on", }, {"source": "in-content:organic:something", "type": "organic"}, {"source": "unknowngarbagestring", "type": "unknown"}, {"source": "urlbar", "type": "sap"}, ] ) @pytest.fixture() def expected_search_dashboard_data(define_dataframe_factory): # template for the expected results factory = define_dataframe_factory( list( map( to_field, [ ("submission_date", "20170101", StringType(), False), ("country", "DE", StringType(), True), ("locale", "de", StringType(), True), ("search_cohort", None, StringType(), True), ("app_version", "54.0.1", StringType(), True), ("distribution_id", None, StringType(), True), ("os", "windows", StringType(), True), ("os_version", "10.0", StringType(), True), ("addon_version", "0.9.5", StringType(), False), ("default_search_engine", "google", StringType(), False), ("engine", "google", StringType(), False), ("source", "urlbar", StringType(), False), ("tagged-sap", None, LongType(), True), ("tagged-follow-on", None, LongType(), True), ("tagged_sap", None, LongType(), True), ("tagged_follow_on", None, LongType(), True), ("sap", 4, LongType(), True), ("organic", None, LongType(), True), ("ad-click", None, LongType(), True), ("search-with-ads", None, LongType(), True), ("ad_click", None, LongType(), True), ("search_with_ads", None, LongType(), True), ("unknown", None, LongType(), True), ], ) ) ) return factory( [ {"country": "US"}, {"app_version": "52.0.3"}, {"distribution_id": "totally not null"}, {"engine": "yahoo"}, {"engine": "bing"}, {"sap": 20}, ] ) @pytest.fixture() def expected_search_clients_daily_data(define_dataframe_factory): # template for the expected results factory = define_dataframe_factory( list( map( to_field, [ ("client_id", "a", StringType(), False), ("sample_id", "42", StringType(), False), ("submission_date", "20170101", StringType(), False), ("os", "windows", StringType(), True), ("os_version", "10.0", StringType(), True), ("channel", "release", StringType(), True), ("country", "DE", StringType(), True), ("locale", "de", StringType(), True), ("user_pref_browser_search_region", "DE", StringType(), True), ("search_cohort", None, StringType(), True), ("app_version", "54.0.1", StringType(), True), ("distribution_id", None, StringType(), True), ("addon_version", "0.9.5", StringType(), False), ("engine", "google", StringType(), True), ("source", "urlbar", StringType(), True), ("tagged-sap", None, LongType(), True), ("tagged-follow-on", None, LongType(), True), ("tagged_sap", None, LongType(), True), ("tagged_follow_on", None, LongType(), True), ("sap", 4, LongType(), True), ("organic", None, LongType(), True), ("ad-click", None, LongType(), True), ("ad_click", None, LongType(), True), ("search-with-ads", None, LongType(), True), ("search_with_ads", None, LongType(), True), ("unknown", None, LongType(), True), # Roughly 2016-01-01 ("profile_creation_date", 16801, LongType(), False), ("default_search_engine", "google", StringType(), False), ( "default_search_engine_data_load_path", "jar:[app]/omni.ja!browser/google.xml", StringType(), False, ), ( "default_search_engine_data_submission_url", "https://www.google.com/search?q=&ie=utf-8&oe=utf-8&client=firefox-b", StringType(), False, ), ("sessions_started_on_this_day", 1, LongType(), True), ("profile_age_in_days", 366, LongType(), True), ("subsession_hours_sum", 1.0, DoubleType(), True), ("active_addons_count_mean", 2.0, DoubleType(), True), ("max_concurrent_tab_count_max", 10, LongType(), True), ("tab_open_event_count_sum", 5, LongType(), True), ("active_hours_sum", 0.5, DoubleType(), True), ], ) ) ) return factory( [ {"client_id": "b", "country": "US"}, # Covers 5 dupe rows and custom app_version, distribution_id rows { "app_version": "52.0.3", "sap": 28, "sessions_started_on_this_day": 7, "subsession_hours_sum": 7.0, "tab_open_event_count_sum": 35, "active_hours_sum": 3.5, }, {"engine": "bing"}, {"engine": "yahoo"}, { "client_id": "c", "unknown": None, "sap": 0, "tagged-sap": None, "tagged-follow-on": None, "tagged_sap": None, "tagged_follow_on": None, "source": None, "engine": None, "user_pref_browser_search_region": None, }, ] ) # Testing functions def test_explode_search_counts( simple_main_summary, exploded_simple_main_summary, df_equals ): actual = explode_search_counts(simple_main_summary) assert df_equals(actual, exploded_simple_main_summary) def test_explode_search_counts_bing_absurd( generate_main_summary_data, generate_exploded_data, df_equals ): main_summary_bing_absurd = generate_main_summary_data( [ { "search_counts": [ generate_search_count( engine="bing", count=(MAX_CLIENT_SEARCH_COUNT + 1) ), generate_search_count(engine="yahoo"), ] } ] ) # expected result only includes yahoo, because the bing entry had an absurd # number of searches expected = generate_exploded_data([{"engine": "yahoo"}]) actual = explode_search_counts(main_summary_bing_absurd) assert df_equals(expected, actual) def test_explode_ad_click_counts( generate_main_summary_data, generate_exploded_data, df_equals ): main_summary_with_ad_click_counts = generate_main_summary_data( [ { "scalar_parent_browser_search_ad_clicks": {"google": 1}, "scalar_parent_browser_search_with_ads": {"google": 1}, } ] ) expected = generate_exploded_data( [ {}, {"source": "ad-click:", "count": 1}, {"source": "search-with-ads:", "count": 1}, ] ) exploded = explode_search_counts(main_summary_with_ad_click_counts) assert df_equals(exploded, expected) def test_add_derived_columns( exploded_data_for_derived_cols, derived_columns, df_equals ): actual = add_derived_columns(exploded_data_for_derived_cols) assert df_equals(actual, derived_columns) def test_basic_aggregation(main_summary, expected_search_dashboard_data, df_equals): actual = search_aggregates(main_summary) assert df_equals(actual, expected_search_dashboard_data) def test_search_clients_daily( main_summary, expected_search_clients_daily_data, df_equals ): actual = search_clients_daily(main_summary) assert df_equals(actual, expected_search_clients_daily_data)