python_mozetl/tests/test_clientsdaily.py

152 строки
5.0 KiB
Python

import pytest
import os
from mozetl.schemas import MAIN_SUMMARY_SCHEMA
from mozetl.clientsdaily import rollup as cd
# Expected integer totals, keyed by aggregate column name.
# test_to_profile_day_aggregates sums each of these columns over all
# clients_daily rows and compares the int-truncated result with the value
# here; test_extract_search_counts reuses "search_count_all_sum".
EXPECTED_INTEGER_VALUES = {
    "active_addons_count_mean": 3613,
    "crashes_detected_content_sum": 9,
    "first_paint_mean": 12802105,
    "pings_aggregated_by_this_row": 1122,
    "search_count_all_sum": 1043,
    "scalar_parent_browser_engagement_unique_domains_count_max": 3160,
    "scalar_parent_browser_engagement_unique_domains_count_mean": 2628,
}
@pytest.fixture
def main_summary(spark):
    """Read the anonymized main_summary sample via ``spark.read.json``.

    The JSON file is looked up in the ``resources`` directory that sits
    next to this test module and is parsed with ``MAIN_SUMMARY_SCHEMA``.
    """
    resources_dir = os.path.join(os.path.dirname(__file__), "resources")
    sample_path = os.path.join(
        resources_dir, "main_summary-late-may-1123-rows-anonymized.json"
    )
    return spark.read.json(sample_path, MAIN_SUMMARY_SCHEMA)
@pytest.fixture
def main_summary_with_search(main_summary):
    """Return ``main_summary`` passed through ``cd.extract_search_counts``."""
    with_search = cd.extract_search_counts(main_summary)
    return with_search
@pytest.fixture
def clients_daily(main_summary_with_search):
    """Return the result of ``cd.to_profile_day_aggregates`` on the sample data."""
    aggregates = cd.to_profile_day_aggregates(main_summary_with_search)
    return aggregates
def test_extract_search_counts(main_summary_with_search):
    """The summed ``search_count_all`` column must match the expected total."""
    summed = main_summary_with_search.agg({"search_count_all": "sum"})
    first_row = summed.collect()[0]
    # The aggregate row holds a single value: the sum itself.
    search_total = next(iter(first_row.asDict().values()))
    assert search_total == EXPECTED_INTEGER_VALUES["search_count_all_sum"]
def test_domains_count(main_summary_with_search):
    """The summed unique-domains scalar over all input rows must be 4402."""
    column = "scalar_parent_browser_engagement_unique_domains_count"
    summed = main_summary_with_search.agg({column: "sum"})
    first_row = summed.collect()[0]
    # The aggregate row holds a single value: the sum itself.
    domains_total = next(iter(first_row.asDict().values()))
    assert domains_total == 4402
def test_to_profile_day_aggregates(clients_daily):
    """Check column-wise totals of the per-profile daily aggregates.

    Every column named in ``EXPECTED_INTEGER_VALUES`` is summed across all
    rows of ``clients_daily``; the sum is truncated with ``int`` and compared
    with the expected total. All mismatching columns are reported together
    so a failure shows which aggregates are off.
    """
    # Sum up the means and sums as calculated over 1123 rows,
    # one of which is a duplicate.
    sum_spec = {column: "sum" for column in EXPECTED_INTEGER_VALUES}
    totals = clients_daily.agg(sum_spec).collect()[0]

    mismatches = {}
    for column, expected in EXPECTED_INTEGER_VALUES.items():
        # Spark names the aggregated column "sum(<column>)".
        actual = int(totals["sum({})".format(column)])
        if actual != expected:
            mismatches[column] = (actual, expected)

    assert not mismatches, "column totals differ, as (actual, expected): {}".format(
        mismatches
    )
def test_profile_creation_date_fields(clients_daily):
    """Check ``profile_creation_date`` and ``profile_age_in_days`` samples.

    Ten rows are taken for each column and compared with three candidate
    expectations (one day back, UTC, one day forward); matching any one of
    them passes.
    """
    # Spark's from_unixtime() is apparently sensitive to environment TZ
    # See https://issues.apache.org/jira/browse/SPARK-17971
    # There are therefore three possible expected results, depending on
    # the TZ setting of the system on which the tests run.
    # NOTE(review): take(10) is called without an orderBy, so the list
    # comparison for profile_age_in_days relies on a stable row order.
    dates_back = {
        "2014-12-16",
        "2016-09-07",
        "2016-05-12",
        "2017-02-16",
        "2012-11-17",
        "2013-09-08",
        "2017-02-12",
        "2016-04-04",
        "2017-04-25",
        "2015-06-17",
    }
    dates_utc = {
        "2014-12-17",
        "2016-09-08",
        "2016-05-13",
        "2017-02-17",
        "2012-11-18",
        "2013-09-09",
        "2017-02-13",
        "2016-04-05",
        "2017-04-26",
        "2015-06-18",
    }
    dates_forward = {
        "2014-12-18",
        "2016-09-09",
        "2016-05-14",
        "2017-02-18",
        "2012-11-19",
        "2013-09-10",
        "2017-02-14",
        "2016-04-06",
        "2017-04-27",
        "2015-06-19",
    }

    date_rows = clients_daily.select("profile_creation_date").take(10)
    # Keep only the leading "YYYY-MM-DD" part of each value.
    seen_dates = {next(iter(r.asDict().values()))[:10] for r in date_rows}
    assert seen_dates in (dates_back, dates_utc, dates_forward)

    ages_back = [378, 894, 261, 1361, 101, 1656, 415, 29, 703, 102]
    ages_utc = [377, 893, 260, 1360, 100, 1655, 414, 28, 702, 101]
    ages_forward = [376, 892, 259, 1359, 99, 1654, 413, 27, 701, 100]

    age_rows = clients_daily.select("profile_age_in_days").take(10)
    seen_ages = [next(iter(r.asDict().values())) for r in age_rows]
    assert seen_ages in (ages_back, ages_utc, ages_forward)
def test_sessions_started_on_this_day(clients_daily):
    """Compare ``sessions_started_on_this_day`` for ten rows with fixed values."""
    # NOTE(review): no orderBy precedes take(10), so the expected list
    # relies on the rows coming back in a stable order.
    sample_rows = clients_daily.select("sessions_started_on_this_day").take(10)
    session_counts = [next(iter(r.asDict().values())) for r in sample_rows]
    assert session_counts == [2, 0, 3, 2, 1, 0, 1, 0, 0, 3]
# Similar to test_sessions_started_on_this_day, but a little easier to
# compare with the source data.
def test_sessions_started_on_this_day_sorted(clients_daily):
    """Check ``sessions_started_on_this_day`` for one day, ordered by client.

    Rows are restricted to activity date 2017-05-25 and sorted by
    ``client_id`` before the first ten values are compared.
    """
    single_day = clients_daily.where("activity_date == '2017-05-25'")
    by_client = single_day.orderBy("client_id")
    sample_rows = by_client.select("sessions_started_on_this_day").take(10)
    session_counts = [next(iter(r.asDict().values())) for r in sample_rows]
    assert session_counts == [1, 5, 1, 1, 1, 0, 0, 0, 0, 0]
# Ensure that "first" aggregations skip null values
def test_first_skips_nulls(clients_daily):
    """Check ``sync_configured`` for one client-day and the overall null count.

    The selected client-day must report ``sync_configured`` equal to False,
    and exactly 230 rows must have a null ``sync_configured``.
    """
    client_id = "0c495fce-5fbf-4f4a-ac03-2dedcef0a8d0"
    activity_date = "2017-05-25"
    where_clause = "client_id = '{}' and activity_date = '{}'".format(
        client_id, activity_date
    )
    # presumably this client-day mixes null and False in the source pings,
    # hence the original "null_to_false" naming -- verify against the data
    client_day = clients_daily.where(where_clause)
    first_row = client_day.select("sync_configured").first()
    expected_flag = False
    assert first_row.sync_configured == expected_flag

    expected_null_rows = 230
    null_rows = clients_daily.where("sync_configured is null").count()
    assert null_rows == expected_null_rows