Update dependencies to fix ci (#354)

Parent: 5756722cd5
Commit: e42d0a9d23

@@ -16,7 +16,7 @@ install_dependencies: &install_dependencies
   command: |
     apt update
     apt install -y libsnappy-dev openjdk-8-jre-headless
-    pip install tox coverage
+    pip install tox coverage==5.3
 
 save_cache_settings: &save_cache_settings
   key: v1-python_mozetl-{{ checksum "setup.py" }}
@@ -66,20 +66,15 @@ test_settings: &test_settings
 
 version: 2
 jobs:
-  py27:
+  py37:
     <<: *test_settings
     parallelism: 4
     docker:
-      - image: python:2.7-stretch
-  py35:
-    <<: *test_settings
-    parallelism: 4
-    docker:
-      - image: python:3.5-stretch
+      - image: python:3.7-stretch
 
   lint:
     docker:
-      - image: python:3.6-stretch
+      - image: python:3.7-stretch
     working_directory: ~/python_mozetl
     steps:
       - checkout
@@ -93,7 +88,7 @@ jobs:
 
   docs:
     docker:
-      - image: python:2.7-stretch
+      - image: python:3.7-stretch
     working_directory: ~/python_mozetl
     steps:
       - checkout
@@ -137,8 +132,7 @@ workflows:
   version: 2
   build:
     jobs:
-      - py27
-      - py35
+      - py37
       - lint
       - docs
      - docs-deploy:

@@ -81,7 +81,7 @@ def extract_search_counts(frame):
     extracted = grouped.select(
         "did",
         F.col("sum(search_count_atom)").alias("search_count_all"),
-        *[F.col("sum({})".format(c)).alias(c) for c in SEARCH_ACCESS_COLUMNS]
+        *[F.col("sum({})".format(c)).alias(c) for c in SEARCH_ACCESS_COLUMNS],
     )
     # Create a homologous output row for each input row
     # where search_counts is NULL.
@@ -91,7 +91,7 @@ def extract_search_counts(frame):
         .select(
             "did",
             F.lit(0).alias("search_count_all"),
-            *[F.lit(0).alias(c) for c in SEARCH_ACCESS_COLUMNS]
+            *[F.lit(0).alias(c) for c in SEARCH_ACCESS_COLUMNS],
         )
     )
     intermediate = extracted.unionAll(nulls)

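Note: the trailing comma added after the final `*[...]` argument above (and after the `**kwargs` arguments later in this diff) is only valid syntax on Python 3; on Python 2.7 a trailing comma after a `*` or `**` unpacking is a SyntaxError. This is black's "magic trailing comma" style and presumably only became possible once the py27 environment was dropped. A minimal, generic illustration (names are placeholders, not from this repo):

    # Runs on Python 3; the same call is a SyntaxError on Python 2.7.
    def describe(*args, **kwargs):
        return len(args), sorted(kwargs)

    counts = describe(
        1,
        2,
        *[3, 4],
        **{"a": 5},  # trailing comma after a ** unpacking requires Python 3
    )
    assert counts == (4, ["a"])
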
@@ -155,8 +155,8 @@ class Prof(object):
 
 # Helpers.
 def fix_vendor(vendor_id):
-    if vendor_id == u"Intel Open Source Technology Center":
-        return u"0x8086"
+    if vendor_id == "Intel Open Source Technology Center":
+        return "0x8086"
     return vendor_id
 
 
@@ -427,7 +427,7 @@ class Trend(TrendBase):
 
         text = json.dumps(self.cache)
 
-        print("Writing file {0}".format(self.local_path, text))
+        print("Writing file {0}".format(self.local_path))
         with open(self.local_path, "w") as fp:
             fp.write(text)
 
@@ -621,9 +621,9 @@ if __name__ == "__main__":
                 WinArchTrend(),
                 WindowsVendorTrend(),
                 WindowsVistaPlusGroup([Direct2DTrend(), Direct3D11Trend()]),
-                DeviceGenTrend(u"0x8086", "intel"),
-                DeviceGenTrend(u"0x10de", "nvidia"),
-                DeviceGenTrend(u"0x1002", "amd"),
+                DeviceGenTrend("0x8086", "intel"),
+                DeviceGenTrend("0x10de", "nvidia"),
+                DeviceGenTrend("0x1002", "amd"),
             ]
         ),
     ]

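Note: two small cleanups in the file above. `str.format` silently ignores surplus positional arguments, so the old `print("Writing file {0}".format(self.local_path, text))` never actually printed `text`; the new call just drops the unused argument. The `u"..."` prefixes are redundant on Python 3, where every string literal is already unicode, and black 20.8b1 also normalizes them away. A quick sketch:

    # str.format ignores extra positional arguments rather than raising,
    # which is how the unused second argument went unnoticed:
    msg = "Writing file {0}".format("/tmp/out.json", "silently dropped")
    assert msg == "Writing file /tmp/out.json"

    # On Python 3 the u prefix is a no-op:
    assert u"0x8086" == "0x8086" and type(u"0x8086") is str
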
@@ -372,7 +372,7 @@ def search_aggregates_etl(submission_date, bucket, prefix, **kwargs):
         prefix,
         SEARCH_AGGREGATES_VERSION,
         search_aggregates,
-        **kwargs
+        **kwargs,
     )
 
 
@@ -384,7 +384,7 @@ def search_clients_daily_etl(submission_date, bucket, prefix, **kwargs):
         SEARCH_CLIENTS_DAILY_VERSION,
         search_clients_daily,
         orderBy=["sample_id"],
-        **kwargs
+        **kwargs,
     )
 
 
@@ -24,12 +24,12 @@ spark = SparkSession.builder.appName("modules-with-missing-symbols").getOrCreate
 
 sc.addPyFile("stemming-1.0.1/stemming/porter2.py")
 
-from crashcorrelations import (
+from crashcorrelations import (  # noqa E402
     utils,
     download_data,
     crash_deviations,
     comments,
-)  # noqa E402
+)
 
 
 # workaround airflow not able to different schedules for tasks in a dag

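Note: a `# noqa` comment only suppresses warnings reported on the physical line it sits on. E402 (module-level import not at top of file) is reported on the line containing `from crashcorrelations import (`, not on the closing parenthesis, so the comment is moved up to the line that triggers the warning. (Also worth knowing: flake8 only honours a code list after a colon, so `# noqa E402` without the colon acts as a blanket suppression for that line.) A self-contained sketch of the pattern, using a stdlib module as a stand-in:

    import sys

    sys.path.insert(0, ".")  # any executable statement before an import triggers E402

    from collections import (  # noqa: E402  <- must sit on the line flake8 reports
        OrderedDict,
        defaultdict,
    )

    print(OrderedDict(), defaultdict(int))
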
@@ -78,7 +78,7 @@ def get_df(spark, date_from):
 
 
 def get_addons_per_client(users_df, minimum_addons_count):
-    """ Extracts a DataFrame that contains one row
+    """Extracts a DataFrame that contains one row
     for each client along with the list of active add-on GUIDs.
     """
 
@@ -328,8 +328,7 @@ def today_minus_7_days():
 
 
 def verify_valid_coefs(coefs):
-    """ verify that the model has proper floating point values (> 0)
-    """
+    """verify that the model has proper floating point values (> 0)"""
 
     assert "ensemble_weights" in coefs
     weights = coefs["ensemble_weights"]
@@ -362,9 +361,9 @@ def verify_valid_coefs(coefs):
 
 
 class CostLLR:
-    """ based on Niko Brummer's original implementation:
-        Niko Brummer and Johan du Preez, Application-Independent Evaluation of Speaker Detection"
-        Computer Speech and Language, 2005
+    """based on Niko Brummer's original implementation:
+    Niko Brummer and Johan du Preez, Application-Independent Evaluation of Speaker Detection"
+    Computer Speech and Language, 2005
     """
 
     def __init__(self):
@@ -417,8 +416,8 @@ class CostLLR:
 
 def cross_validation_split(dataset, k_folds):
     """
-        Splits dataframe into k_folds, returning array of dataframes
-        """
+    Splits dataframe into k_folds, returning array of dataframes
+    """
     dataset_split = []
     h = 1.0 / k_folds
     df = dataset.select("*", rand().alias("rand"))

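Note: the docstring churn in this and the following files (no space after the opening `"""`, single-line docstrings collapsed onto one line, continuation lines re-indented) matches the docstring handling that black added in 20.8b1, which is the version pinned in the tox.ini change at the end of this diff. The two shapes are equivalent at runtime; only the literal text of `__doc__` changes slightly:

    def before():
        """ load some training data given a sparkContext
        """


    def after():
        """load some training data given a sparkContext"""


    print(repr(before.__doc__))
    print(repr(after.__doc__))
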
@@ -27,7 +27,7 @@ ONE_WEEK_AGO = (dt.datetime.now() - dt.timedelta(days=7)).strftime("%Y%m%d")
 
 
 def is_valid_addon(broadcast_amo_whitelist, guid, addon):
-    """ Filter individual addons out to exclude, system addons,
+    """Filter individual addons out to exclude, system addons,
     legacy addons, disabled addons, sideloaded addons.
     """
     return not (
@@ -47,7 +47,7 @@ def is_valid_addon(broadcast_amo_whitelist, guid, addon):
 
 
 def get_addons_per_client(broadcast_amo_whitelist, users_df):
-    """ Extracts a DataFrame that contains one row
+    """Extracts a DataFrame that contains one row
     for each client along with the list of active add-on GUIDs.
     """
 
@@ -71,7 +71,7 @@ def get_addons_per_client(broadcast_amo_whitelist, users_df):
 
 
 def get_initial_sample(spark):
-    """ Takes an initial sample from the longitudinal dataset
+    """Takes an initial sample from the longitudinal dataset
     (randomly sampled from main summary). Coarse filtering on:
     - number of installed addons (greater than 1)
     - corrupt and generally wierd telemetry entries
@@ -92,8 +92,7 @@ def get_initial_sample(spark):
 
 
 def extract_telemetry(spark):
-    """ load some training data from telemetry given a sparkContext
-    """
+    """load some training data from telemetry given a sparkContext"""
     sc = spark.sparkContext
 
     # Define the set of feature names to be used in the donor computations.

@@ -18,8 +18,7 @@ OUTPUT_BASE_FILENAME = "guid_install_ranking"
 
 
 def extract_telemetry(sparkSession):
-    """ Load some training data from telemetry given a sparkContext
-    """
+    """Load some training data from telemetry given a sparkContext"""
     frame = sparkSession.sql(
         """
         SELECT
@@ -42,7 +41,7 @@ def extract_telemetry(sparkSession):
 
 
 def transform(frame):
-    """ Convert the dataframe to JSON and augment each record to
+    """Convert the dataframe to JSON and augment each record to
     include the install count for each addon.
     """
 
@@ -86,7 +86,7 @@ def get_samples(spark, date_from):
 
 
 def get_addons_per_client(users_df, addon_whitelist, minimum_addons_count):
-    """ Extracts a DataFrame that contains one row
+    """Extracts a DataFrame that contains one row
     for each client along with the list of active add-on GUIDs.
     """
 
@@ -125,8 +125,7 @@ def get_addons_per_client(users_df, addon_whitelist, minimum_addons_count):
 
 
 def compute_clusters(addons_df, num_clusters, random_seed):
-    """ Performs user clustering by using add-on ids as features.
-    """
+    """Performs user clustering by using add-on ids as features."""
 
     # Build the stages of the pipeline. We need hashing to make the next
     # steps work.
@@ -145,8 +144,7 @@ def compute_clusters(addons_df, num_clusters, random_seed):
 
 
 def get_donor_pools(users_df, clusters_df, num_donors, random_seed=None):
-    """ Samples users from each cluster.
-    """
+    """Samples users from each cluster."""
     cluster_population = clusters_df.groupBy("prediction").count().collect()
     clusters_histogram = [(x["prediction"], x["count"]) for x in cluster_population]
 
@@ -216,7 +214,7 @@ def format_donors_dictionary(donors_df):
 
 
 def similarity_function(x, y):
-    """ Similarity function for comparing user features.
+    """Similarity function for comparing user features.
 
     This actually really should be implemented in taar.similarity_recommender
     and then imported here for consistency.
@@ -260,7 +258,7 @@ def generate_non_cartesian_pairs(first_rdd, second_rdd):
 def get_lr_curves(
     spark, features_df, cluster_ids, kernel_bandwidth, num_pdf_points, random_seed=None
 ):
-    """ Compute the likelihood ratio curves for clustered clients.
+    """Compute the likelihood ratio curves for clustered clients.
 
     Work-flow followed in this function is as follows:
 
@@ -88,7 +88,7 @@ def store_json_to_s3(json_data, base_filename, date, prefix, bucket):
 
 
 def load_amo_external_whitelist():
-    """ Download and parse the AMO add-on whitelist.
+    """Download and parse the AMO add-on whitelist.
 
     :raises RuntimeError: the AMO whitelist file cannot be downloaded or contains
     no valid add-ons.
@@ -133,8 +133,8 @@ def load_amo_curated_whitelist():
 
 def hash_telemetry_id(telemetry_id):
     """
-        This hashing function is a reference implementation based on :
-        https://phabricator.services.mozilla.com/D8311
+    This hashing function is a reference implementation based on :
+    https://phabricator.services.mozilla.com/D8311
 
     """
     return hashlib.sha256(telemetry_id.encode("utf8")).hexdigest()

@@ -44,7 +44,7 @@ def generate_filter_parameters(end_date, days_back):
 
 
 def write_csv(dataframe, path, header=True):
-    """ Write a dataframe to local disk.
+    """Write a dataframe to local disk.
 
     Disclaimer: Do not write csv files larger than driver memory. This
     is ~15GB for ec2 c3.xlarge (due to caching overhead).

setup.py (18 changed lines)

@@ -2,13 +2,13 @@
 from setuptools import setup, find_packages
 
 test_deps = [
-    'coverage==4.5.2',
+    'coverage==5.3',
     'pytest-cov==2.6.0',
     'pytest-timeout==1.3.3',
     'moto==1.3.16',
     'mock==2.0.0',
     'pytest==3.10.1',
-    'flake8==3.6.0'
+    'flake8==3.8.4'
 ]
 
 extras = {
@@ -30,17 +30,17 @@ setup(
         'boto==2.49.0',
         'boto3==1.16.20',
         'botocore==1.19.20',
-        'click==6.7',
+        'click==7.1.2',
         'click_datetime==0.2',
-        'numpy==1.13.3',
-        'pandas==0.23.4',
+        'numpy==1.19.4',
+        'pandas==1.1.4',
         'pyspark==2.3.2',
         'python_moztelemetry==0.10.2',
-        'requests-toolbelt==0.8.0',
-        'requests==2.20.1',
-        'scipy==1.0.0rc1',
+        'requests-toolbelt==0.9.1',
+        'requests==2.25.0',
+        'scipy==1.5.4',
         'typing==3.6.4',
-        'six==1.11.0',
+        'six==1.15.0',
     ],
     tests_require=test_deps,
     extras_require=extras,

@@ -66,44 +66,44 @@ def test_profile_creation_date_fields(clients_daily):
     # the TZ setting of the system on which the tests run.
     expected_back = set(
         [
-            u"2014-12-16",
-            u"2016-09-07",
-            u"2016-05-12",
-            u"2017-02-16",
-            u"2012-11-17",
-            u"2013-09-08",
-            u"2017-02-12",
-            u"2016-04-04",
-            u"2017-04-25",
-            u"2015-06-17",
+            "2014-12-16",
+            "2016-09-07",
+            "2016-05-12",
+            "2017-02-16",
+            "2012-11-17",
+            "2013-09-08",
+            "2017-02-12",
+            "2016-04-04",
+            "2017-04-25",
+            "2015-06-17",
         ]
     )
     expected_utc = set(
         [
-            u"2014-12-17",
-            u"2016-09-08",
-            u"2016-05-13",
-            u"2017-02-17",
-            u"2012-11-18",
-            u"2013-09-09",
-            u"2017-02-13",
-            u"2016-04-05",
-            u"2017-04-26",
-            u"2015-06-18",
+            "2014-12-17",
+            "2016-09-08",
+            "2016-05-13",
+            "2017-02-17",
+            "2012-11-18",
+            "2013-09-09",
+            "2017-02-13",
+            "2016-04-05",
+            "2017-04-26",
+            "2015-06-18",
         ]
     )
     expected_forward = set(
         [
-            u"2014-12-18",
-            u"2016-09-09",
-            u"2016-05-14",
-            u"2017-02-18",
-            u"2012-11-19",
-            u"2013-09-10",
-            u"2017-02-14",
-            u"2016-04-06",
-            u"2017-04-27",
-            u"2015-06-19",
+            "2014-12-18",
+            "2016-09-09",
+            "2016-05-14",
+            "2017-02-18",
+            "2012-11-19",
+            "2013-09-10",
+            "2017-02-14",
+            "2016-04-06",
+            "2017-04-27",
+            "2015-06-19",
         ]
     )
     ten_pcds = clients_daily.select("profile_creation_date").take(10)

@@ -15,19 +15,19 @@ def sample_document():
         # string before passing through to the sampler code.
         "content": {"payload": {"foo": "bar"}},
         "meta": {
-            u"Content-Length": u"7094",
-            u"Date": u"Sun, 19 Aug 2018 15:08:00 GMT",
-            u"Host": u"incoming.telemetry.mozilla.org",
-            "Hostname": u"ip-1.1.1.1",
+            "Content-Length": "7094",
+            "Date": "Sun, 19 Aug 2018 15:08:00 GMT",
+            "Host": "incoming.telemetry.mozilla.org",
+            "Hostname": "ip-1.1.1.1",
             "Timestamp": 1534691279765301222,
-            "Type": u"telemetry-raw",
-            u"User-Agent": u"pingsender/1.0",
-            u"X-Forwarded-For": u"127.0.0.1",
-            u"X-PingSender-Version": u"1.0",
-            u"args": u"v=4",
-            u"protocol": u"HTTP/1.1",
-            u"remote_addr": u"1.1.1.1",
-            u"uri": u"/submit/telemetry/doc-id/main/Firefox/61.0.2/release/20180807170231",
+            "Type": "telemetry-raw",
+            "User-Agent": "pingsender/1.0",
+            "X-Forwarded-For": "127.0.0.1",
+            "X-PingSender-Version": "1.0",
+            "args": "v=4",
+            "protocol": "HTTP/1.1",
+            "remote_addr": "1.1.1.1",
+            "uri": "/submit/telemetry/doc-id/main/Firefox/61.0.2/release/20180807170231",
         },
     }
 
@@ -10,7 +10,7 @@ from pyspark.sql.types import ArrayType, LongType, StringType, StructField, Stru
 
 @pytest.fixture()
 def sync_summary_schema():
-    """"Generate a schema for sync_summary. This subset contains enough
+    """Generate a schema for sync_summary. This subset contains enough
     structure for testing bookmark validation. The schema is derived from
     [`telemetry-batch-view`][1].
 
@@ -241,10 +241,16 @@ EXPECTED_FINAL_JDATA = {
 
 @pytest.yield_fixture(scope="function")
 def s3_fixture():
-    mock_s3().start()
+    s3 = mock_s3()
+    s3.start()
 
     conn = boto3.resource("s3", region_name="us-west-2")
-    conn.create_bucket(Bucket=taar_amowhitelist.AMO_DUMP_BUCKET)
+    conn.create_bucket(
+        Bucket=taar_amowhitelist.AMO_DUMP_BUCKET,
+        CreateBucketConfiguration={
+            "LocationConstraint": "us-west-2",
+        },
+    )
     taar_utils.store_json_to_s3(
         json.dumps(SAMPLE_DATA),
         taar_amowhitelist.AMO_DUMP_BASE_FILENAME,
@@ -253,7 +259,7 @@ def s3_fixture():
         taar_amowhitelist.AMO_DUMP_BUCKET,
     )
     yield conn, SAMPLE_DATA
-    mock_s3().stop()
+    s3.stop()
 
 
 def test_extract(s3_fixture):

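Note on the fixture changes above, which repeat throughout the tests below. First, each call to `mock_s3()` constructs a new mock object, so the old `mock_s3().start()` / `mock_s3().stop()` pair started one mock and "stopped" a different, never-started one; keeping the instance in `s3` stops the same mock that was started. Second, with the upgraded boto3/botocore (and moto 1.3.16), `create_bucket` against a region other than us-east-1 must pass a matching `CreateBucketConfiguration`, which is why every mocked bucket gains the `LocationConstraint` argument. A minimal sketch, assuming moto and boto3 are installed (the bucket name is a placeholder):

    import boto3
    from moto import mock_s3

    mock = mock_s3()  # keep the instance so start()/stop() act on the same mock
    mock.start()
    try:
        conn = boto3.resource("s3", region_name="us-west-2")
        conn.create_bucket(
            Bucket="example-bucket",
            CreateBucketConfiguration={"LocationConstraint": "us-west-2"},
        )
        assert [b.name for b in conn.buckets.all()] == ["example-bucket"]
    finally:
        mock.stop()
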
@@ -108,44 +108,35 @@ MOCK_KEYED_ADDONS = [
 
 EXPECTED_GUID_GUID_DATA = [
     Row(
-        key_addon=u"test-guid-2",
+        key_addon="test-guid-2",
         coinstallation_counts=[
-            Row(id=u"test-guid-6", n=1),
-            Row(id=u"test-guid-5", n=1),
-            Row(id=u"test-guid-3", n=1),
-            Row(id=u"test-guid-1", n=1),
+            Row(id="test-guid-6", n=1),
+            Row(id="test-guid-5", n=1),
+            Row(id="test-guid-3", n=1),
+            Row(id="test-guid-1", n=1),
         ],
     ),
-    Row(key_addon=u"test-guid-4", coinstallation_counts=[Row(id=u"test-guid-1", n=1)]),
+    Row(key_addon="test-guid-4", coinstallation_counts=[Row(id="test-guid-1", n=1)]),
     Row(
-        key_addon=u"test-guid-3",
-        coinstallation_counts=[
-            Row(id=u"test-guid-2", n=1),
-            Row(id=u"test-guid-1", n=2),
-        ],
-    ),
-    Row(
-        key_addon=u"test-guid-5",
-        coinstallation_counts=[
-            Row(id=u"test-guid-6", n=1),
-            Row(id=u"test-guid-2", n=1),
-        ],
-    ),
-    Row(
-        key_addon=u"test-guid-1",
-        coinstallation_counts=[
-            Row(id=u"test-guid-2", n=1),
-            Row(id=u"test-guid-1", n=2),
-            Row(id=u"test-guid-3", n=2),
-            Row(id=u"test-guid-4", n=1),
-        ],
-    ),
-    Row(
-        key_addon=u"test-guid-6",
-        coinstallation_counts=[
-            Row(id=u"test-guid-2", n=1),
-            Row(id=u"test-guid-5", n=1),
-        ],
+        key_addon="test-guid-3",
+        coinstallation_counts=[Row(id="test-guid-2", n=1), Row(id="test-guid-1", n=2)],
+    ),
+    Row(
+        key_addon="test-guid-5",
+        coinstallation_counts=[Row(id="test-guid-6", n=1), Row(id="test-guid-2", n=1)],
+    ),
+    Row(
+        key_addon="test-guid-1",
+        coinstallation_counts=[
+            Row(id="test-guid-2", n=1),
+            Row(id="test-guid-1", n=2),
+            Row(id="test-guid-3", n=2),
+            Row(id="test-guid-4", n=1),
+        ],
+    ),
+    Row(
+        key_addon="test-guid-6",
+        coinstallation_counts=[Row(id="test-guid-2", n=1), Row(id="test-guid-5", n=1)],
     ),
 ]
 
@@ -236,7 +227,12 @@ def test_load_s3(spark):
 
     # Create the bucket before we upload
     conn = boto3.resource("s3", region_name="us-west-2")
-    bucket_obj = conn.create_bucket(Bucket=BUCKET)
+    bucket_obj = conn.create_bucket(
+        Bucket=BUCKET,
+        CreateBucketConfiguration={
+            "LocationConstraint": "us-west-2",
+        },
+    )
 
     load_df = spark.createDataFrame(EXPECTED_GUID_GUID_DATA)
     taar_lite_guidguid.load_s3(load_df, "20180301", PREFIX, BUCKET)

@@ -55,10 +55,10 @@ MOCK_TELEMETRY_SAMPLE = [
 ]
 
 EXPECTED_ADDON_INSTALLATIONS = {
-    u"test-guid-1": 100,
-    u"test-guid-2": 200,
-    u"test-guid-3": 300,
-    u"test-guid-4": 400,
+    "test-guid-1": 100,
+    "test-guid-2": 200,
+    "test-guid-3": 300,
+    "test-guid-4": 400,
 }
 
 
@@ -74,12 +74,12 @@ def test_extract_phase(spark):
 
     output = dict(extract_df.rdd.map(lambda_func).collect())
     EXPECTED = {
-        u"test-guid-1": 1,
-        u"test-guid-2": 3,
-        u"test-guid-3": 3,
-        u"test-guid-4": 2,
-        u"test-guid-5": 2,
-        u"test-guid-6": 1,
+        "test-guid-1": 1,
+        "test-guid-2": 3,
+        "test-guid-3": 3,
+        "test-guid-4": 2,
+        "test-guid-5": 2,
+        "test-guid-6": 1,
     }
     assert EXPECTED == output
 
@@ -106,7 +106,12 @@ def test_load_s3(spark):
 
     # Create the bucket before we upload
     conn = boto3.resource("s3", region_name="us-west-2")
-    bucket_obj = conn.create_bucket(Bucket=BUCKET)
+    bucket_obj = conn.create_bucket(
+        Bucket=BUCKET,
+        CreateBucketConfiguration={
+            "LocationConstraint": "us-west-2",
+        },
+    )
 
     rdd = spark.createDataFrame(MOCK_TELEMETRY_SAMPLE)
     result_json = taar_lite_guidranking.transform(rdd)

@@ -85,7 +85,12 @@ def test_load(mock_transformed_data):
     date = "20190105"
 
     conn = boto3.resource("s3", region_name="us-west-2")
-    conn.create_bucket(Bucket=bucket)
+    conn.create_bucket(
+        Bucket=bucket,
+        CreateBucketConfiguration={
+            "LocationConstraint": "us-west-2",
+        },
+    )
 
     taar_update_whitelist.load_etl(mock_transformed_data, date, prefix, bucket)
 
@@ -50,7 +50,12 @@ def test_read_from_s3():
     s3_json_fname = "test.json"
 
     conn = boto3.resource("s3", region_name="us-west-2")
-    conn.create_bucket(Bucket=bucket)
+    conn.create_bucket(
+        Bucket=bucket,
+        CreateBucketConfiguration={
+            "LocationConstraint": "us-west-2",
+        },
+    )
 
     with NamedTemporaryFile("w") as json_file:
         json.dump(SAMPLE_DATA, json_file)
@@ -71,7 +76,12 @@ def test_write_to_s3():
     dest_filename = "test.json"
 
     conn = boto3.resource("s3", region_name="us-west-2")
-    bucket_obj = conn.create_bucket(Bucket=bucket)
+    bucket_obj = conn.create_bucket(
+        Bucket=bucket,
+        CreateBucketConfiguration={
+            "LocationConstraint": "us-west-2",
+        },
+    )
 
     with NamedTemporaryFile("w") as json_file:
         json.dump(SAMPLE_DATA, json_file)
@@ -102,7 +112,12 @@ def test_write_json_s3():
     content = {"it-IT": ["firefox@getpocket.com"]}
 
     conn = boto3.resource("s3", region_name="us-west-2")
-    bucket_obj = conn.create_bucket(Bucket=bucket)
+    bucket_obj = conn.create_bucket(
+        Bucket=bucket,
+        CreateBucketConfiguration={
+            "LocationConstraint": "us-west-2",
+        },
+    )
 
     # Store the data in the mocked bucket.
     taar_utils.store_json_to_s3(
@@ -123,7 +138,12 @@ def test_write_json_s3():
 @mock_s3
 def test_load_amo_external_whitelist():
     conn = boto3.resource("s3", region_name="us-west-2")
-    conn.create_bucket(Bucket=taar_utils.AMO_DUMP_BUCKET)
+    conn.create_bucket(
+        Bucket=taar_utils.AMO_DUMP_BUCKET,
+        CreateBucketConfiguration={
+            "LocationConstraint": "us-west-2",
+        },
+    )
 
     # Make sure that whitelist loading fails before mocking the S3 file.
     EXCEPTION_MSG = "Empty AMO whitelist detected"

@@ -31,7 +31,9 @@ def test_write_csv_ascii(generate_data, tmpdir):
     with open(path, "rb") as f:
         data = f.read()
 
-    assert [l.decode("utf-8") for l in data.rstrip().split(b"\r\n")[1:]] == test_data
+    assert [
+        line.decode("utf-8") for line in data.rstrip().split(b"\r\n")[1:]
+    ] == test_data
 
 
 def test_generate_filter_parameters():
@@ -58,7 +60,7 @@ def test_generate_filter_parameters():
 
 
 def test_write_csv_valid_unicode(generate_data, tmpdir):
-    test_data = [u"∆", u"∫", u"∬"]
+    test_data = ["∆", "∫", "∬"]
     df = generate_data(test_data)
 
     path = str(tmpdir.join("test_data.csv"))
@@ -67,7 +69,9 @@ def test_write_csv_valid_unicode(generate_data, tmpdir):
     with open(path, "rb") as f:
         data = f.read()
 
-    assert [l.decode("utf-8") for l in data.rstrip().split(b"\r\n")[1:]] == test_data
+    assert [
+        line.decode("utf-8") for line in data.rstrip().split(b"\r\n")[1:]
+    ] == test_data
 
 
 @mock_s3
@@ -76,7 +80,12 @@ def test_write_csv_to_s3(generate_data):
     key = "test.csv"
 
     conn = boto3.resource("s3", region_name="us-west-2")
-    conn.create_bucket(Bucket=bucket)
+    conn.create_bucket(
+        Bucket=bucket,
+        CreateBucketConfiguration={
+            "LocationConstraint": "us-west-2",
+        },
+    )
 
     utils.write_csv_to_s3(generate_data(["foo"]), bucket, key)
 
@@ -92,7 +101,12 @@ def test_write_csv_to_s3_no_header(generate_data):
     key = "test.csv"
 
     conn = boto3.resource("s3", region_name="us-west-2")
-    conn.create_bucket(Bucket=bucket)
+    conn.create_bucket(
+        Bucket=bucket,
+        CreateBucketConfiguration={
+            "LocationConstraint": "us-west-2",
+        },
+    )
 
     utils.write_csv_to_s3(generate_data(), bucket, key, header=False)
 
@@ -107,7 +121,12 @@ def test_write_csv_to_s3_existing(generate_data):
     key = "test.csv"
 
     conn = boto3.resource("s3", region_name="us-west-2")
-    conn.create_bucket(Bucket=bucket)
+    conn.create_bucket(
+        Bucket=bucket,
+        CreateBucketConfiguration={
+            "LocationConstraint": "us-west-2",
+        },
+    )
 
     utils.write_csv_to_s3(generate_data(["foo"]), bucket, key)
     utils.write_csv_to_s3(generate_data(["foo", "bar"]), bucket, key)

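Note: renaming the comprehension variable from `l` to `line` presumably satisfies pycodestyle's E741 ("ambiguous variable name") under the newer flake8 pin; the longer name then pushes the assert past black's default 88-character line length, which is why it gets wrapped across three lines. Behaviour is unchanged:

    data = b"header\r\nfoo\r\nbar"

    short = [l.decode("utf-8") for l in data.rstrip().split(b"\r\n")[1:]]  # noqa: E741
    renamed = [line.decode("utf-8") for line in data.rstrip().split(b"\r\n")[1:]]

    assert short == renamed == ["foo", "bar"]
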
tox.ini (8 changed lines)

@@ -4,7 +4,7 @@
 # and then run "tox" from this directory.
 
 [tox]
-envlist = py27, py35, flake8, black, docs
+envlist = py37, flake8, black, docs
 
 [pytest]
 addopts =
@@ -20,17 +20,17 @@ max-line-length = 100
 
 [testenv:flake8]
 deps =
-    flake8==3.6.0
+    flake8==3.8.4
 commands =
     flake8 mozetl tests
 
 [testenv:black]
-deps = black
+deps = black==20.8b1
 commands = black --check mozetl/ tests/
 
 [testenv:docs]
 description = invoke sphinx-build to build HTML docs
-basepython = python2.7
+basepython = python3.7
 deps =
     sphinx >= 1.7.5, < 2
     m2r