Add taarlite guid ranking job (#9)
This commit is contained in:
Родитель
96de28b2e6
Коммит
ddc33901f6
|
@ -1,7 +1,7 @@
|
|||
FROM continuumio/miniconda3
|
||||
ENV PYTHONDONTWRITEBYTECODE 1
|
||||
|
||||
MAINTAINER Victor Ng <vng@mozilla.com>
|
||||
MAINTAINER Evgeny Pavlov <epavlov@mozilla.com>
|
||||
|
||||
# add a non-privileged user for installing and running
|
||||
# the application
|
||||
|
|
24
README.md
24
README.md
|
@ -19,10 +19,14 @@ cluster using a git checkout.
|
|||
|
||||
## New GCS storage locations
|
||||
|
||||
Top level bucket:
|
||||
Prod buckets:
|
||||
|
||||
GCS_BUCKET=moz-fx-data-derived-datasets-parquet
|
||||
moz-fx-data-taar-pr-prod-e0f7-prod-etl
|
||||
moz-fx-data-taar-pr-prod-e0f7-prod-models
|
||||
|
||||
Test bucket:
|
||||
|
||||
taar_models
|
||||
|
||||
## Jobs
|
||||
|
||||
|
@ -47,6 +51,15 @@ taar_etl.taar_amowhitelist
|
|||
Path: gs://taar_models/addon_recommender/featured_addons_database.json
|
||||
Path: gs://taar_models/addon_recommender/featured_whitelist_addons.json
|
||||
|
||||
taar_lite_guid_ranking
|
||||
|
||||
This job loads installation counts by addon from the BigQuery `telemetry.addons` table
|
||||
and saves it to GCS.
|
||||
|
||||
Output file:
|
||||
Path: gs://taar_models/taar/lite/guid_install_ranking.json
|
||||
|
||||
|
||||
taar_etl.taar_update_whitelist
|
||||
|
||||
This job extracts the editorial approved addons from AMO
|
||||
|
@ -60,7 +73,6 @@ taar_etl.taar_update_whitelist
|
|||
|
||||
taar_etl.taar_profile_bigtable
|
||||
|
||||
|
||||
This task is responsible for extracting data from BigQuery from
|
||||
the telemetry table: `clients_last_seen`
|
||||
and exporting temporary files in Avro format to a bucket in Google
|
||||
|
@ -82,16 +94,19 @@ taar_etl.taar_profile_bigtable
|
|||
## PySpark Jobs
|
||||
|
||||
taar_similarity
|
||||
|
||||
Output file:
|
||||
Path: gs://taar_models/similarity/donors.json
|
||||
Path: gs://taar_models/similarity/lr_curves.json
|
||||
|
||||
taar_locale
|
||||
|
||||
Output file:
|
||||
Path: gs://taar_models/locale/top10_dict.json
|
||||
|
||||
|
||||
taar_lite
|
||||
|
||||
Compute addon coinstallation rates for TAARlite
|
||||
|
||||
Output file:
|
||||
|
@ -101,6 +116,7 @@ taar_lite
|
|||
## Google Cloud Platform jobs
|
||||
|
||||
taar_etl.taar_profile_bigtable
|
||||
|
||||
This job extracts user profile data from `clients_last_seen` to
|
||||
build a user profile table in Bigtable. This job is split into 3
|
||||
parts:
|
||||
|
@ -124,7 +140,7 @@ taar_etl.taar_profile_bigtable
|
|||
|
||||
## Uploading images to gcr.io
|
||||
|
||||
Travis will automatically build a docker image and push the image into
|
||||
CircleCI will automatically build a docker image and push the image into
|
||||
gcr.io for production using the latest tag.
|
||||
|
||||
You can use images from the gcr.io image repository using a path like:
|
||||
|
|
4
setup.py
4
setup.py
|
@ -3,14 +3,14 @@ from setuptools import find_packages, setup
|
|||
setup(
|
||||
name="taar_gcp_etl",
|
||||
use_scm_version=False,
|
||||
version="0.5.2",
|
||||
version="0.6.0",
|
||||
setup_requires=["setuptools_scm", "pytest-runner"],
|
||||
tests_require=["pytest"],
|
||||
include_package_data=True,
|
||||
packages=find_packages(exclude=["tests", "tests/*"]),
|
||||
description="Telemetry-Aware Addon Recommender ETL Tools",
|
||||
author="Mozilla Corporation",
|
||||
author_email="vng@mozilla.org",
|
||||
author_email="epavlov@mozilla.com",
|
||||
url="https://github.com/mozilla/taar_gcp_etl",
|
||||
license="MPL 2.0",
|
||||
install_requires=[],
|
||||
|
|
|
@ -0,0 +1,44 @@
|
|||
"""
|
||||
This ETL job computes the number of installations for all addons.
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
import click
|
||||
from google.cloud import bigquery
|
||||
|
||||
from taar_etl.taar_utils import store_json_to_gcs
|
||||
|
||||
OUTPUT_BUCKET = "taar_models"
|
||||
OUTPUT_PREFIX = "taar/lite"
|
||||
OUTPUT_FILENAME = "guid_install_ranking.json"
|
||||
|
||||
|
||||
def extract_telemetry(iso_today):
    """Return per-addon install counts for a single submission date.

    Runs a BigQuery query against the
    `moz-fx-data-shared-prod`.telemetry.addons table that counts
    ``client_id`` values per ``addon_id`` for the rows whose
    ``submission_date`` equals the requested date.

    The date is bound as a named query parameter instead of being
    interpolated into the SQL text, so the value supplied on the command
    line can never alter the structure of the query.

    Args:
        iso_today: Submission date as a string that BigQuery's ``DATE()``
            function accepts. The parameter name suggests an ISO
            ``YYYY-MM-DD`` value -- confirm against the caller.

    Returns:
        A dict mapping each addon GUID to its install count.
    """
    telemetry_client = bigquery.Client()

    # Passed as STRING and converted with DATE() inside the query, which
    # keeps the comparison identical to the previous inlined literal.
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter("iso_today", "STRING", iso_today),
        ]
    )

    query_job = telemetry_client.query(
        '''
        SELECT
            addon_id as addon_guid,
            count(client_id) as install_count
        FROM
            `moz-fx-data-shared-prod`.telemetry.addons
        WHERE submission_date = DATE(@iso_today)
        GROUP BY addon_id
        ''',
        job_config=job_config,
    )

    # Read columns by their aliases so the mapping does not depend on the
    # order of the SELECT list.
    return {
        row["addon_guid"]: row["install_count"]
        for row in query_job.result()
    }
|
||||
|
||||
|
||||
# Command-line entry point for the GUID install ranking job: fetch the
# install counts for --date and store them under --bucket / --prefix.
@click.command()
@click.option("--date", required=True)
@click.option("--bucket", default=OUTPUT_BUCKET)
@click.option("--prefix", default=OUTPUT_PREFIX)
def main(date, bucket, prefix):
    # Documented with comments rather than a docstring: click would expose
    # a docstring as the command's --help text.
    logging.info("Processing GUID install rankings")

    # Install counts keyed by addon GUID for the requested date.
    install_counts = extract_telemetry(date)

    # Written as OUTPUT_FILENAME via the shared GCS helper; the date is
    # forwarded to the helper as its last argument.
    store_json_to_gcs(bucket, prefix, OUTPUT_FILENAME, install_counts, date)


if __name__ == "__main__":
    main()
|
Загрузка…
Ссылка в новой задаче