Add taarlite guid ranking job (#9)

2021-01-22 13:47:35 -08:00 · 2021-01-22 13:47:35 -08:00 · ddc33901f6
--- a/2
+++ b/2
@ -1,7 +1,7 @@
 FROM continuumio/miniconda3
 ENV PYTHONDONTWRITEBYTECODE 1

-MAINTAINER Victor Ng <vng@mozilla.com>
+MAINTAINER Evgeny Pavlov <epavlov@mozilla.com>

 # add a non-privileged user for installing and running
 # the application
--- a/README.md
+++ b/README.md
@ -19,10 +19,14 @@ cluster using a git checkout.

 ## New GCS storage locations

-Top level bucket: 
+Prod buckets: 

-    GCS_BUCKET=moz-fx-data-derived-datasets-parquet
+    moz-fx-data-taar-pr-prod-e0f7-prod-etl
+    moz-fx-data-taar-pr-prod-e0f7-prod-models

+Test bucket:
+
+    taar_models

 ## Jobs

@ -47,6 +51,15 @@ taar_etl.taar_amowhitelist
        Path: gs://taar_models/addon_recommender/featured_addons_database.json
        Path: gs://taar_models/addon_recommender/featured_whitelist_addons.json

+taar_lite_guid_ranking
+
+    This job loads installation counts by addon from the BigQuery `telemetry.addons` table
+    and saves them to GCS.
+
+    Output file:
+        Path: gs://taar_models/taar/lite/guid_install_ranking.json
+
+
 taar_etl.taar_update_whitelist

    This job extracts the editorial approved addons from AMO
@ -60,7 +73,6 @@ taar_etl.taar_update_whitelist

 taar_etl.taar_profile_bigtable

-
    This task is responsible for extracting data from BigQuery from
    the telemetry table: `clients_last_seen`
    and exports temporary files in Avro format to a bucket in Google
@ -82,16 +94,19 @@ taar_etl.taar_profile_bigtable
 ## PySpark Jobs

 taar_similarity
+
    Output file: 
        Path: gs://taar_models/similarity/donors.json
        Path: gs://taar_models/similarity/lr_curves.json

 taar_locale
+
    Output file: 
        Path: gs://taar_models/locale/top10_dict.json


 taar_lite
+
    Compute addon coinstallation rates for TAARlite
    
    Output file: 
@ -101,6 +116,7 @@ taar_lite
 ## Google Cloud Platform jobs

 taar_etl.taar_profile_bigtable
+
    This job extracts user profile data from `clients_last_seen` to
    build a user profile table in Bigtable. This job is split into 3
    parts:
@ -124,7 +140,7 @@ taar_etl.taar_profile_bigtable

 ## Uploading images to gcr.io

-Travis will automatically build a docker image and push the image into
+CircleCI will automatically build a docker image and push the image into
 gcr.io for production using the latest tag.

 You can use images from the gcr.io image repository using a path like:
--- a/setup.py
+++ b/setup.py
@ -3,14 +3,14 @@ from setuptools import find_packages, setup
 setup(
    name="taar_gcp_etl",
    use_scm_version=False,
-    version="0.5.2",
+    version="0.6.0",
    setup_requires=["setuptools_scm", "pytest-runner"],
    tests_require=["pytest"],
    include_package_data=True,
    packages=find_packages(exclude=["tests", "tests/*"]),
    description="Telemetry-Aware Addon Recommender ETL Tools",
    author="Mozilla Corporation",
-    author_email="vng@mozilla.org",
+    author_email="epavlov@mozilla.com",
    url="https://github.com/mozilla/taar_gcp_etl",
    license="MPL 2.0",
    install_requires=[],
--- a/taar_etl/taar_lite_guid_ranking.py
+++ b/taar_etl/taar_lite_guid_ranking.py
@ -0,0 +1,44 @@
+"""
+This ETL job computes the number of installations for all addons.
+"""
+
+import logging
+
+import click
+from google.cloud import bigquery
+
+from taar_etl.taar_utils import store_json_to_gcs
+
+# Default GCS destination for the ranking file. The bucket and prefix are
+# only defaults for the --bucket / --prefix command-line options; the file
+# name is always passed to store_json_to_gcs as-is.
+OUTPUT_BUCKET = "taar_models"
+OUTPUT_PREFIX = "taar/lite"
+OUTPUT_FILENAME = "guid_install_ranking.json"
+
+
+def extract_telemetry(iso_today):
+    """Return the per-addon install counts for one submission date.
+
+    Queries `moz-fx-data-shared-prod`.telemetry.addons and, for each
+    addon_id, counts the client_id values recorded on the given day.
+
+    :param iso_today: Submission date as a string that BigQuery can cast
+        to DATE (e.g. "2021-01-22").
+    :return: dict mapping addon GUID to its install count.
+    """
+    telemetry_client = bigquery.Client()
+    # Bind the date as a query parameter instead of formatting it into the
+    # SQL text, so a quote in the value cannot alter the statement.
+    job_config = bigquery.QueryJobConfig(
+        query_parameters=[
+            bigquery.ScalarQueryParameter(
+                "submission_date", "STRING", iso_today
+            ),
+        ]
+    )
+    res = telemetry_client.query(
+        """
+      SELECT
+          addon_id as addon_guid,
+          count(client_id) as install_count
+      FROM
+          `moz-fx-data-shared-prod`.telemetry.addons
+      WHERE submission_date = CAST(@submission_date AS DATE)
+      GROUP BY addon_id
+      """,
+        job_config=job_config,
+    )
+
+    # Columns are positional: 0 = addon_guid, 1 = install_count.
+    return {row[0]: row[1] for row in res.result()}
+
+
+# Command-line entry point: query the install counts for --date and upload
+# them as OUTPUT_FILENAME under the given bucket/prefix.
+# (Documented with comments rather than a docstring so the click --help
+# output stays unchanged.)
+@click.command()
+@click.option("--date", required=True)
+@click.option("--bucket", default=OUTPUT_BUCKET)
+@click.option("--prefix", default=OUTPUT_PREFIX)
+def main(date, bucket, prefix):
+    logging.info("Processing GUID install rankings")
+
+    # The same date selects the telemetry partition and is handed on to the
+    # storage helper.
+    store_json_to_gcs(
+        bucket,
+        prefix,
+        OUTPUT_FILENAME,
+        extract_telemetry(date),
+        date,
+    )
+
+
+if __name__ == "__main__":
+    main()