DENG-1728 adding glean_app metrics to bigquery-etl (#4720)

* adding telemetry_dev_cycle_derived to bigquery_etl

* Update dags.yaml

Co-authored-by: kik-kik <42538694+kik-kik@users.noreply.github.com>

* changes after code review

* move to _external dataset

* rename table

* fix defaults

* schema from file

---------

Co-authored-by: kik-kik <42538694+kik-kik@users.noreply.github.com>
Leli 2023-12-20 20:14:11 +01:00 committed by GitHub
Parent d96618dce2
Commit c05aec0f9b
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
6 changed files: 208 additions and 0 deletions

View file

@@ -1399,3 +1399,23 @@ bqetl_mobile_feature_usage:
  schedule_interval: 0 6 * * *
  tags:
  - impact/tier_3
bqetl_telemetry_dev_cycle:
  default_args:
    depends_on_past: false
    email:
    - telemetry-alerts@mozilla.com
    - leli@mozilla.com
    email_on_failure: true
    email_on_retry: false
    end_date: null
    owner: leli@mozilla.com
    retries: 2
    retry_delay: 30m
    start_date: '2023-12-19'
  description: DAG for Telemetry Dev Cycle Dashboard
  repo: bigquery-etl
  schedule_interval: 0 18 * * *
  tags:
  - impact/tier_3
  - repo/bigquery-etl
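For orientation: bigquery-etl renders entries like this into Airflow DAGs. A minimal sketch of roughly what such a DAG looks like, with all operator and task details assumed rather than taken from the generator:

```python
from datetime import datetime, timedelta

from airflow import DAG

# Illustrative only: bigquery-etl generates the real DAG from dags.yaml,
# and the task body below is a placeholder assumption.
default_args = {
    "owner": "leli@mozilla.com",
    "email": ["telemetry-alerts@mozilla.com", "leli@mozilla.com"],
    "email_on_failure": True,
    "email_on_retry": False,
    "depends_on_past": False,
    "retries": 2,
    "retry_delay": timedelta(minutes=30),
}

with DAG(
    "bqetl_telemetry_dev_cycle",
    default_args=default_args,
    start_date=datetime(2023, 12, 19),
    schedule_interval="0 18 * * *",  # daily at 18:00 UTC
    doc_md="DAG for Telemetry Dev Cycle Dashboard",
    tags=["impact/tier_3", "repo/bigquery-etl"],
) as dag:
    ...  # one task per query/script scheduled on this DAG
```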

View file

@@ -0,0 +1,10 @@
friendly_name: Telemetry Dev Cycle Derived
description: |-
  Data for the Telemetry Dev Cycle Dashboard.
dataset_base_acl: derived
user_facing: false
labels: {}
workgroup_access:
- role: roles/bigquery.dataViewer
  members:
  - workgroup:mozilla-confidential

View file

@@ -0,0 +1,11 @@
friendly_name: Telemetry Dev Cycle Derived
description: |-
  External data for the Telemetry Dev Cycle Dashboard.
  The data in this dataset comes from external sources such as API requests.
dataset_base_acl: derived
user_facing: false
labels: {}
workgroup_access:
- role: roles/bigquery.dataViewer
  members:
  - workgroup:mozilla-confidential

View file

@@ -0,0 +1,11 @@
friendly_name: Glean Metrics External
description: |-
  This table represents data downloaded from https://probeinfo.telemetry.mozilla.org/glean/{glean_app}/metrics.
  The downloaded data is cleaned and only relevant fields are kept.
owners:
- leli@mozilla.com
labels:
  incremental: false
  owner1: leli@mozilla.com
scheduling:
  dag_name: bqetl_telemetry_dev_cycle

View file

@@ -0,0 +1,125 @@
"""Glean metric data - download from API, clean and upload to BigQuery."""

import logging
from pathlib import Path

import click
import requests
import yaml
from google.cloud import bigquery

API_BASE_URL = "https://probeinfo.telemetry.mozilla.org"
# Defaults are derived from the directory layout: .../<project>/<dataset>/<table>/
DEFAULT_PROJECT_ID = Path(__file__).parent.parent.parent.name
DEFAULT_DATASET_ID = Path(__file__).parent.parent.name
DEFAULT_TABLE_NAME = Path(__file__).parent.name
DEFAULT_BAD_REQUEST_THRESHOLD = 5

SCHEMA_FILE = Path(__file__).parent / "schema.yaml"
# schema.yaml contains only the field list, so wrap it in a dummy RECORD that
# SchemaField.from_api_repr can parse, then keep the parsed fields.
SCHEMA = bigquery.SchemaField.from_api_repr(
    {"name": "root", "type": "RECORD", **yaml.safe_load(SCHEMA_FILE.read_text())}
).fields


def get_api_response(url):
    """Return the JSON body of the response if the API request was successful."""
    response = requests.get(url)
    response.raise_for_status()
    return response.json()


def store_data_in_bigquery(data, schema, destination_project, destination_table_id):
    """Upload data to BigQuery in a single, non-partitioned table."""
    client = bigquery.Client(project=destination_project)
    job_config = bigquery.LoadJobConfig(
        create_disposition="CREATE_IF_NEEDED",
        schema=schema,
        source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
        write_disposition="WRITE_TRUNCATE",
    )
    load_job = client.load_table_from_json(
        data, destination_table_id, location="US", job_config=job_config
    )
    load_job.result()
    stored_table = client.get_table(destination_table_id)
    logging.info(f"Loaded {stored_table.num_rows} rows into {destination_table_id}.")


def download_glean_metrics(url, threshold):
    """Download metrics for glean products and parse the data."""
    # get a list of all glean apps
    glean_apps_response = get_api_response(f"{url}/glean/repositories")
    glean_apps = [glean_app["name"] for glean_app in glean_apps_response]

    glean_metrics = []
    error_counter = 0
    for glean_app in glean_apps:
        try:
            metrics = get_api_response(f"{url}/glean/{glean_app}/metrics")
            for name, metric in metrics.items():
                # history entries are ordered, so the first entry carries the
                # first-seen date and the last entry the last-seen date; the
                # timestamps are truncated to their date part ([:10]).
                first_seen = metric["history"][0]["dates"]["first"][:10]
                last_seen = metric["history"][-1]["dates"]["last"][:10]
                expires = metric["history"][0]["expires"]
                glean_metrics.append(
                    {
                        "glean_app": glean_app,
                        "metric": name,
                        "type": metric["history"][0]["type"],
                        "first_seen_date": first_seen,
                        "last_seen_date": last_seen,
                        "expires": expires,
                    }
                )
        except requests.exceptions.HTTPError as err:
            error_counter += 1
            logging.error(err)
            if error_counter > threshold:
                raise Exception(
                    f"More than the accepted threshold of {threshold} requests failed."
                )
    return glean_metrics
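# For reference, the probeinfo payload shape the parser above assumes
# (an illustrative fragment, not a verbatim API response):
#
#   {
#       "example.metric": {
#           "history": [
#               {
#                   "type": "counter",
#                   "dates": {"first": "2021-05-03 14:02:33", "last": "2023-12-01 00:11:08"},
#                   "expires": "never",
#               }
#           ]
#       }
#   }
#
# which the loop turns into the row:
#
#   {"glean_app": "<app>", "metric": "example.metric", "type": "counter",
#    "first_seen_date": "2021-05-03", "last_seen_date": "2023-12-01",
#    "expires": "never"}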
@click.command()
@click.option(
    "--bq_project_id",
    default=DEFAULT_PROJECT_ID,
    show_default=True,
    help="BigQuery project the data is written to.",
)
@click.option(
    "--bq_dataset_id",
    default=DEFAULT_DATASET_ID,
    show_default=True,
    help="BigQuery dataset the data is written to.",
)
@click.option(
    "--bq_table_name",
    default=DEFAULT_TABLE_NAME,
    show_default=True,
    help="BigQuery table the data is written to.",
)
@click.option(
    "--threshold",
    default=DEFAULT_BAD_REQUEST_THRESHOLD,
    show_default=True,
    help="Number of failed glean app metric requests tolerated before the job fails.",
)
def run_glean_metrics(bq_project_id, bq_dataset_id, bq_table_name, threshold):
    """Download the data from the API and store it in BigQuery."""
    glean_metrics = download_glean_metrics(API_BASE_URL, threshold)
    destination_table_id = f"{bq_project_id}.{bq_dataset_id}.{bq_table_name}"
    store_data_in_bigquery(
        data=glean_metrics,
        schema=SCHEMA,
        destination_project=bq_project_id,
        destination_table_id=destination_table_id,
    )


if __name__ == "__main__":
    logging.getLogger().setLevel(logging.INFO)
    run_glean_metrics()
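For local testing, the command can be exercised through click's test runner. A minimal sketch, assuming the script is saved as query.py and that "my-sandbox-project" is a project the caller can write to (both names are assumptions):

```python
from click.testing import CliRunner

from query import run_glean_metrics  # assumes this file is saved as query.py

# Invoking the command downloads the metrics and loads them into the
# given sandbox project, using the directory-derived dataset/table defaults.
result = CliRunner().invoke(
    run_glean_metrics,
    ["--bq_project_id", "my-sandbox-project"],
)
print(result.exit_code, result.output)
```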

View file

@@ -0,0 +1,31 @@
fields:
- name: glean_app
  type: STRING
  mode: NULLABLE
  description: App name of the glean app.
- name: metric
  type: STRING
  mode: NULLABLE
  description: Name of the glean metric.
- name: type
  type: STRING
  mode: NULLABLE
  description: Type of the glean metric.
- name: first_seen_date
  type: DATE
  mode: NULLABLE
  description: First seen date according to the probe scraper API.
- name: last_seen_date
  type: DATE
  mode: NULLABLE
  description: Last seen date according to the probe scraper API.
- name: expires
  type: STRING
  mode: NULLABLE
  description: When this metric will expire. Can be "never", a date, or a version number.
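Once the job has run, the loaded table can be spot-checked from Python. A minimal sketch, with the fully qualified table name left as a placeholder because the real project, dataset, and table come from the repository's directory layout:

```python
from google.cloud import bigquery

# Placeholder name: substitute the project/dataset/table this job writes to.
TABLE = "<project>.<dataset>.<table>"

client = bigquery.Client()
query = f"""
    SELECT glean_app, COUNT(*) AS metric_count
    FROM `{TABLE}`
    GROUP BY glean_app
    ORDER BY metric_count DESC
"""
# Print the number of metrics recorded per glean app.
for row in client.query(query).result():
    print(f"{row.glean_app}: {row.metric_count}")
```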