DENG-1728 adding glean_app metrics to bigquery-etl (#4720)
* adding telemetry_dev_cycle_derived to bigquery_etl * Update dags.yaml Co-authored-by: kik-kik <42538694+kik-kik@users.noreply.github.com> * changes after code review * move to _external dataset * rename table * fix defaults * schema from file --------- Co-authored-by: kik-kik <42538694+kik-kik@users.noreply.github.com>
This commit is contained in:
Родитель
d96618dce2
Коммит
c05aec0f9b
20
dags.yaml
20
dags.yaml
|
@ -1399,3 +1399,23 @@ bqetl_mobile_feature_usage:
|
|||
schedule_interval: 0 6 * * *
|
||||
tags:
|
||||
- impact/tier_3
|
||||
|
||||
bqetl_telemetry_dev_cycle:
|
||||
default_args:
|
||||
depends_on_past: false
|
||||
email:
|
||||
- telemetry-alerts@mozilla.com
|
||||
- leli@mozilla.com
|
||||
email_on_failure: true
|
||||
email_on_retry: false
|
||||
end_date: null
|
||||
owner: leli@mozilla.com
|
||||
retries: 2
|
||||
retry_delay: 30m
|
||||
start_date: '2023-12-19'
|
||||
description: DAG for Telemetry Dev Cycle Dashboard
|
||||
repo: bigquery-etl
|
||||
schedule_interval: 0 18 * * *
|
||||
tags:
|
||||
- impact/tier_3
|
||||
- repo/bigquery-etl
|
||||
|
|
|
@ -0,0 +1,10 @@
|
|||
friendly_name: Telemetry Dev Cycle Derived
|
||||
description: |-
|
||||
Data for the Telemetry Dev Cycle Dashboard.
|
||||
dataset_base_acl: derived
|
||||
user_facing: false
|
||||
labels: {}
|
||||
workgroup_access:
|
||||
- role: roles/bigquery.dataViewer
|
||||
members:
|
||||
- workgroup:mozilla-confidential
|
|
@ -0,0 +1,11 @@
|
|||
friendly_name: Telemetry Dev Cycle Derived
|
||||
description: |-
|
||||
External data for the Telemetry Dev Cycle Dashboard.
|
||||
The data in this dataset comes from external sources, such as API requests.
|
||||
dataset_base_acl: derived
|
||||
user_facing: false
|
||||
labels: {}
|
||||
workgroup_access:
|
||||
- role: roles/bigquery.dataViewer
|
||||
members:
|
||||
- workgroup:mozilla-confidential
|
|
@ -0,0 +1,11 @@
|
|||
friendly_name: Glean Metrics External
|
||||
description: |-
|
||||
This table represents data downloaded from https://probeinfo.telemetry.mozilla.org/glean/{glean_app}/metrics.
|
||||
The downloaded data is cleaned and only relevant fields are shown.
|
||||
owners:
|
||||
- leli@mozilla.com
|
||||
labels:
|
||||
incremental: false
|
||||
owner1: leli@mozilla.com
|
||||
scheduling:
|
||||
dag_name: bqetl_telemetry_dev_cycle
|
|
@ -0,0 +1,125 @@
|
|||
"""Glean metric data - download from API, clean and upload to BigQuery."""
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
import click
|
||||
import requests
|
||||
import yaml
|
||||
from google.cloud import bigquery
|
||||
|
||||
# Base URL of the probe-scraper (probe-info) service the metrics are pulled from.
API_BASE_URL = "https://probeinfo.telemetry.mozilla.org"

# Defaults are derived from this file's location in the bigquery-etl layout:
# sql/<project>/<dataset>/<table>/query.py — so the script loads into the
# table whose directory it lives in unless overridden on the command line.
DEFAULT_PROJECT_ID = Path(__file__).parent.parent.parent.name
DEFAULT_DATASET_ID = Path(__file__).parent.parent.name
DEFAULT_TABLE_NAME = Path(__file__).parent.name
# How many failed per-app metric requests are tolerated before the job aborts.
DEFAULT_BAD_REQUEST_THRESHOLD = 5

# The destination table schema lives next to this script; wrap the parsed
# fields in a synthetic root RECORD so SchemaField.from_api_repr() accepts it,
# then keep only the child fields.
SCHEMA_FILE = Path(__file__).parent / "schema.yaml"
SCHEMA = bigquery.SchemaField.from_api_repr(
    {"name": "root", "type": "RECORD", **yaml.safe_load(SCHEMA_FILE.read_text())}
).fields
|
||||
|
||||
|
||||
def get_api_response(url, timeout=60):
    """Return the JSON payload of a successful GET request to *url*.

    Args:
        url: Full URL of the API endpoint to query.
        timeout: Seconds to wait for the server. Without an explicit
            timeout, ``requests.get`` can block indefinitely and hang
            the whole DAG task.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.HTTPError: If the response status is 4xx/5xx.
        requests.exceptions.Timeout: If the server does not respond in time.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()
|
||||
|
||||
|
||||
def store_data_in_bigquery(data, schema, destination_project, destination_table_id):
    """Upload *data* to BigQuery as a single, non-partitioned table.

    The destination table is created if needed and fully overwritten on
    every run (WRITE_TRUNCATE), so the table always reflects the latest
    API snapshot.

    Args:
        data: List of JSON-serializable row dicts.
        schema: Iterable of ``bigquery.SchemaField`` describing the rows.
        destination_project: GCP project the client bills/authenticates to.
        destination_table_id: Fully qualified ``project.dataset.table`` id.

    Raises:
        google.cloud.exceptions.GoogleCloudError: If the load job fails.
    """
    client = bigquery.Client(project=destination_project)

    job_config = bigquery.LoadJobConfig(
        create_disposition="CREATE_IF_NEEDED",
        schema=schema,
        source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
        write_disposition="WRITE_TRUNCATE",
    )

    load_job = client.load_table_from_json(
        data, destination_table_id, location="US", job_config=job_config
    )
    # Block until the load job finishes; raises if the job errored.
    load_job.result()
    stored_table = client.get_table(destination_table_id)
    # Lazy %-style args: the message is only formatted if INFO is enabled.
    logging.info(
        "Loaded %s rows into %s.", stored_table.num_rows, destination_table_id
    )
|
||||
|
||||
|
||||
def download_glean_metrics(url, threshold):
    """Download metrics for all glean apps and flatten them into rows.

    Args:
        url: Base URL of the probe-info API.
        threshold: Number of failed per-app metric requests tolerated
            before the whole job is aborted.

    Returns:
        A list of row dicts (one per metric) matching ``schema.yaml``.

    Raises:
        RuntimeError: If more than *threshold* per-app requests failed.
        requests.exceptions.HTTPError: If the initial repository listing fails.
    """
    # get a list of all glean apps
    glean_apps_response = get_api_response(f"{url}/glean/repositories")
    glean_apps = [glean_app["name"] for glean_app in glean_apps_response]

    glean_metrics = []
    error_counter = 0
    for glean_app in glean_apps:
        try:
            metrics = get_api_response(f"{url}/glean/{glean_app}/metrics")
            for name, metric in metrics.items():
                # Assumes history is ordered oldest-first: first entry for
                # first-seen/type/expires, last entry for last-seen.
                # [:10] keeps the YYYY-MM-DD part of the ISO timestamp.
                first_seen = metric["history"][0]["dates"]["first"][:10]
                last_seen = metric["history"][-1]["dates"]["last"][:10]
                expires = metric["history"][0]["expires"]
                glean_metrics.append(
                    {
                        "glean_app": glean_app,
                        "metric": name,
                        "type": metric["history"][0]["type"],
                        "first_seen_date": first_seen,
                        "last_seen_date": last_seen,
                        "expires": expires,
                    }
                )
        except requests.exceptions.HTTPError as err:
            # Tolerate a few bad apps (best-effort), but abort once too
            # many requests have failed.
            error_counter += 1
            logging.error(err)
            if error_counter > threshold:
                raise RuntimeError(
                    f"More than the accepted threshold of {threshold} requests failed."
                ) from err
    return glean_metrics
|
||||
|
||||
|
||||
@click.command
@click.option(
    "--bq_project_id",
    default=DEFAULT_PROJECT_ID,
    show_default=True,
    help="BigQuery project the data is written to.",
)
@click.option(
    "--bq_dataset_id",
    default=DEFAULT_DATASET_ID,
    show_default=True,
    help="BigQuery dataset the data is written to.",
)
@click.option(
    "--bq_table_name",
    default=DEFAULT_TABLE_NAME,
    show_default=True,
    help="Bigquery table the data is written to.",
)
@click.option(
    "--threshold",
    default=DEFAULT_BAD_REQUEST_THRESHOLD,
    show_default=True,
    help="Number of bad Requests to get metrics for glean apps before the job fails.",
)
def run_glean_metrics(bq_project_id, bq_dataset_id, bq_table_name, threshold):
    """Download the data from the API and store it in BigQuery."""
    # Fetch first; nothing is written unless the download succeeded.
    rows = download_glean_metrics(API_BASE_URL, threshold)
    store_data_in_bigquery(
        data=rows,
        schema=SCHEMA,
        destination_project=bq_project_id,
        destination_table_id=f"{bq_project_id}.{bq_dataset_id}.{bq_table_name}",
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Raise the root logger to INFO so the row-count / error messages
    # emitted via logging.info()/logging.error() are actually shown.
    logging.getLogger().setLevel(logging.INFO)
    run_glean_metrics()
|
|
@ -0,0 +1,31 @@
|
|||
fields:
|
||||
|
||||
- name: glean_app
|
||||
type: STRING
|
||||
mode: NULLABLE
|
||||
description: App name of the glean app.
|
||||
|
||||
- name: metric
|
||||
type: STRING
|
||||
mode: NULLABLE
|
||||
description: Name of the glean metric.
|
||||
|
||||
- name: type
|
||||
type: STRING
|
||||
mode: NULLABLE
|
||||
description: Type of the glean metric.
|
||||
|
||||
- name: first_seen_date
|
||||
type: DATE
|
||||
mode: NULLABLE
|
||||
description: First seen date according to the probe scraper API.
|
||||
|
||||
- name: last_seen_date
|
||||
type: DATE
|
||||
mode: NULLABLE
|
||||
description: Last seen date according to the probe scraper API.
|
||||
|
||||
- name: expires
|
||||
type: STRING
|
||||
mode: NULLABLE
|
||||
description: When this metric will expire. Can be "never" or a date or a version number.
|
Загрузка…
Ссылка в новой задаче