DENG-1728 adding glean_app metrics to bigquery-etl (#4720)

* adding telemetry_dev_cycle_derived to bigquery_etl

* Update dags.yaml

Co-authored-by: kik-kik <42538694+kik-kik@users.noreply.github.com>

* changes after code review

* move to _external dataset

* rename table

* fix defaults

* schema from file

---------

Co-authored-by: kik-kik <42538694+kik-kik@users.noreply.github.com>
Leli 2023-12-20 20:14:11 +01:00 committed by GitHub
Parent d96618dce2
Commit c05aec0f9b
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
6 changed files: 208 additions and 0 deletions

View file

@@ -1399,3 +1399,23 @@ bqetl_mobile_feature_usage:
  schedule_interval: 0 6 * * *
  tags:
  - impact/tier_3
bqetl_telemetry_dev_cycle:
  default_args:
    depends_on_past: false
    email:
    - telemetry-alerts@mozilla.com
    - leli@mozilla.com
    email_on_failure: true
    email_on_retry: false
    end_date: null
    owner: leli@mozilla.com
    retries: 2
    retry_delay: 30m
    start_date: '2023-12-19'
  description: DAG for Telemetry Dev Cycle Dashboard
  repo: bigquery-etl
  schedule_interval: 0 18 * * *
  tags:
  - impact/tier_3
  - repo/bigquery-etl
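For orientation: bigquery-etl renders entries like this into Airflow DAGs. A minimal sketch of roughly what such a DAG looks like, with all operator and task details assumed rather than taken from the generator:

```python
from datetime import datetime, timedelta

from airflow import DAG

# Illustrative only: bigquery-etl generates the real DAG from dags.yaml,
# and the task body below is a placeholder assumption.
default_args = {
    "owner": "leli@mozilla.com",
    "email": ["telemetry-alerts@mozilla.com", "leli@mozilla.com"],
    "email_on_failure": True,
    "email_on_retry": False,
    "depends_on_past": False,
    "retries": 2,
    "retry_delay": timedelta(minutes=30),
}

with DAG(
    "bqetl_telemetry_dev_cycle",
    default_args=default_args,
    start_date=datetime(2023, 12, 19),
    schedule_interval="0 18 * * *",  # daily at 18:00 UTC
    doc_md="DAG for Telemetry Dev Cycle Dashboard",
    tags=["impact/tier_3", "repo/bigquery-etl"],
) as dag:
    ...  # one task per query/script scheduled on this DAG
```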

View file

@@ -0,0 +1,10 @@
friendly_name: Telemetry Dev Cycle Derived
description: |-
  Data for the Telemetry Dev Cycle Dashboard.
dataset_base_acl: derived
user_facing: false
labels: {}
workgroup_access:
- role: roles/bigquery.dataViewer
  members:
  - workgroup:mozilla-confidential

View file

@@ -0,0 +1,11 @@
friendly_name: Telemetry Dev Cycle Derived
description: |-
  External data for the Telemetry Dev Cycle Dashboard.
  The data in this dataset comes from external sources such as API requests.
dataset_base_acl: derived
user_facing: false
labels: {}
workgroup_access:
- role: roles/bigquery.dataViewer
  members:
  - workgroup:mozilla-confidential

View file

@@ -0,0 +1,11 @@
friendly_name: Glean Metrics External
description: |-
  This table represents data downloaded from https://probeinfo.telemetry.mozilla.org/glean/{glean_app}/metrics.
  The downloaded data is cleaned and only relevant fields are kept.
owners:
- leli@mozilla.com
labels:
  incremental: false
  owner1: leli@mozilla.com
scheduling:
  dag_name: bqetl_telemetry_dev_cycle

View file

@@ -0,0 +1,125 @@
"""Glean metric data - download from API, clean and upload to BigQuery."""

import logging
from pathlib import Path

import click
import requests
import yaml
from google.cloud import bigquery

API_BASE_URL = "https://probeinfo.telemetry.mozilla.org"
# Defaults are derived from the directory layout: .../<project>/<dataset>/<table>/
DEFAULT_PROJECT_ID = Path(__file__).parent.parent.parent.name
DEFAULT_DATASET_ID = Path(__file__).parent.parent.name
DEFAULT_TABLE_NAME = Path(__file__).parent.name
DEFAULT_BAD_REQUEST_THRESHOLD = 5

SCHEMA_FILE = Path(__file__).parent / "schema.yaml"
# schema.yaml contains only the field list, so wrap it in a dummy RECORD that
# SchemaField.from_api_repr can parse, then keep the parsed fields.
SCHEMA = bigquery.SchemaField.from_api_repr(
    {"name": "root", "type": "RECORD", **yaml.safe_load(SCHEMA_FILE.read_text())}
).fields


def get_api_response(url):
    """Return the JSON body of the response if the API request was successful."""
    response = requests.get(url)
    response.raise_for_status()
    return response.json()


def store_data_in_bigquery(data, schema, destination_project, destination_table_id):
    """Upload data to BigQuery in a single, non-partitioned table."""
    client = bigquery.Client(project=destination_project)
    job_config = bigquery.LoadJobConfig(
        create_disposition="CREATE_IF_NEEDED",
        schema=schema,
        source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
        write_disposition="WRITE_TRUNCATE",
    )
    load_job = client.load_table_from_json(
        data, destination_table_id, location="US", job_config=job_config
    )
    load_job.result()
    stored_table = client.get_table(destination_table_id)
    logging.info(f"Loaded {stored_table.num_rows} rows into {destination_table_id}.")


def download_glean_metrics(url, threshold):
    """Download metrics for glean products and parse the data."""
    # get a list of all glean apps
    glean_apps_response = get_api_response(f"{url}/glean/repositories")
    glean_apps = [glean_app["name"] for glean_app in glean_apps_response]

    glean_metrics = []
    error_counter = 0
    for glean_app in glean_apps:
        try:
            metrics = get_api_response(f"{url}/glean/{glean_app}/metrics")
            for name, metric in metrics.items():
                # history entries are ordered, so the first entry carries the
                # first-seen date and the last entry the last-seen date; the
                # timestamps are truncated to their date part ([:10]).
                first_seen = metric["history"][0]["dates"]["first"][:10]
                last_seen = metric["history"][-1]["dates"]["last"][:10]
                expires = metric["history"][0]["expires"]
                glean_metrics.append(
                    {
                        "glean_app": glean_app,
                        "metric": name,
                        "type": metric["history"][0]["type"],
                        "first_seen_date": first_seen,
                        "last_seen_date": last_seen,
                        "expires": expires,
                    }
                )
        except requests.exceptions.HTTPError as err:
            error_counter += 1
            logging.error(err)
            if error_counter > threshold:
                raise Exception(
                    f"More than the accepted threshold of {threshold} requests failed."
                )
    return glean_metrics
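# For reference, the probeinfo payload shape the parser above assumes
# (an illustrative fragment, not a verbatim API response):
#
#   {
#       "example.metric": {
#           "history": [
#               {
#                   "type": "counter",
#                   "dates": {"first": "2021-05-03 14:02:33", "last": "2023-12-01 00:11:08"},
#                   "expires": "never",
#               }
#           ]
#       }
#   }
#
# which the loop turns into the row:
#
#   {"glean_app": "<app>", "metric": "example.metric", "type": "counter",
#    "first_seen_date": "2021-05-03", "last_seen_date": "2023-12-01",
#    "expires": "never"}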
@click.command()
@click.option(
    "--bq_project_id",
    default=DEFAULT_PROJECT_ID,
    show_default=True,
    help="BigQuery project the data is written to.",
)
@click.option(
    "--bq_dataset_id",
    default=DEFAULT_DATASET_ID,
    show_default=True,
    help="BigQuery dataset the data is written to.",
)
@click.option(
    "--bq_table_name",
    default=DEFAULT_TABLE_NAME,
    show_default=True,
    help="BigQuery table the data is written to.",
)
@click.option(
    "--threshold",
    default=DEFAULT_BAD_REQUEST_THRESHOLD,
    show_default=True,
    help="Number of failed glean app metric requests tolerated before the job fails.",
)
def run_glean_metrics(bq_project_id, bq_dataset_id, bq_table_name, threshold):
    """Download the data from the API and store it in BigQuery."""
    glean_metrics = download_glean_metrics(API_BASE_URL, threshold)
    destination_table_id = f"{bq_project_id}.{bq_dataset_id}.{bq_table_name}"
    store_data_in_bigquery(
        data=glean_metrics,
        schema=SCHEMA,
        destination_project=bq_project_id,
        destination_table_id=destination_table_id,
    )


if __name__ == "__main__":
    logging.getLogger().setLevel(logging.INFO)
    run_glean_metrics()
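For local testing, the command can be exercised through click's test runner. A minimal sketch, assuming the script is saved as query.py and that "my-sandbox-project" is a project the caller can write to (both names are assumptions):

```python
from click.testing import CliRunner

from query import run_glean_metrics  # assumes this file is saved as query.py

# Invoking the command downloads the metrics and loads them into the
# given sandbox project, using the directory-derived dataset/table defaults.
result = CliRunner().invoke(
    run_glean_metrics,
    ["--bq_project_id", "my-sandbox-project"],
)
print(result.exit_code, result.output)
```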

View file

@@ -0,0 +1,31 @@
fields:
- name: glean_app
  type: STRING
  mode: NULLABLE
  description: App name of the glean app.
- name: metric
  type: STRING
  mode: NULLABLE
  description: Name of the glean metric.
- name: type
  type: STRING
  mode: NULLABLE
  description: Type of the glean metric.
- name: first_seen_date
  type: DATE
  mode: NULLABLE
  description: First seen date according to the probe scraper API.
- name: last_seen_date
  type: DATE
  mode: NULLABLE
  description: Last seen date according to the probe scraper API.
- name: expires
  type: STRING
  mode: NULLABLE
  description: When this metric will expire. Can be "never", a date, or a version number.
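Once the job has run, the loaded table can be spot-checked from Python. A minimal sketch, with the fully qualified table name left as a placeholder because the real project, dataset, and table come from the repository's directory layout:

```python
from google.cloud import bigquery

# Placeholder name: substitute the project/dataset/table this job writes to.
TABLE = "<project>.<dataset>.<table>"

client = bigquery.Client()
query = f"""
    SELECT glean_app, COUNT(*) AS metric_count
    FROM `{TABLE}`
    GROUP BY glean_app
    ORDER BY metric_count DESC
"""
# Print the number of metrics recorded per glean app.
for row in client.query(query).result():
    print(f"{row.glean_app}: {row.metric_count}")
```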