add query_id and username columns to bigquery_usage_v2 table - joining JOBS_BY_ORG and JOBS_BY_PROJECT tables (#4203)

* add query_id and username columns to bigquery_usage_v2 table - joining JOBS_BY_ORG and JOBS_BY_PROJECT tables

* change jobs_by_project query to use variable {project} instead of a hard-coded project

* adjust formatting

* refactor query to use jobs_by_organization_derived table

* remove dataops as code owner of the bigquery_usage_v2 table

* add marlene as a code owner of the bigquery_usage_v2 table

* add marlene as owner to DAG for bq_table_usage_v2
This commit is contained in:
Marlene Hirose 2023-09-22 10:15:37 -07:00 коммит произвёл GitHub
Родитель e3208aeecc
Коммит 7040463033
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
5 изменённых файлов: 95 добавлений и 44 удалений

Просмотреть файл

@ -15,6 +15,5 @@
# Contextual Services
/sql/moz-fx-data-shared-prod/contextual_services_derived/request_payload_suggest_v2 @mozilla/request_payload_reviewers
/sql/moz-fx-data-shared-prod/contextual_services_derived/request_payload_tiles_v2 @mozilla/request_payload_reviewers
/sql/moz-fx-data-shared-prod/monitoring_derived/bigquery_usage_v2 @mozilla/dataops
/sql/moz-fx-data-shared-prod/contextual_services_derived/suggest_revenue_levers_daily_v1 @mozilla/revenue_forecasting_data_reviewers
/sql/moz-fx-data-shared-prod/monitoring_derived/jobs_by_organization_v1 @mozilla/dataops

Просмотреть файл

@ -138,7 +138,7 @@ with DAG(
+ ["--date", "{{ ds }}"],
docker_image="gcr.io/moz-fx-data-airflow-prod-88e0/bigquery-etl:latest",
owner="wichan@mozilla.com",
email=["ascholtz@mozilla.com", "wichan@mozilla.com"],
email=["ascholtz@mozilla.com", "mhirose@mozilla.com", "wichan@mozilla.com"],
)
monitoring_derived__column_size__v1 = gke_command(

Просмотреть файл

@ -2,6 +2,7 @@ friendly_name: BigQuery Query Usage
description: BigQuery usage, partitioned by day.
owners:
- wichan@mozilla.com
- mhirose@mozilla.com
labels:
incremental: true
schedule: daily

Просмотреть файл

@ -11,13 +11,6 @@ from argparse import ArgumentParser
from google.cloud import bigquery
DEFAULT_PROJECTS = [
"mozdata",
"moz-fx-data-shared-prod",
"moz-fx-data-marketing-prod",
"moz-fx-data-bq-data-science",
]
parser = ArgumentParser(description=__doc__)
parser.add_argument("--date", required=True) # expect string with format yyyy-mm-dd
parser.add_argument("--project", default="moz-fx-data-shared-prod")
@ -28,42 +21,90 @@ parser.add_argument("--destination_table", default="bigquery_usage_v2")
def create_query(date, project):
"""Create query with filter for source projects."""
return f"""
SELECT
t1.project_id AS source_project,
DATE('{date}') AS creation_date,
return """
WITH jobs_by_org AS (
SELECT
t1.project_id AS source_project,
creation_date,
job_id,
job_type,
reservation_id,
cache_hit,
state,
statement_type,
referenced_tables.project_id AS reference_project_id,
referenced_tables.dataset_id AS reference_dataset_id,
referenced_tables.table_id AS reference_table_id,
destination_table.project_id AS destination_project_id,
destination_table.dataset_id AS destination_dataset_id,
destination_table.table_id AS destination_table_id,
user_email,
end_time-start_time as task_duration,
ROUND(total_bytes_processed / 1024 / 1024 / 1024 / 1024, 4)
AS total_terabytes_processed,
ROUND(total_bytes_billed / 1024 / 1024 / 1024 / 1024, 4)
AS total_terabytes_billed,
total_slot_ms,
error_result.location AS error_location,
error_result.reason AS error_reason,
error_result.message AS error_message,
query_info_resource_warning AS resource_warning,
FROM
`moz-fx-data-shared-prod.monitoring_derived.jobs_by_organization_v1` AS t1
LEFT JOIN
UNNEST(referenced_tables) AS referenced_tables
),
jobs_by_project AS (
SELECT
jp.project_id AS source_project,
date(creation_time) as creation_date,
job_id,
referenced_tables.project_id AS reference_project_id,
referenced_tables.dataset_id AS reference_dataset_id,
referenced_tables.table_id AS reference_table_id,
user_email,
REGEXP_EXTRACT(query, r'Username: (.*?),') AS username,
REGEXP_EXTRACT(query, r'Query ID: (\\w+),') AS query_id,
FROM
`moz-fx-data-shared-prod.region-us.INFORMATION_SCHEMA.JOBS_BY_PROJECT` jp
LEFT JOIN
UNNEST(referenced_tables) AS referenced_tables
)
SELECT
jo.source_project,
jo.creation_date,
jo.job_id,
jo.job_type,
jo.reservation_id,
jo.cache_hit,
jo.state,
jo.statement_type,
jp.query_id,
jo.reference_project_id,
jo.reference_dataset_id,
jo.reference_table_id,
jo.destination_project_id,
jo.destination_dataset_id,
jo.destination_table_id,
jo.user_email,
jp.username,
jo.task_duration,
jo.total_terabytes_processed,
jo.total_terabytes_billed,
jo.total_slot_ms,
jo.error_location,
jo.error_reason,
jo.error_message,
jo.resource_warning
FROM jobs_by_org jo
LEFT JOIN jobs_by_project jp
USING(source_project,
creation_date,
job_id,
job_type,
reservation_id,
cache_hit,
state,
statement_type,
referenced_tables.project_id AS reference_project_id,
dataset_id AS reference_dataset_id,
table_id AS reference_table_id,
destination_table.project_id AS destination_project_id,
destination_table.dataset_id AS destination_dataset_id,
destination_table.table_id AS destination_table_id,
user_email,
end_time-start_time as task_duration,
ROUND(total_bytes_processed / 1024 / 1024 / 1024 / 1024, 4)
AS total_terabytes_processed,
ROUND(total_bytes_billed / 1024 / 1024 / 1024 / 1024, 4)
AS total_terabytes_billed,
total_slot_ms,
error_result.location AS error_location,
error_result.reason AS error_reason,
error_result.message AS error_message,
query_info.resource_warning AS resource_warning,
FROM
`{project}.region-us.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION` AS t1
LEFT JOIN
UNNEST(referenced_tables) AS referenced_tables
WHERE
DATE(creation_time) = '{date}'
AND (t1.project_id IN UNNEST({DEFAULT_PROJECTS})
OR referenced_tables.project_id IN UNNEST({DEFAULT_PROJECTS})
OR destination_table.project_id IN UNNEST({DEFAULT_PROJECTS}))
reference_project_id,
reference_dataset_id,
reference_table_id)
WHERE creation_date > '2023-03-01'
"""

Просмотреть файл

@ -40,6 +40,11 @@ fields:
type: STRING
description: The type of query statement
- mode: NULLABLE
name: query_id
type: STRING
description: The id of the query
- mode: NULLABLE
name: reference_project_id
type: STRING
@ -75,6 +80,11 @@ fields:
type: STRING
description: Email address or service account of the user who ran the job
- mode: NULLABLE
name: username
type: STRING
description: The name of the user who ran the job
- mode: NULLABLE
name: task_duration
type: INTERVAL