Use live tables for structured error counts (#4598)

* Use live tables for structured error counts

* Prevent old records from being deleted
Anna Scholtz 2023-12-06 11:13:53 -08:00 committed by GitHub
Parent 16bdbcbcc8
Commit be60f5aa56
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
6 changed files: 175 additions and 69 deletions
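
At a glance (a summary, not part of the commit): the new derived-table job in the last file below works in three steps, sketched here as comments rather than the full script.

-- 1. For every `*_live` dataset, read per-partition TOTAL_ROWS from
--    INFORMATION_SCHEMA.PARTITIONS into a temp table of daily ping counts.
-- 2. Count error pings per day, namespace, document type and version from
--    `monitoring.payload_bytes_error_structured`, join the two, and derive
--    ping_count, error_count and error_ratio.
-- 3. MERGE the result into structured_error_counts_v2, deleting only rows for
--    @submission_date that no longer appear in the freshly computed source.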

View file

@@ -2,6 +2,7 @@ CREATE OR REPLACE VIEW
   `moz-fx-data-shared-prod.monitoring.structured_error_counts`
 AS
 SELECT
-  *
+  *,
+  CAST(submission_date AS TIMESTAMP) AS hour -- for backwards compatibility
 FROM
-  `moz-fx-data-shared-prod.monitoring_derived.structured_error_counts_v1`
+  `moz-fx-data-shared-prod.monitoring_derived.structured_error_counts_v2`
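
For illustration only (not part of the commit; the date is hypothetical), a usage sketch of the updated view. Since v2 is daily rather than hourly, the compatibility `hour` column is simply the start of the day cast to a TIMESTAMP:

-- Usage sketch: existing consumers of `hour` keep working against the daily v2 data.
SELECT
  hour,
  document_namespace,
  document_type,
  SUM(error_count) AS error_count
FROM
  `moz-fx-data-shared-prod.monitoring.structured_error_counts`
WHERE
  submission_date = DATE '2023-12-06'  -- hypothetical date
GROUP BY
  hour,
  document_namespace,
  document_type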

View file

@@ -1,5 +1,4 @@
DECLARE dummy INT64; -- declare a dummy variable to indicate to bigquery-etl that this is a script
CREATE TEMP TABLE
deletion_counts(submission_date DATE, dataset_id STRING, num_rows INT64);
@@ -43,5 +42,6 @@ THEN
 VALUES
   (d.submission_date, d.dataset_id, num_rows)
 WHEN NOT MATCHED BY SOURCE
+  AND r.submission_date = @submission_date
 THEN
   DELETE;
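
Context for the added predicate, with a minimal sketch using hypothetical table and column names (not from this repository): when the source side of a MERGE only carries the current day's rows, an unconditional `WHEN NOT MATCHED BY SOURCE ... DELETE` removes every previously loaded day from the target; restricting the DELETE branch to `@submission_date`, as the added line does, keeps history intact.

-- Sketch only: `target_table` holds all historical days, `single_day_source`
-- holds only the day being loaded.
MERGE
  target_table r
USING
  single_day_source d
ON
  r.submission_date = d.submission_date
  AND r.key = d.key
WHEN NOT MATCHED
THEN
  INSERT
    (submission_date, key, value)
  VALUES
    (d.submission_date, d.key, d.value)
WHEN NOT MATCHED BY SOURCE
  AND r.submission_date = @submission_date  -- only prune rows for the day being (re)loaded
THEN
  DELETE;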

View file

@@ -1,66 +0,0 @@
CREATE OR REPLACE VIEW
`moz-fx-data-shared-prod.monitoring_derived.structured_error_counts_v1`
AS
WITH ping_counts AS (
SELECT
TIMESTAMP_TRUNC(submission_timestamp, HOUR) AS hour,
metadata.document_namespace,
metadata.document_type,
metadata.document_version,
COUNT(*) AS ping_count
FROM
`moz-fx-data-shared-prod.monitoring.payload_bytes_decoded_structured`
WHERE
submission_timestamp >= TIMESTAMP_SUB(current_timestamp, INTERVAL(28 * 24) HOUR)
GROUP BY
hour,
document_namespace,
document_type,
document_version
),
error_counts AS (
SELECT
TIMESTAMP_TRUNC(submission_timestamp, HOUR) AS hour,
document_namespace,
document_type,
document_version,
error_type,
COUNT(*) AS error_count
FROM
`moz-fx-data-shared-prod.monitoring.payload_bytes_error_structured`
WHERE
submission_timestamp >= TIMESTAMP_SUB(current_timestamp, INTERVAL(28 * 24) HOUR)
GROUP BY
hour,
document_namespace,
document_type,
document_version,
error_type
),
structured_hourly_errors AS (
SELECT
hour,
document_namespace,
document_type,
document_version,
error_type,
COALESCE(ping_count, 0) + COALESCE(error_count, 0) AS ping_count,
COALESCE(error_count, 0) AS error_count
FROM
ping_counts
FULL OUTER JOIN
error_counts
USING
(hour, document_namespace, document_type, document_version)
),
with_ratio AS (
SELECT
*,
SAFE_DIVIDE(1.0 * error_count, ping_count) AS error_ratio
FROM
structured_hourly_errors
)
SELECT
*
FROM
with_ratio

View file

@@ -0,0 +1,19 @@
---
friendly_name: Structured Error Counts
description: >
  A daily count of structured errors by document namespace
owners:
- ascholtz@mozilla.com
labels:
  schedule: daily
scheduling:
  dag_name: bqetl_monitoring
  referenced_tables:
    - ['moz-fx-data-shared-prod', '*_live', '*']
  date_partition_parameter: null
  parameters: ["submission_date:DATE:{{ds}}"]
bigquery:
  time_partitioning:
    type: day
    field: submission_date
    require_partition_filter: false
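
One reading of the scheduling block (an assumption, not stated in the commit): `date_partition_parameter: null` turns off the default per-partition destination handling because the script manages its own writes via MERGE, and the run date is instead passed explicitly as the `submission_date` query parameter rendered from Airflow's `{{ds}}`. A minimal sketch of how such a bound parameter is consumed, using a table that appears elsewhere in this commit:

SELECT
  COUNT(*) AS error_pings
FROM
  `moz-fx-data-shared-prod.monitoring.payload_bytes_error_structured`
WHERE
  DATE(submission_timestamp) = @submission_date  -- bound to {{ds}}, e.g. DATE '2023-12-06'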

View file

@@ -0,0 +1,25 @@
fields:
- mode: NULLABLE
  name: submission_date
  type: DATE
- mode: NULLABLE
  name: document_namespace
  type: STRING
- mode: NULLABLE
  name: document_type
  type: STRING
- mode: NULLABLE
  name: document_version
  type: STRING
- mode: NULLABLE
  name: ping_count
  type: INTEGER
- mode: NULLABLE
  name: error_type
  type: STRING
- mode: NULLABLE
  name: error_count
  type: INTEGER
- mode: NULLABLE
  name: error_ratio
  type: FLOAT

View file

@@ -0,0 +1,127 @@
DECLARE dummy INT64; -- dummy variable to indicate to bigquery-etl that this is a script
CREATE TEMP TABLE
ping_counts(
submission_date DATE,
document_namespace STRING,
document_type STRING,
document_version STRING,
ping_count INT64
);
FOR record IN (
SELECT
schema_name AS dataset_id
FROM
`moz-fx-data-shared-prod.INFORMATION_SCHEMA.SCHEMATA`
WHERE
schema_name LIKE "%_live%"
)
DO
EXECUTE IMMEDIATE CONCAT(
"INSERT ping_counts (submission_date, document_namespace, document_type, document_version, ping_count) ",
"SELECT PARSE_DATE('%Y%m%d', PARTITION_ID) AS submission_date, ",
"REPLACE(TABLE_SCHEMA, '_live', '') AS document_namespace, ",
"REGEXP_EXTRACT(TABLE_NAME, r'(.+)_v[0-9]+') AS document_type, ",
"REGEXP_EXTRACT(TABLE_NAME, r'.+_v([0-9]+)') AS document_version, ",
"TOTAL_ROWS AS ping_count ",
"FROM ",
record.dataset_id,
".INFORMATION_SCHEMA.PARTITIONS ",
"WHERE PARTITION_ID != '__NULL__' AND ",
"PARSE_DATE('%Y%m%d', PARTITION_ID) < CURRENT_DATE AND ('",
CAST(@submission_date AS STRING),
"' IS NULL OR '",
CAST(@submission_date AS STRING),
"' = PARSE_DATE('%Y%m%d', PARTITION_ID))"
);
END
FOR;
CREATE TEMP TABLE
error_counts(
submission_date DATE,
document_namespace STRING,
document_type STRING,
document_version STRING,
ping_count INTEGER,
error_type STRING,
error_count INTEGER,
error_ratio FLOAT64
)
AS
WITH errors AS (
SELECT
DATE(submission_timestamp) AS submission_date,
document_namespace,
document_type,
document_version,
error_type,
COUNT(*) AS error_count
FROM
`moz-fx-data-shared-prod.monitoring.payload_bytes_error_structured`
WHERE
DATE(submission_timestamp) = @submission_date
GROUP BY
submission_date,
document_namespace,
document_type,
document_version,
error_type
)
SELECT
submission_date,
document_namespace,
document_type,
document_version,
COALESCE(ping_count, 0) + COALESCE(error_count, 0) AS ping_count,
error_type,
COALESCE(error_count, 0) AS error_count,
SAFE_DIVIDE(
1.0 * COALESCE(error_count, 0),
COALESCE(ping_count, 0) + COALESCE(error_count, 0)
) AS error_ratio
FROM
ping_counts
FULL OUTER JOIN
errors
USING
(submission_date, document_namespace, document_type, document_version);
MERGE
`moz-fx-data-shared-prod.monitoring_derived.structured_error_counts_v2` r
USING
error_counts d
ON
d.submission_date = r.submission_date
AND r.document_namespace = d.document_namespace
AND r.document_type = d.document_type
AND r.document_version = d.document_version
WHEN NOT MATCHED
THEN
INSERT
(
submission_date,
document_namespace,
document_type,
document_version,
ping_count,
error_type,
error_count,
error_ratio
)
VALUES
(
d.submission_date,
d.document_namespace,
d.document_type,
d.document_version,
d.ping_count,
d.error_type,
d.error_count,
d.error_ratio
)
WHEN NOT MATCHED BY SOURCE
AND r.submission_date = @submission_date
THEN
DELETE;
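
For readability, roughly what one iteration of the EXECUTE IMMEDIATE loop above generates, reconstructed from the CONCAT pieces and therefore approximate; the dataset name `telemetry_live` and the date are assumptions:

-- Illustrative expansion only, not part of the commit.
INSERT ping_counts (submission_date, document_namespace, document_type, document_version, ping_count)
SELECT
  PARSE_DATE('%Y%m%d', PARTITION_ID) AS submission_date,
  REPLACE(TABLE_SCHEMA, '_live', '') AS document_namespace,
  REGEXP_EXTRACT(TABLE_NAME, r'(.+)_v[0-9]+') AS document_type,
  REGEXP_EXTRACT(TABLE_NAME, r'.+_v([0-9]+)') AS document_version,
  TOTAL_ROWS AS ping_count
FROM
  telemetry_live.INFORMATION_SCHEMA.PARTITIONS
WHERE
  PARTITION_ID != '__NULL__'
  AND PARSE_DATE('%Y%m%d', PARTITION_ID) < CURRENT_DATE
  AND ('2023-12-06' IS NULL OR '2023-12-06' = PARSE_DATE('%Y%m%d', PARTITION_ID))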