[Bug 1823724] Add flag to missing columns views to indicate that column exists in schema (#6215)
* Add `column_exists_in_schema` field to structured_missing_columns
* Add column_exists_in_schema to telemetry_missing_columns
* Add UDF to convert column names to be compatible with schema conventions
* Add UDF test for snake_case_columns
* Fix stage deploys for INFORMATION_SCHEMA
* Fix UDF test
* Code review feedback
* Review feedback
Parent: cebe9ca32b
Commit: 61f920cd3e
@@ -342,14 +342,15 @@ def _update_references(artifact_files, project_id, dataset_suffix, sql_dir):
         original_project = artifact_file.parent.parent.parent.name
 
         deployed_dataset = original_dataset
-        deployed_dataset += f"_{original_project.replace('-', '_')}"
 
-        if dataset_suffix and original_dataset not in (
+        if original_dataset not in (
             "INFORMATION_SCHEMA",
             "region-eu",
             "region-us",
         ):
-            deployed_dataset += f"_{dataset_suffix}"
+            deployed_dataset += f"_{original_project.replace('-', '_')}"
+            if dataset_suffix:
+                deployed_dataset += f"_{dataset_suffix}"
 
         deployed_project = project_id
 
@@ -2,6 +2,29 @@ CREATE OR REPLACE VIEW
   `moz-fx-data-shared-prod.monitoring.structured_missing_columns`
 AS
 SELECT
-  *
+  missing_columns.*,
+  existing_schema.table_schema IS NOT NULL AS column_exists_in_schema
 FROM
-  `moz-fx-data-shared-prod.monitoring_derived.structured_missing_columns_v1`
+  `moz-fx-data-shared-prod.monitoring_derived.structured_missing_columns_v1` AS missing_columns
+LEFT JOIN
+  -- Check whether the column actually exists in the schema.
+  -- In some cases columns first show up as missing, but are added to the schema after some delay.
+  -- In other cases columns show up as missing due to some invalid data being sent that did not
+  -- get caught during schema validation in ingestion. For example, sometimes integer values that
+  -- are too large for BigQuery cause columns to show up here.
+  `moz-fx-data-shared-prod.region-us.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS` AS existing_schema
+  ON existing_schema.table_schema = CONCAT(missing_columns.document_namespace, "_stable")
+  AND existing_schema.table_name = CONCAT(
+    missing_columns.document_type,
+    "_v",
+    missing_columns.document_version
+  )
+  -- Normalize the column paths and convert them to follow the BigQuery column naming conventions.
+  -- The `path` format looks like this: `events`.[...].`timestamp`
+  -- The `field_path` format in INFORMATION_SCHEMA.COLUMN_FIELD_PATHS looks like this: events.timestamp
+  AND ARRAY_TO_STRING(
+    `moz-fx-data-shared-prod.udf_js.snake_case_columns`(
+      REGEXP_EXTRACT_ALL(missing_columns.path, '`(.+?)`')
+    ),
+    "."
+  ) = existing_schema.field_path
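As a rough illustration of the join condition's path normalization (a sketch, not part of the commit; the sample path value is made up), the expression extracts the backtick-quoted segments of `path`, snake_cases them with the UDF defined further down, and rejoins them into the dotted `field_path` form used by INFORMATION_SCHEMA:

SELECT
  ARRAY_TO_STRING(
    `moz-fx-data-shared-prod.udf_js.snake_case_columns`(
      REGEXP_EXTRACT_ALL('`events`.`extra`.`eventTimestamp`', '`(.+?)`')
    ),
    "."
  ) AS normalized_field_path;
-- Expected result (assuming the UDF behaves as documented below): events.extra.event_timestamp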
@@ -2,6 +2,29 @@ CREATE OR REPLACE VIEW
   `moz-fx-data-shared-prod.monitoring.telemetry_missing_columns`
 AS
 SELECT
-  *
+  missing_columns.*,
+  existing_schema.table_schema IS NOT NULL AS column_exists_in_schema
 FROM
-  `moz-fx-data-shared-prod.monitoring_derived.telemetry_missing_columns_v3`
+  `moz-fx-data-shared-prod.monitoring_derived.telemetry_missing_columns_v3` AS missing_columns
+LEFT JOIN
+  -- Check whether the column actually exists in the schema.
+  -- In some cases columns first show up as missing, but are added to the schema after some delay.
+  -- In other cases columns show up as missing due to some invalid data being sent that did not
+  -- get caught during schema validation in ingestion. For example, sometimes integer values that
+  -- are too large for BigQuery cause columns to show up here.
+  `moz-fx-data-shared-prod.region-us.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS` AS existing_schema
+  ON existing_schema.table_schema = CONCAT(missing_columns.document_namespace, "_stable")
+  AND existing_schema.table_name = CONCAT(
+    missing_columns.document_type,
+    "_v",
+    missing_columns.document_version
+  )
+  -- Normalize the column paths and convert them to follow the BigQuery column naming conventions.
+  -- The `path` format looks like this: `events`.[...].`timestamp`
+  -- The `field_path` format in INFORMATION_SCHEMA.COLUMN_FIELD_PATHS looks like this: events.timestamp
+  AND ARRAY_TO_STRING(
+    `moz-fx-data-shared-prod.udf_js.snake_case_columns`(
+      REGEXP_EXTRACT_ALL(missing_columns.path, '`(.+?)`')
+    ),
+    "."
+  ) = existing_schema.field_path
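A hedged usage sketch (not part of the commit) of how the new flag might be used to narrow either monitoring view down to columns that are still genuinely absent from the stable table schemas:

SELECT
  document_namespace,
  document_type,
  document_version,
  path
FROM
  `moz-fx-data-shared-prod.monitoring.telemetry_missing_columns`
WHERE
  NOT column_exists_in_schema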
@@ -0,0 +1,7 @@
+---
+description: This UDF takes a list of column names to snake case and transform them
+  to be compatible with the BigQuery column naming format.
+  Based on the existing ingestion logic
+  https://github.com/mozilla/gcp-ingestion/blob/dad29698271e543018eddbb3b771ad7942bf4ce5/
+  ingestion-core/src/main/java/com/mozilla/telemetry/ingestion/core/transform/PubsubMessageToObjectNode.java#L824
+friendly_name: Snake Case Columns
@@ -0,0 +1,65 @@
+CREATE OR REPLACE FUNCTION udf_js.snake_case_columns(input ARRAY<STRING>)
+RETURNS ARRAY<STRING> DETERMINISTIC
+LANGUAGE js AS r"""
+  const REV_WORD_BOUND_PAT = new RegExp(
+    "\\b"                              // standard word boundary
+    + "|(?<=[a-z][A-Z])(?=\\d*[A-Z])"  // A7Aa -> A7|Aa boundary
+    + "|(?<=[a-z][A-Z])(?=\\d*[a-z])"  // a7Aa -> a7|Aa boundary
+    + "|(?<=[A-Z])(?=\\d*[a-z])"       // a7A -> a7|A boundary
+  );
+
+  /**
+   * Convert a name to snake case.
+   *
+   * The specific implementation here uses regular expressions in order to be compatible across languages.
+   * See https://github.com/acmiyaguchi/test-casing
+   */
+  function format(input) {
+    const subbed = input.split('').reverse().join('').replace(/[^\w]|_/g, " ");
+    const reversedResult = subbed.split(REV_WORD_BOUND_PAT)
+      .map(s => s.trim())
+      .map(s => s.toLowerCase())
+      .filter(s => s.length > 0)
+      .join('_');
+    return reversedResult.split('').reverse().join('');
+  }
+
+  /**
+   * Convert a name to a BigQuery compatible format.
+   *
+   * If the name starts with a digit, prepend an underscore.
+   * Otherwise, convert the name to snake_case.
+   */
+  function convertNameForBq(name) {
+    let sb = '';
+    if (name.length > 0 && !isNaN(parseInt(name.charAt(0)))) {
+      sb += '_';
+    }
+    sb += format(name);
+    return sb;
+  }
+
+  return input.map((field) => convertNameForBq(field));
+""";
+
+-- Tests
+WITH input AS (
+  SELECT
+    ['metrics', 'color'] AS test_input,
+    ['metrics', 'color'] AS expected
+  UNION ALL
+  SELECT
+    ['user_prefs', 'foo.bar', 'camelCase'],
+    ['user_prefs', 'foo_bar', 'camel_case']
+),
+formatted AS (
+  SELECT
+    udf_js.snake_case_columns(test_input) AS result,
+    expected
+  FROM
+    input
+)
+SELECT
+  mozfun.assert.array_equals(expected, result)
+FROM
+  formatted
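For a quick sanity check of the conversion rules beyond the tests above (a sketch, not part of the commit; the inputs and expected output are my own assumptions based on the documented behavior, including the leading-digit underscore rule), the UDF can be called directly:

SELECT
  udf_js.snake_case_columns(['backgroundColor', 'foo.bar', '2fa']) AS converted;
-- Expected (assumption): ['background_color', 'foo_bar', '_2fa']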