[Bug 1823724] Add flag to missing columns views to indicate that column exists in schema (#6215)
* Add `column_exists_in_schema` field to structured_missing_columns
* Add column_exists_in_schema to telemetry_missing_columns
* Add UDF to convert column names to be compatible with schema conventions
* Add UDF test for snake_case_columns
* Fix stage deploys for INFORMATION_SCHEMA
* Fix UDF test
* Code review feedback
* Review feedback
Parent: cebe9ca32b
Commit: 61f920cd3e
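The new `column_exists_in_schema` flag is added by the two view diffs below. As a rough illustration only (not part of this commit), a monitoring query could use the flag to narrow the output to columns that are genuinely absent from the stable-table schema; the selected columns are the ones the view's join condition already references:

SELECT
  document_namespace,
  document_type,
  document_version,
  path
FROM
  `moz-fx-data-shared-prod.monitoring.structured_missing_columns`
WHERE
  NOT column_exists_in_schema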
@@ -342,13 +342,14 @@ def _update_references(artifact_files, project_id, dataset_suffix, sql_dir):
         original_project = artifact_file.parent.parent.parent.name
 
         deployed_dataset = original_dataset
-        deployed_dataset += f"_{original_project.replace('-', '_')}"
 
-        if dataset_suffix and original_dataset not in (
+        if original_dataset not in (
             "INFORMATION_SCHEMA",
             "region-eu",
             "region-us",
         ):
-            deployed_dataset += f"_{dataset_suffix}"
+            deployed_dataset += f"_{original_project.replace('-', '_')}"
+            if dataset_suffix:
+                deployed_dataset += f"_{dataset_suffix}"
 
         deployed_project = project_id
@@ -2,6 +2,29 @@ CREATE OR REPLACE VIEW
   `moz-fx-data-shared-prod.monitoring.structured_missing_columns`
 AS
 SELECT
-  *
+  missing_columns.*,
+  existing_schema.table_schema IS NOT NULL AS column_exists_in_schema
 FROM
-  `moz-fx-data-shared-prod.monitoring_derived.structured_missing_columns_v1`
+  `moz-fx-data-shared-prod.monitoring_derived.structured_missing_columns_v1` AS missing_columns
+LEFT JOIN
+  -- Check whether the column actually exists in the schema.
+  -- In some cases columns first show up as missing, but are added to the schema after some delay.
+  -- In other cases columns show up as missing due to some invalid data being sent that did not
+  -- get caught during schema validation in ingestion. For example, sometimes integer values that
+  -- are too large for BigQuery cause columns to show up here.
+  `moz-fx-data-shared-prod.region-us.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS` AS existing_schema
+ON existing_schema.table_schema = CONCAT(missing_columns.document_namespace, "_stable")
+  AND existing_schema.table_name = CONCAT(
+    missing_columns.document_type,
+    "_v",
+    missing_columns.document_version
+  )
+  -- Normalize the column paths and convert them to follow the BigQuery column naming conventions.
+  -- The `path` format looks like this: `events`.[...].`timestamp`
+  -- The `field_path` format in INFORMATION_SCHEMA.COLUMN_FIELD_PATHS looks like this: events.timestamp
+  AND ARRAY_TO_STRING(
+    `moz-fx-data-shared-prod.udf_js.snake_case_columns`(
+      REGEXP_EXTRACT_ALL(missing_columns.path, '`(.+?)`')
+    ),
+    "."
+  ) = existing_schema.field_path
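To see what the join's path normalization does in isolation, here is a small sketch (illustrative only; the sample path is made up, and the expected result follows the `camelCase` to `camel_case` behavior exercised by the UDF tests added in this commit):

SELECT
  ARRAY_TO_STRING(
    `moz-fx-data-shared-prod.udf_js.snake_case_columns`(
      -- hypothetical sample path in the backtick-quoted `path` format described above
      REGEXP_EXTRACT_ALL('`events`.`camelCase`', '`(.+?)`')
    ),
    "."
  ) AS normalized_field_path
-- Expected: 'events.camel_case', matching the dot-separated field_path format
-- used by INFORMATION_SCHEMA.COLUMN_FIELD_PATHS.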
@@ -2,6 +2,29 @@ CREATE OR REPLACE VIEW
   `moz-fx-data-shared-prod.monitoring.telemetry_missing_columns`
 AS
 SELECT
-  *
+  missing_columns.*,
+  existing_schema.table_schema IS NOT NULL AS column_exists_in_schema
 FROM
-  `moz-fx-data-shared-prod.monitoring_derived.telemetry_missing_columns_v3`
+  `moz-fx-data-shared-prod.monitoring_derived.telemetry_missing_columns_v3` AS missing_columns
+LEFT JOIN
+  -- Check whether the column actually exists in the schema.
+  -- In some cases columns first show up as missing, but are added to the schema after some delay.
+  -- In other cases columns show up as missing due to some invalid data being sent that did not
+  -- get caught during schema validation in ingestion. For example, sometimes integer values that
+  -- are too large for BigQuery cause columns to show up here.
+  `moz-fx-data-shared-prod.region-us.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS` AS existing_schema
+ON existing_schema.table_schema = CONCAT(missing_columns.document_namespace, "_stable")
+  AND existing_schema.table_name = CONCAT(
+    missing_columns.document_type,
+    "_v",
+    missing_columns.document_version
+  )
+  -- Normalize the column paths and convert them to follow the BigQuery column naming conventions.
+  -- The `path` format looks like this: `events`.[...].`timestamp`
+  -- The `field_path` format in INFORMATION_SCHEMA.COLUMN_FIELD_PATHS looks like this: events.timestamp
+  AND ARRAY_TO_STRING(
+    `moz-fx-data-shared-prod.udf_js.snake_case_columns`(
+      REGEXP_EXTRACT_ALL(missing_columns.path, '`(.+?)`')
+    ),
+    "."
+  ) = existing_schema.field_path
@@ -0,0 +1,7 @@
+---
+description: This UDF takes a list of column names to snake case and transform them
+  to be compatible with the BigQuery column naming format.
+  Based on the existing ingestion logic
+  https://github.com/mozilla/gcp-ingestion/blob/dad29698271e543018eddbb3b771ad7942bf4ce5/
+  ingestion-core/src/main/java/com/mozilla/telemetry/ingestion/core/transform/PubsubMessageToObjectNode.java#L824
+friendly_name: Snake Case Columns
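To make the description above concrete, the UDF can also be called directly under the fully qualified name the views in this commit use; the input and expected output here are taken verbatim from the test cases in the definition that follows:

SELECT
  `moz-fx-data-shared-prod.udf_js.snake_case_columns`(
    ['user_prefs', 'foo.bar', 'camelCase']
  ) AS bq_compatible_names
-- Expected per the tests below: ['user_prefs', 'foo_bar', 'camel_case']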
@@ -0,0 +1,65 @@
+CREATE OR REPLACE FUNCTION udf_js.snake_case_columns(input ARRAY<STRING>)
+RETURNS ARRAY<STRING> DETERMINISTIC
+LANGUAGE js AS r"""
+const REV_WORD_BOUND_PAT = new RegExp(
+  "\\b"                              // standard word boundary
+  + "|(?<=[a-z][A-Z])(?=\\d*[A-Z])"  // A7Aa -> A7|Aa boundary
+  + "|(?<=[a-z][A-Z])(?=\\d*[a-z])"  // a7Aa -> a7|Aa boundary
+  + "|(?<=[A-Z])(?=\\d*[a-z])"       // a7A -> a7|A boundary
+);
+
+/**
+ * Convert a name to snake case.
+ *
+ * The specific implementation here uses regular expressions in order to be compatible across languages.
+ * See https://github.com/acmiyaguchi/test-casing
+ */
+function format(input) {
+  const subbed = input.split('').reverse().join('').replace(/[^\w]|_/g, " ");
+  const reversedResult = subbed.split(REV_WORD_BOUND_PAT)
+    .map(s => s.trim())
+    .map(s => s.toLowerCase())
+    .filter(s => s.length > 0)
+    .join('_');
+  return reversedResult.split('').reverse().join('');
+}
+
+/**
+ * Convert a name to a BigQuery compatible format.
+ *
+ * If the name starts with a digit, prepend an underscore.
+ * Otherwise, convert the name to snake_case.
+ */
+function convertNameForBq(name) {
+  let sb = '';
+  if (name.length > 0 && !isNaN(parseInt(name.charAt(0)))) {
+    sb += '_';
+  }
+  sb += format(name);
+  return sb;
+}
+
+return input.map((field) => convertNameForBq(field));
+""";
+
+-- Tests
+WITH input AS (
+  SELECT
+    ['metrics', 'color'] AS test_input,
+    ['metrics', 'color'] AS expected
+  UNION ALL
+  SELECT
+    ['user_prefs', 'foo.bar', 'camelCase'],
+    ['user_prefs', 'foo_bar', 'camel_case']
+),
+formatted AS (
+  SELECT
+    udf_js.snake_case_columns(test_input) AS result,
+    expected
+  FROM
+    input
+)
+SELECT
+  mozfun.assert.array_equals(expected, result)
+FROM
+  formatted