[Bug 1823724] Add flag to missing columns views to indicate that column exists in schema (#6215)

* Add `column_exists_in_schema` field to structured_missing_columns

* Add column_exists_in_schema to telemetry_missing_columns

* Add UDF to convert column names to be compatible with schema conventions

* Add UDF test for snake_case_columns

* Fix stage deploys for INFORMATION_SCHEMA

* Fix UDF test

* Code review feedback

* Review feedback
This commit is contained in:
Anna Scholtz 2024-09-19 14:12:31 -07:00 коммит произвёл GitHub
Родитель cebe9ca32b
Коммит 61f920cd3e
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: B5690EEEBB952194
5 изменённых файлов: 126 добавлений и 7 удалений

Просмотреть файл

@ -342,14 +342,15 @@ def _update_references(artifact_files, project_id, dataset_suffix, sql_dir):
original_project = artifact_file.parent.parent.parent.name
deployed_dataset = original_dataset
deployed_dataset += f"_{original_project.replace('-', '_')}"
if dataset_suffix and original_dataset not in (
if original_dataset not in (
"INFORMATION_SCHEMA",
"region-eu",
"region-us",
):
deployed_dataset += f"_{dataset_suffix}"
deployed_dataset += f"_{original_project.replace('-', '_')}"
if dataset_suffix:
deployed_dataset += f"_{dataset_suffix}"
deployed_project = project_id

Просмотреть файл

@ -2,6 +2,29 @@ CREATE OR REPLACE VIEW
`moz-fx-data-shared-prod.monitoring.structured_missing_columns`
AS
SELECT
  missing_columns.*,
  -- TRUE when the reported "missing" column is in fact present in the
  -- corresponding stable-table schema (i.e. the report is stale or spurious).
  existing_schema.table_schema IS NOT NULL AS column_exists_in_schema
FROM
  `moz-fx-data-shared-prod.monitoring_derived.structured_missing_columns_v1` AS missing_columns
LEFT JOIN
  -- Check whether the column actually exists in the schema.
  -- In some cases columns first show up as missing, but are added to the schema after some delay.
  -- In other cases columns show up as missing due to some invalid data being sent that did not
  -- get caught during schema validation in ingestion. For example, sometimes integer values that
  -- are too large for BigQuery cause columns to show up here.
  `moz-fx-data-shared-prod.region-us.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS` AS existing_schema
  -- Stable tables live in datasets named `<document_namespace>_stable`, with one
  -- table per doctype/version: `<document_type>_v<document_version>`.
  ON existing_schema.table_schema = CONCAT(missing_columns.document_namespace, "_stable")
  AND existing_schema.table_name = CONCAT(
    missing_columns.document_type,
    "_v",
    missing_columns.document_version
  )
  -- Normalize the column paths and convert them to follow the BigQuery column naming conventions.
  -- The `path` format looks like this: `events`.[...].`timestamp`
  -- The `field_path` format in INFORMATION_SCHEMA.COLUMN_FIELD_PATHS looks like this: events.timestamp
  AND ARRAY_TO_STRING(
    `moz-fx-data-shared-prod.udf_js.snake_case_columns`(
      REGEXP_EXTRACT_ALL(missing_columns.path, '`(.+?)`')
    ),
    "."
  ) = existing_schema.field_path

Просмотреть файл

@ -2,6 +2,29 @@ CREATE OR REPLACE VIEW
`moz-fx-data-shared-prod.monitoring.telemetry_missing_columns`
AS
SELECT
  missing_columns.*,
  -- TRUE when the reported "missing" column is in fact present in the
  -- corresponding stable-table schema (i.e. the report is stale or spurious).
  existing_schema.table_schema IS NOT NULL AS column_exists_in_schema
FROM
  `moz-fx-data-shared-prod.monitoring_derived.telemetry_missing_columns_v3` AS missing_columns
LEFT JOIN
  -- Check whether the column actually exists in the schema.
  -- In some cases columns first show up as missing, but are added to the schema after some delay.
  -- In other cases columns show up as missing due to some invalid data being sent that did not
  -- get caught during schema validation in ingestion. For example, sometimes integer values that
  -- are too large for BigQuery cause columns to show up here.
  `moz-fx-data-shared-prod.region-us.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS` AS existing_schema
  -- Stable tables live in datasets named `<document_namespace>_stable`, with one
  -- table per doctype/version: `<document_type>_v<document_version>`.
  ON existing_schema.table_schema = CONCAT(missing_columns.document_namespace, "_stable")
  AND existing_schema.table_name = CONCAT(
    missing_columns.document_type,
    "_v",
    missing_columns.document_version
  )
  -- Normalize the column paths and convert them to follow the BigQuery column naming conventions.
  -- The `path` format looks like this: `events`.[...].`timestamp`
  -- The `field_path` format in INFORMATION_SCHEMA.COLUMN_FIELD_PATHS looks like this: events.timestamp
  AND ARRAY_TO_STRING(
    `moz-fx-data-shared-prod.udf_js.snake_case_columns`(
      REGEXP_EXTRACT_ALL(missing_columns.path, '`(.+?)`')
    ),
    "."
  ) = existing_schema.field_path

Просмотреть файл

@ -0,0 +1,7 @@
---
description: This UDF takes a list of column names, converts them to snake case,
and transforms them to be compatible with the BigQuery column naming format.
Based on the existing ingestion logic
https://github.com/mozilla/gcp-ingestion/blob/dad29698271e543018eddbb3b771ad7942bf4ce5/
ingestion-core/src/main/java/com/mozilla/telemetry/ingestion/core/transform/PubsubMessageToObjectNode.java#L824
friendly_name: Snake Case Columns

Просмотреть файл

@ -0,0 +1,65 @@
-- Converts each name in `input` to a BigQuery-compatible snake_case column name:
-- non-word characters become underscores, camelCase word boundaries become
-- underscores, and a leading underscore is prepended when a name starts with a
-- digit. The implementation intentionally mirrors the ingestion-side column
-- naming logic (PubsubMessageToObjectNode, linked in the metadata file) so that
-- column paths reported as "missing" can be matched against
-- INFORMATION_SCHEMA.COLUMN_FIELD_PATHS.
-- DETERMINISTIC: the output depends only on the input array, which allows
-- BigQuery to cache results of this JS UDF.
-- NOTE(review): the JS works on the reversed string (hence REV_WORD_BOUND_PAT
-- with lookbehinds) to be compatible across regex engines; see the
-- acmiyaguchi/test-casing link inside the body. Do not restyle the regexes
-- without re-running the ingestion-compatibility tests below.
CREATE OR REPLACE FUNCTION udf_js.snake_case_columns(input ARRAY<STRING>)
RETURNS ARRAY<STRING> DETERMINISTIC
LANGUAGE js AS r"""
const REV_WORD_BOUND_PAT = new RegExp(
"\\b" // standard word boundary
+ "|(?<=[a-z][A-Z])(?=\\d*[A-Z])" // A7Aa -> A7|Aa boundary
+ "|(?<=[a-z][A-Z])(?=\\d*[a-z])" // a7Aa -> a7|Aa boundary
+ "|(?<=[A-Z])(?=\\d*[a-z])" // a7A -> a7|A boundary
);
/**
* Convert a name to snake case.
*
* The specific implementation here uses regular expressions in order to be compatible across languages.
* See https://github.com/acmiyaguchi/test-casing
*/
function format(input) {
const subbed = input.split('').reverse().join('').replace(/[^\w]|_/g, " ");
const reversedResult = subbed.split(REV_WORD_BOUND_PAT)
.map(s => s.trim())
.map(s => s.toLowerCase())
.filter(s => s.length > 0)
.join('_');
return reversedResult.split('').reverse().join('');
}
/**
* Convert a name to a BigQuery compatible format.
*
* If the name starts with a digit, prepend an underscore.
* Otherwise, convert the name to snake_case.
*/
function convertNameForBq(name) {
let sb = '';
if (name.length > 0 && !isNaN(parseInt(name.charAt(0)))) {
sb += '_';
}
sb += format(name);
return sb;
}
return input.map((field) => convertNameForBq(field));
""";
-- Tests
-- Each row pairs a raw input array with the expected BigQuery-compatible
-- output; mozfun.assert.array_equals raises on any element-wise mismatch.
WITH input AS (
  -- Already-conventional names pass through unchanged.
  SELECT
    ['metrics', 'color'] AS test_input,
    ['metrics', 'color'] AS expected
  UNION ALL
  -- Dots (non-word chars) become underscores; camelCase becomes snake_case.
  SELECT
    ['user_prefs', 'foo.bar', 'camelCase'],
    ['user_prefs', 'foo_bar', 'camel_case']
  UNION ALL
  -- A name starting with a digit gets a leading underscore (BigQuery column
  -- names cannot start with a digit); hyphens are treated like other
  -- non-word characters.
  SELECT
    ['2fa', 'some-metric'],
    ['_2fa', 'some_metric']
  UNION ALL
  -- An empty input list yields an empty output list.
  SELECT
    ARRAY<STRING>[],
    ARRAY<STRING>[]
),
formatted AS (
  SELECT
    udf_js.snake_case_columns(test_input) AS result,
    expected
  FROM
    input
)
SELECT
  mozfun.assert.array_equals(expected, result)
FROM
  formatted