[Bug 1823724] Add flag to missing columns views to indicate that column exists in schema (#6215)
* Add `column_exists_in_schema` field to structured_missing_columns
* Add column_exists_in_schema to telemetry_missing_columns
* Add UDF to convert column names to be compatible with schema conventions
* Add UDF test for snake_case_columns
* Fix stage deploys for INFORMATION_SCHEMA
* Fix UDF test
* Code review feedback
* Review feedback
Parent: cebe9ca32b
Commit: 61f920cd3e
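The new `column_exists_in_schema` flag is added by the two view diffs below. As a rough illustration only (not part of this commit), a monitoring query could use the flag to narrow the output to columns that are genuinely absent from the stable-table schema; the selected columns are the ones the view's join condition already references:

SELECT
  document_namespace,
  document_type,
  document_version,
  path
FROM
  `moz-fx-data-shared-prod.monitoring.structured_missing_columns`
WHERE
  NOT column_exists_in_schema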
@@ -342,13 +342,14 @@ def _update_references(artifact_files, project_id, dataset_suffix, sql_dir):
         original_project = artifact_file.parent.parent.parent.name
 
         deployed_dataset = original_dataset
-        deployed_dataset += f"_{original_project.replace('-', '_')}"
 
-        if dataset_suffix and original_dataset not in (
+        if original_dataset not in (
             "INFORMATION_SCHEMA",
             "region-eu",
             "region-us",
         ):
-            deployed_dataset += f"_{dataset_suffix}"
+            deployed_dataset += f"_{original_project.replace('-', '_')}"
+            if dataset_suffix:
+                deployed_dataset += f"_{dataset_suffix}"
 
         deployed_project = project_id
@@ -2,6 +2,29 @@ CREATE OR REPLACE VIEW
   `moz-fx-data-shared-prod.monitoring.structured_missing_columns`
 AS
 SELECT
-  *
+  missing_columns.*,
+  existing_schema.table_schema IS NOT NULL AS column_exists_in_schema
 FROM
-  `moz-fx-data-shared-prod.monitoring_derived.structured_missing_columns_v1`
+  `moz-fx-data-shared-prod.monitoring_derived.structured_missing_columns_v1` AS missing_columns
+LEFT JOIN
+  -- Check whether the column actually exists in the schema.
+  -- In some cases columns first show up as missing, but are added to the schema after some delay.
+  -- In other cases columns show up as missing due to some invalid data being sent that did not
+  -- get caught during schema validation in ingestion. For example, sometimes integer values that
+  -- are too large for BigQuery cause columns to show up here.
+  `moz-fx-data-shared-prod.region-us.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS` AS existing_schema
+ON existing_schema.table_schema = CONCAT(missing_columns.document_namespace, "_stable")
+  AND existing_schema.table_name = CONCAT(
+    missing_columns.document_type,
+    "_v",
+    missing_columns.document_version
+  )
+  -- Normalize the column paths and convert them to follow the BigQuery column naming conventions.
+  -- The `path` format looks like this: `events`.[...].`timestamp`
+  -- The `field_path` format in INFORMATION_SCHEMA.COLUMN_FIELD_PATHS looks like this: events.timestamp
+  AND ARRAY_TO_STRING(
+    `moz-fx-data-shared-prod.udf_js.snake_case_columns`(
+      REGEXP_EXTRACT_ALL(missing_columns.path, '`(.+?)`')
+    ),
+    "."
+  ) = existing_schema.field_path
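To see what the join's path normalization does in isolation, here is a small sketch (illustrative only; the sample path is made up, and the expected result follows the `camelCase` to `camel_case` behavior exercised by the UDF tests added in this commit):

SELECT
  ARRAY_TO_STRING(
    `moz-fx-data-shared-prod.udf_js.snake_case_columns`(
      -- hypothetical sample path in the backtick-quoted `path` format described above
      REGEXP_EXTRACT_ALL('`events`.`camelCase`', '`(.+?)`')
    ),
    "."
  ) AS normalized_field_path
-- Expected: 'events.camel_case', matching the dot-separated field_path format
-- used by INFORMATION_SCHEMA.COLUMN_FIELD_PATHS.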
@@ -2,6 +2,29 @@ CREATE OR REPLACE VIEW
   `moz-fx-data-shared-prod.monitoring.telemetry_missing_columns`
 AS
 SELECT
-  *
+  missing_columns.*,
+  existing_schema.table_schema IS NOT NULL AS column_exists_in_schema
 FROM
-  `moz-fx-data-shared-prod.monitoring_derived.telemetry_missing_columns_v3`
+  `moz-fx-data-shared-prod.monitoring_derived.telemetry_missing_columns_v3` AS missing_columns
+LEFT JOIN
+  -- Check whether the column actually exists in the schema.
+  -- In some cases columns first show up as missing, but are added to the schema after some delay.
+  -- In other cases columns show up as missing due to some invalid data being sent that did not
+  -- get caught during schema validation in ingestion. For example, sometimes integer values that
+  -- are too large for BigQuery cause columns to show up here.
+  `moz-fx-data-shared-prod.region-us.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS` AS existing_schema
+ON existing_schema.table_schema = CONCAT(missing_columns.document_namespace, "_stable")
+  AND existing_schema.table_name = CONCAT(
+    missing_columns.document_type,
+    "_v",
+    missing_columns.document_version
+  )
+  -- Normalize the column paths and convert them to follow the BigQuery column naming conventions.
+  -- The `path` format looks like this: `events`.[...].`timestamp`
+  -- The `field_path` format in INFORMATION_SCHEMA.COLUMN_FIELD_PATHS looks like this: events.timestamp
+  AND ARRAY_TO_STRING(
+    `moz-fx-data-shared-prod.udf_js.snake_case_columns`(
+      REGEXP_EXTRACT_ALL(missing_columns.path, '`(.+?)`')
+    ),
+    "."
+  ) = existing_schema.field_path
@@ -0,0 +1,7 @@
+---
+description: This UDF takes a list of column names to snake case and transform them
+  to be compatible with the BigQuery column naming format.
+  Based on the existing ingestion logic
+  https://github.com/mozilla/gcp-ingestion/blob/dad29698271e543018eddbb3b771ad7942bf4ce5/
+  ingestion-core/src/main/java/com/mozilla/telemetry/ingestion/core/transform/PubsubMessageToObjectNode.java#L824
+friendly_name: Snake Case Columns
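To make the description above concrete, the UDF can also be called directly under the fully qualified name the views in this commit use; the input and expected output here are taken verbatim from the test cases in the definition that follows:

SELECT
  `moz-fx-data-shared-prod.udf_js.snake_case_columns`(
    ['user_prefs', 'foo.bar', 'camelCase']
  ) AS bq_compatible_names
-- Expected per the tests below: ['user_prefs', 'foo_bar', 'camel_case']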
@@ -0,0 +1,65 @@
+CREATE OR REPLACE FUNCTION udf_js.snake_case_columns(input ARRAY<STRING>)
+RETURNS ARRAY<STRING> DETERMINISTIC
+LANGUAGE js AS r"""
+const REV_WORD_BOUND_PAT = new RegExp(
+  "\\b"                              // standard word boundary
+  + "|(?<=[a-z][A-Z])(?=\\d*[A-Z])"  // A7Aa -> A7|Aa boundary
+  + "|(?<=[a-z][A-Z])(?=\\d*[a-z])"  // a7Aa -> a7|Aa boundary
+  + "|(?<=[A-Z])(?=\\d*[a-z])"       // a7A -> a7|A boundary
+);
+
+/**
+ * Convert a name to snake case.
+ *
+ * The specific implementation here uses regular expressions in order to be compatible across languages.
+ * See https://github.com/acmiyaguchi/test-casing
+ */
+function format(input) {
+  const subbed = input.split('').reverse().join('').replace(/[^\w]|_/g, " ");
+  const reversedResult = subbed.split(REV_WORD_BOUND_PAT)
+    .map(s => s.trim())
+    .map(s => s.toLowerCase())
+    .filter(s => s.length > 0)
+    .join('_');
+  return reversedResult.split('').reverse().join('');
+}
+
+/**
+ * Convert a name to a BigQuery compatible format.
+ *
+ * If the name starts with a digit, prepend an underscore.
+ * Otherwise, convert the name to snake_case.
+ */
+function convertNameForBq(name) {
+  let sb = '';
+  if (name.length > 0 && !isNaN(parseInt(name.charAt(0)))) {
+    sb += '_';
+  }
+  sb += format(name);
+  return sb;
+}
+
+return input.map((field) => convertNameForBq(field));
+""";
+
+-- Tests
+WITH input AS (
+  SELECT
+    ['metrics', 'color'] AS test_input,
+    ['metrics', 'color'] AS expected
+  UNION ALL
+  SELECT
+    ['user_prefs', 'foo.bar', 'camelCase'],
+    ['user_prefs', 'foo_bar', 'camel_case']
+),
+formatted AS (
+  SELECT
+    udf_js.snake_case_columns(test_input) AS result,
+    expected
+  FROM
+    input
+)
+SELECT
+  mozfun.assert.array_equals(expected, result)
+FROM
+  formatted