diff --git a/bigquery_etl/shredder/config.py b/bigquery_etl/shredder/config.py
index e983af6c9b..b00efa8afb 100755
--- a/bigquery_etl/shredder/config.py
+++ b/bigquery_etl/shredder/config.py
@@ -499,6 +499,15 @@ SEARCH_IGNORE_FIELDS = {
     ("telemetry_stable.prio_v4", ID),
 }
 
+# dataset_id.table_id entries to ignore in the find_glean_targets function
+GLEAN_IGNORE_LIST = {
+    # deletion request table
+    "firefox_desktop_derived.migration_esr_incorrect_deletion_request_v1",
+    # subset of firefox_desktop_stable.pageload_v1 which doesn't have client ids
+    "firefox_desktop_derived.pageload_1pct_v1",
+    "firefox_desktop_derived.pageload_nightly_v1",
+}
+
 
 def find_glean_targets(
     pool: ThreadPool, client: bigquery.Client, project: str = SHARED_PROD
@@ -602,6 +611,7 @@ def find_glean_targets(
 
     # skip tables already added to DELETE_TARGETS
     manually_added_tables = {target.table for target in DELETE_TARGETS.keys()}
+    skipped_tables = manually_added_tables | GLEAN_IGNORE_LIST
 
     return {
         **{
@@ -618,7 +628,7 @@ def find_glean_targets(
             and not table.table_id.startswith("migration")
             # skip tables with explicitly excluded client ids
             and table.labels.get("include_client_id", "true").lower() != "false"
-            and qualified_table_id(table) not in manually_added_tables
+            and qualified_table_id(table) not in skipped_tables
         },
         **{
             # glean derived tables that contain client_id
@@ -630,7 +640,26 @@ def find_glean_targets(
             for table in glean_derived_tables
             if any(field.name == CLIENT_ID for field in table.schema)
             and not table.table_id.startswith(derived_source_prefix)
-            and qualified_table_id(table) not in manually_added_tables
+            and qualified_table_id(table) not in skipped_tables
+        },
+        **{
+            # glean derived tables that contain client_info.client_id but not client_id
+            DeleteTarget(
+                table=qualified_table_id(table),
+                # field must be repeated for each deletion source
+                field=(GLEAN_CLIENT_ID,) * len(sources[table.dataset_id]),
+            ): sources[table.dataset_id]
+            for table in glean_derived_tables
+            if any(
+                field.name == "client_info"
+                and any(
+                    nested_field.name == "client_id" for nested_field in field.fields
+                )
+                for field in table.schema
+            )
+            and all(field.name != CLIENT_ID for field in table.schema)
+            and not table.table_id.startswith(derived_source_prefix)
+            and qualified_table_id(table) not in skipped_tables
         },
     }
 
diff --git a/tests/shredder/test_config.py b/tests/shredder/test_config.py
index 8e8956714f..799db38b26 100644
--- a/tests/shredder/test_config.py
+++ b/tests/shredder/test_config.py
@@ -317,6 +317,7 @@ def test_glean_targets_override(mock_requests):
             table_ids = [
                 "adclick_history_v1",  # should use value from override
                 "other_table_v1",
+                "pageload_1pct_v1",  # should be ignored
             ]
         else:
             raise Exception(f"unexpected dataset: {dataset_ref}")