Make shredder detect glean derived tables with client_info.client_id (#6459)

* Make shredder detect glean derived tables with client_info.client_id

* Add skip list
This commit is contained in:
Ben Wu 2024-11-12 23:01:47 +00:00 коммит произвёл GitHub
Родитель bf6852e4b1
Коммит 3c13b9af65
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: B5690EEEBB952194
2 изменённых файлов: 32 добавлений и 2 удалений

Просмотреть файл

@@ -499,6 +499,15 @@ SEARCH_IGNORE_FIELDS = {
("telemetry_stable.prio_v4", ID), ("telemetry_stable.prio_v4", ID),
} }
# Qualified table names (dataset_id.table_id) that find_glean_targets must skip.
# find_glean_targets unions this set with the tables already present in
# DELETE_TARGETS and excludes any table whose qualified_table_id(table) is in
# the combined set.
GLEAN_IGNORE_LIST = {
# deletion request table
# NOTE(review): presumably holds deletion requests rather than data to be
# deleted - confirm
"firefox_desktop_derived.migration_esr_incorrect_deletion_request_v1",
# subset of firefox_desktop_stable.pageload_v1 which doesn't have client ids
# NOTE(review): this comment presumably covers both pageload entries below
"firefox_desktop_derived.pageload_1pct_v1",
"firefox_desktop_derived.pageload_nightly_v1",
}
def find_glean_targets( def find_glean_targets(
pool: ThreadPool, client: bigquery.Client, project: str = SHARED_PROD pool: ThreadPool, client: bigquery.Client, project: str = SHARED_PROD
@@ -602,6 +611,7 @@ def find_glean_targets(
# skip tables already added to DELETE_TARGETS # skip tables already added to DELETE_TARGETS
manually_added_tables = {target.table for target in DELETE_TARGETS.keys()} manually_added_tables = {target.table for target in DELETE_TARGETS.keys()}
skipped_tables = manually_added_tables | GLEAN_IGNORE_LIST
return { return {
**{ **{
@@ -618,7 +628,7 @@ def find_glean_targets(
and not table.table_id.startswith("migration") and not table.table_id.startswith("migration")
# skip tables with explicitly excluded client ids # skip tables with explicitly excluded client ids
and table.labels.get("include_client_id", "true").lower() != "false" and table.labels.get("include_client_id", "true").lower() != "false"
and qualified_table_id(table) not in manually_added_tables and qualified_table_id(table) not in skipped_tables
}, },
**{ **{
# glean derived tables that contain client_id # glean derived tables that contain client_id
@@ -630,7 +640,26 @@ def find_glean_targets(
for table in glean_derived_tables for table in glean_derived_tables
if any(field.name == CLIENT_ID for field in table.schema) if any(field.name == CLIENT_ID for field in table.schema)
and not table.table_id.startswith(derived_source_prefix) and not table.table_id.startswith(derived_source_prefix)
and qualified_table_id(table) not in manually_added_tables and qualified_table_id(table) not in skipped_tables
},
**{
# glean derived tables that contain client_info.client_id but not client_id
DeleteTarget(
table=qualified_table_id(table),
# field must be repeated for each deletion source
field=(GLEAN_CLIENT_ID,) * len(sources[table.dataset_id]),
): sources[table.dataset_id]
for table in glean_derived_tables
if any(
field.name == "client_info"
and any(
[nested_field.name == "client_id" for nested_field in field.fields]
)
for field in table.schema
)
and all(field.name != CLIENT_ID for field in table.schema)
and not table.table_id.startswith(derived_source_prefix)
and qualified_table_id(table) not in skipped_tables
}, },
} }

Просмотреть файл

@@ -317,6 +317,7 @@ def test_glean_targets_override(mock_requests):
table_ids = [ table_ids = [
"adclick_history_v1", # should use value from override "adclick_history_v1", # should use value from override
"other_table_v1", "other_table_v1",
"pageload_1pct_v1", # should be ignored
] ]
else: else:
raise Exception(f"unexpected dataset: {dataset_ref}") raise Exception(f"unexpected dataset: {dataset_ref}")