Improve checks to account for null replacements (#6508)

* Backfill info.

* Update tests.

* Update tests.
This commit is contained in:
Lucia 2024-11-15 18:03:02 +01:00 коммит произвёл GitHub
Родитель 944d6d55df
Коммит b83224393c
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: B5690EEEBB952194
3 изменённых файлов: 40 добавлений и 26 удалений

Просмотреть файл

@ -30,6 +30,7 @@ THIS_PATH = Path(os.path.dirname(__file__))
DEFAULT_PROJECT_ID = "moz-fx-data-shared-prod"
SHREDDER_MITIGATION_QUERY_NAME = "shredder_mitigation_query"
SHREDDER_MITIGATION_CHECKS_NAME = "shredder_mitigation_checks"
DEFAULT_FOR_NULLS = "??"
WILDCARD_STRING = "???????"
WILDCARD_NUMBER = -9999999
QUERY_FILE_RE = re.compile(
@ -202,7 +203,7 @@ class Subset:
if not select_list or not from_clause:
raise click.ClickException(
f"Missing required clause to generate query.\n"
f"Actuals: SELECT: {select_list}, FROM: {self.full_table_id}"
f"Actual: SELECT: {select_list}, FROM: {self.full_table_id}"
)
query = f"SELECT {', '.join(map(str, select_list))}"
query += f" FROM {from_clause}" if from_clause is not None else ""
@ -575,7 +576,8 @@ def generate_query_with_shredder_mitigation(
common_select = (
[previous.partitioning["field"]]
+ [
f"COALESCE({dim.name}, '{WILDCARD_STRING}') AS {dim.name}"
f"IF({dim.name} IS NULL OR {dim.name} = '{DEFAULT_FOR_NULLS}', '{WILDCARD_STRING}',"
f" {dim.name}) AS {dim.name}"
for dim in common_dimensions
if (
dim.name != previous.partitioning["field"]
@ -688,7 +690,7 @@ def generate_query_with_shredder_mitigation(
if metric.data_type != DataTypeGroup.FLOAT
]
+ [
f"ROUND({previous_agg.query_cte}.{metric.name}, 10) - " # Round FLOAT to avoid exponentials.
f"ROUND({previous_agg.query_cte}.{metric.name}, 10) - " # Round FLOAT to avoid exponential numbers.
f"ROUND(COALESCE({new_agg.query_cte}.{metric.name}, 0), 10) AS {metric.name}"
for metric in metrics
if metric.data_type == DataTypeGroup.FLOAT
@ -758,13 +760,13 @@ def generate_query_with_shredder_mitigation(
final_select = f"{', '.join(combined_list)}"
# Generate formatted output strings to display generated-query information in console.
common_ouput = "".join(
common_output = "".join(
[
f"{dim.column_type.name} > {dim.name}:{dim.data_type.name}\n"
for dim in common_dimensions
]
)
metrics_ouput = "".join(
metrics_output = "".join(
[
f"{dim.column_type.name} > {dim.name}:{dim.data_type.name}\n"
for dim in metrics
@ -778,7 +780,7 @@ def generate_query_with_shredder_mitigation(
)
click.echo(
click.style(
f"Query columns:\n" f"{common_ouput + metrics_ouput + changed_output}",
f"Query columns:\n" f"{common_output + metrics_output + changed_output}",
fg="yellow",
)
)
@ -816,10 +818,22 @@ def generate_query_with_shredder_mitigation(
# Generate checks to compare versions after each partition backfill.
checks_select = (
[new.partitioning["field"]]
+ [
f"IF({dim.name} IS NULL OR {dim.name} = '{DEFAULT_FOR_NULLS}', '{WILDCARD_STRING}',"
f" {dim.name}) AS {dim.name}"
for dim in common_dimensions
if (
dim.name != previous.partitioning["field"]
and dim.data_type == DataTypeGroup.STRING
)
]
+ [
dim.name
for dim in common_dimensions
if (dim.name != new.partitioning["field"])
if (
dim.name != new.partitioning["field"]
and dim.data_type != DataTypeGroup.STRING
)
]
+ [f"SUM({metric.name})" f" AS {metric.name}" for metric in metrics]
)

Просмотреть файл

@ -27,8 +27,8 @@ SELECT
CONCAT(
((SELECT COUNT(*) FROM previous_not_matching)),
" rows in the previous data don't match backfilled data! Run auto-generated checks for ",
"all mismatches & search for rows missing or with differences in metrics. 5 sample rows: ",
(SELECT TO_JSON_STRING(ARRAY(SELECT AS STRUCT * FROM previous_not_matching LIMIT 5)))
"all mismatches & search for rows missing or with differences in metrics. Sample row in previous version: ",
(SELECT TO_JSON_STRING(ARRAY(SELECT AS STRUCT * FROM previous_not_matching LIMIT 1)))
)
),
NULL
@ -61,8 +61,8 @@ SELECT
CONCAT(
((SELECT COUNT(*) FROM backfilled_not_matching)),
" rows in backfill don't match previous version of data! Run auto-generated checks for ",
"all mismatches & search for rows added or with differences in metrics. 5 sample rows: ",
(SELECT TO_JSON_STRING(ARRAY(SELECT AS STRUCT * FROM backfilled_not_matching LIMIT 5)))
"all mismatches & search for rows added or with differences in metrics. Sample row in new_version: ",
(SELECT TO_JSON_STRING(ARRAY(SELECT AS STRUCT * FROM backfilled_not_matching LIMIT 1)))
)
),
NULL

Просмотреть файл

@ -932,7 +932,7 @@ class TestSubset:
)
assert str(e.value.message) == (
f"Missing required clause to generate query.\n"
f"Actuals: SELECT: [], FROM: {test_subset.full_table_id}"
f"Actual: SELECT: [], FROM: {test_subset.full_table_id}"
)
@patch("google.cloud.bigquery.Client")
@ -1007,7 +1007,7 @@ class TestGenerateQueryWithShredderMitigation:
new_agg AS (
SELECT
submission_date,
COALESCE(column_1, '???????') AS column_1,
IF(column_1 IS NULL OR column_1 = '??', '???????', column_1) AS column_1,
SUM(metric_1) AS metric_1
FROM
new_version
@ -1017,7 +1017,7 @@ class TestGenerateQueryWithShredderMitigation:
previous_agg AS (
SELECT
submission_date,
COALESCE(column_1, '???????') AS column_1,
IF(column_1 IS NULL OR column_1 = '??', '???????', column_1) AS column_1,
SUM(metric_1) AS metric_1
FROM
`moz-fx-data-shared-prod.test.test_query_v1`
@ -1259,7 +1259,7 @@ class TestGenerateQueryWithShredderMitigation:
WITH previous AS (
SELECT
column_1,
column_2,
IF(column_2 IS NULL OR column_2 = '??', '???????', column_2) AS column_2,
SUM(metric_1) AS metric_1,
SUM(metric_2) AS metric_2
FROM
@ -1272,7 +1272,7 @@ class TestGenerateQueryWithShredderMitigation:
new_version AS (
SELECT
column_1,
column_2,
IF(column_2 IS NULL OR column_2 = '??', '???????', column_2) AS column_2,
SUM(metric_1) AS metric_1,
SUM(metric_2) AS metric_2
FROM
@ -1300,8 +1300,8 @@ class TestGenerateQueryWithShredderMitigation:
CONCAT(
((SELECT COUNT(*) FROM previous_not_matching)),
" rows in the previous data don't match backfilled data! Run auto-generated checks for ",
"all mismatches & search for rows missing or with differences in metrics. 5 sample rows: ",
(SELECT TO_JSON_STRING(ARRAY(SELECT AS STRUCT * FROM previous_not_matching LIMIT 5)))
"all mismatches & search for rows missing or with differences in metrics. Sample row in previous version: ",
(SELECT TO_JSON_STRING(ARRAY(SELECT AS STRUCT * FROM previous_not_matching LIMIT 1)))
)
),
NULL
@ -1313,7 +1313,7 @@ class TestGenerateQueryWithShredderMitigation:
WITH previous AS (
SELECT
column_1,
column_2,
IF(column_2 IS NULL OR column_2 = '??', '???????', column_2) AS column_2,
SUM(metric_1) AS metric_1,
SUM(metric_2) AS metric_2
FROM
@ -1326,7 +1326,7 @@ class TestGenerateQueryWithShredderMitigation:
new_version AS (
SELECT
column_1,
column_2,
IF(column_2 IS NULL OR column_2 = '??', '???????', column_2) AS column_2,
SUM(metric_1) AS metric_1,
SUM(metric_2) AS metric_2
FROM
@ -1354,8 +1354,8 @@ class TestGenerateQueryWithShredderMitigation:
CONCAT(
((SELECT COUNT(*) FROM backfilled_not_matching)),
" rows in backfill don't match previous version of data! Run auto-generated checks for ",
"all mismatches & search for rows added or with differences in metrics. 5 sample rows: ",
(SELECT TO_JSON_STRING(ARRAY(SELECT AS STRUCT * FROM backfilled_not_matching LIMIT 5)))
"all mismatches & search for rows added or with differences in metrics. Sample row in new_version: ",
(SELECT TO_JSON_STRING(ARRAY(SELECT AS STRUCT * FROM backfilled_not_matching LIMIT 1)))
)
),
NULL
@ -1746,7 +1746,7 @@ class TestGenerateQueryWithShredderMitigation:
call(
select_list=[
"column_1",
"COALESCE(column_2, '???????') AS column_2",
"IF(column_2 IS NULL OR column_2 = '??', '???????', column_2) AS column_2",
"SUM(metric_1) AS metric_1",
],
from_clause="new_version",
@ -1755,7 +1755,7 @@ class TestGenerateQueryWithShredderMitigation:
call(
select_list=[
"column_1",
"COALESCE(column_2, '???????') AS column_2",
"IF(column_2 IS NULL OR column_2 = '??', '???????', column_2) AS column_2",
"SUM(metric_1) AS metric_1",
],
from_clause="`moz-fx-data-shared-prod.test.test_query_v1`",
@ -1779,7 +1779,7 @@ class TestGenerateQueryWithShredderMitigation:
call(
select_list=[
"column_1",
"column_2",
"IF(column_2 IS NULL OR column_2 = '??', '???????', column_2) AS column_2",
"SUM(metric_1) AS metric_1",
],
from_clause="`moz-fx-data-shared-prod.test.test_query_v1`",
@ -1789,7 +1789,7 @@ class TestGenerateQueryWithShredderMitigation:
call(
select_list=[
"column_1",
"column_2",
"IF(column_2 IS NULL OR column_2 = '??', '???????', column_2) AS column_2",
"SUM(metric_1) AS metric_1",
],
from_clause="`moz-fx-data-shared-prod.test.test_query_v2__2021_01_01`",