Improve checks to account for null replacements (#6508)
* Backfill info. * Update tests. * Update tests.
This commit is contained in:
Родитель
944d6d55df
Коммит
b83224393c
|
@ -30,6 +30,7 @@ THIS_PATH = Path(os.path.dirname(__file__))
|
|||
DEFAULT_PROJECT_ID = "moz-fx-data-shared-prod"
|
||||
SHREDDER_MITIGATION_QUERY_NAME = "shredder_mitigation_query"
|
||||
SHREDDER_MITIGATION_CHECKS_NAME = "shredder_mitigation_checks"
|
||||
DEFAULT_FOR_NULLS = "??"
|
||||
WILDCARD_STRING = "???????"
|
||||
WILDCARD_NUMBER = -9999999
|
||||
QUERY_FILE_RE = re.compile(
|
||||
|
@ -202,7 +203,7 @@ class Subset:
|
|||
if not select_list or not from_clause:
|
||||
raise click.ClickException(
|
||||
f"Missing required clause to generate query.\n"
|
||||
f"Actuals: SELECT: {select_list}, FROM: {self.full_table_id}"
|
||||
f"Actual: SELECT: {select_list}, FROM: {self.full_table_id}"
|
||||
)
|
||||
query = f"SELECT {', '.join(map(str, select_list))}"
|
||||
query += f" FROM {from_clause}" if from_clause is not None else ""
|
||||
|
@ -575,7 +576,8 @@ def generate_query_with_shredder_mitigation(
|
|||
common_select = (
|
||||
[previous.partitioning["field"]]
|
||||
+ [
|
||||
f"COALESCE({dim.name}, '{WILDCARD_STRING}') AS {dim.name}"
|
||||
f"IF({dim.name} IS NULL OR {dim.name} = '{DEFAULT_FOR_NULLS}', '{WILDCARD_STRING}',"
|
||||
f" {dim.name}) AS {dim.name}"
|
||||
for dim in common_dimensions
|
||||
if (
|
||||
dim.name != previous.partitioning["field"]
|
||||
|
@ -688,7 +690,7 @@ def generate_query_with_shredder_mitigation(
|
|||
if metric.data_type != DataTypeGroup.FLOAT
|
||||
]
|
||||
+ [
|
||||
f"ROUND({previous_agg.query_cte}.{metric.name}, 10) - " # Round FLOAT to avoid exponentials.
|
||||
f"ROUND({previous_agg.query_cte}.{metric.name}, 10) - " # Round FLOAT to avoid exponential numbers.
|
||||
f"ROUND(COALESCE({new_agg.query_cte}.{metric.name}, 0), 10) AS {metric.name}"
|
||||
for metric in metrics
|
||||
if metric.data_type == DataTypeGroup.FLOAT
|
||||
|
@ -758,13 +760,13 @@ def generate_query_with_shredder_mitigation(
|
|||
final_select = f"{', '.join(combined_list)}"
|
||||
|
||||
# Generate formatted output strings to display generated-query information in console.
|
||||
common_ouput = "".join(
|
||||
common_output = "".join(
|
||||
[
|
||||
f"{dim.column_type.name} > {dim.name}:{dim.data_type.name}\n"
|
||||
for dim in common_dimensions
|
||||
]
|
||||
)
|
||||
metrics_ouput = "".join(
|
||||
metrics_output = "".join(
|
||||
[
|
||||
f"{dim.column_type.name} > {dim.name}:{dim.data_type.name}\n"
|
||||
for dim in metrics
|
||||
|
@ -778,7 +780,7 @@ def generate_query_with_shredder_mitigation(
|
|||
)
|
||||
click.echo(
|
||||
click.style(
|
||||
f"Query columns:\n" f"{common_ouput + metrics_ouput + changed_output}",
|
||||
f"Query columns:\n" f"{common_output + metrics_output + changed_output}",
|
||||
fg="yellow",
|
||||
)
|
||||
)
|
||||
|
@ -816,10 +818,22 @@ def generate_query_with_shredder_mitigation(
|
|||
# Generate checks to compare versions after each partition backfill.
|
||||
checks_select = (
|
||||
[new.partitioning["field"]]
|
||||
+ [
|
||||
f"IF({dim.name} IS NULL OR {dim.name} = '{DEFAULT_FOR_NULLS}', '{WILDCARD_STRING}',"
|
||||
f" {dim.name}) AS {dim.name}"
|
||||
for dim in common_dimensions
|
||||
if (
|
||||
dim.name != previous.partitioning["field"]
|
||||
and dim.data_type == DataTypeGroup.STRING
|
||||
)
|
||||
]
|
||||
+ [
|
||||
dim.name
|
||||
for dim in common_dimensions
|
||||
if (dim.name != new.partitioning["field"])
|
||||
if (
|
||||
dim.name != new.partitioning["field"]
|
||||
and dim.data_type != DataTypeGroup.STRING
|
||||
)
|
||||
]
|
||||
+ [f"SUM({metric.name})" f" AS {metric.name}" for metric in metrics]
|
||||
)
|
||||
|
|
|
@ -27,8 +27,8 @@ SELECT
|
|||
CONCAT(
|
||||
((SELECT COUNT(*) FROM previous_not_matching)),
|
||||
" rows in the previous data don't match backfilled data! Run auto-generated checks for ",
|
||||
"all mismatches & search for rows missing or with differences in metrics. 5 sample rows: ",
|
||||
(SELECT TO_JSON_STRING(ARRAY(SELECT AS STRUCT * FROM previous_not_matching LIMIT 5)))
|
||||
"all mismatches & search for rows missing or with differences in metrics. Sample row in previous version: ",
|
||||
(SELECT TO_JSON_STRING(ARRAY(SELECT AS STRUCT * FROM previous_not_matching LIMIT 1)))
|
||||
)
|
||||
),
|
||||
NULL
|
||||
|
@ -61,8 +61,8 @@ SELECT
|
|||
CONCAT(
|
||||
((SELECT COUNT(*) FROM backfilled_not_matching)),
|
||||
" rows in backfill don't match previous version of data! Run auto-generated checks for ",
|
||||
"all mismatches & search for rows added or with differences in metrics. 5 sample rows: ",
|
||||
(SELECT TO_JSON_STRING(ARRAY(SELECT AS STRUCT * FROM backfilled_not_matching LIMIT 5)))
|
||||
"all mismatches & search for rows added or with differences in metrics. Sample row in new_version: ",
|
||||
(SELECT TO_JSON_STRING(ARRAY(SELECT AS STRUCT * FROM backfilled_not_matching LIMIT 1)))
|
||||
)
|
||||
),
|
||||
NULL
|
||||
|
|
|
@ -932,7 +932,7 @@ class TestSubset:
|
|||
)
|
||||
assert str(e.value.message) == (
|
||||
f"Missing required clause to generate query.\n"
|
||||
f"Actuals: SELECT: [], FROM: {test_subset.full_table_id}"
|
||||
f"Actual: SELECT: [], FROM: {test_subset.full_table_id}"
|
||||
)
|
||||
|
||||
@patch("google.cloud.bigquery.Client")
|
||||
|
@ -1007,7 +1007,7 @@ class TestGenerateQueryWithShredderMitigation:
|
|||
new_agg AS (
|
||||
SELECT
|
||||
submission_date,
|
||||
COALESCE(column_1, '???????') AS column_1,
|
||||
IF(column_1 IS NULL OR column_1 = '??', '???????', column_1) AS column_1,
|
||||
SUM(metric_1) AS metric_1
|
||||
FROM
|
||||
new_version
|
||||
|
@ -1017,7 +1017,7 @@ class TestGenerateQueryWithShredderMitigation:
|
|||
previous_agg AS (
|
||||
SELECT
|
||||
submission_date,
|
||||
COALESCE(column_1, '???????') AS column_1,
|
||||
IF(column_1 IS NULL OR column_1 = '??', '???????', column_1) AS column_1,
|
||||
SUM(metric_1) AS metric_1
|
||||
FROM
|
||||
`moz-fx-data-shared-prod.test.test_query_v1`
|
||||
|
@ -1259,7 +1259,7 @@ class TestGenerateQueryWithShredderMitigation:
|
|||
WITH previous AS (
|
||||
SELECT
|
||||
column_1,
|
||||
column_2,
|
||||
IF(column_2 IS NULL OR column_2 = '??', '???????', column_2) AS column_2,
|
||||
SUM(metric_1) AS metric_1,
|
||||
SUM(metric_2) AS metric_2
|
||||
FROM
|
||||
|
@ -1272,7 +1272,7 @@ class TestGenerateQueryWithShredderMitigation:
|
|||
new_version AS (
|
||||
SELECT
|
||||
column_1,
|
||||
column_2,
|
||||
IF(column_2 IS NULL OR column_2 = '??', '???????', column_2) AS column_2,
|
||||
SUM(metric_1) AS metric_1,
|
||||
SUM(metric_2) AS metric_2
|
||||
FROM
|
||||
|
@ -1300,8 +1300,8 @@ class TestGenerateQueryWithShredderMitigation:
|
|||
CONCAT(
|
||||
((SELECT COUNT(*) FROM previous_not_matching)),
|
||||
" rows in the previous data don't match backfilled data! Run auto-generated checks for ",
|
||||
"all mismatches & search for rows missing or with differences in metrics. 5 sample rows: ",
|
||||
(SELECT TO_JSON_STRING(ARRAY(SELECT AS STRUCT * FROM previous_not_matching LIMIT 5)))
|
||||
"all mismatches & search for rows missing or with differences in metrics. Sample row in previous version: ",
|
||||
(SELECT TO_JSON_STRING(ARRAY(SELECT AS STRUCT * FROM previous_not_matching LIMIT 1)))
|
||||
)
|
||||
),
|
||||
NULL
|
||||
|
@ -1313,7 +1313,7 @@ class TestGenerateQueryWithShredderMitigation:
|
|||
WITH previous AS (
|
||||
SELECT
|
||||
column_1,
|
||||
column_2,
|
||||
IF(column_2 IS NULL OR column_2 = '??', '???????', column_2) AS column_2,
|
||||
SUM(metric_1) AS metric_1,
|
||||
SUM(metric_2) AS metric_2
|
||||
FROM
|
||||
|
@ -1326,7 +1326,7 @@ class TestGenerateQueryWithShredderMitigation:
|
|||
new_version AS (
|
||||
SELECT
|
||||
column_1,
|
||||
column_2,
|
||||
IF(column_2 IS NULL OR column_2 = '??', '???????', column_2) AS column_2,
|
||||
SUM(metric_1) AS metric_1,
|
||||
SUM(metric_2) AS metric_2
|
||||
FROM
|
||||
|
@ -1354,8 +1354,8 @@ class TestGenerateQueryWithShredderMitigation:
|
|||
CONCAT(
|
||||
((SELECT COUNT(*) FROM backfilled_not_matching)),
|
||||
" rows in backfill don't match previous version of data! Run auto-generated checks for ",
|
||||
"all mismatches & search for rows added or with differences in metrics. 5 sample rows: ",
|
||||
(SELECT TO_JSON_STRING(ARRAY(SELECT AS STRUCT * FROM backfilled_not_matching LIMIT 5)))
|
||||
"all mismatches & search for rows added or with differences in metrics. Sample row in new_version: ",
|
||||
(SELECT TO_JSON_STRING(ARRAY(SELECT AS STRUCT * FROM backfilled_not_matching LIMIT 1)))
|
||||
)
|
||||
),
|
||||
NULL
|
||||
|
@ -1746,7 +1746,7 @@ class TestGenerateQueryWithShredderMitigation:
|
|||
call(
|
||||
select_list=[
|
||||
"column_1",
|
||||
"COALESCE(column_2, '???????') AS column_2",
|
||||
"IF(column_2 IS NULL OR column_2 = '??', '???????', column_2) AS column_2",
|
||||
"SUM(metric_1) AS metric_1",
|
||||
],
|
||||
from_clause="new_version",
|
||||
|
@ -1755,7 +1755,7 @@ class TestGenerateQueryWithShredderMitigation:
|
|||
call(
|
||||
select_list=[
|
||||
"column_1",
|
||||
"COALESCE(column_2, '???????') AS column_2",
|
||||
"IF(column_2 IS NULL OR column_2 = '??', '???????', column_2) AS column_2",
|
||||
"SUM(metric_1) AS metric_1",
|
||||
],
|
||||
from_clause="`moz-fx-data-shared-prod.test.test_query_v1`",
|
||||
|
@ -1779,7 +1779,7 @@ class TestGenerateQueryWithShredderMitigation:
|
|||
call(
|
||||
select_list=[
|
||||
"column_1",
|
||||
"column_2",
|
||||
"IF(column_2 IS NULL OR column_2 = '??', '???????', column_2) AS column_2",
|
||||
"SUM(metric_1) AS metric_1",
|
||||
],
|
||||
from_clause="`moz-fx-data-shared-prod.test.test_query_v1`",
|
||||
|
@ -1789,7 +1789,7 @@ class TestGenerateQueryWithShredderMitigation:
|
|||
call(
|
||||
select_list=[
|
||||
"column_1",
|
||||
"column_2",
|
||||
"IF(column_2 IS NULL OR column_2 = '??', '???????', column_2) AS column_2",
|
||||
"SUM(metric_1) AS metric_1",
|
||||
],
|
||||
from_clause="`moz-fx-data-shared-prod.test.test_query_v2__2021_01_01`",
|
||||
|
|
Загрузка…
Ссылка в новой задаче