feat(DENG-2175): added matches_pattern etl checks macro and updated tests to use it (#4683)

* added matches_pattern etl checks macro and updated tests to use it

* added matches_pattern etl check macro to the data checks docs
This commit is contained in:
kik-kik 2023-12-13 16:10:34 +01:00 коммит произвёл GitHub
Родитель cbff1eec17
Коммит 34d6a463dc
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
4 изменённых файлов: 98 добавлений и 9 удалений

Просмотреть файл

@ -188,14 +188,14 @@ Example:
### value_length([source](../../tests/checks/value_length.jinja))
Checks that the columns have values of specific character length.
Checks that the column has values of specific character length.
Usage:
```
Arguments:
columns: List[str] - Columns which will be checked against the `expected_length`.
column: str - Column which will be checked against the `expected_length`.
expected_length: int - Describes the expected character length of the value inside the specified column.
where: Optional[str]: Any additional filtering rules that should be applied when retrieving the data to run the check against.
```
@ -206,6 +206,27 @@ Example:
{{ value_length(column="country", expected_length=2, where="submission_date = @submission_date") }}
```
### matches_pattern([source](../../tests/checks/matches_pattern.jinja))
Checks that the column values match a pattern given as a regular expression.
Usage:
```
Arguments:
column: str - Column whose values will be checked against the regex.
pattern: str - Regex pattern specifying the expected shape / pattern of the values inside the column.
where: Optional[str] - Any additional filtering rules that should be applied when retrieving the data to run the check against.
message: Optional[str] - Custom error message.
```
Example:
```sql
#warn
{{ matches_pattern(column="country", pattern="^[A-Z]{2}$", where="submission_date = @submission_date", message="Oops") }}
```
## Running checks locally / Commands

Просмотреть файл

@ -87,10 +87,4 @@ FROM `{{ project_id }}.{{ dataset_id }}.{{ table_name }}`
WHERE submission_date = @submission_date;
#warn
-- Warn when a country value is neither a two-letter upper-case code nor the literal "??".
-- The alternation is grouped so that ^ and $ anchor both branches. Without the
-- parentheses the pattern accepts any value that merely starts with two upper-case
-- letters or merely ends with "??".
SELECT
  IF(
    COUNTIF(NOT REGEXP_CONTAINS(CAST(country AS STRING), r"^([A-Z]{2}|\?\?)$")) > 0,
    ERROR("Unexpected values for field country detected."),
    NULL
  )
FROM
  `{{ project_id }}.{{ dataset_id }}.{{ table_name }}`
WHERE
  submission_date = @submission_date;
-- country is checked against a pattern of exactly two upper-case ASCII letters.
{{
  matches_pattern(
    column="country",
    pattern="^[A-Z]{2}$",
    where="submission_date = @submission_date",
    message="Some values in this field do not adhere to the ISO 3166-1 specification (2 character country code, for example: DE)."
  )
}}

Просмотреть файл

@ -0,0 +1,60 @@
{{ header }}
{#
We use raw here because this file is rendered in two passes: the first pass
creates the checks.sql files, and the second pass renders the checks themselves.
For example, the header above is rendered for every checks file
when we create the checks file, i.e. when `bqetl generate glean_usage`
is called.
The second part, however, where we render checks such as is_unique() below,
is rendered when we _run_ the checks, during `bqetl query backfill`
(you can also run them locally with `bqetl check run`).
#}
{% raw -%}
#warn
-- is_unique check on client_id, limited to the submission_date being processed.
{{
  is_unique(
    ["client_id"],
    where="submission_date = @submission_date"
  )
}}
#warn
-- min_row_count check with a threshold of 1, limited to the submission_date being processed.
{{
  min_row_count(
    1,
    where="submission_date = @submission_date"
  )
}}
#warn
-- not_null check over the listed columns, limited to the submission_date being processed.
-- The marker above is written without a space so that it has the same form as the
-- marker of every other check in this file.
{{
  not_null(
    [
      "submission_date",
      "client_id",
      "sample_id",
      "first_seen_date",
      "days_seen_bits",
      "days_created_profile_bits",
      "days_seen_session_start_bits",
      "days_seen_session_end_bits"
    ],
    where="submission_date = @submission_date"
  )
}}
#warn
-- Raise an error when normalized_channel holds a value outside the allowed channel list.
SELECT
  IF(
    COUNTIF(
      normalized_channel NOT IN ("nightly", "aurora", "release", "Other", "beta", "esr")
    ) > 0,
    ERROR("Unexpected values for field normalized_channel detected."),
    NULL
  )
FROM
  `{{ project_id }}.{{ dataset_id }}.{{ table_name }}`
WHERE
  submission_date = @submission_date;
#warn
-- country is checked against a pattern of exactly two upper-case ASCII letters.
-- No message is passed, so the macro's default message is used.
{{
  matches_pattern(
    column="country",
    pattern="^[A-Z]{2}$",
    where="submission_date = @submission_date"
  )
}}
#warn
-- telemetry_sdk_build is checked against three dot-separated groups of digits.
{{
  matches_pattern(
    column="telemetry_sdk_build",
    pattern="^\d+\.\d+\.\d+$",
    where="submission_date = @submission_date",
    message="Values inside field telemetry_sdk_build do not adhere to the expected format. Example: 23.43.33"
  )
}}
#warn
-- value_length check: client_id is expected to be 36 characters long.
{{
  value_length(
    column="client_id",
    expected_length=36,
    where="submission_date = @submission_date"
  )
}}
{% endraw %}

Просмотреть файл

@ -0,0 +1,14 @@
{% macro matches_pattern(column, pattern, where, message) %}
{#
  Data check: renders a single SELECT that calls ERROR(message) when at least one
  row has a `column` value that does not match `pattern`, and yields NULL otherwise.

  Arguments:
    column: str - Column (inserted verbatim) passed to REGEXP_CONTAINS.
    pattern: str - Regular expression, inserted verbatim into a raw string literal.
    where: Optional[str] - Filter rendered as the WHERE clause. No WHERE clause is
      rendered when it is omitted or empty.
    message: Optional[str] - Error text. When omitted, a default message naming the
      column and the pattern is used.

  Note: rows where `column` is NULL are not counted as mismatches, because
  NOT REGEXP_CONTAINS(NULL, ...) is NULL and COUNTIF only counts TRUE.
#}
{#
  The default message is placed inside a normal (non-raw) SQL string literal, so
  backslashes and double quotes coming from the pattern are escaped first.
  Otherwise a pattern such as ^\d+$ would produce an invalid string literal.
  A caller-supplied message is inserted unchanged.
#}
{% set escaped_pattern = pattern|replace('\\', '\\\\')|replace('"', '\\"') %}
{% set message = message|default('Some values inside the `' ~ column ~ '` column do not match the expected pattern: `' ~ escaped_pattern ~ '`') %}
SELECT
  IF(
    COUNTIF(NOT REGEXP_CONTAINS({{ column }}, r"{{ pattern }}")) > 0,
    ERROR("{{ message }}"),
    NULL
  )
FROM
  `{{ project_id }}.{{ dataset_id }}.{{ table_name }}`
{% if where %}
WHERE
  {{ where }}
{% endif %};
{% endmacro %}