feat(DENG-2175): added matches_pattern etl checks macro and updated tests to use it (#4683)
* added matches_pattern etl checks macro and updated tests to use it * added matches_pattern etl check macro to the data checks docs
This commit is contained in:
Родитель
cbff1eec17
Коммит
34d6a463dc
|
@ -188,14 +188,14 @@ Example:
|
|||
|
||||
### value_length([source](../../tests/checks/value_length.jinja))
|
||||
|
||||
Checks that the columns have values of specific character length.
|
||||
Checks that the column has values of specific character length.
|
||||
|
||||
Usage:
|
||||
|
||||
```
|
||||
Arguments:
|
||||
|
||||
columns: List[str] - Columns which will be checked against the `expected_length`.
|
||||
column: str - Column which will be checked against the `expected_length`.
|
||||
expected_length: int - Describes the expected character length of the value inside the specified column.
|
||||
where: Optional[str] - Any additional filtering rules that should be applied when retrieving the data to run the check against.
|
||||
```
|
||||
|
@ -206,6 +206,27 @@ Example:
|
|||
{{ value_length(column="country", expected_length=2, where="submission_date = @submission_date") }}
|
||||
```
|
||||
|
||||
### matches_pattern([source](../../tests/checks/matches_pattern.jinja))
|
||||
|
||||
Checks that the column values adhere to a pattern defined by a regular expression.
|
||||
|
||||
Usage:
|
||||
|
||||
```
|
||||
Arguments:
|
||||
|
||||
column: str - Column whose values will be checked against the regex.
|
||||
pattern: str - Regex pattern specifying the expected shape / pattern of the values inside the column.
|
||||
where: Optional[str] - Any additional filtering rules that should be applied when retrieving the data to run the check against.
|
||||
message: Optional[str] - Custom error message. When omitted, a default message naming the column and the pattern is used.
|
||||
```
|
||||
|
||||
Example:
|
||||
```sql
|
||||
#warn
|
||||
{{ matches_pattern(column="country", pattern="^[A-Z]{2}$", where="submission_date = @submission_date", message="Oops") }}
|
||||
```
|
||||
|
||||
|
||||
## Running checks locally / Commands
|
||||
|
||||
|
|
|
@ -87,10 +87,4 @@ FROM `{{ project_id }}.{{ dataset_id }}.{{ table_name }}`
|
|||
WHERE submission_date = @submission_date;
|
||||
|
||||
#warn
|
||||
SELECT IF(
|
||||
COUNTIF(NOT REGEXP_CONTAINS(CAST(country AS STRING), r"^[A-Z]{2}|\?\?$")) > 0,
|
||||
ERROR("Unexpected values for field normalized_channel detected."),
|
||||
null
|
||||
)
|
||||
FROM `{{ project_id }}.{{ dataset_id }}.{{ table_name }}`
|
||||
WHERE submission_date = @submission_date;
|
||||
{{ matches_pattern(column="country", pattern="^[A-Z]{2}$", where="submission_date = @submission_date", message="Some values in this field do not adhere to the ISO 3166-1 specification (2 character country code, for example: DE).") }}
|
||||
|
|
|
@ -0,0 +1,60 @@
|
|||
{{ header }}
|
||||
|
||||
{#
|
||||
We use raw here b/c the first pass is rendered to create the checks.sql
|
||||
files, and the second pass is rendering of the checks themselves.
|
||||
|
||||
For example, the header above is rendered for every checks file
|
||||
when we create the checks file, when `bqetl generate glean_usage`
|
||||
is called.
|
||||
|
||||
However the second part, where we render the check is_unique() below,
|
||||
is rendered when we _run_ the check, during `bqetl query backfill`
|
||||
(you can also run them locally with `bqetl check run`).
|
||||
#}
|
||||
{% raw -%}
|
||||
#warn
|
||||
{{ is_unique(["client_id"], where="submission_date = @submission_date") }}
|
||||
|
||||
#warn
|
||||
{{ min_row_count(1, where="submission_date = @submission_date") }}
|
||||
|
||||
#warn
|
||||
{{ not_null([
|
||||
"submission_date",
|
||||
"client_id",
|
||||
"sample_id",
|
||||
"first_seen_date",
|
||||
"days_seen_bits",
|
||||
"days_created_profile_bits",
|
||||
"days_seen_session_start_bits",
|
||||
"days_seen_session_end_bits"
|
||||
], where="submission_date = @submission_date") }}
|
||||
|
||||
#warn
|
||||
SELECT IF(
|
||||
COUNTIF(normalized_channel NOT IN (
|
||||
"nightly",
|
||||
"aurora",
|
||||
"release",
|
||||
"Other",
|
||||
"beta",
|
||||
"esr"
|
||||
)) > 0,
|
||||
ERROR("Unexpected values for field normalized_channel detected."),
|
||||
null
|
||||
)
|
||||
FROM `{{ project_id }}.{{ dataset_id }}.{{ table_name }}`
|
||||
WHERE submission_date = @submission_date;
|
||||
|
||||
#warn
|
||||
{{ matches_pattern(column="country", pattern="^[A-Z]{2}$", where="submission_date = @submission_date") }}
|
||||
|
||||
#warn
|
||||
{{ matches_pattern(column="telemetry_sdk_build", pattern="^\d+\.\d+\.\d+$", where="submission_date = @submission_date", message="Values inside field telemetry_sdk_build do not adhere to the expected format. Example: 23.43.33") }}
|
||||
|
||||
#warn
|
||||
{{ value_length(column="client_id", expected_length=36, where="submission_date = @submission_date") }}
|
||||
|
||||
{% endraw %}
|
||||
|
|
@ -0,0 +1,14 @@
|
|||
{#
  matches_pattern: data check asserting that the values of `column` match a regex.

  Renders one BigQuery statement that raises ERROR(message) when at least one
  row (optionally restricted by `where`) has a `column` value for which
  REGEXP_CONTAINS(column, r"pattern") is FALSE.

  Arguments:
    column:  name of the column to validate.
    pattern: regular expression; embedded verbatim into a raw string literal
             (r"..."), so regex backslashes need no extra escaping.
    where:   optional filter; rendered as a WHERE clause only when truthy.
    message: optional custom error message; embedded verbatim into a regular
             (non-raw) string literal, so the caller must keep it a valid
             SQL string (no bare backslashes or unescaped double quotes).

  NOTE: NULL values are not reported as mismatches. REGEXP_CONTAINS returns
  NULL for them and COUNTIF only counts rows where the condition is TRUE.
#}
{% macro matches_pattern(column, pattern, where, message) %}
  {#
    The fallback message quotes `pattern` inside a regular (non-raw) SQL
    string literal, where a bare regex backslash such as `\d` or `\.` is an
    illegal escape sequence. Double the backslashes so the literal stays
    valid and the error text still shows the pattern as written.
  #}
  {% set message = message|default(
    'Some values inside the `' ~ column ~ '` column do not match the expected pattern: `'
    ~ (pattern|replace('\\', '\\\\')) ~ '`'
  ) %}

  SELECT
    IF(
      COUNTIF(NOT REGEXP_CONTAINS({{ column }}, r"{{ pattern }}")) > 0,
      ERROR("{{ message }}"),
      NULL
    )
  FROM `{{ project_id }}.{{ dataset_id }}.{{ table_name }}`
  {% if where %}
  WHERE {{ where }}
  {% endif %};
{% endmacro %}
|
Загрузка…
Ссылка в новой задаче