* WIP: Add checks for glean_usage

* Ignore pycache in autogenerated click cmds

* Move check to backfill command

* Remove view checks
This commit is contained in:
Frank Bertsch 2023-10-17 17:03:41 -04:00 коммит произвёл GitHub
Родитель 34c8cf35e7
Коммит 164ba19abf
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
8 изменённых файлов: 94 добавлений и 4 удалений

Просмотреть файл

@ -20,6 +20,8 @@ from ..cli.utils import (
)
from ..util.common import render as render_template
DEFAULT_MARKER = "fail"
def _build_jinja_parameters(query_args):
"""Convert the bqetl parameters to a dictionary for use by the Jinja template."""
@ -53,7 +55,7 @@ def _render_result_split_by_marker(marker, rendered_result):
for sql_statement in rendered_result:
sql_statement = sql_statement.strip()
if re.search(f"^#{marker}", sql_statement, re.IGNORECASE):
if re.search(f"#{marker}", sql_statement, re.IGNORECASE):
extracted_result.append(sql_statement)
return " ".join(extracted_result)
@ -194,7 +196,7 @@ def _render(
@click.argument("dataset")
@project_id_option()
@sql_dir_option
@click.option("--marker", default="fail", help="Marker to filter checks.")
@click.option("--marker", default=DEFAULT_MARKER, help="Marker to filter checks.")
@click.option(
"--dry_run",
"--dry-run",
@ -233,7 +235,7 @@ def _run_check(
dataset_id,
table,
query_arguments,
marker,
marker=DEFAULT_MARKER,
dry_run=False,
):
"""Run the check."""
@ -256,12 +258,13 @@ def _run_check(
**{"dataset_id": dataset_id, "table_name": table},
**parameters,
}
if "format" not in jinja_params:
jinja_params["format"] = False
rendered_result = render_template(
checks_file.name,
template_folder=str(checks_file.parent),
templates_dir="",
format=False,
**jinja_params,
)
result_split_by_marker = _render_result_split_by_marker(marker, rendered_result)

Просмотреть файл

@ -19,6 +19,9 @@ def generate_group():
generator_path = ROOT / SQL_GENERATORS_DIR
for path in generator_path.iterdir():
if "__pycache__" in path.parts:
# Ignore pycache subdirectories
continue
if path.is_dir():
# get Python modules for generators
spec = importlib.util.spec_from_file_location(

Просмотреть файл

@ -25,6 +25,7 @@ from google.cloud import bigquery
from google.cloud.exceptions import NotFound
from ..backfill.utils import QUALIFIED_TABLE_NAME_RE, qualified_table_name_matching
from ..cli import check
from ..cli.format import format
from ..cli.utils import (
is_authenticated,
@ -567,6 +568,21 @@ def _backfill_query(
query_arguments=arguments,
)
# Run checks on the query
checks_file = query_file_path.parent / "checks.sql"
if checks_file.exists():
table_name = checks_file.parent.name
# `arguments` also carries options such as format, which must not be passed
# on to the check, so forward only the entries starting with "--parameter"
check_args = [qa for qa in arguments if qa.startswith("--parameter")]
check._run_check(
checks_file=checks_file,
project_id=project_id,
dataset_id=dataset,
table=table_name,
query_arguments=check_args,
dry_run=dry_run,
)
else:
click.echo(
f"Skip {query_file_path} with @{date_partition_parameter}={backfill_date}"

Просмотреть файл

@ -0,0 +1 @@
"""See https://mypy.readthedocs.io/en/stable/running_mypy.html#mapping-file-paths-to-modules."""

Просмотреть файл

@ -186,6 +186,7 @@ class GleanTable:
init_filename = f"{self.target_table_id}.init.sql"
query_filename = f"{self.target_table_id}.query.sql"
checks_filename = f"{self.target_table_id}.checks.sql"
view_filename = f"{self.target_table_id[:-3]}.view.sql"
view_metadata_filename = f"{self.target_table_id[:-3]}.metadata.yaml"
table_metadata_filename = f"{self.target_table_id}.metadata.yaml"
@ -208,6 +209,7 @@ class GleanTable:
view_sql = render(
view_filename, template_folder=PATH / "templates", **render_kwargs
)
view_metadata = render(
view_metadata_filename,
template_folder=PATH / "templates",
@ -221,6 +223,14 @@ class GleanTable:
**render_kwargs,
)
# Checks are optional, for now!
try:
checks_sql = render(
checks_filename, template_folder=PATH / "templates", **render_kwargs
)
except TemplateNotFound:
checks_sql = None
if not self.no_init:
try:
init_sql = render(
@ -254,6 +264,9 @@ class GleanTable:
if not self.no_init:
artifacts.append(Artifact(table, "init.sql", init_sql))
if checks_sql:
artifacts.append(Artifact(table, "checks.sql", checks_sql))
for artifact in artifacts:
destination = (
get_table_dir(output_dir, artifact.table_id) / artifact.basename

Просмотреть файл

@ -0,0 +1,18 @@
{{ header }}
#fail
{#
We wrap the check below in a raw block because this template is rendered twice.
The first pass runs when `bqetl generate glean_usage` is called and creates
the checks.sql files; the header above is rendered in that pass, for every
checks file.
The second pass runs when we _run_ the check, during `bqetl query backfill`
(you can also run checks locally with `bqetl check run`); the is_unique()
call below is rendered in that pass, so it has to come through the first
pass unrendered.
#}
{% raw -%}
{{ is_unique(["client_id"], "submission_date = @submission_date") }}
{% endraw %}

Просмотреть файл

@ -0,0 +1,18 @@
{{ header }}
#fail
{#
We wrap the check below in a raw block because this template is rendered twice.
The first pass runs when `bqetl generate glean_usage` is called and creates
the checks.sql files; the header above is rendered in that pass, for every
checks file.
The second pass runs when we _run_ the check, during `bqetl query backfill`
(you can also run checks locally with `bqetl check run`); the is_unique()
call below is rendered in that pass, so it has to come through the first
pass unrendered.
#}
{% raw -%}
{{ is_unique(["client_id"]) }}
{% endraw %}

Просмотреть файл

@ -0,0 +1,18 @@
{{ header }}
#fail
{#
We wrap the check below in a raw block because this template is rendered twice.
The first pass runs when `bqetl generate glean_usage` is called and creates
the checks.sql files; the header above is rendered in that pass, for every
checks file.
The second pass runs when we _run_ the check, during `bqetl query backfill`
(you can also run checks locally with `bqetl check run`); the is_unique()
call below is rendered in that pass, so it has to come through the first
pass unrendered.
#}
{% raw -%}
{{ is_unique(["client_id"], "submission_date = @submission_date") }}
{% endraw %}