Glean usage checks (#4445)
* WIP: Add checks for glean_usage * Ignore pycache in autogenerated click cmds * Move check to backfill command * Remove view checks
This commit is contained in:
Parent
34c8cf35e7
Commit
164ba19abf
|
@ -20,6 +20,8 @@ from ..cli.utils import (
|
|||
)
|
||||
from ..util.common import render as render_template
|
||||
|
||||
DEFAULT_MARKER = "fail"
|
||||
|
||||
|
||||
def _build_jinja_parameters(query_args):
|
||||
"""Convert the bqetl parameters to a dictionary for use by the Jinja template."""
|
||||
|
@ -53,7 +55,7 @@ def _render_result_split_by_marker(marker, rendered_result):
|
|||
|
||||
for sql_statement in rendered_result:
|
||||
sql_statement = sql_statement.strip()
|
||||
if re.search(f"^#{marker}", sql_statement, re.IGNORECASE):
|
||||
if re.search(f"#{marker}", sql_statement, re.IGNORECASE):
|
||||
extracted_result.append(sql_statement)
|
||||
return " ".join(extracted_result)
|
||||
|
||||
|
@ -194,7 +196,7 @@ def _render(
|
|||
@click.argument("dataset")
|
||||
@project_id_option()
|
||||
@sql_dir_option
|
||||
@click.option("--marker", default="fail", help="Marker to filter checks.")
|
||||
@click.option("--marker", default=DEFAULT_MARKER, help="Marker to filter checks.")
|
||||
@click.option(
|
||||
"--dry_run",
|
||||
"--dry-run",
|
||||
|
@ -233,7 +235,7 @@ def _run_check(
|
|||
dataset_id,
|
||||
table,
|
||||
query_arguments,
|
||||
marker,
|
||||
marker=DEFAULT_MARKER,
|
||||
dry_run=False,
|
||||
):
|
||||
"""Run the check."""
|
||||
|
@ -256,12 +258,13 @@ def _run_check(
|
|||
**{"dataset_id": dataset_id, "table_name": table},
|
||||
**parameters,
|
||||
}
|
||||
if "format" not in jinja_params:
|
||||
jinja_params["format"] = False
|
||||
|
||||
rendered_result = render_template(
|
||||
checks_file.name,
|
||||
template_folder=str(checks_file.parent),
|
||||
templates_dir="",
|
||||
format=False,
|
||||
**jinja_params,
|
||||
)
|
||||
result_split_by_marker = _render_result_split_by_marker(marker, rendered_result)
|
||||
|
|
|
@ -19,6 +19,9 @@ def generate_group():
|
|||
generator_path = ROOT / SQL_GENERATORS_DIR
|
||||
|
||||
for path in generator_path.iterdir():
|
||||
if "__pycache__" in path.parts:
|
||||
# Ignore pycache subdirectories
|
||||
continue
|
||||
if path.is_dir():
|
||||
# get Python modules for generators
|
||||
spec = importlib.util.spec_from_file_location(
|
||||
|
|
|
@ -25,6 +25,7 @@ from google.cloud import bigquery
|
|||
from google.cloud.exceptions import NotFound
|
||||
|
||||
from ..backfill.utils import QUALIFIED_TABLE_NAME_RE, qualified_table_name_matching
|
||||
from ..cli import check
|
||||
from ..cli.format import format
|
||||
from ..cli.utils import (
|
||||
is_authenticated,
|
||||
|
@ -567,6 +568,21 @@ def _backfill_query(
|
|||
query_arguments=arguments,
|
||||
)
|
||||
|
||||
# Run checks on the query
|
||||
checks_file = query_file_path.parent / "checks.sql"
|
||||
if checks_file.exists():
|
||||
table_name = checks_file.parent.name
|
||||
# query_args have things like format, which we don't want to push
|
||||
# to the check; so we just take the query parameters
|
||||
check_args = [qa for qa in arguments if qa.startswith("--parameter")]
|
||||
check._run_check(
|
||||
checks_file=checks_file,
|
||||
project_id=project_id,
|
||||
dataset_id=dataset,
|
||||
table=table_name,
|
||||
query_arguments=check_args,
|
||||
dry_run=dry_run,
|
||||
)
|
||||
else:
|
||||
click.echo(
|
||||
f"Skip {query_file_path} with @{date_partition_parameter}={backfill_date}"
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
"""See https://mypy.readthedocs.io/en/stable/running_mypy.html#mapping-file-paths-to-modules."""
|
|
@ -186,6 +186,7 @@ class GleanTable:
|
|||
|
||||
init_filename = f"{self.target_table_id}.init.sql"
|
||||
query_filename = f"{self.target_table_id}.query.sql"
|
||||
checks_filename = f"{self.target_table_id}.checks.sql"
|
||||
view_filename = f"{self.target_table_id[:-3]}.view.sql"
|
||||
view_metadata_filename = f"{self.target_table_id[:-3]}.metadata.yaml"
|
||||
table_metadata_filename = f"{self.target_table_id}.metadata.yaml"
|
||||
|
@ -208,6 +209,7 @@ class GleanTable:
|
|||
view_sql = render(
|
||||
view_filename, template_folder=PATH / "templates", **render_kwargs
|
||||
)
|
||||
|
||||
view_metadata = render(
|
||||
view_metadata_filename,
|
||||
template_folder=PATH / "templates",
|
||||
|
@ -221,6 +223,14 @@ class GleanTable:
|
|||
**render_kwargs,
|
||||
)
|
||||
|
||||
# Checks are optional, for now!
|
||||
try:
|
||||
checks_sql = render(
|
||||
checks_filename, template_folder=PATH / "templates", **render_kwargs
|
||||
)
|
||||
except TemplateNotFound:
|
||||
checks_sql = None
|
||||
|
||||
if not self.no_init:
|
||||
try:
|
||||
init_sql = render(
|
||||
|
@ -254,6 +264,9 @@ class GleanTable:
|
|||
if not self.no_init:
|
||||
artifacts.append(Artifact(table, "init.sql", init_sql))
|
||||
|
||||
if checks_sql:
|
||||
artifacts.append(Artifact(table, "checks.sql", checks_sql))
|
||||
|
||||
for artifact in artifacts:
|
||||
destination = (
|
||||
get_table_dir(output_dir, artifact.table_id) / artifact.basename
|
||||
|
|
|
@ -0,0 +1,18 @@
|
|||
{{ header }}
|
||||
|
||||
#fail
|
||||
{#
|
||||
We use raw here because the first pass is rendered to create the checks.sql
|
||||
files, and the second pass is rendering of the checks themselves.
|
||||
|
||||
For example, the header above is rendered for every checks file
|
||||
when we create the checks file, when `bqetl generate glean_usage`
|
||||
is called.
|
||||
|
||||
However the second part, where we render the check is_unique() below,
|
||||
is rendered when we _run_ the check, during `bqetl query backfill`
|
||||
(you can also run them locally with `bqetl check run`).
|
||||
#}
|
||||
{% raw -%}
|
||||
{{ is_unique(["client_id"], "submission_date = @submission_date") }}
|
||||
{% endraw %}
|
|
@ -0,0 +1,18 @@
|
|||
{{ header }}
|
||||
|
||||
#fail
|
||||
{#
|
||||
We use raw here because the first pass is rendered to create the checks.sql
|
||||
files, and the second pass is rendering of the checks themselves.
|
||||
|
||||
For example, the header above is rendered for every checks file
|
||||
when we create the checks file, when `bqetl generate glean_usage`
|
||||
is called.
|
||||
|
||||
However the second part, where we render the check is_unique() below,
|
||||
is rendered when we _run_ the check, during `bqetl query backfill`
|
||||
(you can also run them locally with `bqetl check run`).
|
||||
#}
|
||||
{% raw -%}
|
||||
{{ is_unique(["client_id"]) }}
|
||||
{% endraw %}
|
|
@ -0,0 +1,18 @@
|
|||
{{ header }}
|
||||
|
||||
#fail
|
||||
{#
|
||||
We use raw here because the first pass is rendered to create the checks.sql
|
||||
files, and the second pass is rendering of the checks themselves.
|
||||
|
||||
For example, the header above is rendered for every checks file
|
||||
when we create the checks file, when `bqetl generate glean_usage`
|
||||
is called.
|
||||
|
||||
However the second part, where we render the check is_unique() below,
|
||||
is rendered when we _run_ the check, during `bqetl query backfill`
|
||||
(you can also run them locally with `bqetl check run`).
|
||||
#}
|
||||
{% raw -%}
|
||||
{{ is_unique(["client_id"], "submission_date = @submission_date") }}
|
||||
{% endraw %}
|
Loading…
Open link in a new issue