From 164ba19abf6d8b80183dde535c99c9af0ad8ea67 Mon Sep 17 00:00:00 2001
From: Frank Bertsch
Date: Tue, 17 Oct 2023 17:03:41 -0400
Subject: [PATCH] Glean usage checks (#4445)

* WIP: Add checks for glean_usage
* Ignore pycache in autogenerated click cmds
* Move check to backfill command
* Remove view checks
---
 bigquery_etl/cli/check.py                     | 11 +++++++----
 bigquery_etl/cli/generate.py                  |  3 +++
 bigquery_etl/cli/query.py                     | 16 ++++++++++++++++
 sql_generators/__init__.py                    |  1 +
 sql_generators/glean_usage/common.py          | 13 +++++++++++++
 .../baseline_clients_daily_v1.checks.sql      | 18 ++++++++++++++++++
 .../baseline_clients_first_seen_v1.checks.sql | 18 ++++++++++++++++++
 .../baseline_clients_last_seen_v1.checks.sql  | 18 ++++++++++++++++++
 8 files changed, 94 insertions(+), 4 deletions(-)
 create mode 100644 sql_generators/__init__.py
 create mode 100644 sql_generators/glean_usage/templates/baseline_clients_daily_v1.checks.sql
 create mode 100644 sql_generators/glean_usage/templates/baseline_clients_first_seen_v1.checks.sql
 create mode 100644 sql_generators/glean_usage/templates/baseline_clients_last_seen_v1.checks.sql

diff --git a/bigquery_etl/cli/check.py b/bigquery_etl/cli/check.py
index 7d87cc8238..3278fca663 100644
--- a/bigquery_etl/cli/check.py
+++ b/bigquery_etl/cli/check.py
@@ -20,6 +20,8 @@ from ..cli.utils import (
 )
 from ..util.common import render as render_template

+DEFAULT_MARKER = "fail"
+

 def _build_jinja_parameters(query_args):
     """Convert the bqetl parameters to a dictionary for use by the Jinja template."""
@@ -53,7 +55,7 @@ def _render_result_split_by_marker(marker, rendered_result):
     for sql_statement in rendered_result:
         sql_statement = sql_statement.strip()

-        if re.search(f"^#{marker}", sql_statement, re.IGNORECASE):
+        if re.search(f"#{marker}", sql_statement, re.IGNORECASE):
             extracted_result.append(sql_statement)

     return " ".join(extracted_result)
@@ -194,7 +196,7 @@ def _render(
 @click.argument("dataset")
 @project_id_option()
 @sql_dir_option
-@click.option("--marker", default="fail", help="Marker to filter checks.")
+@click.option("--marker", default=DEFAULT_MARKER, help="Marker to filter checks.")
 @click.option(
     "--dry_run",
     "--dry-run",
@@ -233,7 +235,7 @@ def _run_check(
     dataset_id,
     table,
     query_arguments,
-    marker,
+    marker=DEFAULT_MARKER,
     dry_run=False,
 ):
     """Run the check."""
@@ -256,12 +258,13 @@ def _run_check(
         **{"dataset_id": dataset_id, "table_name": table},
         **parameters,
     }
+    if "format" not in jinja_params:
+        jinja_params["format"] = False

     rendered_result = render_template(
         checks_file.name,
         template_folder=str(checks_file.parent),
         templates_dir="",
-        format=False,
         **jinja_params,
     )
     result_split_by_marker = _render_result_split_by_marker(marker, rendered_result)
diff --git a/bigquery_etl/cli/generate.py b/bigquery_etl/cli/generate.py
index 351391eda9..8223b71940 100644
--- a/bigquery_etl/cli/generate.py
+++ b/bigquery_etl/cli/generate.py
@@ -19,6 +19,9 @@ def generate_group():
     generator_path = ROOT / SQL_GENERATORS_DIR

     for path in generator_path.iterdir():
+        if "__pycache__" in path.parts:
+            # Ignore pycache subdirectories
+            continue
         if path.is_dir():
             # get Python modules for generators
             spec = importlib.util.spec_from_file_location(
diff --git a/bigquery_etl/cli/query.py b/bigquery_etl/cli/query.py
index d431b6beac..3fc241e35e 100644
--- a/bigquery_etl/cli/query.py
+++ b/bigquery_etl/cli/query.py
@@ -25,6 +25,7 @@ from google.cloud import bigquery
 from google.cloud.exceptions import NotFound

 from ..backfill.utils import QUALIFIED_TABLE_NAME_RE, qualified_table_name_matching
+from ..cli import check
 from ..cli.format import format
 from ..cli.utils import (
     is_authenticated,
@@ -567,6 +568,21 @@ def _backfill_query(
             query_arguments=arguments,
         )

+        # Run checks on the query
+        checks_file = query_file_path.parent / "checks.sql"
+        if checks_file.exists():
+            table_name = checks_file.parent.name
+            # query_args have things like format, which we don't want to push
+            # to the check; so we just take the query parameters
+            check_args = [qa for qa in arguments if qa.startswith("--parameter")]
+            check._run_check(
+                checks_file=checks_file,
+                project_id=project_id,
+                dataset_id=dataset,
+                table=table_name,
+                query_arguments=check_args,
+                dry_run=dry_run,
+            )
     else:
         click.echo(
             f"Skip {query_file_path} with @{date_partition_parameter}={backfill_date}"
diff --git a/sql_generators/__init__.py b/sql_generators/__init__.py
new file mode 100644
index 0000000000..d3f8761e8f
--- /dev/null
+++ b/sql_generators/__init__.py
@@ -0,0 +1 @@
+"""See https://mypy.readthedocs.io/en/stable/running_mypy.html#mapping-file-paths-to-modules."""
diff --git a/sql_generators/glean_usage/common.py b/sql_generators/glean_usage/common.py
index b314c52690..f0c4f82531 100644
--- a/sql_generators/glean_usage/common.py
+++ b/sql_generators/glean_usage/common.py
@@ -186,6 +186,7 @@ class GleanTable:
         init_filename = f"{self.target_table_id}.init.sql"
         query_filename = f"{self.target_table_id}.query.sql"
+        checks_filename = f"{self.target_table_id}.checks.sql"
         view_filename = f"{self.target_table_id[:-3]}.view.sql"
         view_metadata_filename = f"{self.target_table_id[:-3]}.metadata.yaml"
         table_metadata_filename = f"{self.target_table_id}.metadata.yaml"

@@ -208,6 +209,7 @@ class GleanTable:
         view_sql = render(
             view_filename, template_folder=PATH / "templates", **render_kwargs
         )
+
         view_metadata = render(
             view_metadata_filename,
             template_folder=PATH / "templates",
@@ -221,6 +223,14 @@
             **render_kwargs,
         )

+        # Checks are optional, for now!
+        try:
+            checks_sql = render(
+                checks_filename, template_folder=PATH / "templates", **render_kwargs
+            )
+        except TemplateNotFound:
+            checks_sql = None
+
         if not self.no_init:
             try:
                 init_sql = render(
@@ -254,6 +264,9 @@ class GleanTable:
         if not self.no_init:
             artifacts.append(Artifact(table, "init.sql", init_sql))

+        if checks_sql:
+            artifacts.append(Artifact(table, "checks.sql", checks_sql))
+
         for artifact in artifacts:
             destination = (
                 get_table_dir(output_dir, artifact.table_id) / artifact.basename
diff --git a/sql_generators/glean_usage/templates/baseline_clients_daily_v1.checks.sql b/sql_generators/glean_usage/templates/baseline_clients_daily_v1.checks.sql
new file mode 100644
index 0000000000..91e97f0956
--- /dev/null
+++ b/sql_generators/glean_usage/templates/baseline_clients_daily_v1.checks.sql
@@ -0,0 +1,18 @@
+{{ header }}
+
+#fail
+{#
+  We use raw here b/c the first pass is rendered to create the checks.sql
+  files, and the second pass is rendering of the checks themselves.
+
+  For example, the header above is rendered for every checks file
+  when we create the checks file, when `bqetl generate glean_usage`
+  is called.
+
+  However the second part, where we render the check is_unique() below,
+  is rendered when we _run_ the check, during `bqetl query backfill`
+  (you can also run them locally with `bqetl check run`).
+#}
+{% raw -%}
+  {{ is_unique(["client_id"], "submission_date = @submission_date") }}
+{% endraw %}
diff --git a/sql_generators/glean_usage/templates/baseline_clients_first_seen_v1.checks.sql b/sql_generators/glean_usage/templates/baseline_clients_first_seen_v1.checks.sql
new file mode 100644
index 0000000000..0a8ba597cf
--- /dev/null
+++ b/sql_generators/glean_usage/templates/baseline_clients_first_seen_v1.checks.sql
@@ -0,0 +1,18 @@
+{{ header }}
+
+#fail
+{#
+  We use raw here b/c the first pass is rendered to create the checks.sql
+  files, and the second pass is rendering of the checks themselves.
+
+  For example, the header above is rendered for every checks file
+  when we create the checks file, when `bqetl generate glean_usage`
+  is called.
+
+  However the second part, where we render the check is_unique() below,
+  is rendered when we _run_ the check, during `bqetl query backfill`
+  (you can also run them locally with `bqetl check run`).
+#}
+{% raw -%}
+  {{ is_unique(["client_id"]) }}
+{% endraw %}
diff --git a/sql_generators/glean_usage/templates/baseline_clients_last_seen_v1.checks.sql b/sql_generators/glean_usage/templates/baseline_clients_last_seen_v1.checks.sql
new file mode 100644
index 0000000000..91e97f0956
--- /dev/null
+++ b/sql_generators/glean_usage/templates/baseline_clients_last_seen_v1.checks.sql
@@ -0,0 +1,18 @@
+{{ header }}
+
+#fail
+{#
+  We use raw here b/c the first pass is rendered to create the checks.sql
+  files, and the second pass is rendering of the checks themselves.
+
+  For example, the header above is rendered for every checks file
+  when we create the checks file, when `bqetl generate glean_usage`
+  is called.
+
+  However the second part, where we render the check is_unique() below,
+  is rendered when we _run_ the check, during `bqetl query backfill`
+  (you can also run them locally with `bqetl check run`).
+#}
+{% raw -%}
+  {{ is_unique(["client_id"], "submission_date = @submission_date") }}
+{% endraw %}
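
A note on the two-pass rendering the template comments describe. The sketch below uses jinja2 directly rather than bqetl's own render()/render_template() helpers, and the header string, target table, and the is_unique() stub are illustrative stand-ins, not the real generator context or check macros.

# Minimal two-pass rendering sketch; assumes only the jinja2 package.
from jinja2 import Environment

CHECKS_TEMPLATE = """{{ header }}

#fail
{% raw -%}
  {{ is_unique(["client_id"], "submission_date = @submission_date") }}
{% endraw %}"""

env = Environment()

# Pass 1: `bqetl generate glean_usage` time. The header is substituted, while the
# {% raw %} block writes the is_unique() call into checks.sql as literal text.
checks_sql = env.from_string(CHECKS_TEMPLATE).render(
    header="-- Generated via sql_generators"
)

# Pass 2: check-run time (`bqetl query backfill` or `bqetl check run`). The generated
# checks.sql is itself a Jinja template; a stub stands in for the real is_unique() macro.
def is_unique(columns, where=None):
    cols = ", ".join(columns)
    clause = f" WHERE {where}" if where else ""
    return (
        "SELECT IF(COUNT(*) > 0, ERROR('duplicates found'), NULL) FROM ("
        f"SELECT {cols} FROM baseline_clients_daily_v1{clause} "
        f"GROUP BY {cols} HAVING COUNT(*) > 1)"
    )

print(env.from_string(checks_sql).render(is_unique=is_unique))

In the patch itself, the first pass happens when common.py renders the checks template and appends it as a checks.sql Artifact, and the second when check._run_check() renders the generated file before filtering statements by marker.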
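
Similarly, a small illustration of the marker filtering in check.py, mirroring _render_result_split_by_marker. The sample statements are fabricated, and whether real rendered output carries the header ahead of its #fail marker depends on how render_template splits statements; the point is only the behavior of the unanchored pattern.

# Sketch of marker-based filtering over already-rendered SQL statements.
import re

def keep_marked(marker, sql_statements):
    kept = []
    for stmt in sql_statements:
        stmt = stmt.strip()
        # Unanchored search: matches even when the marker is not the first
        # character of the statement, unlike re.search(f"^#{marker}", ...).
        if re.search(f"#{marker}", stmt, re.IGNORECASE):
            kept.append(stmt)
    return " ".join(kept)

statements = [
    "-- Generated via sql_generators\n#fail\nSELECT ERROR('duplicates found')",
    "#warn\nSELECT ERROR('low volume')",
]
print(keep_marked("fail", statements))  # keeps only the #fail-marked statement

With the marker left at DEFAULT_MARKER ("fail"), only the #fail-tagged statements survive this filter when _run_check is invoked from the backfill path.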