Make schema validation part of dryrun (#2069)
This commit is contained in:
Parent: 13fc30f169
Commit: 3c8894fdf1
@@ -62,6 +62,8 @@ jobs:
     docker: *docker
     steps:
       - checkout
+      - *restore_venv_cache
+      - *build
       - run:
          name: Dry run queries
          # yamllint disable rule:line-length
@@ -80,7 +82,7 @@ jobs:
              PATHS="$(git diff origin/main... --name-only --diff-filter=d -- sql)"
            fi
            echo $PATHS
-            script/dryrun $PATHS
+            PATH="venv/bin:$PATH" script/dryrun --validate-schemas $PATHS
          # yamllint enable rule:line-length
  validate-metadata:
    docker: *docker
@@ -230,17 +232,6 @@ jobs:
      - run:
          name: Validate views
          command: PATH="venv/bin:$PATH" script/validate_views
-  validate-schemas:
-    docker: *docker
-    steps:
-      - checkout
-      - *restore_venv_cache
-      - *build
-      - run:
-          name: Validate query schemas
-          command: |
-            ./bqetl bootstrap
-            ./bqetl query schema validate "*"
  docs:
    docker: *docker
    steps:
@@ -356,7 +347,6 @@ workflows:
      - verify-requirements
      - dry-run-sql
      - validate-metadata
-      - validate-schemas
      - integration
      - validate-dags
      - verify-dags-up-to-date
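Note: the hunks above appear to come from the CircleCI configuration (most likely .circleci/config.yml). The dry-run-sql job now restores the cached virtualenv, builds it, and runs script/dryrun with the new --validate-schemas flag, which lets the standalone validate-schemas job and its workflow entry be removed. The equivalent local check would presumably be the same command CI runs, e.g. PATH="venv/bin:$PATH" script/dryrun --validate-schemas sql/.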
@@ -1,9 +1,12 @@
 """bigquery-etl CLI dryrun command."""

+import fnmatch
+import glob
 import os
+import re
 import sys
 from multiprocessing.pool import ThreadPool
-from pathlib import Path
+from typing import Set

 import click
 from google.cloud import bigquery
@@ -27,8 +30,8 @@ from ..dryrun import SKIP, DryRun
     """,
 )
 @click.argument(
-    "path",
-    default="sql/",
+    "paths",
+    nargs=-1,
     type=click.Path(file_okay=True),
 )
 @click.option(
@@ -42,61 +45,60 @@ from ..dryrun import SKIP, DryRun
     type=bool,
     default=True,
 )
+@click.option(
+    "--validate_schemas",
+    "--validate-schemas",
+    help="Require dry run schema to match destination table and file if present.",
+    is_flag=True,
+    default=False,
+)
 @click.option(
     "--project",
     help="GCP project to perform dry run in when --use_cloud_function=False",
     default="moz-fx-data-shared-prod",
 )
-def dryrun(path, use_cloud_function, project):
+def dryrun(paths, use_cloud_function, validate_schemas, project):
     """Perform a dry run."""
-    if os.path.isdir(path) and os.path.exists(path):
-        sql_files = [f for f in Path(path).rglob("*.sql") if str(f) not in SKIP]
-    elif os.path.isfile(path) and os.path.exists(path):
-        sql_files = [path]
+    file_names = ("query.sql", "view.sql", "part*.sql", "init.sql")
+    file_re = re.compile("|".join(map(fnmatch.translate, file_names)))
+
+    sql_files: Set[str] = set()
+    for path in paths:
+        if os.path.isdir(path):
+            sql_files |= {
+                sql_file
+                for pattern in file_names
+                for sql_file in glob.glob(f"{path}/**/{pattern}", recursive=True)
+            }
+        elif os.path.isfile(path):
+            if file_re.fullmatch(os.path.basename(path)):
+                sql_files.add(path)
         else:
             click.echo(f"Invalid path {path}", err=True)
             sys.exit(1)
+    sql_files -= SKIP
+
+    if not sql_files:
+        print("Skipping dry run because no queries matched")
+        sys.exit(0)

     if use_cloud_function:
-        def cloud_function_dryrun(sqlfile):
-            """Dry run SQL files."""
-            return DryRun(sqlfile).is_valid()
-
-        sql_file_valid = cloud_function_dryrun
+        client = None
     else:
         if not is_authenticated():
             click.echo("Not authenticated to GCP. Run `gcloud auth login` to login.")
             sys.exit(1)
+        client = bigquery.Client(project=project)

-        client = bigquery.Client()
-
-        def gcp_dryrun(sqlfile):
-            """Dry run the SQL file."""
-            dataset = Path(sqlfile).parent.parent.name
-            job_config = bigquery.QueryJobConfig(
-                dry_run=True,
-                use_query_cache=False,
-                default_dataset=f"{project}.{dataset}",
-                query_parameters=[
-                    bigquery.ScalarQueryParameter(
-                        "submission_date", "DATE", "2019-01-01"
-                    )
-                ],
-            )
-
-            with open(sqlfile) as query_stream:
-                query = query_stream.read()
-
-            try:
-                client.query(query, job_config=job_config)
-                click.echo(f"{sqlfile:59} OK")
-                return True
-            except Exception as e:
-                click.echo(f"{sqlfile:59} ERROR: {e}")
-                return False
-
-        sql_file_valid = gcp_dryrun
+    def sql_file_valid(sqlfile):
+        """Dry run the SQL file."""
+        result = DryRun(sqlfile, use_cloud_function=use_cloud_function, client=client)
+        if validate_schemas:
+            valid = result.validate_schema()
+            if not valid:
+                click.echo(f"{sqlfile:59} ERROR schema invalid")
+            return valid
+        return result.is_valid()

     with ThreadPool(8) as p:
         result = p.map(sql_file_valid, sql_files, chunksize=1)
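Note: this hunk looks like the click-based CLI command (presumably bigquery_etl/cli/dryrun.py). It now accepts any number of paths, expands directories with glob, keeps only files matching query.sql, view.sql, part*.sql and init.sql via fnmatch, and hands each file to DryRun, calling validate_schema() when the flag is set. A minimal sketch of exercising the command from Python, assuming the import path below and an illustrative SQL directory:

    from click.testing import CliRunner

    from bigquery_etl.cli.dryrun import dryrun  # assumed import path

    runner = CliRunner()
    result = runner.invoke(
        dryrun,
        ["--validate-schemas", "sql/moz-fx-data-shared-prod/telemetry_derived/"],
    )
    print(result.exit_code, result.output)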
@@ -15,19 +15,20 @@ import click
 from google.cloud import bigquery
 from google.cloud.exceptions import NotFound

-from ..cli.dryrun import SKIP, dryrun
 from ..cli.format import format
 from ..cli.utils import is_authenticated, is_valid_dir, is_valid_project
 from ..dependency import get_dependency_graph
+from ..dryrun import SKIP, DryRun
 from ..format_sql.formatter import reformat
 from ..metadata import validate_metadata
-from ..metadata.parse_metadata import METADATA_FILE, Metadata, DatasetMetadata
+from ..metadata.parse_metadata import METADATA_FILE, DatasetMetadata, Metadata
 from ..query_scheduling.dag_collection import DagCollection
 from ..query_scheduling.generate_airflow_dags import get_dags
 from ..run_query import run
 from ..schema import SCHEMA_FILE, Schema
 from ..util import extract_from_query_path
 from ..util.common import random_str
+from .dryrun import dryrun

 QUERY_NAME_RE = re.compile(r"(?P<dataset>[a-zA-z0-9_]+)\.(?P<name>[a-zA-z0-9_]+)")
 SQL_FILE_RE = re.compile(
@@ -673,8 +674,15 @@ def backfill(
     type=bool,
     default=True,
 )
+@click.option(
+    "--validate_schemas",
+    "--validate-schemas",
+    help="Require dry run schema to match destination table and file if present.",
+    is_flag=True,
+    default=False,
+)
 @click.pass_context
-def validate(ctx, name, sql_dir, project_id, use_cloud_function):
+def validate(ctx, name, sql_dir, project_id, use_cloud_function, validate_schemas):
     """Validate queries by dry running, formatting and checking scheduling configs."""
     if name is None:
         name = "*.*"
@@ -686,9 +694,10 @@ def validate(ctx, name, sql_dir, project_id, use_cloud_function):
         ctx.invoke(format, path=str(query))
         ctx.invoke(
             dryrun,
-            path=str(query),
+            paths=[str(query)],
             use_cloud_function=use_cloud_function,
             project=project,
+            validate_schemas=validate_schemas,
         )
         validate_metadata.validate(query.parent)
         dataset_dirs.add(query.parent.parent)
@@ -696,8 +705,6 @@ def validate(ctx, name, sql_dir, project_id, use_cloud_function):
     for dataset_dir in dataset_dirs:
         validate_metadata.validate_datasets(dataset_dir)

-    # todo: validate if new fields get added
-

 @query.command(
     help="""Create and initialize the destination table for the query.
@@ -1114,70 +1121,6 @@ def deploy(ctx, name, sql_dir, project_id, force):
             click.echo(f"No schema file found for {query_file}")


-def _validate_schema(query_file):
-    """
-    Check whether schema is valid.
-
-    Returns tuple for whether schema is valid and path to schema.
-    """
-    if str(query_file) in SKIP or query_file.name == "script.sql":
-        click.echo(f"{query_file} dry runs are skipped. Cannot validate schemas.")
-        return (True, query_file)
-
-    query_file_path = Path(query_file)
-    query_schema = Schema.from_query_file(query_file_path)
-    existing_schema_path = query_file_path.parent / SCHEMA_FILE
-
-    if not existing_schema_path.is_file():
-        click.echo(f"No schema file defined for {query_file_path}", err=True)
-        return (True, query_file_path)
-
-    table_name = query_file_path.parent.name
-    dataset_name = query_file_path.parent.parent.name
-    project_name = query_file_path.parent.parent.parent.name
-
-    partitioned_by = None
-    try:
-        metadata = Metadata.of_query_file(query_file_path)
-
-        if metadata.bigquery and metadata.bigquery.time_partitioning:
-            partitioned_by = metadata.bigquery.time_partitioning.field
-    except FileNotFoundError:
-        pass
-
-    table_schema = Schema.for_table(
-        project_name, dataset_name, table_name, partitioned_by
-    )
-
-    if not query_schema.compatible(table_schema):
-        click.echo(
-            click.style(
-                f"ERROR: Schema for query in {query_file_path} "
-                f"incompatible with schema deployed for "
-                f"{project_name}.{dataset_name}.{table_name}",
-                fg="red",
-            ),
-            err=True,
-        )
-        return (False, query_file_path)
-    else:
-        existing_schema = Schema.from_schema_file(existing_schema_path)
-
-        if not existing_schema.equal(query_schema):
-            click.echo(
-                click.style(
-                    f"Schema defined in {existing_schema_path} "
-                    f"incompatible with query {query_file_path}",
-                    fg="red",
-                ),
-                err=True,
-            )
-            return (False, query_file_path)
-
-    click.echo(f"Schemas for {query_file_path} are valid.")
-    return (True, query_file_path)
-
-
 @schema.command(
     help="""Validate the query schema

@@ -1200,6 +1143,9 @@ def validate_schema(name, sql_dir, project_id):
     """Validate the defined query schema against the query and destination table."""
     query_files = _queries_matching_name_pattern(name, sql_dir, project_id)

+    def _validate_schema(query_file_path):
+        return DryRun(query_file_path).validate_schema(), query_file_path
+
     with Pool(8) as p:
         result = p.map(_validate_schema, query_files, chunksize=1)

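Note: these hunks appear to belong to the query CLI (presumably bigquery_etl/cli/query.py). bqetl query validate gains the same --validate_schemas/--validate-schemas flag and forwards it, along with paths=[...], when it invokes the dryrun command. The roughly 60-line _validate_schema helper is deleted; per the last hunk, bqetl query schema validate is now a thin wrapper that maps DryRun(query_file).validate_schema() over the matching query files.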
@@ -10,18 +10,21 @@ only dry runs can be performed. In order to reduce risk of CI or local users
 accidentally running queries during tests and overwriting production data, we
 proxy the queries through the dry run service endpoint.
 """
-import fnmatch
 import glob
 import json
 import re
-import sys
-from argparse import ArgumentParser
 from enum import Enum
-from multiprocessing.pool import Pool
-from os.path import basename, dirname, exists, isdir
-from typing import Set
+from os.path import basename, dirname, exists
+from pathlib import Path
 from urllib.request import Request, urlopen

+import click
+from google.cloud import bigquery
+
+from .metadata.parse_metadata import Metadata
+from .schema import SCHEMA_FILE, Schema
+
 try:
     from functools import cached_property  # type: ignore
 except ImportError:
@@ -224,11 +227,20 @@ class DryRun:
         "bigquery-etl-dryrun"
     )

-    def __init__(self, sqlfile, content=None, strip_dml=False):
+    def __init__(
+        self,
+        sqlfile,
+        content=None,
+        strip_dml=False,
+        use_cloud_function=True,
+        client=None,
+    ):
         """Instantiate DryRun class."""
         self.sqlfile = sqlfile
         self.content = content
         self.strip_dml = strip_dml
+        self.use_cloud_function = use_cloud_function
+        self.client = client if use_cloud_function or client else bigquery.Client()

     def get_sql(self):
         """Get SQL content."""
@@ -253,26 +265,54 @@ class DryRun:
             sql = self.content
         else:
             sql = self.get_sql()
+        dataset = basename(dirname(dirname(self.sqlfile)))
         try:
-            r = urlopen(
-                Request(
-                    self.DRY_RUN_URL,
-                    headers={"Content-Type": "application/json"},
-                    data=json.dumps(
-                        {
-                            "dataset": basename(dirname(dirname(self.sqlfile))),
-                            "query": sql,
-                        }
-                    ).encode("utf8"),
-                    method="POST",
-                )
-            )
+            if self.use_cloud_function:
+                r = urlopen(
+                    Request(
+                        self.DRY_RUN_URL,
+                        headers={"Content-Type": "application/json"},
+                        data=json.dumps(
+                            {
+                                "dataset": dataset,
+                                "query": sql,
+                            }
+                        ).encode("utf8"),
+                        method="POST",
+                    )
+                )
+                return json.load(r)
+            else:
+                project = basename(dirname(dirname(dirname(self.sqlfile))))
+                job_config = bigquery.QueryJobConfig(
+                    dry_run=True,
+                    use_query_cache=False,
+                    default_dataset=f"{project}.{dataset}",
+                    query_parameters=[
+                        bigquery.ScalarQueryParameter(
+                            "submission_date", "DATE", "2019-01-01"
+                        )
+                    ],
+                )
+                job = self.client.query(sql, job_config=job_config)
+                return {
+                    "valid": True,
+                    "referencedTables": [
+                        ref.to_api_repr() for ref in job.referenced_tables
+                    ],
+                    "schema": (
+                        job._properties.get("statistics", {})
+                        .get("query", {})
+                        .get("schema", {})
+                    ),
+                    "datasetLabels": (
+                        self.client.get_dataset(job.default_dataset).labels
+                    ),
+                }
         except Exception as e:
             print(f"{self.sqlfile:59} ERROR\n", e)
             return None

-        return json.load(r)
-
     def get_referenced_tables(self):
         """Return referenced tables by dry running the SQL file."""
         if self.sqlfile not in SKIP and not self.is_valid():
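Note: DryRun now accepts use_cloud_function and client, and the dry-run result either comes from a POST to the dry-run service or, when use_cloud_function=False, from a dry-run query job issued directly, including the resulting schema and dataset labels. A sketch of the direct GCP path, assuming local credentials (project and path are illustrative):

    from google.cloud import bigquery

    from bigquery_etl.dryrun import DryRun  # assumed import path

    client = bigquery.Client(project="moz-fx-data-shared-prod")
    dry_run = DryRun(
        "sql/moz-fx-data-shared-prod/telemetry_derived/clients_daily_v6/query.sql",  # illustrative
        use_cloud_function=False,
        client=client,
    )
    print(dry_run.is_valid())
    print(dry_run.get_referenced_tables())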
@@ -433,6 +473,65 @@ class DryRun:
             return Errors.DATE_FILTER_NEEDED_AND_SYNTAX
         return error

+    def validate_schema(self):
+        """Check whether schema is valid."""
+        if self.sqlfile in SKIP or basename(self.sqlfile) == "script.sql":
+            print(f"\t...Ignoring dryrun results for {self.sqlfile}")
+            return True
+
+        query_file_path = Path(self.sqlfile)
+        query_schema = Schema.from_json(self.get_schema())
+        existing_schema_path = query_file_path.parent / SCHEMA_FILE
+
+        if not existing_schema_path.is_file():
+            click.echo(f"No schema file defined for {query_file_path}", err=True)
+            return True
+
+        table_name = query_file_path.parent.name
+        dataset_name = query_file_path.parent.parent.name
+        project_name = query_file_path.parent.parent.parent.name
+
+        partitioned_by = None
+        try:
+            metadata = Metadata.of_query_file(query_file_path)
+
+            if metadata.bigquery and metadata.bigquery.time_partitioning:
+                partitioned_by = metadata.bigquery.time_partitioning.field
+        except FileNotFoundError:
+            pass
+
+        table_schema = Schema.for_table(
+            project_name, dataset_name, table_name, partitioned_by
+        )
+
+        if not query_schema.compatible(table_schema):
+            click.echo(
+                click.style(
+                    f"ERROR: Schema for query in {query_file_path} "
+                    f"incompatible with schema deployed for "
+                    f"{project_name}.{dataset_name}.{table_name}",
+                    fg="red",
+                ),
+                err=True,
+            )
+            return False
+        else:
+            existing_schema = Schema.from_schema_file(existing_schema_path)
+
+            if not existing_schema.equal(query_schema):
+                click.echo(
+                    click.style(
+                        f"Schema defined in {existing_schema_path} "
+                        f"incompatible with query {query_file_path}",
+                        fg="red",
+                    ),
+                    err=True,
+                )
+                return False
+
+        click.echo(f"Schemas for {query_file_path} are valid.")
+        return True
+

 def sql_file_valid(sqlfile):
     """Dry run SQL files."""
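Note: validate_schema() is the piece the CI flag hooks into. It takes the dry-run schema of the query, checks it against the deployed table with Schema.compatible() and against schema.yaml with Schema.equal(), and treats skipped files or a missing schema file as valid (with a message). A one-line sketch, path illustrative:

    from bigquery_etl.dryrun import DryRun  # assumed import path

    ok = DryRun(
        "sql/moz-fx-data-shared-prod/telemetry_derived/clients_daily_v6/query.sql"  # illustrative
    ).validate_schema()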
@@ -446,47 +545,3 @@ def find_next_word(target, source):
         if w == target:
             # get the next word, and remove quotations from column name
             return split[i + 1].replace("'", "")
-
-
-def main():
-    """Dry run all SQL files in the project directories."""
-    parser = ArgumentParser(description=main.__doc__)
-    parser.add_argument(
-        "paths",
-        metavar="PATH",
-        nargs="*",
-        help="Paths to search for queries to dry run. CI passes 'sql' on the default "
-        "branch, and the paths that have been modified since branching otherwise",
-    )
-    args = parser.parse_args()
-
-    file_names = ("query.sql", "view.sql", "part*.sql", "init.sql")
-    file_re = re.compile("|".join(map(fnmatch.translate, file_names)))
-
-    sql_files: Set[str] = set()
-    for path in args.paths:
-        if isdir(path):
-            sql_files |= {
-                sql_file
-                for pattern in file_names
-                for sql_file in glob.glob(f"{path}/**/{pattern}", recursive=True)
-            }
-        elif file_re.fullmatch(basename(path)):
-            sql_files.add(path)
-    sql_files -= SKIP
-
-    if not sql_files:
-        print("Skipping dry run because no queries matched")
-        sys.exit(0)
-
-    with Pool(8) as p:
-        result = p.map(sql_file_valid, sorted(sql_files), chunksize=1)
-    if all(result):
-        exitcode = 0
-    else:
-        exitcode = 1
-    sys.exit(exitcode)
-
-
-if __name__ == "__main__":
-    main()
@@ -1,16 +1,16 @@
 """Query schema."""

-from google.cloud import bigquery
 import json
+import os
 from pathlib import Path
 from tempfile import NamedTemporaryFile
-from typing import Any, Dict, Optional, List
+from typing import Any, Dict, List, Optional

 import attr
-import os
 import yaml
+from google.cloud import bigquery

-from bigquery_etl.dryrun import DryRun
+from .. import dryrun

 SCHEMA_FILE = "schema.yaml"

@@ -27,7 +27,7 @@ class Schema:
         if not query_file.is_file() or query_file.suffix != ".sql":
             raise Exception(f"{query_file} is not a valid SQL file.")

-        schema = DryRun(str(query_file), content=content).get_schema()
+        schema = dryrun.DryRun(str(query_file), content=content).get_schema()
         return cls(schema)

     @classmethod
@@ -55,7 +55,7 @@ class Schema:

         try:
             return cls(
-                DryRun(
+                dryrun.DryRun(
                     os.path.join(project, dataset, table, "query.sql"), query
                 ).get_schema()
             )
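Note: in the schema module (presumably bigquery_etl/schema/__init__.py) the imports are reordered and DryRun is now reached through the module (from .. import dryrun, then dryrun.DryRun(...)) rather than imported directly, presumably because bigquery_etl.dryrun itself now imports Schema and a direct import would be circular.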
@@ -5,4 +5,4 @@

 cd "$(dirname "$0")/.."

-exec python3 -m bigquery_etl.dryrun "$@"
+script/bqetl dryrun "$@"
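Note: with the argparse main() removed in the previous file, script/dryrun simply delegates to the click CLI, so flags such as --validate-schemas pass straight through to bqetl dryrun; that is what the CI command in the first hunk relies on.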