Make schema validation part of dryrun (#2069)

Daniel Thorn 2021-05-25 11:53:09 -07:00 committed by GitHub
Parent 13fc30f169
Commit 3c8894fdf1
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 193 additions and 200 deletions

View file

@@ -62,6 +62,8 @@ jobs:
     docker: *docker
     steps:
       - checkout
+      - *restore_venv_cache
+      - *build
       - run:
           name: Dry run queries
           # yamllint disable rule:line-length
@@ -80,7 +82,7 @@ jobs:
               PATHS="$(git diff origin/main... --name-only --diff-filter=d -- sql)"
             fi
             echo $PATHS
-            script/dryrun $PATHS
+            PATH="venv/bin:$PATH" script/dryrun --validate-schemas $PATHS
           # yamllint enable rule:line-length
   validate-metadata:
     docker: *docker
@@ -230,17 +232,6 @@ jobs:
       - run:
           name: Validate views
           command: PATH="venv/bin:$PATH" script/validate_views
-  validate-schemas:
-    docker: *docker
-    steps:
-      - checkout
-      - *restore_venv_cache
-      - *build
-      - run:
-          name: Validate query schemas
-          command: |
-            ./bqetl bootstrap
-            ./bqetl query schema validate "*"
   docs:
     docker: *docker
     steps:
@@ -356,7 +347,6 @@ workflows:
       - verify-requirements
       - dry-run-sql
       - validate-metadata
-      - validate-schemas
       - integration
       - validate-dags
       - verify-dags-up-to-date
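The separate validate-schemas CI job is gone; dry-run-sql now restores the venv cache, builds, and passes --validate-schemas to script/dryrun, so schema validation happens in the same dry run pass. A rough local equivalent, sketched with click's test runner (the sql/ subpath is illustrative, and bigquery-etl is assumed to be installed in the active virtualenv):

```python
# Hedged sketch of the updated CI step, driven through the click command
# instead of script/dryrun. The sql/ subpath below is illustrative.
from click.testing import CliRunner

from bigquery_etl.cli.dryrun import dryrun

runner = CliRunner()
result = runner.invoke(
    dryrun,
    ["--validate-schemas", "sql/moz-fx-data-shared-prod/telemetry_derived/"],
)
print(result.output)
print("dry run passed" if result.exit_code == 0 else "dry run failed")
```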

View file

@@ -1,9 +1,12 @@
 """bigquery-etl CLI dryrun command."""
+import fnmatch
+import glob
 import os
+import re
 import sys
 from multiprocessing.pool import ThreadPool
-from pathlib import Path
+from typing import Set
 
 import click
 
 from google.cloud import bigquery
@@ -27,8 +30,8 @@ from ..dryrun import SKIP, DryRun
     """,
 )
 @click.argument(
-    "path",
-    default="sql/",
+    "paths",
+    nargs=-1,
     type=click.Path(file_okay=True),
 )
 @click.option(
@@ -42,61 +45,60 @@ from ..dryrun import SKIP, DryRun
     type=bool,
     default=True,
 )
+@click.option(
+    "--validate_schemas",
+    "--validate-schemas",
+    help="Require dry run schema to match destination table and file if present.",
+    is_flag=True,
+    default=False,
+)
 @click.option(
     "--project",
     help="GCP project to perform dry run in when --use_cloud_function=False",
     default="moz-fx-data-shared-prod",
 )
-def dryrun(path, use_cloud_function, project):
+def dryrun(paths, use_cloud_function, validate_schemas, project):
     """Perform a dry run."""
-    if os.path.isdir(path) and os.path.exists(path):
-        sql_files = [f for f in Path(path).rglob("*.sql") if str(f) not in SKIP]
-    elif os.path.isfile(path) and os.path.exists(path):
-        sql_files = [path]
-    else:
-        click.echo(f"Invalid path {path}", err=True)
-        sys.exit(1)
+    file_names = ("query.sql", "view.sql", "part*.sql", "init.sql")
+    file_re = re.compile("|".join(map(fnmatch.translate, file_names)))
+
+    sql_files: Set[str] = set()
+    for path in paths:
+        if os.path.isdir(path):
+            sql_files |= {
+                sql_file
+                for pattern in file_names
+                for sql_file in glob.glob(f"{path}/**/{pattern}", recursive=True)
+            }
+        elif os.path.isfile(path):
+            if file_re.fullmatch(os.path.basename(path)):
+                sql_files.add(path)
+        else:
+            click.echo(f"Invalid path {path}", err=True)
+            sys.exit(1)
+    sql_files -= SKIP
+
+    if not sql_files:
+        print("Skipping dry run because no queries matched")
+        sys.exit(0)
 
     if use_cloud_function:
-
-        def cloud_function_dryrun(sqlfile):
-            """Dry run SQL files."""
-            return DryRun(sqlfile).is_valid()
-
-        sql_file_valid = cloud_function_dryrun
+        client = None
     else:
         if not is_authenticated():
             click.echo("Not authenticated to GCP. Run `gcloud auth login` to login.")
             sys.exit(1)
+        client = bigquery.Client(project=project)
 
-        client = bigquery.Client()
-
-        def gcp_dryrun(sqlfile):
-            """Dry run the SQL file."""
-            dataset = Path(sqlfile).parent.parent.name
-            job_config = bigquery.QueryJobConfig(
-                dry_run=True,
-                use_query_cache=False,
-                default_dataset=f"{project}.{dataset}",
-                query_parameters=[
-                    bigquery.ScalarQueryParameter(
-                        "submission_date", "DATE", "2019-01-01"
-                    )
-                ],
-            )
-
-            with open(sqlfile) as query_stream:
-                query = query_stream.read()
-
-                try:
-                    client.query(query, job_config=job_config)
-                    click.echo(f"{sqlfile:59} OK")
-                    return True
-                except Exception as e:
-                    click.echo(f"{sqlfile:59} ERROR: {e}")
-                    return False
-
-        sql_file_valid = gcp_dryrun
+    def sql_file_valid(sqlfile):
+        """Dry run the SQL file."""
+        result = DryRun(sqlfile, use_cloud_function=use_cloud_function, client=client)
+        if validate_schemas:
+            valid = result.validate_schema()
+            if not valid:
+                click.echo(f"{sqlfile:59} ERROR schema invalid")
+            return valid
+        return result.is_valid()
 
     with ThreadPool(8) as p:
         result = p.map(sql_file_valid, sql_files, chunksize=1)
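The command now accepts any number of paths and only picks up the query file patterns the old module-level main() handled, instead of dry running every *.sql file under a single path. A minimal standalone sketch of that selection logic (collect_sql_files is a hypothetical helper name, not part of the CLI):

```python
# Minimal sketch of the file selection above: directories are expanded with
# glob, single files are kept only if their basename matches a known pattern.
# `collect_sql_files` is a hypothetical name used for illustration only.
import fnmatch
import glob
import os
import re
from typing import Iterable, Set

FILE_NAMES = ("query.sql", "view.sql", "part*.sql", "init.sql")
FILE_RE = re.compile("|".join(map(fnmatch.translate, FILE_NAMES)))


def collect_sql_files(paths: Iterable[str]) -> Set[str]:
    sql_files: Set[str] = set()
    for path in paths:
        if os.path.isdir(path):
            # recurse into directories, matching each known pattern
            sql_files |= {
                sql_file
                for pattern in FILE_NAMES
                for sql_file in glob.glob(f"{path}/**/{pattern}", recursive=True)
            }
        elif os.path.isfile(path) and FILE_RE.fullmatch(os.path.basename(path)):
            sql_files.add(path)
    return sql_files


# e.g. every query.sql/view.sql/part*.sql/init.sql under sql/
print(collect_sql_files(["sql/"]))
```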

View file

@@ -15,19 +15,20 @@ import click
 from google.cloud import bigquery
 from google.cloud.exceptions import NotFound
 
-from ..cli.dryrun import SKIP, dryrun
 from ..cli.format import format
 from ..cli.utils import is_authenticated, is_valid_dir, is_valid_project
 from ..dependency import get_dependency_graph
+from ..dryrun import SKIP, DryRun
 from ..format_sql.formatter import reformat
 from ..metadata import validate_metadata
-from ..metadata.parse_metadata import METADATA_FILE, Metadata, DatasetMetadata
+from ..metadata.parse_metadata import METADATA_FILE, DatasetMetadata, Metadata
 from ..query_scheduling.dag_collection import DagCollection
 from ..query_scheduling.generate_airflow_dags import get_dags
 from ..run_query import run
 from ..schema import SCHEMA_FILE, Schema
 from ..util import extract_from_query_path
 from ..util.common import random_str
+from .dryrun import dryrun
 
 QUERY_NAME_RE = re.compile(r"(?P<dataset>[a-zA-z0-9_]+)\.(?P<name>[a-zA-z0-9_]+)")
 SQL_FILE_RE = re.compile(
@@ -673,8 +674,15 @@ def backfill(
     type=bool,
     default=True,
 )
+@click.option(
+    "--validate_schemas",
+    "--validate-schemas",
+    help="Require dry run schema to match destination table and file if present.",
+    is_flag=True,
+    default=False,
+)
 @click.pass_context
-def validate(ctx, name, sql_dir, project_id, use_cloud_function):
+def validate(ctx, name, sql_dir, project_id, use_cloud_function, validate_schemas):
     """Validate queries by dry running, formatting and checking scheduling configs."""
     if name is None:
         name = "*.*"
@@ -686,9 +694,10 @@ def validate(ctx, name, sql_dir, project_id, use_cloud_function):
         ctx.invoke(format, path=str(query))
         ctx.invoke(
             dryrun,
-            path=str(query),
+            paths=[str(query)],
             use_cloud_function=use_cloud_function,
             project=project,
+            validate_schemas=validate_schemas,
         )
         validate_metadata.validate(query.parent)
         dataset_dirs.add(query.parent.parent)
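For reference, ctx.invoke calls another click command's callback with explicit keyword arguments, which is how --validate-schemas given to `bqetl query validate` reaches the dryrun logic. A self-contained toy example of the pattern (parent and child are made-up command names, not bqetl commands):

```python
# Toy illustration of forwarding an option between click commands with
# ctx.invoke; `parent` and `child` are made-up names, not bqetl commands.
import click


@click.command()
@click.option("--validate-schemas", is_flag=True, default=False)
def child(validate_schemas):
    click.echo(f"child ran with validate_schemas={validate_schemas}")


@click.command()
@click.option("--validate-schemas", is_flag=True, default=False)
@click.pass_context
def parent(ctx, validate_schemas):
    # forward the flag, just as validate forwards it to dryrun above
    ctx.invoke(child, validate_schemas=validate_schemas)


if __name__ == "__main__":
    parent()
```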
@@ -696,8 +705,6 @@ def validate(ctx, name, sql_dir, project_id, use_cloud_function):
     for dataset_dir in dataset_dirs:
         validate_metadata.validate_datasets(dataset_dir)
 
-    # todo: validate if new fields get added
-
 
 @query.command(
     help="""Create and initialize the destination table for the query.
@@ -1114,70 +1121,6 @@ def deploy(ctx, name, sql_dir, project_id, force):
             click.echo(f"No schema file found for {query_file}")
 
 
-def _validate_schema(query_file):
-    """
-    Check whether schema is valid.
-
-    Returns tuple for whether schema is valid and path to schema.
-    """
-    if str(query_file) in SKIP or query_file.name == "script.sql":
-        click.echo(f"{query_file} dry runs are skipped. Cannot validate schemas.")
-        return (True, query_file)
-
-    query_file_path = Path(query_file)
-    query_schema = Schema.from_query_file(query_file_path)
-    existing_schema_path = query_file_path.parent / SCHEMA_FILE
-
-    if not existing_schema_path.is_file():
-        click.echo(f"No schema file defined for {query_file_path}", err=True)
-        return (True, query_file_path)
-
-    table_name = query_file_path.parent.name
-    dataset_name = query_file_path.parent.parent.name
-    project_name = query_file_path.parent.parent.parent.name
-
-    partitioned_by = None
-    try:
-        metadata = Metadata.of_query_file(query_file_path)
-
-        if metadata.bigquery and metadata.bigquery.time_partitioning:
-            partitioned_by = metadata.bigquery.time_partitioning.field
-    except FileNotFoundError:
-        pass
-
-    table_schema = Schema.for_table(
-        project_name, dataset_name, table_name, partitioned_by
-    )
-
-    if not query_schema.compatible(table_schema):
-        click.echo(
-            click.style(
-                f"ERROR: Schema for query in {query_file_path} "
-                f"incompatible with schema deployed for "
-                f"{project_name}.{dataset_name}.{table_name}",
-                fg="red",
-            ),
-            err=True,
-        )
-        return (False, query_file_path)
-    else:
-        existing_schema = Schema.from_schema_file(existing_schema_path)
-
-        if not existing_schema.equal(query_schema):
-            click.echo(
-                click.style(
-                    f"Schema defined in {existing_schema_path} "
-                    f"incompatible with query {query_file_path}",
-                    fg="red",
-                ),
-                err=True,
-            )
-            return (False, query_file_path)
-
-    click.echo(f"Schemas for {query_file_path} are valid.")
-    return (True, query_file_path)
-
-
 @schema.command(
     help="""Validate the query schema
@@ -1200,6 +1143,9 @@ def validate_schema(name, sql_dir, project_id):
     """Validate the defined query schema against the query and destination table."""
     query_files = _queries_matching_name_pattern(name, sql_dir, project_id)
 
+    def _validate_schema(query_file_path):
+        return DryRun(query_file_path).validate_schema(), query_file_path
+
     with Pool(8) as p:
         result = p.map(_validate_schema, query_files, chunksize=1)
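`bqetl query schema validate` now defers to DryRun.validate_schema instead of the comparison code removed above. A hedged, roughly equivalent standalone loop, using a thread pool and a glob as a stand-in for _queries_matching_name_pattern:

```python
# Hedged sketch: validate many query schemas via DryRun.validate_schema.
# The glob pattern stands in for _queries_matching_name_pattern; a thread
# pool is used here to sidestep pickling concerns in a short script.
import glob
from multiprocessing.pool import ThreadPool

from bigquery_etl.dryrun import DryRun


def _validate_schema(query_file_path):
    return DryRun(query_file_path).validate_schema(), query_file_path


query_files = glob.glob("sql/**/query.sql", recursive=True)
with ThreadPool(8) as pool:
    results = pool.map(_validate_schema, query_files, chunksize=1)

failures = [path for ok, path in results if not ok]
print(f"{len(failures)} schema(s) failed validation")
```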

View file

@@ -10,18 +10,21 @@ only dry runs can be performed. In order to reduce risk of CI or local users
 accidentally running queries during tests and overwriting production data, we
 proxy the queries through the dry run service endpoint.
 """
-import fnmatch
 import glob
 import json
 import re
-import sys
-from argparse import ArgumentParser
 from enum import Enum
-from multiprocessing.pool import Pool
-from os.path import basename, dirname, exists, isdir
-from typing import Set
+from os.path import basename, dirname, exists
+from pathlib import Path
 from urllib.request import Request, urlopen
 
+import click
+from google.cloud import bigquery
+
+from .metadata.parse_metadata import Metadata
+from .schema import SCHEMA_FILE, Schema
+
 try:
     from functools import cached_property  # type: ignore
 except ImportError:
@@ -224,11 +227,20 @@ class DryRun:
         "bigquery-etl-dryrun"
     )
 
-    def __init__(self, sqlfile, content=None, strip_dml=False):
+    def __init__(
+        self,
+        sqlfile,
+        content=None,
+        strip_dml=False,
+        use_cloud_function=True,
+        client=None,
+    ):
         """Instantiate DryRun class."""
         self.sqlfile = sqlfile
         self.content = content
         self.strip_dml = strip_dml
+        self.use_cloud_function = use_cloud_function
+        self.client = client if use_cloud_function or client else bigquery.Client()
 
     def get_sql(self):
         """Get SQL content."""
@@ -253,26 +265,54 @@ class DryRun:
             sql = self.content
         else:
             sql = self.get_sql()
-
+        dataset = basename(dirname(dirname(self.sqlfile)))
         try:
-            r = urlopen(
-                Request(
-                    self.DRY_RUN_URL,
-                    headers={"Content-Type": "application/json"},
-                    data=json.dumps(
-                        {
-                            "dataset": basename(dirname(dirname(self.sqlfile))),
-                            "query": sql,
-                        }
-                    ).encode("utf8"),
-                    method="POST",
-                )
-            )
+            if self.use_cloud_function:
+                r = urlopen(
+                    Request(
+                        self.DRY_RUN_URL,
+                        headers={"Content-Type": "application/json"},
+                        data=json.dumps(
+                            {
+                                "dataset": dataset,
+                                "query": sql,
+                            }
+                        ).encode("utf8"),
+                        method="POST",
+                    )
+                )
+                return json.load(r)
+            else:
+                project = basename(dirname(dirname(dirname(self.sqlfile))))
+                job_config = bigquery.QueryJobConfig(
+                    dry_run=True,
+                    use_query_cache=False,
+                    default_dataset=f"{project}.{dataset}",
+                    query_parameters=[
+                        bigquery.ScalarQueryParameter(
+                            "submission_date", "DATE", "2019-01-01"
+                        )
+                    ],
+                )
+                job = self.client.query(sql, job_config=job_config)
+                return {
+                    "valid": True,
+                    "referencedTables": [
+                        ref.to_api_repr() for ref in job.referenced_tables
+                    ],
+                    "schema": (
+                        job._properties.get("statistics", {})
+                        .get("query", {})
+                        .get("schema", {})
+                    ),
+                    "datasetLabels": (
+                        self.client.get_dataset(job.default_dataset).labels
+                    ),
+                }
         except Exception as e:
             print(f"{self.sqlfile:59} ERROR\n", e)
             return None
-
-        return json.load(r)
 
     def get_referenced_tables(self):
         """Return referenced tables by dry running the SQL file."""
         if self.sqlfile not in SKIP and not self.is_valid():
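With the new constructor arguments, one DryRun object can either call the dry run cloud function (the default) or issue a dry-run job through a local BigQuery client, returning the same result shape in both cases. A hedged usage sketch; the query path is illustrative and local gcloud credentials are assumed:

```python
# Hedged sketch: dry run a query directly against BigQuery instead of the
# cloud function. The query path is illustrative; gcloud credentials are
# assumed to be available locally.
from google.cloud import bigquery

from bigquery_etl.dryrun import DryRun

client = bigquery.Client(project="moz-fx-data-shared-prod")
dry_run = DryRun(
    "sql/moz-fx-data-shared-prod/telemetry_derived/example_v1/query.sql",
    use_cloud_function=False,
    client=client,
)
if dry_run.is_valid():
    print(dry_run.get_referenced_tables())
    # compare the dry-run schema against schema.yaml and the deployed table
    print(dry_run.validate_schema())
```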
@@ -433,6 +473,65 @@ class DryRun:
                 return Errors.DATE_FILTER_NEEDED_AND_SYNTAX
         return error
 
+    def validate_schema(self):
+        """Check whether schema is valid."""
+        if self.sqlfile in SKIP or basename(self.sqlfile) == "script.sql":
+            print(f"\t...Ignoring dryrun results for {self.sqlfile}")
+            return True
+        query_file_path = Path(self.sqlfile)
+        query_schema = Schema.from_json(self.get_schema())
+        existing_schema_path = query_file_path.parent / SCHEMA_FILE
+
+        if not existing_schema_path.is_file():
+            click.echo(f"No schema file defined for {query_file_path}", err=True)
+            return True
+
+        table_name = query_file_path.parent.name
+        dataset_name = query_file_path.parent.parent.name
+        project_name = query_file_path.parent.parent.parent.name
+
+        partitioned_by = None
+        try:
+            metadata = Metadata.of_query_file(query_file_path)
+
+            if metadata.bigquery and metadata.bigquery.time_partitioning:
+                partitioned_by = metadata.bigquery.time_partitioning.field
+        except FileNotFoundError:
+            pass
+
+        table_schema = Schema.for_table(
+            project_name, dataset_name, table_name, partitioned_by
+        )
+
+        if not query_schema.compatible(table_schema):
+            click.echo(
+                click.style(
+                    f"ERROR: Schema for query in {query_file_path} "
+                    f"incompatible with schema deployed for "
+                    f"{project_name}.{dataset_name}.{table_name}",
+                    fg="red",
+                ),
+                err=True,
+            )
+            return False
+        else:
+            existing_schema = Schema.from_schema_file(existing_schema_path)
+
+            if not existing_schema.equal(query_schema):
+                click.echo(
+                    click.style(
+                        f"Schema defined in {existing_schema_path} "
+                        f"incompatible with query {query_file_path}",
+                        fg="red",
+                    ),
+                    err=True,
+                )
+                return False
+
+        click.echo(f"Schemas for {query_file_path} are valid.")
+        return True
+
 
 def sql_file_valid(sqlfile):
     """Dry run SQL files."""
@@ -446,47 +545,3 @@ def find_next_word(target, source):
         if w == target:
             # get the next word, and remove quotations from column name
             return split[i + 1].replace("'", "")
-
-
-def main():
-    """Dry run all SQL files in the project directories."""
-    parser = ArgumentParser(description=main.__doc__)
-    parser.add_argument(
-        "paths",
-        metavar="PATH",
-        nargs="*",
-        help="Paths to search for queries to dry run. CI passes 'sql' on the default "
-        "branch, and the paths that have been modified since branching otherwise",
-    )
-    args = parser.parse_args()
-
-    file_names = ("query.sql", "view.sql", "part*.sql", "init.sql")
-    file_re = re.compile("|".join(map(fnmatch.translate, file_names)))
-
-    sql_files: Set[str] = set()
-    for path in args.paths:
-        if isdir(path):
-            sql_files |= {
-                sql_file
-                for pattern in file_names
-                for sql_file in glob.glob(f"{path}/**/{pattern}", recursive=True)
-            }
-        elif file_re.fullmatch(basename(path)):
-            sql_files.add(path)
-    sql_files -= SKIP
-
-    if not sql_files:
-        print("Skipping dry run because no queries matched")
-        sys.exit(0)
-
-    with Pool(8) as p:
-        result = p.map(sql_file_valid, sorted(sql_files), chunksize=1)
-    if all(result):
-        exitcode = 0
-    else:
-        exitcode = 1
-    sys.exit(exitcode)
-
-
-if __name__ == "__main__":
-    main()

View file

@@ -1,16 +1,16 @@
 """Query schema."""
-from google.cloud import bigquery
-
 import json
+import os
 from pathlib import Path
 from tempfile import NamedTemporaryFile
-from typing import Any, Dict, Optional, List
+from typing import Any, Dict, List, Optional
 
 import attr
-import os
 import yaml
+from google.cloud import bigquery
 
-from bigquery_etl.dryrun import DryRun
+from .. import dryrun
 
 SCHEMA_FILE = "schema.yaml"
@@ -27,7 +27,7 @@ class Schema:
         if not query_file.is_file() or query_file.suffix != ".sql":
             raise Exception(f"{query_file} is not a valid SQL file.")
 
-        schema = DryRun(str(query_file), content=content).get_schema()
+        schema = dryrun.DryRun(str(query_file), content=content).get_schema()
         return cls(schema)
 
     @classmethod
@@ -55,7 +55,7 @@ class Schema:
         try:
             return cls(
-                DryRun(
+                dryrun.DryRun(
                     os.path.join(project, dataset, table, "query.sql"), query
                 ).get_schema()
             )
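Importing the module (`from .. import dryrun`) instead of the DryRun class defers the attribute lookup to call time, which avoids a circular import now that dryrun itself imports Schema. For reference, the Schema helpers used by validate_schema can also be called directly, roughly like this (the query path and table coordinates are illustrative):

```python
# Hedged sketch of the Schema helpers referenced in this change; the query
# path and table coordinates below are illustrative, not real entries.
from pathlib import Path

from bigquery_etl.schema import SCHEMA_FILE, Schema

query_file = Path("sql/moz-fx-data-shared-prod/telemetry_derived/example_v1/query.sql")

# schema produced by dry running the query
query_schema = Schema.from_query_file(query_file)

# schema of the deployed destination table (partition field passed last)
table_schema = Schema.for_table(
    "moz-fx-data-shared-prod", "telemetry_derived", "example_v1", "submission_date"
)
print(query_schema.compatible(table_schema))

# schema.yaml checked in next to the query, if present
schema_file = query_file.parent / SCHEMA_FILE
if schema_file.is_file():
    print(Schema.from_schema_file(schema_file).equal(query_schema))
```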

View file

@@ -5,4 +5,4 @@
 
 cd "$(dirname "$0")/.."
 
-exec python3 -m bigquery_etl.dryrun "$@"
+script/bqetl dryrun "$@"