Make schema validation part of dryrun (#2069)
This commit is contained in:
Parent: 13fc30f169
Commit: 3c8894fdf1
@@ -62,6 +62,8 @@ jobs:
     docker: *docker
     steps:
       - checkout
+      - *restore_venv_cache
+      - *build
       - run:
          name: Dry run queries
          # yamllint disable rule:line-length
@@ -80,7 +82,7 @@ jobs:
              PATHS="$(git diff origin/main... --name-only --diff-filter=d -- sql)"
            fi
            echo $PATHS
-            script/dryrun $PATHS
+            PATH="venv/bin:$PATH" script/dryrun --validate-schemas $PATHS
          # yamllint enable rule:line-length
  validate-metadata:
    docker: *docker
@@ -230,17 +232,6 @@ jobs:
      - run:
          name: Validate views
          command: PATH="venv/bin:$PATH" script/validate_views
-  validate-schemas:
-    docker: *docker
-    steps:
-      - checkout
-      - *restore_venv_cache
-      - *build
-      - run:
-          name: Validate query schemas
-          command: |
-            ./bqetl bootstrap
-            ./bqetl query schema validate "*"
  docs:
    docker: *docker
    steps:
@@ -356,7 +347,6 @@ workflows:
      - verify-requirements
      - dry-run-sql
      - validate-metadata
-      - validate-schemas
      - integration
      - validate-dags
      - verify-dags-up-to-date
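Note: the hunks above appear to come from the CircleCI configuration (most likely .circleci/config.yml). The dry-run-sql job now restores the cached virtualenv, builds it, and runs script/dryrun with the new --validate-schemas flag, which lets the standalone validate-schemas job and its workflow entry be removed. The equivalent local check would presumably be the same command CI runs, e.g. PATH="venv/bin:$PATH" script/dryrun --validate-schemas sql/.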
@@ -1,9 +1,12 @@
 """bigquery-etl CLI dryrun command."""

+import fnmatch
+import glob
 import os
+import re
 import sys
 from multiprocessing.pool import ThreadPool
-from pathlib import Path
+from typing import Set

 import click
 from google.cloud import bigquery
@@ -27,8 +30,8 @@ from ..dryrun import SKIP, DryRun
     """,
 )
 @click.argument(
-    "path",
-    default="sql/",
+    "paths",
+    nargs=-1,
     type=click.Path(file_okay=True),
 )
 @click.option(
@@ -42,61 +45,60 @@ from ..dryrun import SKIP, DryRun
     type=bool,
     default=True,
 )
+@click.option(
+    "--validate_schemas",
+    "--validate-schemas",
+    help="Require dry run schema to match destination table and file if present.",
+    is_flag=True,
+    default=False,
+)
 @click.option(
     "--project",
     help="GCP project to perform dry run in when --use_cloud_function=False",
     default="moz-fx-data-shared-prod",
 )
-def dryrun(path, use_cloud_function, project):
+def dryrun(paths, use_cloud_function, validate_schemas, project):
     """Perform a dry run."""
-    if os.path.isdir(path) and os.path.exists(path):
-        sql_files = [f for f in Path(path).rglob("*.sql") if str(f) not in SKIP]
-    elif os.path.isfile(path) and os.path.exists(path):
-        sql_files = [path]
+    file_names = ("query.sql", "view.sql", "part*.sql", "init.sql")
+    file_re = re.compile("|".join(map(fnmatch.translate, file_names)))
+
+    sql_files: Set[str] = set()
+    for path in paths:
+        if os.path.isdir(path):
+            sql_files |= {
+                sql_file
+                for pattern in file_names
+                for sql_file in glob.glob(f"{path}/**/{pattern}", recursive=True)
+            }
+        elif os.path.isfile(path):
+            if file_re.fullmatch(os.path.basename(path)):
+                sql_files.add(path)
         else:
             click.echo(f"Invalid path {path}", err=True)
             sys.exit(1)
+    sql_files -= SKIP
+
+    if not sql_files:
+        print("Skipping dry run because no queries matched")
+        sys.exit(0)

     if use_cloud_function:
-        def cloud_function_dryrun(sqlfile):
-            """Dry run SQL files."""
-            return DryRun(sqlfile).is_valid()
-
-        sql_file_valid = cloud_function_dryrun
+        client = None
     else:
         if not is_authenticated():
             click.echo("Not authenticated to GCP. Run `gcloud auth login` to login.")
             sys.exit(1)
+        client = bigquery.Client(project=project)

-        client = bigquery.Client()
-
-        def gcp_dryrun(sqlfile):
-            """Dry run the SQL file."""
-            dataset = Path(sqlfile).parent.parent.name
-            job_config = bigquery.QueryJobConfig(
-                dry_run=True,
-                use_query_cache=False,
-                default_dataset=f"{project}.{dataset}",
-                query_parameters=[
-                    bigquery.ScalarQueryParameter(
-                        "submission_date", "DATE", "2019-01-01"
-                    )
-                ],
-            )
-
-            with open(sqlfile) as query_stream:
-                query = query_stream.read()
-
-            try:
-                client.query(query, job_config=job_config)
-                click.echo(f"{sqlfile:59} OK")
-                return True
-            except Exception as e:
-                click.echo(f"{sqlfile:59} ERROR: {e}")
-                return False
-
-        sql_file_valid = gcp_dryrun
+    def sql_file_valid(sqlfile):
+        """Dry run the SQL file."""
+        result = DryRun(sqlfile, use_cloud_function=use_cloud_function, client=client)
+        if validate_schemas:
+            valid = result.validate_schema()
+            if not valid:
+                click.echo(f"{sqlfile:59} ERROR schema invalid")
+            return valid
+        return result.is_valid()

     with ThreadPool(8) as p:
         result = p.map(sql_file_valid, sql_files, chunksize=1)
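Note: this hunk looks like the click-based CLI command (presumably bigquery_etl/cli/dryrun.py). It now accepts any number of paths, expands directories with glob, keeps only files matching query.sql, view.sql, part*.sql and init.sql via fnmatch, and hands each file to DryRun, calling validate_schema() when the flag is set. A minimal sketch of exercising the command from Python, assuming the import path below and an illustrative SQL directory:

    from click.testing import CliRunner

    from bigquery_etl.cli.dryrun import dryrun  # assumed import path

    runner = CliRunner()
    result = runner.invoke(
        dryrun,
        ["--validate-schemas", "sql/moz-fx-data-shared-prod/telemetry_derived/"],
    )
    print(result.exit_code, result.output)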
@@ -15,19 +15,20 @@ import click
 from google.cloud import bigquery
 from google.cloud.exceptions import NotFound

-from ..cli.dryrun import SKIP, dryrun
 from ..cli.format import format
 from ..cli.utils import is_authenticated, is_valid_dir, is_valid_project
 from ..dependency import get_dependency_graph
+from ..dryrun import SKIP, DryRun
 from ..format_sql.formatter import reformat
 from ..metadata import validate_metadata
-from ..metadata.parse_metadata import METADATA_FILE, Metadata, DatasetMetadata
+from ..metadata.parse_metadata import METADATA_FILE, DatasetMetadata, Metadata
 from ..query_scheduling.dag_collection import DagCollection
 from ..query_scheduling.generate_airflow_dags import get_dags
 from ..run_query import run
 from ..schema import SCHEMA_FILE, Schema
 from ..util import extract_from_query_path
 from ..util.common import random_str
+from .dryrun import dryrun

 QUERY_NAME_RE = re.compile(r"(?P<dataset>[a-zA-z0-9_]+)\.(?P<name>[a-zA-z0-9_]+)")
 SQL_FILE_RE = re.compile(
@@ -673,8 +674,15 @@ def backfill(
     type=bool,
     default=True,
 )
+@click.option(
+    "--validate_schemas",
+    "--validate-schemas",
+    help="Require dry run schema to match destination table and file if present.",
+    is_flag=True,
+    default=False,
+)
 @click.pass_context
-def validate(ctx, name, sql_dir, project_id, use_cloud_function):
+def validate(ctx, name, sql_dir, project_id, use_cloud_function, validate_schemas):
     """Validate queries by dry running, formatting and checking scheduling configs."""
     if name is None:
         name = "*.*"
@@ -686,9 +694,10 @@ def validate(ctx, name, sql_dir, project_id, use_cloud_function):
         ctx.invoke(format, path=str(query))
         ctx.invoke(
             dryrun,
-            path=str(query),
+            paths=[str(query)],
             use_cloud_function=use_cloud_function,
             project=project,
+            validate_schemas=validate_schemas,
         )
         validate_metadata.validate(query.parent)
         dataset_dirs.add(query.parent.parent)
@@ -696,8 +705,6 @@ def validate(ctx, name, sql_dir, project_id, use_cloud_function):
     for dataset_dir in dataset_dirs:
         validate_metadata.validate_datasets(dataset_dir)

-    # todo: validate if new fields get added
-

 @query.command(
     help="""Create and initialize the destination table for the query.
@@ -1114,70 +1121,6 @@ def deploy(ctx, name, sql_dir, project_id, force):
             click.echo(f"No schema file found for {query_file}")


-def _validate_schema(query_file):
-    """
-    Check whether schema is valid.
-
-    Returns tuple for whether schema is valid and path to schema.
-    """
-    if str(query_file) in SKIP or query_file.name == "script.sql":
-        click.echo(f"{query_file} dry runs are skipped. Cannot validate schemas.")
-        return (True, query_file)
-
-    query_file_path = Path(query_file)
-    query_schema = Schema.from_query_file(query_file_path)
-    existing_schema_path = query_file_path.parent / SCHEMA_FILE
-
-    if not existing_schema_path.is_file():
-        click.echo(f"No schema file defined for {query_file_path}", err=True)
-        return (True, query_file_path)
-
-    table_name = query_file_path.parent.name
-    dataset_name = query_file_path.parent.parent.name
-    project_name = query_file_path.parent.parent.parent.name
-
-    partitioned_by = None
-    try:
-        metadata = Metadata.of_query_file(query_file_path)
-
-        if metadata.bigquery and metadata.bigquery.time_partitioning:
-            partitioned_by = metadata.bigquery.time_partitioning.field
-    except FileNotFoundError:
-        pass
-
-    table_schema = Schema.for_table(
-        project_name, dataset_name, table_name, partitioned_by
-    )
-
-    if not query_schema.compatible(table_schema):
-        click.echo(
-            click.style(
-                f"ERROR: Schema for query in {query_file_path} "
-                f"incompatible with schema deployed for "
-                f"{project_name}.{dataset_name}.{table_name}",
-                fg="red",
-            ),
-            err=True,
-        )
-        return (False, query_file_path)
-    else:
-        existing_schema = Schema.from_schema_file(existing_schema_path)
-
-        if not existing_schema.equal(query_schema):
-            click.echo(
-                click.style(
-                    f"Schema defined in {existing_schema_path} "
-                    f"incompatible with query {query_file_path}",
-                    fg="red",
-                ),
-                err=True,
-            )
-            return (False, query_file_path)
-
-    click.echo(f"Schemas for {query_file_path} are valid.")
-    return (True, query_file_path)
-
-
 @schema.command(
     help="""Validate the query schema

@@ -1200,6 +1143,9 @@ def validate_schema(name, sql_dir, project_id):
     """Validate the defined query schema against the query and destination table."""
     query_files = _queries_matching_name_pattern(name, sql_dir, project_id)

+    def _validate_schema(query_file_path):
+        return DryRun(query_file_path).validate_schema(), query_file_path
+
     with Pool(8) as p:
         result = p.map(_validate_schema, query_files, chunksize=1)

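Note: these hunks appear to belong to the query CLI (presumably bigquery_etl/cli/query.py). bqetl query validate gains the same --validate_schemas/--validate-schemas flag and forwards it, along with paths=[...], when it invokes the dryrun command. The roughly 60-line _validate_schema helper is deleted; per the last hunk, bqetl query schema validate is now a thin wrapper that maps DryRun(query_file).validate_schema() over the matching query files.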
@@ -10,18 +10,21 @@ only dry runs can be performed. In order to reduce risk of CI or local users
 accidentally running queries during tests and overwriting production data, we
 proxy the queries through the dry run service endpoint.
 """
-import fnmatch
 import glob
 import json
 import re
-import sys
-from argparse import ArgumentParser
 from enum import Enum
-from multiprocessing.pool import Pool
-from os.path import basename, dirname, exists, isdir
-from typing import Set
+from os.path import basename, dirname, exists
+from pathlib import Path
 from urllib.request import Request, urlopen

+import click
+from google.cloud import bigquery
+
+from .metadata.parse_metadata import Metadata
+from .schema import SCHEMA_FILE, Schema
+
 try:
     from functools import cached_property  # type: ignore
 except ImportError:
@@ -224,11 +227,20 @@ class DryRun:
         "bigquery-etl-dryrun"
     )

-    def __init__(self, sqlfile, content=None, strip_dml=False):
+    def __init__(
+        self,
+        sqlfile,
+        content=None,
+        strip_dml=False,
+        use_cloud_function=True,
+        client=None,
+    ):
         """Instantiate DryRun class."""
         self.sqlfile = sqlfile
         self.content = content
         self.strip_dml = strip_dml
+        self.use_cloud_function = use_cloud_function
+        self.client = client if use_cloud_function or client else bigquery.Client()

     def get_sql(self):
         """Get SQL content."""
@@ -253,26 +265,54 @@ class DryRun:
             sql = self.content
         else:
             sql = self.get_sql()
+        dataset = basename(dirname(dirname(self.sqlfile)))
         try:
-            r = urlopen(
-                Request(
-                    self.DRY_RUN_URL,
-                    headers={"Content-Type": "application/json"},
-                    data=json.dumps(
-                        {
-                            "dataset": basename(dirname(dirname(self.sqlfile))),
-                            "query": sql,
-                        }
-                    ).encode("utf8"),
-                    method="POST",
-                )
-            )
+            if self.use_cloud_function:
+                r = urlopen(
+                    Request(
+                        self.DRY_RUN_URL,
+                        headers={"Content-Type": "application/json"},
+                        data=json.dumps(
+                            {
+                                "dataset": dataset,
+                                "query": sql,
+                            }
+                        ).encode("utf8"),
+                        method="POST",
+                    )
+                )
+                return json.load(r)
+            else:
+                project = basename(dirname(dirname(dirname(self.sqlfile))))
+                job_config = bigquery.QueryJobConfig(
+                    dry_run=True,
+                    use_query_cache=False,
+                    default_dataset=f"{project}.{dataset}",
+                    query_parameters=[
+                        bigquery.ScalarQueryParameter(
+                            "submission_date", "DATE", "2019-01-01"
+                        )
+                    ],
+                )
+                job = self.client.query(sql, job_config=job_config)
+                return {
+                    "valid": True,
+                    "referencedTables": [
+                        ref.to_api_repr() for ref in job.referenced_tables
+                    ],
+                    "schema": (
+                        job._properties.get("statistics", {})
+                        .get("query", {})
+                        .get("schema", {})
+                    ),
+                    "datasetLabels": (
+                        self.client.get_dataset(job.default_dataset).labels
+                    ),
+                }
         except Exception as e:
             print(f"{self.sqlfile:59} ERROR\n", e)
             return None

-        return json.load(r)
-
     def get_referenced_tables(self):
         """Return referenced tables by dry running the SQL file."""
         if self.sqlfile not in SKIP and not self.is_valid():
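Note: DryRun now accepts use_cloud_function and client, and the dry-run result either comes from a POST to the dry-run service or, when use_cloud_function=False, from a dry-run query job issued directly, including the resulting schema and dataset labels. A sketch of the direct GCP path, assuming local credentials (project and path are illustrative):

    from google.cloud import bigquery

    from bigquery_etl.dryrun import DryRun  # assumed import path

    client = bigquery.Client(project="moz-fx-data-shared-prod")
    dry_run = DryRun(
        "sql/moz-fx-data-shared-prod/telemetry_derived/clients_daily_v6/query.sql",  # illustrative
        use_cloud_function=False,
        client=client,
    )
    print(dry_run.is_valid())
    print(dry_run.get_referenced_tables())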
@@ -433,6 +473,65 @@ class DryRun:
             return Errors.DATE_FILTER_NEEDED_AND_SYNTAX
         return error

+    def validate_schema(self):
+        """Check whether schema is valid."""
+        if self.sqlfile in SKIP or basename(self.sqlfile) == "script.sql":
+            print(f"\t...Ignoring dryrun results for {self.sqlfile}")
+            return True
+
+        query_file_path = Path(self.sqlfile)
+        query_schema = Schema.from_json(self.get_schema())
+        existing_schema_path = query_file_path.parent / SCHEMA_FILE
+
+        if not existing_schema_path.is_file():
+            click.echo(f"No schema file defined for {query_file_path}", err=True)
+            return True
+
+        table_name = query_file_path.parent.name
+        dataset_name = query_file_path.parent.parent.name
+        project_name = query_file_path.parent.parent.parent.name
+
+        partitioned_by = None
+        try:
+            metadata = Metadata.of_query_file(query_file_path)
+
+            if metadata.bigquery and metadata.bigquery.time_partitioning:
+                partitioned_by = metadata.bigquery.time_partitioning.field
+        except FileNotFoundError:
+            pass
+
+        table_schema = Schema.for_table(
+            project_name, dataset_name, table_name, partitioned_by
+        )
+
+        if not query_schema.compatible(table_schema):
+            click.echo(
+                click.style(
+                    f"ERROR: Schema for query in {query_file_path} "
+                    f"incompatible with schema deployed for "
+                    f"{project_name}.{dataset_name}.{table_name}",
+                    fg="red",
+                ),
+                err=True,
+            )
+            return False
+        else:
+            existing_schema = Schema.from_schema_file(existing_schema_path)
+
+            if not existing_schema.equal(query_schema):
+                click.echo(
+                    click.style(
+                        f"Schema defined in {existing_schema_path} "
+                        f"incompatible with query {query_file_path}",
+                        fg="red",
+                    ),
+                    err=True,
+                )
+                return False
+
+        click.echo(f"Schemas for {query_file_path} are valid.")
+        return True
+

 def sql_file_valid(sqlfile):
     """Dry run SQL files."""
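Note: validate_schema() is the piece the CI flag hooks into. It takes the dry-run schema of the query, checks it against the deployed table with Schema.compatible() and against schema.yaml with Schema.equal(), and treats skipped files or a missing schema file as valid (with a message). A one-line sketch, path illustrative:

    from bigquery_etl.dryrun import DryRun  # assumed import path

    ok = DryRun(
        "sql/moz-fx-data-shared-prod/telemetry_derived/clients_daily_v6/query.sql"  # illustrative
    ).validate_schema()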
@@ -446,47 +545,3 @@ def find_next_word(target, source):
         if w == target:
             # get the next word, and remove quotations from column name
             return split[i + 1].replace("'", "")
-
-
-def main():
-    """Dry run all SQL files in the project directories."""
-    parser = ArgumentParser(description=main.__doc__)
-    parser.add_argument(
-        "paths",
-        metavar="PATH",
-        nargs="*",
-        help="Paths to search for queries to dry run. CI passes 'sql' on the default "
-        "branch, and the paths that have been modified since branching otherwise",
-    )
-    args = parser.parse_args()
-
-    file_names = ("query.sql", "view.sql", "part*.sql", "init.sql")
-    file_re = re.compile("|".join(map(fnmatch.translate, file_names)))
-
-    sql_files: Set[str] = set()
-    for path in args.paths:
-        if isdir(path):
-            sql_files |= {
-                sql_file
-                for pattern in file_names
-                for sql_file in glob.glob(f"{path}/**/{pattern}", recursive=True)
-            }
-        elif file_re.fullmatch(basename(path)):
-            sql_files.add(path)
-    sql_files -= SKIP
-
-    if not sql_files:
-        print("Skipping dry run because no queries matched")
-        sys.exit(0)
-
-    with Pool(8) as p:
-        result = p.map(sql_file_valid, sorted(sql_files), chunksize=1)
-    if all(result):
-        exitcode = 0
-    else:
-        exitcode = 1
-    sys.exit(exitcode)
-
-
-if __name__ == "__main__":
-    main()
@@ -1,16 +1,16 @@
 """Query schema."""

-from google.cloud import bigquery
 import json
+import os
 from pathlib import Path
 from tempfile import NamedTemporaryFile
-from typing import Any, Dict, Optional, List
+from typing import Any, Dict, List, Optional

 import attr
-import os
 import yaml
+from google.cloud import bigquery

-from bigquery_etl.dryrun import DryRun
+from .. import dryrun

 SCHEMA_FILE = "schema.yaml"

@@ -27,7 +27,7 @@ class Schema:
         if not query_file.is_file() or query_file.suffix != ".sql":
             raise Exception(f"{query_file} is not a valid SQL file.")

-        schema = DryRun(str(query_file), content=content).get_schema()
+        schema = dryrun.DryRun(str(query_file), content=content).get_schema()
         return cls(schema)

     @classmethod
@@ -55,7 +55,7 @@ class Schema:

         try:
             return cls(
-                DryRun(
+                dryrun.DryRun(
                     os.path.join(project, dataset, table, "query.sql"), query
                 ).get_schema()
             )
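Note: in the schema module (presumably bigquery_etl/schema/__init__.py) the imports are reordered and DryRun is now reached through the module (from .. import dryrun, then dryrun.DryRun(...)) rather than imported directly, presumably because bigquery_etl.dryrun itself now imports Schema and a direct import would be circular.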
@@ -5,4 +5,4 @@

 cd "$(dirname "$0")/.."

-exec python3 -m bigquery_etl.dryrun "$@"
+script/bqetl dryrun "$@"
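Note: with the argparse main() removed in the previous file, script/dryrun simply delegates to the click CLI, so flags such as --validate-schemas pass straight through to bqetl dryrun; that is what the CI command in the first hunk relies on.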