Add bqetl view validate command

Parent: 6733000e25
Commit: 62c85cb36c
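In summary, this commit adds a `bqetl view validate` subcommand (invoked as `./bqetl view validate telemetry.clients_daily`), moves the shared CLI helpers out of `bigquery_etl/cli/utils.py` into a new `cli.common` module, generalizes `_queries_matching_name_pattern` into `paths_matching_name_pattern` (which now also matches `query.py` and `view.sql` files), and introduces a `View` class encapsulating validation and publishing of `view.sql` definitions.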
@@ -0,0 +1,146 @@
+"""Utility functions used by the CLI."""
+
+import os
+import fnmatch
+from fnmatch import fnmatchcase
+from pathlib import Path
+
+import click
+import re
+from google.cloud import bigquery
+
+from bigquery_etl.util.common import project_dirs
+
+QUERY_FILE_RE = re.compile(
+    r"^.*/([a-zA-Z0-9-]+)/([a-zA-Z0-9_]+)/([a-zA-Z0-9_]+(_v[0-9]+)?)/"
+    r"(?:query\.sql|part1\.sql|script\.sql|query\.py|view\.sql)$"
+)
+
+
+def is_valid_dir(ctx, param, value):
+    """Check if the parameter provided via click is an existing directory."""
+    if not os.path.isdir(value) or not os.path.exists(value):
+        raise click.BadParameter(f"Invalid directory path to {value}")
+    return value
+
+
+def is_valid_file(ctx, param, value):
+    """Check if the parameter provided via click is an existing file."""
+    if not os.path.isfile(value) or not os.path.exists(value):
+        raise click.BadParameter(f"Invalid file path to {value}")
+    return value
+
+
+def is_authenticated(project_id=None):
+    """Check if the user is authenticated to GCP and can access the project."""
+    client = bigquery.Client()
+
+    if project_id:
+        return client.project == project_id
+
+    return True
+
+
+def is_valid_project(ctx, param, value):
+    """Check if the provided project_id corresponds to an existing project."""
+    if value is None or value in [Path(p).name for p in project_dirs()]:
+        return value
+    raise click.BadParameter(f"Invalid project {value}")
+
+
+def table_matches_patterns(pattern, invert, table):
+    """Check if tables match pattern."""
+    pattern = re.compile(fnmatch.translate(pattern))
+    return (pattern.match(table) is not None) != invert
+
+
+def paths_matching_name_pattern(pattern, sql_path, project_id, files=("*.sql",)):
+    """Return paths to queries matching the name pattern."""
+    matching_files = []
+
+    if pattern is None:
+        pattern = "*.*"
+
+    if os.path.isdir(pattern):
+        for root, _, _ in os.walk(pattern):
+            for file in files:
+                matching_files.extend(Path(root).rglob(file))
+    elif os.path.isfile(pattern):
+        for file in files:
+            matching_files.extend(Path(sql_path).rglob(file))
+    else:
+        sql_path = Path(sql_path)
+        if project_id is not None:
+            sql_path = sql_path / project_id
+
+        all_matching_files = []
+
+        for file in files:
+            all_matching_files.extend(Path(sql_path).rglob(file))
+
+        for query_file in all_matching_files:
+            match = QUERY_FILE_RE.match(str(query_file))
+            if match:
+                project = match.group(1)
+                dataset = match.group(2)
+                table = match.group(3)
+                query_name = f"{project}.{dataset}.{table}"
+                if fnmatchcase(query_name, f"*{pattern}"):
+                    matching_files.append(query_file)
+                elif project_id and fnmatchcase(query_name, f"{project_id}.{pattern}"):
+                    matching_files.append(query_file)
+
+    if len(matching_files) == 0:
+        print(f"No files matching: {pattern}")
+
+    return matching_files
+
+
+sql_dir_option = click.option(
+    "--sql_dir",
+    help="Path to directory which contains queries.",
+    type=click.Path(file_okay=False),
+    default="sql",
+    callback=is_valid_dir,
+)
+
+use_cloud_function_option = click.option(
+    "--use_cloud_function",
+    "--use-cloud-function",
+    help=(
+        "Use the Cloud Function for dry running SQL, if set to `True`. "
+        "The Cloud Function can only access tables in shared-prod. "
+        "If set to `False`, use active GCP credentials for the dry run."
+    ),
+    type=bool,
+    default=True,
+)
+
+parallelism_option = click.option(
+    "--parallelism",
+    "-p",
+    default=8,
+    type=int,
+    help="Number of threads for parallel processing",
+)
+
+
+def project_id_option(default=None):
+    """Generate a project-id option, with optional default."""
+    return click.option(
+        "--project-id",
+        "--project_id",
+        help="GCP project ID",
+        default=default,
+        callback=is_valid_project,
+    )
+
+
+def respect_dryrun_skip_option(default=True):
+    """Generate a respect_dryrun_skip option."""
+    flags = {True: "--respect-dryrun-skip", False: "--ignore-dryrun-skip"}
+    return click.option(
+        f"{flags[True]}/{flags[False]}",
+        help="Respect or ignore dry run SKIP configuration. "
+        f"Default is {flags[default]}.",
+        default=default,
+    )
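The relocated `paths_matching_name_pattern` keeps the old matching semantics but accepts a `files` tuple to restrict which file names are considered. A minimal sketch of typical calls, using argument values that appear in the tests further down this diff:

    from bigquery_etl.cli.common import paths_matching_name_pattern

    # Match every query file under sql/, across all projects:
    all_query_files = paths_matching_name_pattern("*", "sql/", None)

    # Match one query by dataset.table name within a specific project:
    query_files = paths_matching_name_pattern(
        "telemetry_derived.query_v1", "sql/", "moz-fx-data-shared-prod"
    )

    # Consider only view definitions:
    view_files = paths_matching_name_pattern(
        "telemetry.clients_daily", "sql/", None, files=("view.sql",)
    )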
@@ -7,7 +7,7 @@ from pathlib import Path
 import click
 import yaml

-from ..cli.utils import is_valid_dir, is_valid_file
+from ..cli.common import is_valid_dir, is_valid_file
 from ..metadata.parse_metadata import METADATA_FILE, Metadata
 from ..query_scheduling.dag import Dag
 from ..query_scheduling.dag_collection import DagCollection
@@ -11,7 +11,7 @@ from typing import Set
 import click
 from google.cloud import bigquery

-from ..cli.utils import is_authenticated
+from ..cli.common import is_authenticated
 from ..dryrun import SKIP, DryRun

@@ -4,7 +4,7 @@ from multiprocessing.pool import ThreadPool
 from pathlib import Path
 import click

-from ..cli.utils import (
+from ..cli.common import (
     is_valid_project,
     table_matches_patterns,
 )
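`table_matches_patterns`, also relocated to `cli.common`, translates a shell-style glob into a regex and optionally inverts the match; a small sketch with illustrative table names:

    from bigquery_etl.cli.common import table_matches_patterns

    table_matches_patterns("telemetry*", False, "telemetry_derived")  # True
    table_matches_patterns("telemetry*", True, "telemetry_derived")   # False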
@@ -5,7 +5,6 @@ import re
 import string
 import sys
 from datetime import date, timedelta
-from fnmatch import fnmatchcase
 from functools import partial
 from multiprocessing.pool import Pool
 from pathlib import Path
@@ -16,7 +16,15 @@ from google.cloud import bigquery
 from google.cloud.exceptions import NotFound

 from ..cli.format import format
-from ..cli.utils import is_authenticated, is_valid_dir, is_valid_project
+from ..cli.common import (
+    is_authenticated,
+    is_valid_project,
+    sql_dir_option,
+    use_cloud_function_option,
+    paths_matching_name_pattern,
+    project_id_option,
+    respect_dryrun_skip_option,
+)
 from ..dependency import get_dependency_graph
 from ..dryrun import SKIP, DryRun
 from ..format_sql.formatter import reformat
@@ -31,80 +39,9 @@ from ..util.common import random_str
 from .dryrun import dryrun

 QUERY_NAME_RE = re.compile(r"(?P<dataset>[a-zA-z0-9_]+)\.(?P<name>[a-zA-z0-9_]+)")
-SQL_FILE_RE = re.compile(
-    r"^.*/([a-zA-Z0-9-]+)/([a-zA-Z0-9_]+)/([a-zA-Z0-9_]+_v[0-9]+)/"
-    r"(?:query\.sql|part1\.sql|script\.sql)$"
-)
 VERSION_RE = re.compile(r"_v[0-9]+")


-def _queries_matching_name_pattern(pattern, sql_path, project_id):
-    """Return paths to queries matching the name pattern."""
-    sql_path = Path(sql_path)
-    if project_id is not None:
-        sql_path = sql_path / project_id
-
-    all_sql_files = Path(sql_path).rglob("*.sql")
-    sql_files = []
-
-    for sql_file in all_sql_files:
-        match = SQL_FILE_RE.match(str(sql_file))
-        if match:
-            project = match.group(1)
-            dataset = match.group(2)
-            table = match.group(3)
-            query_name = f"{project}.{dataset}.{table}"
-            if fnmatchcase(query_name, f"*{pattern}"):
-                sql_files.append(sql_file)
-            elif project_id and fnmatchcase(query_name, f"{project_id}.{pattern}"):
-                sql_files.append(sql_file)
-
-    return sql_files
-
-
-sql_dir_option = click.option(
-    "--sql_dir",
-    help="Path to directory which contains queries.",
-    type=click.Path(file_okay=False),
-    default="sql",
-    callback=is_valid_dir,
-)
-
-use_cloud_function_option = click.option(
-    "--use_cloud_function",
-    "--use-cloud-function",
-    help=(
-        "Use the Cloud Function for dry running SQL, if set to `True`. "
-        "The Cloud Function can only access tables in shared-prod. "
-        "If set to `False`, use active GCP credentials for the dry run."
-    ),
-    type=bool,
-    default=True,
-)
-
-
-def respect_dryrun_skip_option(default=True):
-    """Generate a respect_dryrun_skip option."""
-    flags = {True: "--respect-dryrun-skip", False: "--ignore-dryrun-skip"}
-    return click.option(
-        f"{flags[True]}/{flags[False]}",
-        help="Respect or ignore dry run SKIP configuration. "
-        f"Default is {flags[default]}.",
-        default=default,
-    )
-
-
-def project_id_option(default=None):
-    """Generate a project-id option, with optional default."""
-    return click.option(
-        "--project-id",
-        "--project_id",
-        help="GCP project ID",
-        default=default,
-        callback=is_valid_project,
-    )
-
-
 @click.group(help="Commands for managing queries.")
 def query():
     """Create the CLI group for the query command."""
@@ -315,7 +252,7 @@ def create(name, sql_dir, project_id, owner, init):
 )
 def schedule(name, sql_dir, project_id, dag, depends_on_past, task_name):
     """CLI command for scheduling a query."""
-    query_files = _queries_matching_name_pattern(name, sql_dir, project_id)
+    query_files = paths_matching_name_pattern(name, sql_dir, project_id)

     if query_files == []:
         click.echo(f"Name doesn't refer to any queries: {name}", err=True)
@@ -413,7 +350,7 @@ def info(name, sql_dir, project_id, cost, last_updated):
     if name is None:
         name = "*.*"

-    query_files = _queries_matching_name_pattern(name, sql_dir, project_id)
+    query_files = paths_matching_name_pattern(name, sql_dir, project_id)

     for query_file in query_files:
         query_file_path = Path(query_file)
@@ -619,7 +556,7 @@ def backfill(
         )
         sys.exit(1)

-    query_files = _queries_matching_name_pattern(name, sql_dir, project_id)
+    query_files = paths_matching_name_pattern(name, sql_dir, project_id)
     dates = [start_date + timedelta(i) for i in range((end_date - start_date).days + 1)]

     for query_file in query_files:
@@ -709,7 +646,7 @@ def validate(
     if name is None:
         name = "*.*"

-    query_files = _queries_matching_name_pattern(name, sql_dir, project_id)
+    query_files = paths_matching_name_pattern(name, sql_dir, project_id)
     dataset_dirs = set()
     for query in query_files:
         project = query.parent.parent.parent.name
@@ -755,7 +692,7 @@ def initialize(name, sql_dir, project_id, dry_run):
         # allow name to be a path
         query_files = [Path(name)]
     else:
-        query_files = _queries_matching_name_pattern(name, sql_dir, project_id)
+        query_files = paths_matching_name_pattern(name, sql_dir, project_id)

     for query_file in query_files:
         init_files = Path(query_file.parent).rglob("init.sql")
@@ -846,7 +783,7 @@ def update(
             "and check that the project is set correctly."
         )
         sys.exit(1)
-    query_files = _queries_matching_name_pattern(name, sql_dir, project_id)
+    query_files = paths_matching_name_pattern(name, sql_dir, project_id)
     dependency_graph = get_dependency_graph([sql_dir], without_views=True)
     tmp_tables = {}

@@ -886,7 +823,7 @@ def update(
     dependencies = [
         p
         for k, refs in dependency_graph.items()
-        for p in _queries_matching_name_pattern(k, sql_dir, project_id)
+        for p in paths_matching_name_pattern(k, sql_dir, project_id)
         if identifier in refs
     ]

@@ -934,7 +871,7 @@ def _update_query_schema(
     for derived_from in metadata.schema.derived_from:
         parent_queries = [
             query
-            for query in _queries_matching_name_pattern(
+            for query in paths_matching_name_pattern(
                 ".".join(derived_from.table), sql_dir, project_id
             )
         ]
@@ -1112,7 +1049,7 @@ def deploy(
         sys.exit(1)
     client = bigquery.Client()

-    query_files = _queries_matching_name_pattern(name, sql_dir, project_id)
+    query_files = paths_matching_name_pattern(name, sql_dir, project_id)

     for query_file in query_files:
         if respect_dryrun_skip and str(query_file) in SKIP:
@@ -1203,7 +1140,7 @@ def deploy(
 @respect_dryrun_skip_option(default=True)
 def validate_schema(name, sql_dir, project_id, use_cloud_function, respect_dryrun_skip):
     """Validate the defined query schema against the query and destination table."""
-    query_files = _queries_matching_name_pattern(name, sql_dir, project_id)
+    query_files = paths_matching_name_pattern(name, sql_dir, project_id)

     def _validate_schema(query_file_path):
         return (
@@ -14,7 +14,7 @@ import pytest
 import yaml

 from ..cli.format import format
-from ..cli.utils import is_authenticated, is_valid_dir, is_valid_project
+from ..cli.common import is_authenticated, is_valid_dir, is_valid_project
 from ..docs import validate_docs
 from ..format_sql.formatter import reformat
 from ..routine import publish_routines
@@ -1,48 +0,0 @@
-"""Utility functions used by the CLI."""
-
-import os
-import fnmatch
-from pathlib import Path
-
-import click
-import re
-from google.cloud import bigquery
-
-from bigquery_etl.util.common import project_dirs
-
-
-def is_valid_dir(ctx, param, value):
-    """Check if the parameter provided via click is an existing directory."""
-    if not os.path.isdir(value) or not os.path.exists(value):
-        raise click.BadParameter(f"Invalid directory path to {value}")
-    return value
-
-
-def is_valid_file(ctx, param, value):
-    """Check if the parameter provided via click is an existing file."""
-    if not os.path.isfile(value) or not os.path.exists(value):
-        raise click.BadParameter(f"Invalid file path to {value}")
-    return value
-
-
-def is_authenticated(project_id=None):
-    """Check if the user is authenticated to GCP and can access the project."""
-    client = bigquery.Client()
-
-    if project_id:
-        return client.project == project_id
-
-    return True
-
-
-def is_valid_project(ctx, param, value):
-    """Check if the provided project_id corresponds to an existing project."""
-    if value is None or value in [Path(p).name for p in project_dirs()]:
-        return value
-    raise click.BadParameter(f"Invalid project {value}")
-
-
-def table_matches_patterns(pattern, invert, table):
-    """Check if tables match pattern."""
-    pattern = re.compile(fnmatch.translate(pattern))
-    return (pattern.match(table) is not None) != invert
@@ -1,7 +1,19 @@
 """bigquery-etl CLI view command."""
 import click
+import sys

-from bigquery_etl.view import publish_views
+from multiprocessing.pool import Pool
+
+from ..view import View
+from .dryrun import dryrun
+from ..cli.common import (
+    sql_dir_option,
+    use_cloud_function_option,
+    parallelism_option,
+    paths_matching_name_pattern,
+    project_id_option,
+    respect_dryrun_skip_option,
+)


 @click.group(help="Commands for managing views.")
@@ -10,4 +22,120 @@ def view():
     pass


-view.add_command(publish_views.main, "publish")
+@view.command(
+    help="""Validate a view.
+    Checks formatting, naming, references and dry runs the view.
+
+    Examples:
+
+    ./bqetl view validate telemetry.clients_daily
+    """
+)
+@click.argument("name", required=False)
+@sql_dir_option
+@project_id_option()
+@use_cloud_function_option
+@click.option(
+    "--validate_schemas",
+    "--validate-schemas",
+    help="Require dry run schema to match destination table and file if present.",
+    is_flag=True,
+    default=False,
+)
+@parallelism_option
+@respect_dryrun_skip_option()
+@click.pass_context
+def validate(
+    ctx,
+    name,
+    sql_dir,
+    project_id,
+    use_cloud_function,
+    validate_schemas,
+    parallelism,
+    respect_dryrun_skip,
+):
+    """Validate the view definition."""
+    view_files = paths_matching_name_pattern(
+        name, sql_dir, project_id, files=("*view.sql",)
+    )
+    views = [View.from_file(f) for f in view_files]
+
+    with Pool(parallelism) as p:
+        result = p.map(_view_is_valid, views, chunksize=1)
+    if not all(result):
+        sys.exit(1)
+
+    # dry run views
+    ctx.invoke(
+        dryrun,
+        paths=[str(f) for f in view_files],
+        use_cloud_function=use_cloud_function,
+        project=project_id,
+        validate_schemas=validate_schemas,
+        respect_skip=respect_dryrun_skip,
+    )
+
+    click.echo("All views are valid.")
+
+
+def _view_is_valid(view):
+    return view.is_valid()
+
+
+@view.command(
+    help="""Publish views.
+
+    Examples:
+
+    # Publish all views
+    ./bqetl view publish
+
+    # Publish a specific view
+    ./bqetl view publish telemetry.clients_daily
+    """
+)
+@click.argument("name", required=False)
+@sql_dir_option
+@project_id_option()
+@click.option(
+    "--target-project",
+    help=(
+        "If specified, create views in the target project rather than"
+        " the project specified in the file. Only views for"
+        " moz-fx-data-shared-prod will be published if this is set."
+    ),
+)
+@click.option("--log-level", default="INFO", help="Defaults to INFO")
+@parallelism_option
+@click.option(
+    "--dry_run",
+    "--dry-run",
+    is_flag=True,
+    help="Validate view definitions, but do not publish them.",
+)
+@click.option(
+    "--user-facing-only",
+    "--user_facing_only",
+    is_flag=True,
+    help=(
+        "Publish user-facing views only. User-facing views are views"
+        " part of datasets without suffixes (such as telemetry,"
+        " but not telemetry_derived)."
+    ),
+)
+def publish(
+    name,
+    sql_dir,
+    project_id,
+    target_project,
+    log_level,
+    parallelism,
+    dry_run,
+    user_facing_only,
+):
+    """Publish views."""
+    view_files = paths_matching_name_pattern(
+        name, sql_dir, project_id, files=("view.sql",)
+    )
+
+    views = [View.from_file(f) for f in view_files]
+
+    with Pool(parallelism) as p:
+        result = p.map(_publish_view, views, chunksize=1)
+    if not all(result):
+        sys.exit(1)
+
+    click.echo("All views have been published.")
+
+
+def _publish_view(view):
+    return view.publish()
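The new `validate` command can be exercised programmatically the same way the existing CLI tests drive commands; a minimal sketch using click's standard test runner (only `validate` itself comes from this diff):

    from click.testing import CliRunner
    from bigquery_etl.cli.view import validate

    runner = CliRunner()
    result = runner.invoke(validate, ["telemetry.clients_daily"])
    assert result.exit_code == 0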
@@ -9,7 +9,7 @@ from typing import Dict, Iterator, List, Tuple
 import click
 import yaml

-from .view.generate_stable_views import get_stable_table_schemas
+from bigquery_etl.view.generate_stable_views import get_stable_table_schemas

 stable_views = None

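Worth noting: `bigquery_etl.dependency` imports from the `view` package, while the new `View` class (below) needs `extract_table_references` from `bigquery_etl.dependency`; presumably that is why the latter import is deferred into a method body in the new module, sidestepping a circular import between the two packages.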
@@ -0,0 +1,125 @@
+"""Represents a SQL view."""
+
+import attr
+import sqlparse
+
+from google.cloud import bigquery
+from pathlib import Path
+
+from bigquery_etl.util import extract_from_query_path
+
+
+# skip validation for these views
+SKIP_VALIDATION = {
+    # not matching directory structure, but created before validation was enforced
+    "sql/moz-fx-data-shared-prod/stripe/subscription/view.sql",
+    "sql/moz-fx-data-shared-prod/stripe/product/view.sql",
+    "sql/moz-fx-data-shared-prod/stripe/plan/view.sql",
+    "sql/moz-fx-data-shared-prod/telemetry/client_probe_counts_v1/view.sql",
+    "sql/moz-fx-data-shared-prod/telemetry/clients_daily_histogram_aggregates_v1/view.sql",
+    "sql/moz-fx-data-shared-prod/telemetry/clients_scalar_aggregates_v1/view.sql",
+    "sql/moz-fx-data-shared-prod/telemetry/clients_daily_scalar_aggregates_v1/view.sql",
+    "sql/moz-fx-data-shared-prod/telemetry/clients_histogram_aggregates_v1/view.sql",
+    "sql/moz-fx-data-shared-prod/telemetry/clients_probe_processes/view.sql",
+}
+
+# skip publishing these views
+SKIP_PUBLISHING = {
+    # Access Denied
+    "activity_stream/tile_id_types/view.sql",
+    "pocket/pocket_reach_mau/view.sql",
+    "telemetry/buildhub2/view.sql",
+    # Dataset glam-fenix-dev:glam_etl was not found
+    # TODO: this should be removed if views are to be automatically deployed
+    *[str(path) for path in Path("sql/glam-fenix-dev").glob("glam_etl/**/view.sql")],
+}
+
+# suffixes of datasets with non-user-facing views
+NON_USER_FACING_DATASET_SUFFIXES = (
+    "_derived",
+    "_external",
+    "_bi",
+    "_restricted",
+)
+
+
+@attr.s(auto_attribs=True)
+class View:
+    """Representation of a SQL view stored in a view.sql file."""
+
+    path: str = attr.ib()
+    name: str = attr.ib()
+    dataset: str = attr.ib()
+    project: str = attr.ib()
+
+    # todo: validators
+
+    def content(self):
+        """Return the view SQL."""
+        return Path(self.path).read_text()
+
+    @classmethod
+    def from_file(cls, path):
+        """View from SQL file."""
+        project, dataset, name = extract_from_query_path(path)
+        return cls(path=str(path), name=name, dataset=dataset, project=project)
+
+    def is_valid(self):
+        """Validate the SQL view definition."""
+        if self.path in SKIP_VALIDATION:
+            print(f"Skipped validation for {self.path}")
+            return True
+        return self._valid_fully_qualified_references() and self._valid_view_naming()
+
+    def _valid_fully_qualified_references(self):
+        """Check that referenced tables and views are fully qualified."""
+        from bigquery_etl.dependency import extract_table_references
+
+        for table in extract_table_references(self.content()):
+            if len(table.split(".")) < 3:
+                print(f"{self.path} ERROR\n{table} missing project_id qualifier")
+                return False
+        return True
+
+    def _valid_view_naming(self):
+        """Validate that the created view naming matches the directory structure."""
+        parsed = sqlparse.parse(self.content())[0]
+        tokens = [
+            t
+            for t in parsed.tokens
+            if not (t.is_whitespace or isinstance(t, sqlparse.sql.Comment))
+        ]
+        is_view_statement = (
+            " ".join(tokens[0].normalized.split()) == "CREATE OR REPLACE"
+            and tokens[1].normalized == "VIEW"
+        )
+        if is_view_statement:
+            target_view = str(tokens[2]).strip().split()[0]
+            try:
+                [project_id, dataset_id, view_id] = target_view.replace("`", "").split(
+                    "."
+                )
+                if not (
+                    self.name == view_id
+                    and self.dataset == dataset_id
+                    and self.project == project_id
+                ):
+                    print(
+                        f"{self.path} ERROR\n"
+                        f"View name {target_view} not matching directory structure."
+                    )
+                    return False
+            except Exception:
+                print(f"{self.path} ERROR\n{target_view} missing project ID qualifier.")
+                return False
+        else:
+            print(
+                f"ERROR: {self.path} does not appear to be "
+                "a CREATE OR REPLACE VIEW statement! Quitting..."
+            )
+            return False
+        return True
+
+    def publish(self):
+        """Publish this view to BigQuery."""
+        client = bigquery.Client()
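A short sketch of the `View` API introduced above, assuming the usual sql/<project>/<dataset>/<name>/view.sql layout (the concrete path is illustrative):

    from pathlib import Path
    from bigquery_etl.view import View

    view = View.from_file(
        Path("sql/moz-fx-data-shared-prod/telemetry/clients_daily/view.sql")
    )
    assert view.project == "moz-fx-data-shared-prod"
    assert view.dataset == "telemetry"
    assert view.name == "clients_daily"

    if view.is_valid():
        view.publish()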
@@ -5,7 +5,7 @@ import yaml
 from click.testing import CliRunner

 from bigquery_etl.cli.query import (
-    _queries_matching_name_pattern,
+    paths_matching_name_pattern,
     create,
     info,
     schedule,
@@ -285,7 +285,7 @@ class TestQuery:
         assert "telemetry_derived.query_v2" in result.output
         assert "telemetry_derived.query_v1" not in result.output

-    def test_queries_matching_name_pattern(self, runner):
+    def test_paths_matching_name_pattern(self, runner):
         with runner.isolated_filesystem():
             os.makedirs("sql/moz-fx-data-shared-prod/telemetry_derived/query_v1")
             with open(
@@ -311,10 +311,10 @@ class TestQuery:
             ) as f:
                 f.write("SELECT 1")

-            assert len(_queries_matching_name_pattern("*", "sql/", None)) == 4
+            assert len(paths_matching_name_pattern("*", "sql/", None)) == 4
             assert (
                 len(
-                    _queries_matching_name_pattern(
+                    paths_matching_name_pattern(
                         "*.sql", "sql/", "moz-fx-data-shared-prod"
                     )
                 )
@@ -322,7 +322,7 @@ class TestQuery:
             )
             assert (
                 len(
-                    _queries_matching_name_pattern(
+                    paths_matching_name_pattern(
                         "test", "sql/", "moz-fx-data-shared-prod"
                     )
                 )
@@ -330,7 +330,7 @@ class TestQuery:
             )
             assert (
                 len(
-                    _queries_matching_name_pattern(
+                    paths_matching_name_pattern(
                         "foo_derived", "sql/", "moz-fx-data-shared-prod"
                     )
                 )
@@ -338,16 +338,16 @@ class TestQuery:
             )
             assert (
                 len(
-                    _queries_matching_name_pattern(
+                    paths_matching_name_pattern(
                         "foo_derived*", "sql/", "moz-fx-data-shared-prod"
                     )
                 )
                 == 1
             )
-            assert len(_queries_matching_name_pattern("*query*", "sql/", None)) == 4
+            assert len(paths_matching_name_pattern("*query*", "sql/", None)) == 4
             assert (
                 len(
-                    _queries_matching_name_pattern(
+                    paths_matching_name_pattern(
                         "foo_derived.query_v2", "sql/", "moz-fx-data-shared-prod"
                     )
                 )
@@ -356,7 +356,7 @@ class TestQuery:

             assert (
                 len(
-                    _queries_matching_name_pattern(
+                    paths_matching_name_pattern(
                         "telemetry_derived.query_v1", "sql/", "moz-fx-data-test-project"
                     )
                 )
@@ -365,7 +365,7 @@ class TestQuery:

             assert (
                 len(
-                    _queries_matching_name_pattern(
+                    paths_matching_name_pattern(
                         "moz-fx-data-test-project.telemetry_derived.query_v1",
                         "sql/",
                         None,
@@ -376,7 +376,7 @@ class TestQuery:

             assert (
                 len(
-                    _queries_matching_name_pattern(
+                    paths_matching_name_pattern(
                         "moz-fx-data-test-project.telemetry_derived.*", "sql/", None
                     )
                 )
@@ -3,7 +3,7 @@ from pathlib import Path
 import pytest
 from click.exceptions import BadParameter

-from bigquery_etl.cli.utils import (
+from bigquery_etl.cli.common import (
     is_authenticated,
     is_valid_dir,
     is_valid_file,