254 строки
7.8 KiB
Python
254 строки
7.8 KiB
Python
"""Utility functions used by the CLI."""
|
|
|
|
import fnmatch
|
|
import os
|
|
import re
|
|
from fnmatch import fnmatchcase
|
|
from glob import glob
|
|
from pathlib import Path
|
|
from typing import Iterator, List, Optional, Tuple
|
|
|
|
import click
|
|
from google.auth.exceptions import DefaultCredentialsError
|
|
from google.cloud import bigquery
|
|
|
|
from bigquery_etl.config import ConfigLoader
|
|
from bigquery_etl.util.common import TempDatasetReference, project_dirs
|
|
|
|
QUERY_FILE_RE = re.compile(
|
|
r"^.*/([a-zA-Z0-9-]+)/([a-zA-Z0-9_]+)/([a-zA-Z0-9_]+(_v[0-9]+)?)/"
|
|
r"(?:query\.sql|part1\.sql|script\.sql|query\.py|view\.sql|metadata\.yaml|backfill\.yaml)$"
|
|
)
|
|
CHECKS_FILE_RE = re.compile(
|
|
r"^.*/([a-zA-Z0-9-]+)/([a-zA-Z0-9_]+)/([a-zA-Z0-9_]+(_v[0-9]+)?)/"
|
|
r"(?:checks\.sql)$"
|
|
)
|
|
|
|
|
|
def is_valid_dir(ctx, param, value):
|
|
"""Check if the parameter provided via click is an existing directory."""
|
|
if not os.path.isdir(value) or not os.path.exists(value):
|
|
raise click.BadParameter(f"Invalid directory path to {value}")
|
|
return value
|
|
|
|
|
|
def is_valid_file(ctx, param, value):
|
|
"""Check if the parameter provided via click is an existing file."""
|
|
if not os.path.isfile(value) or not os.path.exists(value):
|
|
raise click.BadParameter(f"Invalid file path to {value}")
|
|
return value
|
|
|
|
|
|
def is_authenticated():
|
|
"""Check if the user is authenticated to GCP."""
|
|
try:
|
|
bigquery.Client(project="")
|
|
except DefaultCredentialsError:
|
|
return False
|
|
return True
|
|
|
|
|
|
def is_valid_project(ctx, param, value):
|
|
"""Check if the provided project_id corresponds to an existing project."""
|
|
if (
|
|
value is None
|
|
or value
|
|
in [Path(p).name for p in project_dirs()]
|
|
+ [
|
|
ConfigLoader.get("default", "test_project"),
|
|
ConfigLoader.get("default", "user_facing_project", fallback="mozdata"),
|
|
]
|
|
+ ConfigLoader.get("default", "additional_projects", fallback=[])
|
|
or value.startswith(ConfigLoader.get("default", "backfill_project"))
|
|
):
|
|
return value
|
|
raise click.BadParameter(f"Invalid project {value}")
|
|
|
|
|
|
def table_matches_patterns(pattern, invert, table):
|
|
"""Check if tables match pattern."""
|
|
if not isinstance(pattern, list):
|
|
pattern = [pattern]
|
|
|
|
matching = False
|
|
for p in pattern:
|
|
compiled_pattern = re.compile(fnmatch.translate(p))
|
|
if compiled_pattern.match(table) is not None:
|
|
matching = True
|
|
break
|
|
|
|
return matching != invert
|
|
|
|
|
|
def paths_matching_checks_pattern(
|
|
pattern: str, sql_path: Optional[str], project_id: Optional[str]
|
|
) -> Iterator[Tuple[Path, str, str, str]]:
|
|
"""Return single path to checks.sql matching the name pattern."""
|
|
checks_files = paths_matching_name_pattern(
|
|
pattern, sql_path, project_id, ["checks.sql"], CHECKS_FILE_RE
|
|
)
|
|
|
|
if checks_files:
|
|
for checks_file in checks_files:
|
|
match = CHECKS_FILE_RE.match(str(checks_file))
|
|
if match:
|
|
project = match.group(1)
|
|
dataset = match.group(2)
|
|
table = match.group(3)
|
|
yield checks_file, project, dataset, table
|
|
else:
|
|
print(f"No checks.sql file found in {sql_path}/{project_id}/{pattern}")
|
|
|
|
|
|
def paths_matching_name_pattern(
|
|
pattern, sql_path, project_id, files=["*.sql"], file_regex=QUERY_FILE_RE
|
|
) -> List[Path]:
|
|
"""Return paths to queries matching the name pattern."""
|
|
matching_files: List[Path] = []
|
|
|
|
if pattern is None:
|
|
pattern = "*.*"
|
|
|
|
# click nargs are passed in as a tuple
|
|
if isinstance(pattern, tuple) or isinstance(pattern, list):
|
|
for p in pattern:
|
|
matching_files += paths_matching_name_pattern(
|
|
str(p), sql_path, project_id, files, file_regex
|
|
)
|
|
elif os.path.isdir(pattern):
|
|
for root, _, _ in os.walk(pattern, followlinks=True):
|
|
for file in files:
|
|
matching_files.extend(
|
|
map(Path, glob(f"{root}/**/{file}", recursive=True))
|
|
)
|
|
elif os.path.isfile(pattern):
|
|
matching_files.append(Path(pattern))
|
|
else:
|
|
sql_path = Path(sql_path)
|
|
if project_id is not None:
|
|
sql_path = sql_path / project_id
|
|
|
|
all_matching_files: List[Path] = []
|
|
|
|
for file in files:
|
|
all_matching_files.extend(
|
|
map(Path, glob(f"{sql_path}/**/{file}", recursive=True))
|
|
)
|
|
|
|
for query_file in all_matching_files:
|
|
match = file_regex.match(str(query_file))
|
|
if match:
|
|
project = match.group(1)
|
|
dataset = match.group(2)
|
|
table = match.group(3)
|
|
query_name = f"{project}.{dataset}.{table}"
|
|
if fnmatchcase(query_name, f"*{pattern}"):
|
|
matching_files.append(query_file)
|
|
elif project_id and fnmatchcase(query_name, f"{project_id}.{pattern}"):
|
|
matching_files.append(query_file)
|
|
|
|
if len(matching_files) == 0:
|
|
print(f"No files matching: {pattern}")
|
|
|
|
return matching_files
|
|
|
|
|
|
sql_dir_option = click.option(
|
|
"--sql_dir",
|
|
"--sql-dir",
|
|
help="Path to directory which contains queries.",
|
|
type=click.Path(file_okay=False),
|
|
default=ConfigLoader.get("default", "sql_dir", fallback="sql"),
|
|
callback=is_valid_dir,
|
|
)
|
|
|
|
|
|
use_cloud_function_option = click.option(
|
|
"--use_cloud_function",
|
|
"--use-cloud-function",
|
|
help=(
|
|
"Use the Cloud Function for dry running SQL, if set to `True`. "
|
|
"The Cloud Function can only access tables in shared-prod. "
|
|
"If set to `False`, use active GCP credentials for the dry run."
|
|
),
|
|
type=bool,
|
|
default=True,
|
|
)
|
|
|
|
|
|
def parallelism_option(default=8):
|
|
"""Generate a parallelism option, with optional default."""
|
|
return click.option(
|
|
"--parallelism",
|
|
"-p",
|
|
default=default,
|
|
type=int,
|
|
help="Number of threads for parallel processing",
|
|
)
|
|
|
|
|
|
def project_id_option(default=None, required=False):
|
|
"""Generate a project-id option, with optional default."""
|
|
return click.option(
|
|
"--project-id",
|
|
"--project_id",
|
|
help="GCP project ID",
|
|
default=default,
|
|
callback=is_valid_project,
|
|
required=required,
|
|
)
|
|
|
|
|
|
def billing_project_option(default=None, required=False):
|
|
"""Generate a billing-project option, with optional default."""
|
|
return click.option(
|
|
"--billing-project",
|
|
"--billing_project",
|
|
help=(
|
|
"GCP project ID to run the query in. "
|
|
"This can be used to run a query using a different slot reservation "
|
|
"than the one used by the query's default project."
|
|
),
|
|
type=str,
|
|
default=default,
|
|
required=required,
|
|
)
|
|
|
|
|
|
def respect_dryrun_skip_option(default=True):
|
|
"""Generate a respect_dryrun_skip option."""
|
|
flags = {True: "--respect-dryrun-skip", False: "--ignore-dryrun-skip"}
|
|
return click.option(
|
|
f"{flags[True]}/{flags[False]}",
|
|
help="Respect or ignore dry run skip configuration. "
|
|
f"Default is {flags[default]}.",
|
|
default=default,
|
|
)
|
|
|
|
|
|
def no_dryrun_option(default=False):
|
|
"""Generate a skip_dryrun option."""
|
|
return click.option(
|
|
"--no-dryrun",
|
|
"--no_dryrun",
|
|
help="Skip running dryrun. " f"Default is {default}.",
|
|
default=default,
|
|
is_flag=True,
|
|
)
|
|
|
|
|
|
def temp_dataset_option(
|
|
default=f"{ConfigLoader.get('default', 'project', fallback='moz-fx-data-shared-prod')}.tmp",
|
|
):
|
|
"""Generate a temp-dataset option."""
|
|
return click.option(
|
|
"--temp-dataset",
|
|
"--temp_dataset",
|
|
"--temporary-dataset",
|
|
"--temporary_dataset",
|
|
default=default,
|
|
type=TempDatasetReference.from_string,
|
|
help="Dataset where intermediate query results will be temporarily stored, "
|
|
"formatted as PROJECT_ID.DATASET_ID",
|
|
)
|