Move routine config to bqetl_project.yaml (#4038)

Anna Scholtz 2023-07-11 10:52:48 -07:00 committed by GitHub
Parent 8ccd05433a
Commit 8d72cfa9fe
No known key found for this signature
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 89 additions and 63 deletions
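The pattern throughout this commit: hard-coded module-level constants (project IDs, GCS buckets, directory names) become lookups into bqetl_project.yaml via ConfigLoader.get, which takes a variadic key path and an optional fallback. Below is a minimal sketch of the lookup semantics implied by the call sites in this diff; the real ConfigLoader also handles project-dir discovery and caching, so treat this implementation as an assumption, not the actual code.

from pathlib import Path

import yaml


class ConfigLoader:
    """Sketch of the config lookup semantics used in this diff (assumed)."""

    _project_dir = Path(".")
    _config = None

    @classmethod
    def set_project_dir(cls, path):
        cls._project_dir = Path(path)
        cls._config = None  # force a reload on the next get()

    @classmethod
    def get(cls, *keys, fallback=None):
        if cls._config is None:
            cls._config = yaml.safe_load(
                (cls._project_dir / "bqetl_project.yaml").read_text()
            )
        value = cls._config
        # walk the nested key path, e.g. ("routine", "publish", "gcs_bucket")
        for key in keys:
            if not isinstance(value, dict) or key not in value:
                return fallback
            value = value[key]
        return value

Called with no keys, get() returns the whole parsed config, which matches the assert "dry_run" in ConfigLoader.get() check in the tests at the end of this diff.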

View file

@@ -14,7 +14,13 @@ import pytest
import yaml
from ..cli.format import format
from ..cli.utils import is_authenticated, is_valid_dir, is_valid_project
from ..cli.utils import (
is_authenticated,
is_valid_project,
project_id_option,
sql_dir_option,
)
from ..config import ConfigLoader
from ..docs import validate as validate_docs
from ..format_sql.formatter import reformat
from ..routine import publish_routines
@@ -27,10 +33,6 @@ ROUTINE_FILE_RE = re.compile(
r"^.*/([a-zA-Z0-9-]+)/([a-zA-Z0-9_]+)/([a-zA-Z0-9_]+)/"
r"(udf\.sql|stored_procedure\.sql)$"
)
DEFAULT_UDF_DEPENDENCY_DIR = "udf_js_lib/"
DEFAULT_GCS_BUCKET = "moz-fx-data-prod-bigquery-etl"
DEFAULT_GCS_PATH = ""
DEFAULT_PROJECT_ID = "moz-fx-data-shared-prod"
def _routines_matching_name_pattern(pattern, sql_path, project_id):
@@ -57,22 +59,6 @@ def _routines_matching_name_pattern(pattern, sql_path, project_id):
return routine_files
sql_dir_option = click.option(
"--sql_dir",
help="Path to directory which contains queries.",
type=click.Path(file_okay=False),
default="sql",
callback=is_valid_dir,
)
project_id_option = click.option(
"--project-id",
"--project_id",
help="GCP project ID",
callback=lambda *args: is_valid_project(*args) if args[-1] else args[-1],
)
def get_project_id(ctx, project_id=None):
"""Return the project id with the option flag taking priority."""
if project_id:
@@ -92,7 +78,7 @@ def get_project_id(ctx, project_id=None):
def routine(ctx):
"""Create the CLI group for the routine command."""
ctx.ensure_object(dict)
ctx.obj["DEFAULT_PROJECT"] = "moz-fx-data-shared-prod"
ctx.obj["DEFAULT_PROJECT"] = ConfigLoader.get("default", "project")
@click.group(help="Commands for managing public mozfun routines.")
@@ -100,7 +86,7 @@
def mozfun(ctx):
"""Create the CLI group for the mozfun command."""
ctx.ensure_object(dict)
ctx.obj["DEFAULT_PROJECT"] = "mozfun"
ctx.obj["DEFAULT_PROJECT"] = ConfigLoader.get("routine", "project")
@routine.command(
@@ -124,7 +110,7 @@ def mozfun(ctx):
)
@click.argument("name")
@sql_dir_option
@project_id_option
@project_id_option()
@click.option("--udf", "-u", is_flag=True, help="Create a new UDF", default=False)
@click.option(
"--stored_procedure",
@@ -270,7 +256,7 @@ Examples:
)
@click.argument("name", required=False)
@sql_dir_option
@project_id_option
@project_id_option()
@click.option("--usages", "-u", is_flag=True, help="Show routine usages", default=False)
@click.pass_context
def info(ctx, name, sql_dir, project_id, usages):
@@ -352,7 +338,7 @@ Examples:
)
@click.argument("name", required=False)
@sql_dir_option
@project_id_option
@project_id_option()
@click.option(
"--docs-only",
"--docs_only",
@@ -410,23 +396,23 @@ Examples:
""",
)
@click.argument("name", required=False)
@project_id_option
@project_id_option()
@click.option(
"--dependency-dir",
"--dependency_dir",
default=DEFAULT_UDF_DEPENDENCY_DIR,
default=ConfigLoader.get("routine", "dependency_dir"),
help="The directory JavaScript dependency files for UDFs are stored.",
)
@click.option(
"--gcs-bucket",
"--gcs_bucket",
default=DEFAULT_GCS_BUCKET,
default=ConfigLoader.get("routine", "publish", "gcs_bucket"),
help="The GCS bucket where dependency files are uploaded to.",
)
@click.option(
"--gcs-path",
"--gcs_path",
default=DEFAULT_GCS_PATH,
default=ConfigLoader.get("routine", "publish", "gcs_path"),
help="The GCS path in the bucket where dependency files are uploaded to.",
)
@click.option(
@@ -484,7 +470,7 @@ by Airflow only."""
@click.argument("name", required=True)
@click.argument("new_name", required=True)
@sql_dir_option
@project_id_option
@project_id_option()
@click.pass_context
def rename(ctx, name, new_name, sql_dir, project_id):
"""Rename routines based on pattern."""

View file

@@ -10,6 +10,7 @@ import click
from google.auth.exceptions import DefaultCredentialsError
from google.cloud import bigquery
from bigquery_etl.config import ConfigLoader
from bigquery_etl.util.common import TempDatasetReference, project_dirs
QUERY_FILE_RE = re.compile(
@@ -20,7 +21,6 @@ CHECKS_FILE_RE = re.compile(
r"^.*/([a-zA-Z0-9-]+)/([a-zA-Z0-9_]+)/([a-zA-Z0-9_]+(_v[0-9]+)?)/"
r"(?:checks\.sql)$"
)
TEST_PROJECT = "bigquery-etl-integration-test"
MOZDATA = "mozdata"
PIONEER_NONPROD = "moz-fx-data-pioneer-nonprod"
PIONEER_PROD = "moz-fx-data-pioneer-prod"
@@ -57,7 +57,7 @@ def is_valid_project(ctx, param, value):
or value
in [Path(p).name for p in project_dirs()]
+ [
TEST_PROJECT,
ConfigLoader.get("default", "test_project"),
MOZDATA,
PIONEER_NONPROD,
PIONEER_PROD,
@@ -149,7 +149,7 @@ sql_dir_option = click.option(
"--sql-dir",
help="Path to directory which contains queries.",
type=click.Path(file_okay=False),
default="sql",
default=ConfigLoader.get("default", "sql_dir", fallback="sql"),
callback=is_valid_dir,
)

View file

@@ -80,7 +80,7 @@ class DryRun:
}
# update skip list to include renamed queries in stage.
test_project = ConfigLoader.get("dry_run", "test_project", fallback="")
test_project = ConfigLoader.get("default", "test_project", fallback="")
file_pattern_re = re.compile(r"sql/([^\/]+)/([^/]+)(/?.*|$)")
skip_files.update(
[

View file

@@ -7,6 +7,8 @@ from argparse import ArgumentParser
import click
from bigquery_etl.config import ConfigLoader
from ..util import standard_args
from ..util.common import project_dirs
from .parse_metadata import DatasetMetadata, Metadata
@@ -37,7 +39,7 @@ def validate_change_control(
metadata,
codeowners_file,
project_id="moz-fx-data-shared-prod",
sql_dir="sql",
sql_dir=ConfigLoader.get("default", "sql_dir", fallback="sql"),
):
"""Verify that a query is correctly setup for change control."""
path_to_add = file_path.partition(f"{project_id}/")[2]

View file

@@ -14,6 +14,7 @@ import attr
import sqlparse
import yaml
from bigquery_etl.config import ConfigLoader
from bigquery_etl.metadata.parse_metadata import METADATA_FILE
from bigquery_etl.util.common import render
@@ -21,7 +22,6 @@ UDF_CHAR = "[a-zA-Z0-9_]"
UDF_FILE = "udf.sql"
PROCEDURE_FILE = "stored_procedure.sql"
ROUTINE_FILE = (UDF_FILE, PROCEDURE_FILE)
EXAMPLE_DIR = "examples"
TEMP_UDF_RE = re.compile(f"(?:udf|assert)_{UDF_CHAR}+")
PERSISTENT_UDF_PREFIX_RE_STR = (
r"CREATE\s+(?:OR\s+REPLACE\s+)?(?:FUNCTION|PROCEDURE)(?:\s+IF\s+NOT\s+EXISTS)?"
@@ -33,8 +33,6 @@ PERSISTENT_UDF_RE = re.compile(
)
UDF_NAME_RE = re.compile(r"^([a-zA-Z0-9_]+\.)?[a-zA-Z][a-zA-Z0-9_]{0,255}$")
GENERIC_DATASET = "_generic_dataset_"
SQL_DIR = Path("sql/")
ASSERT_UDF_DIR = "tests/assert"
raw_routines = {}
@@ -57,8 +55,10 @@ def get_routines(project):
"""Return all routines that could be referenced by the project."""
return (
get_routines_from_dir(project)
+ get_routines_from_dir(SQL_DIR / "mozfun")
+ get_routines_from_dir(ASSERT_UDF_DIR)
+ get_routines_from_dir(
Path(ConfigLoader.get("default", "sql_dir", fallback="sql")) / "mozfun"
)
+ get_routines_from_dir(ConfigLoader.get("routine", "assert_udf_dir"))
) # assert UDFs used for testing
@@ -227,14 +227,17 @@ def read_routine_dir(*project_dirs):
global raw_routines
if not project_dirs:
project_dirs = (SQL_DIR, ASSERT_UDF_DIR)
project_dirs = (
ConfigLoader.get("default", "sql_dir"),
ConfigLoader.get("routine", "assert_udf_dir"),
)
if project_dirs not in raw_routines:
raw_routines[project_dirs] = {
raw_routine.name: raw_routine
for project_dir in project_dirs
for root, dirs, files in os.walk(project_dir)
if os.path.basename(root) != EXAMPLE_DIR
if os.path.basename(root) != ConfigLoader.get("routine", "example_dir")
for filename in files
if filename in ROUTINE_FILE
for raw_routine in (RawRoutine.from_file(os.path.join(root, filename)),)
@@ -246,7 +249,11 @@ def read_routine_dir(*project_dirs):
def parse_routines(project_dir):
"""Read routine contents of the project dir into ParsedRoutine instances."""
# collect udfs to parse
raw_routines = read_routine_dir(project_dir, SQL_DIR / "mozfun", ASSERT_UDF_DIR)
raw_routines = read_routine_dir(
project_dir,
Path(ConfigLoader.get("default", "sql_dir", fallback="sql")) / "mozfun",
ConfigLoader.get("routine", "assert_udf_dir"),
)
# prepend udf definitions to tests
for raw_routine in raw_routines.values():

View file

@@ -1,6 +1,7 @@
"""Publish UDFs and resources to the public mozfun GCP project."""
import fnmatch
import glob
import json
import os
import re
@@ -9,20 +10,14 @@ from argparse import ArgumentParser
from google.cloud import storage # type: ignore
from google.cloud import bigquery
from bigquery_etl.config import ConfigLoader
from bigquery_etl.routine.parse_routine import accumulate_dependencies, read_routine_dir
from bigquery_etl.util import standard_args
from bigquery_etl.util.common import project_dirs
DEFAULT_UDF_DEPENDENCY_DIR = "udf_js_lib/"
DEFAULT_GCS_BUCKET = "moz-fx-data-prod-bigquery-etl"
DEFAULT_GCS_PATH = ""
DEFAULT_PROJECT = "sql/moz-fx-data-shared-prod"
SQL_DIR = "sql/"
OPTIONS_LIB_RE = re.compile(r'library = "gs://[^"]+/([^"]+)"')
OPTIONS_RE = re.compile(r"OPTIONS(\n|\s)*\(")
SKIP = ["sql/moz-fx-data-shared-prod/udf/main_summary_scalars/udf.sql"]
parser = ArgumentParser(description=__doc__)
parser.add_argument(
@@ -34,26 +29,27 @@ parser.add_argument(
)
parser.add_argument(
"--target",
default=DEFAULT_PROJECT,
default=ConfigLoader.get("default", "sql_dir", fallback="sql/")
+ ConfigLoader.get("default", "project"),
required=False,
help="Path to project directory.",
)
parser.add_argument(
"--dependency-dir",
"--dependency_dir",
default=DEFAULT_UDF_DEPENDENCY_DIR,
default=ConfigLoader.get("routine", "dependency_dir"),
help="The directory JavaScript dependency files for UDFs are stored.",
)
parser.add_argument(
"--gcs-bucket",
"--gcs_bucket",
default=DEFAULT_GCS_BUCKET,
default=ConfigLoader.get("routine", "publish", "gcs_bucket"),
help="The GCS bucket where dependency files are uploaded to.",
)
parser.add_argument(
"--gcs-path",
"--gcs_path",
default=DEFAULT_GCS_PATH,
default=ConfigLoader.get("routine", "publish", "gcs_path"),
help="The GCS path in the bucket where dependency files are uploaded to.",
)
parser.add_argument(
@@ -83,7 +79,7 @@ def main():
publish(
args.target,
args.project_id,
os.path.join(SQL_DIR, project, args.dependency_dir),
os.path.join(ConfigLoader.get("default", "sql_dir"), project, args.dependency_dir),
args.gcs_bucket,
args.gcs_path,
args.public,
@@ -91,6 +87,18 @@ )
)
def skipped_routines():
"""Get skipped routines from config."""
return {
file
for skip in ConfigLoader.get("routine", "publish", "skip", fallback=[])
for file in glob.glob(
skip,
recursive=True,
)
}
def publish(
target,
project_id,
@@ -123,7 +131,10 @@ def publish(
udfs_to_publish.append(raw_routine)
for dep in udfs_to_publish:
if dep not in published_routines and raw_routines[dep].filepath not in SKIP:
if (
dep not in published_routines
and raw_routines[dep].filepath not in skipped_routines()
):
publish_routine(
raw_routines[dep],
client,
@@ -185,7 +196,10 @@ def publish_routine(
)
# add UDF descriptions
if raw_routine.filepath not in SKIP and not raw_routine.is_stored_procedure:
if (
raw_routine.filepath not in skipped_routines()
and not raw_routine.is_stored_procedure
):
# descriptions need to be escaped since quotation marks and other
# characters, such as \x01, will make the query invalid otherwise
escaped_description = json.dumps(str(raw_routine.description))
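The hard-coded SKIP list is gone in favor of skipped_routines(), which expands glob patterns from the config at call time, so literal paths and wildcards both work. A quick illustration of that expansion, with a hypothetical wildcard pattern alongside the real entry from bqetl_project.yaml:

import glob

# patterns as they would come from the skip list in bqetl_project.yaml;
# the second entry is a hypothetical wildcard for illustration
patterns = [
    "sql/moz-fx-data-shared-prod/udf/main_summary_scalars/udf.sql",
    "sql/moz-fx-data-shared-prod/udf_legacy/**/*.sql",
]
skipped = {f for p in patterns for f in glob.glob(p, recursive=True)}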

View file

@@ -1,6 +1,10 @@
default:
project: moz-fx-data-shared-prod
sql_dir: sql/
test_project: bigquery-etl-integration-test
dry_run:
function: https://us-central1-moz-fx-data-shared-prod.cloudfunctions.net/bigquery-etl-dryrun
test_project: bigquery-etl-integration-test
skip:
# Access Denied
- sql/moz-fx-data-shared-prod/account_ecosystem_derived/ecosystem_client_id_lookup_v1/query.sql
@@ -301,3 +305,14 @@ format:
- sql/moz-fx-data-shared-prod/udf_legacy/date_trunc.sql
- sql/moz-fx-data-shared-prod/udf_legacy/to_iso8601.sql
- stored_procedures/safe_crc32_uuid.sql
routine:
dependency_dir: udf_js_lib/
publish:
gcs_bucket: moz-fx-data-prod-bigquery-etl
gcs_path: ""
skip:
- sql/moz-fx-data-shared-prod/udf/main_summary_scalars/udf.sql
assert_udf_dir: tests/assert
example_dir: examples
project: mozfun
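For reference, the removed constants and their new homes in the config, expressed as the equivalent lookups (a summary of the diff above, not new behavior):

from bigquery_etl.config import ConfigLoader

# DEFAULT_PROJECT_ID          -> ConfigLoader.get("default", "project")
# TEST_PROJECT                -> ConfigLoader.get("default", "test_project")
# SQL_DIR                     -> ConfigLoader.get("default", "sql_dir")
# DEFAULT_UDF_DEPENDENCY_DIR  -> ConfigLoader.get("routine", "dependency_dir")
# DEFAULT_GCS_BUCKET          -> ConfigLoader.get("routine", "publish", "gcs_bucket")
# DEFAULT_GCS_PATH            -> ConfigLoader.get("routine", "publish", "gcs_path")
# SKIP                        -> ConfigLoader.get("routine", "publish", "skip")
# ASSERT_UDF_DIR              -> ConfigLoader.get("routine", "assert_udf_dir")
# EXAMPLE_DIR                 -> ConfigLoader.get("routine", "example_dir")
assert ConfigLoader.get("routine", "project") == "mozfun"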

View file

@@ -1,6 +1,8 @@
default:
test_project: bigquery-etl-integration-test
dry_run:
function: https://us-central1-moz-fx-data-shared-prod.cloudfunctions.net/bigquery-etl-dryrun
test_project: bigquery-etl-integration-test
skip:
# Access Denied
- sql/moz-fx-data-shared-prod/test_derived/some_query_v1/query.sql

View file

@@ -14,14 +14,14 @@ class TestConfig:
ConfigLoader.set_project_dir(TEST_DIR / "data")
assert "function" in ConfigLoader.get("dry_run")
assert "test_project" in ConfigLoader.get("dry_run")
assert "skip" in ConfigLoader.get("dry_run")
assert (
ConfigLoader.get("dry_run", "test_project")
ConfigLoader.get("default", "test_project")
== "bigquery-etl-integration-test"
)
assert len(ConfigLoader.get("dry_run", "skip")) == 2
assert len(ConfigLoader.get("dry_run", "skip")) > 0
assert "dry_run" in ConfigLoader.get()