Make validate-metadata run on all files (#6891)

This commit is contained in:
Ben Wu 2025-01-29 10:49:00 -05:00 committed by GitHub
Parent fa5b93dc8b
Commit 2ee41e836f
No key found matching this signature
GPG key ID: B5690EEEBB952194
4 changed files with 42 additions and 12 deletions

View file

@ -297,7 +297,7 @@ jobs:
command: |
# TODO: Add check here to make sure all queries have metadata.yaml
PATH="venv/bin:$PATH" script/bqetl query validate \
--respect-dryrun-skip
--no-dryrun --skip-format-sql
- *copy_debug_sql
- *store_debug_artifacts
- unless:

View file

@ -99,7 +99,7 @@ def dryrun(
if not sql_files:
click.echo("Skipping dry run because no queries matched")
sys.exit(0)
return
if not use_cloud_function and not is_authenticated():
click.echo(

View file

@ -1359,6 +1359,7 @@ def _run_part(
)
@respect_dryrun_skip_option(default=False)
@no_dryrun_option(default=False)
@click.option("--skip_format_sql", "--skip-format-sql", is_flag=True, default=False)
@click.pass_context
def validate(
ctx,
@ -1369,6 +1370,7 @@ def validate(
validate_schemas,
respect_dryrun_skip,
no_dryrun,
skip_format_sql,
):
"""Validate queries by dry running, formatting and checking scheduling configs."""
if name is None:
@ -1376,8 +1378,11 @@ def validate(
query_files = paths_matching_name_pattern(name, sql_dir, project_id)
dataset_dirs = set()
errors = []
for query in query_files:
ctx.invoke(format, paths=[str(query)])
click.echo(f"Validating metadata for {query}")
if not skip_format_sql:
ctx.invoke(format, paths=[str(query)])
if not no_dryrun:
ctx.invoke(
@ -1389,14 +1394,27 @@ def validate(
respect_skip=respect_dryrun_skip,
)
validate_metadata.validate(query.parent)
try:
validate_metadata.validate(query.parent)
except validate_metadata.MetadataValidationError as e:
errors.append(str(e))
dataset_dirs.add(query.parent.parent)
if no_dryrun:
click.echo("Dry run skipped for query files.")
for dataset_dir in dataset_dirs:
validate_metadata.validate_datasets(dataset_dir)
try:
validate_metadata.validate_datasets(dataset_dir)
except validate_metadata.MetadataValidationError as e:
errors.append(str(e))
if len(errors) > 0:
click.echo(
f"Failed to validate {len(errors)} metadata files (see above for error messages):"
)
click.echo("\n".join(errors))
sys.exit(1)
def _initialize_in_parallel(

View file

@ -2,7 +2,6 @@
import logging
import os
import sys
from argparse import ArgumentParser
from pathlib import Path
@ -111,6 +110,14 @@ def validate_shredder_mitigation(query_dir, metadata):
if has_shredder_mitigation:
schema_file = Path(query_dir) / SCHEMA_FILE
if not schema_file.exists():
click.echo(
click.style(
f"Table {query_dir} does not have schema.yaml required for shredder mitigation.",
fg="yellow",
)
)
return False
schema = Schema.from_schema_file(schema_file).to_bigquery_schema()
# This label requires that the query doesn't have id-level columns,
@ -281,6 +288,10 @@ def validate_retention_policy_based_on_table_type(metadata, path):
return is_valid
class MetadataValidationError(Exception):
"""Metadata validation failed."""
def validate(target):
"""Validate metadata files."""
failed = False
@ -322,11 +333,11 @@ def validate(target):
# todo more validation
# e.g. https://github.com/mozilla/bigquery-etl/issues/924
else:
logging.error(f"Invalid target: {target}, target must be a directory.")
sys.exit(1)
raise ValueError(f"Invalid target: {target}, target must be a directory.")
if failed:
sys.exit(1)
# TODO: add failed checks to message
raise MetadataValidationError(f"Metadata validation failed for {target}")
def validate_datasets(target):
@ -340,11 +351,12 @@ def validate_datasets(target):
path = os.path.join(root, file)
_ = DatasetMetadata.from_file(path)
else:
logging.error(f"Invalid target: {target}, target must be a directory.")
sys.exit(1)
raise ValueError(f"Invalid target: {target}, target must be a directory.")
if failed:
sys.exit(1)
raise MetadataValidationError(
f"Dataset metadata validation failed for {target}"
)
def main():