DENG1381 - Add bqetl support for deprecation metadata (#4213)

* Support bq dataset deprecation process (metadata)

* Add bqetl metadata cli command

* Initial draft for adding deprecation support to bqetl

* Incorporate Anna's feedback

* Fix based on whd's feedback

* Fix ci issues

* Remove unnecessary logic from metadata.py

* Add dataset metadata yaml for ga_derived

* Ignore dirs that do not have dataset_metadata yaml

* Remove unwanted dataset metadata yamls

* Update bigquery_etl/cli/metadata.py

Co-authored-by: whd <whd@users.noreply.github.com>

---------

Co-authored-by: whd <whd@users.noreply.github.com>
This commit is contained in:
Alekhya 2023-09-12 14:47:54 -04:00 коммит произвёл GitHub
Родитель 199f027b87
Коммит 2e916eb856
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
13 изменённых файлов: 245 добавлений и 2 удалений

Просмотреть файл

@ -334,6 +334,9 @@ jobs:
PATH="venv/bin:$PATH" script/bqetl dependency record \
--skip-existing \
"/tmp/workspace/generated-sql/sql/"
PATH="venv/bin:$PATH" script/bqetl metadata update \
--sql-dir /tmp/workspace/generated-sql/sql/ \
/tmp/workspace/generated-sql/sql/
- persist_to_workspace:
root: /tmp/workspace
paths:
@ -454,6 +457,9 @@ jobs:
PATH="venv/bin:$PATH" script/bqetl dependency record \
--skip-existing \
"/tmp/workspace/private-generated-sql/sql/"
PATH="venv/bin:$PATH" script/bqetl metadata update \
--sql-dir /tmp/workspace/private-generated-sql/sql/ \
/tmp/workspace/private-generated-sql/sql/
- persist_to_workspace:
root: /tmp/workspace
paths:

Просмотреть файл

@ -16,6 +16,7 @@ from ..cli.dag import dag
from ..cli.dryrun import dryrun
from ..cli.format import format
from ..cli.generate import generate
from ..cli.metadata import metadata
from ..cli.query import query
from ..cli.routine import mozfun, routine
from ..cli.stage import stage
@ -52,6 +53,7 @@ def cli(prog_name=None):
"stage": stage,
"backfill": backfill,
"check": check,
"metadata": metadata,
}
@click.group(commands=commands)

Просмотреть файл

@ -0,0 +1,73 @@
"""bigquery-etl CLI metadata command."""
from pathlib import Path
from typing import Optional
import click
from bigquery_etl.metadata.parse_metadata import DatasetMetadata, Metadata
from ..cli.utils import paths_matching_name_pattern, project_id_option, sql_dir_option
@click.group(
    help="""
Commands for managing bqetl metadata.
"""
)
@click.pass_context
def metadata(ctx):
    """Serve as the click group that `bqetl metadata` subcommands attach to.

    The group does no work itself; `ctx` is the click context injected by
    `click.pass_context` and is not used here.
    """
@metadata.command(
    help="""
Update metadata yaml files.
Updates workgroup access metadata based on the dataset_metadata.yaml and
deprecation metadata.
Example:
./bqetl metadata update ga_derived.downloads_with_attribution_v2
""",
    context_settings=dict(
        ignore_unknown_options=True,
        allow_extra_args=True,
    ),
)
@click.argument("name")
@project_id_option()
@sql_dir_option
def update(name: str, sql_dir: Optional[str], project_id: Optional[str]) -> None:
    """Update the metadata.yaml of every table matching `name`.

    For each matched table metadata file whose dataset directory contains a
    dataset_metadata.yaml:

    * if the dataset has no `default_table_workgroup_access`, it is set to the
      dataset's `workgroup_access` and the dataset file is rewritten;
    * if the table is flagged `deprecated`, its `workgroup_access` is replaced
      with an empty list;
    * otherwise, a table without `workgroup_access` inherits the dataset's
      `default_table_workgroup_access`.

    The table metadata file is then written back and its path is echoed.

    Args:
        name: Pattern or path passed to `paths_matching_name_pattern` to
            select the table metadata files.
        sql_dir: Directory containing the SQL/metadata tree.
        project_id: Project used to narrow the search.
    """
    table_metadata_files = paths_matching_name_pattern(
        name, sql_dir, project_id=project_id, files=["metadata.yaml"]
    )

    for table_metadata_file in table_metadata_files:
        dataset_metadata_path = (
            Path(table_metadata_file).parent.parent / "dataset_metadata.yaml"
        )
        # Tables whose dataset directory has no dataset_metadata.yaml are
        # left untouched; nothing is created for them.
        if not dataset_metadata_path.exists():
            continue

        dataset_metadata = DatasetMetadata.from_file(dataset_metadata_path)
        table_metadata = Metadata.from_file(table_metadata_file)

        # Fall back to the dataset-level workgroup_access when the dataset
        # defines no default for its tables, and persist that choice.
        if not dataset_metadata.default_table_workgroup_access:
            dataset_metadata.default_table_workgroup_access = (
                dataset_metadata.workgroup_access
            )
            dataset_metadata.write(dataset_metadata_path)

        if table_metadata.deprecated:
            # Deprecated tables get an empty workgroup list; this
            # intentionally overwrites any existing workgroups.
            table_metadata.workgroup_access = []
        elif table_metadata.workgroup_access is None:
            # Only tables that never declared access inherit the default;
            # an explicit table-level setting is kept as is.
            table_metadata.workgroup_access = (
                dataset_metadata.default_table_workgroup_access
            )

        table_metadata.write(table_metadata_file)
        click.echo(f"Updated {table_metadata_file}")

Просмотреть файл

@ -17,6 +17,7 @@ DATASET_METADATA_FILE = "dataset_metadata.yaml"
DEFAULT_WORKGROUP_ACCESS = [
dict(role="roles/bigquery.dataViewer", members=["workgroup:mozilla-confidential"])
]
DEFAULT_TABLE_WORKGROUP_ACCESS = DEFAULT_WORKGROUP_ACCESS
class Literal(str):
@ -149,6 +150,7 @@ class Metadata:
workgroup_access: Optional[List[WorkgroupAccessMetadata]] = attr.ib(None)
references: Dict = attr.ib({})
external_data: Optional[ExternalDataMetadata] = attr.ib(None)
deprecated: bool = attr.ib(False)
@owners.validator
def validate_owners(self, attribute, value):
@ -223,6 +225,7 @@ class Metadata:
workgroup_access = None
references = {}
external_data = None
deprecated = False
with open(metadata_file, "r") as yaml_stream:
try:
@ -283,6 +286,8 @@ class Metadata:
external_data = converter.structure(
metadata["external_data"], ExternalDataMetadata
)
if "deprecated" in metadata:
deprecated = metadata["deprecated"]
return cls(
friendly_name,
@ -295,6 +300,7 @@ class Metadata:
workgroup_access,
references,
external_data,
deprecated,
)
except yaml.YAMLError as e:
raise e
@ -405,8 +411,14 @@ class DatasetMetadata:
dataset_base_acl: str = attr.ib()
user_facing: bool = attr.ib(False)
labels: Dict = attr.ib({})
default_table_workgroup_access: Optional[List[Dict[str, Any]]] = attr.ib(None)
workgroup_access: list = attr.ib(DEFAULT_WORKGROUP_ACCESS)
def __attrs_post_init__(self):
    """Default the table workgroup access to the dataset workgroup access.

    Runs after attrs has initialised the instance. An explicitly provided
    `default_table_workgroup_access` (anything other than None) is kept.
    """
    if self.default_table_workgroup_access is not None:
        return
    self.default_table_workgroup_access = self.workgroup_access
@staticmethod
def is_dataset_metadata_file(file_path):
"""
@ -420,7 +432,6 @@ class DatasetMetadata:
def write(self, file):
"""Write dataset metadata information to the provided file."""
metadata_dict = self.__dict__
if metadata_dict["labels"]:
for label_key, label_value in metadata_dict["labels"].items():
# handle tags
@ -430,6 +441,11 @@ class DatasetMetadata:
if "description" in metadata_dict:
metadata_dict["description"] = Literal(metadata_dict["description"])
if "default_table_workgroup_access" in metadata_dict:
metadata_dict["default_table_workgroup_access"] = metadata_dict[
"default_table_workgroup_access"
]
converter = cattrs.BaseConverter()
file.write_text(
yaml.dump(

Просмотреть файл

@ -50,6 +50,9 @@ def publish_metadata(client, dataset, table, metadata):
if isinstance(value, str)
}
if metadata.deprecated is True:
table.labels["deprecated"] = "true"
client.update_table(table, ["friendly_name", "description", "labels"])
except yaml.YAMLError as e:
print(e)

Просмотреть файл

@ -1,12 +1,18 @@
import distutils
import os
import tempfile
from pathlib import Path
import pytest
import yaml
from click.testing import CliRunner
from bigquery_etl.cli.metadata import update
from bigquery_etl.metadata.parse_metadata import Metadata
from bigquery_etl.metadata.validate_metadata import validate_change_control
TEST_DIR = Path(__file__).parent.parent
class TestMetadata:
test_path = "sql/moz-fx-data-shared-prod/telemetry_derived/query_v1"
@ -175,3 +181,59 @@ class TestMetadata:
codeowners_conf=codeowners,
expected_result=False,
)
def test_metadata_update_with_no_deprecation(self, runner):
    """A table without workgroup_access inherits the dataset default."""
    with tempfile.TemporaryDirectory() as tmpdirname:
        # Work on a copy so the checked-in fixtures are never rewritten.
        distutils.dir_util.copy_tree(str(TEST_DIR), str(tmpdirname))
        sql_dir = str(tmpdirname) + "/sql"
        table_dir = (
            str(tmpdirname)
            + "/sql/moz-fx-data-shared-prod/telemetry_derived/clients_daily_v6/"
        )
        # CLI options belong in the args list; the third positional
        # parameter of CliRunner.invoke is stdin input, not an argument.
        result = runner.invoke(update, [table_dir, "--sql_dir=" + sql_dir])
        assert result.exit_code == 0, result.output
        with open(
            tmpdirname
            + "/sql/moz-fx-data-shared-prod/telemetry_derived/clients_daily_v6/metadata.yaml",
            "r",
        ) as stream:
            metadata = yaml.safe_load(stream)
        assert metadata["workgroup_access"][0]["role"] == "roles/bigquery.dataViewer"
        assert metadata["workgroup_access"][0]["members"] == [
            "workgroup:mozilla-confidential"
        ]
        assert not metadata["deprecated"]
def test_metadata_update_with_deprecation(self, runner):
    """A table flagged as deprecated ends up with an empty workgroup list."""
    with tempfile.TemporaryDirectory() as tmpdirname:
        # Work on a copy so the checked-in fixtures are never rewritten.
        distutils.dir_util.copy_tree(str(TEST_DIR), str(tmpdirname))
        sql_dir = str(tmpdirname) + "/sql"
        table_dir = (
            str(tmpdirname)
            + "/sql/moz-fx-data-shared-prod/telemetry_derived/clients_daily_scalar_aggregates_v1/"
        )
        # CLI options belong in the args list; the third positional
        # parameter of CliRunner.invoke is stdin input, not an argument.
        result = runner.invoke(update, [table_dir, "--sql_dir=" + sql_dir])
        assert result.exit_code == 0, result.output
        with open(
            tmpdirname
            + "/sql/moz-fx-data-shared-prod/telemetry_derived/clients_daily_scalar_aggregates_v1/metadata.yaml",
            "r",
        ) as stream:
            metadata = yaml.safe_load(stream)
        assert metadata["workgroup_access"] == []
        assert metadata["deprecated"]
def test_metadata_update_do_not_update(self, runner):
    """An explicit table-level workgroup_access is not overwritten."""
    with tempfile.TemporaryDirectory() as tmpdirname:
        # Work on a copy so the checked-in fixtures are never rewritten.
        distutils.dir_util.copy_tree(str(TEST_DIR), str(tmpdirname))
        sql_dir = str(tmpdirname) + "/sql"
        table_dir = (
            str(tmpdirname)
            + "/sql/moz-fx-data-shared-prod/telemetry_derived/clients_daily_keyed_scalar_aggregates_v1/"
        )
        # CLI options belong in the args list; the third positional
        # parameter of CliRunner.invoke is stdin input, not an argument.
        result = runner.invoke(update, [table_dir, "--sql_dir=" + sql_dir])
        assert result.exit_code == 0, result.output
        with open(
            tmpdirname
            + "/sql/moz-fx-data-shared-prod/telemetry_derived/clients_daily_keyed_scalar_aggregates_v1/metadata.yaml",
            "r",
        ) as stream:
            metadata = yaml.safe_load(stream)
        assert metadata["workgroup_access"][0]["role"] == "roles/bigquery.dataViewer"
        assert metadata["workgroup_access"][0]["members"] == ["workgroup:revenue/cat4"]
        assert not metadata["deprecated"]

Просмотреть файл

@ -0,0 +1,14 @@
friendly_name: Test Dataset
description: |-
Use for testing.
dataset_base_acl: view
user_facing: true
labels: {}
default_table_workgroup_access:
- role: roles/bigquery.dataViewer
members:
- test_default_member
workgroup_access:
- role: roles/bigquery.metadataViewer
members:
- test_member

Просмотреть файл

@ -19,3 +19,4 @@ workgroup_access:
- role: roles/bigquery.dataViewer
members:
- workgroup:dataops-managed/taar
deprecated: true

Просмотреть файл

@ -2,7 +2,11 @@ from pathlib import Path
import pytest
from bigquery_etl.metadata.parse_metadata import Metadata, PartitionType
from bigquery_etl.metadata.parse_metadata import (
DatasetMetadata,
Metadata,
PartitionType,
)
TEST_DIR = Path(__file__).parent.parent
@ -179,3 +183,31 @@ class TestParseMetadata(object):
assert metadata.bigquery.time_partitioning.require_partition_filter
assert metadata.bigquery.time_partitioning.expiration_days == 2
assert metadata.bigquery.time_partitioning.expiration_ms == 2 * 86400000
def test_of_deprecated_metadata(self):
    """Metadata.of_table exposes the `deprecated` flag of the table fixture."""
    project_dir = TEST_DIR / "data" / "test_sql" / "moz-fx-data-test-project"
    table_metadata = Metadata.of_table(
        "test", "non_incremental_query", "v1", project_dir
    )
    assert table_metadata.deprecated
def test_of_dataset_metadata(self):
    """DatasetMetadata.from_file loads default_table_workgroup_access."""
    dataset_file = TEST_DIR.joinpath(
        "data",
        "test_sql",
        "moz-fx-data-test-project",
        "test",
        "dataset_metadata.yaml",
    )
    dataset_metadata = DatasetMetadata.from_file(dataset_file)

    first_entry = dataset_metadata.default_table_workgroup_access[0]
    assert first_entry["members"] == ["test_default_member"]
    assert first_entry["role"] == "roles/bigquery.dataViewer"

Просмотреть файл

@ -0,0 +1,9 @@
friendly_name: Test metadata.yaml
description: |-
Clustering fields: `column1`
owners:
- test@mozilla.com
workgroup_access:
- role: roles/bigquery.dataViewer
members:
- workgroup:revenue/cat4

Просмотреть файл

@ -0,0 +1,6 @@
friendly_name: Test metadata.yaml
description: |-
Clustering fields: `column1`
owners:
- test@mozilla.com
deprecated: true

Просмотреть файл

@ -0,0 +1,6 @@
friendly_name: Test metadata.yaml
description: |-
Clustering fields: `column1`
owners:
- test@mozilla.com

Просмотреть файл

@ -0,0 +1,13 @@
friendly_name: Test telemetry data
description: |-
Test Derived data based on pings
dataset_base_acl: derived
user_facing: false
workgroup_access:
- role: roles/bigquery.metadataViewer
members:
- workgroup:mozilla-confidential
default_table_workgroup_access:
- role: roles/bigquery.dataViewer
members:
- workgroup:mozilla-confidential