DENG1381 - Add bqetl support for deprecation metadata (#4213)
* Support bq dataset deprecation process (metadata) * Add bqetl metadata cli command * Initial draft for adding deprecation support to bqetl * Incorporate Anna's feedback * Fix based on whd's feedback * Fix ci issues * Remove unnecessary logic from metadata.py * Add dataset metadata yaml for ga_derived * Ignore dirs that do not have dataset_metadata yaml * Remove unwanted dataset metadata yamls * Update bigquery_etl/cli/metadata.py Co-authored-by: whd <whd@users.noreply.github.com> --------- Co-authored-by: whd <whd@users.noreply.github.com>
This commit is contained in:
Родитель
199f027b87
Коммит
2e916eb856
|
@ -334,6 +334,9 @@ jobs:
|
|||
PATH="venv/bin:$PATH" script/bqetl dependency record \
|
||||
--skip-existing \
|
||||
"/tmp/workspace/generated-sql/sql/"
|
||||
PATH="venv/bin:$PATH" script/bqetl metadata update \
|
||||
--sql-dir /tmp/workspace/generated-sql/sql/ \
|
||||
/tmp/workspace/generated-sql/sql/
|
||||
- persist_to_workspace:
|
||||
root: /tmp/workspace
|
||||
paths:
|
||||
|
@ -454,6 +457,9 @@ jobs:
|
|||
PATH="venv/bin:$PATH" script/bqetl dependency record \
|
||||
--skip-existing \
|
||||
"/tmp/workspace/private-generated-sql/sql/"
|
||||
PATH="venv/bin:$PATH" script/bqetl metadata update \
|
||||
--sql-dir /tmp/workspace/private-generated-sql/sql/ \
|
||||
/tmp/workspace/private-generated-sql/sql/
|
||||
- persist_to_workspace:
|
||||
root: /tmp/workspace
|
||||
paths:
|
||||
|
|
|
@ -16,6 +16,7 @@ from ..cli.dag import dag
|
|||
from ..cli.dryrun import dryrun
|
||||
from ..cli.format import format
|
||||
from ..cli.generate import generate
|
||||
from ..cli.metadata import metadata
|
||||
from ..cli.query import query
|
||||
from ..cli.routine import mozfun, routine
|
||||
from ..cli.stage import stage
|
||||
|
@ -52,6 +53,7 @@ def cli(prog_name=None):
|
|||
"stage": stage,
|
||||
"backfill": backfill,
|
||||
"check": check,
|
||||
"metadata": metadata,
|
||||
}
|
||||
|
||||
@click.group(commands=commands)
|
||||
|
|
|
@ -0,0 +1,73 @@
|
|||
"""bigquery-etl CLI metadata command."""
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import click
|
||||
|
||||
from bigquery_etl.metadata.parse_metadata import DatasetMetadata, Metadata
|
||||
|
||||
from ..cli.utils import paths_matching_name_pattern, project_id_option, sql_dir_option
|
||||
|
||||
|
||||
@click.group(
    help="""
    Commands for managing bqetl metadata.
    """
)
@click.pass_context
def metadata(ctx):
    """Serve as the click group that the metadata subcommands attach to."""
|
||||
|
||||
|
||||
@metadata.command(
    help="""
    Update metadata yaml files.
    Updates workgroup access metadata based on the dataset_metadata.yaml and
    deprecation metadata.

    Example:
    ./bqetl metadata update ga_derived.downloads_with_attribution_v2
    """,
    context_settings=dict(
        ignore_unknown_options=True,
        allow_extra_args=True,
    ),
)
@click.argument("name")
@project_id_option()
@sql_dir_option
def update(name: str, sql_dir: Optional[str], project_id: Optional[str]) -> None:
    """Update table metadata yaml files from dataset defaults and deprecation.

    For every ``metadata.yaml`` matched by ``name``:

    * tables whose dataset directory has no ``dataset_metadata.yaml`` are
      skipped and left untouched;
    * a dataset with no (or an empty) ``default_table_workgroup_access`` gets
      it filled in from its own ``workgroup_access`` and is written back;
    * deprecated tables get an empty ``workgroup_access``, while other tables
      take the dataset default only when they define none themselves.

    Args:
        name: Pattern or path handed to ``paths_matching_name_pattern`` to
            select the tables to update.
        sql_dir: Directory handed to ``paths_matching_name_pattern``.
        project_id: Project handed to ``paths_matching_name_pattern``.
    """
    table_metadata_files = paths_matching_name_pattern(
        name, sql_dir, project_id=project_id, files=["metadata.yaml"]
    )

    for table_metadata_file in table_metadata_files:
        dataset_metadata_path = (
            Path(table_metadata_file).parent.parent / "dataset_metadata.yaml"
        )
        # Without dataset-level metadata there is no default to apply, so the
        # table is left as it is (nothing gets created here).
        if not dataset_metadata_path.exists():
            continue

        dataset_metadata = DatasetMetadata.from_file(dataset_metadata_path)
        table_metadata = Metadata.from_file(table_metadata_file)

        # Persist the dataset's workgroup_access as its table default when no
        # default is configured.
        if not dataset_metadata.default_table_workgroup_access:
            dataset_metadata.default_table_workgroup_access = (
                dataset_metadata.workgroup_access
            )
            dataset_metadata.write(dataset_metadata_path)

        if table_metadata.deprecated:
            # Deprecated tables end up with `workgroup_access: []`; this
            # overwrites any workgroups configured on the table.
            table_metadata.workgroup_access = []
        elif table_metadata.workgroup_access is None:
            table_metadata.workgroup_access = (
                dataset_metadata.default_table_workgroup_access
            )

        table_metadata.write(table_metadata_file)
        click.echo(f"Updated {table_metadata_file}")
|
|
@ -17,6 +17,7 @@ DATASET_METADATA_FILE = "dataset_metadata.yaml"
|
|||
DEFAULT_WORKGROUP_ACCESS = [
|
||||
dict(role="roles/bigquery.dataViewer", members=["workgroup:mozilla-confidential"])
|
||||
]
|
||||
DEFAULT_TABLE_WORKGROUP_ACCESS = DEFAULT_WORKGROUP_ACCESS
|
||||
|
||||
|
||||
class Literal(str):
|
||||
|
@ -149,6 +150,7 @@ class Metadata:
|
|||
workgroup_access: Optional[List[WorkgroupAccessMetadata]] = attr.ib(None)
|
||||
references: Dict = attr.ib({})
|
||||
external_data: Optional[ExternalDataMetadata] = attr.ib(None)
|
||||
deprecated: bool = attr.ib(False)
|
||||
|
||||
@owners.validator
|
||||
def validate_owners(self, attribute, value):
|
||||
|
@ -223,6 +225,7 @@ class Metadata:
|
|||
workgroup_access = None
|
||||
references = {}
|
||||
external_data = None
|
||||
deprecated = False
|
||||
|
||||
with open(metadata_file, "r") as yaml_stream:
|
||||
try:
|
||||
|
@ -283,6 +286,8 @@ class Metadata:
|
|||
external_data = converter.structure(
|
||||
metadata["external_data"], ExternalDataMetadata
|
||||
)
|
||||
if "deprecated" in metadata:
|
||||
deprecated = metadata["deprecated"]
|
||||
|
||||
return cls(
|
||||
friendly_name,
|
||||
|
@ -295,6 +300,7 @@ class Metadata:
|
|||
workgroup_access,
|
||||
references,
|
||||
external_data,
|
||||
deprecated,
|
||||
)
|
||||
except yaml.YAMLError as e:
|
||||
raise e
|
||||
|
@ -405,8 +411,14 @@ class DatasetMetadata:
|
|||
dataset_base_acl: str = attr.ib()
|
||||
user_facing: bool = attr.ib(False)
|
||||
labels: Dict = attr.ib({})
|
||||
default_table_workgroup_access: Optional[List[Dict[str, Any]]] = attr.ib(None)
|
||||
workgroup_access: list = attr.ib(DEFAULT_WORKGROUP_ACCESS)
|
||||
|
||||
def __attrs_post_init__(self):
    """Default the table workgroup access to the dataset workgroup access.

    Only applies when ``default_table_workgroup_access`` is ``None``; any
    other value, including an empty list, is kept as it is.
    """
    if self.default_table_workgroup_access is not None:
        return
    self.default_table_workgroup_access = self.workgroup_access
|
||||
|
||||
@staticmethod
|
||||
def is_dataset_metadata_file(file_path):
|
||||
"""
|
||||
|
@ -420,7 +432,6 @@ class DatasetMetadata:
|
|||
def write(self, file):
|
||||
"""Write dataset metadata information to the provided file."""
|
||||
metadata_dict = self.__dict__
|
||||
|
||||
if metadata_dict["labels"]:
|
||||
for label_key, label_value in metadata_dict["labels"].items():
|
||||
# handle tags
|
||||
|
@ -430,6 +441,11 @@ class DatasetMetadata:
|
|||
if "description" in metadata_dict:
|
||||
metadata_dict["description"] = Literal(metadata_dict["description"])
|
||||
|
||||
if "default_table_workgroup_access" in metadata_dict:
|
||||
metadata_dict["default_table_workgroup_access"] = metadata_dict[
|
||||
"default_table_workgroup_access"
|
||||
]
|
||||
|
||||
converter = cattrs.BaseConverter()
|
||||
file.write_text(
|
||||
yaml.dump(
|
||||
|
|
|
@ -50,6 +50,9 @@ def publish_metadata(client, dataset, table, metadata):
|
|||
if isinstance(value, str)
|
||||
}
|
||||
|
||||
if metadata.deprecated is True:
|
||||
table.labels["deprecated"] = "true"
|
||||
|
||||
client.update_table(table, ["friendly_name", "description", "labels"])
|
||||
except yaml.YAMLError as e:
|
||||
print(e)
|
||||
|
|
|
@ -1,12 +1,18 @@
|
|||
import distutils
|
||||
import os
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
import yaml
|
||||
from click.testing import CliRunner
|
||||
|
||||
from bigquery_etl.cli.metadata import update
|
||||
from bigquery_etl.metadata.parse_metadata import Metadata
|
||||
from bigquery_etl.metadata.validate_metadata import validate_change_control
|
||||
|
||||
TEST_DIR = Path(__file__).parent.parent
|
||||
|
||||
|
||||
class TestMetadata:
|
||||
test_path = "sql/moz-fx-data-shared-prod/telemetry_derived/query_v1"
|
||||
|
@ -175,3 +181,59 @@ class TestMetadata:
|
|||
codeowners_conf=codeowners,
|
||||
expected_result=False,
|
||||
)
|
||||
|
||||
def test_metadata_update_with_no_deprecation(self, runner):
    """Check that update fills in workgroup access on a non-deprecated table."""
    table_dir = "/sql/moz-fx-data-shared-prod/telemetry_derived/clients_daily_v6/"
    with tempfile.TemporaryDirectory() as tmpdirname:
        distutils.dir_util.copy_tree(str(TEST_DIR), str(tmpdirname))
        # CliRunner.invoke is (cli, args, input): the option must be part of
        # the args list, otherwise it is handed to the command as stdin.
        args = [
            str(tmpdirname) + table_dir,
            "--sql_dir=" + str(tmpdirname) + "/sql",
        ]
        runner.invoke(update, args)
        with open(tmpdirname + table_dir + "metadata.yaml", "r") as stream:
            metadata = yaml.safe_load(stream)

    assert metadata["workgroup_access"][0]["role"] == "roles/bigquery.dataViewer"
    assert metadata["workgroup_access"][0]["members"] == [
        "workgroup:mozilla-confidential"
    ]
    assert not metadata["deprecated"]
|
||||
|
||||
def test_metadata_update_with_deprecation(self, runner):
    """Check that update empties workgroup access on a deprecated table."""
    table_dir = (
        "/sql/moz-fx-data-shared-prod/telemetry_derived/"
        "clients_daily_scalar_aggregates_v1/"
    )
    with tempfile.TemporaryDirectory() as tmpdirname:
        distutils.dir_util.copy_tree(str(TEST_DIR), str(tmpdirname))
        # CliRunner.invoke is (cli, args, input): the option must be part of
        # the args list, otherwise it is handed to the command as stdin.
        args = [
            str(tmpdirname) + table_dir,
            "--sql_dir=" + str(tmpdirname) + "/sql",
        ]
        runner.invoke(update, args)
        with open(tmpdirname + table_dir + "metadata.yaml", "r") as stream:
            metadata = yaml.safe_load(stream)

    assert metadata["workgroup_access"] == []
    assert metadata["deprecated"]
|
||||
|
||||
def test_metadata_update_do_not_update(self, runner):
    """Check that update keeps workgroup access already set on the table."""
    table_dir = (
        "/sql/moz-fx-data-shared-prod/telemetry_derived/"
        "clients_daily_keyed_scalar_aggregates_v1/"
    )
    with tempfile.TemporaryDirectory() as tmpdirname:
        distutils.dir_util.copy_tree(str(TEST_DIR), str(tmpdirname))
        # CliRunner.invoke is (cli, args, input): the option must be part of
        # the args list, otherwise it is handed to the command as stdin.
        args = [
            str(tmpdirname) + table_dir,
            "--sql_dir=" + str(tmpdirname) + "/sql",
        ]
        runner.invoke(update, args)
        with open(tmpdirname + table_dir + "metadata.yaml", "r") as stream:
            metadata = yaml.safe_load(stream)

    assert metadata["workgroup_access"][0]["role"] == "roles/bigquery.dataViewer"
    assert metadata["workgroup_access"][0]["members"] == ["workgroup:revenue/cat4"]
    assert not metadata["deprecated"]
|
||||
|
|
|
@ -0,0 +1,14 @@
|
|||
friendly_name: Test Dataset
|
||||
description: |-
|
||||
Use for testing.
|
||||
dataset_base_acl: view
|
||||
user_facing: true
|
||||
labels: {}
|
||||
default_table_workgroup_access:
|
||||
- role: roles/bigquery.dataViewer
|
||||
members:
|
||||
- test_default_member
|
||||
workgroup_access:
|
||||
- role: roles/bigquery.metadataViewer
|
||||
members:
|
||||
- test_member
|
|
@ -19,3 +19,4 @@ workgroup_access:
|
|||
- role: roles/bigquery.dataViewer
|
||||
members:
|
||||
- workgroup:dataops-managed/taar
|
||||
deprecated: true
|
||||
|
|
|
@ -2,7 +2,11 @@ from pathlib import Path
|
|||
|
||||
import pytest
|
||||
|
||||
from bigquery_etl.metadata.parse_metadata import Metadata, PartitionType
|
||||
from bigquery_etl.metadata.parse_metadata import (
|
||||
DatasetMetadata,
|
||||
Metadata,
|
||||
PartitionType,
|
||||
)
|
||||
|
||||
TEST_DIR = Path(__file__).parent.parent
|
||||
|
||||
|
@ -179,3 +183,31 @@ class TestParseMetadata(object):
|
|||
assert metadata.bigquery.time_partitioning.require_partition_filter
|
||||
assert metadata.bigquery.time_partitioning.expiration_days == 2
|
||||
assert metadata.bigquery.time_partitioning.expiration_ms == 2 * 86400000
|
||||
|
||||
def test_of_deprecated_metadata(self):
    """Check that ``Metadata.of_table`` yields a truthy ``deprecated`` flag."""
    project_dir = TEST_DIR / "data" / "test_sql" / "moz-fx-data-test-project"
    table_metadata = Metadata.of_table(
        "test", "non_incremental_query", "v1", project_dir
    )

    assert table_metadata.deprecated
|
||||
|
||||
def test_of_dataset_metadata(self):
    """Check the default table workgroup access parsed from the dataset file."""
    dataset_metadata_file = (
        TEST_DIR
        / "data"
        / "test_sql"
        / "moz-fx-data-test-project"
        / "test"
        / "dataset_metadata.yaml"
    )
    dataset_metadata = DatasetMetadata.from_file(dataset_metadata_file)
    first_entry = dataset_metadata.default_table_workgroup_access[0]

    assert first_entry["members"] == ["test_default_member"]
    assert first_entry["role"] == "roles/bigquery.dataViewer"
|
||||
|
|
|
@ -0,0 +1,9 @@
|
|||
friendly_name: Test metadata.yaml
|
||||
description: |-
|
||||
Clustering fields: `column1`
|
||||
owners:
|
||||
- test@mozilla.com
|
||||
workgroup_access:
|
||||
- role: roles/bigquery.dataViewer
|
||||
members:
|
||||
- workgroup:revenue/cat4
|
|
@ -0,0 +1,6 @@
|
|||
friendly_name: Test metadata.yaml
|
||||
description: |-
|
||||
Clustering fields: `column1`
|
||||
owners:
|
||||
- test@mozilla.com
|
||||
deprecated: true
|
|
@ -0,0 +1,6 @@
|
|||
friendly_name: Test metadata.yaml
|
||||
description: |-
|
||||
|
||||
Clustering fields: `column1`
|
||||
owners:
|
||||
- test@mozilla.com
|
|
@ -0,0 +1,13 @@
|
|||
friendly_name: Test telemetry data
|
||||
description: |-
|
||||
Test Derived data based on pings
|
||||
dataset_base_acl: derived
|
||||
user_facing: false
|
||||
workgroup_access:
|
||||
- role: roles/bigquery.metadataViewer
|
||||
members:
|
||||
- workgroup:mozilla-confidential
|
||||
default_table_workgroup_access:
|
||||
- role: roles/bigquery.dataViewer
|
||||
members:
|
||||
- workgroup:mozilla-confidential
|
Загрузка…
Ссылка в новой задаче