Bug 1708264 - Generate dataset metadata (#1988)

* Generate dataset_metadata.yaml based on doctypes

* Exclude pioneer from dataset_metadata.yaml creation

* Mark `static` as user-facing

* Use namespace in friendly names, and use default descriptions
This commit is contained in:
Jeff Klukas 2021-05-03 11:47:43 -04:00 коммит произвёл GitHub
Родитель e81d4ff988
Коммит 7851b5d37a
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
34 изменённых файлов: 409 добавлений и 11 удалений

Просмотреть файл

@ -25,6 +25,7 @@ from typing import List
from bigquery_etl.dryrun import DryRun
from bigquery_etl.format_sql.formatter import reformat
from bigquery_etl.metadata.parse_metadata import DatasetMetadata
from bigquery_etl.util import standard_args
MPS_URI = "https://github.com/mozilla-services/mozilla-pipeline-schemas"
@ -109,6 +110,55 @@ class SchemaFile:
)
def write_dataset_metadata_if_not_exists(
    target_project: str, sql_dir: Path, schema: SchemaFile
):
    """Create default dataset_metadata.yaml files for one dataset family.

    Two datasets are handled, in this order: the `<family>_derived` dataset
    and the user-facing `<family>` dataset. The dataset directory is created
    when missing, and a metadata file that already exists is left untouched.

    The caller is expected to pass a single `SchemaFile` that stands in for
    the whole dataset family.
    """
    namespace = schema.document_namespace
    family = schema.bq_dataset_family
    base_dir = sql_dir / target_project

    # Each entry pairs a dataset name with the keyword arguments used to build
    # its default DatasetMetadata; the derived dataset is processed first.
    defaults = (
        (
            f"{family}_derived",
            {
                "friendly_name": f"{namespace} Derived",
                "description": (
                    "Derived tables related to document namespace "
                    f"{namespace}, usually populated via queries defined in "
                    "https://github.com/mozilla/bigquery-etl "
                    "and managed by Airflow"
                ),
                "dataset_base_acl": "derived",
                "user_facing": False,
            },
        ),
        (
            family,
            {
                "friendly_name": f"{namespace}",
                "description": (
                    "User-facing views related to document namespace "
                    f"{namespace}; see "
                    "https://github.com/mozilla-services/"
                    "mozilla-pipeline-schemas/tree/generated-schemas/schemas/"
                    f"{namespace}"
                ),
                "dataset_base_acl": "view",
                "user_facing": True,
            },
        ),
    )

    for dataset_name, metadata_kwargs in defaults:
        metadata_path = base_dir / dataset_name / "dataset_metadata.yaml"
        metadata_path.parent.mkdir(parents=True, exist_ok=True)
        if metadata_path.exists():
            # Never clobber metadata that is already checked in.
            continue
        print(f"Creating {metadata_path}")
        DatasetMetadata(**metadata_kwargs).write(metadata_path)
def write_view_if_not_exists(target_project: str, sql_dir: Path, schema: SchemaFile):
"""If a view.sql does not already exist, write one to the target directory."""
target_dir = (
@ -117,16 +167,12 @@ def write_view_if_not_exists(target_project: str, sql_dir: Path, schema: SchemaF
/ schema.bq_dataset_family
/ schema.bq_table_unversioned
)
target_file = target_dir / "view.sql"
if target_file.exists():
return
# Exclude doctypes maintained in separate projects.
for prefix in SKIP_PREFIXES:
if schema.bq_dataset_family.startswith(prefix):
return
full_source_id = f"{target_project}.{schema.stable_table}"
full_view_id = f"{target_project}.{schema.user_facing_view}"
replacements = ["mozfun.norm.metadata(metadata) AS metadata"]
@ -205,17 +251,29 @@ def get_stable_table_schemas() -> List[SchemaFile]:
document_version=version,
)
)
# Exclude doctypes maintained in separate projects.
for prefix in SKIP_PREFIXES:
schemas = [
schema
for schema in schemas
if not schema.document_namespace.startswith(prefix)
]
# Retain only the highest version per doctype.
schemas = sorted(
schemas,
key=lambda t: f"{t.document_namespace}/{t.document_type}/{t.document_version:03d}",
)
return [
schemas = [
last
for k, (*_, last) in groupby(
schemas, lambda t: f"{t.document_namespace}/{t.document_type}"
)
]
return schemas
def prod_schemas_uri():
"""Return URI for the schemas tarball deployed to shared-prod.
@ -250,6 +308,9 @@ def main():
parser.error(f"argument --log-level: {e}")
schemas = get_stable_table_schemas()
one_schema_per_dataset = [
last for k, (*_, last) in groupby(schemas, lambda t: t.bq_dataset_family)
]
with ThreadPool(args.parallelism) as pool:
pool.map(
@ -261,6 +322,15 @@ def main():
schemas,
chunksize=1,
)
pool.map(
partial(
write_dataset_metadata_if_not_exists,
args.target_project,
Path(args.sql_dir),
),
one_schema_per_dataset,
chunksize=1,
)
if __name__ == "__main__":

Просмотреть файл

@ -1,7 +1,7 @@
friendly_name: Activity Stream
# yamllint disable rule:line-length
description: |-
Views related to data from the activity-stream namespace,
capturing activity on the desktop Firefox newtab page.
User-facing views related to document namespace activity-stream; see https://github.com/mozilla-services/mozilla-pipeline-schemas/tree/generated-schemas/schemas/activity-stream
dataset_base_acl: view
user_facing: true
labels: {}

Просмотреть файл

@ -1,7 +1,7 @@
friendly_name: Activity Stream
friendly_name: Activity Stream Derived
# yamllint disable rule:line-length
description: |-
Derived data from the activity-stream namespace,
capturing activity on the desktop Firefox newtab page.
Derived tables related to document namespace activity-stream, usually populated via queries defined in https://github.com/mozilla/bigquery-etl and managed by Airflow
dataset_base_acl: derived
user_facing: false
labels: {}

Просмотреть файл

@ -0,0 +1,11 @@
friendly_name: AMO Stats Dev
description: |-
Derived data used to power the dev instance of the AMO stats dashboards
dataset_base_acl: stable
user_facing: true
labels: {}
workgroup_access:
- role: roles/bigquery.dataViewer
members:
- workgroup:mozilla-confidential
- workgroup:amo/nonprod

Просмотреть файл

@ -0,0 +1,11 @@
friendly_name: AMO Stats Prod
description: |-
Derived data used to power the AMO stats dashboards
dataset_base_acl: stable
user_facing: true
labels: {}
workgroup_access:
- role: roles/bigquery.dataViewer
members:
- workgroup:mozilla-confidential
- workgroup:amo/prod

Просмотреть файл

@ -0,0 +1,10 @@
friendly_name: Analysis
description: |-
User-generated tables for analysis
dataset_base_acl: view
user_facing: true
labels: {}
workgroup_access:
- role: roles/bigquery.dataViewer
members:
- workgroup:mozilla-confidential

Просмотреть файл

@ -0,0 +1,12 @@
friendly_name: Blocklist ADI
description: |-
Historical data for Firefox active daily installations.
See https://wiki.mozilla.org/ADI
dataset_base_acl: view
user_facing: true
labels: {}
workgroup_access:
- role: roles/bigquery.dataViewer
members:
- workgroup:mozilla-confidential

Просмотреть файл

@ -0,0 +1,11 @@
friendly_name: contextual-services
# yamllint disable rule:line-length
description: |-
User-facing views related to document namespace contextual-services; see https://github.com/mozilla-services/mozilla-pipeline-schemas/tree/generated-schemas/schemas/contextual-services
dataset_base_acl: view
user_facing: true
labels: {}
workgroup_access:
- role: roles/bigquery.dataViewer
members:
- workgroup:contextual-services

Просмотреть файл

@ -0,0 +1,11 @@
friendly_name: contextual-services Derived
# yamllint disable rule:line-length
description: |-
Derived tables related to document namespace contextual-services, usually populated via queries defined in https://github.com/mozilla/bigquery-etl and managed by Airflow
dataset_base_acl: derived
user_facing: false
labels: {}
workgroup_access:
- role: roles/bigquery.dataViewer
members:
- workgroup:contextual-services

Просмотреть файл

@ -0,0 +1,10 @@
friendly_name: Firefox Accounts
description: |-
Data related to Firefox Accounts
dataset_base_acl: view
user_facing: true
labels: {}
workgroup_access:
- role: roles/bigquery.dataViewer
members:
- workgroup:mozilla-confidential

Просмотреть файл

@ -0,0 +1,10 @@
friendly_name: Firefox Accounts Derived
description: |-
Derived tables for FxA data
dataset_base_acl: derived
user_facing: false
labels: {}
workgroup_access:
- role: roles/bigquery.dataViewer
members:
- workgroup:mozilla-confidential

Просмотреть файл

@ -0,0 +1,15 @@
friendly_name: Internet Outages
# yamllint disable rule:line-length
description: |-
Derived data useful for detecting regional internet outages,
shared with some trusted external partners.
See https://docs.google.com/document/d/15HvdPS3UwGhAir6HyWHDLDRaxUFycx4jagJ_ZrHr9D8/edit
dataset_base_acl: view
user_facing: true
labels: {}
workgroup_access:
- role: roles/bigquery.dataViewer
members:
- workgroup:mozilla-confidential
- workgroup:internet-outages/external

Просмотреть файл

@ -0,0 +1,13 @@
friendly_name: Looker Derived
description: |-
Cache tables populated and used by Looker
dataset_base_acl: derived
user_facing: false
labels: {}
workgroup_access:
- role: roles/bigquery.dataViewer
members:
- workgroup:mozilla-confidential
- role: roles/bigquery.dataEditor
members:
- workgroup:dataops-managed/looker

Просмотреть файл

@ -0,0 +1,11 @@
friendly_name: Monitoring
description: |-
User-facing views for pipeline monitoring,
including views on `payload_bytes` tables
dataset_base_acl: view
user_facing: true
labels: {}
workgroup_access:
- role: roles/bigquery.dataViewer
members:
- workgroup:mozilla-confidential

Просмотреть файл

@ -0,0 +1,10 @@
friendly_name: Monitoring Derived
description: |-
Derived tables used for various pipeline monitoring purposes.
dataset_base_acl: derived
user_facing: false
labels: {}
workgroup_access:
- role: roles/bigquery.dataViewer
members:
- workgroup:mozilla-confidential

Просмотреть файл

@ -0,0 +1,10 @@
friendly_name: Mozilla VPN
description: |-
Data related to the Mozilla VPN service
dataset_base_acl: view
user_facing: true
labels: {}
workgroup_access:
- role: roles/bigquery.dataViewer
members:
- workgroup:mozilla-confidential

Просмотреть файл

@ -0,0 +1,10 @@
friendly_name: Mozilla VPN Derived
description: |-
Derived data related to the Mozilla VPN service
dataset_base_acl: derived
user_facing: false
labels: {}
workgroup_access:
- role: roles/bigquery.dataViewer
members:
- workgroup:mozilla-confidential

Просмотреть файл

@ -0,0 +1,7 @@
friendly_name: Mozilla VPN External
description: |-
Data extracted from the Mozilla VPN service database.
dataset_base_acl: restricted
user_facing: false
labels: {}
workgroup_access: []

Просмотреть файл

@ -0,0 +1,11 @@
friendly_name: regrets-reporter
# yamllint disable rule:line-length
description: |-
User-facing views related to document namespace regrets-reporter; see https://github.com/mozilla-services/mozilla-pipeline-schemas/tree/generated-schemas/schemas/regrets-reporter
dataset_base_acl: view
user_facing: true
labels: {}
workgroup_access:
- role: roles/bigquery.dataViewer
members:
- workgroup:regrets-reporter

Просмотреть файл

@ -0,0 +1,11 @@
friendly_name: regrets-reporter Derived
# yamllint disable rule:line-length
description: |-
Derived tables related to document namespace regrets-reporter, usually populated via queries defined in https://github.com/mozilla/bigquery-etl and managed by Airflow
dataset_base_acl: derived
user_facing: false
labels: {}
workgroup_access:
- role: roles/bigquery.dataViewer
members:
- workgroup:regrets-reporter

Просмотреть файл

@ -0,0 +1,10 @@
friendly_name: Revenue Derived
description: |-
Derived tables related to Mozilla revenue work
dataset_base_acl: derived
user_facing: false
labels: {}
workgroup_access:
- role: roles/bigquery.dataViewer
members:
- workgroup:mozilla-confidential

Просмотреть файл

@ -0,0 +1,10 @@
friendly_name: Search
description: |-
Views related to client search counts
dataset_base_acl: view
user_facing: true
labels: {}
workgroup_access:
- role: roles/bigquery.dataViewer
members:
- workgroup:mozilla-confidential

Просмотреть файл

@ -0,0 +1,10 @@
friendly_name: Search Derived
description: |-
Derived tables related to client search counts
dataset_base_acl: derived
user_facing: false
labels: {}
workgroup_access:
- role: roles/bigquery.dataViewer
members:
- workgroup:mozilla-confidential

Просмотреть файл

@ -0,0 +1,10 @@
friendly_name: Static Data
description: |-
Static tables, often useful for data-enriching joins
dataset_base_acl: udf
user_facing: true
labels: {}
workgroup_access:
- role: roles/bigquery.dataViewer
members:
- workgroup:mozilla-confidential

Просмотреть файл

@ -0,0 +1,10 @@
friendly_name: Stripe
description: |-
Views related to data extracted from payment provider Stripe
dataset_base_acl: view
user_facing: true
labels: {}
workgroup_access:
- role: roles/bigquery.dataViewer
members:
- workgroup:mozilla-confidential

Просмотреть файл

@ -0,0 +1,10 @@
friendly_name: Stripe Derived
description: |-
Derived data based on extracts from payment provider Stripe
dataset_base_acl: derived
user_facing: false
labels: {}
workgroup_access:
- role: roles/bigquery.dataViewer
members:
- workgroup:mozilla-confidential

Просмотреть файл

@ -0,0 +1,11 @@
friendly_name: Stripe External
description: |-
API extracts from Stripe, a payments partner
dataset_base_acl: restricted
user_facing: false
labels: {}
workgroup_access:
- role: roles/bigquery.dataViewer
members:
- workgroup:stripe
- workgroup:data-science/stripe

Просмотреть файл

@ -0,0 +1,11 @@
friendly_name: Subscription Platform
description: |-
Combined subscription information from multiple products
and payment platforms; see bug 1703340
dataset_base_acl: view_restricted
user_facing: true
labels: {}
workgroup_access:
- role: roles/bigquery.dataViewer
members:
- workgroup:mozilla-confidential

Просмотреть файл

@ -0,0 +1,8 @@
friendly_name: Subscription Platform Derived
description: |-
Tables combining subscription information from multiple products
and payment platforms
dataset_base_acl: derived_restricted
user_facing: false
labels: {}
workgroup_access: []

Просмотреть файл

@ -0,0 +1,12 @@
friendly_name: Telemetry
description: |-
Views on data from legacy Firefox telemetry, plus many other
general-purpose datasets
dataset_base_acl: view
user_facing: true
labels: {}
workgroup_access:
- role: roles/bigquery.dataViewer
members:
- workgroup:dataops-managed/taar
- workgroup:mozilla-confidential

Просмотреть файл

@ -0,0 +1,11 @@
friendly_name: Telemetry Derived
description: |-
Derived data based on pings from legacy Firefox telemetry, plus many other
general-purpose derived tables
dataset_base_acl: derived
user_facing: false
labels: {}
workgroup_access:
- role: roles/bigquery.dataViewer
members:
- workgroup:mozilla-confidential

Просмотреть файл

@ -0,0 +1,10 @@
friendly_name: User-Defined Functions
description: |-
Persistent user-defined functions
dataset_base_acl: udf
user_facing: true
labels: {}
workgroup_access:
- role: roles/bigquery.dataViewer
members:
- workgroup:mozilla-confidential

Просмотреть файл

@ -0,0 +1,10 @@
friendly_name: User-Defined Functions (JavaScript)
description: |-
Persistent user-defined functions written in JavaScript
dataset_base_acl: udf
user_facing: true
labels: {}
workgroup_access:
- role: roles/bigquery.dataViewer
members:
- workgroup:mozilla-confidential

Просмотреть файл

@ -0,0 +1,11 @@
friendly_name: User-Defined Functions (Legacy)
description: |-
Persistent user-defined functions intended for compatibility with queries
from legacy AWS infrastructure
dataset_base_acl: udf
user_facing: true
labels: {}
workgroup_access:
- role: roles/bigquery.dataViewer
members:
- workgroup:mozilla-confidential