Bug 1708264 - Generate dataset metadata (#1988)
* Generate dataset_metadata.yaml based on doctypes
* Exclude pioneer from dataset_metadata.yaml creation
* Mark `static` as user-facing
* Use namespace in friendly names, and use default descriptions
This commit is contained in:
Родитель
e81d4ff988
Коммит
7851b5d37a
|
@ -25,6 +25,7 @@ from typing import List
|
|||
|
||||
from bigquery_etl.dryrun import DryRun
|
||||
from bigquery_etl.format_sql.formatter import reformat
|
||||
from bigquery_etl.metadata.parse_metadata import DatasetMetadata
|
||||
from bigquery_etl.util import standard_args
|
||||
|
||||
MPS_URI = "https://github.com/mozilla-services/mozilla-pipeline-schemas"
|
||||
|
@ -109,6 +110,55 @@ class SchemaFile:
|
|||
)
|
||||
|
||||
|
||||
def write_dataset_metadata_if_not_exists(
    target_project: str, sql_dir: Path, schema: SchemaFile
):
    """Create default dataset_metadata.yaml files for one dataset family.

    Two locations under ``sql_dir / target_project`` are handled, in order:
    the ``<family>_derived`` dataset and the user-facing ``<family>``
    dataset, where ``<family>`` is ``schema.bq_dataset_family``. The
    dataset directory is always created if missing; the metadata file is
    only written when it is not already present, so existing files are
    never overwritten.

    ``schema`` is expected to be a single representative `SchemaFile` for
    the dataset; only its ``bq_dataset_family`` and ``document_namespace``
    attributes are read here.
    """
    family = schema.bq_dataset_family
    project_root = sql_dir / target_project

    # Metadata for the "<family>_derived" dataset.
    derived_yaml = project_root / f"{family}_derived" / "dataset_metadata.yaml"
    derived_yaml.parent.mkdir(parents=True, exist_ok=True)
    if not derived_yaml.exists():
        print(f"Creating {derived_yaml}")
        derived_metadata = DatasetMetadata(
            friendly_name=f"{schema.document_namespace} Derived",
            description=(
                f"Derived tables related to document namespace"
                f" {schema.document_namespace},"
                f" usually populated via queries defined in"
                f" https://github.com/mozilla/bigquery-etl"
                f" and managed by Airflow"
            ),
            dataset_base_acl="derived",
            user_facing=False,
        )
        derived_metadata.write(derived_yaml)

    # Metadata for the user-facing "<family>" dataset.
    view_yaml = project_root / family / "dataset_metadata.yaml"
    view_yaml.parent.mkdir(parents=True, exist_ok=True)
    if not view_yaml.exists():
        print(f"Creating {view_yaml}")
        view_metadata = DatasetMetadata(
            friendly_name=f"{schema.document_namespace}",
            description=(
                f"User-facing views related to document namespace"
                f" {schema.document_namespace}; see https://github.com/"
                f"mozilla-services/mozilla-pipeline-schemas/tree/"
                f"generated-schemas/schemas/{schema.document_namespace}"
            ),
            dataset_base_acl="view",
            user_facing=True,
        )
        view_metadata.write(view_yaml)
|
||||
|
||||
|
||||
def write_view_if_not_exists(target_project: str, sql_dir: Path, schema: SchemaFile):
|
||||
"""If a view.sql does not already exist, write one to the target directory."""
|
||||
target_dir = (
|
||||
|
@ -117,16 +167,12 @@ def write_view_if_not_exists(target_project: str, sql_dir: Path, schema: SchemaF
|
|||
/ schema.bq_dataset_family
|
||||
/ schema.bq_table_unversioned
|
||||
)
|
||||
|
||||
target_file = target_dir / "view.sql"
|
||||
|
||||
if target_file.exists():
|
||||
return
|
||||
|
||||
# Exclude doctypes maintained in separate projects.
|
||||
for prefix in SKIP_PREFIXES:
|
||||
if schema.bq_dataset_family.startswith(prefix):
|
||||
return
|
||||
|
||||
full_source_id = f"{target_project}.{schema.stable_table}"
|
||||
full_view_id = f"{target_project}.{schema.user_facing_view}"
|
||||
replacements = ["mozfun.norm.metadata(metadata) AS metadata"]
|
||||
|
@ -205,17 +251,29 @@ def get_stable_table_schemas() -> List[SchemaFile]:
|
|||
document_version=version,
|
||||
)
|
||||
)
|
||||
|
||||
# Exclude doctypes maintained in separate projects.
|
||||
for prefix in SKIP_PREFIXES:
|
||||
schemas = [
|
||||
schema
|
||||
for schema in schemas
|
||||
if not schema.document_namespace.startswith(prefix)
|
||||
]
|
||||
|
||||
# Retain only the highest version per doctype.
|
||||
schemas = sorted(
|
||||
schemas,
|
||||
key=lambda t: f"{t.document_namespace}/{t.document_type}/{t.document_version:03d}",
|
||||
)
|
||||
return [
|
||||
schemas = [
|
||||
last
|
||||
for k, (*_, last) in groupby(
|
||||
schemas, lambda t: f"{t.document_namespace}/{t.document_type}"
|
||||
)
|
||||
]
|
||||
|
||||
return schemas
|
||||
|
||||
|
||||
def prod_schemas_uri():
|
||||
"""Return URI for the schemas tarball deployed to shared-prod.
|
||||
|
@ -250,6 +308,9 @@ def main():
|
|||
parser.error(f"argument --log-level: {e}")
|
||||
|
||||
schemas = get_stable_table_schemas()
|
||||
one_schema_per_dataset = [
|
||||
last for k, (*_, last) in groupby(schemas, lambda t: t.bq_dataset_family)
|
||||
]
|
||||
|
||||
with ThreadPool(args.parallelism) as pool:
|
||||
pool.map(
|
||||
|
@ -261,6 +322,15 @@ def main():
|
|||
schemas,
|
||||
chunksize=1,
|
||||
)
|
||||
pool.map(
|
||||
partial(
|
||||
write_dataset_metadata_if_not_exists,
|
||||
args.target_project,
|
||||
Path(args.sql_dir),
|
||||
),
|
||||
one_schema_per_dataset,
|
||||
chunksize=1,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
friendly_name: Activity Stream
|
||||
# yamllint disable rule:line-length
|
||||
description: |-
|
||||
Views related to data from the activity-stream namespace,
|
||||
capturing activity on the desktop Firefox newtab page.
|
||||
User-facing views related to document namespace activity-stream; see https://github.com/mozilla-services/mozilla-pipeline-schemas/tree/generated-schemas/schemas/activity-stream
|
||||
dataset_base_acl: view
|
||||
user_facing: true
|
||||
labels: {}
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
friendly_name: Activity Stream
|
||||
friendly_name: Activity Stream Derived
|
||||
# yamllint disable rule:line-length
|
||||
description: |-
|
||||
Derived data from the activity-stream namespace,
|
||||
capturing activity on the desktop Firefox newtab page.
|
||||
Derived tables related to document namespace activity-stream, usually populated via queries defined in https://github.com/mozilla/bigquery-etl and managed by Airflow
|
||||
dataset_base_acl: derived
|
||||
user_facing: false
|
||||
labels: {}
|
||||
|
|
|
@ -0,0 +1,11 @@
|
|||
friendly_name: AMO Stats Dev
|
||||
description: |-
|
||||
Derived data used to power the dev instance of the AMO stats dashboards
|
||||
dataset_base_acl: stable
|
||||
user_facing: true
|
||||
labels: {}
|
||||
workgroup_access:
|
||||
- role: roles/bigquery.dataViewer
|
||||
members:
|
||||
- workgroup:mozilla-confidential
|
||||
- workgroup:amo/nonprod
|
|
@ -0,0 +1,11 @@
|
|||
friendly_name: AMO Stats Prod
|
||||
description: |-
|
||||
Derived data used to power the AMO stats dashboards
|
||||
dataset_base_acl: stable
|
||||
user_facing: true
|
||||
labels: {}
|
||||
workgroup_access:
|
||||
- role: roles/bigquery.dataViewer
|
||||
members:
|
||||
- workgroup:mozilla-confidential
|
||||
- workgroup:amo/prod
|
|
@ -0,0 +1,10 @@
|
|||
friendly_name: Analysis
|
||||
description: |-
|
||||
User-generated tables for analysis
|
||||
dataset_base_acl: view
|
||||
user_facing: true
|
||||
labels: {}
|
||||
workgroup_access:
|
||||
- role: roles/bigquery.dataViewer
|
||||
members:
|
||||
- workgroup:mozilla-confidential
|
|
@ -0,0 +1,12 @@
|
|||
friendly_name: Blocklist ADI
|
||||
description: |-
|
||||
Historical data for Firefox active daily installations.
|
||||
|
||||
See https://wiki.mozilla.org/ADI
|
||||
dataset_base_acl: view
|
||||
user_facing: true
|
||||
labels: {}
|
||||
workgroup_access:
|
||||
- role: roles/bigquery.dataViewer
|
||||
members:
|
||||
- workgroup:mozilla-confidential
|
|
@ -0,0 +1,11 @@
|
|||
friendly_name: contextual-services
|
||||
# yamllint disable rule:line-length
|
||||
description: |-
|
||||
User-facing views related to document namespace contextual-services; see https://github.com/mozilla-services/mozilla-pipeline-schemas/tree/generated-schemas/schemas/contextual-services
|
||||
dataset_base_acl: view
|
||||
user_facing: true
|
||||
labels: {}
|
||||
workgroup_access:
|
||||
- role: roles/bigquery.dataViewer
|
||||
members:
|
||||
- workgroup:contextual-services
|
|
@ -0,0 +1,11 @@
|
|||
friendly_name: contextual-services Derived
|
||||
# yamllint disable rule:line-length
|
||||
description: |-
|
||||
Derived tables related to document namespace contextual-services, usually populated via queries defined in https://github.com/mozilla/bigquery-etl and managed by Airflow
|
||||
dataset_base_acl: derived
|
||||
user_facing: false
|
||||
labels: {}
|
||||
workgroup_access:
|
||||
- role: roles/bigquery.dataViewer
|
||||
members:
|
||||
- workgroup:contextual-services
|
|
@ -0,0 +1,10 @@
|
|||
friendly_name: Firefox Accounts
|
||||
description: |-
|
||||
Data related to Firefox Accounts
|
||||
dataset_base_acl: view
|
||||
user_facing: true
|
||||
labels: {}
|
||||
workgroup_access:
|
||||
- role: roles/bigquery.dataViewer
|
||||
members:
|
||||
- workgroup:mozilla-confidential
|
|
@ -0,0 +1,10 @@
|
|||
friendly_name: Firefox Accounts Derived
|
||||
description: |-
|
||||
Derived tables for FxA data
|
||||
dataset_base_acl: derived
|
||||
user_facing: false
|
||||
labels: {}
|
||||
workgroup_access:
|
||||
- role: roles/bigquery.dataViewer
|
||||
members:
|
||||
- workgroup:mozilla-confidential
|
|
@ -0,0 +1,15 @@
|
|||
friendly_name: Internet Outages
|
||||
# yamllint disable rule:line-length
|
||||
description: |-
|
||||
Derived data useful for detecting regional internet outages,
|
||||
shared with some trusted external partners.
|
||||
|
||||
See https://docs.google.com/document/d/15HvdPS3UwGhAir6HyWHDLDRaxUFycx4jagJ_ZrHr9D8/edit
|
||||
dataset_base_acl: view
|
||||
user_facing: true
|
||||
labels: {}
|
||||
workgroup_access:
|
||||
- role: roles/bigquery.dataViewer
|
||||
members:
|
||||
- workgroup:mozilla-confidential
|
||||
- workgroup:internet-outages/external
|
|
@ -0,0 +1,13 @@
|
|||
friendly_name: Looker Derived
|
||||
description: |-
|
||||
Cache tables populated and used by Looker
|
||||
dataset_base_acl: derived
|
||||
user_facing: false
|
||||
labels: {}
|
||||
workgroup_access:
|
||||
- role: roles/bigquery.dataViewer
|
||||
members:
|
||||
- workgroup:mozilla-confidential
|
||||
- role: roles/bigquery.dataEditor
|
||||
members:
|
||||
- workgroup:dataops-managed/looker
|
|
@ -0,0 +1,11 @@
|
|||
friendly_name: Monitoring
|
||||
description: |-
|
||||
User-facing views for pipeline monitoring,
|
||||
including views on `payload_bytes` tables
|
||||
dataset_base_acl: view
|
||||
user_facing: true
|
||||
labels: {}
|
||||
workgroup_access:
|
||||
- role: roles/bigquery.dataViewer
|
||||
members:
|
||||
- workgroup:mozilla-confidential
|
|
@ -0,0 +1,10 @@
|
|||
friendly_name: Monitoring Derived
|
||||
description: |-
|
||||
Derived tables used for various pipeline monitoring purposes.
|
||||
dataset_base_acl: derived
|
||||
user_facing: false
|
||||
labels: {}
|
||||
workgroup_access:
|
||||
- role: roles/bigquery.dataViewer
|
||||
members:
|
||||
- workgroup:mozilla-confidential
|
|
@ -0,0 +1,10 @@
|
|||
friendly_name: Mozilla VPN
|
||||
description: |-
|
||||
Data related to the Mozilla VPN service
|
||||
dataset_base_acl: view
|
||||
user_facing: true
|
||||
labels: {}
|
||||
workgroup_access:
|
||||
- role: roles/bigquery.dataViewer
|
||||
members:
|
||||
- workgroup:mozilla-confidential
|
|
@ -0,0 +1,10 @@
|
|||
friendly_name: Mozilla VPN Derived
|
||||
description: |-
|
||||
Derived data related to the Mozilla VPN service
|
||||
dataset_base_acl: derived
|
||||
user_facing: false
|
||||
labels: {}
|
||||
workgroup_access:
|
||||
- role: roles/bigquery.dataViewer
|
||||
members:
|
||||
- workgroup:mozilla-confidential
|
|
@ -0,0 +1,7 @@
|
|||
friendly_name: Mozilla VPN External
|
||||
description: |-
|
||||
Data extracted from the Mozilla VPN service database.
|
||||
dataset_base_acl: restricted
|
||||
user_facing: false
|
||||
labels: {}
|
||||
workgroup_access: []
|
|
@ -0,0 +1,11 @@
|
|||
friendly_name: regrets-reporter
|
||||
# yamllint disable rule:line-length
|
||||
description: |-
|
||||
User-facing views related to document namespace regrets-reporter; see https://github.com/mozilla-services/mozilla-pipeline-schemas/tree/generated-schemas/schemas/regrets-reporter
|
||||
dataset_base_acl: view
|
||||
user_facing: true
|
||||
labels: {}
|
||||
workgroup_access:
|
||||
- role: roles/bigquery.dataViewer
|
||||
members:
|
||||
- workgroup:regrets-reporter
|
|
@ -0,0 +1,11 @@
|
|||
friendly_name: regrets-reporter Derived
|
||||
# yamllint disable rule:line-length
|
||||
description: |-
|
||||
Derived tables related to document namespace regrets-reporter, usually populated via queries defined in https://github.com/mozilla/bigquery-etl and managed by Airflow
|
||||
dataset_base_acl: derived
|
||||
user_facing: false
|
||||
labels: {}
|
||||
workgroup_access:
|
||||
- role: roles/bigquery.dataViewer
|
||||
members:
|
||||
- workgroup:regrets-reporter
|
|
@ -0,0 +1,10 @@
|
|||
friendly_name: Revenue Derived
|
||||
description: |-
|
||||
Derived tables related to Mozilla revenue work
|
||||
dataset_base_acl: derived
|
||||
user_facing: false
|
||||
labels: {}
|
||||
workgroup_access:
|
||||
- role: roles/bigquery.dataViewer
|
||||
members:
|
||||
- workgroup:mozilla-confidential
|
|
@ -0,0 +1,10 @@
|
|||
friendly_name: Search
|
||||
description: |-
|
||||
Views related to client search counts
|
||||
dataset_base_acl: view
|
||||
user_facing: true
|
||||
labels: {}
|
||||
workgroup_access:
|
||||
- role: roles/bigquery.dataViewer
|
||||
members:
|
||||
- workgroup:mozilla-confidential
|
|
@ -0,0 +1,10 @@
|
|||
friendly_name: Search Derived
|
||||
description: |-
|
||||
Derived tables related to client search counts
|
||||
dataset_base_acl: derived
|
||||
user_facing: false
|
||||
labels: {}
|
||||
workgroup_access:
|
||||
- role: roles/bigquery.dataViewer
|
||||
members:
|
||||
- workgroup:mozilla-confidential
|
|
@ -0,0 +1,10 @@
|
|||
friendly_name: Static Data
|
||||
description: |-
|
||||
Static tables, often useful for data-enriching joins
|
||||
dataset_base_acl: udf
|
||||
user_facing: true
|
||||
labels: {}
|
||||
workgroup_access:
|
||||
- role: roles/bigquery.dataViewer
|
||||
members:
|
||||
- workgroup:mozilla-confidential
|
|
@ -0,0 +1,10 @@
|
|||
friendly_name: Stripe
|
||||
description: |-
|
||||
Views related to data extracted from payment provider Stripe
|
||||
dataset_base_acl: view
|
||||
user_facing: true
|
||||
labels: {}
|
||||
workgroup_access:
|
||||
- role: roles/bigquery.dataViewer
|
||||
members:
|
||||
- workgroup:mozilla-confidential
|
|
@ -0,0 +1,10 @@
|
|||
friendly_name: Stripe Derived
|
||||
description: |-
|
||||
Derived data based on extract from payment provider Stripe
|
||||
dataset_base_acl: derived
|
||||
user_facing: false
|
||||
labels: {}
|
||||
workgroup_access:
|
||||
- role: roles/bigquery.dataViewer
|
||||
members:
|
||||
- workgroup:mozilla-confidential
|
|
@ -0,0 +1,11 @@
|
|||
friendly_name: Stripe External
|
||||
description: |-
|
||||
API extracts from Stripe, a payments partner
|
||||
dataset_base_acl: restricted
|
||||
user_facing: false
|
||||
labels: {}
|
||||
workgroup_access:
|
||||
- role: roles/bigquery.dataViewer
|
||||
members:
|
||||
- workgroup:stripe
|
||||
- workgroup:data-science/stripe
|
|
@ -0,0 +1,11 @@
|
|||
friendly_name: Subscription Platform
|
||||
description: |-
|
||||
Combined subscription information from multiple products
|
||||
and payment platforms; see bug 1703340
|
||||
dataset_base_acl: view_restricted
|
||||
user_facing: true
|
||||
labels: {}
|
||||
workgroup_access:
|
||||
- role: roles/bigquery.dataViewer
|
||||
members:
|
||||
- workgroup:mozilla-confidential
|
|
@ -0,0 +1,8 @@
|
|||
friendly_name: Subscription Platform Derived
|
||||
description: |-
|
||||
Tables combining subscription information from multiple products
|
||||
and payment platforms
|
||||
dataset_base_acl: derived_restricted
|
||||
user_facing: false
|
||||
labels: {}
|
||||
workgroup_access: []
|
|
@ -0,0 +1,12 @@
|
|||
friendly_name: Telemetry
|
||||
description: |-
|
||||
Views on data from legacy Firefox telemetry, plus many other
|
||||
general-purpose datasets
|
||||
dataset_base_acl: view
|
||||
user_facing: true
|
||||
labels: {}
|
||||
workgroup_access:
|
||||
- role: roles/bigquery.dataViewer
|
||||
members:
|
||||
- workgroup:dataops-managed/taar
|
||||
- workgroup:mozilla-confidential
|
|
@ -0,0 +1,11 @@
|
|||
friendly_name: Telemetry Derived
|
||||
description: |-
|
||||
Derived data based on pings from legacy Firefox telemetry, plus many other
|
||||
general-purpose derived tables
|
||||
dataset_base_acl: derived
|
||||
user_facing: false
|
||||
labels: {}
|
||||
workgroup_access:
|
||||
- role: roles/bigquery.dataViewer
|
||||
members:
|
||||
- workgroup:mozilla-confidential
|
|
@ -0,0 +1,10 @@
|
|||
friendly_name: User-Defined Functions
|
||||
description: |-
|
||||
Persistent user-defined functions
|
||||
dataset_base_acl: udf
|
||||
user_facing: true
|
||||
labels: {}
|
||||
workgroup_access:
|
||||
- role: roles/bigquery.dataViewer
|
||||
members:
|
||||
- workgroup:mozilla-confidential
|
|
@ -0,0 +1,10 @@
|
|||
friendly_name: User-Defined Functions (Javascript)
|
||||
description: |-
|
||||
Persistent user-defined functions written in Javascript
|
||||
dataset_base_acl: udf
|
||||
user_facing: true
|
||||
labels: {}
|
||||
workgroup_access:
|
||||
- role: roles/bigquery.dataViewer
|
||||
members:
|
||||
- workgroup:mozilla-confidential
|
|
@ -0,0 +1,11 @@
|
|||
friendly_name: User-Defined Functions (Legacy)
|
||||
description: |-
|
||||
Persistent user-defined functions intended for compatibility with queries
|
||||
from legacy AWS infrastructure
|
||||
dataset_base_acl: udf
|
||||
user_facing: true
|
||||
labels: {}
|
||||
workgroup_access:
|
||||
- role: roles/bigquery.dataViewer
|
||||
members:
|
||||
- workgroup:mozilla-confidential
|
Загрузка…
Ссылка в новой задаче