From 0bc5c7f5ad8582bd3fc3484f4fb3704bdc2e4eee Mon Sep 17 00:00:00 2001
From: Daniel Thorn
Date: Wed, 17 Mar 2021 10:57:57 -0700
Subject: [PATCH 1/2] Differentiate stable and derived views (#19)

---
Notes (ignored by `git am`; not part of the commit message):

* `_get_views` now inspects every `*/metadata.yaml` member of the tarball
  instead of `*/view.sql`. Entries whose `references` mapping has no
  `view.sql` key are skipped. For views under project
  `moz-fx-data-shared-prod`, the stored value is the list of references,
  each split on ".".
* `namespaces` sets `is_ping_table: True` on a table when its view has
  exactly one reference and the second-to-last component of that reference
  equals `<dataset_id>_stable`. Any other view is emitted only when the
  app's `app_channel` is "release".
* NOTE(review): `references[0][-2]` presumably relies on references being
  `project.dataset.table`; a reference without a "." would raise
  IndexError -- confirm against the metadata.yaml producer.
* The test fixture now writes two `metadata.yaml` members per dataset (one
  `_derived` reference, one `_stable` reference). The test echoes the CLI
  output and chains `result.exception` onto the failed exit-code assertion.
* NOTE(review): line breaks and indentation in this series were
  reconstructed from a whitespace-collapsed copy and only checked against
  the hunk line counts; verify against the upstream commits before applying.

 generator/namespaces.py  | 20 +++++++++++---
 tests/test_namespaces.py | 58 ++++++++++++++++++++++++++--------------
 2 files changed, 54 insertions(+), 24 deletions(-)

diff --git a/generator/namespaces.py b/generator/namespaces.py
index a6bb94a..498a424 100644
--- a/generator/namespaces.py
+++ b/generator/namespaces.py
@@ -29,10 +29,16 @@ def _get_views(uri):
     views = defaultdict(dict)
     with tarfile.open(fileobj=tarbytes, mode="r:gz") as tar:
         for tarinfo in tar:
-            if tarinfo.name.endswith("/view.sql"):
+            if tarinfo.name.endswith("/metadata.yaml"):
+                metadata = yaml.safe_load(tar.extractfile(tarinfo.name))
+                references = metadata.get("references", {})
+                if "view.sql" not in references:
+                    continue
                 *_, project, dataset_id, view_id, _ = tarinfo.name.split("/")
                 if project == "moz-fx-data-shared-prod":
-                    views[dataset_id][view_id] = tar.extractfile(tarinfo.name)
+                    views[dataset_id][view_id] = [
+                        ref.split(".") for ref in references["view.sql"]
+                    ]
     return views
 
 
@@ -74,15 +80,21 @@ def namespaces(custom_namespaces, generated_sql_uri, app_listings_uri):
         for app in group:
             if app.get("deprecated"):
                 continue
-            if canonical_app_name is None or app.get("app_channel") == "release":
+            is_release = app.get("app_channel") == "release"
+            if canonical_app_name is None or is_release:
                 canonical_app_name = app["canonical_app_name"]
             dataset_id = app["bq_dataset_family"]
-            for view_id in view_definitions[dataset_id]:
+            for view_id, references in view_definitions[dataset_id].items():
                 if view_id in OMIT_VIEWS:
                     continue
                 table = {"table": f"mozdata.{dataset_id}.{view_id}"}
                 if "app_channel" in app:
                     table["channel"] = app["app_channel"]
+                if len(references) == 1 and references[0][-2] == f"{dataset_id}_stable":
+                    # view references a single table in the stable dataset
+                    table["is_ping_table"] = True
+                elif not is_release:
+                    continue  # ignore non-ping tables from non-release datasets
                 views[view_id].append(table)
 
         namespaces[app_name] = {
diff --git a/tests/test_namespaces.py b/tests/test_namespaces.py
index bc124f9..12f7156 100644
--- a/tests/test_namespaces.py
+++ b/tests/test_namespaces.py
@@ -1,7 +1,8 @@
 import gzip
 import json
+import sys
 import tarfile
-import traceback
+from io import BytesIO
 from pathlib import Path
 from textwrap import dedent
 
@@ -39,22 +40,31 @@ def generated_sql_uri(tmp_path):
     dest = tmp_path / "bigquery_etl.tar.gz"
     with tarfile.open(dest, "w:gz") as tar:
         for dataset in ("glean_app", "glean_app_beta"):
-            tar.addfile(
-                tarfile.TarInfo(
-                    name=f"sql/moz-fx-data-shared-prod/{dataset}/baseline/view.sql"
-                ),
-                dedent(
-                    f"""
-                    CREATE OR REPLACE VIEW
-                      `moz-fx-data-shared-prod`.{dataset}.baseline
-                    AS
-                    SELECT
-                      *
-                    FROM
-                      `moz-fx-data-shared-prod`.{dataset}_stable.baseline
-                    """
-                ).lstrip(),
+            content = dedent(
+                f"""
+                references:
+                  view.sql:
+                  - moz-fx-data-shared-prod.{dataset}_derived.baseline_clients_daily_v1
+                """
+            ).lstrip()
+            info = tarfile.TarInfo(
+                f"sql/moz-fx-data-shared-prod/{dataset}/"
+                "baseline_clients_daily/metadata.yaml"
             )
+            info.size = len(content)
+            tar.addfile(info, BytesIO(content.encode()))
+            content = dedent(
+                f"""
+                references:
+                  view.sql:
+                  - moz-fx-data-shared-prod.{dataset}_stable.baseline_v1
+                """
+            ).lstrip()
+            info = tarfile.TarInfo(
+                f"sql/moz-fx-data-shared-prod/{dataset}/baseline/metadata.yaml"
+            )
+            info.size = len(content)
+            tar.addfile(info, BytesIO(content.encode()))
     return dest.absolute().as_uri()
 
 
@@ -97,11 +107,14 @@ def test_namespaces(runner, custom_namespaces, generated_sql_uri, app_listings_u
                 app_listings_uri,
             ],
         )
+        sys.stdout.write(result.stdout)
+        if result.stderr_bytes is not None:
+            sys.stderr.write(result.stderr)
         try:
-            assert not result.exception
-        except Exception:
-            traceback.print_tb(result.exc_info[2])
-            raise
+            assert result.exit_code == 0
+        except Exception as e:
+            # use exception chaining to expose original traceback
+            raise e from result.exception
         assert (
             dedent(
                 """
@@ -116,9 +129,14 @@ def test_namespaces(runner, custom_namespaces, generated_sql_uri, app_listings_u
                   views:
                     baseline:
                     - channel: release
+                      is_ping_table: true
                       table: mozdata.glean_app.baseline
                     - channel: beta
+                      is_ping_table: true
                       table: mozdata.glean_app_beta.baseline
+                    baseline_clients_daily:
+                    - channel: release
+                      table: mozdata.glean_app.baseline_clients_daily
                 """
             ).lstrip()
             == Path("namespaces.yaml").read_text()
From cef888e86d92659608e2ddf8230ac7d258e6913a Mon Sep 17 00:00:00 2001
From: Frank Bertsch
Date: Mon, 22 Mar 2021 11:19:25 -0400
Subject: [PATCH 2/2] Include explore info in namespaces.yaml

---
Notes (ignored by `git am`; not part of the commit message):

* Adds `generator/explores.py`. `Explore` is a dataclass with `name`,
  `type` and `views`; `to_dict()` returns
  `{name: {"type": type, "views": views}}`.
* `PingExplore.from_views` yields one explore of type "ping_explore", with
  `{"base_view": <view>}`, for each view whose channel entries all carry a
  truthy `is_ping_table`.
* `generator/namespaces.py` gains `_get_explores`, which merges the
  `to_dict()` of every explore produced by each class in `explore_types`.
  The result is written under the new "explores" key of each generated
  namespace.
* The expected YAML in the test gains the matching `explores` section for
  `glean-app`.

 generator/explores.py    | 34 ++++++++++++++++++++++++++++++++++
 generator/namespaces.py  | 13 +++++++++++++
 tests/test_namespaces.py |  5 +++++
 3 files changed, 52 insertions(+)
 create mode 100644 generator/explores.py

diff --git a/generator/explores.py b/generator/explores.py
new file mode 100644
index 0000000..977a2e0
--- /dev/null
+++ b/generator/explores.py
@@ -0,0 +1,34 @@
+"""All possible generated explores."""
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Dict, Iterator, List
+
+
+@dataclass
+class Explore:
+    """A generic explore."""
+
+    name: str
+    type: str
+    views: Dict[str, str]
+
+    def to_dict(self) -> dict:
+        """Explore instance represented as a dict."""
+        return {self.name: {"type": self.type, "views": self.views}}
+
+
+@dataclass
+class PingExplore(Explore):
+    """A Ping Table explore."""
+
+    @staticmethod
+    def from_views(views: Dict[str, List[Dict[str, str]]]) -> Iterator[PingExplore]:
+        """Generate all possible PingExplores from the views."""
+        for view, channel_infos in views.items():
+            is_ping_tbl = all((c.get("is_ping_table", False) for c in channel_infos))
+            if is_ping_tbl:
+                yield PingExplore(view, "ping_explore", {"base_view": view})
+
+
+explore_types = [PingExplore]
diff --git a/generator/namespaces.py b/generator/namespaces.py
index 498a424..230ac5a 100644
--- a/generator/namespaces.py
+++ b/generator/namespaces.py
@@ -10,10 +10,13 @@ from io import BytesIO
 from itertools import groupby
 from operator import itemgetter
 from pathlib import Path
+from typing import Dict, List
 
 import click
 import yaml
 
+from .explores import explore_types
+
 PROBE_INFO_BASE_URI = "https://probeinfo.telemetry.mozilla.org"
 
 OMIT_VIEWS = {"deletion_request"}
@@ -42,6 +45,15 @@ def _get_views(uri):
     return views
 
 
+def _get_explores(views: Dict[str, List[Dict[str, str]]]) -> dict:
+    explores = {}
+    for klass in explore_types:
+        for explore in klass.from_views(views):
+            explores.update(explore.to_dict())
+
+    return explores
+
+
 @click.command(help=__doc__)
 @click.option(
     "--custom-namespaces",
@@ -100,6 +112,7 @@ def namespaces(custom_namespaces, generated_sql_uri, app_listings_uri):
         namespaces[app_name] = {
             "canonical_app_name": canonical_app_name,
             "views": dict(views),
+            "explores": _get_explores(dict(views)),
         }
 
     if custom_namespaces is not None:
diff --git a/tests/test_namespaces.py b/tests/test_namespaces.py
index 12f7156..4a4bf6d 100644
--- a/tests/test_namespaces.py
+++ b/tests/test_namespaces.py
@@ -126,6 +126,11 @@ def test_namespaces(runner, custom_namespaces, generated_sql_uri, app_listings_u
                       table: mozdata.custom.baseline
                 glean-app:
                   canonical_app_name: Glean App
+                  explores:
+                    baseline:
+                      type: ping_explore
+                      views:
+                        base_view: baseline
                   views:
                     baseline:
                     - channel: release