Mirror of https://github.com/mozilla/docker-etl.git
Merge pull request #1 from acmiyaguchi/update
Remove bigquery-etl resolution
Commit 0921e9f9ac
@@ -1,3 +0,0 @@
-[submodule "bigquery-etl"]
-	path = bigquery-etl
-	url = git@github.com:mozilla/bigquery-etl.git
@@ -1 +0,0 @@
-Subproject commit cd51e987c365b60ed28e84ca4926bee2b7a5c9b7
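Taken together, these two hunks are the standard footprint of a git submodule removal: the first deletes the submodule's registration in .gitmodules, and the second deletes the gitlink entry that pinned bigquery-etl to commit cd51e987. In a working tree this is typically produced by running git submodule deinit -f bigquery-etl followed by git rm -f bigquery-etl.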
@@ -8,7 +8,6 @@ from .config import *
 from .crawler import (
     fetch_dataset_listing,
     fetch_table_listing,
-    resolve_bigquery_etl_references,
     resolve_view_references,
 )
 from .utils import ensure_folder, ndjson_load, print_json, qualify, run, run_query
@@ -37,15 +36,6 @@ def crawl():
     resolve_view_references(views_listing, data_root / project)
 
 
-@cli.command()
-def etl():
-    """Crawl bigquery-etl."""
-    # this is basically dryrun, but with some data collection baked in.
-    resolve_bigquery_etl_references(
-        ROOT / "bigquery-etl", ensure_folder(ROOT / "data" / "bigquery_etl")
-    )
-
-
 @cli.command()
 def query_logs():
     """Create edgelist from jobs by project query logs."""
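For context, the deleted etl subcommand follows a command-group CLI pattern: the @cli.command() decorator suggests cli is a click.Group, though the module's imports are not shown here. A minimal self-contained sketch of that pattern, with illustrative group and command names not taken from the repo:

import click

@click.group()
def cli():
    """Commands for crawling BigQuery metadata."""

@cli.command()
def crawl():
    """Example subcommand registered on the group."""
    click.echo("crawling...")

if __name__ == "__main__":
    cli()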
@@ -97,46 +97,6 @@ def _view_dryrun(view_root, view):
         json.dump(subset, fp, indent=2)
 
 
-def _bigquery_etl_dryrun(output_root: Path, query: Path):
-    # this makes the assumption that the query writes to a destination table
-    # relative to the path in the repository
-    project_id = "moz-fx-data-shared-prod"
-    dataset_id = query.parent.parent.name
-    table_id = query.parent.name
-    base_query = query.read_text()
-    data = None
-    # TODO: set the following parameters
-    # submission_date, n_clients, sample_size, min_sample_id, max_sample_id
-    try:
-        result = run(
-            [
-                "bq",
-                "query",
-                f"--project_id={project_id}",
-                "--format=json",
-                "--use_legacy_sql=false",
-                "--dry_run",
-                base_query,
-            ]
-        )
-        data = json.loads(result)
-    except Exception as e:
-        print(e)
-    if not data:
-        print(
-            f"unable to resolve query {query.relative_to(query.parent.parent.parent)}"
-        )
-        return
-    with (output_root / f"{dataset_id}.{table_id}.json").open("w") as fp:
-        subset = data["statistics"]
-        del subset["query"]["schema"]
-        subset = {
-            **dict(projectId=project_id, tableId=table_id, datasetId=dataset_id),
-            **subset,
-        }
-        json.dump(subset, fp, indent=2)
-
-
 def resolve_view_references(view_listing, project_root):
     # we don't really care about intermediate files
     view_root = Path(tempfile.mkdtemp())
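The removed _bigquery_etl_dryrun shells out to the bq CLI with --dry_run, so BigQuery validates the query and returns job statistics without executing it. A sketch of the same idea using the google-cloud-bigquery client library instead of the CLI, shown as an alternative rather than what this repo did:

from google.cloud import bigquery

def dryrun_stats(sql: str, project_id: str = "moz-fx-data-shared-prod") -> dict:
    # A dry run validates the query and reports statistics such as the
    # bytes it would scan and the tables it references, without running it.
    client = bigquery.Client(project=project_id)
    config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)
    job = client.query(sql, job_config=config)
    return {
        "totalBytesProcessed": job.total_bytes_processed,
        "referencedTables": [t.to_api_repr() for t in job.referenced_tables],
    }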
@@ -154,10 +114,3 @@ def resolve_view_references(view_listing, project_root):
             data = json.load(view.open())
             fp.write(json.dumps(data))
             fp.write("\n")
-
-
-def resolve_bigquery_etl_references(bigquery_etl_root: Path, output_root: Path):
-    queries = list(bigquery_etl_root.glob("sql/**/query.sql"))
-    query_root = ensure_folder(output_root / "query")
-    for query in queries:
-        _bigquery_etl_dryrun(query_root, query)
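The removed resolve_bigquery_etl_references relies entirely on the bigquery-etl repository convention sql/<dataset>/<table>/query.sql: the dataset and table IDs are read off the query's path, never parsed out of the SQL. A small illustration of that path arithmetic, using a hypothetical example path:

from pathlib import Path

# bigquery-etl layout convention: sql/<dataset_id>/<table_id>/query.sql
query = Path("bigquery-etl/sql/telemetry_derived/example_table_v1/query.sql")
dataset_id = query.parent.parent.name  # "telemetry_derived"
table_id = query.parent.name           # "example_table_v1"
print(f"{dataset_id}.{table_id}")      # telemetry_derived.example_table_v1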