Merge pull request #1 from acmiyaguchi/update

Remove bigquery-etl resolution
This commit is contained in:
Anthony Miyaguchi 2020-12-22 17:03:23 -08:00 коммит произвёл GitHub
Родитель 3801239a13 7204f8ef24
Коммит 0921e9f9ac
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
4 изменённых файлов: 0 добавлений и 61 удалений

3
.gitmodules поставляемый
Просмотреть файл

@@ -1,3 +0,0 @@
[submodule "bigquery-etl"]
path = bigquery-etl
url = git@github.com:mozilla/bigquery-etl.git

@@ -1 +0,0 @@
Subproject commit cd51e987c365b60ed28e84ca4926bee2b7a5c9b7

Просмотреть файл

@@ -8,7 +8,6 @@ from .config import *
from .crawler import (
fetch_dataset_listing,
fetch_table_listing,
resolve_bigquery_etl_references,
resolve_view_references,
)
from .utils import ensure_folder, ndjson_load, print_json, qualify, run, run_query
@@ -37,15 +36,6 @@ def crawl():
resolve_view_references(views_listing, data_root / project)
@cli.command()
def etl():
    """Crawl bigquery-etl."""
    # Effectively the dryrun flow, with some data collection baked in.
    output_dir = ensure_folder(ROOT / "data" / "bigquery_etl")
    resolve_bigquery_etl_references(ROOT / "bigquery-etl", output_dir)
@cli.command()
def query_logs():
"""Create edgelist from jobs by project query logs."""

Просмотреть файл

@@ -97,46 +97,6 @@ def _view_dryrun(view_root, view):
json.dump(subset, fp, indent=2)
def _bigquery_etl_dryrun(output_root: Path, query: Path):
    """Dry-run a single bigquery-etl query.sql and persist its statistics.

    Assumes the query writes to a destination table that mirrors the file's
    location in the repository (sql/<dataset_id>/<table_id>/query.sql), so
    the dataset and table are derived from the path rather than the SQL.
    """
    project_id = "moz-fx-data-shared-prod"
    dataset_id = query.parent.parent.name
    table_id = query.parent.name
    sql_text = query.read_text()
    # TODO: set the following parameters
    # submission_date, n_clients, sample_size, min_sample_id, max_sample_id
    data = None
    try:
        raw = run(
            [
                "bq",
                "query",
                f"--project_id={project_id}",
                "--format=json",
                "--use_legacy_sql=false",
                "--dry_run",
                sql_text,
            ]
        )
        data = json.loads(raw)
    except Exception as e:
        # Best-effort: a failed dry run is reported below, not raised.
        print(e)
    if not data:
        relative = query.relative_to(query.parent.parent.parent)
        print(f"unable to resolve query {relative}")
        return
    # Keep only the dry-run statistics; the resolved schema is noise here.
    subset = data["statistics"]
    del subset["query"]["schema"]
    subset = {
        "projectId": project_id,
        "tableId": table_id,
        "datasetId": dataset_id,
        **subset,
    }
    with (output_root / f"{dataset_id}.{table_id}.json").open("w") as fp:
        json.dump(subset, fp, indent=2)
def resolve_view_references(view_listing, project_root):
# we don't really care about intermediate files
view_root = Path(tempfile.mkdtemp())
@@ -154,10 +114,3 @@ def resolve_view_references(view_listing, project_root):
data = json.load(view.open())
fp.write(json.dumps(data))
fp.write("\n")
def resolve_bigquery_etl_references(bigquery_etl_root: Path, output_root: Path):
    """Dry-run every query.sql under the bigquery-etl checkout.

    Results are written under <output_root>/query, one JSON file per query.
    """
    query_root = ensure_folder(output_root / "query")
    for sql_path in bigquery_etl_root.glob("sql/**/query.sql"):
        _bigquery_etl_dryrun(query_root, sql_path)