Mirror of https://github.com/mozilla/docker-etl.git
Merge pull request #1 from acmiyaguchi/update
Remove bigquery-etl resolution
Commit 0921e9f9ac
@@ -1,3 +0,0 @@
-[submodule "bigquery-etl"]
-	path = bigquery-etl
-	url = git@github.com:mozilla/bigquery-etl.git
@@ -1 +0,0 @@
-Subproject commit cd51e987c365b60ed28e84ca4926bee2b7a5c9b7
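Taken together, these two hunks are the standard footprint of a git submodule removal: the first deletes the submodule's registration in .gitmodules, and the second deletes the gitlink entry that pinned bigquery-etl to commit cd51e987. In a working tree this is typically produced by running git submodule deinit -f bigquery-etl followed by git rm -f bigquery-etl.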
@@ -8,7 +8,6 @@ from .config import *
 from .crawler import (
     fetch_dataset_listing,
     fetch_table_listing,
-    resolve_bigquery_etl_references,
     resolve_view_references,
 )
 from .utils import ensure_folder, ndjson_load, print_json, qualify, run, run_query
@@ -37,15 +36,6 @@ def crawl():
     resolve_view_references(views_listing, data_root / project)
 
 
-@cli.command()
-def etl():
-    """Crawl bigquery-etl."""
-    # this is basically dryrun, but with some data collection baked in.
-    resolve_bigquery_etl_references(
-        ROOT / "bigquery-etl", ensure_folder(ROOT / "data" / "bigquery_etl")
-    )
-
-
 @cli.command()
 def query_logs():
     """Create edgelist from jobs by project query logs."""
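For context, the deleted etl subcommand follows a command-group CLI pattern: the @cli.command() decorator suggests cli is a click.Group, though the module's imports are not shown here. A minimal self-contained sketch of that pattern, with illustrative group and command names not taken from the repo:

import click

@click.group()
def cli():
    """Commands for crawling BigQuery metadata."""

@cli.command()
def crawl():
    """Example subcommand registered on the group."""
    click.echo("crawling...")

if __name__ == "__main__":
    cli()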
@@ -97,46 +97,6 @@ def _view_dryrun(view_root, view):
         json.dump(subset, fp, indent=2)
 
 
-def _bigquery_etl_dryrun(output_root: Path, query: Path):
-    # this makes the assumption that the query writes to a destination table
-    # relative to the path in the repository
-    project_id = "moz-fx-data-shared-prod"
-    dataset_id = query.parent.parent.name
-    table_id = query.parent.name
-    base_query = query.read_text()
-    data = None
-    # TODO: set the following parameters
-    # submission_date, n_clients, sample_size, min_sample_id, max_sample_id
-    try:
-        result = run(
-            [
-                "bq",
-                "query",
-                f"--project_id={project_id}",
-                "--format=json",
-                "--use_legacy_sql=false",
-                "--dry_run",
-                base_query,
-            ]
-        )
-        data = json.loads(result)
-    except Exception as e:
-        print(e)
-    if not data:
-        print(
-            f"unable to resolve query {query.relative_to(query.parent.parent.parent)}"
-        )
-        return
-    with (output_root / f"{dataset_id}.{table_id}.json").open("w") as fp:
-        subset = data["statistics"]
-        del subset["query"]["schema"]
-        subset = {
-            **dict(projectId=project_id, tableId=table_id, datasetId=dataset_id),
-            **subset,
-        }
-        json.dump(subset, fp, indent=2)
-
-
 def resolve_view_references(view_listing, project_root):
     # we don't really care about intermediate files
     view_root = Path(tempfile.mkdtemp())
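The removed _bigquery_etl_dryrun shells out to the bq CLI with --dry_run, so BigQuery validates the query and returns job statistics without executing it. A sketch of the same idea using the google-cloud-bigquery client library instead of the CLI, shown as an alternative rather than what this repo did:

from google.cloud import bigquery

def dryrun_stats(sql: str, project_id: str = "moz-fx-data-shared-prod") -> dict:
    # A dry run validates the query and reports statistics such as the
    # bytes it would scan and the tables it references, without running it.
    client = bigquery.Client(project=project_id)
    config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)
    job = client.query(sql, job_config=config)
    return {
        "totalBytesProcessed": job.total_bytes_processed,
        "referencedTables": [t.to_api_repr() for t in job.referenced_tables],
    }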
@@ -154,10 +114,3 @@ def resolve_view_references(view_listing, project_root):
             data = json.load(view.open())
             fp.write(json.dumps(data))
             fp.write("\n")
-
-
-def resolve_bigquery_etl_references(bigquery_etl_root: Path, output_root: Path):
-    queries = list(bigquery_etl_root.glob("sql/**/query.sql"))
-    query_root = ensure_folder(output_root / "query")
-    for query in queries:
-        _bigquery_etl_dryrun(query_root, query)
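The removed resolve_bigquery_etl_references relies entirely on the bigquery-etl repository convention sql/<dataset>/<table>/query.sql: the dataset and table IDs are read off the query's path, never parsed out of the SQL. A small illustration of that path arithmetic, using a hypothetical example path:

from pathlib import Path

# bigquery-etl layout convention: sql/<dataset_id>/<table_id>/query.sql
query = Path("bigquery-etl/sql/telemetry_derived/example_table_v1/query.sql")
dataset_id = query.parent.parent.name  # "telemetry_derived"
table_id = query.parent.name           # "example_table_v1"
print(f"{dataset_id}.{table_id}")      # telemetry_derived.example_table_v1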