Support alternate projects in publish_static
This commit is contained in:
Родитель
0a141f57bf
Коммит
f6c67c25f2
|
@ -3,9 +3,10 @@
|
|||
import os
|
||||
import json
|
||||
from argparse import ArgumentParser
|
||||
|
||||
from google.cloud import bigquery
|
||||
|
||||
from bigquery_etl.util.common import project_dirs
|
||||
|
||||
DATA_FILENAME = "data.csv"
|
||||
SCHEMA_FILENAME = "schema.json"
|
||||
DESCRIPTION_FILENAME = "description.txt"
|
||||
|
@ -18,7 +19,9 @@ def _parse_args():
|
|||
default="sql/",
|
||||
help="Path containing CSV's containing static data",
|
||||
)
|
||||
parser.add_argument("--project-id", help="Project to publish tables to")
|
||||
parser.add_argument(
|
||||
"--project-id", "--project_id", help="Project to publish tables to"
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
|
@ -27,10 +30,12 @@ def _load_table(
|
|||
):
|
||||
client = bigquery.Client()
|
||||
|
||||
# Assume path is .../dataset/table/data.csv
|
||||
# Assume path is ...project/data_dir/dataset/table/data.csv
|
||||
path_split = os.path.normcase(data_file_path).split("/")
|
||||
dataset_id = path_split[-3]
|
||||
table_id = path_split[-2]
|
||||
if not project:
|
||||
project = path_split[0]
|
||||
dataset_ref = client.dataset(dataset_id, project=project)
|
||||
table_ref = dataset_ref.table(table_id)
|
||||
|
||||
|
@ -75,26 +80,29 @@ def _load_table(
|
|||
def main():
|
||||
"""Publish csv files as BigQuery tables."""
|
||||
args = _parse_args()
|
||||
projects = project_dirs(args.project_id)
|
||||
data_dirs = [os.path.join(project, args.data_dir) for project in projects]
|
||||
|
||||
for root, dirs, files in os.walk(args.data_dir):
|
||||
for filename in files:
|
||||
if filename == DATA_FILENAME:
|
||||
schema_file_path = (
|
||||
os.path.join(root, SCHEMA_FILENAME)
|
||||
if SCHEMA_FILENAME in files
|
||||
else None
|
||||
)
|
||||
description_file_path = (
|
||||
os.path.join(root, DESCRIPTION_FILENAME)
|
||||
if DESCRIPTION_FILENAME in files
|
||||
else None
|
||||
)
|
||||
_load_table(
|
||||
os.path.join(root, filename),
|
||||
schema_file_path,
|
||||
description_file_path,
|
||||
args.project_id,
|
||||
)
|
||||
for data_dir in data_dirs:
|
||||
for root, dirs, files in os.walk(data_dir):
|
||||
for filename in files:
|
||||
if filename == DATA_FILENAME:
|
||||
schema_file_path = (
|
||||
os.path.join(root, SCHEMA_FILENAME)
|
||||
if SCHEMA_FILENAME in files
|
||||
else None
|
||||
)
|
||||
description_file_path = (
|
||||
os.path.join(root, DESCRIPTION_FILENAME)
|
||||
if DESCRIPTION_FILENAME in files
|
||||
else None
|
||||
)
|
||||
_load_table(
|
||||
os.path.join(root, filename),
|
||||
schema_file_path,
|
||||
description_file_path,
|
||||
args.project_id,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
@ -10,6 +10,7 @@ from google.cloud import storage
|
|||
|
||||
from bigquery_etl.util import standard_args
|
||||
from bigquery_etl.udf.parse_udf import read_udf_dirs, accumulate_dependencies
|
||||
from bigquery_etl.util.common import project_dirs
|
||||
|
||||
DEFAULT_PROJECT_ID = "moz-fx-data-shared-prod"
|
||||
DEFAULT_UDF_DIR = ["udf/", "udf_js/"]
|
||||
|
@ -81,15 +82,7 @@ def get_udf_dirs(udf_dirs, project_id):
|
|||
if project_id != "mozfun":
|
||||
# for non-mozfun projects, the default UDF directories are udf/ and udf_js/
|
||||
# the project needs to be pre-pended to these paths
|
||||
if project_id is None:
|
||||
# publish for all projects
|
||||
projects = [
|
||||
project_dir
|
||||
for project_dir in os.listdir()
|
||||
if project_dir.startswith("moz-fx-")
|
||||
]
|
||||
else:
|
||||
projects = [project_id]
|
||||
projects = project_dirs(project_id)
|
||||
|
||||
udf_dirs = [
|
||||
os.path.join(project, d)
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
"""Generic utility functions."""
|
||||
import os
|
||||
import re
|
||||
from typing import List
|
||||
|
||||
|
||||
# Search for all camelCase situations in reverse with arbitrary lookaheads.
|
||||
|
@ -22,3 +24,15 @@ def snake_case(line: str) -> str:
|
|||
words = REV_WORD_BOUND_PAT.split(subbed)
|
||||
# filter spaces between words and snake_case and reverse again
|
||||
return "_".join([w.lower() for w in words if w.strip()])[::-1]
|
||||
|
||||
|
||||
def project_dirs(project_id=None) -> List[str]:
    """Return the project directories to operate on.

    Args:
        project_id: When given, it is returned as the only entry and the
            filesystem is not consulted. When ``None``, the current working
            directory is scanned for project directories.

    Returns:
        ``[project_id]`` if a project was given; otherwise the sorted names
        of all directories in the current working directory whose name
        starts with ``"moz-fx-"`` (which excludes ``mozfun``).
    """
    if project_id is not None:
        return [project_id]

    # os.listdir() also yields plain files and returns entries in arbitrary
    # order: keep only real directories and sort so callers process projects
    # in a deterministic order.
    return sorted(
        entry
        for entry in os.listdir()
        if entry.startswith("moz-fx-") and os.path.isdir(entry)
    )
|
||||
|
|
|
@ -0,0 +1,10 @@
|
|||
from bigquery_etl.util.common import project_dirs
|
||||
|
||||
|
||||
class TestUtilCommon:
    """Tests for helpers imported from ``bigquery_etl.util.common``."""

    def test_project_dirs(self):
        """Check both the explicit-project and the discovery code paths.

        NOTE(review): the discovery assertions presumably require the test to
        run from a directory containing ``moz-fx-data-shared-prod`` - confirm
        against the test runner's working directory.
        """
        # An explicit project id comes back as a single-element list.
        explicit = project_dirs("test")
        assert explicit == ["test"]

        # Without an argument the known shared-prod project is found and
        # mozfun is left out.
        discovered = project_dirs()
        assert "moz-fx-data-shared-prod" in discovered
        assert "mozfun" not in discovered
|
Загрузка…
Ссылка в новой задаче