Support alternate projects in publish_static

Anna Scholtz 2020-09-28 14:22:40 -07:00
Parent 0a141f57bf
Commit f6c67c25f2
4 changed files with 56 additions and 31 deletions


@@ -3,9 +3,10 @@
import os
import json
from argparse import ArgumentParser
from google.cloud import bigquery
+from bigquery_etl.util.common import project_dirs
DATA_FILENAME = "data.csv"
SCHEMA_FILENAME = "schema.json"
DESCRIPTION_FILENAME = "description.txt"
@@ -18,7 +19,9 @@ def _parse_args():
default="sql/",
help="Path containing CSV's containing static data",
)
parser.add_argument("--project-id", help="Project to publish tables to")
parser.add_argument(
"--project-id", "--project_id", help="Project to publish tables to"
)
return parser.parse_args()
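Side note on the flag change above: argparse accepts several option strings for one argument and derives the destination from the first long option, so both spellings end up in args.project_id. A minimal standalone sketch (the value passed below is made up):

from argparse import ArgumentParser

parser = ArgumentParser()
parser.add_argument(
    "--project-id", "--project_id", help="Project to publish tables to"
)
# dest defaults to "project_id", so either spelling fills the same attribute
args = parser.parse_args(["--project_id", "my-example-project"])
print(args.project_id)  # my-example-project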
@@ -27,10 +30,12 @@ def _load_table(
):
    client = bigquery.Client()
-    # Assume path is .../dataset/table/data.csv
+    # Assume path is ...project/data_dir/dataset/table/data.csv
    path_split = os.path.normcase(data_file_path).split("/")
    dataset_id = path_split[-3]
    table_id = path_split[-2]
+    if not project:
+        project = path_split[0]
    dataset_ref = client.dataset(dataset_id, project=project)
    table_ref = dataset_ref.table(table_id)
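To illustrate the path convention in the comment above, a small sketch of how a data file path resolves to project, dataset, and table (the path itself is a made-up example, not taken from the repository):

import os

# Hypothetical layout: project/data_dir/dataset/table/data.csv
path = "moz-fx-data-shared-prod/sql/telemetry_derived/my_table_v1/data.csv"
path_split = os.path.normcase(path).split("/")
dataset_id = path_split[-3]  # "telemetry_derived"
table_id = path_split[-2]    # "my_table_v1"
project = path_split[0]      # "moz-fx-data-shared-prod", used when --project-id is not given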
@@ -75,26 +80,29 @@ def _load_table(
def main():
"""Publish csv files as BigQuery tables."""
args = _parse_args()
projects = project_dirs(args.project_id)
data_dirs = [os.path.join(project, args.data_dir) for project in projects]
for root, dirs, files in os.walk(args.data_dir):
for filename in files:
if filename == DATA_FILENAME:
schema_file_path = (
os.path.join(root, SCHEMA_FILENAME)
if SCHEMA_FILENAME in files
else None
)
description_file_path = (
os.path.join(root, DESCRIPTION_FILENAME)
if DESCRIPTION_FILENAME in files
else None
)
_load_table(
os.path.join(root, filename),
schema_file_path,
description_file_path,
args.project_id,
)
for data_dir in data_dirs:
for root, dirs, files in os.walk(data_dir):
for filename in files:
if filename == DATA_FILENAME:
schema_file_path = (
os.path.join(root, SCHEMA_FILENAME)
if SCHEMA_FILENAME in files
else None
)
description_file_path = (
os.path.join(root, DESCRIPTION_FILENAME)
if DESCRIPTION_FILENAME in files
else None
)
_load_table(
os.path.join(root, filename),
schema_file_path,
description_file_path,
args.project_id,
)
if __name__ == "__main__":

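For context, a rough sketch of what the new per-project walk starts from, assuming a checkout with the hypothetical project directories below and the default --data-dir:

import os

# Directory names are illustrative, not a statement about the repository layout.
projects = ["moz-fx-data-shared-prod", "moz-fx-data-marketing-prod"]
data_dirs = [os.path.join(project, "sql/") for project in projects]
# -> ["moz-fx-data-shared-prod/sql/", "moz-fx-data-marketing-prod/sql/"]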

@@ -10,6 +10,7 @@ from google.cloud import storage
from bigquery_etl.util import standard_args
from bigquery_etl.udf.parse_udf import read_udf_dirs, accumulate_dependencies
+from bigquery_etl.util.common import project_dirs
DEFAULT_PROJECT_ID = "moz-fx-data-shared-prod"
DEFAULT_UDF_DIR = ["udf/", "udf_js/"]
@@ -81,15 +82,7 @@ def get_udf_dirs(udf_dirs, project_id):
    if project_id != "mozfun":
        # for non-mozfun projects, the default UDF directories are udf/ and udf_js/
        # the project needs to be pre-pended to these paths
-        if project_id is None:
-            # publish for all projects
-            projects = [
-                project_dir
-                for project_dir in os.listdir()
-                if project_dir.startswith("moz-fx-")
-            ]
-        else:
-            projects = [project_id]
+        projects = project_dirs(project_id)
        udf_dirs = [
            os.path.join(project, d)
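As a rough illustration of the comprehension above (assuming the DEFAULT_UDF_DIR values shown earlier and a single hypothetical project):

import os

projects = ["moz-fx-data-shared-prod"]
DEFAULT_UDF_DIR = ["udf/", "udf_js/"]
udf_dirs = [os.path.join(project, d) for project in projects for d in DEFAULT_UDF_DIR]
# -> ["moz-fx-data-shared-prod/udf/", "moz-fx-data-shared-prod/udf_js/"]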


@@ -1,5 +1,7 @@
"""Generic utility functions."""
+import os
import re
+from typing import List
# Search for all camelCase situations in reverse with arbitrary lookaheads.
@@ -22,3 +24,15 @@ def snake_case(line: str) -> str:
    words = REV_WORD_BOUND_PAT.split(subbed)
    # filter spaces between words and snake_case and reverse again
    return "_".join([w.lower() for w in words if w.strip()])[::-1]
+def project_dirs(project_id=None) -> List[str]:
+    """Return all project directories, except mozfun."""
+    if project_id is None:
+        return [
+            project_dir
+            for project_dir in os.listdir()
+            if project_dir.startswith("moz-fx-")
+        ]
+    else:
+        return [project_id]
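A quick usage sketch of the new helper; the project names below are examples only:

from bigquery_etl.util.common import project_dirs

project_dirs("moz-fx-data-test-project")  # -> ["moz-fx-data-test-project"]
# With no argument, every directory in the current working directory whose
# name starts with "moz-fx-" is returned, e.g. ["moz-fx-data-shared-prod", ...]
all_projects = project_dirs()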

tests/util/test_common.py (new file)

@@ -0,0 +1,10 @@
+from bigquery_etl.util.common import project_dirs
+class TestUtilCommon:
+    def test_project_dirs(self):
+        assert project_dirs("test") == ["test"]
+        existing_projects = project_dirs()
+        assert "moz-fx-data-shared-prod" in existing_projects
+        assert "mozfun" not in existing_projects