Run black and refactor --only and --except args

This commit is contained in:
Jeff Klukas 2019-07-29 16:26:41 -04:00
Родитель ccb65d6d18
Коммит 00cef9d7e9
1 изменённый файл: 29 добавлений и 20 удалений

Просмотреть файл

@@ -12,6 +12,7 @@ or to process only a specific list of tables.
from argparse import ArgumentParser
from datetime import datetime
from fnmatch import fnmatch
from google.cloud import bigquery
@@ -49,25 +50,29 @@ parser.add_argument(
parser.add_argument(
"--dry-run",
action="store_true",
help=("Do not run queries, but instead print the query job config "
"and bytes that would be processed"),
help=(
"Do not run queries, but instead print the query job config "
"and bytes that would be processed"
),
)
group = parser.add_mutually_exclusive_group()
group.add_argument(
"--only",
nargs="+",
dest="only_tables",
default=[],
help=("Process only the given tables; "
"pass names like 'telemetry_live.main_v4'"),
help=(
"Process only the given tables; "
"pass names or globs like 'telemetry_live.main_v*' "
),
)
group.add_argument(
"--except",
nargs="+",
dest="except_tables",
default=[],
help=("Process all tables in *_live datasets except for the given tables; "
"pass names like 'telemetry_live.main_v4'"),
help=(
"Process all tables in *_live datasets except for the given tables; "
"pass names or globs like 'telemetry_live.main_v*'"
),
)
@@ -75,9 +80,7 @@ def sql_full_table_id(table):
return table.full_table_id.replace(":", ".")
def run_deduplication_query(
client, live_table, stable_table, date, dry_run
):
def run_deduplication_query(client, live_table, stable_table, date, dry_run):
sql = QUERY_TEMPLATE.format(source_table_spec=sql_full_table_id(live_table))
destination = f"{sql_full_table_id(stable_table)}${date:%Y%m%d}"
@@ -111,24 +114,30 @@ def main():
client = bigquery.Client()
live_datasets = [
d for d in client.list_datasets(args.project_id) if d.dataset_id.endswith("_live")
d
for d in client.list_datasets(args.project_id)
if d.dataset_id.endswith("_live")
]
for live_dataset in live_datasets:
stable_dataset_id = live_dataset.dataset_id[:-5] + "_stable"
for live_table in client.list_tables(live_dataset.reference):
live_table_spec = f"{live_table.dataset_id}.{live_table.table_id}"
stable_table = client.get_table('.'.join([args.project_id, stable_dataset_id, live_table.table_id]))
if live_table_spec not in args.only_tables:
stable_table = client.get_table(
".".join([args.project_id, stable_dataset_id, live_table.table_id])
)
if args.except_tables is not None and any(
fnmatch(live_table_spec, pattern) for pattern in args.except_tables
):
print(f"Skipping {live_table_spec} due to --except argument")
continue
if live_table_spec in args.except_tables:
if args.only_tables is not None and not any(
fnmatch(live_table_spec, pattern) for pattern in args.only_tables
):
print(f"Skipping {live_table_spec} due to --only argument")
continue
run_deduplication_query(
client,
live_table,
stable_table,
args.date,
args.dry_run,
client, live_table, stable_table, args.date, args.dry_run
)