101 строка
3.1 KiB
Python
Executable File
101 строка
3.1 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
|
|
import os
|
|
import json
|
|
from argparse import ArgumentParser
|
|
|
|
from google.cloud import bigquery
|
|
|
|
# File names looked up inside every directory visited under --data-dir.
# A directory is treated as a table only when it contains DATA_FILENAME.
DATA_FILENAME = "data.csv"
# Optional JSON list of field definitions (name / type / mode / description).
SCHEMA_FILENAME = "schema.json"
# Optional text file whose content becomes the table description.
DESCRIPTION_FILENAME = "description.txt"
|
|
|
|
|
|
def parse_args(argv=None):
    """Parse the command-line options of the static-table publisher.

    Args:
        argv: Optional list of argument strings to parse. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``, which is what
            the script entry point relies on.

    Returns:
        argparse.Namespace with ``data_dir`` (defaults to ``"sql/"``) and
        ``project_id`` (``None`` when the flag is not given).
    """
    parser = ArgumentParser()
    parser.add_argument(
        "--data-dir",
        default="sql/",
        help="Path containing CSV's containing static data",
    )
    parser.add_argument("--project-id", help="Project to publish tables to")
    return parser.parse_args(argv)
|
|
|
|
|
|
def _dataset_and_table_ids(data_file_path):
    """Return ``(dataset_id, table_id)`` for a ``.../dataset/table/data.csv`` path."""
    # Resolve to an absolute path and walk up with os.path so the result does
    # not depend on the separator style, on the case folding normcase applies
    # on Windows, or on how many components a relative path happens to have.
    table_dir = os.path.dirname(os.path.abspath(data_file_path))
    dataset_dir = os.path.dirname(table_dir)
    dataset_id = os.path.basename(dataset_dir)
    table_id = os.path.basename(table_dir)
    if not dataset_id or not table_id:
        raise ValueError(
            f"Cannot derive dataset/table from {data_file_path!r}; "
            "expected a path like .../dataset/table/data.csv"
        )
    return dataset_id, table_id


def _header_field_names(data_file):
    """Return the column names from the first row of a binary CSV file object."""
    # Function-scope import keeps this block self-contained.
    import csv

    # utf-8-sig drops a leading BOM so it cannot leak into the first name.
    header_line = data_file.readline().decode("utf-8-sig").strip()
    # Rewind: the load job must upload the whole file, header included
    # (the header row is skipped server-side via skip_leading_rows=1).
    data_file.seek(0)
    # csv.reader honours quoting, unlike a plain split(",").
    return next(csv.reader([header_line]), [])


def _schema_from_json(schema_file_path):
    """Build the list of SchemaField objects described by a JSON schema file."""
    with open(schema_file_path, encoding="utf-8") as schema_file:
        field_specs = json.load(schema_file)
    return [
        bigquery.SchemaField(
            spec["name"],
            field_type=spec.get("type", "STRING"),
            mode=spec.get("mode", "NULLABLE"),
            description=spec.get("description"),
        )
        for spec in field_specs
    ]


def load_table(
    data_file_path, schema_file_path=None, description_file_path=None, project=None
):
    """Load one CSV file into the BigQuery table implied by its path.

    The destination is derived from the path, which is assumed to look like
    ``.../dataset/table/data.csv``. The table content is replaced
    (``WRITE_TRUNCATE``) and the first CSV row is skipped as a header.

    Args:
        data_file_path: Path of the CSV file to upload.
        schema_file_path: Optional path of a JSON file holding a list of
            field objects with a ``name`` key and optional ``type``
            (default ``"STRING"``), ``mode`` (default ``"NULLABLE"``) and
            ``description`` keys. When omitted, every column named in the
            CSV header is loaded as a STRING field.
        description_file_path: Optional path of a text file whose content is
            set as the table description after the load finishes.
        project: Optional project id for the destination dataset; defaults
            to the client's own project.

    Raises:
        ValueError: If the dataset/table cannot be derived from the path, or
            no schema file is given and the CSV has no usable header row.
    """
    # Validate the path before creating a client, so a bad path fails early.
    dataset_id, table_id = _dataset_and_table_ids(data_file_path)

    client = bigquery.Client()
    # Client.dataset() is deprecated; build the reference directly, keeping
    # its fallback to the client's project when none is given.
    dataset_ref = bigquery.DatasetReference(
        project if project is not None else client.project, dataset_id
    )
    table_ref = dataset_ref.table(table_id)

    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.CSV,
        skip_leading_rows=1,
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
    )

    with open(data_file_path, "rb") as data_file:
        if schema_file_path is None:
            field_names = _header_field_names(data_file)
            if not any(field_names):
                raise ValueError(f"{data_file_path!r} has no CSV header row")
            # Assume all fields are strings and nullable
            job_config.schema = [
                bigquery.SchemaField(name, field_type="STRING")
                for name in field_names
            ]
        else:
            job_config.schema = _schema_from_json(schema_file_path)

        job = client.load_table_from_file(data_file, table_ref, job_config=job_config)

    # Wait for the load job to finish before touching the table metadata.
    job.result()

    if description_file_path is not None:
        with open(description_file_path, encoding="utf-8") as description_file:
            description = description_file.read()
        table = client.get_table(table_ref)
        table.description = description
        client.update_table(table, ["description"])
|
|
|
|
|
|
def main():
    """Walk the data directory and load every table directory found in it.

    A directory is loaded when it contains ``DATA_FILENAME``. The schema and
    description files are passed along only when they sit in the same
    directory; otherwise ``None`` is passed for them.
    """
    args = parse_args()

    for root, _dirs, files in os.walk(args.data_dir):
        # File names within one directory are unique, so a membership test
        # replaces scanning the listing for the data file.
        if DATA_FILENAME not in files:
            continue

        schema_file_path = (
            os.path.join(root, SCHEMA_FILENAME) if SCHEMA_FILENAME in files else None
        )
        description_file_path = (
            os.path.join(root, DESCRIPTION_FILENAME)
            if DESCRIPTION_FILENAME in files
            else None
        )
        load_table(
            os.path.join(root, DATA_FILENAME),
            schema_file_path,
            description_file_path,
            args.project_id,
        )
|
|
|
|
|
|
# Run the publisher only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|