Refactor publish_json script
Parent: 82a6a5f687
Commit: 9b6aa5fb46
@@ -1,5 +1,7 @@
 """Machinery for exporting query results as JSON to Cloud storage."""
+
+from argparse import ArgumentParser
 from google.cloud import storage
 from google.cloud import bigquery
 import smart_open
 import logging
@@ -16,6 +18,8 @@ MAX_JSON_SIZE = 1 * 1024 * 1024 * 1024  # 1 GB as max. size of exported JSON fil
 MAX_FILE_COUNT = 10_000
 # exported file name format: 000000000000.json.gz, 000000000001.json.gz, ...
 MAX_JSON_NAME_LENGTH = 12
+DEFAULT_BUCKET = "mozilla-public-data-http"
+DEFAULT_API_VERSION = "v1"
 
 logging.basicConfig(
     level=logging.DEBUG, format="%(asctime)s: %(levelname)s: %(message)s"
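Note: the file-name comment and MAX_JSON_NAME_LENGTH together describe the naming of exported shards. A minimal sketch of the zero-padded naming they imply (the helper below is illustrative, not code from this commit):

    MAX_JSON_NAME_LENGTH = 12

    def exported_file_name(index):
        # Zero-pad the shard index to twelve digits, matching the
        # 000000000000.json.gz, 000000000001.json.gz, ... format above.
        return str(index).zfill(MAX_JSON_NAME_LENGTH) + ".json.gz"

    assert exported_file_name(0) == "000000000000.json.gz"
    assert exported_file_name(1) == "000000000001.json.gz"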
@@ -229,3 +233,62 @@ class JsonPublisher:
         )
         query_job = self.client.query(sql, job_config=job_config)
         query_job.result()
+
+
+parser = ArgumentParser(description=__doc__)
+parser.add_argument(
+    "--target-bucket",
+    "--target_bucket",
+    default=DEFAULT_BUCKET,
+    help="GCP bucket JSON data is exported to",
+)
+parser.add_argument(
+    "--project_id",
+    default="mozilla-public-data",
+    help="Run query in the target project",
+)
+parser.add_argument(
+    "--api_version",
+    "--api-version",
+    default=DEFAULT_API_VERSION,
+    help="API version data is published under in the storage bucket",
+)
+parser.add_argument(
+    "--parameter", action="append", help="Query parameters, such as submission_date"
+)
+parser.add_argument(
+    "--query-file", "--query_file", help="File path to query to be executed"
+)
+
+
+def main():
+    """Publish query data as JSON to GCS."""
+    args, query_arguments = parser.parse_known_args()
+
+    try:
+        metadata = Metadata.of_sql_file(args.query_file)
+    except FileNotFoundError:
+        print("No metadata file for: {}".format(args.query_file))
+        return
+
+    # check if the data should be published as JSON
+    if not metadata.is_public_json():
+        return
+
+    storage_client = storage.Client()
+    client = bigquery.Client(args.project_id)
+
+    publisher = JsonPublisher(
+        client,
+        storage_client,
+        args.project_id,
+        args.query_file,
+        args.api_version,
+        args.target_bucket,
+        args.parameter,
+    )
+    publisher.publish_json()
+
+
+if __name__ == "__main__":
+    main()
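The module now carries its own ArgumentParser and main() entry point, so it can be run directly. Because main() uses parse_known_args(), flags the parser does not define are collected rather than rejected. A minimal sketch of that behavior (the --dry_run flag and the parameter value below are hypothetical examples, not part of this commit):

    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument("--parameter", action="append")

    # Unrecognized flags land in `extra` instead of raising an error.
    args, extra = parser.parse_known_args(
        ["--parameter", "submission_date:DATE:2020-03-15", "--dry_run"]
    )
    print(args.parameter)  # ['submission_date:DATE:2020-03-15']
    print(extra)           # ['--dry_run']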
@@ -1,84 +1,11 @@
-#!/usr/bin/env python3
+#!/bin/sh
 
-"""
-Export query result data as JSON to a publicly accessible bucket.
-
-Data of the query is exported if "json_export" is set in
-the corresponding metadata file.
-"""
-
-from argparse import ArgumentParser
-import os
-import sys
-
-from google.cloud import storage
-from google.cloud import bigquery
-
-# sys.path needs to be modified to enable package imports from parent
-# and sibling directories. Also see:
-# https://stackoverflow.com/questions/6323860/sibling-package-imports/23542795#23542795
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from bigquery_etl.parse_metadata import Metadata  # noqa E402
-from bigquery_etl.publish_json import JsonPublisher  # noqa E402
-
-
-DEFAULT_BUCKET = "mozilla-public-data-http"
-DEFAULT_API_VERSION = "v1"
-
-
-parser = ArgumentParser(description=__doc__)
-parser.add_argument(
-    "--target-bucket",
-    "--target_bucket",
-    default=DEFAULT_BUCKET,
-    help="GCP bucket JSON data is exported to",
-)
-parser.add_argument(
-    "--project_id",
-    default="mozilla-public-data",
-    help="Run query in the target project",
-)
-parser.add_argument(
-    "--api_version",
-    "--api-version",
-    default=DEFAULT_API_VERSION,
-    help="API version data is published under in the storage bucket",
-)
-parser.add_argument(
-    "--parameter", action="append", help="Query parameters, such as submission_date"
-)
-parser.add_argument(
-    "--query-file", "--query_file", help="File path to query to be executed"
-)
-
-
-def main():
-    args, query_arguments = parser.parse_known_args()
-
-    try:
-        metadata = Metadata.of_sql_file(args.query_file)
-    except FileNotFoundError:
-        print("No metadata file for: {}".format(args.query_file))
-        return
-
-    # check if the data should be published as JSON
-    if not metadata.is_public_json():
-        return
-
-    storage_client = storage.Client()
-    client = bigquery.Client(args.project_id)
-
-    publisher = JsonPublisher(
-        client,
-        storage_client,
-        args.project_id,
-        args.query_file,
-        args.api_version,
-        args.target_bucket,
-        args.parameter,
-    )
-    publisher.publish_json()
-
-
-if __name__ == "__main__":
-    main()
+# Export query result data as JSON to a publicly accessible bucket.
+#
+# Data of the query is exported if "json_export" is set in
+# the corresponding metadata file.
+
+
+cd "$(dirname "$0")/.."
+
+exec python3 -m bigquery_etl.public_data.publish_json "$@"
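The wrapper replaces the sys.path manipulation with a plain `python3 -m` invocation: running a module with -m imports its package normally and executes the module with __name__ set to "__main__", which triggers the guard added at the bottom of publish_json.py. A rough stdlib equivalent of what the exec line does (a sketch, assuming the package is importable from the working directory the script cd's into):

    import runpy

    # Executes bigquery_etl/public_data/publish_json.py as if run with
    # `python3 -m`, so its `if __name__ == "__main__":` block calls main().
    runpy.run_module("bigquery_etl.public_data.publish_json", run_name="__main__")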
@@ -2,7 +2,7 @@ import pytest
 import smart_open
 from unittest.mock import call, Mock, MagicMock
 
-from bigquery_etl.publish_json import JsonPublisher
+from bigquery_etl.public_data.publish_json import JsonPublisher
 
 
 class TestPublishJson(object):
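Only the import path changes here; the test body keeps using the mocks imported above. For reference, a hypothetical sketch of that mock style (not the repository's actual test code): a MagicMock stands in for the BigQuery client, and call() verifies how it was invoked:

    from unittest.mock import MagicMock, call

    mock_client = MagicMock()
    mock_client.query("SELECT 1", job_config=None)

    # call() records the expected invocation for comparison.
    assert mock_client.query.call_args == call("SELECT 1", job_config=None)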