DENG-1025: gfx_telemetry r/w to GCS instead of S3

Eduardo Filho 2023-08-03 17:11:52 -04:00
Parent dc0be49a84
Commit 78218d3274
3 changed files with 46 additions and 39 deletions

View file

@@ -2,7 +2,6 @@
# pip install:
# python_moztelemetry
# git+https://github.com/FirefoxGraphics/telemetry.git#egg=pkg&subdirectory=analyses/bigquery_shim
# boto3==1.16.20
# six==1.15.0
import argparse
@@ -12,12 +11,13 @@ import os
import sys
import time
import boto3
from bigquery_shim import dashboard, snake_case
from moztelemetry import get_pings_properties
from pyspark import SparkContext
from pyspark.sql import SparkSession
from google.cloud import storage
def fmt_date(d):
return d.strftime("%Y%m%d")
@@ -27,7 +27,7 @@ def repartition(pipeline):
return pipeline.repartition(MaxPartitions).cache()
s3_client = boto3.client("s3")
storage_client = storage.Client()
sc = SparkContext.getOrCreate()
spark = SparkSession.builder.appName("graphics-trends").getOrCreate()
@@ -45,7 +45,9 @@ def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--default-time-window", type=int, default=14)
parser.add_argument("--release-fraction", type=float, default=0.003)
parser.add_argument("--output-bucket", default="telemetry-public-analysis-2")
parser.add_argument(
"--output-bucket", default="moz-fx-data-static-websit-f7e0-analysis-output"
)
parser.add_argument("--output-prefix", default="gfx/telemetry-data/")
return parser.parse_args()
@@ -61,12 +63,12 @@ OUTPUT_PREFIX = args.output_prefix
DefaultTimeWindow = args.default_time_window
ReleaseFraction = args.release_fraction
existing_objects = [
obj["Key"]
for obj in s3_client.list_objects_v2(Bucket=OUTPUT_BUCKET, Prefix=OUTPUT_PREFIX)[
"Contents"
]
]
bucket = storage_client.get_bucket(OUTPUT_BUCKET)
# List all objects in the GCS bucket with the given prefix
blobs = bucket.list_blobs(prefix=OUTPUT_PREFIX)
# Extract object names from the Blob objects
existing_objects = [blob.name for blob in blobs]
print(f"Existing objects: {existing_objects}")
# List of keys for properties on session pings that we care about.
@@ -324,10 +326,14 @@ def export(filename, obj, **kwargs):
full_filename = os.path.join(OUTPUT_PREFIX, f"{filename}.json")
print("Writing to {0}".format(full_filename))
# serialize snake case dicts via their underlying dict
s3_client.put_object(
Bucket=OUTPUT_BUCKET,
Key=full_filename,
Body=bytes(json.dumps(obj, cls=snake_case.SnakeCaseEncoder), encoding="utf-8"),
bucket = storage_client.get_bucket(OUTPUT_BUCKET)
blob = bucket.blob(full_filename)
# Convert the 'obj' dictionary to JSON and upload it to GCS
# serialize snake case dicts via their underlying dict
blob.upload_from_string(
json.dumps(obj, cls=snake_case.SnakeCaseEncoder),
content_type="application/json",
)
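
For reference, a minimal standalone sketch of the google-cloud-storage calls this file now uses in place of the boto3 ones; the bucket name, prefix, and payload below are placeholders for illustration, not values taken from the commit:

from google.cloud import storage

# Placeholder values, not the commit's real bucket/prefix.
BUCKET_NAME = "example-bucket"
PREFIX = "gfx/telemetry-data/"

storage_client = storage.Client()
bucket = storage_client.get_bucket(BUCKET_NAME)

# Counterpart of s3_client.list_objects_v2(Bucket=..., Prefix=...)["Contents"] / obj["Key"]:
existing_objects = [blob.name for blob in bucket.list_blobs(prefix=PREFIX)]

# Counterpart of s3_client.put_object(Bucket=..., Key=..., Body=...):
blob = bucket.blob(PREFIX + "example.json")
blob.upload_from_string('{"example": true}', content_type="application/json")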

View file

@@ -12,13 +12,14 @@ import os
import sys
import time
import boto3
import requests
from bigquery_shim import trends
from moztelemetry import get_one_ping_per_client
from pyspark import SparkContext
from pyspark.sql import SparkSession
from google.cloud import storage
def fmt_date(d):
return d.strftime("%Y%m%d")
@@ -32,7 +33,7 @@ def repartition(pipeline):
return pipeline.repartition(MaxPartitions).cache()
s3_client = boto3.client("s3")
storage_client = storage.Client()
sc = SparkContext.getOrCreate()
spark = SparkSession.builder.appName("graphics-trends").getOrCreate()
@@ -45,13 +46,13 @@ WeeklyFraction = 0.003
# Amount of days Telemetry keeps.
MaxHistoryInDays = datetime.timedelta(days=210)
# Bucket we'll drop files into on S3. If this is None, we won't attempt any
# S3 uploads, and the analysis will start from scratch.
S3_BUCKET = "telemetry-public-analysis-2"
S3_PREFIX = "gfx/telemetry-data/"
# Bucket we'll drop files into on GCS. If this is None, we won't attempt any
# GCS uploads, and the analysis will start from scratch.
GCS_BUCKET = "moz-fx-data-static-websit-f7e0-analysis-output"
GCS_PREFIX = "gfx/telemetry-data/"
GITHUB_REPO = "https://raw.githubusercontent.com/FirefoxGraphics/moz-gfx-telemetry"
# List of jobs allowed to have a first-run (meaning no S3 content).
# List of jobs allowed to have a first-run (meaning no GCS content).
BrandNewJobs = []
# If true, backfill up to MaxHistoryInDays rather than the last update.
@@ -329,7 +330,7 @@ class TrendGroup(TrendBase):
# A Trend object takes a new set of pings for a week's worth of data,
# analyzes it, and adds the result to the trend set. Trend sets are
# cached in S3 as JSON.
# cached in GCS as JSON.
#
# If the latest entry in the cache covers less than a full week of
# data, the entry is removed so that week can be re-queried.
@@ -431,25 +432,22 @@ class Trend(TrendBase):
with open(self.local_path, "w") as fp:
fp.write(text)
if S3_BUCKET is not None:
if GCS_BUCKET is not None:
try:
s3_client.upload_file(
Filename=self.local_path,
Bucket=S3_BUCKET,
Key=os.path.join(S3_PREFIX, self.name),
)
bucket = storage_client.get_bucket(GCS_BUCKET)
blob = bucket.blob(os.path.join(GCS_PREFIX, self.name))
blob.upload_from_filename(self.local_path)
except Exception as e:
print("Failed s3 upload: {0}".format(e))
print("Failed gcs upload: {0}".format(e))
def fetch_json(self):
print("Reading file {0}".format(self.local_path))
if S3_BUCKET is not None:
if GCS_BUCKET is not None:
try:
s3_client.download_file(
Bucket=S3_BUCKET,
Key=os.path.join(S3_PREFIX, self.name),
Filename=self.local_path,
)
storage_client = storage.Client()
bucket = storage_client.get_bucket(GCS_BUCKET)
blob = bucket.blob(os.path.join(GCS_PREFIX, self.name))
blob.download_to_filename(self.local_path)
with open(self.local_path, "r") as fp:
return json.load(fp)
except Exception:
@@ -595,8 +593,10 @@ def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--force-max-backfill", action="store_true")
parser.add_argument("--weekly-fraction", type=float, default=0.003)
parser.add_argument("--s3-bucket", default="telemetry-public-analysis-2")
parser.add_argument("--s3-prefix", default="gfx/telemetry-data/")
parser.add_argument(
"--gcs-bucket", default="moz-fx-data-static-websit-f7e0-analysis-output"
)
parser.add_argument("--gcs-prefix", default="gfx/telemetry-data/")
parser.add_argument("--max-history-in-days", type=int, default=210)
parser.add_argument("--brand-new-jobs", action="append", default=[])
return parser.parse_args()
@ -606,8 +606,8 @@ if __name__ == "__main__":
args = parse_args()
ForceMaxBackfill = args.force_max_backfill
WeeklyFraction = args.weekly_fraction
S3_BUCKET = args.s3_bucket
S3_PREFIX = args.s3_prefix
GCS_BUCKET = args.gcs_bucket
GCS_PREFIX = args.gcs_prefix
MaxHistoryInDays = datetime.timedelta(days=args.max_history_in_days)
BrandNewJobs = args.brand_new_jobs
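
As above, a minimal sketch (not part of the diff) of the file-based upload and download calls the trends job now relies on, with placeholder bucket, object, and local paths:

from google.cloud import storage

storage_client = storage.Client()
bucket = storage_client.get_bucket("example-bucket")  # placeholder bucket name
blob = bucket.blob("gfx/telemetry-data/example-trend.json")  # placeholder object name

# Counterpart of s3_client.upload_file(Filename=..., Bucket=..., Key=...):
blob.upload_from_filename("/tmp/example-trend.json")

# Counterpart of s3_client.download_file(Bucket=..., Key=..., Filename=...):
blob.download_to_filename("/tmp/example-trend.json")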

View file

@@ -33,6 +33,7 @@ setup(
'botocore==1.19.20',
'click==7.1.2',
'click_datetime==0.2',
'google-cloud-storage==2.7.0',
'numpy==1.19.4',
'pandas==1.1.4',
# NOTE: this is pinned to 3.20.3 because protos changed and we can't regenerate them.